{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "ce_loss_13": 11.511599779129028, "ce_loss_26": 11.188396453857422, "ce_loss_39": 11.169448137283325, "ce_loss_52": 1.3891706466674805, "ce_loss_7": 11.556999206542969, "epoch": 0.0001, "grad_norm": 28.059961985369828, "kl_loss_13": 20896.0, "kl_loss_26": 20192.0, "kl_loss_39": 20192.0, "kl_loss_7": 20960.0, "learning_rate": 1e-05, "loss": 41080.0, "step": 1 }, { "ce_loss_13": 11.506269454956055, "ce_loss_26": 11.177568621105618, "ce_loss_39": 11.177141269048056, "ce_loss_52": 1.458960132466422, "ce_loss_7": 11.548744599024454, "epoch": 0.001, "grad_norm": 28.667146352003318, "kl_loss_13": 20782.222222222223, "kl_loss_26": 20106.666666666668, "kl_loss_39": 20110.222222222223, "kl_loss_7": 20867.555555555555, "learning_rate": 0.0001, "loss": 41008.8889, "step": 10 }, { "ce_loss_13": 11.456571412086486, "ce_loss_26": 11.158088731765748, "ce_loss_39": 11.15653133392334, "ce_loss_52": 1.435088688135147, "ce_loss_7": 11.476131820678711, "epoch": 0.002, "grad_norm": 35.67456270110165, "kl_loss_13": 20723.2, "kl_loss_26": 20118.4, "kl_loss_39": 20115.2, "kl_loss_7": 20764.8, "learning_rate": 0.0002, "loss": 40904.0, "step": 20 }, { "ce_loss_13": 11.150281167030334, "ce_loss_26": 11.01296763420105, "ce_loss_39": 11.044581699371339, "ce_loss_52": 1.4344331562519073, "ce_loss_7": 11.054779505729675, "epoch": 0.003, "grad_norm": 54.04879245830703, "kl_loss_13": 20108.8, "kl_loss_26": 19840.0, "kl_loss_39": 19907.2, "kl_loss_7": 19920.0, "learning_rate": 0.0003, "loss": 39847.2, "step": 30 }, { "ce_loss_13": 10.505089902877808, "ce_loss_26": 10.497783017158508, "ce_loss_39": 10.527814579010009, "ce_loss_52": 1.460255417227745, "ce_loss_7": 10.453347158432006, "epoch": 0.004, "grad_norm": 29.567872258029254, "kl_loss_13": 18694.4, "kl_loss_26": 18688.0, "kl_loss_39": 18755.2, "kl_loss_7": 18588.8, "learning_rate": 0.0004, "loss": 37436.0, "step": 40 }, { "ce_loss_13": 10.321261882781982, "ce_loss_26": 10.244042158126831, "ce_loss_39": 10.236308455467224, "ce_loss_52": 1.463668829202652, "ce_loss_7": 10.305184721946716, "epoch": 0.005, "grad_norm": 37.9866452371617, "kl_loss_13": 18329.6, "kl_loss_26": 18163.2, "kl_loss_39": 18140.8, "kl_loss_7": 18288.0, "learning_rate": 0.0005, "loss": 36555.2, "step": 50 }, { "ce_loss_13": 10.226529097557068, "ce_loss_26": 10.111042308807374, "ce_loss_39": 10.11395993232727, "ce_loss_52": 1.4317695140838622, "ce_loss_7": 10.210216856002807, "epoch": 0.006, "grad_norm": 47.58894649950276, "kl_loss_13": 18208.0, "kl_loss_26": 17974.4, "kl_loss_39": 17980.8, "kl_loss_7": 18166.4, "learning_rate": 0.0006, "loss": 36044.0, "step": 60 }, { "ce_loss_13": 10.142269968986511, "ce_loss_26": 10.005614733695984, "ce_loss_39": 10.006310772895812, "ce_loss_52": 1.3979130625724792, "ce_loss_7": 10.13636019229889, "epoch": 0.007, "grad_norm": 55.16387671378209, "kl_loss_13": 18057.6, "kl_loss_26": 17772.8, "kl_loss_39": 17792.0, "kl_loss_7": 18048.0, "learning_rate": 0.0007, "loss": 35718.4, "step": 70 }, { "ce_loss_13": 10.032484984397888, "ce_loss_26": 9.872331905364991, "ce_loss_39": 9.881066274642944, "ce_loss_52": 1.4247985988855363, "ce_loss_7": 10.02949812412262, "epoch": 0.008, "grad_norm": 59.28947925840698, "kl_loss_13": 17811.2, "kl_loss_26": 17488.0, "kl_loss_39": 17500.8, "kl_loss_7": 17808.0, "learning_rate": 0.0008, "loss": 35334.4, "step": 80 }, { "ce_loss_13": 9.942931509017944, "ce_loss_26": 9.76176996231079, "ce_loss_39": 9.775496363639832, "ce_loss_52": 1.4258457243442535, "ce_loss_7": 9.945418453216552, "epoch": 0.009, "grad_norm": 55.94798234885439, "kl_loss_13": 17600.0, "kl_loss_26": 17222.4, "kl_loss_39": 17257.6, "kl_loss_7": 17600.0, "learning_rate": 0.0009000000000000001, "loss": 34900.0, "step": 90 }, { "ce_loss_13": 9.852771949768066, "ce_loss_26": 9.661247444152831, "ce_loss_39": 9.673552298545838, "ce_loss_52": 1.438367447257042, "ce_loss_7": 9.860591006278991, "epoch": 0.01, "grad_norm": 53.3090381296634, "kl_loss_13": 17385.6, "kl_loss_26": 16992.0, "kl_loss_39": 17024.0, "kl_loss_7": 17398.4, "learning_rate": 0.001, "loss": 34482.8, "step": 100 }, { "ce_loss_13": 9.76314606666565, "ce_loss_26": 9.563278603553773, "ce_loss_39": 9.578891181945801, "ce_loss_52": 1.412995059788227, "ce_loss_7": 9.781555676460266, "epoch": 0.011, "grad_norm": 53.230976887502294, "kl_loss_13": 17251.2, "kl_loss_26": 16836.8, "kl_loss_39": 16870.4, "kl_loss_7": 17305.6, "learning_rate": 0.0009999974825027757, "loss": 34052.4, "step": 110 }, { "ce_loss_13": 9.681499814987182, "ce_loss_26": 9.470890092849732, "ce_loss_39": 9.48718273639679, "ce_loss_52": 1.4235966846346855, "ce_loss_7": 9.706467342376708, "epoch": 0.012, "grad_norm": 53.526819502242695, "kl_loss_13": 17049.6, "kl_loss_26": 16612.8, "kl_loss_39": 16648.0, "kl_loss_7": 17100.8, "learning_rate": 0.0009999899300364532, "loss": 33698.0, "step": 120 }, { "ce_loss_13": 9.590748715400697, "ce_loss_26": 9.367487025260925, "ce_loss_39": 9.386657476425171, "ce_loss_52": 1.4183751314878463, "ce_loss_7": 9.621446299552918, "epoch": 0.013, "grad_norm": 52.25839955403129, "kl_loss_13": 16867.2, "kl_loss_26": 16417.6, "kl_loss_39": 16448.0, "kl_loss_7": 16940.8, "learning_rate": 0.0009999773426770863, "loss": 33311.6, "step": 130 }, { "ce_loss_13": 9.527826118469239, "ce_loss_26": 9.299377870559692, "ce_loss_39": 9.321026277542114, "ce_loss_52": 1.445027893781662, "ce_loss_7": 9.561844515800477, "epoch": 0.014, "grad_norm": 52.41222674765903, "kl_loss_13": 16692.8, "kl_loss_26": 16227.2, "kl_loss_39": 16273.6, "kl_loss_7": 16777.6, "learning_rate": 0.0009999597205514296, "loss": 33030.8, "step": 140 }, { "ce_loss_13": 9.486352849006654, "ce_loss_26": 9.252249264717102, "ce_loss_39": 9.267914438247681, "ce_loss_52": 1.4420335739850998, "ce_loss_7": 9.524769949913026, "epoch": 0.015, "grad_norm": 53.790180993856175, "kl_loss_13": 16592.0, "kl_loss_26": 16104.0, "kl_loss_39": 16132.8, "kl_loss_7": 16657.6, "learning_rate": 0.0009999370638369377, "loss": 32769.2, "step": 150 }, { "ce_loss_13": 9.392394828796387, "ce_loss_26": 9.15080394744873, "ce_loss_39": 9.170540618896485, "ce_loss_52": 1.423890632390976, "ce_loss_7": 9.436835885047913, "epoch": 0.016, "grad_norm": 52.65338822593253, "kl_loss_13": 16464.0, "kl_loss_26": 15963.2, "kl_loss_39": 16003.2, "kl_loss_7": 16563.2, "learning_rate": 0.000999909372761763, "loss": 32427.6, "step": 160 }, { "ce_loss_13": 9.328035354614258, "ce_loss_26": 9.082435154914856, "ce_loss_39": 9.1047847032547, "ce_loss_52": 1.4349344044923782, "ce_loss_7": 9.380868554115295, "epoch": 0.017, "grad_norm": 52.40583142267758, "kl_loss_13": 16296.0, "kl_loss_26": 15777.6, "kl_loss_39": 15832.0, "kl_loss_7": 16404.8, "learning_rate": 0.0009998766476047546, "loss": 32178.8, "step": 170 }, { "ce_loss_13": 9.262916254997254, "ce_loss_26": 9.01283278465271, "ce_loss_39": 9.035380673408508, "ce_loss_52": 1.3936711609363557, "ce_loss_7": 9.322635316848755, "epoch": 0.018, "grad_norm": 52.346136942448574, "kl_loss_13": 16240.0, "kl_loss_26": 15716.8, "kl_loss_39": 15769.6, "kl_loss_7": 16363.2, "learning_rate": 0.0009998388886954545, "loss": 31844.4, "step": 180 }, { "ce_loss_13": 9.195696568489074, "ce_loss_26": 8.94514548778534, "ce_loss_39": 8.967138314247132, "ce_loss_52": 1.4523959368467332, "ce_loss_7": 9.263990116119384, "epoch": 0.019, "grad_norm": 51.44547467780483, "kl_loss_13": 15985.6, "kl_loss_26": 15464.0, "kl_loss_39": 15515.2, "kl_loss_7": 16132.8, "learning_rate": 0.0009997960964140947, "loss": 31580.0, "step": 190 }, { "ce_loss_13": 9.109345388412475, "ce_loss_26": 8.856351280212403, "ce_loss_39": 8.882033634185792, "ce_loss_52": 1.425847691297531, "ce_loss_7": 9.182338738441468, "epoch": 0.02, "grad_norm": 51.67086637607359, "kl_loss_13": 15867.2, "kl_loss_26": 15332.8, "kl_loss_39": 15393.6, "kl_loss_7": 16011.2, "learning_rate": 0.0009997482711915926, "loss": 31312.8, "step": 200 }, { "ce_loss_13": 9.036305499076843, "ce_loss_26": 8.778529453277589, "ce_loss_39": 8.803540563583374, "ce_loss_52": 1.4626984983682632, "ce_loss_7": 9.118139266967773, "epoch": 0.021, "grad_norm": 50.078507295298536, "kl_loss_13": 15654.4, "kl_loss_26": 15108.8, "kl_loss_39": 15168.0, "kl_loss_7": 15828.8, "learning_rate": 0.0009996954135095479, "loss": 31012.0, "step": 210 }, { "ce_loss_13": 8.98859736919403, "ce_loss_26": 8.726609206199646, "ce_loss_39": 8.75291087627411, "ce_loss_52": 1.4173608794808388, "ce_loss_7": 9.076500582695008, "epoch": 0.022, "grad_norm": 50.694312927544594, "kl_loss_13": 15625.6, "kl_loss_26": 15073.6, "kl_loss_39": 15126.4, "kl_loss_7": 15811.2, "learning_rate": 0.0009996375239002368, "loss": 30754.8, "step": 220 }, { "ce_loss_13": 8.965013265609741, "ce_loss_26": 8.698115158081055, "ce_loss_39": 8.71800787448883, "ce_loss_52": 1.4263556391000747, "ce_loss_7": 9.061096882820129, "epoch": 0.023, "grad_norm": 50.98814656536181, "kl_loss_13": 15547.2, "kl_loss_26": 14985.6, "kl_loss_39": 15028.8, "kl_loss_7": 15752.0, "learning_rate": 0.0009995746029466072, "loss": 30513.6, "step": 230 }, { "ce_loss_13": 8.891163158416749, "ce_loss_26": 8.611021280288696, "ce_loss_39": 8.63300838470459, "ce_loss_52": 1.42348592877388, "ce_loss_7": 8.998197555541992, "epoch": 0.024, "grad_norm": 51.614487435815626, "kl_loss_13": 15393.6, "kl_loss_26": 14806.4, "kl_loss_39": 14848.0, "kl_loss_7": 15619.2, "learning_rate": 0.0009995066512822719, "loss": 30248.4, "step": 240 }, { "ce_loss_13": 8.831620502471925, "ce_loss_26": 8.545269632339478, "ce_loss_39": 8.565073847770691, "ce_loss_52": 1.452064010500908, "ce_loss_7": 8.941647911071778, "epoch": 0.025, "grad_norm": 50.247038771654694, "kl_loss_13": 15233.6, "kl_loss_26": 14628.8, "kl_loss_39": 14667.2, "kl_loss_7": 15462.4, "learning_rate": 0.000999433669591504, "loss": 29955.6, "step": 250 }, { "ce_loss_13": 8.755429339408874, "ce_loss_26": 8.469949841499329, "ce_loss_39": 8.486004614830017, "ce_loss_52": 1.4328533172607423, "ce_loss_7": 8.874198198318481, "epoch": 0.026, "grad_norm": 49.63162227981652, "kl_loss_13": 15075.2, "kl_loss_26": 14473.6, "kl_loss_39": 14508.8, "kl_loss_7": 15326.4, "learning_rate": 0.000999355658609228, "loss": 29717.2, "step": 260 }, { "ce_loss_13": 8.697371244430542, "ce_loss_26": 8.401416063308716, "ce_loss_39": 8.412619948387146, "ce_loss_52": 1.4436978071928024, "ce_loss_7": 8.819016146659852, "epoch": 0.027, "grad_norm": 51.15355718783705, "kl_loss_13": 14974.4, "kl_loss_26": 14347.2, "kl_loss_39": 14377.6, "kl_loss_7": 15235.2, "learning_rate": 0.0009992726191210138, "loss": 29500.4, "step": 270 }, { "ce_loss_13": 8.66446521282196, "ce_loss_26": 8.36073157787323, "ce_loss_39": 8.373045516014098, "ce_loss_52": 1.4337088972330094, "ce_loss_7": 8.793687105178833, "epoch": 0.028, "grad_norm": 50.66919780267985, "kl_loss_13": 14888.0, "kl_loss_26": 14254.4, "kl_loss_39": 14280.0, "kl_loss_7": 15168.0, "learning_rate": 0.0009991845519630679, "loss": 29316.8, "step": 280 }, { "ce_loss_13": 8.604181599617004, "ce_loss_26": 8.292751049995422, "ce_loss_39": 8.304360592365265, "ce_loss_52": 1.4289869368076324, "ce_loss_7": 8.740529561042786, "epoch": 0.029, "grad_norm": 49.40131878460191, "kl_loss_13": 14785.6, "kl_loss_26": 14123.2, "kl_loss_39": 14147.2, "kl_loss_7": 15065.6, "learning_rate": 0.0009990914580222257, "loss": 29034.0, "step": 290 }, { "ce_loss_13": 8.560984683036803, "ce_loss_26": 8.244250774383545, "ce_loss_39": 8.251447391510009, "ce_loss_52": 1.4629653513431549, "ce_loss_7": 8.694539451599121, "epoch": 0.03, "grad_norm": 49.35953544498034, "kl_loss_13": 14673.6, "kl_loss_26": 13996.8, "kl_loss_39": 14014.4, "kl_loss_7": 14955.2, "learning_rate": 0.0009989933382359422, "loss": 28794.4, "step": 300 }, { "ce_loss_13": 8.470300602912904, "ce_loss_26": 8.14130541086197, "ce_loss_39": 8.148625028133392, "ce_loss_52": 1.4505603075027467, "ce_loss_7": 8.614710068702697, "epoch": 0.031, "grad_norm": 49.67315029837232, "kl_loss_13": 14492.8, "kl_loss_26": 13798.4, "kl_loss_39": 13812.8, "kl_loss_7": 14800.0, "learning_rate": 0.0009988901935922825, "loss": 28550.8, "step": 310 }, { "ce_loss_13": 8.449520301818847, "ce_loss_26": 8.124438393115998, "ce_loss_39": 8.127473723888397, "ce_loss_52": 1.4619350016117096, "ce_loss_7": 8.594109392166137, "epoch": 0.032, "grad_norm": 49.62699299914443, "kl_loss_13": 14432.0, "kl_loss_26": 13737.6, "kl_loss_39": 13745.6, "kl_loss_7": 14737.6, "learning_rate": 0.0009987820251299122, "loss": 28340.4, "step": 320 }, { "ce_loss_13": 8.411065363883973, "ce_loss_26": 8.065169262886048, "ce_loss_39": 8.066172111034394, "ce_loss_52": 1.4555475383996963, "ce_loss_7": 8.563976049423218, "epoch": 0.033, "grad_norm": 48.32599858014616, "kl_loss_13": 14328.0, "kl_loss_26": 13596.8, "kl_loss_39": 13601.6, "kl_loss_7": 14651.2, "learning_rate": 0.0009986688339380862, "loss": 28064.0, "step": 330 }, { "ce_loss_13": 8.342015504837036, "ce_loss_26": 7.984185111522675, "ce_loss_39": 7.983271932601928, "ce_loss_52": 1.4297153055667877, "ce_loss_7": 8.503781509399413, "epoch": 0.034, "grad_norm": 49.50369957464881, "kl_loss_13": 14252.8, "kl_loss_26": 13504.0, "kl_loss_39": 13500.8, "kl_loss_7": 14590.4, "learning_rate": 0.0009985506211566387, "loss": 27837.2, "step": 340 }, { "ce_loss_13": 8.30728188753128, "ce_loss_26": 7.943272864818573, "ce_loss_39": 7.946760308742523, "ce_loss_52": 1.4311424046754837, "ce_loss_7": 8.472229743003846, "epoch": 0.035, "grad_norm": 49.2761544590419, "kl_loss_13": 14160.0, "kl_loss_26": 13390.4, "kl_loss_39": 13393.6, "kl_loss_7": 14508.8, "learning_rate": 0.0009984273879759713, "loss": 27625.6, "step": 350 }, { "ce_loss_13": 8.231205070018769, "ce_loss_26": 7.874649381637573, "ce_loss_39": 7.870936000347138, "ce_loss_52": 1.4489013850688934, "ce_loss_7": 8.397671937942505, "epoch": 0.036, "grad_norm": 49.68016023572489, "kl_loss_13": 13990.4, "kl_loss_26": 13236.8, "kl_loss_39": 13232.0, "kl_loss_7": 14340.8, "learning_rate": 0.0009982991356370402, "loss": 27384.8, "step": 360 }, { "ce_loss_13": 8.171039760112762, "ce_loss_26": 7.7931832551956175, "ce_loss_39": 7.793972992897034, "ce_loss_52": 1.4122098296880723, "ce_loss_7": 8.344593167304993, "epoch": 0.037, "grad_norm": 48.18494426167262, "kl_loss_13": 13920.0, "kl_loss_26": 13128.0, "kl_loss_39": 13129.6, "kl_loss_7": 14291.2, "learning_rate": 0.0009981658654313456, "loss": 27248.4, "step": 370 }, { "ce_loss_13": 8.160165214538575, "ce_loss_26": 7.7771016478538515, "ce_loss_39": 7.771613931655883, "ce_loss_52": 1.4869922280311585, "ce_loss_7": 8.338345170021057, "epoch": 0.038, "grad_norm": 48.80785692423277, "kl_loss_13": 13795.2, "kl_loss_26": 12974.4, "kl_loss_39": 12960.0, "kl_loss_7": 14164.8, "learning_rate": 0.000998027578700917, "loss": 26976.4, "step": 380 }, { "ce_loss_13": 8.052374148368836, "ce_loss_26": 7.658347594738006, "ce_loss_39": 7.6583909630775455, "ce_loss_52": 1.4154588401317596, "ce_loss_7": 8.239335989952087, "epoch": 0.039, "grad_norm": 48.38129929768143, "kl_loss_13": 13680.0, "kl_loss_26": 12851.2, "kl_loss_39": 12844.8, "kl_loss_7": 14078.4, "learning_rate": 0.0009978842768382998, "loss": 26719.2, "step": 390 }, { "ce_loss_13": 8.022306847572327, "ce_loss_26": 7.623832786083222, "ce_loss_39": 7.616167056560516, "ce_loss_52": 1.4509258031845094, "ce_loss_7": 8.209609961509704, "epoch": 0.04, "grad_norm": 48.47003917091678, "kl_loss_13": 13532.8, "kl_loss_26": 12684.8, "kl_loss_39": 12668.8, "kl_loss_7": 13918.4, "learning_rate": 0.0009977359612865424, "loss": 26536.4, "step": 400 }, { "ce_loss_13": 8.007824766635895, "ce_loss_26": 7.614881563186645, "ce_loss_39": 7.602893710136414, "ce_loss_52": 1.4645162731409074, "ce_loss_7": 8.19339075088501, "epoch": 0.041, "grad_norm": 48.05902996814503, "kl_loss_13": 13486.4, "kl_loss_26": 12646.4, "kl_loss_39": 12627.2, "kl_loss_7": 13886.4, "learning_rate": 0.0009975826335391806, "loss": 26319.6, "step": 410 }, { "ce_loss_13": 7.89500185251236, "ce_loss_26": 7.4844276905059814, "ce_loss_39": 7.470773124694825, "ce_loss_52": 1.3909434020519256, "ce_loss_7": 8.089212799072266, "epoch": 0.042, "grad_norm": 47.6532900353141, "kl_loss_13": 13393.6, "kl_loss_26": 12528.0, "kl_loss_39": 12504.0, "kl_loss_7": 13800.0, "learning_rate": 0.0009974242951402235, "loss": 26051.2, "step": 420 }, { "ce_loss_13": 7.848755323886872, "ce_loss_26": 7.436672508716583, "ce_loss_39": 7.420604693889618, "ce_loss_52": 1.4549538046121597, "ce_loss_7": 8.053676557540893, "epoch": 0.043, "grad_norm": 47.461866848016506, "kl_loss_13": 13187.2, "kl_loss_26": 12316.8, "kl_loss_39": 12278.4, "kl_loss_7": 13619.2, "learning_rate": 0.0009972609476841367, "loss": 25819.2, "step": 430 }, { "ce_loss_13": 7.824773061275482, "ce_loss_26": 7.38158438205719, "ce_loss_39": 7.365956795215607, "ce_loss_52": 1.4214935347437858, "ce_loss_7": 8.031265962123872, "epoch": 0.044, "grad_norm": 47.54154044179159, "kl_loss_13": 13169.6, "kl_loss_26": 12241.6, "kl_loss_39": 12208.0, "kl_loss_7": 13604.8, "learning_rate": 0.0009970925928158272, "loss": 25669.2, "step": 440 }, { "ce_loss_13": 7.781041419506073, "ce_loss_26": 7.349016737937927, "ce_loss_39": 7.328068232536316, "ce_loss_52": 1.4457789659500122, "ce_loss_7": 7.986689484119415, "epoch": 0.045, "grad_norm": 47.28488093165922, "kl_loss_13": 13065.6, "kl_loss_26": 12150.4, "kl_loss_39": 12107.2, "kl_loss_7": 13499.2, "learning_rate": 0.000996919232230627, "loss": 25413.2, "step": 450 }, { "ce_loss_13": 7.707467567920685, "ce_loss_26": 7.259365463256836, "ce_loss_39": 7.2402653932571415, "ce_loss_52": 1.4384218811988831, "ce_loss_7": 7.9230645298957825, "epoch": 0.046, "grad_norm": 47.47875544223923, "kl_loss_13": 12928.0, "kl_loss_26": 11974.4, "kl_loss_39": 11940.8, "kl_loss_7": 13380.8, "learning_rate": 0.0009967408676742752, "loss": 25149.6, "step": 460 }, { "ce_loss_13": 7.682056641578674, "ce_loss_26": 7.239385926723481, "ce_loss_39": 7.214358115196228, "ce_loss_52": 1.4293440610170365, "ce_loss_7": 7.903320550918579, "epoch": 0.047, "grad_norm": 47.616227706878504, "kl_loss_13": 12899.2, "kl_loss_26": 11950.4, "kl_loss_39": 11904.0, "kl_loss_7": 13360.0, "learning_rate": 0.0009965575009429006, "loss": 24954.0, "step": 470 }, { "ce_loss_13": 7.673471140861511, "ce_loss_26": 7.22241450548172, "ce_loss_39": 7.191142916679382, "ce_loss_52": 1.4720373705029488, "ce_loss_7": 7.897368836402893, "epoch": 0.048, "grad_norm": 47.35612609507448, "kl_loss_13": 12763.2, "kl_loss_26": 11816.0, "kl_loss_39": 11748.8, "kl_loss_7": 13236.8, "learning_rate": 0.0009963691338830043, "loss": 24784.4, "step": 480 }, { "ce_loss_13": 7.61794912815094, "ce_loss_26": 7.163714337348938, "ce_loss_39": 7.132104587554932, "ce_loss_52": 1.4696507424116134, "ce_loss_7": 7.839430296421051, "epoch": 0.049, "grad_norm": 46.6485917664728, "kl_loss_13": 12664.0, "kl_loss_26": 11691.2, "kl_loss_39": 11627.2, "kl_loss_7": 13139.2, "learning_rate": 0.0009961757683914405, "loss": 24543.2, "step": 490 }, { "ce_loss_13": 7.507795846462249, "ce_loss_26": 7.035580575466156, "ce_loss_39": 7.005417144298553, "ce_loss_52": 1.4070568919181823, "ce_loss_7": 7.743252336978912, "epoch": 0.05, "grad_norm": 46.67870253681081, "kl_loss_13": 12545.6, "kl_loss_26": 11542.4, "kl_loss_39": 11480.0, "kl_loss_7": 13043.2, "learning_rate": 0.0009959774064153978, "loss": 24344.0, "step": 500 }, { "ce_loss_13": 7.495815181732178, "ce_loss_26": 7.0229366540908815, "ce_loss_39": 6.990859532356263, "ce_loss_52": 1.4116702109575272, "ce_loss_7": 7.736506867408752, "epoch": 0.051, "grad_norm": 46.06552599373074, "kl_loss_13": 12505.6, "kl_loss_26": 11507.2, "kl_loss_39": 11433.6, "kl_loss_7": 13009.6, "learning_rate": 0.0009957740499523787, "loss": 24160.0, "step": 510 }, { "ce_loss_13": 7.443893933296204, "ce_loss_26": 6.961520659923553, "ce_loss_39": 6.922756457328797, "ce_loss_52": 1.4424341320991516, "ce_loss_7": 7.6887711644172665, "epoch": 0.052, "grad_norm": 46.57676837877325, "kl_loss_13": 12337.6, "kl_loss_26": 11308.8, "kl_loss_39": 11225.6, "kl_loss_7": 12851.2, "learning_rate": 0.0009955657010501807, "loss": 23900.8, "step": 520 }, { "ce_loss_13": 7.388968002796173, "ce_loss_26": 6.916476762294769, "ce_loss_39": 6.874182558059692, "ce_loss_52": 1.4647160589694976, "ce_loss_7": 7.6311492919921875, "epoch": 0.053, "grad_norm": 46.39702705707381, "kl_loss_13": 12211.2, "kl_loss_26": 11196.8, "kl_loss_39": 11105.6, "kl_loss_7": 12718.4, "learning_rate": 0.000995352361806875, "loss": 23724.4, "step": 530 }, { "ce_loss_13": 7.399884676933288, "ce_loss_26": 6.893076729774475, "ce_loss_39": 6.849111843109131, "ce_loss_52": 1.4278682440519332, "ce_loss_7": 7.656503355503082, "epoch": 0.054, "grad_norm": 45.552992496751486, "kl_loss_13": 12289.6, "kl_loss_26": 11217.6, "kl_loss_39": 11131.2, "kl_loss_7": 12824.0, "learning_rate": 0.0009951340343707852, "loss": 23503.2, "step": 540 }, { "ce_loss_13": 7.299449789524078, "ce_loss_26": 6.799772572517395, "ce_loss_39": 6.753718996047974, "ce_loss_52": 1.447503750026226, "ce_loss_7": 7.554943478107452, "epoch": 0.055, "grad_norm": 45.64724152253333, "kl_loss_13": 12067.2, "kl_loss_26": 10992.0, "kl_loss_39": 10900.8, "kl_loss_7": 12608.0, "learning_rate": 0.0009949107209404665, "loss": 23326.0, "step": 550 }, { "ce_loss_13": 7.293551552295685, "ce_loss_26": 6.792212247848511, "ce_loss_39": 6.740794622898102, "ce_loss_52": 1.4703042089939118, "ce_loss_7": 7.558422005176544, "epoch": 0.056, "grad_norm": 45.60106893131463, "kl_loss_13": 11990.4, "kl_loss_26": 10931.2, "kl_loss_39": 10820.8, "kl_loss_7": 12540.8, "learning_rate": 0.0009946824237646824, "loss": 23102.4, "step": 560 }, { "ce_loss_13": 7.1714133501052855, "ce_loss_26": 6.658501255512237, "ce_loss_39": 6.615163576602936, "ce_loss_52": 1.4395473554730416, "ce_loss_7": 7.438856053352356, "epoch": 0.057, "grad_norm": 45.02028770939811, "kl_loss_13": 11819.2, "kl_loss_26": 10716.8, "kl_loss_39": 10633.6, "kl_loss_7": 12379.2, "learning_rate": 0.0009944491451423828, "loss": 22860.0, "step": 570 }, { "ce_loss_13": 7.202235555648803, "ce_loss_26": 6.668069064617157, "ce_loss_39": 6.622809886932373, "ce_loss_52": 1.4502787292003632, "ce_loss_7": 7.474415194988251, "epoch": 0.058, "grad_norm": 45.87654136914787, "kl_loss_13": 11832.0, "kl_loss_26": 10697.6, "kl_loss_39": 10606.4, "kl_loss_7": 12409.6, "learning_rate": 0.0009942108874226813, "loss": 22680.4, "step": 580 }, { "ce_loss_13": 7.099943065643311, "ce_loss_26": 6.574034261703491, "ce_loss_39": 6.518663537502289, "ce_loss_52": 1.4500610083341599, "ce_loss_7": 7.373777639865875, "epoch": 0.059, "grad_norm": 45.75547743089401, "kl_loss_13": 11633.6, "kl_loss_26": 10512.0, "kl_loss_39": 10403.2, "kl_loss_7": 12201.6, "learning_rate": 0.00099396765300483, "loss": 22462.8, "step": 590 }, { "ce_loss_13": 7.097463607788086, "ce_loss_26": 6.57372156381607, "ce_loss_39": 6.511255967617035, "ce_loss_52": 1.480376410484314, "ce_loss_7": 7.372353208065033, "epoch": 0.06, "grad_norm": 45.20967365382369, "kl_loss_13": 11558.4, "kl_loss_26": 10451.2, "kl_loss_39": 10324.8, "kl_loss_7": 12145.6, "learning_rate": 0.0009937194443381972, "loss": 22282.4, "step": 600 }, { "ce_loss_13": 7.056500816345215, "ce_loss_26": 6.520907533168793, "ce_loss_39": 6.457597935199738, "ce_loss_52": 1.4534808412194251, "ce_loss_7": 7.339275515079498, "epoch": 0.061, "grad_norm": 44.17007512472295, "kl_loss_13": 11520.0, "kl_loss_26": 10382.4, "kl_loss_39": 10249.6, "kl_loss_7": 12110.4, "learning_rate": 0.0009934662639222412, "loss": 22080.0, "step": 610 }, { "ce_loss_13": 6.959627556800842, "ce_loss_26": 6.4156983375549315, "ce_loss_39": 6.353180265426635, "ce_loss_52": 1.4930268943309783, "ce_loss_7": 7.244034695625305, "epoch": 0.062, "grad_norm": 43.92075543466765, "kl_loss_13": 11259.2, "kl_loss_26": 10102.4, "kl_loss_39": 9971.2, "kl_loss_7": 11859.2, "learning_rate": 0.000993208114306486, "loss": 21799.2, "step": 620 }, { "ce_loss_13": 6.937919509410858, "ce_loss_26": 6.402250957489014, "ce_loss_39": 6.331100332736969, "ce_loss_52": 1.4531458377838136, "ce_loss_7": 7.226283407211303, "epoch": 0.063, "grad_norm": 44.52659706916058, "kl_loss_13": 11259.2, "kl_loss_26": 10128.0, "kl_loss_39": 9988.8, "kl_loss_7": 11881.6, "learning_rate": 0.0009929449980904952, "loss": 21667.2, "step": 630 }, { "ce_loss_13": 6.914422643184662, "ce_loss_26": 6.355719900131225, "ce_loss_39": 6.2839093685150145, "ce_loss_52": 1.463460123538971, "ce_loss_7": 7.208615565299988, "epoch": 0.064, "grad_norm": 44.241917484883416, "kl_loss_13": 11203.2, "kl_loss_26": 10004.8, "kl_loss_39": 9865.6, "kl_loss_7": 11827.2, "learning_rate": 0.0009926769179238466, "loss": 21450.4, "step": 640 }, { "ce_loss_13": 6.814666819572449, "ce_loss_26": 6.240503942966461, "ce_loss_39": 6.164217627048492, "ce_loss_52": 1.4213469997048378, "ce_loss_7": 7.121113920211792, "epoch": 0.065, "grad_norm": 45.45585410762684, "kl_loss_13": 11097.6, "kl_loss_26": 9875.2, "kl_loss_39": 9726.4, "kl_loss_7": 11742.4, "learning_rate": 0.000992403876506104, "loss": 21273.2, "step": 650 }, { "ce_loss_13": 6.807473576068878, "ce_loss_26": 6.237039804458618, "ce_loss_39": 6.164605820178986, "ce_loss_52": 1.4794408291578294, "ce_loss_7": 7.109469771385193, "epoch": 0.066, "grad_norm": 43.77904042873825, "kl_loss_13": 10964.8, "kl_loss_26": 9745.6, "kl_loss_39": 9593.6, "kl_loss_7": 11603.2, "learning_rate": 0.0009921258765867918, "loss": 21034.4, "step": 660 }, { "ce_loss_13": 6.720256412029267, "ce_loss_26": 6.124040985107422, "ce_loss_39": 6.048683619499206, "ce_loss_52": 1.4370630145072938, "ce_loss_7": 7.032277429103852, "epoch": 0.067, "grad_norm": 44.21280182860459, "kl_loss_13": 10864.0, "kl_loss_26": 9596.8, "kl_loss_39": 9446.4, "kl_loss_7": 11528.0, "learning_rate": 0.0009918429209653662, "loss": 20815.6, "step": 670 }, { "ce_loss_13": 6.73115086555481, "ce_loss_26": 6.149888730049133, "ce_loss_39": 6.072007644176483, "ce_loss_52": 1.4543532699346542, "ce_loss_7": 7.039518296718597, "epoch": 0.068, "grad_norm": 43.58133426683343, "kl_loss_13": 10844.8, "kl_loss_26": 9603.2, "kl_loss_39": 9433.6, "kl_loss_7": 11494.4, "learning_rate": 0.0009915550124911866, "loss": 20688.4, "step": 680 }, { "ce_loss_13": 6.683139646053315, "ce_loss_26": 6.099281096458435, "ce_loss_39": 6.017751622200012, "ce_loss_52": 1.4289966225624084, "ce_loss_7": 6.9959977746009825, "epoch": 0.069, "grad_norm": 43.03707399207988, "kl_loss_13": 10817.6, "kl_loss_26": 9577.6, "kl_loss_39": 9414.4, "kl_loss_7": 11472.0, "learning_rate": 0.0009912621540634887, "loss": 20494.0, "step": 690 }, { "ce_loss_13": 6.5575969338417055, "ce_loss_26": 5.94709130525589, "ce_loss_39": 5.865298080444336, "ce_loss_52": 1.3811550110578537, "ce_loss_7": 6.883859884738922, "epoch": 0.07, "grad_norm": 43.657034485471186, "kl_loss_13": 10611.2, "kl_loss_26": 9316.8, "kl_loss_39": 9148.8, "kl_loss_7": 11299.2, "learning_rate": 0.0009909643486313534, "loss": 20224.4, "step": 700 }, { "ce_loss_13": 6.581148624420166, "ce_loss_26": 5.951541697978973, "ce_loss_39": 5.867393767833709, "ce_loss_52": 1.417145846784115, "ce_loss_7": 6.908858215808868, "epoch": 0.071, "grad_norm": 42.31273006993064, "kl_loss_13": 10628.8, "kl_loss_26": 9294.4, "kl_loss_39": 9120.0, "kl_loss_7": 11320.0, "learning_rate": 0.000990661599193678, "loss": 20075.6, "step": 710 }, { "ce_loss_13": 6.503521502017975, "ce_loss_26": 5.871239483356476, "ce_loss_39": 5.7897450685501095, "ce_loss_52": 1.4011695250868796, "ce_loss_7": 6.844956791400909, "epoch": 0.072, "grad_norm": 42.36356368480549, "kl_loss_13": 10488.0, "kl_loss_26": 9147.2, "kl_loss_39": 8979.2, "kl_loss_7": 11206.4, "learning_rate": 0.0009903539087991462, "loss": 19811.6, "step": 720 }, { "ce_loss_13": 6.489633810520172, "ce_loss_26": 5.87660802602768, "ce_loss_39": 5.779063713550568, "ce_loss_52": 1.439223274588585, "ce_loss_7": 6.819329023361206, "epoch": 0.073, "grad_norm": 42.98993238073801, "kl_loss_13": 10366.4, "kl_loss_26": 9057.6, "kl_loss_39": 8861.6, "kl_loss_7": 11059.2, "learning_rate": 0.0009900412805461966, "loss": 19744.8, "step": 730 }, { "ce_loss_13": 6.4397171378135685, "ce_loss_26": 5.814687025547028, "ce_loss_39": 5.716581547260285, "ce_loss_52": 1.4390251755714416, "ce_loss_7": 6.779693508148194, "epoch": 0.074, "grad_norm": 42.877595561482536, "kl_loss_13": 10267.2, "kl_loss_26": 8939.2, "kl_loss_39": 8734.4, "kl_loss_7": 10982.4, "learning_rate": 0.0009897237175829927, "loss": 19478.8, "step": 740 }, { "ce_loss_13": 6.3779888391494755, "ce_loss_26": 5.756749665737152, "ce_loss_39": 5.652812826633453, "ce_loss_52": 1.4100830882787705, "ce_loss_7": 6.712257170677185, "epoch": 0.075, "grad_norm": 43.56161359476007, "kl_loss_13": 10203.2, "kl_loss_26": 8863.2, "kl_loss_39": 8649.6, "kl_loss_7": 10920.0, "learning_rate": 0.0009894012231073895, "loss": 19311.6, "step": 750 }, { "ce_loss_13": 6.351921963691711, "ce_loss_26": 5.711096298694611, "ce_loss_39": 5.613580751419067, "ce_loss_52": 1.4703039675951004, "ce_loss_7": 6.6900406837463375, "epoch": 0.076, "grad_norm": 41.581645996763, "kl_loss_13": 10056.0, "kl_loss_26": 8678.4, "kl_loss_39": 8485.6, "kl_loss_7": 10764.8, "learning_rate": 0.0009890738003669028, "loss": 19128.0, "step": 760 }, { "ce_loss_13": 6.329116785526276, "ce_loss_26": 5.685759162902832, "ce_loss_39": 5.58324785232544, "ce_loss_52": 1.4396527051925658, "ce_loss_7": 6.677184915542602, "epoch": 0.077, "grad_norm": 40.86594229703089, "kl_loss_13": 10036.8, "kl_loss_26": 8680.0, "kl_loss_39": 8467.2, "kl_loss_7": 10760.0, "learning_rate": 0.0009887414526586764, "loss": 18930.4, "step": 770 }, { "ce_loss_13": 6.279877305030823, "ce_loss_26": 5.617447376251221, "ce_loss_39": 5.507227098941803, "ce_loss_52": 1.4374216616153717, "ce_loss_7": 6.634308731555938, "epoch": 0.078, "grad_norm": 41.180826238519536, "kl_loss_13": 9923.2, "kl_loss_26": 8513.6, "kl_loss_39": 8292.8, "kl_loss_7": 10667.2, "learning_rate": 0.0009884041833294476, "loss": 18733.6, "step": 780 }, { "ce_loss_13": 6.212144470214843, "ce_loss_26": 5.565514934062958, "ce_loss_39": 5.445651924610138, "ce_loss_52": 1.4184710115194321, "ce_loss_7": 6.563052010536194, "epoch": 0.079, "grad_norm": 41.51169269505913, "kl_loss_13": 9840.0, "kl_loss_26": 8459.2, "kl_loss_39": 8207.2, "kl_loss_7": 10576.0, "learning_rate": 0.000988061995775515, "loss": 18618.8, "step": 790 }, { "ce_loss_13": 6.177972686290741, "ce_loss_26": 5.5426277875900265, "ce_loss_39": 5.430073320865631, "ce_loss_52": 1.4582359090447425, "ce_loss_7": 6.532871425151825, "epoch": 0.08, "grad_norm": 41.06171415513337, "kl_loss_13": 9713.6, "kl_loss_26": 8348.0, "kl_loss_39": 8122.4, "kl_loss_7": 10464.0, "learning_rate": 0.0009877148934427035, "loss": 18370.0, "step": 800 }, { "ce_loss_13": 6.174833989143371, "ce_loss_26": 5.505520594120026, "ce_loss_39": 5.391082692146301, "ce_loss_52": 1.4291342854499818, "ce_loss_7": 6.535899603366852, "epoch": 0.081, "grad_norm": 40.55915083586062, "kl_loss_13": 9748.8, "kl_loss_26": 8332.0, "kl_loss_39": 8094.4, "kl_loss_7": 10502.4, "learning_rate": 0.0009873628798263297, "loss": 18197.2, "step": 810 }, { "ce_loss_13": 6.106976389884949, "ce_loss_26": 5.425193250179291, "ce_loss_39": 5.297831201553345, "ce_loss_52": 1.4520869970321655, "ce_loss_7": 6.4676952958106995, "epoch": 0.082, "grad_norm": 39.176828574493044, "kl_loss_13": 9564.8, "kl_loss_26": 8108.0, "kl_loss_39": 7852.0, "kl_loss_7": 10324.8, "learning_rate": 0.0009870059584711668, "loss": 17988.4, "step": 820 }, { "ce_loss_13": 6.029178476333618, "ce_loss_26": 5.369020164012909, "ce_loss_39": 5.247223997116089, "ce_loss_52": 1.4342376589775085, "ce_loss_7": 6.38949601650238, "epoch": 0.083, "grad_norm": 41.3023886018674, "kl_loss_13": 9422.4, "kl_loss_26": 8008.8, "kl_loss_39": 7756.0, "kl_loss_7": 10184.0, "learning_rate": 0.000986644132971409, "loss": 17788.4, "step": 830 }, { "ce_loss_13": 6.009692323207855, "ce_loss_26": 5.3266006231307985, "ce_loss_39": 5.202202546596527, "ce_loss_52": 1.4376018613576889, "ce_loss_7": 6.372254419326782, "epoch": 0.084, "grad_norm": 39.84971146906691, "kl_loss_13": 9387.2, "kl_loss_26": 7916.8, "kl_loss_39": 7663.2, "kl_loss_7": 10155.2, "learning_rate": 0.0009862774069706345, "loss": 17687.8, "step": 840 }, { "ce_loss_13": 5.948546409606934, "ce_loss_26": 5.290343832969666, "ce_loss_39": 5.16569093465805, "ce_loss_52": 1.4315639585256577, "ce_loss_7": 6.303727805614471, "epoch": 0.085, "grad_norm": 38.79997549953815, "kl_loss_13": 9260.8, "kl_loss_26": 7848.0, "kl_loss_39": 7593.6, "kl_loss_7": 10009.6, "learning_rate": 0.000985905784161771, "loss": 17478.4, "step": 850 }, { "ce_loss_13": 5.976463770866394, "ce_loss_26": 5.285696280002594, "ce_loss_39": 5.158673858642578, "ce_loss_52": 1.4285172358155251, "ce_loss_7": 6.345685577392578, "epoch": 0.086, "grad_norm": 39.11215287158734, "kl_loss_13": 9323.2, "kl_loss_26": 7843.2, "kl_loss_39": 7588.0, "kl_loss_7": 10100.8, "learning_rate": 0.000985529268287055, "loss": 17353.8, "step": 860 }, { "ce_loss_13": 5.890017306804657, "ce_loss_26": 5.188625490665435, "ce_loss_39": 5.061676156520844, "ce_loss_52": 1.427770259976387, "ce_loss_7": 6.267517876625061, "epoch": 0.087, "grad_norm": 38.38012767193544, "kl_loss_13": 9177.6, "kl_loss_26": 7678.4, "kl_loss_39": 7415.2, "kl_loss_7": 9971.2, "learning_rate": 0.0009851478631379982, "loss": 17143.4, "step": 870 }, { "ce_loss_13": 5.8172935247421265, "ce_loss_26": 5.092825090885162, "ce_loss_39": 4.964837598800659, "ce_loss_52": 1.3596146881580353, "ce_loss_7": 6.200971674919129, "epoch": 0.088, "grad_norm": 38.67673909990335, "kl_loss_13": 9150.4, "kl_loss_26": 7612.8, "kl_loss_39": 7350.4, "kl_loss_7": 9947.2, "learning_rate": 0.0009847615725553456, "loss": 17046.8, "step": 880 }, { "ce_loss_13": 5.872656679153442, "ce_loss_26": 5.153263211250305, "ce_loss_39": 5.00926034450531, "ce_loss_52": 1.4231197819113732, "ce_loss_7": 6.253647100925446, "epoch": 0.089, "grad_norm": 38.12938597789528, "kl_loss_13": 9128.0, "kl_loss_26": 7595.2, "kl_loss_39": 7309.6, "kl_loss_7": 9923.2, "learning_rate": 0.0009843704004290394, "loss": 16917.8, "step": 890 }, { "ce_loss_13": 5.798008918762207, "ce_loss_26": 5.091720676422119, "ce_loss_39": 4.938081228733063, "ce_loss_52": 1.4382835403084755, "ce_loss_7": 6.171303284168244, "epoch": 0.09, "grad_norm": 37.16474091329711, "kl_loss_13": 8936.0, "kl_loss_26": 7427.2, "kl_loss_39": 7121.6, "kl_loss_7": 9726.4, "learning_rate": 0.0009839743506981783, "loss": 16656.0, "step": 900 }, { "ce_loss_13": 5.82405720949173, "ce_loss_26": 5.096203732490539, "ce_loss_39": 4.965065968036652, "ce_loss_52": 1.4636528208851813, "ce_loss_7": 6.201912236213684, "epoch": 0.091, "grad_norm": 36.36947394693034, "kl_loss_13": 8939.2, "kl_loss_26": 7354.4, "kl_loss_39": 7080.0, "kl_loss_7": 9747.2, "learning_rate": 0.0009835734273509786, "loss": 16529.0, "step": 910 }, { "ce_loss_13": 5.734641706943512, "ce_loss_26": 5.011995434761047, "ce_loss_39": 4.85785391330719, "ce_loss_52": 1.4457294046878815, "ce_loss_7": 6.11446977853775, "epoch": 0.092, "grad_norm": 36.64530545748263, "kl_loss_13": 8840.8, "kl_loss_26": 7284.8, "kl_loss_39": 6967.2, "kl_loss_7": 9636.8, "learning_rate": 0.0009831676344247342, "loss": 16343.8, "step": 920 }, { "ce_loss_13": 5.681211936473846, "ce_loss_26": 4.933374917507171, "ce_loss_39": 4.788007187843323, "ce_loss_52": 1.3833691507577897, "ce_loss_7": 6.066722130775451, "epoch": 0.093, "grad_norm": 37.601960547328346, "kl_loss_13": 8819.2, "kl_loss_26": 7232.8, "kl_loss_39": 6926.4, "kl_loss_7": 9620.8, "learning_rate": 0.0009827569760057755, "loss": 16262.8, "step": 930 }, { "ce_loss_13": 5.685943353176117, "ce_loss_26": 4.954251933097839, "ce_loss_39": 4.7964679479599, "ce_loss_52": 1.4205988943576813, "ce_loss_7": 6.062719237804413, "epoch": 0.094, "grad_norm": 34.90292075240395, "kl_loss_13": 8707.2, "kl_loss_26": 7154.4, "kl_loss_39": 6841.6, "kl_loss_7": 9496.0, "learning_rate": 0.000982341456229428, "loss": 16011.8, "step": 940 }, { "ce_loss_13": 5.640336573123932, "ce_loss_26": 4.914572691917419, "ce_loss_39": 4.756339108943939, "ce_loss_52": 1.46774483025074, "ce_loss_7": 6.0160892605781555, "epoch": 0.095, "grad_norm": 35.81614055285293, "kl_loss_13": 8553.6, "kl_loss_26": 6999.2, "kl_loss_39": 6671.2, "kl_loss_7": 9348.8, "learning_rate": 0.000981921079279971, "loss": 15864.8, "step": 950 }, { "ce_loss_13": 5.622566449642181, "ce_loss_26": 4.879516458511352, "ce_loss_39": 4.717178559303283, "ce_loss_52": 1.4239765584468842, "ce_loss_7": 5.998941457271576, "epoch": 0.096, "grad_norm": 35.60374709072334, "kl_loss_13": 8597.6, "kl_loss_26": 7007.2, "kl_loss_39": 6670.4, "kl_loss_7": 9384.0, "learning_rate": 0.0009814958493905962, "loss": 15764.6, "step": 960 }, { "ce_loss_13": 5.552615082263946, "ce_loss_26": 4.812614411115646, "ce_loss_39": 4.667081838846206, "ce_loss_52": 1.4328487768769265, "ce_loss_7": 5.935272622108459, "epoch": 0.097, "grad_norm": 34.10755627830643, "kl_loss_13": 8454.4, "kl_loss_26": 6880.8, "kl_loss_39": 6574.4, "kl_loss_7": 9254.4, "learning_rate": 0.0009810657708433637, "loss": 15541.8, "step": 970 }, { "ce_loss_13": 5.534706914424897, "ce_loss_26": 4.787809383869171, "ce_loss_39": 4.631017792224884, "ce_loss_52": 1.4364599764347077, "ce_loss_7": 5.910705745220184, "epoch": 0.098, "grad_norm": 33.30412559481911, "kl_loss_13": 8426.4, "kl_loss_26": 6818.4, "kl_loss_39": 6496.0, "kl_loss_7": 9220.8, "learning_rate": 0.0009806308479691594, "loss": 15486.2, "step": 980 }, { "ce_loss_13": 5.455054485797882, "ce_loss_26": 4.712761473655701, "ce_loss_39": 4.549750781059265, "ce_loss_52": 1.442040067911148, "ce_loss_7": 5.841645193099976, "epoch": 0.099, "grad_norm": 34.061312435089185, "kl_loss_13": 8225.6, "kl_loss_26": 6628.8, "kl_loss_39": 6296.8, "kl_loss_7": 9036.8, "learning_rate": 0.0009801910851476522, "loss": 15382.2, "step": 990 }, { "ce_loss_13": 5.4525358319282535, "ce_loss_26": 4.71786208152771, "ce_loss_39": 4.559128785133362, "ce_loss_52": 1.4428577244281768, "ce_loss_7": 5.826938045024872, "epoch": 0.1, "grad_norm": 33.6953047069211, "kl_loss_13": 8196.0, "kl_loss_26": 6628.8, "kl_loss_39": 6297.6, "kl_loss_7": 8995.2, "learning_rate": 0.0009797464868072487, "loss": 15156.4, "step": 1000 }, { "ce_loss_13": 5.445610964298249, "ce_loss_26": 4.6739885926246645, "ce_loss_39": 4.508283615112305, "ce_loss_52": 1.4168697059154511, "ce_loss_7": 5.834221661090851, "epoch": 0.101, "grad_norm": 32.70197394198742, "kl_loss_13": 8233.6, "kl_loss_26": 6591.2, "kl_loss_39": 6246.4, "kl_loss_7": 9051.2, "learning_rate": 0.0009792970574250492, "loss": 14993.6, "step": 1010 }, { "ce_loss_13": 5.368366336822509, "ce_loss_26": 4.5666680335998535, "ce_loss_39": 4.400501304864884, "ce_loss_52": 1.3814861461520196, "ce_loss_7": 5.762167453765869, "epoch": 0.102, "grad_norm": 32.28847151546708, "kl_loss_13": 8145.6, "kl_loss_26": 6444.0, "kl_loss_39": 6111.2, "kl_loss_7": 8969.6, "learning_rate": 0.0009788428015268028, "loss": 14902.6, "step": 1020 }, { "ce_loss_13": 5.416829228401184, "ce_loss_26": 4.672497856616974, "ce_loss_39": 4.5039793968200685, "ce_loss_52": 1.4590631812810897, "ce_loss_7": 5.7889638304710385, "epoch": 0.103, "grad_norm": 32.94054692999689, "kl_loss_13": 8076.0, "kl_loss_26": 6480.0, "kl_loss_39": 6138.4, "kl_loss_7": 8867.2, "learning_rate": 0.0009783837236868609, "loss": 14715.4, "step": 1030 }, { "ce_loss_13": 5.323912274837494, "ce_loss_26": 4.551275789737701, "ce_loss_39": 4.376495039463043, "ce_loss_52": 1.4376014918088913, "ce_loss_7": 5.712179851531983, "epoch": 0.104, "grad_norm": 32.580444775774126, "kl_loss_13": 7934.4, "kl_loss_26": 6285.6, "kl_loss_39": 5924.0, "kl_loss_7": 8752.0, "learning_rate": 0.0009779198285281327, "loss": 14586.6, "step": 1040 }, { "ce_loss_13": 5.36049393415451, "ce_loss_26": 4.592142331600189, "ce_loss_39": 4.4142293453216555, "ce_loss_52": 1.4498057544231415, "ce_loss_7": 5.73596283197403, "epoch": 0.105, "grad_norm": 31.80216030064833, "kl_loss_13": 7976.8, "kl_loss_26": 6341.6, "kl_loss_39": 5988.8, "kl_loss_7": 8771.2, "learning_rate": 0.0009774511207220368, "loss": 14415.8, "step": 1050 }, { "ce_loss_13": 5.3308478713035585, "ce_loss_26": 4.5750040173530575, "ce_loss_39": 4.39155302643776, "ce_loss_52": 1.4785875469446181, "ce_loss_7": 5.713631689548492, "epoch": 0.106, "grad_norm": 31.180344664143906, "kl_loss_13": 7883.2, "kl_loss_26": 6267.2, "kl_loss_39": 5887.2, "kl_loss_7": 8687.2, "learning_rate": 0.0009769776049884564, "loss": 14270.6, "step": 1060 }, { "ce_loss_13": 5.3476661205291744, "ce_loss_26": 4.561805117130279, "ce_loss_39": 4.391524451971054, "ce_loss_52": 1.4550457745790482, "ce_loss_7": 5.725461614131928, "epoch": 0.107, "grad_norm": 30.9396118714078, "kl_loss_13": 7968.0, "kl_loss_26": 6308.0, "kl_loss_39": 5942.4, "kl_loss_7": 8754.4, "learning_rate": 0.0009764992860956889, "loss": 14268.8, "step": 1070 }, { "ce_loss_13": 5.248398435115814, "ce_loss_26": 4.472868782281876, "ce_loss_39": 4.294939804077148, "ce_loss_52": 1.4237870454788208, "ce_loss_7": 5.6272268176078795, "epoch": 0.108, "grad_norm": 30.45094259940441, "kl_loss_13": 7834.4, "kl_loss_26": 6169.6, "kl_loss_39": 5812.0, "kl_loss_7": 8620.8, "learning_rate": 0.0009760161688604008, "loss": 14058.8, "step": 1080 }, { "ce_loss_13": 5.167715132236481, "ce_loss_26": 4.421313828229904, "ce_loss_39": 4.241793435811997, "ce_loss_52": 1.4617454051971435, "ce_loss_7": 5.536553728580475, "epoch": 0.109, "grad_norm": 30.37346177682711, "kl_loss_13": 7596.8, "kl_loss_26": 5987.2, "kl_loss_39": 5622.4, "kl_loss_7": 8367.2, "learning_rate": 0.0009755282581475768, "loss": 13946.4, "step": 1090 }, { "ce_loss_13": 5.21688460111618, "ce_loss_26": 4.442314791679382, "ce_loss_39": 4.254946118593216, "ce_loss_52": 1.4511815324425696, "ce_loss_7": 5.593706953525543, "epoch": 0.11, "grad_norm": 30.665307755441862, "kl_loss_13": 7699.2, "kl_loss_26": 6030.4, "kl_loss_39": 5652.0, "kl_loss_7": 8485.6, "learning_rate": 0.0009750355588704727, "loss": 13825.8, "step": 1100 }, { "ce_loss_13": 5.112356352806091, "ce_loss_26": 4.310530138015747, "ce_loss_39": 4.125328695774078, "ce_loss_52": 1.4114144504070283, "ce_loss_7": 5.487049925327301, "epoch": 0.111, "grad_norm": 29.407721301071028, "kl_loss_13": 7569.6, "kl_loss_26": 5844.8, "kl_loss_39": 5470.4, "kl_loss_7": 8359.2, "learning_rate": 0.0009745380759905647, "loss": 13627.8, "step": 1110 }, { "ce_loss_13": 5.087459588050843, "ce_loss_26": 4.285146009922028, "ce_loss_39": 4.101419150829315, "ce_loss_52": 1.3823944509029389, "ce_loss_7": 5.47238245010376, "epoch": 0.112, "grad_norm": 28.691648596064702, "kl_loss_13": 7563.2, "kl_loss_26": 5855.2, "kl_loss_39": 5476.0, "kl_loss_7": 8378.4, "learning_rate": 0.0009740358145174998, "loss": 13629.4, "step": 1120 }, { "ce_loss_13": 5.085049080848694, "ce_loss_26": 4.283704102039337, "ce_loss_39": 4.095863288640976, "ce_loss_52": 1.4323463156819343, "ce_loss_7": 5.455269980430603, "epoch": 0.113, "grad_norm": 28.708307757554874, "kl_loss_13": 7468.0, "kl_loss_26": 5755.2, "kl_loss_39": 5364.0, "kl_loss_7": 8244.8, "learning_rate": 0.0009735287795090455, "loss": 13475.0, "step": 1130 }, { "ce_loss_13": 4.9704699397087095, "ce_loss_26": 4.164930063486099, "ce_loss_39": 3.981805819272995, "ce_loss_52": 1.3983336806297302, "ce_loss_7": 5.351285874843597, "epoch": 0.114, "grad_norm": 30.611214200807577, "kl_loss_13": 7311.2, "kl_loss_26": 5585.6, "kl_loss_39": 5199.2, "kl_loss_7": 8099.2, "learning_rate": 0.0009730169760710386, "loss": 13288.2, "step": 1140 }, { "ce_loss_13": 5.094941341876984, "ce_loss_26": 4.275070035457611, "ce_loss_39": 4.084649866819381, "ce_loss_52": 1.4357529014348984, "ce_loss_7": 5.482879185676575, "epoch": 0.115, "grad_norm": 29.988760771554233, "kl_loss_13": 7468.8, "kl_loss_26": 5715.2, "kl_loss_39": 5326.4, "kl_loss_7": 8275.2, "learning_rate": 0.0009725004093573342, "loss": 13196.6, "step": 1150 }, { "ce_loss_13": 4.942017900943756, "ce_loss_26": 4.151496112346649, "ce_loss_39": 3.957593894004822, "ce_loss_52": 1.4098813980817795, "ce_loss_7": 5.323493349552154, "epoch": 0.116, "grad_norm": 30.06505974205765, "kl_loss_13": 7237.6, "kl_loss_26": 5534.4, "kl_loss_39": 5135.2, "kl_loss_7": 8035.2, "learning_rate": 0.0009719790845697534, "loss": 13084.4, "step": 1160 }, { "ce_loss_13": 4.974001240730286, "ce_loss_26": 4.16838675737381, "ce_loss_39": 3.968938571214676, "ce_loss_52": 1.4311222642660142, "ce_loss_7": 5.3611521363258365, "epoch": 0.117, "grad_norm": 28.226442547632384, "kl_loss_13": 7241.6, "kl_loss_26": 5536.8, "kl_loss_39": 5121.6, "kl_loss_7": 8045.6, "learning_rate": 0.0009714530069580309, "loss": 12959.6, "step": 1170 }, { "ce_loss_13": 4.905927586555481, "ce_loss_26": 4.065518736839294, "ce_loss_39": 3.8746193647384644, "ce_loss_52": 1.3948067665100097, "ce_loss_7": 5.290950679779053, "epoch": 0.118, "grad_norm": 26.7183550844667, "kl_loss_13": 7163.2, "kl_loss_26": 5388.0, "kl_loss_39": 4992.8, "kl_loss_7": 7959.2, "learning_rate": 0.0009709221818197624, "loss": 12883.0, "step": 1180 }, { "ce_loss_13": 4.902862447500229, "ce_loss_26": 4.10335453748703, "ce_loss_39": 3.920765632390976, "ce_loss_52": 1.4242859303951263, "ce_loss_7": 5.280882668495178, "epoch": 0.119, "grad_norm": 27.541041012815914, "kl_loss_13": 7096.0, "kl_loss_26": 5372.8, "kl_loss_39": 4987.2, "kl_loss_7": 7895.2, "learning_rate": 0.0009703866145003512, "loss": 12755.6, "step": 1190 }, { "ce_loss_13": 4.91794501543045, "ce_loss_26": 4.094452971220017, "ce_loss_39": 3.892131644487381, "ce_loss_52": 1.4217353582382202, "ce_loss_7": 5.298339033126831, "epoch": 0.12, "grad_norm": 27.647397498896083, "kl_loss_13": 7171.2, "kl_loss_26": 5395.2, "kl_loss_39": 4985.6, "kl_loss_7": 7958.4, "learning_rate": 0.0009698463103929542, "loss": 12646.8, "step": 1200 }, { "ce_loss_13": 4.933221316337585, "ce_loss_26": 4.129461044073105, "ce_loss_39": 3.92621054649353, "ce_loss_52": 1.4751009970903397, "ce_loss_7": 5.315613615512848, "epoch": 0.121, "grad_norm": 26.42127738230431, "kl_loss_13": 7043.2, "kl_loss_26": 5317.6, "kl_loss_39": 4899.2, "kl_loss_7": 7841.6, "learning_rate": 0.0009693012749384279, "loss": 12515.8, "step": 1210 }, { "ce_loss_13": 4.865577363967896, "ce_loss_26": 4.064575934410096, "ce_loss_39": 3.863145834207535, "ce_loss_52": 1.4422448396682739, "ce_loss_7": 5.235701704025269, "epoch": 0.122, "grad_norm": 25.567127248660423, "kl_loss_13": 6988.0, "kl_loss_26": 5271.2, "kl_loss_39": 4855.2, "kl_loss_7": 7760.8, "learning_rate": 0.0009687515136252732, "loss": 12484.2, "step": 1220 }, { "ce_loss_13": 4.875257205963135, "ce_loss_26": 4.068110597133637, "ce_loss_39": 3.866461306810379, "ce_loss_52": 1.4406520485877992, "ce_loss_7": 5.254658281803131, "epoch": 0.123, "grad_norm": 25.60032726683598, "kl_loss_13": 7019.2, "kl_loss_26": 5276.8, "kl_loss_39": 4864.0, "kl_loss_7": 7809.6, "learning_rate": 0.0009681970319895803, "loss": 12358.8, "step": 1230 }, { "ce_loss_13": 4.856750476360321, "ce_loss_26": 4.064332664012909, "ce_loss_39": 3.865360552072525, "ce_loss_52": 1.4734540313482285, "ce_loss_7": 5.232936894893646, "epoch": 0.124, "grad_norm": 27.54111241294886, "kl_loss_13": 6928.8, "kl_loss_26": 5224.8, "kl_loss_39": 4806.4, "kl_loss_7": 7701.6, "learning_rate": 0.0009676378356149733, "loss": 12219.2, "step": 1240 }, { "ce_loss_13": 4.714985811710358, "ce_loss_26": 3.8911590695381166, "ce_loss_39": 3.6897071480751036, "ce_loss_52": 1.4211883068084716, "ce_loss_7": 5.092134392261505, "epoch": 0.125, "grad_norm": 26.159823932609697, "kl_loss_13": 6763.2, "kl_loss_26": 5001.6, "kl_loss_39": 4584.8, "kl_loss_7": 7549.6, "learning_rate": 0.0009670739301325534, "loss": 12043.8, "step": 1250 }, { "ce_loss_13": 4.746155381202698, "ce_loss_26": 3.9151339173316955, "ce_loss_39": 3.717781513929367, "ce_loss_52": 1.3889067679643632, "ce_loss_7": 5.118369615077972, "epoch": 0.126, "grad_norm": 27.210751367526278, "kl_loss_13": 6825.6, "kl_loss_26": 5060.0, "kl_loss_39": 4660.0, "kl_loss_7": 7612.0, "learning_rate": 0.0009665053212208426, "loss": 12020.6, "step": 1260 }, { "ce_loss_13": 4.729644465446472, "ce_loss_26": 3.8812398612499237, "ce_loss_39": 3.686249554157257, "ce_loss_52": 1.421858811378479, "ce_loss_7": 5.109574091434479, "epoch": 0.127, "grad_norm": 24.75325655489569, "kl_loss_13": 6772.8, "kl_loss_26": 4955.6, "kl_loss_39": 4553.2, "kl_loss_7": 7572.8, "learning_rate": 0.0009659320146057262, "loss": 11949.0, "step": 1270 }, { "ce_loss_13": 4.706896722316742, "ce_loss_26": 3.884850525856018, "ce_loss_39": 3.679063153266907, "ce_loss_52": 1.4093473598361015, "ce_loss_7": 5.0908261895179745, "epoch": 0.128, "grad_norm": 25.78294847436066, "kl_loss_13": 6728.0, "kl_loss_26": 4984.8, "kl_loss_39": 4562.4, "kl_loss_7": 7525.6, "learning_rate": 0.0009653540160603955, "loss": 11920.8, "step": 1280 }, { "ce_loss_13": 4.68510691523552, "ce_loss_26": 3.886442297697067, "ce_loss_39": 3.682328295707703, "ce_loss_52": 1.4646018967032433, "ce_loss_7": 5.063843679428101, "epoch": 0.129, "grad_norm": 24.870529388647757, "kl_loss_13": 6587.2, "kl_loss_26": 4871.2, "kl_loss_39": 4451.6, "kl_loss_7": 7384.8, "learning_rate": 0.0009647713314052896, "loss": 11716.1, "step": 1290 }, { "ce_loss_13": 4.6914472579956055, "ce_loss_26": 3.8814328253269195, "ce_loss_39": 3.6700519025325775, "ce_loss_52": 1.428275752067566, "ce_loss_7": 5.065358865261078, "epoch": 0.13, "grad_norm": 24.506148720784267, "kl_loss_13": 6617.6, "kl_loss_26": 4886.4, "kl_loss_39": 4454.8, "kl_loss_7": 7397.6, "learning_rate": 0.0009641839665080363, "loss": 11621.0, "step": 1300 }, { "ce_loss_13": 4.666101861000061, "ce_loss_26": 3.8607113540172575, "ce_loss_39": 3.6500234425067903, "ce_loss_52": 1.4570627421140672, "ce_loss_7": 5.0361028671264645, "epoch": 0.131, "grad_norm": 23.043331928969636, "kl_loss_13": 6562.4, "kl_loss_26": 4820.0, "kl_loss_39": 4389.2, "kl_loss_7": 7340.0, "learning_rate": 0.0009635919272833937, "loss": 11547.2, "step": 1310 }, { "ce_loss_13": 4.57597508430481, "ce_loss_26": 3.757897812128067, "ce_loss_39": 3.541293317079544, "ce_loss_52": 1.417707359790802, "ce_loss_7": 4.957513523101807, "epoch": 0.132, "grad_norm": 23.874048124377556, "kl_loss_13": 6413.6, "kl_loss_26": 4680.0, "kl_loss_39": 4232.4, "kl_loss_7": 7214.4, "learning_rate": 0.0009629952196931902, "loss": 11465.6, "step": 1320 }, { "ce_loss_13": 4.599424958229065, "ce_loss_26": 3.7814753651618958, "ce_loss_39": 3.5756381869316103, "ce_loss_52": 1.435066069662571, "ce_loss_7": 4.962808167934417, "epoch": 0.133, "grad_norm": 27.163150075748632, "kl_loss_13": 6461.6, "kl_loss_26": 4719.2, "kl_loss_39": 4284.8, "kl_loss_7": 7222.4, "learning_rate": 0.0009623938497462645, "loss": 11415.0, "step": 1330 }, { "ce_loss_13": 4.593182015419006, "ce_loss_26": 3.749741864204407, "ce_loss_39": 3.543072348833084, "ce_loss_52": 1.4210506305098534, "ce_loss_7": 4.98070273399353, "epoch": 0.134, "grad_norm": 23.74960078438379, "kl_loss_13": 6465.6, "kl_loss_26": 4674.0, "kl_loss_39": 4258.0, "kl_loss_7": 7272.8, "learning_rate": 0.0009617878234984055, "loss": 11286.2, "step": 1340 }, { "ce_loss_13": 4.5947358965873715, "ce_loss_26": 3.779453754425049, "ce_loss_39": 3.554258805513382, "ce_loss_52": 1.440669310092926, "ce_loss_7": 4.96803480386734, "epoch": 0.135, "grad_norm": 24.32368060127727, "kl_loss_13": 6402.4, "kl_loss_26": 4660.8, "kl_loss_39": 4202.0, "kl_loss_7": 7178.4, "learning_rate": 0.0009611771470522907, "loss": 11138.4, "step": 1350 }, { "ce_loss_13": 4.540663009881973, "ce_loss_26": 3.7292558193206786, "ce_loss_39": 3.5131251573562623, "ce_loss_52": 1.4146902561187744, "ce_loss_7": 4.918925869464874, "epoch": 0.136, "grad_norm": 23.85034387543973, "kl_loss_13": 6383.2, "kl_loss_26": 4642.0, "kl_loss_39": 4189.2, "kl_loss_7": 7181.6, "learning_rate": 0.0009605618265574251, "loss": 11195.2, "step": 1360 }, { "ce_loss_13": 4.595166695117951, "ce_loss_26": 3.7860535979270935, "ce_loss_39": 3.5680083632469177, "ce_loss_52": 1.4854394227266312, "ce_loss_7": 4.9739551663398744, "epoch": 0.137, "grad_norm": 23.647990920122997, "kl_loss_13": 6340.0, "kl_loss_26": 4612.0, "kl_loss_39": 4162.0, "kl_loss_7": 7127.2, "learning_rate": 0.0009599418682100792, "loss": 11028.6, "step": 1370 }, { "ce_loss_13": 4.481674873828888, "ce_loss_26": 3.655723828077316, "ce_loss_39": 3.4441386282444, "ce_loss_52": 1.402228906750679, "ce_loss_7": 4.8588902950286865, "epoch": 0.138, "grad_norm": 23.628756362977956, "kl_loss_13": 6284.8, "kl_loss_26": 4506.8, "kl_loss_39": 4074.4, "kl_loss_7": 7073.6, "learning_rate": 0.0009593172782532268, "loss": 10976.4, "step": 1380 }, { "ce_loss_13": 4.446731185913086, "ce_loss_26": 3.6488849222660065, "ce_loss_39": 3.4283725559711455, "ce_loss_52": 1.425352481007576, "ce_loss_7": 4.823254930973053, "epoch": 0.139, "grad_norm": 23.454937110465252, "kl_loss_13": 6190.4, "kl_loss_26": 4472.0, "kl_loss_39": 4019.6, "kl_loss_7": 6972.0, "learning_rate": 0.0009586880629764817, "loss": 10856.2, "step": 1390 }, { "ce_loss_13": 4.481068539619446, "ce_loss_26": 3.6307739317417145, "ce_loss_39": 3.4139047265052795, "ce_loss_52": 1.3980510637164116, "ce_loss_7": 4.877132707834244, "epoch": 0.14, "grad_norm": 24.701369227992412, "kl_loss_13": 6300.0, "kl_loss_26": 4496.4, "kl_loss_39": 4045.6, "kl_loss_7": 7120.0, "learning_rate": 0.0009580542287160348, "loss": 10848.4, "step": 1400 }, { "ce_loss_13": 4.481135439872742, "ce_loss_26": 3.684358072280884, "ce_loss_39": 3.454758608341217, "ce_loss_52": 1.459119439125061, "ce_loss_7": 4.855645072460175, "epoch": 0.141, "grad_norm": 24.081142665128635, "kl_loss_13": 6146.4, "kl_loss_26": 4455.2, "kl_loss_39": 3985.2, "kl_loss_7": 6936.0, "learning_rate": 0.0009574157818545901, "loss": 10711.8, "step": 1410 }, { "ce_loss_13": 4.440946173667908, "ce_loss_26": 3.6337361335754395, "ce_loss_39": 3.404258185625076, "ce_loss_52": 1.411030687391758, "ce_loss_7": 4.816100597381592, "epoch": 0.142, "grad_norm": 22.547443199755673, "kl_loss_13": 6183.2, "kl_loss_26": 4461.2, "kl_loss_39": 3991.6, "kl_loss_7": 6976.0, "learning_rate": 0.0009567727288213005, "loss": 10724.8, "step": 1420 }, { "ce_loss_13": 4.457812869548798, "ce_loss_26": 3.6864565014839172, "ce_loss_39": 3.452153670787811, "ce_loss_52": 1.4788728266954423, "ce_loss_7": 4.832965791225433, "epoch": 0.143, "grad_norm": 22.827900847688525, "kl_loss_13": 6096.8, "kl_loss_26": 4430.4, "kl_loss_39": 3959.2, "kl_loss_7": 6881.6, "learning_rate": 0.0009561250760917027, "loss": 10616.2, "step": 1430 }, { "ce_loss_13": 4.386147284507752, "ce_loss_26": 3.586784356832504, "ce_loss_39": 3.356312555074692, "ce_loss_52": 1.4125748693943023, "ce_loss_7": 4.7688825011253355, "epoch": 0.144, "grad_norm": 22.45947089806503, "kl_loss_13": 6064.0, "kl_loss_26": 4354.0, "kl_loss_39": 3886.8, "kl_loss_7": 6859.2, "learning_rate": 0.0009554728301876525, "loss": 10473.0, "step": 1440 }, { "ce_loss_13": 4.411167800426483, "ce_loss_26": 3.590187501907349, "ce_loss_39": 3.3659981071949003, "ce_loss_52": 1.4227360993623734, "ce_loss_7": 4.80015469789505, "epoch": 0.145, "grad_norm": 22.134471970609756, "kl_loss_13": 6080.0, "kl_loss_26": 4321.2, "kl_loss_39": 3868.4, "kl_loss_7": 6892.0, "learning_rate": 0.0009548159976772592, "loss": 10449.2, "step": 1450 }, { "ce_loss_13": 4.304250085353852, "ce_loss_26": 3.5215473413467406, "ce_loss_39": 3.3020472466945647, "ce_loss_52": 1.455188724398613, "ce_loss_7": 4.682382225990295, "epoch": 0.146, "grad_norm": 23.296949813068704, "kl_loss_13": 5828.8, "kl_loss_26": 4153.6, "kl_loss_39": 3701.2, "kl_loss_7": 6626.4, "learning_rate": 0.0009541545851748186, "loss": 10336.2, "step": 1460 }, { "ce_loss_13": 4.340453952550888, "ce_loss_26": 3.527716559171677, "ce_loss_39": 3.297034960985184, "ce_loss_52": 1.4175047695636749, "ce_loss_7": 4.720600801706314, "epoch": 0.147, "grad_norm": 23.69989338145452, "kl_loss_13": 5933.6, "kl_loss_26": 4197.2, "kl_loss_39": 3732.0, "kl_loss_7": 6732.8, "learning_rate": 0.0009534885993407473, "loss": 10320.4, "step": 1470 }, { "ce_loss_13": 4.316350519657135, "ce_loss_26": 3.522084206342697, "ce_loss_39": 3.293704879283905, "ce_loss_52": 1.4351435631513596, "ce_loss_7": 4.694415915012359, "epoch": 0.148, "grad_norm": 23.155946924290426, "kl_loss_13": 5859.2, "kl_loss_26": 4177.2, "kl_loss_39": 3708.0, "kl_loss_7": 6649.6, "learning_rate": 0.0009528180468815154, "loss": 10227.4, "step": 1480 }, { "ce_loss_13": 4.34768191576004, "ce_loss_26": 3.5612153470516206, "ce_loss_39": 3.3256225168704985, "ce_loss_52": 1.4718306064605713, "ce_loss_7": 4.73325879573822, "epoch": 0.149, "grad_norm": 22.828288355166595, "kl_loss_13": 5840.8, "kl_loss_26": 4173.2, "kl_loss_39": 3699.2, "kl_loss_7": 6654.4, "learning_rate": 0.0009521429345495787, "loss": 10213.0, "step": 1490 }, { "ce_loss_13": 4.285507726669311, "ce_loss_26": 3.494914507865906, "ce_loss_39": 3.2638841211795806, "ce_loss_52": 1.4394250243902207, "ce_loss_7": 4.674430012702942, "epoch": 0.15, "grad_norm": 22.358102006612796, "kl_loss_13": 5802.4, "kl_loss_26": 4114.4, "kl_loss_39": 3639.2, "kl_loss_7": 6619.2, "learning_rate": 0.0009514632691433108, "loss": 10144.0, "step": 1500 }, { "ce_loss_13": 4.274980753660202, "ce_loss_26": 3.4573559522628785, "ce_loss_39": 3.2235658168792725, "ce_loss_52": 1.4003299355506897, "ce_loss_7": 4.676058840751648, "epoch": 0.151, "grad_norm": 21.54131953317247, "kl_loss_13": 5876.0, "kl_loss_26": 4120.4, "kl_loss_39": 3643.6, "kl_loss_7": 6711.2, "learning_rate": 0.0009507790575069346, "loss": 10084.8, "step": 1510 }, { "ce_loss_13": 4.239670622348785, "ce_loss_26": 3.4528944075107573, "ce_loss_39": 3.221757102012634, "ce_loss_52": 1.4378845229744912, "ce_loss_7": 4.625989091396332, "epoch": 0.152, "grad_norm": 20.883193615641517, "kl_loss_13": 5700.0, "kl_loss_26": 4031.2, "kl_loss_39": 3560.0, "kl_loss_7": 6514.4, "learning_rate": 0.0009500903065304539, "loss": 9981.0, "step": 1520 }, { "ce_loss_13": 4.250718909502029, "ce_loss_26": 3.4539793133735657, "ce_loss_39": 3.2287534534931184, "ce_loss_52": 1.450673970580101, "ce_loss_7": 4.637939321994781, "epoch": 0.153, "grad_norm": 21.937587942461658, "kl_loss_13": 5717.6, "kl_loss_26": 4016.0, "kl_loss_39": 3551.6, "kl_loss_7": 6520.0, "learning_rate": 0.0009493970231495835, "loss": 9886.2, "step": 1530 }, { "ce_loss_13": 4.213818311691284, "ce_loss_26": 3.4225172460079194, "ce_loss_39": 3.192109799385071, "ce_loss_52": 1.4253905564546585, "ce_loss_7": 4.591876769065857, "epoch": 0.154, "grad_norm": 22.11614774484019, "kl_loss_13": 5669.6, "kl_loss_26": 3997.6, "kl_loss_39": 3517.6, "kl_loss_7": 6459.2, "learning_rate": 0.0009486992143456792, "loss": 9848.6, "step": 1540 }, { "ce_loss_13": 4.188841539621353, "ce_loss_26": 3.3936797797679903, "ce_loss_39": 3.15527623295784, "ce_loss_52": 1.4304118230938911, "ce_loss_7": 4.582345807552338, "epoch": 0.155, "grad_norm": 23.528038258157196, "kl_loss_13": 5588.0, "kl_loss_26": 3908.0, "kl_loss_39": 3429.6, "kl_loss_7": 6408.8, "learning_rate": 0.0009479968871456679, "loss": 9804.0, "step": 1550 }, { "ce_loss_13": 4.199311399459839, "ce_loss_26": 3.388694739341736, "ce_loss_39": 3.161331224441528, "ce_loss_52": 1.424024721980095, "ce_loss_7": 4.598053079843521, "epoch": 0.156, "grad_norm": 20.794486798088236, "kl_loss_13": 5643.2, "kl_loss_26": 3922.0, "kl_loss_39": 3462.8, "kl_loss_7": 6473.6, "learning_rate": 0.0009472900486219768, "loss": 9758.3, "step": 1560 }, { "ce_loss_13": 4.16922065615654, "ce_loss_26": 3.3762763381004333, "ce_loss_39": 3.142381691932678, "ce_loss_52": 1.4278530597686767, "ce_loss_7": 4.5707217931747435, "epoch": 0.157, "grad_norm": 21.684568107604626, "kl_loss_13": 5582.4, "kl_loss_26": 3884.4, "kl_loss_39": 3405.2, "kl_loss_7": 6420.0, "learning_rate": 0.000946578705892462, "loss": 9625.8, "step": 1570 }, { "ce_loss_13": 4.178904807567596, "ce_loss_26": 3.382037007808685, "ce_loss_39": 3.137607681751251, "ce_loss_52": 1.4311140328645706, "ce_loss_7": 4.566913962364197, "epoch": 0.158, "grad_norm": 21.97647697577533, "kl_loss_13": 5572.8, "kl_loss_26": 3885.6, "kl_loss_39": 3393.6, "kl_loss_7": 6394.4, "learning_rate": 0.0009458628661203367, "loss": 9608.1, "step": 1580 }, { "ce_loss_13": 4.179209893941879, "ce_loss_26": 3.378256690502167, "ce_loss_39": 3.13810538649559, "ce_loss_52": 1.418858152627945, "ce_loss_7": 4.562736237049103, "epoch": 0.159, "grad_norm": 20.949460387099393, "kl_loss_13": 5612.0, "kl_loss_26": 3914.4, "kl_loss_39": 3428.8, "kl_loss_7": 6420.0, "learning_rate": 0.0009451425365140996, "loss": 9608.8, "step": 1590 }, { "ce_loss_13": 4.1623717725276945, "ce_loss_26": 3.376670056581497, "ce_loss_39": 3.1331222474575045, "ce_loss_52": 1.434694454073906, "ce_loss_7": 4.556069934368134, "epoch": 0.16, "grad_norm": 20.884587914746188, "kl_loss_13": 5573.6, "kl_loss_26": 3878.8, "kl_loss_39": 3382.4, "kl_loss_7": 6402.4, "learning_rate": 0.0009444177243274617, "loss": 9535.6, "step": 1600 }, { "ce_loss_13": 4.082578724622726, "ce_loss_26": 3.289813929796219, "ce_loss_39": 3.0515677452087404, "ce_loss_52": 1.4236672833561896, "ce_loss_7": 4.478320574760437, "epoch": 0.161, "grad_norm": 20.592533194999756, "kl_loss_13": 5423.2, "kl_loss_26": 3736.8, "kl_loss_39": 3249.6, "kl_loss_7": 6253.6, "learning_rate": 0.0009436884368592739, "loss": 9466.0, "step": 1610 }, { "ce_loss_13": 4.142659282684326, "ce_loss_26": 3.3751652896404267, "ce_loss_39": 3.137872564792633, "ce_loss_52": 1.481352651119232, "ce_loss_7": 4.527895116806031, "epoch": 0.162, "grad_norm": 21.486710542968336, "kl_loss_13": 5416.8, "kl_loss_26": 3772.8, "kl_loss_39": 3282.8, "kl_loss_7": 6228.8, "learning_rate": 0.0009429546814534529, "loss": 9367.9, "step": 1620 }, { "ce_loss_13": 4.141797959804535, "ce_loss_26": 3.344935214519501, "ce_loss_39": 3.1015843570232393, "ce_loss_52": 1.4449981674551964, "ce_loss_7": 4.53586882352829, "epoch": 0.163, "grad_norm": 22.257565389083407, "kl_loss_13": 5484.8, "kl_loss_26": 3776.8, "kl_loss_39": 3282.8, "kl_loss_7": 6309.6, "learning_rate": 0.0009422164654989072, "loss": 9391.3, "step": 1630 }, { "ce_loss_13": 4.131260120868683, "ce_loss_26": 3.3283946096897123, "ce_loss_39": 3.08265677690506, "ce_loss_52": 1.446463230252266, "ce_loss_7": 4.523310673236847, "epoch": 0.164, "grad_norm": 20.40514368262374, "kl_loss_13": 5457.6, "kl_loss_26": 3766.0, "kl_loss_39": 3265.6, "kl_loss_7": 6288.8, "learning_rate": 0.0009414737964294635, "loss": 9297.4, "step": 1640 }, { "ce_loss_13": 4.055157667398452, "ce_loss_26": 3.2639363288879393, "ce_loss_39": 3.0281569600105285, "ce_loss_52": 1.451804205775261, "ce_loss_7": 4.447070962190628, "epoch": 0.165, "grad_norm": 21.902920394223948, "kl_loss_13": 5323.2, "kl_loss_26": 3634.0, "kl_loss_39": 3150.0, "kl_loss_7": 6143.2, "learning_rate": 0.000940726681723791, "loss": 9207.4, "step": 1650 }, { "ce_loss_13": 3.9808266043663023, "ce_loss_26": 3.1883151113986967, "ce_loss_39": 2.9586188077926634, "ce_loss_52": 1.4088758006691933, "ce_loss_7": 4.372277349233627, "epoch": 0.166, "grad_norm": 21.183658337296244, "kl_loss_13": 5256.0, "kl_loss_26": 3567.6, "kl_loss_39": 3100.8, "kl_loss_7": 6079.2, "learning_rate": 0.0009399751289053266, "loss": 9204.0, "step": 1660 }, { "ce_loss_13": 4.0190062642097475, "ce_loss_26": 3.22450470328331, "ce_loss_39": 2.987401658296585, "ce_loss_52": 1.3997518077492714, "ce_loss_7": 4.41674884557724, "epoch": 0.167, "grad_norm": 21.809854839151214, "kl_loss_13": 5346.4, "kl_loss_26": 3648.8, "kl_loss_39": 3164.8, "kl_loss_7": 6175.2, "learning_rate": 0.0009392191455421988, "loss": 9183.3, "step": 1670 }, { "ce_loss_13": 3.9647205591201784, "ce_loss_26": 3.1920640766620636, "ce_loss_39": 2.951520323753357, "ce_loss_52": 1.385107731819153, "ce_loss_7": 4.359467995166779, "epoch": 0.168, "grad_norm": 20.368238654152925, "kl_loss_13": 5256.8, "kl_loss_26": 3602.8, "kl_loss_39": 3118.0, "kl_loss_7": 6080.8, "learning_rate": 0.0009384587392471515, "loss": 9080.3, "step": 1680 }, { "ce_loss_13": 3.9925671815872192, "ce_loss_26": 3.2080724120140074, "ce_loss_39": 2.9695263385772703, "ce_loss_52": 1.4156971365213393, "ce_loss_7": 4.380452990531921, "epoch": 0.169, "grad_norm": 20.999177946491915, "kl_loss_13": 5268.0, "kl_loss_26": 3596.4, "kl_loss_39": 3100.8, "kl_loss_7": 6080.8, "learning_rate": 0.0009376939176774678, "loss": 8989.5, "step": 1690 }, { "ce_loss_13": 4.0180779755115505, "ce_loss_26": 3.258928191661835, "ce_loss_39": 3.0135749876499176, "ce_loss_52": 1.450287464261055, "ce_loss_7": 4.408803248405457, "epoch": 0.17, "grad_norm": 19.79885149243747, "kl_loss_13": 5224.8, "kl_loss_26": 3590.8, "kl_loss_39": 3092.0, "kl_loss_7": 6036.8, "learning_rate": 0.0009369246885348925, "loss": 8994.3, "step": 1700 }, { "ce_loss_13": 4.005008333921433, "ce_loss_26": 3.2108667314052584, "ce_loss_39": 2.9691853642463686, "ce_loss_52": 1.4231860041618347, "ce_loss_7": 4.408407872915268, "epoch": 0.171, "grad_norm": 20.19997348661953, "kl_loss_13": 5275.2, "kl_loss_26": 3580.8, "kl_loss_39": 3088.8, "kl_loss_7": 6116.8, "learning_rate": 0.0009361510595655545, "loss": 9032.7, "step": 1710 }, { "ce_loss_13": 4.0295430123806, "ce_loss_26": 3.269349628686905, "ce_loss_39": 3.0168069303035736, "ce_loss_52": 1.4558824241161346, "ce_loss_7": 4.418658912181854, "epoch": 0.172, "grad_norm": 20.29063277895484, "kl_loss_13": 5251.2, "kl_loss_26": 3624.8, "kl_loss_39": 3109.2, "kl_loss_7": 6066.4, "learning_rate": 0.0009353730385598887, "loss": 8917.6, "step": 1720 }, { "ce_loss_13": 3.904751992225647, "ce_loss_26": 3.117345708608627, "ce_loss_39": 2.8736896753311156, "ce_loss_52": 1.404754376411438, "ce_loss_7": 4.301223260164261, "epoch": 0.173, "grad_norm": 21.33168840347063, "kl_loss_13": 5095.2, "kl_loss_26": 3414.0, "kl_loss_39": 2926.0, "kl_loss_7": 5920.8, "learning_rate": 0.0009345906333525581, "loss": 8827.0, "step": 1730 }, { "ce_loss_13": 3.943844336271286, "ce_loss_26": 3.192184156179428, "ce_loss_39": 2.938348424434662, "ce_loss_52": 1.4280656158924103, "ce_loss_7": 4.340347635746002, "epoch": 0.174, "grad_norm": 20.78517763533083, "kl_loss_13": 5121.6, "kl_loss_26": 3512.0, "kl_loss_39": 2997.2, "kl_loss_7": 5951.2, "learning_rate": 0.0009338038518223745, "loss": 8776.4, "step": 1740 }, { "ce_loss_13": 3.9746175587177275, "ce_loss_26": 3.217360532283783, "ce_loss_39": 2.9738565742969514, "ce_loss_52": 1.460440719127655, "ce_loss_7": 4.36298366189003, "epoch": 0.175, "grad_norm": 22.95282400415446, "kl_loss_13": 5116.0, "kl_loss_26": 3500.0, "kl_loss_39": 2996.0, "kl_loss_7": 5937.6, "learning_rate": 0.0009330127018922195, "loss": 8715.7, "step": 1750 }, { "ce_loss_13": 3.903409707546234, "ce_loss_26": 3.1333308279514314, "ce_loss_39": 2.8973484218120573, "ce_loss_52": 1.4389021694660187, "ce_loss_7": 4.297668445110321, "epoch": 0.176, "grad_norm": 19.71888822868113, "kl_loss_13": 5043.2, "kl_loss_26": 3399.6, "kl_loss_39": 2906.0, "kl_loss_7": 5868.8, "learning_rate": 0.0009322171915289634, "loss": 8660.5, "step": 1760 }, { "ce_loss_13": 3.9436737656593324, "ce_loss_26": 3.180408328771591, "ce_loss_39": 2.933422142267227, "ce_loss_52": 1.468785560131073, "ce_loss_7": 4.335923504829407, "epoch": 0.177, "grad_norm": 21.100233895265884, "kl_loss_13": 5036.8, "kl_loss_26": 3416.8, "kl_loss_39": 2893.6, "kl_loss_7": 5853.6, "learning_rate": 0.0009314173287433873, "loss": 8685.1, "step": 1770 }, { "ce_loss_13": 4.000654596090317, "ce_loss_26": 3.24809735417366, "ce_loss_39": 2.9819031238555906, "ce_loss_52": 1.4765232503414154, "ce_loss_7": 4.392659711837768, "epoch": 0.178, "grad_norm": 20.346113365710192, "kl_loss_13": 5141.6, "kl_loss_26": 3518.0, "kl_loss_39": 2990.4, "kl_loss_7": 5964.0, "learning_rate": 0.0009306131215901003, "loss": 8673.2, "step": 1780 }, { "ce_loss_13": 3.9246813535690306, "ce_loss_26": 3.1745048224925996, "ce_loss_39": 2.9245960414409637, "ce_loss_52": 1.4693263441324234, "ce_loss_7": 4.317492133378982, "epoch": 0.179, "grad_norm": 19.397137651046872, "kl_loss_13": 5018.4, "kl_loss_26": 3389.2, "kl_loss_39": 2878.0, "kl_loss_7": 5844.8, "learning_rate": 0.0009298045781674596, "loss": 8564.1, "step": 1790 }, { "ce_loss_13": 3.910848397016525, "ce_loss_26": 3.146669828891754, "ce_loss_39": 2.8859269857406615, "ce_loss_52": 1.419717761874199, "ce_loss_7": 4.3105459094047545, "epoch": 0.18, "grad_norm": 19.504781875531744, "kl_loss_13": 5040.0, "kl_loss_26": 3411.2, "kl_loss_39": 2894.0, "kl_loss_7": 5888.0, "learning_rate": 0.0009289917066174886, "loss": 8563.1, "step": 1800 }, { "ce_loss_13": 3.886237096786499, "ce_loss_26": 3.0992377579212187, "ce_loss_39": 2.8514576256275177, "ce_loss_52": 1.4184779956936837, "ce_loss_7": 4.274789291620254, "epoch": 0.181, "grad_norm": 19.60587995762198, "kl_loss_13": 5042.4, "kl_loss_26": 3362.0, "kl_loss_39": 2856.4, "kl_loss_7": 5855.2, "learning_rate": 0.0009281745151257945, "loss": 8453.1, "step": 1810 }, { "ce_loss_13": 3.9144074499607084, "ce_loss_26": 3.1662435114383696, "ce_loss_39": 2.9089259922504427, "ce_loss_52": 1.478528293967247, "ce_loss_7": 4.301838612556457, "epoch": 0.182, "grad_norm": 19.79968756805323, "kl_loss_13": 4923.2, "kl_loss_26": 3333.2, "kl_loss_39": 2811.6, "kl_loss_7": 5736.8, "learning_rate": 0.0009273530119214868, "loss": 8471.9, "step": 1820 }, { "ce_loss_13": 3.8238776862621306, "ce_loss_26": 3.058783656358719, "ce_loss_39": 2.807573360204697, "ce_loss_52": 1.4178009316325189, "ce_loss_7": 4.227353280782699, "epoch": 0.183, "grad_norm": 19.471751859426742, "kl_loss_13": 4899.2, "kl_loss_26": 3273.6, "kl_loss_39": 2762.8, "kl_loss_7": 5738.4, "learning_rate": 0.0009265272052770935, "loss": 8399.4, "step": 1830 }, { "ce_loss_13": 3.836485821008682, "ce_loss_26": 3.0666845202445985, "ce_loss_39": 2.818044346570969, "ce_loss_52": 1.410616011917591, "ce_loss_7": 4.236015152931214, "epoch": 0.184, "grad_norm": 19.102126667670856, "kl_loss_13": 4940.0, "kl_loss_26": 3286.0, "kl_loss_39": 2780.4, "kl_loss_7": 5788.0, "learning_rate": 0.0009256971035084784, "loss": 8347.7, "step": 1840 }, { "ce_loss_13": 3.8049618661403657, "ce_loss_26": 3.0603095471858976, "ce_loss_39": 2.8144713938236237, "ce_loss_52": 1.4259676218032837, "ce_loss_7": 4.199950724840164, "epoch": 0.185, "grad_norm": 19.382557477849222, "kl_loss_13": 4835.6, "kl_loss_26": 3257.6, "kl_loss_39": 2762.0, "kl_loss_7": 5657.6, "learning_rate": 0.0009248627149747573, "loss": 8313.5, "step": 1850 }, { "ce_loss_13": 3.839466482400894, "ce_loss_26": 3.065267437696457, "ce_loss_39": 2.81703776717186, "ce_loss_52": 1.4297384396195412, "ce_loss_7": 4.240725481510163, "epoch": 0.186, "grad_norm": 20.053275602042234, "kl_loss_13": 4908.4, "kl_loss_26": 3269.6, "kl_loss_39": 2759.6, "kl_loss_7": 5741.6, "learning_rate": 0.0009240240480782129, "loss": 8305.0, "step": 1860 }, { "ce_loss_13": 3.8183856308460236, "ce_loss_26": 3.070716941356659, "ce_loss_39": 2.807391846179962, "ce_loss_52": 1.4359196320176124, "ce_loss_7": 4.214242458343506, "epoch": 0.187, "grad_norm": 19.191079413729856, "kl_loss_13": 4847.2, "kl_loss_26": 3263.6, "kl_loss_39": 2733.2, "kl_loss_7": 5672.8, "learning_rate": 0.0009231811112642122, "loss": 8227.6, "step": 1870 }, { "ce_loss_13": 3.779190129041672, "ce_loss_26": 3.0389082789421082, "ce_loss_39": 2.783122771978378, "ce_loss_52": 1.4208515673875808, "ce_loss_7": 4.162911784648895, "epoch": 0.188, "grad_norm": 20.43241639662848, "kl_loss_13": 4795.2, "kl_loss_26": 3215.2, "kl_loss_39": 2693.2, "kl_loss_7": 5606.4, "learning_rate": 0.0009223339130211192, "loss": 8213.8, "step": 1880 }, { "ce_loss_13": 3.708034944534302, "ce_loss_26": 2.9702564030885696, "ce_loss_39": 2.735187420248985, "ce_loss_52": 1.409775149822235, "ce_loss_7": 4.099964368343353, "epoch": 0.189, "grad_norm": 19.723419677891812, "kl_loss_13": 4691.2, "kl_loss_26": 3118.0, "kl_loss_39": 2625.8, "kl_loss_7": 5516.0, "learning_rate": 0.0009214824618802108, "loss": 8146.0, "step": 1890 }, { "ce_loss_13": 3.848232001066208, "ce_loss_26": 3.06461501121521, "ce_loss_39": 2.81026514172554, "ce_loss_52": 1.4419742107391358, "ce_loss_7": 4.244399529695511, "epoch": 0.19, "grad_norm": 21.429961437314283, "kl_loss_13": 4923.2, "kl_loss_26": 3246.4, "kl_loss_39": 2725.6, "kl_loss_7": 5755.2, "learning_rate": 0.0009206267664155906, "loss": 8168.2, "step": 1900 }, { "ce_loss_13": 3.74611656665802, "ce_loss_26": 2.995819491147995, "ce_loss_39": 2.7504830598831176, "ce_loss_52": 1.429488417506218, "ce_loss_7": 4.137593048810959, "epoch": 0.191, "grad_norm": 20.703417767001277, "kl_loss_13": 4708.8, "kl_loss_26": 3114.8, "kl_loss_39": 2616.8, "kl_loss_7": 5532.0, "learning_rate": 0.0009197668352441024, "loss": 8113.4, "step": 1910 }, { "ce_loss_13": 3.7616052985191346, "ce_loss_26": 3.0063544154167174, "ce_loss_39": 2.7453058779239656, "ce_loss_52": 1.4119072929024696, "ce_loss_7": 4.15754896402359, "epoch": 0.192, "grad_norm": 19.851460642991412, "kl_loss_13": 4782.4, "kl_loss_26": 3173.6, "kl_loss_39": 2644.0, "kl_loss_7": 5609.6, "learning_rate": 0.0009189026770252437, "loss": 8087.1, "step": 1920 }, { "ce_loss_13": 3.7914208650588987, "ce_loss_26": 3.0324925601482393, "ce_loss_39": 2.7728191137313845, "ce_loss_52": 1.4355733066797256, "ce_loss_7": 4.178089827299118, "epoch": 0.193, "grad_norm": 19.2014881795966, "kl_loss_13": 4800.8, "kl_loss_26": 3194.0, "kl_loss_39": 2659.2, "kl_loss_7": 5608.8, "learning_rate": 0.000918034300461078, "loss": 8051.8, "step": 1930 }, { "ce_loss_13": 3.7237455368041994, "ce_loss_26": 2.979181283712387, "ce_loss_39": 2.7309202194213866, "ce_loss_52": 1.4160432904958724, "ce_loss_7": 4.128066539764404, "epoch": 0.194, "grad_norm": 20.3086236795729, "kl_loss_13": 4720.0, "kl_loss_26": 3114.8, "kl_loss_39": 2602.4, "kl_loss_7": 5556.0, "learning_rate": 0.0009171617142961477, "loss": 8041.9, "step": 1940 }, { "ce_loss_13": 3.749431645870209, "ce_loss_26": 2.9937612235546114, "ce_loss_39": 2.744140648841858, "ce_loss_52": 1.4348472714424134, "ce_loss_7": 4.156274873018265, "epoch": 0.195, "grad_norm": 19.294577091174897, "kl_loss_13": 4722.4, "kl_loss_26": 3116.0, "kl_loss_39": 2605.2, "kl_loss_7": 5567.2, "learning_rate": 0.0009162849273173857, "loss": 7982.1, "step": 1950 }, { "ce_loss_13": 3.7092731952667237, "ce_loss_26": 2.9811393320560455, "ce_loss_39": 2.733800619840622, "ce_loss_52": 1.4478029429912567, "ce_loss_7": 4.0951203346252445, "epoch": 0.196, "grad_norm": 18.879569020896366, "kl_loss_13": 4628.0, "kl_loss_26": 3074.0, "kl_loss_39": 2569.2, "kl_loss_7": 5440.8, "learning_rate": 0.0009154039483540273, "loss": 7938.2, "step": 1960 }, { "ce_loss_13": 3.816760164499283, "ce_loss_26": 3.0473806083202364, "ce_loss_39": 2.7949241638183593, "ce_loss_52": 1.466755247116089, "ce_loss_7": 4.206719404458999, "epoch": 0.197, "grad_norm": 18.719442415203407, "kl_loss_13": 4774.4, "kl_loss_26": 3153.6, "kl_loss_39": 2628.8, "kl_loss_7": 5592.0, "learning_rate": 0.0009145187862775209, "loss": 7902.9, "step": 1970 }, { "ce_loss_13": 3.6841754376888276, "ce_loss_26": 2.9468030989170075, "ce_loss_39": 2.694840121269226, "ce_loss_52": 1.4269890293478966, "ce_loss_7": 4.081214648485184, "epoch": 0.198, "grad_norm": 18.93967966275207, "kl_loss_13": 4605.6, "kl_loss_26": 3030.4, "kl_loss_39": 2512.8, "kl_loss_7": 5443.2, "learning_rate": 0.0009136294500014386, "loss": 7824.0, "step": 1980 }, { "ce_loss_13": 3.7859152793884276, "ce_loss_26": 3.025045871734619, "ce_loss_39": 2.761893022060394, "ce_loss_52": 1.439668196439743, "ce_loss_7": 4.175345808267593, "epoch": 0.199, "grad_norm": 18.767073263034167, "kl_loss_13": 4776.0, "kl_loss_26": 3164.0, "kl_loss_39": 2630.4, "kl_loss_7": 5584.0, "learning_rate": 0.000912735948481387, "loss": 7845.9, "step": 1990 }, { "ce_loss_13": 3.7032747983932497, "ce_loss_26": 2.965251809358597, "ce_loss_39": 2.708036279678345, "ce_loss_52": 1.4405365601181983, "ce_loss_7": 4.098276823759079, "epoch": 0.2, "grad_norm": 18.50836033098629, "kl_loss_13": 4630.0, "kl_loss_26": 3052.8, "kl_loss_39": 2530.4, "kl_loss_7": 5457.6, "learning_rate": 0.0009118382907149164, "loss": 7748.9, "step": 2000 }, { "ce_loss_13": 3.717062991857529, "ce_loss_26": 2.979129308462143, "ce_loss_39": 2.7225220024585726, "ce_loss_52": 1.45234707146883, "ce_loss_7": 4.110658597946167, "epoch": 0.201, "grad_norm": 19.844351529751968, "kl_loss_13": 4629.6, "kl_loss_26": 3045.6, "kl_loss_39": 2525.2, "kl_loss_7": 5464.0, "learning_rate": 0.0009109364857414306, "loss": 7809.6, "step": 2010 }, { "ce_loss_13": 3.7279494285583494, "ce_loss_26": 2.9849789261817934, "ce_loss_39": 2.7267052114009855, "ce_loss_52": 1.4486449271440507, "ce_loss_7": 4.1131413102149965, "epoch": 0.202, "grad_norm": 19.33420875029095, "kl_loss_13": 4639.2, "kl_loss_26": 3061.2, "kl_loss_39": 2533.6, "kl_loss_7": 5456.0, "learning_rate": 0.0009100305426420956, "loss": 7708.0, "step": 2020 }, { "ce_loss_13": 3.643654578924179, "ce_loss_26": 2.9251492261886596, "ce_loss_39": 2.667558515071869, "ce_loss_52": 1.4118730872869492, "ce_loss_7": 4.034631943702697, "epoch": 0.203, "grad_norm": 19.14819354564296, "kl_loss_13": 4553.6, "kl_loss_26": 3016.0, "kl_loss_39": 2492.0, "kl_loss_7": 5371.2, "learning_rate": 0.0009091204705397484, "loss": 7699.0, "step": 2030 }, { "ce_loss_13": 3.706267160177231, "ce_loss_26": 2.971902164816856, "ce_loss_39": 2.7119477689266205, "ce_loss_52": 1.4564015328884126, "ce_loss_7": 4.097871041297912, "epoch": 0.204, "grad_norm": 18.950983085399912, "kl_loss_13": 4556.8, "kl_loss_26": 2972.2, "kl_loss_39": 2454.6, "kl_loss_7": 5380.0, "learning_rate": 0.0009082062785988049, "loss": 7671.5, "step": 2040 }, { "ce_loss_13": 3.6526230454444883, "ce_loss_26": 2.905240607261658, "ce_loss_39": 2.654486656188965, "ce_loss_52": 1.4092363178730012, "ce_loss_7": 4.043174755573273, "epoch": 0.205, "grad_norm": 20.137932882736667, "kl_loss_13": 4559.6, "kl_loss_26": 2969.2, "kl_loss_39": 2459.2, "kl_loss_7": 5384.0, "learning_rate": 0.0009072879760251679, "loss": 7662.0, "step": 2050 }, { "ce_loss_13": 3.578403168916702, "ce_loss_26": 2.8548426389694215, "ce_loss_39": 2.615563529729843, "ce_loss_52": 1.4107020199298859, "ce_loss_7": 3.97204332947731, "epoch": 0.206, "grad_norm": 19.383234658877402, "kl_loss_13": 4444.0, "kl_loss_26": 2894.0, "kl_loss_39": 2391.4, "kl_loss_7": 5266.4, "learning_rate": 0.0009063655720661341, "loss": 7643.5, "step": 2060 }, { "ce_loss_13": 3.5892180263996125, "ce_loss_26": 2.855096530914307, "ce_loss_39": 2.6004712164402006, "ce_loss_52": 1.4145165607333183, "ce_loss_7": 3.983754909038544, "epoch": 0.207, "grad_norm": 19.811430759479535, "kl_loss_13": 4467.2, "kl_loss_26": 2904.0, "kl_loss_39": 2385.0, "kl_loss_7": 5291.2, "learning_rate": 0.000905439076010301, "loss": 7534.0, "step": 2070 }, { "ce_loss_13": 3.6425350308418274, "ce_loss_26": 2.9249331414699555, "ce_loss_39": 2.6687968969345093, "ce_loss_52": 1.4602822691202164, "ce_loss_7": 4.025002205371857, "epoch": 0.208, "grad_norm": 19.741988709477923, "kl_loss_13": 4438.4, "kl_loss_26": 2910.0, "kl_loss_39": 2394.8, "kl_loss_7": 5248.8, "learning_rate": 0.0009045084971874737, "loss": 7505.8, "step": 2080 }, { "ce_loss_13": 3.553661996126175, "ce_loss_26": 2.8095623433589934, "ce_loss_39": 2.561682888865471, "ce_loss_52": 1.3855161294341087, "ce_loss_7": 3.9535735607147218, "epoch": 0.209, "grad_norm": 18.406387003890178, "kl_loss_13": 4454.8, "kl_loss_26": 2860.4, "kl_loss_39": 2354.2, "kl_loss_7": 5295.2, "learning_rate": 0.0009035738449685707, "loss": 7532.3, "step": 2090 }, { "ce_loss_13": 3.659823089838028, "ce_loss_26": 2.919918876886368, "ce_loss_39": 2.67503222823143, "ce_loss_52": 1.4627915531396867, "ce_loss_7": 4.0562332451343535, "epoch": 0.21, "grad_norm": 19.907425841295773, "kl_loss_13": 4488.8, "kl_loss_26": 2909.2, "kl_loss_39": 2407.0, "kl_loss_7": 5320.8, "learning_rate": 0.0009026351287655293, "loss": 7517.4, "step": 2100 }, { "ce_loss_13": 3.630769556760788, "ce_loss_26": 2.9068243861198426, "ce_loss_39": 2.642093613743782, "ce_loss_52": 1.4367542505264281, "ce_loss_7": 4.022511690855026, "epoch": 0.211, "grad_norm": 18.196990652810108, "kl_loss_13": 4459.2, "kl_loss_26": 2926.8, "kl_loss_39": 2395.2, "kl_loss_7": 5276.0, "learning_rate": 0.0009016923580312113, "loss": 7412.6, "step": 2110 }, { "ce_loss_13": 3.682375580072403, "ce_loss_26": 2.9616983354091646, "ce_loss_39": 2.6908247590065004, "ce_loss_52": 1.482297733426094, "ce_loss_7": 4.079386693239212, "epoch": 0.212, "grad_norm": 20.88180475150302, "kl_loss_13": 4465.6, "kl_loss_26": 2920.8, "kl_loss_39": 2382.8, "kl_loss_7": 5297.6, "learning_rate": 0.0009007455422593077, "loss": 7402.8, "step": 2120 }, { "ce_loss_13": 3.5886090993881226, "ce_loss_26": 2.8697936654090883, "ce_loss_39": 2.6118789970874787, "ce_loss_52": 1.4324451625347137, "ce_loss_7": 3.989769661426544, "epoch": 0.213, "grad_norm": 21.410550894381295, "kl_loss_13": 4406.0, "kl_loss_26": 2862.8, "kl_loss_39": 2337.2, "kl_loss_7": 5236.0, "learning_rate": 0.0008997946909842425, "loss": 7376.6, "step": 2130 }, { "ce_loss_13": 3.5265793919563295, "ce_loss_26": 2.8049169957637785, "ce_loss_39": 2.5549218744039535, "ce_loss_52": 1.4088917583227158, "ce_loss_7": 3.920357757806778, "epoch": 0.214, "grad_norm": 18.12930136088373, "kl_loss_13": 4307.2, "kl_loss_26": 2766.8, "kl_loss_39": 2250.8, "kl_loss_7": 5132.8, "learning_rate": 0.0008988398137810777, "loss": 7289.0, "step": 2140 }, { "ce_loss_13": 3.493118005990982, "ce_loss_26": 2.7619090020656585, "ce_loss_39": 2.523107871413231, "ce_loss_52": 1.3855519428849221, "ce_loss_7": 3.8948814868927, "epoch": 0.215, "grad_norm": 19.085499107664276, "kl_loss_13": 4317.6, "kl_loss_26": 2765.2, "kl_loss_39": 2265.6, "kl_loss_7": 5148.8, "learning_rate": 0.0008978809202654162, "loss": 7322.3, "step": 2150 }, { "ce_loss_13": 3.5010905504226684, "ce_loss_26": 2.7818336695432664, "ce_loss_39": 2.5317496716976167, "ce_loss_52": 1.410066269338131, "ce_loss_7": 3.8946242213249205, "epoch": 0.216, "grad_norm": 18.12712360145358, "kl_loss_13": 4244.8, "kl_loss_26": 2719.8, "kl_loss_39": 2212.8, "kl_loss_7": 5073.6, "learning_rate": 0.0008969180200933046, "loss": 7287.8, "step": 2160 }, { "ce_loss_13": 3.583745849132538, "ce_loss_26": 2.8594263792037964, "ce_loss_39": 2.5962479442358015, "ce_loss_52": 1.4458883255720139, "ce_loss_7": 3.976805257797241, "epoch": 0.217, "grad_norm": 17.565470208207127, "kl_loss_13": 4359.6, "kl_loss_26": 2818.4, "kl_loss_39": 2285.8, "kl_loss_7": 5177.6, "learning_rate": 0.0008959511229611376, "loss": 7240.9, "step": 2170 }, { "ce_loss_13": 3.580790603160858, "ce_loss_26": 2.859365826845169, "ce_loss_39": 2.5985568940639494, "ce_loss_52": 1.4648242503404618, "ce_loss_7": 3.9631645143032075, "epoch": 0.218, "grad_norm": 18.371856212121592, "kl_loss_13": 4316.0, "kl_loss_26": 2762.4, "kl_loss_39": 2248.2, "kl_loss_7": 5127.2, "learning_rate": 0.0008949802386055581, "loss": 7227.3, "step": 2180 }, { "ce_loss_13": 3.549471515417099, "ce_loss_26": 2.814970576763153, "ce_loss_39": 2.561775863170624, "ce_loss_52": 1.4231164067983628, "ce_loss_7": 3.941384530067444, "epoch": 0.219, "grad_norm": 17.91149753664061, "kl_loss_13": 4321.2, "kl_loss_26": 2784.4, "kl_loss_39": 2263.6, "kl_loss_7": 5152.0, "learning_rate": 0.0008940053768033609, "loss": 7238.8, "step": 2190 }, { "ce_loss_13": 3.5581556379795076, "ce_loss_26": 2.825929582118988, "ce_loss_39": 2.570690780878067, "ce_loss_52": 1.4427952721714974, "ce_loss_7": 3.9514447808265687, "epoch": 0.22, "grad_norm": 19.745379707547666, "kl_loss_13": 4304.8, "kl_loss_26": 2754.0, "kl_loss_39": 2234.6, "kl_loss_7": 5132.8, "learning_rate": 0.0008930265473713938, "loss": 7236.5, "step": 2200 }, { "ce_loss_13": 3.522547519207001, "ce_loss_26": 2.7868906617164613, "ce_loss_39": 2.524843490123749, "ce_loss_52": 1.3902505502104758, "ce_loss_7": 3.9295816838741304, "epoch": 0.221, "grad_norm": 18.58799040514992, "kl_loss_13": 4349.2, "kl_loss_26": 2792.0, "kl_loss_39": 2262.0, "kl_loss_7": 5198.4, "learning_rate": 0.0008920437601664579, "loss": 7187.9, "step": 2210 }, { "ce_loss_13": 3.5173128962516786, "ce_loss_26": 2.804593563079834, "ce_loss_39": 2.557006138563156, "ce_loss_52": 1.4581168740987778, "ce_loss_7": 3.910993677377701, "epoch": 0.222, "grad_norm": 18.95814843355098, "kl_loss_13": 4224.4, "kl_loss_26": 2677.2, "kl_loss_39": 2165.2, "kl_loss_7": 5053.6, "learning_rate": 0.0008910570250852097, "loss": 7168.4, "step": 2220 }, { "ce_loss_13": 3.4375841438770296, "ce_loss_26": 2.7280281484127045, "ce_loss_39": 2.47977514564991, "ce_loss_52": 1.389390866458416, "ce_loss_7": 3.838480031490326, "epoch": 0.223, "grad_norm": 19.685999540006744, "kl_loss_13": 4179.6, "kl_loss_26": 2668.0, "kl_loss_39": 2152.8, "kl_loss_7": 5011.2, "learning_rate": 0.0008900663520640604, "loss": 7080.8, "step": 2230 }, { "ce_loss_13": 3.5171928703784943, "ce_loss_26": 2.8187219202518463, "ce_loss_39": 2.5553694486618044, "ce_loss_52": 1.4541724801063538, "ce_loss_7": 3.8974815249443053, "epoch": 0.224, "grad_norm": 26.45112973751635, "kl_loss_13": 4224.4, "kl_loss_26": 2710.0, "kl_loss_39": 2180.4, "kl_loss_7": 5080.0, "learning_rate": 0.0008890717510790764, "loss": 7105.5, "step": 2240 }, { "ce_loss_13": 3.5361606895923616, "ce_loss_26": 2.8268598556518554, "ce_loss_39": 2.5735931187868117, "ce_loss_52": 1.451829667389393, "ce_loss_7": 3.924015772342682, "epoch": 0.225, "grad_norm": 20.75511317660791, "kl_loss_13": 4247.2, "kl_loss_26": 2732.8, "kl_loss_39": 2219.0, "kl_loss_7": 5061.6, "learning_rate": 0.0008880732321458784, "loss": 7074.7, "step": 2250 }, { "ce_loss_13": 3.4491190731525423, "ce_loss_26": 2.7567967534065247, "ce_loss_39": 2.5041564613580705, "ce_loss_52": 1.4379881560802459, "ce_loss_7": 3.8363168060779573, "epoch": 0.226, "grad_norm": 18.822666188140545, "kl_loss_13": 4102.8, "kl_loss_26": 2621.6, "kl_loss_39": 2107.6, "kl_loss_7": 4917.6, "learning_rate": 0.0008870708053195413, "loss": 7003.4, "step": 2260 }, { "ce_loss_13": 3.4790355801582336, "ce_loss_26": 2.765997165441513, "ce_loss_39": 2.5106064915657043, "ce_loss_52": 1.4210210233926772, "ce_loss_7": 3.8746932446956635, "epoch": 0.227, "grad_norm": 19.45926661054129, "kl_loss_13": 4188.4, "kl_loss_26": 2681.2, "kl_loss_39": 2162.6, "kl_loss_7": 5009.6, "learning_rate": 0.0008860644806944918, "loss": 7002.8, "step": 2270 }, { "ce_loss_13": 3.589535415172577, "ce_loss_26": 2.870089566707611, "ce_loss_39": 2.5925551772117617, "ce_loss_52": 1.4516636282205582, "ce_loss_7": 3.9855311453342437, "epoch": 0.228, "grad_norm": 18.527281302332295, "kl_loss_13": 4335.2, "kl_loss_26": 2802.8, "kl_loss_39": 2252.6, "kl_loss_7": 5164.0, "learning_rate": 0.0008850542684044079, "loss": 7072.1, "step": 2280 }, { "ce_loss_13": 3.4438597559928894, "ce_loss_26": 2.74727184176445, "ce_loss_39": 2.4982927203178407, "ce_loss_52": 1.435218185186386, "ce_loss_7": 3.8294356882572176, "epoch": 0.229, "grad_norm": 18.07396826505032, "kl_loss_13": 4082.8, "kl_loss_26": 2585.6, "kl_loss_39": 2087.0, "kl_loss_7": 4892.8, "learning_rate": 0.0008840401786221159, "loss": 6974.9, "step": 2290 }, { "ce_loss_13": 3.476649820804596, "ce_loss_26": 2.7891372203826905, "ce_loss_39": 2.5390550673007963, "ce_loss_52": 1.4611460983753204, "ce_loss_7": 3.854980993270874, "epoch": 0.23, "grad_norm": 18.716359933554248, "kl_loss_13": 4118.8, "kl_loss_26": 2644.2, "kl_loss_39": 2128.6, "kl_loss_7": 4946.0, "learning_rate": 0.000883022221559489, "loss": 6901.3, "step": 2300 }, { "ce_loss_13": 3.467282909154892, "ce_loss_26": 2.7604516625404356, "ce_loss_39": 2.5003271818161013, "ce_loss_52": 1.4461873590946197, "ce_loss_7": 3.856851851940155, "epoch": 0.231, "grad_norm": 20.721250836400138, "kl_loss_13": 4114.0, "kl_loss_26": 2616.8, "kl_loss_39": 2092.6, "kl_loss_7": 4926.4, "learning_rate": 0.0008820004074673434, "loss": 6876.9, "step": 2310 }, { "ce_loss_13": 3.4266963064670564, "ce_loss_26": 2.719339656829834, "ce_loss_39": 2.4688956409692766, "ce_loss_52": 1.4164521768689156, "ce_loss_7": 3.8023972034454347, "epoch": 0.232, "grad_norm": 17.616553712409548, "kl_loss_13": 4090.4, "kl_loss_26": 2589.2, "kl_loss_39": 2065.4, "kl_loss_7": 4891.2, "learning_rate": 0.0008809747466353355, "loss": 6907.6, "step": 2320 }, { "ce_loss_13": 3.536805588006973, "ce_loss_26": 2.8070395588874817, "ce_loss_39": 2.5501783430576324, "ce_loss_52": 1.4646585762500763, "ce_loss_7": 3.9247563600540163, "epoch": 0.233, "grad_norm": 17.658525951814237, "kl_loss_13": 4218.8, "kl_loss_26": 2666.0, "kl_loss_39": 2147.2, "kl_loss_7": 5044.0, "learning_rate": 0.0008799452493918585, "loss": 6862.9, "step": 2330 }, { "ce_loss_13": 3.3707460284233095, "ce_loss_26": 2.680304506421089, "ce_loss_39": 2.4279863387346268, "ce_loss_52": 1.4292149528861047, "ce_loss_7": 3.7509031653404237, "epoch": 0.234, "grad_norm": 18.293581891673337, "kl_loss_13": 3979.6, "kl_loss_26": 2481.4, "kl_loss_39": 1967.6, "kl_loss_7": 4792.0, "learning_rate": 0.0008789119261039385, "loss": 6860.7, "step": 2340 }, { "ce_loss_13": 3.411047804355621, "ce_loss_26": 2.7024976193904875, "ce_loss_39": 2.4417600989341737, "ce_loss_52": 1.404353639483452, "ce_loss_7": 3.799528968334198, "epoch": 0.235, "grad_norm": 19.60391257341661, "kl_loss_13": 4078.4, "kl_loss_26": 2576.8, "kl_loss_39": 2050.4, "kl_loss_7": 4901.6, "learning_rate": 0.0008778747871771292, "loss": 6770.8, "step": 2350 }, { "ce_loss_13": 3.4045680582523348, "ce_loss_26": 2.6985227525234223, "ce_loss_39": 2.451327767968178, "ce_loss_52": 1.4205562889575958, "ce_loss_7": 3.7882447242736816, "epoch": 0.236, "grad_norm": 18.739033338884607, "kl_loss_13": 4038.4, "kl_loss_26": 2533.2, "kl_loss_39": 2034.8, "kl_loss_7": 4858.4, "learning_rate": 0.0008768338430554083, "loss": 6755.4, "step": 2360 }, { "ce_loss_13": 3.3626140534877775, "ce_loss_26": 2.65916622877121, "ce_loss_39": 2.4009654462337493, "ce_loss_52": 1.3916789084672927, "ce_loss_7": 3.749758929014206, "epoch": 0.237, "grad_norm": 20.138759568479667, "kl_loss_13": 4022.0, "kl_loss_26": 2521.4, "kl_loss_39": 1993.2, "kl_loss_7": 4829.6, "learning_rate": 0.0008757891042210713, "loss": 6791.4, "step": 2370 }, { "ce_loss_13": 3.381242650747299, "ce_loss_26": 2.68309933245182, "ce_loss_39": 2.433691692352295, "ce_loss_52": 1.4095703065395355, "ce_loss_7": 3.772293299436569, "epoch": 0.238, "grad_norm": 17.323586780740413, "kl_loss_13": 4016.4, "kl_loss_26": 2531.4, "kl_loss_39": 2016.2, "kl_loss_7": 4839.6, "learning_rate": 0.0008747405811946271, "loss": 6729.9, "step": 2380 }, { "ce_loss_13": 3.418868046998978, "ce_loss_26": 2.7223224580287932, "ce_loss_39": 2.4728009045124053, "ce_loss_52": 1.446861308813095, "ce_loss_7": 3.7994930267333986, "epoch": 0.239, "grad_norm": 17.668482550134954, "kl_loss_13": 4034.0, "kl_loss_26": 2556.0, "kl_loss_39": 2037.4, "kl_loss_7": 4842.4, "learning_rate": 0.0008736882845346905, "loss": 6764.7, "step": 2390 }, { "ce_loss_13": 3.407726752758026, "ce_loss_26": 2.714660122990608, "ce_loss_39": 2.4702648639678957, "ce_loss_52": 1.4476186811923981, "ce_loss_7": 3.7862841546535493, "epoch": 0.24, "grad_norm": 20.313458924733165, "kl_loss_13": 3988.4, "kl_loss_26": 2507.2, "kl_loss_39": 2008.2, "kl_loss_7": 4794.4, "learning_rate": 0.0008726322248378774, "loss": 6720.4, "step": 2400 }, { "ce_loss_13": 3.4095008313655852, "ce_loss_26": 2.69794414639473, "ce_loss_39": 2.44133580327034, "ce_loss_52": 1.426843424141407, "ce_loss_7": 3.8018106520175934, "epoch": 0.241, "grad_norm": 17.847824779807837, "kl_loss_13": 4039.6, "kl_loss_26": 2535.6, "kl_loss_39": 2007.8, "kl_loss_7": 4864.0, "learning_rate": 0.0008715724127386971, "loss": 6713.1, "step": 2410 }, { "ce_loss_13": 3.370608961582184, "ce_loss_26": 2.688097137212753, "ce_loss_39": 2.435939407348633, "ce_loss_52": 1.4344248950481415, "ce_loss_7": 3.745537704229355, "epoch": 0.242, "grad_norm": 18.03045500704746, "kl_loss_13": 3943.6, "kl_loss_26": 2497.6, "kl_loss_39": 1983.4, "kl_loss_7": 4736.8, "learning_rate": 0.0008705088589094458, "loss": 6611.0, "step": 2420 }, { "ce_loss_13": 3.4596225798130034, "ce_loss_26": 2.7608898997306826, "ce_loss_39": 2.5022788047790527, "ce_loss_52": 1.4623101890087127, "ce_loss_7": 3.839513373374939, "epoch": 0.243, "grad_norm": 18.190444229647287, "kl_loss_13": 4058.0, "kl_loss_26": 2575.6, "kl_loss_39": 2054.6, "kl_loss_7": 4862.4, "learning_rate": 0.0008694415740600988, "loss": 6638.0, "step": 2430 }, { "ce_loss_13": 3.370687645673752, "ce_loss_26": 2.6674872994422913, "ce_loss_39": 2.414305740594864, "ce_loss_52": 1.4324225425720214, "ce_loss_7": 3.767412984371185, "epoch": 0.244, "grad_norm": 18.848120466041497, "kl_loss_13": 3940.4, "kl_loss_26": 2449.4, "kl_loss_39": 1932.2, "kl_loss_7": 4769.6, "learning_rate": 0.0008683705689382025, "loss": 6641.0, "step": 2440 }, { "ce_loss_13": 3.347389942407608, "ce_loss_26": 2.6603663861751556, "ce_loss_39": 2.4186645448207855, "ce_loss_52": 1.452422297000885, "ce_loss_7": 3.731181102991104, "epoch": 0.245, "grad_norm": 17.05503964147942, "kl_loss_13": 3888.8, "kl_loss_26": 2418.6, "kl_loss_39": 1913.8, "kl_loss_7": 4694.4, "learning_rate": 0.0008672958543287666, "loss": 6617.1, "step": 2450 }, { "ce_loss_13": 3.341637074947357, "ce_loss_26": 2.653514164686203, "ce_loss_39": 2.4030585259199144, "ce_loss_52": 1.4203058749437332, "ce_loss_7": 3.7326664865016936, "epoch": 0.246, "grad_norm": 18.028238041954314, "kl_loss_13": 3894.0, "kl_loss_26": 2435.6, "kl_loss_39": 1932.4, "kl_loss_7": 4711.2, "learning_rate": 0.0008662174410541554, "loss": 6537.2, "step": 2460 }, { "ce_loss_13": 3.343936342000961, "ce_loss_26": 2.6602561354637144, "ce_loss_39": 2.4062414824962617, "ce_loss_52": 1.4341940209269524, "ce_loss_7": 3.7255902886390686, "epoch": 0.247, "grad_norm": 17.615419947143096, "kl_loss_13": 3890.8, "kl_loss_26": 2433.8, "kl_loss_39": 1921.6, "kl_loss_7": 4692.0, "learning_rate": 0.0008651353399739787, "loss": 6499.5, "step": 2470 }, { "ce_loss_13": 3.36987144947052, "ce_loss_26": 2.6651594936847687, "ce_loss_39": 2.41037335395813, "ce_loss_52": 1.4215908780694009, "ce_loss_7": 3.75968217253685, "epoch": 0.248, "grad_norm": 19.448152371934484, "kl_loss_13": 3954.4, "kl_loss_26": 2461.0, "kl_loss_39": 1945.4, "kl_loss_7": 4776.0, "learning_rate": 0.0008640495619849821, "loss": 6570.0, "step": 2480 }, { "ce_loss_13": 3.3932483792304993, "ce_loss_26": 2.69803272485733, "ce_loss_39": 2.448850151896477, "ce_loss_52": 1.4729512989521027, "ce_loss_7": 3.7767464458942412, "epoch": 0.249, "grad_norm": 17.76914722627013, "kl_loss_13": 3932.8, "kl_loss_26": 2449.6, "kl_loss_39": 1942.6, "kl_loss_7": 4743.2, "learning_rate": 0.0008629601180209381, "loss": 6472.8, "step": 2490 }, { "ce_loss_13": 3.3404706001281737, "ce_loss_26": 2.6527740180492403, "ce_loss_39": 2.403418445587158, "ce_loss_52": 1.4374166071414947, "ce_loss_7": 3.7279320538043974, "epoch": 0.25, "grad_norm": 18.043688934070087, "kl_loss_13": 3897.6, "kl_loss_26": 2421.6, "kl_loss_39": 1902.0, "kl_loss_7": 4716.0, "learning_rate": 0.000861867019052535, "loss": 6482.2, "step": 2500 }, { "ce_loss_13": 3.408062273263931, "ce_loss_26": 2.704035770893097, "ce_loss_39": 2.46220483481884, "ce_loss_52": 1.4713353991508484, "ce_loss_7": 3.795337921380997, "epoch": 0.251, "grad_norm": 19.728713333582007, "kl_loss_13": 3944.4, "kl_loss_26": 2444.8, "kl_loss_39": 1942.4, "kl_loss_7": 4760.4, "learning_rate": 0.0008607702760872678, "loss": 6463.8, "step": 2510 }, { "ce_loss_13": 3.388230836391449, "ce_loss_26": 2.687982529401779, "ce_loss_39": 2.429553496837616, "ce_loss_52": 1.4563929110765457, "ce_loss_7": 3.781431978940964, "epoch": 0.252, "grad_norm": 18.472247546339297, "kl_loss_13": 3948.8, "kl_loss_26": 2449.6, "kl_loss_39": 1930.0, "kl_loss_7": 4769.6, "learning_rate": 0.0008596699001693256, "loss": 6470.9, "step": 2520 }, { "ce_loss_13": 3.3468190252780916, "ce_loss_26": 2.6537194311618806, "ce_loss_39": 2.38609040081501, "ce_loss_52": 1.4265122324228288, "ce_loss_7": 3.7305682718753816, "epoch": 0.253, "grad_norm": 19.078325524688942, "kl_loss_13": 3872.0, "kl_loss_26": 2409.4, "kl_loss_39": 1875.8, "kl_loss_7": 4679.2, "learning_rate": 0.0008585659023794818, "loss": 6413.9, "step": 2530 }, { "ce_loss_13": 3.3305428862571715, "ce_loss_26": 2.6303874611854554, "ce_loss_39": 2.3649342864751817, "ce_loss_52": 1.4202288419008255, "ce_loss_7": 3.712818431854248, "epoch": 0.254, "grad_norm": 17.612892401319467, "kl_loss_13": 3873.6, "kl_loss_26": 2396.4, "kl_loss_39": 1864.0, "kl_loss_7": 4688.8, "learning_rate": 0.0008574582938349817, "loss": 6377.5, "step": 2540 }, { "ce_loss_13": 3.3927013695240023, "ce_loss_26": 2.7204831540584564, "ce_loss_39": 2.464686703681946, "ce_loss_52": 1.487898689508438, "ce_loss_7": 3.7826764941215516, "epoch": 0.255, "grad_norm": 19.269838722570377, "kl_loss_13": 3875.2, "kl_loss_26": 2432.8, "kl_loss_39": 1907.8, "kl_loss_7": 4701.6, "learning_rate": 0.0008563470856894315, "loss": 6365.3, "step": 2550 }, { "ce_loss_13": 3.366223245859146, "ce_loss_26": 2.6793350696563722, "ce_loss_39": 2.4282671988010405, "ce_loss_52": 1.475812730193138, "ce_loss_7": 3.7593341052532194, "epoch": 0.256, "grad_norm": 17.67137481592248, "kl_loss_13": 3846.8, "kl_loss_26": 2394.4, "kl_loss_39": 1881.4, "kl_loss_7": 4661.6, "learning_rate": 0.0008552322891326845, "loss": 6381.8, "step": 2560 }, { "ce_loss_13": 3.348971825838089, "ce_loss_26": 2.6535234093666076, "ce_loss_39": 2.396692654490471, "ce_loss_52": 1.4521390795707703, "ce_loss_7": 3.737420654296875, "epoch": 0.257, "grad_norm": 17.922791060277248, "kl_loss_13": 3853.2, "kl_loss_26": 2382.8, "kl_loss_39": 1860.0, "kl_loss_7": 4671.6, "learning_rate": 0.0008541139153907296, "loss": 6320.6, "step": 2570 }, { "ce_loss_13": 3.3175282776355743, "ce_loss_26": 2.62116519510746, "ce_loss_39": 2.373508110642433, "ce_loss_52": 1.4496644467115403, "ce_loss_7": 3.7005242466926576, "epoch": 0.258, "grad_norm": 17.919048466696356, "kl_loss_13": 3806.0, "kl_loss_26": 2325.8, "kl_loss_39": 1815.6, "kl_loss_7": 4616.8, "learning_rate": 0.0008529919757255782, "loss": 6324.1, "step": 2580 }, { "ce_loss_13": 3.3524767458438873, "ce_loss_26": 2.672022157907486, "ce_loss_39": 2.42145474255085, "ce_loss_52": 1.490525448322296, "ce_loss_7": 3.731249511241913, "epoch": 0.259, "grad_norm": 17.94332341656099, "kl_loss_13": 3786.8, "kl_loss_26": 2328.6, "kl_loss_39": 1819.0, "kl_loss_7": 4580.4, "learning_rate": 0.0008518664814351503, "loss": 6266.2, "step": 2590 }, { "ce_loss_13": 3.2249048352241516, "ce_loss_26": 2.526710030436516, "ce_loss_39": 2.288080096244812, "ce_loss_52": 1.3951421514153481, "ce_loss_7": 3.6117550313472746, "epoch": 0.26, "grad_norm": 17.453584143766424, "kl_loss_13": 3747.6, "kl_loss_26": 2273.8, "kl_loss_39": 1787.0, "kl_loss_7": 4560.8, "learning_rate": 0.0008507374438531607, "loss": 6263.6, "step": 2600 }, { "ce_loss_13": 3.385346031188965, "ce_loss_26": 2.689622291922569, "ce_loss_39": 2.437500995397568, "ce_loss_52": 1.4798223778605462, "ce_loss_7": 3.780226916074753, "epoch": 0.261, "grad_norm": 17.77067876803282, "kl_loss_13": 3881.6, "kl_loss_26": 2399.4, "kl_loss_39": 1889.6, "kl_loss_7": 4706.4, "learning_rate": 0.0008496048743490053, "loss": 6251.8, "step": 2610 }, { "ce_loss_13": 3.253863149881363, "ce_loss_26": 2.5867208421230314, "ce_loss_39": 2.3403410583734514, "ce_loss_52": 1.4355336636304856, "ce_loss_7": 3.628329038619995, "epoch": 0.262, "grad_norm": 18.13099048831459, "kl_loss_13": 3714.0, "kl_loss_26": 2278.6, "kl_loss_39": 1781.2, "kl_loss_7": 4501.2, "learning_rate": 0.0008484687843276469, "loss": 6230.8, "step": 2620 }, { "ce_loss_13": 3.308898413181305, "ce_loss_26": 2.6307075321674347, "ce_loss_39": 2.381870651245117, "ce_loss_52": 1.4658461689949036, "ce_loss_7": 3.686649763584137, "epoch": 0.263, "grad_norm": 17.246075540125826, "kl_loss_13": 3767.2, "kl_loss_26": 2322.6, "kl_loss_39": 1819.4, "kl_loss_7": 4560.0, "learning_rate": 0.0008473291852294987, "loss": 6261.4, "step": 2630 }, { "ce_loss_13": 3.301677256822586, "ce_loss_26": 2.623979777097702, "ce_loss_39": 2.3711125582456587, "ce_loss_52": 1.4476893723011017, "ce_loss_7": 3.686248630285263, "epoch": 0.264, "grad_norm": 18.725565567002363, "kl_loss_13": 3772.0, "kl_loss_26": 2323.8, "kl_loss_39": 1813.4, "kl_loss_7": 4567.6, "learning_rate": 0.0008461860885303114, "loss": 6186.3, "step": 2640 }, { "ce_loss_13": 3.3081180572509767, "ce_loss_26": 2.5993283450603486, "ce_loss_39": 2.346868970990181, "ce_loss_52": 1.4209152534604073, "ce_loss_7": 3.700265485048294, "epoch": 0.265, "grad_norm": 16.39724372868559, "kl_loss_13": 3844.8, "kl_loss_26": 2352.2, "kl_loss_39": 1835.0, "kl_loss_7": 4663.2, "learning_rate": 0.000845039505741056, "loss": 6224.4, "step": 2650 }, { "ce_loss_13": 3.2767783522605898, "ce_loss_26": 2.610367941856384, "ce_loss_39": 2.3734692215919493, "ce_loss_52": 1.4838453635573388, "ce_loss_7": 3.6544183135032653, "epoch": 0.266, "grad_norm": 18.646431501601995, "kl_loss_13": 3666.4, "kl_loss_26": 2232.8, "kl_loss_39": 1743.2, "kl_loss_7": 4455.2, "learning_rate": 0.0008438894484078086, "loss": 6164.1, "step": 2660 }, { "ce_loss_13": 3.187056082487106, "ce_loss_26": 2.5067271828651427, "ce_loss_39": 2.265036514401436, "ce_loss_52": 1.3943608120083808, "ce_loss_7": 3.5653835415840147, "epoch": 0.267, "grad_norm": 17.226899122841342, "kl_loss_13": 3695.2, "kl_loss_26": 2239.8, "kl_loss_39": 1740.2, "kl_loss_7": 4491.6, "learning_rate": 0.0008427359281116334, "loss": 6116.4, "step": 2670 }, { "ce_loss_13": 3.2635743618011475, "ce_loss_26": 2.5878145933151244, "ce_loss_39": 2.344261533021927, "ce_loss_52": 1.436160460114479, "ce_loss_7": 3.642155331373215, "epoch": 0.268, "grad_norm": 17.358976250170773, "kl_loss_13": 3715.6, "kl_loss_26": 2286.6, "kl_loss_39": 1782.2, "kl_loss_7": 4510.8, "learning_rate": 0.0008415789564684673, "loss": 6120.1, "step": 2680 }, { "ce_loss_13": 3.217154061794281, "ce_loss_26": 2.55003065764904, "ce_loss_39": 2.3082681566476824, "ce_loss_52": 1.4278038635849952, "ce_loss_7": 3.602716547250748, "epoch": 0.269, "grad_norm": 17.314540535001434, "kl_loss_13": 3655.2, "kl_loss_26": 2225.8, "kl_loss_39": 1731.0, "kl_loss_7": 4458.8, "learning_rate": 0.0008404185451290017, "loss": 6184.5, "step": 2690 }, { "ce_loss_13": 3.24159716963768, "ce_loss_26": 2.54996337890625, "ce_loss_39": 2.3031594485044478, "ce_loss_52": 1.4209882378578187, "ce_loss_7": 3.6337377846241, "epoch": 0.27, "grad_norm": 17.492742712849175, "kl_loss_13": 3726.4, "kl_loss_26": 2257.8, "kl_loss_39": 1756.2, "kl_loss_7": 4549.6, "learning_rate": 0.0008392547057785661, "loss": 6062.5, "step": 2700 }, { "ce_loss_13": 3.227788990736008, "ce_loss_26": 2.5358716517686846, "ce_loss_39": 2.285151606798172, "ce_loss_52": 1.4203684866428374, "ce_loss_7": 3.6037886083126067, "epoch": 0.271, "grad_norm": 17.53543108395465, "kl_loss_13": 3669.6, "kl_loss_26": 2215.6, "kl_loss_39": 1712.0, "kl_loss_7": 4470.8, "learning_rate": 0.0008380874501370098, "loss": 6120.8, "step": 2710 }, { "ce_loss_13": 3.1636491239070894, "ce_loss_26": 2.5045939922332763, "ce_loss_39": 2.2759897857904434, "ce_loss_52": 1.4353285342454911, "ce_loss_7": 3.5319547176361086, "epoch": 0.272, "grad_norm": 18.586839239007904, "kl_loss_13": 3539.2, "kl_loss_26": 2133.8, "kl_loss_39": 1666.0, "kl_loss_7": 4328.8, "learning_rate": 0.0008369167899585841, "loss": 6051.0, "step": 2720 }, { "ce_loss_13": 3.221328115463257, "ce_loss_26": 2.535122260451317, "ce_loss_39": 2.2940177261829375, "ce_loss_52": 1.419254219532013, "ce_loss_7": 3.6125965118408203, "epoch": 0.273, "grad_norm": 17.658632608609814, "kl_loss_13": 3688.0, "kl_loss_26": 2233.2, "kl_loss_39": 1733.0, "kl_loss_7": 4510.0, "learning_rate": 0.0008357427370318238, "loss": 6045.8, "step": 2730 }, { "ce_loss_13": 3.2239345014095306, "ce_loss_26": 2.556156021356583, "ce_loss_39": 2.3110478937625887, "ce_loss_52": 1.4514835059642792, "ce_loss_7": 3.6023066878318786, "epoch": 0.274, "grad_norm": 18.16156755157877, "kl_loss_13": 3623.2, "kl_loss_26": 2184.8, "kl_loss_39": 1687.0, "kl_loss_7": 4421.6, "learning_rate": 0.0008345653031794292, "loss": 6081.8, "step": 2740 }, { "ce_loss_13": 3.248935067653656, "ce_loss_26": 2.571800184249878, "ce_loss_39": 2.3257440716028213, "ce_loss_52": 1.4555991351604463, "ce_loss_7": 3.6242997109889985, "epoch": 0.275, "grad_norm": 17.486859359677037, "kl_loss_13": 3626.4, "kl_loss_26": 2208.0, "kl_loss_39": 1712.0, "kl_loss_7": 4417.6, "learning_rate": 0.0008333845002581458, "loss": 5996.5, "step": 2750 }, { "ce_loss_13": 3.2696946620941163, "ce_loss_26": 2.5927825689315798, "ce_loss_39": 2.3452927708625793, "ce_loss_52": 1.4668353974819184, "ce_loss_7": 3.635032969713211, "epoch": 0.276, "grad_norm": 17.043532441273413, "kl_loss_13": 3660.4, "kl_loss_26": 2235.2, "kl_loss_39": 1729.2, "kl_loss_7": 4435.2, "learning_rate": 0.0008322003401586462, "loss": 5989.9, "step": 2760 }, { "ce_loss_13": 3.187331736087799, "ce_loss_26": 2.537285569310188, "ce_loss_39": 2.296916127204895, "ce_loss_52": 1.4428926169872285, "ce_loss_7": 3.55471009016037, "epoch": 0.277, "grad_norm": 18.300410919099367, "kl_loss_13": 3558.4, "kl_loss_26": 2165.8, "kl_loss_39": 1682.4, "kl_loss_7": 4338.0, "learning_rate": 0.0008310128348054094, "loss": 5970.6, "step": 2770 }, { "ce_loss_13": 3.2006444096565247, "ce_loss_26": 2.5205237597227095, "ce_loss_39": 2.282972750067711, "ce_loss_52": 1.4260737136006356, "ce_loss_7": 3.5821855068206787, "epoch": 0.278, "grad_norm": 17.549429821325518, "kl_loss_13": 3649.2, "kl_loss_26": 2206.8, "kl_loss_39": 1708.8, "kl_loss_7": 4441.2, "learning_rate": 0.0008298219961566008, "loss": 5976.9, "step": 2780 }, { "ce_loss_13": 3.1831609129905702, "ce_loss_26": 2.4997926205396652, "ce_loss_39": 2.2601052969694138, "ce_loss_52": 1.403978604078293, "ce_loss_7": 3.553097301721573, "epoch": 0.279, "grad_norm": 17.798788155683486, "kl_loss_13": 3626.8, "kl_loss_26": 2180.8, "kl_loss_39": 1685.6, "kl_loss_7": 4416.0, "learning_rate": 0.0008286278362039527, "loss": 5901.3, "step": 2790 }, { "ce_loss_13": 3.2061972856521606, "ce_loss_26": 2.5348829567432403, "ce_loss_39": 2.29689359664917, "ce_loss_52": 1.4524233996868134, "ce_loss_7": 3.576086735725403, "epoch": 0.28, "grad_norm": 17.920526019853103, "kl_loss_13": 3566.0, "kl_loss_26": 2145.8, "kl_loss_39": 1659.0, "kl_loss_7": 4341.6, "learning_rate": 0.0008274303669726426, "loss": 5875.7, "step": 2800 }, { "ce_loss_13": 3.2895997047424315, "ce_loss_26": 2.6019150614738464, "ce_loss_39": 2.3554980546236037, "ce_loss_52": 1.4626183837652207, "ce_loss_7": 3.680211156606674, "epoch": 0.281, "grad_norm": 18.339396286679403, "kl_loss_13": 3746.0, "kl_loss_26": 2276.4, "kl_loss_39": 1760.4, "kl_loss_7": 4566.0, "learning_rate": 0.0008262296005211721, "loss": 5938.8, "step": 2810 }, { "ce_loss_13": 3.1840124845504763, "ce_loss_26": 2.5105081349611282, "ce_loss_39": 2.2711715549230576, "ce_loss_52": 1.4384785890579224, "ce_loss_7": 3.5735177397727966, "epoch": 0.282, "grad_norm": 17.54448826978921, "kl_loss_13": 3568.4, "kl_loss_26": 2126.0, "kl_loss_39": 1635.8, "kl_loss_7": 4378.4, "learning_rate": 0.0008250255489412463, "loss": 5922.5, "step": 2820 }, { "ce_loss_13": 3.2147216141223907, "ce_loss_26": 2.5306088238954545, "ce_loss_39": 2.2879262059926986, "ce_loss_52": 1.4329912930727005, "ce_loss_7": 3.6062001168727873, "epoch": 0.283, "grad_norm": 16.224521909183913, "kl_loss_13": 3634.4, "kl_loss_26": 2182.2, "kl_loss_39": 1686.4, "kl_loss_7": 4456.0, "learning_rate": 0.0008238182243576511, "loss": 5870.9, "step": 2830 }, { "ce_loss_13": 3.204808014631271, "ce_loss_26": 2.529933416843414, "ce_loss_39": 2.2772836655378343, "ce_loss_52": 1.4382712185382842, "ce_loss_7": 3.593477213382721, "epoch": 0.284, "grad_norm": 17.85408893335792, "kl_loss_13": 3598.4, "kl_loss_26": 2175.4, "kl_loss_39": 1665.8, "kl_loss_7": 4413.6, "learning_rate": 0.0008226076389281315, "loss": 5857.1, "step": 2840 }, { "ce_loss_13": 3.137856882810593, "ce_loss_26": 2.4886497616767884, "ce_loss_39": 2.2476108491420748, "ce_loss_52": 1.4388306617736817, "ce_loss_7": 3.5120018839836122, "epoch": 0.285, "grad_norm": 17.527104476469322, "kl_loss_13": 3468.4, "kl_loss_26": 2085.8, "kl_loss_39": 1594.4, "kl_loss_7": 4250.0, "learning_rate": 0.0008213938048432696, "loss": 5806.8, "step": 2850 }, { "ce_loss_13": 3.134007251262665, "ce_loss_26": 2.4605695813894273, "ce_loss_39": 2.2274533331394197, "ce_loss_52": 1.4070687741041183, "ce_loss_7": 3.51087743639946, "epoch": 0.286, "grad_norm": 16.81428229570175, "kl_loss_13": 3532.8, "kl_loss_26": 2101.0, "kl_loss_39": 1616.2, "kl_loss_7": 4327.2, "learning_rate": 0.0008201767343263612, "loss": 5809.7, "step": 2860 }, { "ce_loss_13": 3.1623673915863035, "ce_loss_26": 2.497657111287117, "ce_loss_39": 2.2517473757267, "ce_loss_52": 1.4269344687461853, "ce_loss_7": 3.543222689628601, "epoch": 0.287, "grad_norm": 16.70636713163243, "kl_loss_13": 3541.2, "kl_loss_26": 2128.4, "kl_loss_39": 1624.8, "kl_loss_7": 4340.0, "learning_rate": 0.0008189564396332927, "loss": 5789.5, "step": 2870 }, { "ce_loss_13": 3.1600242078304293, "ce_loss_26": 2.4901143670082093, "ce_loss_39": 2.262405735254288, "ce_loss_52": 1.447944176197052, "ce_loss_7": 3.5380080163478853, "epoch": 0.288, "grad_norm": 17.918386343579222, "kl_loss_13": 3481.6, "kl_loss_26": 2059.8, "kl_loss_39": 1588.8, "kl_loss_7": 4287.6, "learning_rate": 0.0008177329330524181, "loss": 5812.1, "step": 2880 }, { "ce_loss_13": 3.174392342567444, "ce_loss_26": 2.5159962266683578, "ce_loss_39": 2.2794058710336684, "ce_loss_52": 1.444807243347168, "ce_loss_7": 3.5484564363956452, "epoch": 0.289, "grad_norm": 19.196721110708168, "kl_loss_13": 3507.2, "kl_loss_26": 2102.6, "kl_loss_39": 1624.0, "kl_loss_7": 4290.8, "learning_rate": 0.0008165062269044352, "loss": 5808.4, "step": 2890 }, { "ce_loss_13": 3.1698272943496706, "ce_loss_26": 2.506984257698059, "ce_loss_39": 2.2710763216018677, "ce_loss_52": 1.4491374969482422, "ce_loss_7": 3.553741979598999, "epoch": 0.29, "grad_norm": 16.91235339946444, "kl_loss_13": 3511.2, "kl_loss_26": 2109.2, "kl_loss_39": 1627.2, "kl_loss_7": 4317.6, "learning_rate": 0.0008152763335422613, "loss": 5792.2, "step": 2900 }, { "ce_loss_13": 3.115260285139084, "ce_loss_26": 2.4620468825101853, "ce_loss_39": 2.2224705785512926, "ce_loss_52": 1.4216517835855484, "ce_loss_7": 3.49349564909935, "epoch": 0.291, "grad_norm": 19.52390820528971, "kl_loss_13": 3444.4, "kl_loss_26": 2058.8, "kl_loss_39": 1575.2, "kl_loss_7": 4239.2, "learning_rate": 0.0008140432653509088, "loss": 5744.6, "step": 2910 }, { "ce_loss_13": 3.094448319077492, "ce_loss_26": 2.4242703199386595, "ce_loss_39": 2.1858216524124146, "ce_loss_52": 1.4000759646296501, "ce_loss_7": 3.4697431921958923, "epoch": 0.292, "grad_norm": 16.436813971821554, "kl_loss_13": 3432.0, "kl_loss_26": 2027.8, "kl_loss_39": 1551.4, "kl_loss_7": 4216.4, "learning_rate": 0.0008128070347473608, "loss": 5696.7, "step": 2920 }, { "ce_loss_13": 3.1132335126399995, "ce_loss_26": 2.4610098242759704, "ce_loss_39": 2.2259542405605317, "ce_loss_52": 1.4294333100318908, "ce_loss_7": 3.48697971701622, "epoch": 0.293, "grad_norm": 16.527383629623685, "kl_loss_13": 3448.4, "kl_loss_26": 2043.0, "kl_loss_39": 1562.4, "kl_loss_7": 4237.2, "learning_rate": 0.0008115676541804455, "loss": 5734.0, "step": 2930 }, { "ce_loss_13": 3.0772423684597014, "ce_loss_26": 2.413277891278267, "ce_loss_39": 2.179169711470604, "ce_loss_52": 1.3933877736330031, "ce_loss_7": 3.4470324754714965, "epoch": 0.294, "grad_norm": 17.18835839034016, "kl_loss_13": 3431.6, "kl_loss_26": 2023.0, "kl_loss_39": 1542.8, "kl_loss_7": 4223.6, "learning_rate": 0.0008103251361307119, "loss": 5705.55, "step": 2940 }, { "ce_loss_13": 3.093912643194199, "ce_loss_26": 2.4372138679027557, "ce_loss_39": 2.2068597853183745, "ce_loss_52": 1.4339108556509017, "ce_loss_7": 3.4713816404342652, "epoch": 0.295, "grad_norm": 16.78852919489637, "kl_loss_13": 3395.2, "kl_loss_26": 1998.6, "kl_loss_39": 1533.6, "kl_loss_7": 4186.4, "learning_rate": 0.0008090794931103026, "loss": 5641.3, "step": 2950 }, { "ce_loss_13": 3.1308993637561797, "ce_loss_26": 2.4708112478256226, "ce_loss_39": 2.22915124297142, "ce_loss_52": 1.4374129235744477, "ce_loss_7": 3.4991187393665313, "epoch": 0.296, "grad_norm": 16.768783545647338, "kl_loss_13": 3436.4, "kl_loss_26": 2035.0, "kl_loss_39": 1548.6, "kl_loss_7": 4216.8, "learning_rate": 0.0008078307376628291, "loss": 5645.6, "step": 2960 }, { "ce_loss_13": 3.0990765929222106, "ce_loss_26": 2.452856171131134, "ce_loss_39": 2.21165874004364, "ce_loss_52": 1.4263496309518815, "ce_loss_7": 3.472361743450165, "epoch": 0.297, "grad_norm": 16.652386900592916, "kl_loss_13": 3401.6, "kl_loss_26": 2024.0, "kl_loss_39": 1532.8, "kl_loss_7": 4181.6, "learning_rate": 0.000806578882363245, "loss": 5645.6, "step": 2970 }, { "ce_loss_13": 3.092843067646027, "ce_loss_26": 2.423402965068817, "ce_loss_39": 2.1854313611984253, "ce_loss_52": 1.4033612102270125, "ce_loss_7": 3.474502944946289, "epoch": 0.298, "grad_norm": 17.707727518863294, "kl_loss_13": 3432.4, "kl_loss_26": 2016.2, "kl_loss_39": 1530.6, "kl_loss_7": 4229.2, "learning_rate": 0.0008053239398177191, "loss": 5651.6, "step": 2980 }, { "ce_loss_13": 3.0975674211978914, "ce_loss_26": 2.4374621868133546, "ce_loss_39": 2.209611228108406, "ce_loss_52": 1.4303795397281647, "ce_loss_7": 3.475492590665817, "epoch": 0.299, "grad_norm": 17.746850084884183, "kl_loss_13": 3416.8, "kl_loss_26": 2002.4, "kl_loss_39": 1530.8, "kl_loss_7": 4214.8, "learning_rate": 0.0008040659226635089, "loss": 5630.2, "step": 2990 }, { "ce_loss_13": 3.089507430791855, "ce_loss_26": 2.417942848801613, "ce_loss_39": 2.1790529817342756, "ce_loss_52": 1.3988826781511308, "ce_loss_7": 3.468764144182205, "epoch": 0.3, "grad_norm": 17.0619103802906, "kl_loss_13": 3422.4, "kl_loss_26": 2019.4, "kl_loss_39": 1533.2, "kl_loss_7": 4215.2, "learning_rate": 0.0008028048435688333, "loss": 5562.8, "step": 3000 }, { "ce_loss_13": 3.1326099216938017, "ce_loss_26": 2.471994733810425, "ce_loss_39": 2.2380873382091524, "ce_loss_52": 1.458441223204136, "ce_loss_7": 3.502814435958862, "epoch": 0.301, "grad_norm": 17.222204651935485, "kl_loss_13": 3436.8, "kl_loss_26": 2039.6, "kl_loss_39": 1559.8, "kl_loss_7": 4214.8, "learning_rate": 0.0008015407152327448, "loss": 5664.1, "step": 3010 }, { "ce_loss_13": 3.1731098294258118, "ce_loss_26": 2.504976212978363, "ce_loss_39": 2.260189512372017, "ce_loss_52": 1.4524748474359512, "ce_loss_7": 3.555993539094925, "epoch": 0.302, "grad_norm": 16.72113952146357, "kl_loss_13": 3518.4, "kl_loss_26": 2087.0, "kl_loss_39": 1585.0, "kl_loss_7": 4313.2, "learning_rate": 0.0008002735503850016, "loss": 5589.3, "step": 3020 }, { "ce_loss_13": 3.118546891212463, "ce_loss_26": 2.445932698249817, "ce_loss_39": 2.2121699869632723, "ce_loss_52": 1.4456641212105752, "ce_loss_7": 3.491698741912842, "epoch": 0.303, "grad_norm": 16.572401194454876, "kl_loss_13": 3394.8, "kl_loss_26": 1984.4, "kl_loss_39": 1511.2, "kl_loss_7": 4167.2, "learning_rate": 0.0007990033617859396, "loss": 5580.5, "step": 3030 }, { "ce_loss_13": 3.10419015288353, "ce_loss_26": 2.447344717383385, "ce_loss_39": 2.212172231078148, "ce_loss_52": 1.435505247116089, "ce_loss_7": 3.477084743976593, "epoch": 0.304, "grad_norm": 18.041012229096975, "kl_loss_13": 3420.8, "kl_loss_26": 2017.0, "kl_loss_39": 1525.6, "kl_loss_7": 4208.8, "learning_rate": 0.000797730162226344, "loss": 5556.8, "step": 3040 }, { "ce_loss_13": 3.0508037239313124, "ce_loss_26": 2.3910141468048094, "ce_loss_39": 2.161831411719322, "ce_loss_52": 1.3925497516989709, "ce_loss_7": 3.434678375720978, "epoch": 0.305, "grad_norm": 18.2507403656971, "kl_loss_13": 3360.8, "kl_loss_26": 1961.2, "kl_loss_39": 1491.3, "kl_loss_7": 4164.0, "learning_rate": 0.0007964539645273203, "loss": 5538.7, "step": 3050 }, { "ce_loss_13": 3.143118643760681, "ce_loss_26": 2.503050500154495, "ce_loss_39": 2.269149711728096, "ce_loss_52": 1.4877858996391295, "ce_loss_7": 3.510826712846756, "epoch": 0.306, "grad_norm": 17.20244258760552, "kl_loss_13": 3390.4, "kl_loss_26": 2023.0, "kl_loss_39": 1542.0, "kl_loss_7": 4166.8, "learning_rate": 0.000795174781540165, "loss": 5547.7, "step": 3060 }, { "ce_loss_13": 3.0940939664840696, "ce_loss_26": 2.4377844393253327, "ce_loss_39": 2.2110770642757416, "ce_loss_52": 1.453443130850792, "ce_loss_7": 3.461427628993988, "epoch": 0.307, "grad_norm": 16.19915727951882, "kl_loss_13": 3353.6, "kl_loss_26": 1960.2, "kl_loss_39": 1487.5, "kl_loss_7": 4131.2, "learning_rate": 0.0007938926261462366, "loss": 5534.3, "step": 3070 }, { "ce_loss_13": 3.099123537540436, "ce_loss_26": 2.428412067890167, "ce_loss_39": 2.1952255785465242, "ce_loss_52": 1.4312659561634065, "ce_loss_7": 3.477120190858841, "epoch": 0.308, "grad_norm": 16.90807792363964, "kl_loss_13": 3392.0, "kl_loss_26": 1979.6, "kl_loss_39": 1498.8, "kl_loss_7": 4186.8, "learning_rate": 0.0007926075112568258, "loss": 5523.9, "step": 3080 }, { "ce_loss_13": 3.0900339841842652, "ce_loss_26": 2.429807424545288, "ce_loss_39": 2.1938013613224028, "ce_loss_52": 1.4448837220668793, "ce_loss_7": 3.460611253976822, "epoch": 0.309, "grad_norm": 17.278764306039758, "kl_loss_13": 3363.2, "kl_loss_26": 1959.6, "kl_loss_39": 1476.2, "kl_loss_7": 4153.6, "learning_rate": 0.0007913194498130252, "loss": 5481.8, "step": 3090 }, { "ce_loss_13": 3.0705978155136107, "ce_loss_26": 2.415473333001137, "ce_loss_39": 2.1859600633382796, "ce_loss_52": 1.4387344419956207, "ce_loss_7": 3.4460779249668123, "epoch": 0.31, "grad_norm": 17.744380665090848, "kl_loss_13": 3328.8, "kl_loss_26": 1937.2, "kl_loss_39": 1461.6, "kl_loss_7": 4109.6, "learning_rate": 0.0007900284547855992, "loss": 5494.5, "step": 3100 }, { "ce_loss_13": 3.1120304703712462, "ce_loss_26": 2.437400758266449, "ce_loss_39": 2.1967616409063337, "ce_loss_52": 1.4447504609823227, "ce_loss_7": 3.4870758295059203, "epoch": 0.311, "grad_norm": 17.196298155702294, "kl_loss_13": 3414.8, "kl_loss_26": 1997.2, "kl_loss_39": 1507.4, "kl_loss_7": 4197.6, "learning_rate": 0.0007887345391748532, "loss": 5492.4, "step": 3110 }, { "ce_loss_13": 3.0714461147785186, "ce_loss_26": 2.410064917802811, "ce_loss_39": 2.176686418056488, "ce_loss_52": 1.4264434427022934, "ce_loss_7": 3.440903478860855, "epoch": 0.312, "grad_norm": 17.274723589437542, "kl_loss_13": 3352.8, "kl_loss_26": 1965.6, "kl_loss_39": 1480.0, "kl_loss_7": 4134.0, "learning_rate": 0.0007874377160105036, "loss": 5478.5, "step": 3120 }, { "ce_loss_13": 3.0896170139312744, "ce_loss_26": 2.440164825320244, "ce_loss_39": 2.2094400197267534, "ce_loss_52": 1.4495170325040818, "ce_loss_7": 3.468176656961441, "epoch": 0.313, "grad_norm": 17.297623478505614, "kl_loss_13": 3375.2, "kl_loss_26": 1976.4, "kl_loss_39": 1503.8, "kl_loss_7": 4173.6, "learning_rate": 0.0007861379983515449, "loss": 5461.8, "step": 3130 }, { "ce_loss_13": 3.074652445316315, "ce_loss_26": 2.4188932478427887, "ce_loss_39": 2.1893287271261217, "ce_loss_52": 1.440689930319786, "ce_loss_7": 3.452376401424408, "epoch": 0.314, "grad_norm": 18.06264725868681, "kl_loss_13": 3317.2, "kl_loss_26": 1935.6, "kl_loss_39": 1465.8, "kl_loss_7": 4117.6, "learning_rate": 0.0007848353992861195, "loss": 5464.6, "step": 3140 }, { "ce_loss_13": 3.078055852651596, "ce_loss_26": 2.420557659864426, "ce_loss_39": 2.188734245300293, "ce_loss_52": 1.4398296728730202, "ce_loss_7": 3.457850754261017, "epoch": 0.315, "grad_norm": 17.00206014366557, "kl_loss_13": 3318.4, "kl_loss_26": 1927.6, "kl_loss_39": 1458.6, "kl_loss_7": 4114.0, "learning_rate": 0.0007835299319313853, "loss": 5381.3, "step": 3150 }, { "ce_loss_13": 3.059806948900223, "ce_loss_26": 2.3743002265691757, "ce_loss_39": 2.137469917535782, "ce_loss_52": 1.3903418719768523, "ce_loss_7": 3.4285161972045897, "epoch": 0.316, "grad_norm": 17.46330734643716, "kl_loss_13": 3354.4, "kl_loss_26": 1935.6, "kl_loss_39": 1455.2, "kl_loss_7": 4136.8, "learning_rate": 0.0007822216094333848, "loss": 5407.6, "step": 3160 }, { "ce_loss_13": 3.0990252554416657, "ce_loss_26": 2.4341968923807142, "ce_loss_39": 2.1952569454908373, "ce_loss_52": 1.439093704521656, "ce_loss_7": 3.4743688821792604, "epoch": 0.317, "grad_norm": 18.51989341882003, "kl_loss_13": 3380.4, "kl_loss_26": 1988.0, "kl_loss_39": 1492.0, "kl_loss_7": 4166.4, "learning_rate": 0.0007809104449669101, "loss": 5410.7, "step": 3170 }, { "ce_loss_13": 3.041397601366043, "ce_loss_26": 2.382528102397919, "ce_loss_39": 2.1511587262153626, "ce_loss_52": 1.435599946975708, "ce_loss_7": 3.4155047237873077, "epoch": 0.318, "grad_norm": 17.0483845332083, "kl_loss_13": 3256.8, "kl_loss_26": 1868.0, "kl_loss_39": 1392.8, "kl_loss_7": 4038.0, "learning_rate": 0.0007795964517353734, "loss": 5354.9, "step": 3180 }, { "ce_loss_13": 3.0867488861083983, "ce_loss_26": 2.425813916325569, "ce_loss_39": 2.1947717368602753, "ce_loss_52": 1.4569276213645934, "ce_loss_7": 3.4604012250900267, "epoch": 0.319, "grad_norm": 16.586058341187346, "kl_loss_13": 3328.8, "kl_loss_26": 1930.2, "kl_loss_39": 1449.8, "kl_loss_7": 4120.8, "learning_rate": 0.000778279642970672, "loss": 5344.7, "step": 3190 }, { "ce_loss_13": 3.0399708569049837, "ce_loss_26": 2.3785893470048904, "ce_loss_39": 2.138497656583786, "ce_loss_52": 1.4135656535625458, "ce_loss_7": 3.409178429841995, "epoch": 0.32, "grad_norm": 17.983547533058992, "kl_loss_13": 3304.0, "kl_loss_26": 1914.8, "kl_loss_39": 1428.2, "kl_loss_7": 4088.0, "learning_rate": 0.0007769600319330552, "loss": 5362.0, "step": 3200 }, { "ce_loss_13": 3.1184182286262514, "ce_loss_26": 2.4707882523536684, "ce_loss_39": 2.233083599805832, "ce_loss_52": 1.4817634999752045, "ce_loss_7": 3.4818074285984038, "epoch": 0.321, "grad_norm": 16.87308179941231, "kl_loss_13": 3315.6, "kl_loss_26": 1955.6, "kl_loss_39": 1470.6, "kl_loss_7": 4081.2, "learning_rate": 0.0007756376319109917, "loss": 5372.8, "step": 3210 }, { "ce_loss_13": 3.065703272819519, "ce_loss_26": 2.4234554558992385, "ce_loss_39": 2.1948377937078476, "ce_loss_52": 1.4451974362134934, "ce_loss_7": 3.4377165257930757, "epoch": 0.322, "grad_norm": 17.206055177859767, "kl_loss_13": 3288.4, "kl_loss_26": 1929.2, "kl_loss_39": 1461.2, "kl_loss_7": 4058.8, "learning_rate": 0.0007743124562210351, "loss": 5338.3, "step": 3220 }, { "ce_loss_13": 3.0654458463191987, "ce_loss_26": 2.4097089529037476, "ce_loss_39": 2.1828925907611847, "ce_loss_52": 1.4646209165453912, "ce_loss_7": 3.438837933540344, "epoch": 0.323, "grad_norm": 16.43379975440051, "kl_loss_13": 3246.8, "kl_loss_26": 1868.4, "kl_loss_39": 1403.6, "kl_loss_7": 4034.4, "learning_rate": 0.0007729845182076895, "loss": 5337.95, "step": 3230 }, { "ce_loss_13": 3.019025903940201, "ce_loss_26": 2.3812606751918795, "ce_loss_39": 2.1557460606098173, "ce_loss_52": 1.4495598763227462, "ce_loss_7": 3.390954166650772, "epoch": 0.324, "grad_norm": 17.50409394447208, "kl_loss_13": 3213.2, "kl_loss_26": 1857.6, "kl_loss_39": 1393.2, "kl_loss_7": 3999.2, "learning_rate": 0.0007716538312432765, "loss": 5323.8, "step": 3240 }, { "ce_loss_13": 3.0274185359478, "ce_loss_26": 2.3673853039741517, "ce_loss_39": 2.1314821422100065, "ce_loss_52": 1.4110743701457977, "ce_loss_7": 3.3962999522686004, "epoch": 0.325, "grad_norm": 17.627956174954214, "kl_loss_13": 3281.2, "kl_loss_26": 1900.8, "kl_loss_39": 1419.0, "kl_loss_7": 4056.8, "learning_rate": 0.0007703204087277988, "loss": 5310.9, "step": 3250 }, { "ce_loss_13": 2.995425891876221, "ce_loss_26": 2.3437940657138823, "ce_loss_39": 2.1112417429685593, "ce_loss_52": 1.3956570625305176, "ce_loss_7": 3.3635079681873323, "epoch": 0.326, "grad_norm": 17.092527088154757, "kl_loss_13": 3270.8, "kl_loss_26": 1881.4, "kl_loss_39": 1402.8, "kl_loss_7": 4054.4, "learning_rate": 0.0007689842640888063, "loss": 5291.9, "step": 3260 }, { "ce_loss_13": 3.0584332168102266, "ce_loss_26": 2.4099347323179243, "ce_loss_39": 2.182457607984543, "ce_loss_52": 1.4547152355313302, "ce_loss_7": 3.4290026843547823, "epoch": 0.327, "grad_norm": 17.31361789139729, "kl_loss_13": 3256.0, "kl_loss_26": 1887.8, "kl_loss_39": 1426.4, "kl_loss_7": 4026.4, "learning_rate": 0.0007676454107812607, "loss": 5264.3, "step": 3270 }, { "ce_loss_13": 3.002471148967743, "ce_loss_26": 2.365675774216652, "ce_loss_39": 2.1432500898838045, "ce_loss_52": 1.4313921973109245, "ce_loss_7": 3.3743964791297913, "epoch": 0.328, "grad_norm": 15.793768401685108, "kl_loss_13": 3248.4, "kl_loss_26": 1868.8, "kl_loss_39": 1414.0, "kl_loss_7": 4030.8, "learning_rate": 0.0007663038622873999, "loss": 5285.1, "step": 3280 }, { "ce_loss_13": 3.0830911457538606, "ce_loss_26": 2.4269310742616654, "ce_loss_39": 2.2025650680065154, "ce_loss_52": 1.4690157890319824, "ce_loss_7": 3.454869121313095, "epoch": 0.329, "grad_norm": 17.14503971571328, "kl_loss_13": 3293.2, "kl_loss_26": 1903.8, "kl_loss_39": 1434.0, "kl_loss_7": 4075.6, "learning_rate": 0.0007649596321166025, "loss": 5253.65, "step": 3290 }, { "ce_loss_13": 2.9723230481147764, "ce_loss_26": 2.332389995455742, "ce_loss_39": 2.1094966679811478, "ce_loss_52": 1.4339970767498016, "ce_loss_7": 3.333187943696976, "epoch": 0.33, "grad_norm": 16.633040610576913, "kl_loss_13": 3118.0, "kl_loss_26": 1779.4, "kl_loss_39": 1323.0, "kl_loss_7": 3879.6, "learning_rate": 0.0007636127338052513, "loss": 5233.1, "step": 3300 }, { "ce_loss_13": 2.9914496004581452, "ce_loss_26": 2.3316532552242277, "ce_loss_39": 2.0988477796316145, "ce_loss_52": 1.400558878481388, "ce_loss_7": 3.363678741455078, "epoch": 0.331, "grad_norm": 17.26677566687906, "kl_loss_13": 3257.2, "kl_loss_26": 1858.4, "kl_loss_39": 1375.2, "kl_loss_7": 4042.8, "learning_rate": 0.0007622631809165971, "loss": 5196.15, "step": 3310 }, { "ce_loss_13": 3.064756464958191, "ce_loss_26": 2.422107365727425, "ce_loss_39": 2.192085716128349, "ce_loss_52": 1.4793070062994957, "ce_loss_7": 3.430086314678192, "epoch": 0.332, "grad_norm": 17.330178950251707, "kl_loss_13": 3243.6, "kl_loss_26": 1878.8, "kl_loss_39": 1409.8, "kl_loss_7": 4016.4, "learning_rate": 0.000760910987040623, "loss": 5231.55, "step": 3320 }, { "ce_loss_13": 2.9637326538562774, "ce_loss_26": 2.3162154614925385, "ce_loss_39": 2.097182759642601, "ce_loss_52": 1.4222271725535394, "ce_loss_7": 3.332917684316635, "epoch": 0.333, "grad_norm": 17.085141982864975, "kl_loss_13": 3141.2, "kl_loss_26": 1768.8, "kl_loss_39": 1318.3, "kl_loss_7": 3920.0, "learning_rate": 0.000759556165793906, "loss": 5154.65, "step": 3330 }, { "ce_loss_13": 3.029485374689102, "ce_loss_26": 2.3887200921773912, "ce_loss_39": 2.1607041716575623, "ce_loss_52": 1.4696623742580415, "ce_loss_7": 3.3922773957252503, "epoch": 0.334, "grad_norm": 15.502678294546826, "kl_loss_13": 3185.2, "kl_loss_26": 1826.8, "kl_loss_39": 1360.6, "kl_loss_7": 3948.8, "learning_rate": 0.000758198730819481, "loss": 5180.15, "step": 3340 }, { "ce_loss_13": 3.03846270442009, "ce_loss_26": 2.378660023212433, "ce_loss_39": 2.1477699905633925, "ce_loss_52": 1.4347332805395125, "ce_loss_7": 3.4101031959056853, "epoch": 0.335, "grad_norm": 16.024720398541927, "kl_loss_13": 3270.4, "kl_loss_26": 1883.0, "kl_loss_39": 1406.3, "kl_loss_7": 4056.4, "learning_rate": 0.0007568386957867032, "loss": 5194.3, "step": 3350 }, { "ce_loss_13": 3.008663833141327, "ce_loss_26": 2.3700191140174867, "ce_loss_39": 2.1397975504398348, "ce_loss_52": 1.4545943021774292, "ce_loss_7": 3.3673401892185213, "epoch": 0.336, "grad_norm": 16.053283356705972, "kl_loss_13": 3181.6, "kl_loss_26": 1820.2, "kl_loss_39": 1348.6, "kl_loss_7": 3942.0, "learning_rate": 0.0007554760743911103, "loss": 5153.55, "step": 3360 }, { "ce_loss_13": 2.9829940140247344, "ce_loss_26": 2.3409414261579515, "ce_loss_39": 2.1123215198516845, "ce_loss_52": 1.435232725739479, "ce_loss_7": 3.355481207370758, "epoch": 0.337, "grad_norm": 16.67714591780792, "kl_loss_13": 3152.4, "kl_loss_26": 1792.8, "kl_loss_39": 1321.8, "kl_loss_7": 3938.0, "learning_rate": 0.0007541108803542846, "loss": 5142.8, "step": 3370 }, { "ce_loss_13": 3.027516704797745, "ce_loss_26": 2.386495107412338, "ce_loss_39": 2.1608838021755217, "ce_loss_52": 1.4647808492183685, "ce_loss_7": 3.3936978697776796, "epoch": 0.338, "grad_norm": 16.879054589986723, "kl_loss_13": 3186.4, "kl_loss_26": 1828.6, "kl_loss_39": 1351.9, "kl_loss_7": 3957.6, "learning_rate": 0.0007527431274237149, "loss": 5169.0, "step": 3380 }, { "ce_loss_13": 2.9946301877498627, "ce_loss_26": 2.3546741545200347, "ce_loss_39": 2.1238624840974807, "ce_loss_52": 1.4416128873825074, "ce_loss_7": 3.3628919243812563, "epoch": 0.339, "grad_norm": 18.526031342338438, "kl_loss_13": 3157.6, "kl_loss_26": 1810.6, "kl_loss_39": 1339.1, "kl_loss_7": 3932.0, "learning_rate": 0.0007513728293726579, "loss": 5107.45, "step": 3390 }, { "ce_loss_13": 2.975279802083969, "ce_loss_26": 2.325047069787979, "ce_loss_39": 2.1066873967647552, "ce_loss_52": 1.4382890224456788, "ce_loss_7": 3.3415417432785035, "epoch": 0.34, "grad_norm": 17.516441880993753, "kl_loss_13": 3144.4, "kl_loss_26": 1778.8, "kl_loss_39": 1326.8, "kl_loss_7": 3920.0, "learning_rate": 0.00075, "loss": 5107.0, "step": 3400 }, { "ce_loss_13": 2.9631440460681917, "ce_loss_26": 2.318173348903656, "ce_loss_39": 2.0847090512514113, "ce_loss_52": 1.4200364857912064, "ce_loss_7": 3.3265232741832733, "epoch": 0.341, "grad_norm": 16.134403370320147, "kl_loss_13": 3139.2, "kl_loss_26": 1766.8, "kl_loss_39": 1296.8, "kl_loss_7": 3908.8, "learning_rate": 0.0007486246531301177, "loss": 5097.65, "step": 3410 }, { "ce_loss_13": 2.99331476688385, "ce_loss_26": 2.3580960750579836, "ce_loss_39": 2.1382109016180038, "ce_loss_52": 1.4535871922969819, "ce_loss_7": 3.35606609582901, "epoch": 0.342, "grad_norm": 16.427769884918277, "kl_loss_13": 3148.4, "kl_loss_26": 1796.6, "kl_loss_39": 1349.0, "kl_loss_7": 3921.2, "learning_rate": 0.0007472468026127384, "loss": 5139.75, "step": 3420 }, { "ce_loss_13": 2.921882951259613, "ce_loss_26": 2.2855687588453293, "ce_loss_39": 2.067648893594742, "ce_loss_52": 1.4116598561406135, "ce_loss_7": 3.2927843034267426, "epoch": 0.343, "grad_norm": 16.808808535347794, "kl_loss_13": 3095.8, "kl_loss_26": 1756.2, "kl_loss_39": 1301.9, "kl_loss_7": 3878.0, "learning_rate": 0.000745866462322802, "loss": 5051.15, "step": 3430 }, { "ce_loss_13": 3.04699621796608, "ce_loss_26": 2.399735540151596, "ce_loss_39": 2.176995486021042, "ce_loss_52": 1.496598380804062, "ce_loss_7": 3.418045401573181, "epoch": 0.344, "grad_norm": 16.406856004421037, "kl_loss_13": 3162.0, "kl_loss_26": 1792.4, "kl_loss_39": 1329.5, "kl_loss_7": 3935.6, "learning_rate": 0.0007444836461603195, "loss": 5107.85, "step": 3440 }, { "ce_loss_13": 2.930357199907303, "ce_loss_26": 2.3015194088220596, "ce_loss_39": 2.078587147593498, "ce_loss_52": 1.414345271885395, "ce_loss_7": 3.2914562046527864, "epoch": 0.345, "grad_norm": 16.994168943169583, "kl_loss_13": 3103.6, "kl_loss_26": 1763.0, "kl_loss_39": 1304.8, "kl_loss_7": 3859.2, "learning_rate": 0.0007430983680502344, "loss": 5063.6, "step": 3450 }, { "ce_loss_13": 2.946500468254089, "ce_loss_26": 2.3075433492660524, "ce_loss_39": 2.08428935110569, "ce_loss_52": 1.4254867061972618, "ce_loss_7": 3.309797298908234, "epoch": 0.346, "grad_norm": 16.30494022816115, "kl_loss_13": 3098.0, "kl_loss_26": 1752.4, "kl_loss_39": 1293.6, "kl_loss_7": 3865.2, "learning_rate": 0.0007417106419422819, "loss": 5025.2, "step": 3460 }, { "ce_loss_13": 2.935671639442444, "ce_loss_26": 2.2941204428672792, "ce_loss_39": 2.071269851922989, "ce_loss_52": 1.4085116267204285, "ce_loss_7": 3.3008416891098022, "epoch": 0.347, "grad_norm": 17.30594652330039, "kl_loss_13": 3130.4, "kl_loss_26": 1779.4, "kl_loss_39": 1316.9, "kl_loss_7": 3898.4, "learning_rate": 0.0007403204818108486, "loss": 5043.95, "step": 3470 }, { "ce_loss_13": 2.934108853340149, "ce_loss_26": 2.292804607748985, "ce_loss_39": 2.0716417878866196, "ce_loss_52": 1.4156519144773483, "ce_loss_7": 3.2983541190624237, "epoch": 0.348, "grad_norm": 16.793884082475312, "kl_loss_13": 3072.0, "kl_loss_26": 1727.6, "kl_loss_39": 1271.6, "kl_loss_7": 3841.6, "learning_rate": 0.0007389279016548316, "loss": 5016.3, "step": 3480 }, { "ce_loss_13": 2.8679963111877442, "ce_loss_26": 2.2394334375858307, "ce_loss_39": 2.0239282071590425, "ce_loss_52": 1.3932767808437347, "ce_loss_7": 3.234779417514801, "epoch": 0.349, "grad_norm": 15.946114217155069, "kl_loss_13": 3023.6, "kl_loss_26": 1691.8, "kl_loss_39": 1246.9, "kl_loss_7": 3788.8, "learning_rate": 0.0007375329154974975, "loss": 5018.15, "step": 3490 }, { "ce_loss_13": 2.9285870611667635, "ce_loss_26": 2.287761977314949, "ce_loss_39": 2.063358634710312, "ce_loss_52": 1.4101893305778503, "ce_loss_7": 3.298641562461853, "epoch": 0.35, "grad_norm": 16.997559975693584, "kl_loss_13": 3086.0, "kl_loss_26": 1746.0, "kl_loss_39": 1280.7, "kl_loss_7": 3868.8, "learning_rate": 0.0007361355373863414, "loss": 5018.5, "step": 3500 }, { "ce_loss_13": 2.931669169664383, "ce_loss_26": 2.288780450820923, "ce_loss_39": 2.0723241955041884, "ce_loss_52": 1.4271863222122192, "ce_loss_7": 3.2927916407585145, "epoch": 0.351, "grad_norm": 15.980672586392506, "kl_loss_13": 3077.6, "kl_loss_26": 1726.6, "kl_loss_39": 1278.3, "kl_loss_7": 3843.6, "learning_rate": 0.0007347357813929454, "loss": 4989.7, "step": 3510 }, { "ce_loss_13": 2.928689205646515, "ce_loss_26": 2.2940947294235228, "ce_loss_39": 2.0670880317687987, "ce_loss_52": 1.4149780303239823, "ce_loss_7": 3.293194830417633, "epoch": 0.352, "grad_norm": 16.33808700723808, "kl_loss_13": 3074.8, "kl_loss_26": 1738.8, "kl_loss_39": 1273.7, "kl_loss_7": 3840.0, "learning_rate": 0.0007333336616128369, "loss": 4986.35, "step": 3520 }, { "ce_loss_13": 2.940303909778595, "ce_loss_26": 2.294164848327637, "ce_loss_39": 2.0657031387090683, "ce_loss_52": 1.4308805465698242, "ce_loss_7": 3.3036282479763033, "epoch": 0.353, "grad_norm": 16.29503578207681, "kl_loss_13": 3067.6, "kl_loss_26": 1716.2, "kl_loss_39": 1247.9, "kl_loss_7": 3828.0, "learning_rate": 0.0007319291921653463, "loss": 4998.85, "step": 3530 }, { "ce_loss_13": 2.916954427957535, "ce_loss_26": 2.2819162607192993, "ce_loss_39": 2.0701166808605196, "ce_loss_52": 1.4166931748390197, "ce_loss_7": 3.2759189188480375, "epoch": 0.354, "grad_norm": 17.822669536905938, "kl_loss_13": 3073.6, "kl_loss_26": 1723.6, "kl_loss_39": 1286.2, "kl_loss_7": 3837.2, "learning_rate": 0.0007305223871934656, "loss": 4995.55, "step": 3540 }, { "ce_loss_13": 2.975371015071869, "ce_loss_26": 2.3552831768989564, "ce_loss_39": 2.1292835503816603, "ce_loss_52": 1.4775378912687303, "ce_loss_7": 3.3292273938655854, "epoch": 0.355, "grad_norm": 16.58742338810906, "kl_loss_13": 3054.4, "kl_loss_26": 1739.2, "kl_loss_39": 1279.1, "kl_loss_7": 3808.8, "learning_rate": 0.0007291132608637052, "loss": 4945.1, "step": 3550 }, { "ce_loss_13": 2.972325986623764, "ce_loss_26": 2.3392420560121536, "ce_loss_39": 2.1207040429115294, "ce_loss_52": 1.475459137558937, "ce_loss_7": 3.3383385837078094, "epoch": 0.356, "grad_norm": 16.383650902730043, "kl_loss_13": 3066.4, "kl_loss_26": 1712.2, "kl_loss_39": 1260.8, "kl_loss_7": 3838.8, "learning_rate": 0.0007277018273659516, "loss": 4963.15, "step": 3560 }, { "ce_loss_13": 3.0111460268497465, "ce_loss_26": 2.3827701687812803, "ce_loss_39": 2.1591351449489595, "ce_loss_52": 1.4967001289129258, "ce_loss_7": 3.3728690683841704, "epoch": 0.357, "grad_norm": 16.824381349061323, "kl_loss_13": 3101.2, "kl_loss_26": 1770.6, "kl_loss_39": 1310.2, "kl_loss_7": 3856.8, "learning_rate": 0.0007262881009133242, "loss": 4952.95, "step": 3570 }, { "ce_loss_13": 2.9141285896301268, "ce_loss_26": 2.2805556029081346, "ce_loss_39": 2.065216201543808, "ce_loss_52": 1.4257875666022302, "ce_loss_7": 3.282390242815018, "epoch": 0.358, "grad_norm": 18.058728297237614, "kl_loss_13": 3046.4, "kl_loss_26": 1695.0, "kl_loss_39": 1247.4, "kl_loss_7": 3819.6, "learning_rate": 0.0007248720957420329, "loss": 4964.9, "step": 3580 }, { "ce_loss_13": 2.8828001439571382, "ce_loss_26": 2.253911817073822, "ce_loss_39": 2.030216920375824, "ce_loss_52": 1.40321164727211, "ce_loss_7": 3.251304441690445, "epoch": 0.359, "grad_norm": 16.573678980597606, "kl_loss_13": 3047.2, "kl_loss_26": 1698.2, "kl_loss_39": 1242.3, "kl_loss_7": 3817.6, "learning_rate": 0.0007234538261112341, "loss": 4895.95, "step": 3590 }, { "ce_loss_13": 2.9461396992206574, "ce_loss_26": 2.3021911144256593, "ce_loss_39": 2.0822067111730576, "ce_loss_52": 1.448357391357422, "ce_loss_7": 3.3077784180641174, "epoch": 0.36, "grad_norm": 17.017357286641733, "kl_loss_13": 3060.4, "kl_loss_26": 1708.8, "kl_loss_39": 1253.4, "kl_loss_7": 3833.6, "learning_rate": 0.0007220333063028871, "loss": 4918.35, "step": 3600 }, { "ce_loss_13": 2.846253049373627, "ce_loss_26": 2.224426531791687, "ce_loss_39": 2.0076546490192415, "ce_loss_52": 1.395447552204132, "ce_loss_7": 3.2072394728660583, "epoch": 0.361, "grad_norm": 15.74697405086667, "kl_loss_13": 2978.0, "kl_loss_26": 1658.4, "kl_loss_39": 1211.2, "kl_loss_7": 3738.4, "learning_rate": 0.0007206105506216106, "loss": 4871.3, "step": 3610 }, { "ce_loss_13": 3.0099994122982023, "ce_loss_26": 2.373449808359146, "ce_loss_39": 2.1529267936944962, "ce_loss_52": 1.4900053232908248, "ce_loss_7": 3.3753599405288695, "epoch": 0.362, "grad_norm": 16.970809605735944, "kl_loss_13": 3087.6, "kl_loss_26": 1745.2, "kl_loss_39": 1286.7, "kl_loss_7": 3866.8, "learning_rate": 0.0007191855733945387, "loss": 4947.8, "step": 3620 }, { "ce_loss_13": 2.937187296152115, "ce_loss_26": 2.322328266501427, "ce_loss_39": 2.105859735608101, "ce_loss_52": 1.474419781565666, "ce_loss_7": 3.2971576511859895, "epoch": 0.363, "grad_norm": 17.141982755892812, "kl_loss_13": 3009.6, "kl_loss_26": 1696.8, "kl_loss_39": 1241.1, "kl_loss_7": 3762.4, "learning_rate": 0.0007177583889711762, "loss": 4882.15, "step": 3630 }, { "ce_loss_13": 2.902718555927277, "ce_loss_26": 2.260812908411026, "ce_loss_39": 2.042543429136276, "ce_loss_52": 1.4226751655340195, "ce_loss_7": 3.2698469936847685, "epoch": 0.364, "grad_norm": 17.153862070969048, "kl_loss_13": 3018.8, "kl_loss_26": 1673.0, "kl_loss_39": 1219.0, "kl_loss_7": 3784.4, "learning_rate": 0.0007163290117232541, "loss": 4884.0, "step": 3640 }, { "ce_loss_13": 2.9109850347042086, "ce_loss_26": 2.297417125105858, "ce_loss_39": 2.077428176999092, "ce_loss_52": 1.4550551682710648, "ce_loss_7": 3.268283462524414, "epoch": 0.365, "grad_norm": 16.42744245211514, "kl_loss_13": 2985.2, "kl_loss_26": 1679.2, "kl_loss_39": 1227.9, "kl_loss_7": 3734.0, "learning_rate": 0.0007148974560445859, "loss": 4868.65, "step": 3650 }, { "ce_loss_13": 2.9199238896369932, "ce_loss_26": 2.2848848432302473, "ce_loss_39": 2.060741201043129, "ce_loss_52": 1.4278603106737138, "ce_loss_7": 3.2834209561347962, "epoch": 0.366, "grad_norm": 16.404556741779928, "kl_loss_13": 3024.0, "kl_loss_26": 1686.2, "kl_loss_39": 1230.9, "kl_loss_7": 3786.0, "learning_rate": 0.0007134637363509209, "loss": 4839.5, "step": 3660 }, { "ce_loss_13": 2.9712482690811157, "ce_loss_26": 2.3368860691785813, "ce_loss_39": 2.104892411828041, "ce_loss_52": 1.4633448541164398, "ce_loss_7": 3.332030898332596, "epoch": 0.367, "grad_norm": 15.961228476827497, "kl_loss_13": 3092.4, "kl_loss_26": 1760.2, "kl_loss_39": 1277.5, "kl_loss_7": 3848.0, "learning_rate": 0.0007120278670798009, "loss": 4858.55, "step": 3670 }, { "ce_loss_13": 2.951517391204834, "ce_loss_26": 2.3281659215688704, "ce_loss_39": 2.0995417445898057, "ce_loss_52": 1.4656393617391585, "ce_loss_7": 3.2964209616184235, "epoch": 0.368, "grad_norm": 16.089022609349872, "kl_loss_13": 3003.6, "kl_loss_26": 1696.8, "kl_loss_39": 1232.8, "kl_loss_7": 3745.6, "learning_rate": 0.0007105898626904133, "loss": 4774.9, "step": 3680 }, { "ce_loss_13": 2.870139628648758, "ce_loss_26": 2.2511734038591387, "ce_loss_39": 2.0341389745473863, "ce_loss_52": 1.4250996381044387, "ce_loss_7": 3.2268544733524323, "epoch": 0.369, "grad_norm": 15.247673028968622, "kl_loss_13": 2967.2, "kl_loss_26": 1653.8, "kl_loss_39": 1205.5, "kl_loss_7": 3723.2, "learning_rate": 0.0007091497376634463, "loss": 4807.45, "step": 3690 }, { "ce_loss_13": 2.8762976706027983, "ce_loss_26": 2.256538024544716, "ce_loss_39": 2.043423393368721, "ce_loss_52": 1.4497251689434052, "ce_loss_7": 3.2377980053424835, "epoch": 0.37, "grad_norm": 16.15904103093409, "kl_loss_13": 2914.4, "kl_loss_26": 1609.7, "kl_loss_39": 1170.3, "kl_loss_7": 3672.0, "learning_rate": 0.0007077075065009433, "loss": 4822.75, "step": 3700 }, { "ce_loss_13": 2.865807980298996, "ce_loss_26": 2.2327334135770798, "ce_loss_39": 2.012790763378143, "ce_loss_52": 1.4004584282636643, "ce_loss_7": 3.233772474527359, "epoch": 0.371, "grad_norm": 15.511174434634698, "kl_loss_13": 2980.0, "kl_loss_26": 1666.4, "kl_loss_39": 1214.3, "kl_loss_7": 3742.4, "learning_rate": 0.0007062631837261557, "loss": 4816.1, "step": 3710 }, { "ce_loss_13": 2.903226691484451, "ce_loss_26": 2.2818103432655334, "ce_loss_39": 2.059009611606598, "ce_loss_52": 1.456637406349182, "ce_loss_7": 3.263377320766449, "epoch": 0.372, "grad_norm": 17.120548608123716, "kl_loss_13": 2952.8, "kl_loss_26": 1642.0, "kl_loss_39": 1187.9, "kl_loss_7": 3710.8, "learning_rate": 0.0007048167838833977, "loss": 4745.55, "step": 3720 }, { "ce_loss_13": 2.900358548760414, "ce_loss_26": 2.2638369113206864, "ce_loss_39": 2.043374678492546, "ce_loss_52": 1.4358570337295533, "ce_loss_7": 3.272378832101822, "epoch": 0.373, "grad_norm": 15.762139849070088, "kl_loss_13": 2995.6, "kl_loss_26": 1646.4, "kl_loss_39": 1202.7, "kl_loss_7": 3778.4, "learning_rate": 0.0007033683215379002, "loss": 4819.05, "step": 3730 }, { "ce_loss_13": 2.891742479801178, "ce_loss_26": 2.2577997177839277, "ce_loss_39": 2.042544272542, "ce_loss_52": 1.4357560023665428, "ce_loss_7": 3.2664382100105285, "epoch": 0.374, "grad_norm": 17.991228767593586, "kl_loss_13": 3005.6, "kl_loss_26": 1661.4, "kl_loss_39": 1210.7, "kl_loss_7": 3790.4, "learning_rate": 0.0007019178112756625, "loss": 4801.4, "step": 3740 }, { "ce_loss_13": 2.937167102098465, "ce_loss_26": 2.3048900216817856, "ce_loss_39": 2.077365005016327, "ce_loss_52": 1.4518427148461341, "ce_loss_7": 3.2986050605773927, "epoch": 0.375, "grad_norm": 17.06397612135392, "kl_loss_13": 3048.4, "kl_loss_26": 1714.2, "kl_loss_39": 1240.0, "kl_loss_7": 3808.4, "learning_rate": 0.0007004652677033068, "loss": 4778.45, "step": 3750 }, { "ce_loss_13": 2.953932785987854, "ce_loss_26": 2.3320761770009995, "ce_loss_39": 2.1045148581266404, "ce_loss_52": 1.472703790664673, "ce_loss_7": 3.3274633824825286, "epoch": 0.376, "grad_norm": 16.845736377094994, "kl_loss_13": 3032.0, "kl_loss_26": 1703.8, "kl_loss_39": 1244.7, "kl_loss_7": 3816.0, "learning_rate": 0.0006990107054479312, "loss": 4794.6, "step": 3760 }, { "ce_loss_13": 2.8548416674137114, "ce_loss_26": 2.240122190117836, "ce_loss_39": 2.0189033895730972, "ce_loss_52": 1.4262803480029107, "ce_loss_7": 3.208429366350174, "epoch": 0.377, "grad_norm": 16.84130111884451, "kl_loss_13": 2924.4, "kl_loss_26": 1609.6, "kl_loss_39": 1161.2, "kl_loss_7": 3672.0, "learning_rate": 0.000697554139156961, "loss": 4779.2, "step": 3770 }, { "ce_loss_13": 2.972896063327789, "ce_loss_26": 2.335559439659119, "ce_loss_39": 2.111876127123833, "ce_loss_52": 1.4984043270349503, "ce_loss_7": 3.330926328897476, "epoch": 0.378, "grad_norm": 17.969038221722915, "kl_loss_13": 3002.8, "kl_loss_26": 1674.0, "kl_loss_39": 1211.2, "kl_loss_7": 3762.0, "learning_rate": 0.0006960955834980027, "loss": 4732.4, "step": 3780 }, { "ce_loss_13": 2.863754612207413, "ce_loss_26": 2.228693225979805, "ce_loss_39": 2.0101536750793456, "ce_loss_52": 1.4073660969734192, "ce_loss_7": 3.2303711056709288, "epoch": 0.379, "grad_norm": 15.796823584167846, "kl_loss_13": 2960.8, "kl_loss_26": 1639.0, "kl_loss_39": 1188.6, "kl_loss_7": 3734.4, "learning_rate": 0.0006946350531586958, "loss": 4740.55, "step": 3790 }, { "ce_loss_13": 2.819410902261734, "ce_loss_26": 2.200511318445206, "ce_loss_39": 1.9842332571744918, "ce_loss_52": 1.400177489221096, "ce_loss_7": 3.1923243761062623, "epoch": 0.38, "grad_norm": 17.863959287343352, "kl_loss_13": 2930.0, "kl_loss_26": 1613.6, "kl_loss_39": 1162.1, "kl_loss_7": 3705.2, "learning_rate": 0.0006931725628465643, "loss": 4745.35, "step": 3800 }, { "ce_loss_13": 2.845439475774765, "ce_loss_26": 2.2171025544404985, "ce_loss_39": 1.9986167669296264, "ce_loss_52": 1.4112813830375672, "ce_loss_7": 3.2001422882080077, "epoch": 0.381, "grad_norm": 15.509448386002845, "kl_loss_13": 2924.0, "kl_loss_26": 1603.8, "kl_loss_39": 1151.4, "kl_loss_7": 3677.6, "learning_rate": 0.0006917081272888696, "loss": 4686.25, "step": 3810 }, { "ce_loss_13": 2.875427797436714, "ce_loss_26": 2.2557172268629073, "ce_loss_39": 2.0311311304569246, "ce_loss_52": 1.4279655352234841, "ce_loss_7": 3.230677658319473, "epoch": 0.382, "grad_norm": 17.274488302565285, "kl_loss_13": 2934.0, "kl_loss_26": 1621.0, "kl_loss_39": 1159.3, "kl_loss_7": 3683.6, "learning_rate": 0.0006902417612324615, "loss": 4684.7, "step": 3820 }, { "ce_loss_13": 2.9117272198200226, "ce_loss_26": 2.261174875497818, "ce_loss_39": 2.036722195148468, "ce_loss_52": 1.4152167439460754, "ce_loss_7": 3.282198351621628, "epoch": 0.383, "grad_norm": 17.87083708364157, "kl_loss_13": 3095.2, "kl_loss_26": 1720.4, "kl_loss_39": 1253.4, "kl_loss_7": 3865.2, "learning_rate": 0.00068877347944363, "loss": 4739.15, "step": 3830 }, { "ce_loss_13": 2.8889047384262083, "ce_loss_26": 2.2653014570474626, "ce_loss_39": 2.0420874893665313, "ce_loss_52": 1.4475852727890015, "ce_loss_7": 3.253549599647522, "epoch": 0.384, "grad_norm": 15.6987701916489, "kl_loss_13": 2966.0, "kl_loss_26": 1638.2, "kl_loss_39": 1187.2, "kl_loss_7": 3729.2, "learning_rate": 0.0006873032967079561, "loss": 4730.9, "step": 3840 }, { "ce_loss_13": 2.9057071805000305, "ce_loss_26": 2.2790849953889847, "ce_loss_39": 2.0592786610126494, "ce_loss_52": 1.452454286813736, "ce_loss_7": 3.266382873058319, "epoch": 0.385, "grad_norm": 15.755925332297407, "kl_loss_13": 2962.0, "kl_loss_26": 1636.4, "kl_loss_39": 1179.7, "kl_loss_7": 3722.8, "learning_rate": 0.0006858312278301637, "loss": 4713.7, "step": 3850 }, { "ce_loss_13": 2.8342252016067504, "ce_loss_26": 2.2319850236177445, "ce_loss_39": 2.022706937789917, "ce_loss_52": 1.4418139278888702, "ce_loss_7": 3.186972415447235, "epoch": 0.386, "grad_norm": 17.081089442059948, "kl_loss_13": 2855.2, "kl_loss_26": 1568.0, "kl_loss_39": 1131.2, "kl_loss_7": 3603.2, "learning_rate": 0.0006843572876339704, "loss": 4675.25, "step": 3860 }, { "ce_loss_13": 2.7886572241783143, "ce_loss_26": 2.173486915230751, "ce_loss_39": 1.9662895441055297, "ce_loss_52": 1.3961340665817261, "ce_loss_7": 3.1484048068523407, "epoch": 0.387, "grad_norm": 18.57744828916969, "kl_loss_13": 2842.0, "kl_loss_26": 1551.8, "kl_loss_39": 1125.9, "kl_loss_7": 3587.2, "learning_rate": 0.0006828814909619373, "loss": 4659.8, "step": 3870 }, { "ce_loss_13": 2.84233677983284, "ce_loss_26": 2.2270043969154356, "ce_loss_39": 2.011353349685669, "ce_loss_52": 1.44394671022892, "ce_loss_7": 3.189998263120651, "epoch": 0.388, "grad_norm": 17.116859396660736, "kl_loss_13": 2866.4, "kl_loss_26": 1581.4, "kl_loss_39": 1130.5, "kl_loss_7": 3602.4, "learning_rate": 0.0006814038526753205, "loss": 4652.3, "step": 3880 }, { "ce_loss_13": 2.8899350225925446, "ce_loss_26": 2.268605652451515, "ce_loss_39": 2.047902289032936, "ce_loss_52": 1.462986382842064, "ce_loss_7": 3.2532753586769103, "epoch": 0.389, "grad_norm": 16.277065053757138, "kl_loss_13": 2901.6, "kl_loss_26": 1603.8, "kl_loss_39": 1148.8, "kl_loss_7": 3655.2, "learning_rate": 0.0006799243876539213, "loss": 4644.45, "step": 3890 }, { "ce_loss_13": 2.852635699510574, "ce_loss_26": 2.225254198908806, "ce_loss_39": 2.00534345805645, "ce_loss_52": 1.420480152964592, "ce_loss_7": 3.217593324184418, "epoch": 0.39, "grad_norm": 17.575618857452827, "kl_loss_13": 2895.2, "kl_loss_26": 1582.6, "kl_loss_39": 1134.8, "kl_loss_7": 3662.8, "learning_rate": 0.0006784431107959359, "loss": 4640.8, "step": 3900 }, { "ce_loss_13": 2.9095449209213258, "ce_loss_26": 2.288859358429909, "ce_loss_39": 2.069254148006439, "ce_loss_52": 1.4762457937002182, "ce_loss_7": 3.2724156618118285, "epoch": 0.391, "grad_norm": 15.314925266098216, "kl_loss_13": 2939.6, "kl_loss_26": 1620.2, "kl_loss_39": 1162.8, "kl_loss_7": 3702.8, "learning_rate": 0.0006769600370178059, "loss": 4625.75, "step": 3910 }, { "ce_loss_13": 2.79736613035202, "ce_loss_26": 2.1872033685445786, "ce_loss_39": 1.9660126984119415, "ce_loss_52": 1.3993165016174316, "ce_loss_7": 3.152447110414505, "epoch": 0.392, "grad_norm": 15.234701615575748, "kl_loss_13": 2856.0, "kl_loss_26": 1574.6, "kl_loss_39": 1119.8, "kl_loss_7": 3607.6, "learning_rate": 0.0006754751812540679, "loss": 4587.85, "step": 3920 }, { "ce_loss_13": 2.8410171031951905, "ce_loss_26": 2.2249913841485975, "ce_loss_39": 2.0135372936725617, "ce_loss_52": 1.4371111243963242, "ce_loss_7": 3.2084967494010925, "epoch": 0.393, "grad_norm": 16.62173105303993, "kl_loss_13": 2885.6, "kl_loss_26": 1588.2, "kl_loss_39": 1146.8, "kl_loss_7": 3644.4, "learning_rate": 0.0006739885584572025, "loss": 4635.2, "step": 3930 }, { "ce_loss_13": 2.7806951224803926, "ce_loss_26": 2.1756977647542954, "ce_loss_39": 1.96949442923069, "ce_loss_52": 1.415724617242813, "ce_loss_7": 3.1287400901317595, "epoch": 0.394, "grad_norm": 15.878619218635833, "kl_loss_13": 2836.2, "kl_loss_26": 1541.8, "kl_loss_39": 1104.9, "kl_loss_7": 3581.6, "learning_rate": 0.0006725001835974853, "loss": 4637.75, "step": 3940 }, { "ce_loss_13": 2.85609056353569, "ce_loss_26": 2.228466436266899, "ce_loss_39": 2.011217701435089, "ce_loss_52": 1.4336451053619386, "ce_loss_7": 3.212037581205368, "epoch": 0.395, "grad_norm": 15.588059225669095, "kl_loss_13": 2892.8, "kl_loss_26": 1574.8, "kl_loss_39": 1125.7, "kl_loss_7": 3657.6, "learning_rate": 0.0006710100716628344, "loss": 4584.95, "step": 3950 }, { "ce_loss_13": 2.820618736743927, "ce_loss_26": 2.1797895193099976, "ce_loss_39": 1.9612275928258895, "ce_loss_52": 1.3932116001844406, "ce_loss_7": 3.1924599528312685, "epoch": 0.396, "grad_norm": 14.878251588185849, "kl_loss_13": 2911.2, "kl_loss_26": 1556.2, "kl_loss_39": 1114.5, "kl_loss_7": 3694.0, "learning_rate": 0.0006695182376586602, "loss": 4607.1, "step": 3960 }, { "ce_loss_13": 2.7754017412662506, "ce_loss_26": 2.1572470903396606, "ce_loss_39": 1.9344938546419144, "ce_loss_52": 1.3711352616548538, "ce_loss_7": 3.1346897959709166, "epoch": 0.397, "grad_norm": 15.39943522658609, "kl_loss_13": 2875.2, "kl_loss_26": 1575.1, "kl_loss_39": 1124.5, "kl_loss_7": 3635.6, "learning_rate": 0.000668024696607715, "loss": 4546.3, "step": 3970 }, { "ce_loss_13": 2.7410697996616364, "ce_loss_26": 2.1528750866651536, "ce_loss_39": 1.944345197081566, "ce_loss_52": 1.4029324680566788, "ce_loss_7": 3.0945769369602205, "epoch": 0.398, "grad_norm": 16.69493947597699, "kl_loss_13": 2742.0, "kl_loss_26": 1499.6, "kl_loss_39": 1066.1, "kl_loss_7": 3478.0, "learning_rate": 0.0006665294635499404, "loss": 4509.25, "step": 3980 }, { "ce_loss_13": 2.7935349524021147, "ce_loss_26": 2.191756248474121, "ce_loss_39": 1.9830526530742645, "ce_loss_52": 1.4325652569532394, "ce_loss_7": 3.150054842233658, "epoch": 0.399, "grad_norm": 15.984763021073704, "kl_loss_13": 2764.0, "kl_loss_26": 1503.8, "kl_loss_39": 1075.4, "kl_loss_7": 3508.8, "learning_rate": 0.0006650325535423167, "loss": 4542.85, "step": 3990 }, { "ce_loss_13": 2.7841295659542085, "ce_loss_26": 2.175816202163696, "ce_loss_39": 1.9610484838485718, "ce_loss_52": 1.3994766443967819, "ce_loss_7": 3.1450257122516634, "epoch": 0.4, "grad_norm": 16.383690879711693, "kl_loss_13": 2832.8, "kl_loss_26": 1534.6, "kl_loss_39": 1101.1, "kl_loss_7": 3587.2, "learning_rate": 0.0006635339816587109, "loss": 4584.95, "step": 4000 }, { "ce_loss_13": 2.937473142147064, "ce_loss_26": 2.298046553134918, "ce_loss_39": 2.071186339855194, "ce_loss_52": 1.4680579513311387, "ce_loss_7": 3.2991883754730225, "epoch": 0.401, "grad_norm": 16.69896458470603, "kl_loss_13": 2974.0, "kl_loss_26": 1650.8, "kl_loss_39": 1187.0, "kl_loss_7": 3734.8, "learning_rate": 0.0006620337629897252, "loss": 4574.8, "step": 4010 }, { "ce_loss_13": 2.803048574924469, "ce_loss_26": 2.1910858035087584, "ce_loss_39": 1.977920189499855, "ce_loss_52": 1.4274337738752365, "ce_loss_7": 3.1627039849758147, "epoch": 0.402, "grad_norm": 15.21058574655926, "kl_loss_13": 2803.0, "kl_loss_26": 1508.9, "kl_loss_39": 1074.1, "kl_loss_7": 3558.8, "learning_rate": 0.0006605319126425454, "loss": 4546.4, "step": 4020 }, { "ce_loss_13": 2.8307320177555084, "ce_loss_26": 2.208324944972992, "ce_loss_39": 1.9950761079788208, "ce_loss_52": 1.435056920349598, "ce_loss_7": 3.19031218290329, "epoch": 0.403, "grad_norm": 14.837343102998657, "kl_loss_13": 2876.0, "kl_loss_26": 1550.3, "kl_loss_39": 1112.9, "kl_loss_7": 3638.4, "learning_rate": 0.0006590284457407876, "loss": 4535.35, "step": 4030 }, { "ce_loss_13": 2.8277206301689146, "ce_loss_26": 2.2229607343673705, "ce_loss_39": 2.0126491367816923, "ce_loss_52": 1.465662133693695, "ce_loss_7": 3.178615337610245, "epoch": 0.404, "grad_norm": 15.868817769840305, "kl_loss_13": 2801.6, "kl_loss_26": 1514.6, "kl_loss_39": 1078.5, "kl_loss_7": 3548.0, "learning_rate": 0.0006575233774243465, "loss": 4524.1, "step": 4040 }, { "ce_loss_13": 2.741392558813095, "ce_loss_26": 2.1182916700839995, "ce_loss_39": 1.9061576217412948, "ce_loss_52": 1.3709532082080842, "ce_loss_7": 3.1065491139888763, "epoch": 0.405, "grad_norm": 16.502947013390255, "kl_loss_13": 2798.4, "kl_loss_26": 1495.2, "kl_loss_39": 1058.1, "kl_loss_7": 3565.6, "learning_rate": 0.0006560167228492435, "loss": 4528.6, "step": 4050 }, { "ce_loss_13": 2.8996002614498138, "ce_loss_26": 2.271700030565262, "ce_loss_39": 2.045673191547394, "ce_loss_52": 1.4674718797206878, "ce_loss_7": 3.2622067093849183, "epoch": 0.406, "grad_norm": 15.215707475527795, "kl_loss_13": 2900.0, "kl_loss_26": 1589.6, "kl_loss_39": 1131.4, "kl_loss_7": 3660.8, "learning_rate": 0.0006545084971874737, "loss": 4547.15, "step": 4060 }, { "ce_loss_13": 2.8251163959503174, "ce_loss_26": 2.1874846637248995, "ce_loss_39": 1.9672167718410491, "ce_loss_52": 1.4135777831077576, "ce_loss_7": 3.1873776078224183, "epoch": 0.407, "grad_norm": 15.755939255613459, "kl_loss_13": 2866.0, "kl_loss_26": 1547.6, "kl_loss_39": 1092.9, "kl_loss_7": 3627.6, "learning_rate": 0.0006529987156268526, "loss": 4503.1, "step": 4070 }, { "ce_loss_13": 2.7349390149116517, "ce_loss_26": 2.1141091108322145, "ce_loss_39": 1.909931591153145, "ce_loss_52": 1.3686757802963256, "ce_loss_7": 3.0966077923774717, "epoch": 0.408, "grad_norm": 15.787212276524022, "kl_loss_13": 2801.6, "kl_loss_26": 1509.4, "kl_loss_39": 1071.0, "kl_loss_7": 3562.8, "learning_rate": 0.0006514873933708637, "loss": 4534.05, "step": 4080 }, { "ce_loss_13": 2.742733418941498, "ce_loss_26": 2.1391125679016114, "ce_loss_39": 1.9272442519664765, "ce_loss_52": 1.387654460966587, "ce_loss_7": 3.0977914452552797, "epoch": 0.409, "grad_norm": 15.727797591546214, "kl_loss_13": 2755.6, "kl_loss_26": 1488.4, "kl_loss_39": 1050.3, "kl_loss_7": 3508.0, "learning_rate": 0.0006499745456385053, "loss": 4444.65, "step": 4090 }, { "ce_loss_13": 2.7960755199193956, "ce_loss_26": 2.184322661161423, "ce_loss_39": 1.9677571415901185, "ce_loss_52": 1.4271342948079109, "ce_loss_7": 3.1514409124851226, "epoch": 0.41, "grad_norm": 15.52426613691677, "kl_loss_13": 2809.8, "kl_loss_26": 1518.3, "kl_loss_39": 1075.2, "kl_loss_7": 3551.6, "learning_rate": 0.0006484601876641375, "loss": 4500.65, "step": 4100 }, { "ce_loss_13": 2.8776713728904726, "ce_loss_26": 2.257500499486923, "ce_loss_39": 2.0303492128849028, "ce_loss_52": 1.4582158356904984, "ce_loss_7": 3.2387999415397646, "epoch": 0.411, "grad_norm": 15.93298743678484, "kl_loss_13": 2878.8, "kl_loss_26": 1576.0, "kl_loss_39": 1115.6, "kl_loss_7": 3640.8, "learning_rate": 0.000646944334697328, "loss": 4470.55, "step": 4110 }, { "ce_loss_13": 2.802631789445877, "ce_loss_26": 2.2029493927955626, "ce_loss_39": 2.001139259338379, "ce_loss_52": 1.4623139530420304, "ce_loss_7": 3.155901437997818, "epoch": 0.412, "grad_norm": 14.691054390726734, "kl_loss_13": 2720.8, "kl_loss_26": 1465.2, "kl_loss_39": 1041.4, "kl_loss_7": 3462.4, "learning_rate": 0.0006454270020026995, "loss": 4502.65, "step": 4120 }, { "ce_loss_13": 2.8162184596061706, "ce_loss_26": 2.1934009909629824, "ce_loss_39": 1.979950374364853, "ce_loss_52": 1.4344559267163277, "ce_loss_7": 3.1758966505527497, "epoch": 0.413, "grad_norm": 16.25780643806628, "kl_loss_13": 2816.0, "kl_loss_26": 1518.6, "kl_loss_39": 1077.6, "kl_loss_7": 3573.6, "learning_rate": 0.0006439082048597755, "loss": 4487.45, "step": 4130 }, { "ce_loss_13": 2.787912631034851, "ce_loss_26": 2.1966257959604265, "ce_loss_39": 1.9914580851793289, "ce_loss_52": 1.4511510521173476, "ce_loss_7": 3.1392914772033693, "epoch": 0.414, "grad_norm": 17.37704963704925, "kl_loss_13": 2734.8, "kl_loss_26": 1487.2, "kl_loss_39": 1057.4, "kl_loss_7": 3474.0, "learning_rate": 0.0006423879585628261, "loss": 4448.15, "step": 4140 }, { "ce_loss_13": 2.817258411645889, "ce_loss_26": 2.1947576314210893, "ce_loss_39": 1.9762789696455, "ce_loss_52": 1.433014589548111, "ce_loss_7": 3.182687884569168, "epoch": 0.415, "grad_norm": 15.35502556975723, "kl_loss_13": 2826.8, "kl_loss_26": 1522.0, "kl_loss_39": 1072.5, "kl_loss_7": 3595.2, "learning_rate": 0.0006408662784207149, "loss": 4433.75, "step": 4150 }, { "ce_loss_13": 2.817685341835022, "ce_loss_26": 2.2071537256240843, "ce_loss_39": 1.9907894372940063, "ce_loss_52": 1.4230278193950654, "ce_loss_7": 3.1795800507068632, "epoch": 0.416, "grad_norm": 15.573867614749913, "kl_loss_13": 2866.0, "kl_loss_26": 1558.6, "kl_loss_39": 1107.2, "kl_loss_7": 3632.0, "learning_rate": 0.0006393431797567439, "loss": 4436.3, "step": 4160 }, { "ce_loss_13": 2.819452613592148, "ce_loss_26": 2.213544499874115, "ce_loss_39": 1.9934939831495284, "ce_loss_52": 1.4420817136764525, "ce_loss_7": 3.1729123532772063, "epoch": 0.417, "grad_norm": 15.840337845359416, "kl_loss_13": 2809.4, "kl_loss_26": 1533.4, "kl_loss_39": 1076.0, "kl_loss_7": 3544.0, "learning_rate": 0.0006378186779084996, "loss": 4429.6, "step": 4170 }, { "ce_loss_13": 2.797993552684784, "ce_loss_26": 2.2015393495559694, "ce_loss_39": 1.986987265944481, "ce_loss_52": 1.446770191192627, "ce_loss_7": 3.145763796567917, "epoch": 0.418, "grad_norm": 16.258575254109445, "kl_loss_13": 2768.4, "kl_loss_26": 1520.4, "kl_loss_39": 1076.5, "kl_loss_7": 3502.0, "learning_rate": 0.0006362927882276989, "loss": 4452.8, "step": 4180 }, { "ce_loss_13": 2.809996685385704, "ce_loss_26": 2.1883741706609725, "ce_loss_39": 1.972084966301918, "ce_loss_52": 1.4272316336631774, "ce_loss_7": 3.1641923069953917, "epoch": 0.419, "grad_norm": 17.021132117568744, "kl_loss_13": 2806.8, "kl_loss_26": 1522.7, "kl_loss_39": 1076.0, "kl_loss_7": 3556.4, "learning_rate": 0.000634765526080034, "loss": 4434.25, "step": 4190 }, { "ce_loss_13": 2.7747348487377166, "ce_loss_26": 2.1618224531412125, "ce_loss_39": 1.9505164802074433, "ce_loss_52": 1.4064817115664483, "ce_loss_7": 3.1292604207992554, "epoch": 0.42, "grad_norm": 15.556302486325128, "kl_loss_13": 2777.6, "kl_loss_26": 1495.4, "kl_loss_39": 1055.2, "kl_loss_7": 3523.2, "learning_rate": 0.0006332369068450174, "loss": 4413.55, "step": 4200 }, { "ce_loss_13": 2.748269832134247, "ce_loss_26": 2.145698443055153, "ce_loss_39": 1.935601145029068, "ce_loss_52": 1.4105115324258803, "ce_loss_7": 3.1001435458660125, "epoch": 0.421, "grad_norm": 15.348610438295403, "kl_loss_13": 2742.8, "kl_loss_26": 1480.0, "kl_loss_39": 1039.8, "kl_loss_7": 3490.0, "learning_rate": 0.0006317069459158283, "loss": 4363.8, "step": 4210 }, { "ce_loss_13": 2.7747100263834, "ce_loss_26": 2.16818388402462, "ce_loss_39": 1.9505507349967957, "ce_loss_52": 1.4193186193704606, "ce_loss_7": 3.136371600627899, "epoch": 0.422, "grad_norm": 16.358740351868324, "kl_loss_13": 2764.6, "kl_loss_26": 1481.3, "kl_loss_39": 1040.1, "kl_loss_7": 3516.8, "learning_rate": 0.0006301756586991561, "loss": 4421.65, "step": 4220 }, { "ce_loss_13": 2.8185549050569536, "ce_loss_26": 2.226038011908531, "ce_loss_39": 2.013939729332924, "ce_loss_52": 1.4788149103522301, "ce_loss_7": 3.1706756830215452, "epoch": 0.423, "grad_norm": 14.82164626530813, "kl_loss_13": 2758.0, "kl_loss_26": 1495.6, "kl_loss_39": 1059.3, "kl_loss_7": 3503.2, "learning_rate": 0.0006286430606150459, "loss": 4398.35, "step": 4230 }, { "ce_loss_13": 2.7891676902770994, "ce_loss_26": 2.1986444026231764, "ce_loss_39": 1.9819349884986877, "ce_loss_52": 1.4562569051980971, "ce_loss_7": 3.1411671698093415, "epoch": 0.424, "grad_norm": 15.535941880253773, "kl_loss_13": 2717.2, "kl_loss_26": 1468.6, "kl_loss_39": 1020.2, "kl_loss_7": 3457.2, "learning_rate": 0.0006271091670967436, "loss": 4370.45, "step": 4240 }, { "ce_loss_13": 2.8151471495628355, "ce_loss_26": 2.204052150249481, "ce_loss_39": 1.9960095703601837, "ce_loss_52": 1.45780867934227, "ce_loss_7": 3.1626071453094484, "epoch": 0.425, "grad_norm": 16.39177349451075, "kl_loss_13": 2749.2, "kl_loss_26": 1471.2, "kl_loss_39": 1041.9, "kl_loss_7": 3492.8, "learning_rate": 0.0006255739935905395, "loss": 4354.95, "step": 4250 }, { "ce_loss_13": 2.7719932794570923, "ce_loss_26": 2.1723096281290055, "ce_loss_39": 1.9554951965808869, "ce_loss_52": 1.4198345810174942, "ce_loss_7": 3.134742945432663, "epoch": 0.426, "grad_norm": 17.215386749382045, "kl_loss_13": 2775.6, "kl_loss_26": 1506.6, "kl_loss_39": 1055.7, "kl_loss_7": 3532.8, "learning_rate": 0.0006240375555556145, "loss": 4360.8, "step": 4260 }, { "ce_loss_13": 2.7217872977256774, "ce_loss_26": 2.1173421651124955, "ce_loss_39": 1.9085008651018143, "ce_loss_52": 1.400168927013874, "ce_loss_7": 3.0799288749694824, "epoch": 0.427, "grad_norm": 15.867423276307166, "kl_loss_13": 2701.0, "kl_loss_26": 1432.6, "kl_loss_39": 996.7, "kl_loss_7": 3452.8, "learning_rate": 0.000622499868463882, "loss": 4320.5, "step": 4270 }, { "ce_loss_13": 2.7815617978572846, "ce_loss_26": 2.1786680042743685, "ce_loss_39": 1.9648784220218658, "ce_loss_52": 1.4438522070646287, "ce_loss_7": 3.1414669275283815, "epoch": 0.428, "grad_norm": 16.86028992899928, "kl_loss_13": 2733.2, "kl_loss_26": 1463.0, "kl_loss_39": 1028.0, "kl_loss_7": 3484.8, "learning_rate": 0.0006209609477998338, "loss": 4348.9, "step": 4280 }, { "ce_loss_13": 2.8184913277626036, "ce_loss_26": 2.213253751397133, "ce_loss_39": 1.986603057384491, "ce_loss_52": 1.4555893182754516, "ce_loss_7": 3.1685641705989838, "epoch": 0.429, "grad_norm": 15.40477364702056, "kl_loss_13": 2779.6, "kl_loss_26": 1503.0, "kl_loss_39": 1049.7, "kl_loss_7": 3514.4, "learning_rate": 0.0006194208090603844, "loss": 4374.7, "step": 4290 }, { "ce_loss_13": 2.726405268907547, "ce_loss_26": 2.1394855052232744, "ce_loss_39": 1.9364097625017167, "ce_loss_52": 1.4365313708782197, "ce_loss_7": 3.0755336761474608, "epoch": 0.43, "grad_norm": 14.784393649721942, "kl_loss_13": 2680.0, "kl_loss_26": 1434.2, "kl_loss_39": 1002.6, "kl_loss_7": 3415.2, "learning_rate": 0.0006178794677547138, "loss": 4325.15, "step": 4300 }, { "ce_loss_13": 2.78907487988472, "ce_loss_26": 2.1874548703432084, "ce_loss_39": 1.9674001038074493, "ce_loss_52": 1.4388054758310318, "ce_loss_7": 3.1580194234848022, "epoch": 0.431, "grad_norm": 15.540150658114959, "kl_loss_13": 2772.4, "kl_loss_26": 1489.8, "kl_loss_39": 1036.1, "kl_loss_7": 3534.8, "learning_rate": 0.0006163369394041111, "loss": 4337.1, "step": 4310 }, { "ce_loss_13": 2.7502326130867005, "ce_loss_26": 2.1552721470594407, "ce_loss_39": 1.9502787470817566, "ce_loss_52": 1.4348126232624054, "ce_loss_7": 3.1085386633872987, "epoch": 0.432, "grad_norm": 15.900486211327715, "kl_loss_13": 2709.8, "kl_loss_26": 1438.4, "kl_loss_39": 1010.4, "kl_loss_7": 3455.6, "learning_rate": 0.0006147932395418205, "loss": 4308.0, "step": 4320 }, { "ce_loss_13": 2.7637496650218965, "ce_loss_26": 2.1625583559274673, "ce_loss_39": 1.947121372818947, "ce_loss_52": 1.4198297888040543, "ce_loss_7": 3.1241161942481996, "epoch": 0.433, "grad_norm": 16.260371827994177, "kl_loss_13": 2733.2, "kl_loss_26": 1467.8, "kl_loss_39": 1033.8, "kl_loss_7": 3485.2, "learning_rate": 0.0006132483837128823, "loss": 4327.3, "step": 4330 }, { "ce_loss_13": 2.780191105604172, "ce_loss_26": 2.1823483228683473, "ce_loss_39": 1.9749175161123276, "ce_loss_52": 1.4566338241100312, "ce_loss_7": 3.142491656541824, "epoch": 0.434, "grad_norm": 16.173065879753995, "kl_loss_13": 2713.6, "kl_loss_26": 1446.4, "kl_loss_39": 1012.3, "kl_loss_7": 3465.6, "learning_rate": 0.0006117023874739772, "loss": 4346.0, "step": 4340 }, { "ce_loss_13": 2.756999599933624, "ce_loss_26": 2.151958614587784, "ce_loss_39": 1.9352585464715957, "ce_loss_52": 1.4167816311120986, "ce_loss_7": 3.1229954183101656, "epoch": 0.435, "grad_norm": 16.656646084830363, "kl_loss_13": 2759.6, "kl_loss_26": 1478.0, "kl_loss_39": 1029.0, "kl_loss_7": 3524.0, "learning_rate": 0.0006101552663932703, "loss": 4336.25, "step": 4350 }, { "ce_loss_13": 2.774202525615692, "ce_loss_26": 2.172477602958679, "ce_loss_39": 1.9620429188013078, "ce_loss_52": 1.43767509162426, "ce_loss_7": 3.1362563192844393, "epoch": 0.436, "grad_norm": 16.067338284310296, "kl_loss_13": 2744.4, "kl_loss_26": 1472.4, "kl_loss_39": 1033.1, "kl_loss_7": 3493.6, "learning_rate": 0.0006086070360502539, "loss": 4296.35, "step": 4360 }, { "ce_loss_13": 2.787889677286148, "ce_loss_26": 2.208648791909218, "ce_loss_39": 1.999781733751297, "ce_loss_52": 1.4855108827352523, "ce_loss_7": 3.1249564945697785, "epoch": 0.437, "grad_norm": 15.78991831926034, "kl_loss_13": 2690.0, "kl_loss_26": 1446.0, "kl_loss_39": 1014.8, "kl_loss_7": 3419.2, "learning_rate": 0.0006070577120355903, "loss": 4280.75, "step": 4370 }, { "ce_loss_13": 2.8026595056056975, "ce_loss_26": 2.207309713959694, "ce_loss_39": 2.0008264780044556, "ce_loss_52": 1.4935471057891845, "ce_loss_7": 3.1493531346321104, "epoch": 0.438, "grad_norm": 15.837154081953376, "kl_loss_13": 2679.6, "kl_loss_26": 1429.2, "kl_loss_39": 1001.1, "kl_loss_7": 3413.2, "learning_rate": 0.0006055073099509549, "loss": 4296.35, "step": 4380 }, { "ce_loss_13": 2.755897510051727, "ce_loss_26": 2.1693040400743486, "ce_loss_39": 1.9623985677957534, "ce_loss_52": 1.4462745368480683, "ce_loss_7": 3.1059607326984406, "epoch": 0.439, "grad_norm": 15.629443906703631, "kl_loss_13": 2694.4, "kl_loss_26": 1446.6, "kl_loss_39": 1012.9, "kl_loss_7": 3427.6, "learning_rate": 0.0006039558454088796, "loss": 4277.25, "step": 4390 }, { "ce_loss_13": 2.7673678040504455, "ce_loss_26": 2.159538361430168, "ce_loss_39": 1.9505891352891922, "ce_loss_52": 1.4304020568728446, "ce_loss_7": 3.1244628012180327, "epoch": 0.44, "grad_norm": 15.403089942991496, "kl_loss_13": 2740.4, "kl_loss_26": 1465.4, "kl_loss_39": 1024.9, "kl_loss_7": 3482.4, "learning_rate": 0.0006024033340325954, "loss": 4300.2, "step": 4400 }, { "ce_loss_13": 2.7479640781879424, "ce_loss_26": 2.1436998754739762, "ce_loss_39": 1.9348597198724746, "ce_loss_52": 1.4162754774093629, "ce_loss_7": 3.1057413816452026, "epoch": 0.441, "grad_norm": 16.11204916554698, "kl_loss_13": 2726.0, "kl_loss_26": 1457.7, "kl_loss_39": 1024.6, "kl_loss_7": 3474.8, "learning_rate": 0.0006008497914558743, "loss": 4264.9, "step": 4410 }, { "ce_loss_13": 2.781216788291931, "ce_loss_26": 2.1821627736091616, "ce_loss_39": 1.9793646305799484, "ce_loss_52": 1.456499743461609, "ce_loss_7": 3.1415066480636598, "epoch": 0.442, "grad_norm": 15.839943843413481, "kl_loss_13": 2703.2, "kl_loss_26": 1453.2, "kl_loss_39": 1022.0, "kl_loss_7": 3462.8, "learning_rate": 0.0005992952333228728, "loss": 4320.7, "step": 4420 }, { "ce_loss_13": 2.6314639270305635, "ce_loss_26": 2.0378955364227296, "ce_loss_39": 1.841288161277771, "ce_loss_52": 1.367735171318054, "ce_loss_7": 2.983852916955948, "epoch": 0.443, "grad_norm": 15.715297314159198, "kl_loss_13": 2586.8, "kl_loss_26": 1338.4, "kl_loss_39": 929.4, "kl_loss_7": 3328.4, "learning_rate": 0.0005977396752879741, "loss": 4224.0, "step": 4430 }, { "ce_loss_13": 2.747091996669769, "ce_loss_26": 2.1425569266080857, "ce_loss_39": 1.9297908574342728, "ce_loss_52": 1.4299270451068877, "ce_loss_7": 3.0992358028888702, "epoch": 0.444, "grad_norm": 15.44431585804275, "kl_loss_13": 2688.8, "kl_loss_26": 1428.0, "kl_loss_39": 986.6, "kl_loss_7": 3434.0, "learning_rate": 0.0005961831330156305, "loss": 4224.4, "step": 4440 }, { "ce_loss_13": 2.7738942086696623, "ce_loss_26": 2.1665944904088974, "ce_loss_39": 1.9503348082304002, "ce_loss_52": 1.4405199617147446, "ce_loss_7": 3.131233388185501, "epoch": 0.445, "grad_norm": 15.683988530213393, "kl_loss_13": 2701.2, "kl_loss_26": 1441.4, "kl_loss_39": 999.9, "kl_loss_7": 3451.6, "learning_rate": 0.0005946256221802051, "loss": 4233.55, "step": 4450 }, { "ce_loss_13": 2.6961427688598634, "ce_loss_26": 2.1098335653543474, "ce_loss_39": 1.8978259444236756, "ce_loss_52": 1.415482410788536, "ce_loss_7": 3.044248181581497, "epoch": 0.446, "grad_norm": 15.256632143150593, "kl_loss_13": 2612.8, "kl_loss_26": 1380.2, "kl_loss_39": 951.7, "kl_loss_7": 3350.8, "learning_rate": 0.0005930671584658151, "loss": 4214.65, "step": 4460 }, { "ce_loss_13": 2.732464927434921, "ce_loss_26": 2.141967472434044, "ce_loss_39": 1.929695299267769, "ce_loss_52": 1.4190144926309585, "ce_loss_7": 3.091973352432251, "epoch": 0.447, "grad_norm": 16.38296166656899, "kl_loss_13": 2676.8, "kl_loss_26": 1435.6, "kl_loss_39": 1000.5, "kl_loss_7": 3428.4, "learning_rate": 0.0005915077575661722, "loss": 4280.4, "step": 4470 }, { "ce_loss_13": 2.683997756242752, "ce_loss_26": 2.091261792182922, "ce_loss_39": 1.8827648997306823, "ce_loss_52": 1.3911016047000886, "ce_loss_7": 3.0366858661174776, "epoch": 0.448, "grad_norm": 15.184401397346244, "kl_loss_13": 2642.4, "kl_loss_26": 1399.3, "kl_loss_39": 968.5, "kl_loss_7": 3392.0, "learning_rate": 0.000589947435184427, "loss": 4194.3, "step": 4480 }, { "ce_loss_13": 2.7240218341350557, "ce_loss_26": 2.128333044052124, "ce_loss_39": 1.924179795384407, "ce_loss_52": 1.4498969972133637, "ce_loss_7": 3.0761671125888825, "epoch": 0.449, "grad_norm": 17.07642900561258, "kl_loss_13": 2604.8, "kl_loss_26": 1344.6, "kl_loss_39": 924.8, "kl_loss_7": 3354.0, "learning_rate": 0.0005883862070330078, "loss": 4206.7, "step": 4490 }, { "ce_loss_13": 2.7219722032547, "ce_loss_26": 2.1390156149864197, "ce_loss_39": 1.9286952793598175, "ce_loss_52": 1.4294085174798965, "ce_loss_7": 3.0803197801113127, "epoch": 0.45, "grad_norm": 15.274632326953679, "kl_loss_13": 2621.6, "kl_loss_26": 1403.4, "kl_loss_39": 973.4, "kl_loss_7": 3369.2, "learning_rate": 0.0005868240888334653, "loss": 4211.9, "step": 4500 }, { "ce_loss_13": 2.6810890555381777, "ce_loss_26": 2.105925416946411, "ce_loss_39": 1.9109346747398377, "ce_loss_52": 1.4293138086795807, "ce_loss_7": 3.033176803588867, "epoch": 0.451, "grad_norm": 17.03243058089965, "kl_loss_13": 2608.2, "kl_loss_26": 1375.3, "kl_loss_39": 955.3, "kl_loss_7": 3345.2, "learning_rate": 0.0005852610963163119, "loss": 4209.7, "step": 4510 }, { "ce_loss_13": 2.689431291818619, "ce_loss_26": 2.1146853864192963, "ce_loss_39": 1.9112016946077346, "ce_loss_52": 1.4318486779928208, "ce_loss_7": 3.0395568013191223, "epoch": 0.452, "grad_norm": 15.510330374157597, "kl_loss_13": 2583.2, "kl_loss_26": 1365.8, "kl_loss_39": 947.8, "kl_loss_7": 3313.2, "learning_rate": 0.0005836972452208654, "loss": 4185.25, "step": 4520 }, { "ce_loss_13": 2.7475471079349516, "ce_loss_26": 2.157953730225563, "ce_loss_39": 1.9457335144281387, "ce_loss_52": 1.4404390811920167, "ce_loss_7": 3.104820030927658, "epoch": 0.453, "grad_norm": 15.220972226004102, "kl_loss_13": 2688.4, "kl_loss_26": 1444.2, "kl_loss_39": 1006.8, "kl_loss_7": 3431.6, "learning_rate": 0.0005821325512950885, "loss": 4222.6, "step": 4530 }, { "ce_loss_13": 2.7701157510280607, "ce_loss_26": 2.1823565661907196, "ce_loss_39": 1.9763484060764314, "ce_loss_52": 1.4842363893985748, "ce_loss_7": 3.1242611587047575, "epoch": 0.454, "grad_norm": 16.181871779695452, "kl_loss_13": 2641.6, "kl_loss_26": 1398.0, "kl_loss_39": 968.2, "kl_loss_7": 3387.6, "learning_rate": 0.0005805670302954321, "loss": 4206.95, "step": 4540 }, { "ce_loss_13": 2.69073800444603, "ce_loss_26": 2.1018889248371124, "ce_loss_39": 1.8928476065397262, "ce_loss_52": 1.4156933531165123, "ce_loss_7": 3.051577550172806, "epoch": 0.455, "grad_norm": 15.802548274169151, "kl_loss_13": 2629.2, "kl_loss_26": 1374.2, "kl_loss_39": 934.4, "kl_loss_7": 3382.8, "learning_rate": 0.000579000697986675, "loss": 4173.65, "step": 4550 }, { "ce_loss_13": 2.734744447469711, "ce_loss_26": 2.139357805252075, "ce_loss_39": 1.9327150255441665, "ce_loss_52": 1.44863750487566, "ce_loss_7": 3.083251416683197, "epoch": 0.456, "grad_norm": 15.332335326805197, "kl_loss_13": 2646.0, "kl_loss_26": 1390.4, "kl_loss_39": 959.2, "kl_loss_7": 3391.2, "learning_rate": 0.0005774335701417662, "loss": 4177.45, "step": 4560 }, { "ce_loss_13": 2.696203714609146, "ce_loss_26": 2.102033945918083, "ce_loss_39": 1.8971556156873703, "ce_loss_52": 1.4197801396250724, "ce_loss_7": 3.047564595937729, "epoch": 0.457, "grad_norm": 16.076096882060348, "kl_loss_13": 2600.0, "kl_loss_26": 1362.8, "kl_loss_39": 939.0, "kl_loss_7": 3341.2, "learning_rate": 0.0005758656625416658, "loss": 4183.3, "step": 4570 }, { "ce_loss_13": 2.7472688376903536, "ce_loss_26": 2.1439451813697814, "ce_loss_39": 1.9378813654184341, "ce_loss_52": 1.4498099207878112, "ce_loss_7": 3.1056803286075594, "epoch": 0.458, "grad_norm": 15.434602661166036, "kl_loss_13": 2667.6, "kl_loss_26": 1394.6, "kl_loss_39": 964.2, "kl_loss_7": 3421.6, "learning_rate": 0.0005742969909751859, "loss": 4202.65, "step": 4580 }, { "ce_loss_13": 2.819844591617584, "ce_loss_26": 2.217494735121727, "ce_loss_39": 1.995268750190735, "ce_loss_52": 1.4828792631626129, "ce_loss_7": 3.183567076921463, "epoch": 0.459, "grad_norm": 15.16665840440692, "kl_loss_13": 2715.2, "kl_loss_26": 1447.8, "kl_loss_39": 1000.9, "kl_loss_7": 3469.2, "learning_rate": 0.0005727275712388318, "loss": 4159.15, "step": 4590 }, { "ce_loss_13": 2.757504242658615, "ce_loss_26": 2.1517707139253615, "ce_loss_39": 1.9407849818468095, "ce_loss_52": 1.4452633827924728, "ce_loss_7": 3.1194639682769774, "epoch": 0.46, "grad_norm": 16.189362396808324, "kl_loss_13": 2690.8, "kl_loss_26": 1421.0, "kl_loss_39": 976.6, "kl_loss_7": 3439.6, "learning_rate": 0.0005711574191366427, "loss": 4126.7, "step": 4600 }, { "ce_loss_13": 2.7185553312301636, "ce_loss_26": 2.140464088320732, "ce_loss_39": 1.9312822461128234, "ce_loss_52": 1.4499182224273681, "ce_loss_7": 3.074033808708191, "epoch": 0.461, "grad_norm": 15.796779340482095, "kl_loss_13": 2590.8, "kl_loss_26": 1373.5, "kl_loss_39": 946.6, "kl_loss_7": 3326.0, "learning_rate": 0.0005695865504800327, "loss": 4117.15, "step": 4610 }, { "ce_loss_13": 2.689697802066803, "ce_loss_26": 2.1229261219501496, "ce_loss_39": 1.9236579477787017, "ce_loss_52": 1.4474295616149901, "ce_loss_7": 3.0403923749923707, "epoch": 0.462, "grad_norm": 15.469933015809259, "kl_loss_13": 2559.6, "kl_loss_26": 1353.2, "kl_loss_39": 936.3, "kl_loss_7": 3287.2, "learning_rate": 0.0005680149810876322, "loss": 4141.45, "step": 4620 }, { "ce_loss_13": 2.709194713830948, "ce_loss_26": 2.1067664295434954, "ce_loss_39": 1.8838362753391267, "ce_loss_52": 1.4009573340415955, "ce_loss_7": 3.070843666791916, "epoch": 0.463, "grad_norm": 15.475096434174118, "kl_loss_13": 2674.8, "kl_loss_26": 1399.4, "kl_loss_39": 943.3, "kl_loss_7": 3422.8, "learning_rate": 0.0005664427267851271, "loss": 4160.8, "step": 4630 }, { "ce_loss_13": 2.7120527923107147, "ce_loss_26": 2.120649069547653, "ce_loss_39": 1.9163700252771378, "ce_loss_52": 1.4380100429058076, "ce_loss_7": 3.061056911945343, "epoch": 0.464, "grad_norm": 15.555501351653449, "kl_loss_13": 2608.0, "kl_loss_26": 1354.6, "kl_loss_39": 932.5, "kl_loss_7": 3340.4, "learning_rate": 0.0005648698034051009, "loss": 4170.2, "step": 4640 }, { "ce_loss_13": 2.7389404594898226, "ce_loss_26": 2.1454448729753492, "ce_loss_39": 1.9379454165697099, "ce_loss_52": 1.45494404733181, "ce_loss_7": 3.090787374973297, "epoch": 0.465, "grad_norm": 17.172011566290227, "kl_loss_13": 2617.0, "kl_loss_26": 1379.3, "kl_loss_39": 945.6, "kl_loss_7": 3354.0, "learning_rate": 0.0005632962267868747, "loss": 4137.2, "step": 4650 }, { "ce_loss_13": 2.618588683009148, "ce_loss_26": 2.0511282205581667, "ce_loss_39": 1.857603308558464, "ce_loss_52": 1.3924044981598853, "ce_loss_7": 2.9628873229026795, "epoch": 0.466, "grad_norm": 15.078077143393564, "kl_loss_13": 2528.4, "kl_loss_26": 1330.4, "kl_loss_39": 924.6, "kl_loss_7": 3258.4, "learning_rate": 0.0005617220127763474, "loss": 4108.7, "step": 4660 }, { "ce_loss_13": 2.706065672636032, "ce_loss_26": 2.128128296136856, "ce_loss_39": 1.921997308731079, "ce_loss_52": 1.4441686987876892, "ce_loss_7": 3.059876149892807, "epoch": 0.467, "grad_norm": 16.456195061500843, "kl_loss_13": 2576.0, "kl_loss_26": 1356.8, "kl_loss_39": 938.3, "kl_loss_7": 3319.6, "learning_rate": 0.0005601471772258368, "loss": 4092.5, "step": 4670 }, { "ce_loss_13": 2.691108763217926, "ce_loss_26": 2.1213403046131134, "ce_loss_39": 1.9077781707048416, "ce_loss_52": 1.4377064436674118, "ce_loss_7": 3.0406983733177184, "epoch": 0.468, "grad_norm": 15.284887521485958, "kl_loss_13": 2584.8, "kl_loss_26": 1368.0, "kl_loss_39": 933.0, "kl_loss_7": 3316.4, "learning_rate": 0.0005585717359939192, "loss": 4090.9, "step": 4680 }, { "ce_loss_13": 2.715133213996887, "ce_loss_26": 2.1481328904628754, "ce_loss_39": 1.938890340924263, "ce_loss_52": 1.456964261829853, "ce_loss_7": 3.063201904296875, "epoch": 0.469, "grad_norm": 14.95117298176539, "kl_loss_13": 2573.6, "kl_loss_26": 1372.5, "kl_loss_39": 948.0, "kl_loss_7": 3294.4, "learning_rate": 0.0005569957049452703, "loss": 4067.75, "step": 4690 }, { "ce_loss_13": 2.7355633437633515, "ce_loss_26": 2.130109578371048, "ce_loss_39": 1.9153006434440614, "ce_loss_52": 1.4125859558582305, "ce_loss_7": 3.0963422894477843, "epoch": 0.47, "grad_norm": 15.35914349074289, "kl_loss_13": 2726.0, "kl_loss_26": 1448.2, "kl_loss_39": 1002.1, "kl_loss_7": 3478.0, "learning_rate": 0.0005554190999505056, "loss": 4157.55, "step": 4700 }, { "ce_loss_13": 2.6879218101501463, "ce_loss_26": 2.098815104365349, "ce_loss_39": 1.8952988266944886, "ce_loss_52": 1.4328953355550766, "ce_loss_7": 3.040910530090332, "epoch": 0.471, "grad_norm": 15.95882741217506, "kl_loss_13": 2568.0, "kl_loss_26": 1323.6, "kl_loss_39": 901.3, "kl_loss_7": 3319.6, "learning_rate": 0.0005538419368860196, "loss": 4062.85, "step": 4710 }, { "ce_loss_13": 2.6794900715351107, "ce_loss_26": 2.0905598402023315, "ce_loss_39": 1.8867737114429475, "ce_loss_52": 1.4186928808689117, "ce_loss_7": 3.0324371635913847, "epoch": 0.472, "grad_norm": 15.674607259246525, "kl_loss_13": 2569.4, "kl_loss_26": 1340.6, "kl_loss_39": 914.1, "kl_loss_7": 3303.6, "learning_rate": 0.0005522642316338268, "loss": 4084.1, "step": 4720 }, { "ce_loss_13": 2.7094511866569517, "ce_loss_26": 2.1240269035100936, "ce_loss_39": 1.9192228257656097, "ce_loss_52": 1.4589810997247696, "ce_loss_7": 3.0527816653251647, "epoch": 0.473, "grad_norm": 15.84409146313606, "kl_loss_13": 2555.6, "kl_loss_26": 1330.2, "kl_loss_39": 901.4, "kl_loss_7": 3270.0, "learning_rate": 0.0005506860000814017, "loss": 4024.65, "step": 4730 }, { "ce_loss_13": 2.686307519674301, "ce_loss_26": 2.1099446028470994, "ce_loss_39": 1.9098408967256546, "ce_loss_52": 1.4574729681015015, "ce_loss_7": 3.03258957862854, "epoch": 0.474, "grad_norm": 16.290287645699628, "kl_loss_13": 2538.4, "kl_loss_26": 1308.0, "kl_loss_39": 889.7, "kl_loss_7": 3278.4, "learning_rate": 0.0005491072581215186, "loss": 4058.25, "step": 4740 }, { "ce_loss_13": 2.685838830471039, "ce_loss_26": 2.097555673122406, "ce_loss_39": 1.8953343421220779, "ce_loss_52": 1.4276258319616317, "ce_loss_7": 3.042680394649506, "epoch": 0.475, "grad_norm": 15.834434583556519, "kl_loss_13": 2590.0, "kl_loss_26": 1354.4, "kl_loss_39": 931.0, "kl_loss_7": 3334.4, "learning_rate": 0.0005475280216520913, "loss": 4057.7, "step": 4750 }, { "ce_loss_13": 2.645804923772812, "ce_loss_26": 2.073691374063492, "ce_loss_39": 1.872296717762947, "ce_loss_52": 1.4167594254016875, "ce_loss_7": 2.992863970994949, "epoch": 0.476, "grad_norm": 15.93243827944326, "kl_loss_13": 2532.4, "kl_loss_26": 1326.8, "kl_loss_39": 900.3, "kl_loss_7": 3268.8, "learning_rate": 0.0005459483065760138, "loss": 4104.4, "step": 4760 }, { "ce_loss_13": 2.7042051672935488, "ce_loss_26": 2.1117112547159196, "ce_loss_39": 1.9040750682353973, "ce_loss_52": 1.4314228266477584, "ce_loss_7": 3.0566958367824553, "epoch": 0.477, "grad_norm": 15.353551244219883, "kl_loss_13": 2608.4, "kl_loss_26": 1361.8, "kl_loss_39": 937.4, "kl_loss_7": 3352.4, "learning_rate": 0.0005443681288009991, "loss": 4078.7, "step": 4770 }, { "ce_loss_13": 2.6856160342693327, "ce_loss_26": 2.0829177469015123, "ce_loss_39": 1.8763625353574753, "ce_loss_52": 1.4030324995517731, "ce_loss_7": 3.0448363423347473, "epoch": 0.478, "grad_norm": 16.14315974332923, "kl_loss_13": 2646.4, "kl_loss_26": 1377.9, "kl_loss_39": 941.4, "kl_loss_7": 3398.4, "learning_rate": 0.0005427875042394199, "loss": 4031.6, "step": 4780 }, { "ce_loss_13": 2.6821600914001467, "ce_loss_26": 2.1105621844530105, "ce_loss_39": 1.9080501794815063, "ce_loss_52": 1.4579481482505798, "ce_loss_7": 3.0244390606880187, "epoch": 0.479, "grad_norm": 16.41928903258253, "kl_loss_13": 2508.6, "kl_loss_26": 1304.7, "kl_loss_39": 890.8, "kl_loss_7": 3233.6, "learning_rate": 0.0005412064488081482, "loss": 4041.85, "step": 4790 }, { "ce_loss_13": 2.644778722524643, "ce_loss_26": 2.0624290674924852, "ce_loss_39": 1.8686836928129196, "ce_loss_52": 1.4204004764556886, "ce_loss_7": 2.987608629465103, "epoch": 0.48, "grad_norm": 15.406655412683705, "kl_loss_13": 2521.2, "kl_loss_26": 1293.1, "kl_loss_39": 883.5, "kl_loss_7": 3258.4, "learning_rate": 0.0005396249784283942, "loss": 4018.65, "step": 4800 }, { "ce_loss_13": 2.6721664726734162, "ce_loss_26": 2.0903854191303255, "ce_loss_39": 1.8859550595283507, "ce_loss_52": 1.4339767321944237, "ce_loss_7": 3.022903573513031, "epoch": 0.481, "grad_norm": 15.533152676819311, "kl_loss_13": 2549.2, "kl_loss_26": 1311.5, "kl_loss_39": 887.7, "kl_loss_7": 3276.8, "learning_rate": 0.0005380431090255476, "loss": 4094.5, "step": 4810 }, { "ce_loss_13": 2.7146060168743134, "ce_loss_26": 2.138300836086273, "ce_loss_39": 1.9245162457227707, "ce_loss_52": 1.4378316938877105, "ce_loss_7": 3.0600010454654694, "epoch": 0.482, "grad_norm": 15.889585422309347, "kl_loss_13": 2616.4, "kl_loss_26": 1396.0, "kl_loss_39": 954.3, "kl_loss_7": 3337.2, "learning_rate": 0.0005364608565290155, "loss": 4019.85, "step": 4820 }, { "ce_loss_13": 2.7267942845821382, "ce_loss_26": 2.1315987795591353, "ce_loss_39": 1.92288878262043, "ce_loss_52": 1.4603519141674042, "ce_loss_7": 3.089752674102783, "epoch": 0.483, "grad_norm": 14.942603293515566, "kl_loss_13": 2598.4, "kl_loss_26": 1350.0, "kl_loss_39": 913.7, "kl_loss_7": 3351.2, "learning_rate": 0.0005348782368720626, "loss": 4054.35, "step": 4830 }, { "ce_loss_13": 2.702422133088112, "ce_loss_26": 2.1349568367004395, "ce_loss_39": 1.923803049325943, "ce_loss_52": 1.4461625874042512, "ce_loss_7": 3.0548571348190308, "epoch": 0.484, "grad_norm": 14.924067089524062, "kl_loss_13": 2579.8, "kl_loss_26": 1357.8, "kl_loss_39": 926.9, "kl_loss_7": 3324.0, "learning_rate": 0.000533295265991652, "loss": 4024.65, "step": 4840 }, { "ce_loss_13": 2.6280339270830155, "ce_loss_26": 2.0501014798879624, "ce_loss_39": 1.8476706713438034, "ce_loss_52": 1.3977838337421418, "ce_loss_7": 2.9806483924388885, "epoch": 0.485, "grad_norm": 16.07575861381767, "kl_loss_13": 2515.0, "kl_loss_26": 1293.9, "kl_loss_39": 881.9, "kl_loss_7": 3258.8, "learning_rate": 0.0005317119598282822, "loss": 4003.05, "step": 4850 }, { "ce_loss_13": 2.7017304062843324, "ce_loss_26": 2.123658448457718, "ce_loss_39": 1.9226309835910798, "ce_loss_52": 1.4685954213142396, "ce_loss_7": 3.052913784980774, "epoch": 0.486, "grad_norm": 14.261884924612158, "kl_loss_13": 2552.0, "kl_loss_26": 1330.6, "kl_loss_39": 901.6, "kl_loss_7": 3291.2, "learning_rate": 0.0005301283343258293, "loss": 4032.7, "step": 4860 }, { "ce_loss_13": 2.663911575078964, "ce_loss_26": 2.0811035096645356, "ce_loss_39": 1.8746151685714723, "ce_loss_52": 1.4212765499949456, "ce_loss_7": 3.0172334790229796, "epoch": 0.487, "grad_norm": 15.979242608660392, "kl_loss_13": 2526.0, "kl_loss_26": 1301.0, "kl_loss_39": 883.9, "kl_loss_7": 3276.4, "learning_rate": 0.000528544405431384, "loss": 4020.25, "step": 4870 }, { "ce_loss_13": 2.660174161195755, "ce_loss_26": 2.0942499101161958, "ce_loss_39": 1.8920150458812715, "ce_loss_52": 1.4465530335903167, "ce_loss_7": 3.0074438989162444, "epoch": 0.488, "grad_norm": 14.78308909898239, "kl_loss_13": 2500.8, "kl_loss_26": 1294.5, "kl_loss_39": 873.6, "kl_loss_7": 3228.0, "learning_rate": 0.000526960189095093, "loss": 4016.5, "step": 4880 }, { "ce_loss_13": 2.635312020778656, "ce_loss_26": 2.0795453995466233, "ce_loss_39": 1.8807054102420806, "ce_loss_52": 1.4317258328199387, "ce_loss_7": 2.9746360957622526, "epoch": 0.489, "grad_norm": 15.033016045239172, "kl_loss_13": 2473.6, "kl_loss_26": 1289.8, "kl_loss_39": 880.7, "kl_loss_7": 3185.2, "learning_rate": 0.0005253757012699972, "loss": 3996.8, "step": 4890 }, { "ce_loss_13": 2.68422954082489, "ce_loss_26": 2.1027563750743865, "ce_loss_39": 1.8974193513393403, "ce_loss_52": 1.4426774829626083, "ce_loss_7": 3.037678909301758, "epoch": 0.49, "grad_norm": 15.631967370179172, "kl_loss_13": 2553.2, "kl_loss_26": 1323.6, "kl_loss_39": 894.5, "kl_loss_7": 3295.2, "learning_rate": 0.0005237909579118712, "loss": 3967.65, "step": 4900 }, { "ce_loss_13": 2.680465018749237, "ce_loss_26": 2.1046013057231905, "ce_loss_39": 1.9009331673383714, "ce_loss_52": 1.44480240046978, "ce_loss_7": 3.036197912693024, "epoch": 0.491, "grad_norm": 15.759372326211485, "kl_loss_13": 2524.0, "kl_loss_26": 1301.7, "kl_loss_39": 885.8, "kl_loss_7": 3274.8, "learning_rate": 0.0005222059749790631, "loss": 3979.5, "step": 4910 }, { "ce_loss_13": 2.6961126804351805, "ce_loss_26": 2.1182867020368574, "ce_loss_39": 1.9074458956718445, "ce_loss_52": 1.456271693110466, "ce_loss_7": 3.044221115112305, "epoch": 0.492, "grad_norm": 16.34015666907617, "kl_loss_13": 2536.4, "kl_loss_26": 1322.4, "kl_loss_39": 892.3, "kl_loss_7": 3262.8, "learning_rate": 0.0005206207684323337, "loss": 3964.95, "step": 4920 }, { "ce_loss_13": 2.6250401854515077, "ce_loss_26": 2.041203039884567, "ce_loss_39": 1.8369982630014419, "ce_loss_52": 1.4043618232011794, "ce_loss_7": 2.976498603820801, "epoch": 0.493, "grad_norm": 15.643386607908784, "kl_loss_13": 2504.8, "kl_loss_26": 1279.6, "kl_loss_39": 857.1, "kl_loss_7": 3243.6, "learning_rate": 0.000519035354234695, "loss": 3956.4, "step": 4930 }, { "ce_loss_13": 2.731994906067848, "ce_loss_26": 2.151404523849487, "ce_loss_39": 1.9346221089363098, "ce_loss_52": 1.4679641619324684, "ce_loss_7": 3.0894507080316544, "epoch": 0.494, "grad_norm": 15.972342532535162, "kl_loss_13": 2584.2, "kl_loss_26": 1354.3, "kl_loss_39": 912.1, "kl_loss_7": 3338.8, "learning_rate": 0.0005174497483512506, "loss": 3986.95, "step": 4940 }, { "ce_loss_13": 2.692367374897003, "ce_loss_26": 2.122538897395134, "ce_loss_39": 1.9124073147773744, "ce_loss_52": 1.452483794093132, "ce_loss_7": 3.042392200231552, "epoch": 0.495, "grad_norm": 15.983001798116502, "kl_loss_13": 2533.4, "kl_loss_26": 1327.0, "kl_loss_39": 895.7, "kl_loss_7": 3252.0, "learning_rate": 0.0005158639667490339, "loss": 3967.55, "step": 4950 }, { "ce_loss_13": 2.6044699877500532, "ce_loss_26": 2.0326590865850447, "ce_loss_39": 1.8286783695220947, "ce_loss_52": 1.3870023548603059, "ce_loss_7": 2.947771966457367, "epoch": 0.496, "grad_norm": 15.42571652932802, "kl_loss_13": 2500.2, "kl_loss_26": 1296.2, "kl_loss_39": 875.7, "kl_loss_7": 3230.8, "learning_rate": 0.0005142780253968481, "loss": 3955.8, "step": 4960 }, { "ce_loss_13": 2.642567253112793, "ce_loss_26": 2.0776680946350097, "ce_loss_39": 1.875117465853691, "ce_loss_52": 1.4392555862665177, "ce_loss_7": 2.991294425725937, "epoch": 0.497, "grad_norm": 15.451081778073426, "kl_loss_13": 2501.6, "kl_loss_26": 1291.4, "kl_loss_39": 867.8, "kl_loss_7": 3244.8, "learning_rate": 0.0005126919402651053, "loss": 3945.75, "step": 4970 }, { "ce_loss_13": 2.6330737471580505, "ce_loss_26": 2.0630074977874755, "ce_loss_39": 1.856469190120697, "ce_loss_52": 1.4164980471134185, "ce_loss_7": 2.9769130408763886, "epoch": 0.498, "grad_norm": 15.34176013733142, "kl_loss_13": 2503.2, "kl_loss_26": 1294.2, "kl_loss_39": 874.8, "kl_loss_7": 3228.4, "learning_rate": 0.0005111057273256647, "loss": 3917.2, "step": 4980 }, { "ce_loss_13": 2.6576701521873476, "ce_loss_26": 2.089341068267822, "ce_loss_39": 1.8844720661640166, "ce_loss_52": 1.4477695405483246, "ce_loss_7": 3.0070637345314024, "epoch": 0.499, "grad_norm": 15.00660182184455, "kl_loss_13": 2473.8, "kl_loss_26": 1283.9, "kl_loss_39": 853.9, "kl_loss_7": 3207.6, "learning_rate": 0.0005095194025516733, "loss": 3923.7, "step": 4990 }, { "ce_loss_13": 2.697542816400528, "ce_loss_26": 2.125366801023483, "ce_loss_39": 1.9231987714767456, "ce_loss_52": 1.4726029485464096, "ce_loss_7": 3.0440734326839447, "epoch": 0.5, "grad_norm": 15.160859466153447, "kl_loss_13": 2511.6, "kl_loss_26": 1303.5, "kl_loss_39": 880.0, "kl_loss_7": 3249.2, "learning_rate": 0.000507932981917404, "loss": 3938.75, "step": 5000 }, { "ce_loss_13": 2.554905018210411, "ce_loss_26": 1.9959456473588943, "ce_loss_39": 1.8001023352146148, "ce_loss_52": 1.370650653541088, "ce_loss_7": 2.8987653195858, "epoch": 0.501, "grad_norm": 17.14356490933402, "kl_loss_13": 2439.4, "kl_loss_26": 1259.5, "kl_loss_39": 849.2, "kl_loss_7": 3158.0, "learning_rate": 0.0005063464813980949, "loss": 3915.65, "step": 5010 }, { "ce_loss_13": 2.5977672755718233, "ce_loss_26": 2.0211563646793365, "ce_loss_39": 1.8262152045965194, "ce_loss_52": 1.3998217657208443, "ce_loss_7": 2.950869935750961, "epoch": 0.502, "grad_norm": 14.97441402973133, "kl_loss_13": 2466.8, "kl_loss_26": 1257.6, "kl_loss_39": 848.9, "kl_loss_7": 3202.0, "learning_rate": 0.0005047599169697884, "loss": 3931.7, "step": 5020 }, { "ce_loss_13": 2.6368628799915315, "ce_loss_26": 2.0560896009206773, "ce_loss_39": 1.8569422334432601, "ce_loss_52": 1.429998092353344, "ce_loss_7": 2.981409990787506, "epoch": 0.503, "grad_norm": 15.444575545537123, "kl_loss_13": 2486.4, "kl_loss_26": 1264.6, "kl_loss_39": 845.9, "kl_loss_7": 3216.4, "learning_rate": 0.000503173304609171, "loss": 3936.6, "step": 5030 }, { "ce_loss_13": 2.694787061214447, "ce_loss_26": 2.10608988404274, "ce_loss_39": 1.8957834452390672, "ce_loss_52": 1.4505297511816024, "ce_loss_7": 3.0503607213497164, "epoch": 0.504, "grad_norm": 15.80764020207202, "kl_loss_13": 2558.4, "kl_loss_26": 1318.1, "kl_loss_39": 884.7, "kl_loss_7": 3306.4, "learning_rate": 0.0005015866602934111, "loss": 3939.75, "step": 5040 }, { "ce_loss_13": 2.621226805448532, "ce_loss_26": 2.0584420263767242, "ce_loss_39": 1.861902078986168, "ce_loss_52": 1.4422300636768342, "ce_loss_7": 2.951041603088379, "epoch": 0.505, "grad_norm": 14.881966197673897, "kl_loss_13": 2426.4, "kl_loss_26": 1238.3, "kl_loss_39": 828.1, "kl_loss_7": 3127.6, "learning_rate": 0.0005, "loss": 3934.2, "step": 5050 }, { "ce_loss_13": 2.660506749153137, "ce_loss_26": 2.098774325847626, "ce_loss_39": 1.886313620209694, "ce_loss_52": 1.4474970057606698, "ce_loss_7": 3.011888575553894, "epoch": 0.506, "grad_norm": 14.941506448796416, "kl_loss_13": 2498.2, "kl_loss_26": 1299.9, "kl_loss_39": 867.9, "kl_loss_7": 3221.2, "learning_rate": 0.0004984133397065889, "loss": 3903.9, "step": 5060 }, { "ce_loss_13": 2.626861798763275, "ce_loss_26": 2.0604827493429183, "ce_loss_39": 1.8599964112043381, "ce_loss_52": 1.4386487394571303, "ce_loss_7": 2.975832390785217, "epoch": 0.507, "grad_norm": 15.44952616226393, "kl_loss_13": 2452.8, "kl_loss_26": 1250.8, "kl_loss_39": 833.3, "kl_loss_7": 3188.4, "learning_rate": 0.0004968266953908291, "loss": 3885.3, "step": 5070 }, { "ce_loss_13": 2.562318778038025, "ce_loss_26": 1.9999902278184891, "ce_loss_39": 1.8035208880901337, "ce_loss_52": 1.3915461212396623, "ce_loss_7": 2.904841202497482, "epoch": 0.508, "grad_norm": 14.83407474509182, "kl_loss_13": 2407.2, "kl_loss_26": 1226.2, "kl_loss_39": 815.3, "kl_loss_7": 3134.0, "learning_rate": 0.0004952400830302117, "loss": 3881.25, "step": 5080 }, { "ce_loss_13": 2.5728674054145815, "ce_loss_26": 2.013489532470703, "ce_loss_39": 1.8151986598968506, "ce_loss_52": 1.3948280960321426, "ce_loss_7": 2.9155173718929293, "epoch": 0.509, "grad_norm": 14.64837783343134, "kl_loss_13": 2420.0, "kl_loss_26": 1236.4, "kl_loss_39": 829.6, "kl_loss_7": 3145.6, "learning_rate": 0.0004936535186019053, "loss": 3875.9, "step": 5090 }, { "ce_loss_13": 2.657623714208603, "ce_loss_26": 2.0687634259462357, "ce_loss_39": 1.8608120799064636, "ce_loss_52": 1.418680590391159, "ce_loss_7": 3.0090177237987517, "epoch": 0.51, "grad_norm": 15.50653814601008, "kl_loss_13": 2529.6, "kl_loss_26": 1302.4, "kl_loss_39": 874.3, "kl_loss_7": 3273.2, "learning_rate": 0.000492067018082596, "loss": 3924.45, "step": 5100 }, { "ce_loss_13": 2.651608294248581, "ce_loss_26": 2.0792359739542006, "ce_loss_39": 1.8647643625736237, "ce_loss_52": 1.4323984265327454, "ce_loss_7": 3.0009018778800964, "epoch": 0.511, "grad_norm": 14.358667358457254, "kl_loss_13": 2496.8, "kl_loss_26": 1292.0, "kl_loss_39": 854.6, "kl_loss_7": 3224.0, "learning_rate": 0.0004904805974483267, "loss": 3865.45, "step": 5110 }, { "ce_loss_13": 2.641629362106323, "ce_loss_26": 2.072116160392761, "ce_loss_39": 1.8716523438692092, "ce_loss_52": 1.4481347769498825, "ce_loss_7": 2.989072245359421, "epoch": 0.512, "grad_norm": 15.321352631891271, "kl_loss_13": 2423.6, "kl_loss_26": 1237.5, "kl_loss_39": 826.9, "kl_loss_7": 3150.8, "learning_rate": 0.0004888942726743353, "loss": 3863.15, "step": 5120 }, { "ce_loss_13": 2.6105270087718964, "ce_loss_26": 2.041230320930481, "ce_loss_39": 1.8405257403850555, "ce_loss_52": 1.4164400100708008, "ce_loss_7": 2.9590699791908266, "epoch": 0.513, "grad_norm": 15.778047819453661, "kl_loss_13": 2444.0, "kl_loss_26": 1244.1, "kl_loss_39": 835.9, "kl_loss_7": 3179.2, "learning_rate": 0.0004873080597348947, "loss": 3860.6, "step": 5130 }, { "ce_loss_13": 2.7020871877670287, "ce_loss_26": 2.1217857897281647, "ce_loss_39": 1.910724088549614, "ce_loss_52": 1.4596379309892655, "ce_loss_7": 3.0534912407398225, "epoch": 0.514, "grad_norm": 15.798738758695562, "kl_loss_13": 2550.4, "kl_loss_26": 1321.6, "kl_loss_39": 886.4, "kl_loss_7": 3290.0, "learning_rate": 0.0004857219746031519, "loss": 3877.4, "step": 5140 }, { "ce_loss_13": 2.629297113418579, "ce_loss_26": 2.065913289785385, "ce_loss_39": 1.8652951270341873, "ce_loss_52": 1.4406857430934905, "ce_loss_7": 2.975607615709305, "epoch": 0.515, "grad_norm": 15.843426047808201, "kl_loss_13": 2425.2, "kl_loss_26": 1235.3, "kl_loss_39": 819.8, "kl_loss_7": 3155.2, "learning_rate": 0.0004841360332509663, "loss": 3881.85, "step": 5150 }, { "ce_loss_13": 2.6566128492355348, "ce_loss_26": 2.0851503133773805, "ce_loss_39": 1.879001685976982, "ce_loss_52": 1.4529996007680892, "ce_loss_7": 3.001719295978546, "epoch": 0.516, "grad_norm": 15.230952623778938, "kl_loss_13": 2453.2, "kl_loss_26": 1247.2, "kl_loss_39": 825.4, "kl_loss_7": 3185.6, "learning_rate": 0.0004825502516487497, "loss": 3877.3, "step": 5160 }, { "ce_loss_13": 2.6446830928325653, "ce_loss_26": 2.087994411587715, "ce_loss_39": 1.8902768969535828, "ce_loss_52": 1.4724875479936599, "ce_loss_7": 2.994678741693497, "epoch": 0.517, "grad_norm": 16.39635866458352, "kl_loss_13": 2415.8, "kl_loss_26": 1232.4, "kl_loss_39": 823.7, "kl_loss_7": 3135.2, "learning_rate": 0.00048096464576530507, "loss": 3828.0, "step": 5170 }, { "ce_loss_13": 2.5944429993629456, "ce_loss_26": 2.036814641952515, "ce_loss_39": 1.841314834356308, "ce_loss_52": 1.4209656611084938, "ce_loss_7": 2.9290528297424316, "epoch": 0.518, "grad_norm": 14.479752914773284, "kl_loss_13": 2424.2, "kl_loss_26": 1242.2, "kl_loss_39": 829.4, "kl_loss_7": 3150.4, "learning_rate": 0.00047937923156766646, "loss": 3845.8, "step": 5180 }, { "ce_loss_13": 2.660686820745468, "ce_loss_26": 2.094280996918678, "ce_loss_39": 1.8889294683933258, "ce_loss_52": 1.4619006276130677, "ce_loss_7": 3.008899760246277, "epoch": 0.519, "grad_norm": 16.01644389337825, "kl_loss_13": 2475.6, "kl_loss_26": 1273.8, "kl_loss_39": 850.7, "kl_loss_7": 3204.4, "learning_rate": 0.00047779402502093696, "loss": 3846.45, "step": 5190 }, { "ce_loss_13": 2.5972321003675463, "ce_loss_26": 2.0274816364049912, "ce_loss_39": 1.8247474491596223, "ce_loss_52": 1.4065502554178237, "ce_loss_7": 2.9485019743442535, "epoch": 0.52, "grad_norm": 14.98957972258473, "kl_loss_13": 2450.8, "kl_loss_26": 1255.3, "kl_loss_39": 832.3, "kl_loss_7": 3178.4, "learning_rate": 0.0004762090420881289, "loss": 3895.0, "step": 5200 }, { "ce_loss_13": 2.665889722108841, "ce_loss_26": 2.110061451792717, "ce_loss_39": 1.9084287703037262, "ce_loss_52": 1.4813150823116303, "ce_loss_7": 3.0068661749362944, "epoch": 0.521, "grad_norm": 15.21430509367723, "kl_loss_13": 2448.0, "kl_loss_26": 1268.2, "kl_loss_39": 849.3, "kl_loss_7": 3164.4, "learning_rate": 0.00047462429873000296, "loss": 3816.95, "step": 5210 }, { "ce_loss_13": 2.655850923061371, "ce_loss_26": 2.085476315021515, "ce_loss_39": 1.874689555168152, "ce_loss_52": 1.446556892991066, "ce_loss_7": 3.0054982542991637, "epoch": 0.522, "grad_norm": 16.427127136749267, "kl_loss_13": 2452.4, "kl_loss_26": 1264.0, "kl_loss_39": 836.0, "kl_loss_7": 3188.0, "learning_rate": 0.0004730398109049071, "loss": 3838.95, "step": 5220 }, { "ce_loss_13": 2.677028661966324, "ce_loss_26": 2.119970577955246, "ce_loss_39": 1.92548668384552, "ce_loss_52": 1.4896603375673294, "ce_loss_7": 3.008550250530243, "epoch": 0.523, "grad_norm": 16.0659576233147, "kl_loss_13": 2444.4, "kl_loss_26": 1258.1, "kl_loss_39": 844.1, "kl_loss_7": 3161.6, "learning_rate": 0.000471455594568616, "loss": 3864.55, "step": 5230 }, { "ce_loss_13": 2.6650006234645844, "ce_loss_26": 2.085580566525459, "ce_loss_39": 1.884287601709366, "ce_loss_52": 1.453607079386711, "ce_loss_7": 3.0236206233501433, "epoch": 0.524, "grad_norm": 14.653166262246215, "kl_loss_13": 2489.6, "kl_loss_26": 1272.8, "kl_loss_39": 845.7, "kl_loss_7": 3232.4, "learning_rate": 0.00046987166567417086, "loss": 3878.7, "step": 5240 }, { "ce_loss_13": 2.584680277109146, "ce_loss_26": 2.0209302097558974, "ce_loss_39": 1.8149988621473312, "ce_loss_52": 1.390892931818962, "ce_loss_7": 2.9321685075759887, "epoch": 0.525, "grad_norm": 15.168241253700586, "kl_loss_13": 2428.4, "kl_loss_26": 1252.2, "kl_loss_39": 829.9, "kl_loss_7": 3154.8, "learning_rate": 0.00046828804017171776, "loss": 3851.45, "step": 5250 }, { "ce_loss_13": 2.6199812322854994, "ce_loss_26": 2.072703015804291, "ce_loss_39": 1.8745949417352676, "ce_loss_52": 1.4573716089129447, "ce_loss_7": 2.9569752633571627, "epoch": 0.526, "grad_norm": 15.019656924741254, "kl_loss_13": 2412.4, "kl_loss_26": 1238.7, "kl_loss_39": 825.7, "kl_loss_7": 3130.0, "learning_rate": 0.00046670473400834805, "loss": 3822.4, "step": 5260 }, { "ce_loss_13": 2.614406701922417, "ce_loss_26": 2.038132056593895, "ce_loss_39": 1.835758489370346, "ce_loss_52": 1.4097227707505227, "ce_loss_7": 2.953414297103882, "epoch": 0.527, "grad_norm": 15.237091081862786, "kl_loss_13": 2460.0, "kl_loss_26": 1259.0, "kl_loss_39": 841.0, "kl_loss_7": 3178.4, "learning_rate": 0.00046512176312793734, "loss": 3820.5, "step": 5270 }, { "ce_loss_13": 2.5675832986831666, "ce_loss_26": 2.0170306622982026, "ce_loss_39": 1.8148203998804093, "ce_loss_52": 1.4015038818120957, "ce_loss_7": 2.9166265070438384, "epoch": 0.528, "grad_norm": 15.622184104337453, "kl_loss_13": 2384.4, "kl_loss_26": 1219.7, "kl_loss_39": 809.7, "kl_loss_7": 3111.2, "learning_rate": 0.00046353914347098467, "loss": 3805.5, "step": 5280 }, { "ce_loss_13": 2.600316107273102, "ce_loss_26": 2.038938209414482, "ce_loss_39": 1.8384071439504623, "ce_loss_52": 1.4146052584052087, "ce_loss_7": 2.9424943923950195, "epoch": 0.529, "grad_norm": 16.18662441031771, "kl_loss_13": 2437.8, "kl_loss_26": 1252.1, "kl_loss_39": 829.2, "kl_loss_7": 3150.0, "learning_rate": 0.0004619568909744524, "loss": 3784.9, "step": 5290 }, { "ce_loss_13": 2.6181087523698805, "ce_loss_26": 2.0588752537965775, "ce_loss_39": 1.85763358771801, "ce_loss_52": 1.4420970767736434, "ce_loss_7": 2.962410408258438, "epoch": 0.53, "grad_norm": 15.067130011925716, "kl_loss_13": 2406.6, "kl_loss_26": 1231.8, "kl_loss_39": 819.2, "kl_loss_7": 3132.4, "learning_rate": 0.00046037502157160573, "loss": 3815.9, "step": 5300 }, { "ce_loss_13": 2.5704480171203614, "ce_loss_26": 2.007723420858383, "ce_loss_39": 1.8157259494066238, "ce_loss_52": 1.4100188240408897, "ce_loss_7": 2.9128104388713836, "epoch": 0.531, "grad_norm": 14.940801357367253, "kl_loss_13": 2375.2, "kl_loss_26": 1192.5, "kl_loss_39": 790.2, "kl_loss_7": 3097.2, "learning_rate": 0.00045879355119185207, "loss": 3774.75, "step": 5310 }, { "ce_loss_13": 2.6117980778217316, "ce_loss_26": 2.049151787161827, "ce_loss_39": 1.8556257247924806, "ce_loss_52": 1.4484239488840103, "ce_loss_7": 2.9571595907211305, "epoch": 0.532, "grad_norm": 15.084958990702525, "kl_loss_13": 2412.8, "kl_loss_26": 1214.0, "kl_loss_39": 801.8, "kl_loss_7": 3144.4, "learning_rate": 0.0004572124957605803, "loss": 3796.3, "step": 5320 }, { "ce_loss_13": 2.6047778964042663, "ce_loss_26": 2.0361655563116074, "ce_loss_39": 1.829242792725563, "ce_loss_52": 1.4240625560283662, "ce_loss_7": 2.9437944889068604, "epoch": 0.533, "grad_norm": 14.875930516711946, "kl_loss_13": 2409.0, "kl_loss_26": 1224.1, "kl_loss_39": 807.1, "kl_loss_7": 3128.8, "learning_rate": 0.00045563187119900103, "loss": 3772.25, "step": 5330 }, { "ce_loss_13": 2.5524469763040543, "ce_loss_26": 1.9957795530557632, "ce_loss_39": 1.7972583919763565, "ce_loss_52": 1.3942344322800637, "ce_loss_7": 2.899323457479477, "epoch": 0.534, "grad_norm": 14.888961320169138, "kl_loss_13": 2374.0, "kl_loss_26": 1190.2, "kl_loss_39": 785.5, "kl_loss_7": 3107.6, "learning_rate": 0.00045405169342398633, "loss": 3809.2, "step": 5340 }, { "ce_loss_13": 2.6405540108680725, "ce_loss_26": 2.080012783408165, "ce_loss_39": 1.876559317111969, "ce_loss_52": 1.462306337058544, "ce_loss_7": 2.981167531013489, "epoch": 0.535, "grad_norm": 14.563184685424444, "kl_loss_13": 2406.4, "kl_loss_26": 1229.1, "kl_loss_39": 816.6, "kl_loss_7": 3130.4, "learning_rate": 0.0004524719783479088, "loss": 3797.75, "step": 5350 }, { "ce_loss_13": 2.645623618364334, "ce_loss_26": 2.0881788969039916, "ce_loss_39": 1.888492900133133, "ce_loss_52": 1.4696861803531647, "ce_loss_7": 2.994585871696472, "epoch": 0.536, "grad_norm": 15.457746807868878, "kl_loss_13": 2427.2, "kl_loss_26": 1243.7, "kl_loss_39": 825.7, "kl_loss_7": 3155.2, "learning_rate": 0.00045089274187848144, "loss": 3837.25, "step": 5360 }, { "ce_loss_13": 2.607480788230896, "ce_loss_26": 2.0540437757968903, "ce_loss_39": 1.854740971326828, "ce_loss_52": 1.4391999766230583, "ce_loss_7": 2.955876570940018, "epoch": 0.537, "grad_norm": 15.654126474702101, "kl_loss_13": 2406.0, "kl_loss_26": 1230.1, "kl_loss_39": 814.4, "kl_loss_7": 3128.8, "learning_rate": 0.00044931399991859835, "loss": 3791.2, "step": 5370 }, { "ce_loss_13": 2.5806866496801377, "ce_loss_26": 2.0246922999620436, "ce_loss_39": 1.8295573323965073, "ce_loss_52": 1.433498626947403, "ce_loss_7": 2.932585430145264, "epoch": 0.538, "grad_norm": 15.050745413645357, "kl_loss_13": 2377.8, "kl_loss_26": 1193.3, "kl_loss_39": 790.4, "kl_loss_7": 3116.4, "learning_rate": 0.00044773576836617336, "loss": 3771.2, "step": 5380 }, { "ce_loss_13": 2.5772230982780457, "ce_loss_26": 2.0075444668531417, "ce_loss_39": 1.8076122283935547, "ce_loss_52": 1.3957727670669555, "ce_loss_7": 2.9269763708114622, "epoch": 0.539, "grad_norm": 14.739351688834923, "kl_loss_13": 2418.2, "kl_loss_26": 1217.6, "kl_loss_39": 806.9, "kl_loss_7": 3154.0, "learning_rate": 0.00044615806311398056, "loss": 3764.85, "step": 5390 }, { "ce_loss_13": 2.6164508730173113, "ce_loss_26": 2.0489900171756745, "ce_loss_39": 1.8477211087942123, "ce_loss_52": 1.4378462180495262, "ce_loss_7": 2.9610529631376266, "epoch": 0.54, "grad_norm": 15.257899541784896, "kl_loss_13": 2422.8, "kl_loss_26": 1222.4, "kl_loss_39": 806.9, "kl_loss_7": 3149.2, "learning_rate": 0.00044458090004949454, "loss": 3789.8, "step": 5400 }, { "ce_loss_13": 2.639938807487488, "ce_loss_26": 2.078041157126427, "ce_loss_39": 1.8733548551797867, "ce_loss_52": 1.4663413792848587, "ce_loss_7": 2.978548914194107, "epoch": 0.541, "grad_norm": 15.643709958040844, "kl_loss_13": 2399.4, "kl_loss_26": 1217.8, "kl_loss_39": 804.5, "kl_loss_7": 3111.2, "learning_rate": 0.0004430042950547297, "loss": 3775.4, "step": 5410 }, { "ce_loss_13": 2.6770710349082947, "ce_loss_26": 2.110589724779129, "ce_loss_39": 1.9134115755558014, "ce_loss_52": 1.5031546354293823, "ce_loss_7": 3.0216497242450715, "epoch": 0.542, "grad_norm": 14.919746196312753, "kl_loss_13": 2411.6, "kl_loss_26": 1216.6, "kl_loss_39": 809.1, "kl_loss_7": 3147.6, "learning_rate": 0.0004414282640060809, "loss": 3768.95, "step": 5420 }, { "ce_loss_13": 2.6296884536743166, "ce_loss_26": 2.0768262147903442, "ce_loss_39": 1.8766031116247177, "ce_loss_52": 1.4701504305005073, "ce_loss_7": 2.9714462876319887, "epoch": 0.543, "grad_norm": 14.109291634946612, "kl_loss_13": 2361.6, "kl_loss_26": 1207.6, "kl_loss_39": 794.5, "kl_loss_7": 3073.6, "learning_rate": 0.0004398528227741633, "loss": 3734.5, "step": 5430 }, { "ce_loss_13": 2.5891071379184725, "ce_loss_26": 2.024920642375946, "ce_loss_39": 1.8224879026412963, "ce_loss_52": 1.4225929498672485, "ce_loss_7": 2.939086824655533, "epoch": 0.544, "grad_norm": 15.182283952769808, "kl_loss_13": 2384.4, "kl_loss_26": 1201.0, "kl_loss_39": 785.9, "kl_loss_7": 3118.0, "learning_rate": 0.00043827798722365264, "loss": 3724.5, "step": 5440 }, { "ce_loss_13": 2.521664083003998, "ce_loss_26": 1.9635723412036896, "ce_loss_39": 1.776253479719162, "ce_loss_52": 1.3777382284402848, "ce_loss_7": 2.8684565305709837, "epoch": 0.545, "grad_norm": 14.678349850224247, "kl_loss_13": 2357.4, "kl_loss_26": 1171.5, "kl_loss_39": 776.8, "kl_loss_7": 3081.2, "learning_rate": 0.00043670377321312535, "loss": 3748.6, "step": 5450 }, { "ce_loss_13": 2.6282208263874054, "ce_loss_26": 2.063512918353081, "ce_loss_39": 1.8621816724538802, "ce_loss_52": 1.4465220913290977, "ce_loss_7": 2.9661788761615755, "epoch": 0.546, "grad_norm": 14.95327030090302, "kl_loss_13": 2410.4, "kl_loss_26": 1226.2, "kl_loss_39": 814.8, "kl_loss_7": 3130.0, "learning_rate": 0.0004351301965948991, "loss": 3750.5, "step": 5460 }, { "ce_loss_13": 2.6448668360710146, "ce_loss_26": 2.092715525627136, "ce_loss_39": 1.8841570675373078, "ce_loss_52": 1.465877577662468, "ce_loss_7": 2.983266705274582, "epoch": 0.547, "grad_norm": 14.464098395004918, "kl_loss_13": 2427.2, "kl_loss_26": 1254.5, "kl_loss_39": 827.1, "kl_loss_7": 3138.8, "learning_rate": 0.000433557273214873, "loss": 3760.7, "step": 5470 }, { "ce_loss_13": 2.5597914129495623, "ce_loss_26": 2.0002847105264663, "ce_loss_39": 1.8126664906740189, "ce_loss_52": 1.4252239495515824, "ce_loss_7": 2.89780033826828, "epoch": 0.548, "grad_norm": 14.511201984799017, "kl_loss_13": 2336.4, "kl_loss_26": 1154.4, "kl_loss_39": 760.7, "kl_loss_7": 3046.4, "learning_rate": 0.000431985018912368, "loss": 3744.2, "step": 5480 }, { "ce_loss_13": 2.556107610464096, "ce_loss_26": 2.0089694380760195, "ce_loss_39": 1.8094982028007507, "ce_loss_52": 1.4210678458213806, "ce_loss_7": 2.8992061018943787, "epoch": 0.549, "grad_norm": 14.88322732328834, "kl_loss_13": 2332.4, "kl_loss_26": 1182.7, "kl_loss_39": 771.7, "kl_loss_7": 3056.4, "learning_rate": 0.0004304134495199674, "loss": 3725.95, "step": 5490 }, { "ce_loss_13": 2.5557271778583526, "ce_loss_26": 2.015131750702858, "ce_loss_39": 1.8132081747055053, "ce_loss_52": 1.4253456503152848, "ce_loss_7": 2.8894282221794128, "epoch": 0.55, "grad_norm": 14.509766958879414, "kl_loss_13": 2342.4, "kl_loss_26": 1192.8, "kl_loss_39": 776.15, "kl_loss_7": 3041.2, "learning_rate": 0.0004288425808633575, "loss": 3690.5, "step": 5500 }, { "ce_loss_13": 2.6498306572437285, "ce_loss_26": 2.0864265114068985, "ce_loss_39": 1.884456393122673, "ce_loss_52": 1.4724615901708602, "ce_loss_7": 2.994233113527298, "epoch": 0.551, "grad_norm": 15.223257504626806, "kl_loss_13": 2419.8, "kl_loss_26": 1227.5, "kl_loss_39": 805.6, "kl_loss_7": 3137.2, "learning_rate": 0.0004272724287611684, "loss": 3719.95, "step": 5510 }, { "ce_loss_13": 2.6063999772071837, "ce_loss_26": 2.0453016996383666, "ce_loss_39": 1.8452126443386079, "ce_loss_52": 1.4395337477326393, "ce_loss_7": 2.9420916736125946, "epoch": 0.552, "grad_norm": 14.707511636553823, "kl_loss_13": 2389.2, "kl_loss_26": 1204.1, "kl_loss_39": 791.2, "kl_loss_7": 3102.4, "learning_rate": 0.00042570300902481425, "loss": 3704.35, "step": 5520 }, { "ce_loss_13": 2.563195550441742, "ce_loss_26": 1.9973567068576812, "ce_loss_39": 1.7899492472410201, "ce_loss_52": 1.3841874808073045, "ce_loss_7": 2.9130555033683776, "epoch": 0.553, "grad_norm": 14.788339735855649, "kl_loss_13": 2417.4, "kl_loss_26": 1232.3, "kl_loss_39": 809.2, "kl_loss_7": 3150.4, "learning_rate": 0.00042413433745833423, "loss": 3716.65, "step": 5530 }, { "ce_loss_13": 2.5711400628089907, "ce_loss_26": 2.0185573011636735, "ce_loss_39": 1.8240427374839783, "ce_loss_52": 1.4254193544387816, "ce_loss_7": 2.9173736214637755, "epoch": 0.554, "grad_norm": 14.905176252765152, "kl_loss_13": 2345.2, "kl_loss_26": 1182.6, "kl_loss_39": 781.0, "kl_loss_7": 3069.2, "learning_rate": 0.0004225664298582339, "loss": 3692.3, "step": 5540 }, { "ce_loss_13": 2.5974901139736177, "ce_loss_26": 2.046999195218086, "ce_loss_39": 1.8451066941022873, "ce_loss_52": 1.436999562382698, "ce_loss_7": 2.9430564284324645, "epoch": 0.555, "grad_norm": 16.142335203190324, "kl_loss_13": 2402.8, "kl_loss_26": 1219.2, "kl_loss_39": 797.9, "kl_loss_7": 3125.6, "learning_rate": 0.000420999302013325, "loss": 3720.0, "step": 5550 }, { "ce_loss_13": 2.5756935298442842, "ce_loss_26": 2.00499467253685, "ce_loss_39": 1.8017524302005767, "ce_loss_52": 1.4018217638134955, "ce_loss_7": 2.9280818104743958, "epoch": 0.556, "grad_norm": 15.838882130746192, "kl_loss_13": 2404.2, "kl_loss_26": 1195.7, "kl_loss_39": 787.5, "kl_loss_7": 3148.8, "learning_rate": 0.000419432969704568, "loss": 3744.35, "step": 5560 }, { "ce_loss_13": 2.6485643923282622, "ce_loss_26": 2.0993672519922257, "ce_loss_39": 1.8959094911813736, "ce_loss_52": 1.4813728883862496, "ce_loss_7": 2.994894337654114, "epoch": 0.557, "grad_norm": 14.30299821768554, "kl_loss_13": 2404.0, "kl_loss_26": 1234.2, "kl_loss_39": 810.35, "kl_loss_7": 3130.8, "learning_rate": 0.00041786744870491154, "loss": 3698.4, "step": 5570 }, { "ce_loss_13": 2.6594822227954866, "ce_loss_26": 2.0962711691856386, "ce_loss_39": 1.8888987362384797, "ce_loss_52": 1.477835801243782, "ce_loss_7": 3.003460741043091, "epoch": 0.558, "grad_norm": 14.927142462067188, "kl_loss_13": 2421.4, "kl_loss_26": 1232.3, "kl_loss_39": 804.5, "kl_loss_7": 3142.8, "learning_rate": 0.0004163027547791347, "loss": 3696.4, "step": 5580 }, { "ce_loss_13": 2.6033630073070526, "ce_loss_26": 2.052795875072479, "ce_loss_39": 1.8524764776229858, "ce_loss_52": 1.4589103490114212, "ce_loss_7": 2.9362357556819916, "epoch": 0.559, "grad_norm": 15.035071624412899, "kl_loss_13": 2332.2, "kl_loss_26": 1173.9, "kl_loss_39": 769.3, "kl_loss_7": 3041.6, "learning_rate": 0.0004147389036836881, "loss": 3676.85, "step": 5590 }, { "ce_loss_13": 2.5536737203598023, "ce_loss_26": 2.0111388891935347, "ce_loss_39": 1.8144014358520508, "ce_loss_52": 1.4328835308551788, "ce_loss_7": 2.88922523856163, "epoch": 0.56, "grad_norm": 14.534788987143044, "kl_loss_13": 2316.6, "kl_loss_26": 1164.9, "kl_loss_39": 758.2, "kl_loss_7": 3024.4, "learning_rate": 0.00041317591116653486, "loss": 3694.55, "step": 5600 }, { "ce_loss_13": 2.606983852386475, "ce_loss_26": 2.032891970872879, "ce_loss_39": 1.8340051174163818, "ce_loss_52": 1.4310238301753997, "ce_loss_7": 2.9573661506175997, "epoch": 0.561, "grad_norm": 16.216124841765133, "kl_loss_13": 2427.2, "kl_loss_26": 1205.0, "kl_loss_39": 794.3, "kl_loss_7": 3156.0, "learning_rate": 0.0004116137929669921, "loss": 3679.35, "step": 5610 }, { "ce_loss_13": 2.5370231360197066, "ce_loss_26": 1.9963056713342666, "ce_loss_39": 1.803014099597931, "ce_loss_52": 1.4229481190443038, "ce_loss_7": 2.8732429146766663, "epoch": 0.562, "grad_norm": 15.330886130898222, "kl_loss_13": 2277.4, "kl_loss_26": 1124.3, "kl_loss_39": 729.3, "kl_loss_7": 2988.8, "learning_rate": 0.00041005256481557305, "loss": 3673.5, "step": 5620 }, { "ce_loss_13": 2.6017677545547486, "ce_loss_26": 2.0595314621925356, "ce_loss_39": 1.8631951808929443, "ce_loss_52": 1.4627325683832169, "ce_loss_7": 2.9314393043518066, "epoch": 0.563, "grad_norm": 14.465862339922571, "kl_loss_13": 2332.2, "kl_loss_26": 1190.7, "kl_loss_39": 781.3, "kl_loss_7": 3032.0, "learning_rate": 0.00040849224243382767, "loss": 3672.9, "step": 5630 }, { "ce_loss_13": 2.5594456523656843, "ce_loss_26": 2.00023832321167, "ce_loss_39": 1.8007198423147202, "ce_loss_52": 1.4131150737404823, "ce_loss_7": 2.904243141412735, "epoch": 0.564, "grad_norm": 15.017201171045896, "kl_loss_13": 2360.2, "kl_loss_26": 1180.4, "kl_loss_39": 768.1, "kl_loss_7": 3075.6, "learning_rate": 0.000406932841534185, "loss": 3693.75, "step": 5640 }, { "ce_loss_13": 2.595109748840332, "ce_loss_26": 2.045265626907349, "ce_loss_39": 1.85684574842453, "ce_loss_52": 1.4735014230012893, "ce_loss_7": 2.943417179584503, "epoch": 0.565, "grad_norm": 15.364099861784982, "kl_loss_13": 2322.0, "kl_loss_26": 1155.7, "kl_loss_39": 755.4, "kl_loss_7": 3048.4, "learning_rate": 0.0004053743778197951, "loss": 3668.9, "step": 5650 }, { "ce_loss_13": 2.582511156797409, "ce_loss_26": 2.0224455118179323, "ce_loss_39": 1.8295785069465638, "ce_loss_52": 1.4373956888914108, "ce_loss_7": 2.9102243304252626, "epoch": 0.566, "grad_norm": 14.693957764502153, "kl_loss_13": 2342.4, "kl_loss_26": 1176.8, "kl_loss_39": 776.5, "kl_loss_7": 3046.4, "learning_rate": 0.0004038168669843697, "loss": 3650.65, "step": 5660 }, { "ce_loss_13": 2.603584831953049, "ce_loss_26": 2.04022336602211, "ce_loss_39": 1.8417048037052155, "ce_loss_52": 1.447445745766163, "ce_loss_7": 2.9460166096687317, "epoch": 0.567, "grad_norm": 15.203721551974317, "kl_loss_13": 2379.4, "kl_loss_26": 1187.3, "kl_loss_39": 777.4, "kl_loss_7": 3104.8, "learning_rate": 0.000402260324712026, "loss": 3688.75, "step": 5670 }, { "ce_loss_13": 2.526816266775131, "ce_loss_26": 1.9893273174762727, "ce_loss_39": 1.793796670436859, "ce_loss_52": 1.4289553046226502, "ce_loss_7": 2.8634801030159, "epoch": 0.568, "grad_norm": 14.842310660649245, "kl_loss_13": 2254.2, "kl_loss_26": 1116.4, "kl_loss_39": 714.95, "kl_loss_7": 2954.8, "learning_rate": 0.00040070476667712743, "loss": 3637.75, "step": 5680 }, { "ce_loss_13": 2.615302687883377, "ce_loss_26": 2.059708908200264, "ce_loss_39": 1.8555728137493133, "ce_loss_52": 1.4557225406169891, "ce_loss_7": 2.9577668845653533, "epoch": 0.569, "grad_norm": 14.742701130409761, "kl_loss_13": 2387.6, "kl_loss_26": 1214.9, "kl_loss_39": 792.5, "kl_loss_7": 3105.2, "learning_rate": 0.0003991502085441259, "loss": 3676.05, "step": 5690 }, { "ce_loss_13": 2.5645705699920653, "ce_loss_26": 2.007223817706108, "ce_loss_39": 1.8173159271478654, "ce_loss_52": 1.4376816004514694, "ce_loss_7": 2.895137590169907, "epoch": 0.57, "grad_norm": 15.460594692772787, "kl_loss_13": 2314.8, "kl_loss_26": 1152.0, "kl_loss_39": 753.2, "kl_loss_7": 3024.8, "learning_rate": 0.0003975966659674047, "loss": 3621.95, "step": 5700 }, { "ce_loss_13": 2.559953585267067, "ce_loss_26": 2.032286322116852, "ce_loss_39": 1.845168125629425, "ce_loss_52": 1.461988940834999, "ce_loss_7": 2.9015457332134247, "epoch": 0.571, "grad_norm": 15.171579029800053, "kl_loss_13": 2278.8, "kl_loss_26": 1159.2, "kl_loss_39": 758.6, "kl_loss_7": 2982.0, "learning_rate": 0.0003960441545911204, "loss": 3675.95, "step": 5710 }, { "ce_loss_13": 2.6008632302284242, "ce_loss_26": 2.043924775719643, "ce_loss_39": 1.8485127180814742, "ce_loss_52": 1.4697089165449142, "ce_loss_7": 2.9355547785758973, "epoch": 0.572, "grad_norm": 14.834558653485171, "kl_loss_13": 2319.4, "kl_loss_26": 1152.8, "kl_loss_39": 750.0, "kl_loss_7": 3027.6, "learning_rate": 0.0003944926900490452, "loss": 3638.65, "step": 5720 }, { "ce_loss_13": 2.532833296060562, "ce_loss_26": 1.9711334377527236, "ce_loss_39": 1.77955681681633, "ce_loss_52": 1.4007374957203864, "ce_loss_7": 2.8781362950801848, "epoch": 0.573, "grad_norm": 16.10932164493431, "kl_loss_13": 2337.8, "kl_loss_26": 1151.9, "kl_loss_39": 757.3, "kl_loss_7": 3064.8, "learning_rate": 0.0003929422879644099, "loss": 3650.2, "step": 5730 }, { "ce_loss_13": 2.5908755481243135, "ce_loss_26": 2.0414842426776887, "ce_loss_39": 1.8504431873559952, "ce_loss_52": 1.4606543123722076, "ce_loss_7": 2.926515054702759, "epoch": 0.574, "grad_norm": 14.72871950232802, "kl_loss_13": 2333.4, "kl_loss_26": 1164.1, "kl_loss_39": 763.5, "kl_loss_7": 3044.4, "learning_rate": 0.0003913929639497462, "loss": 3615.45, "step": 5740 }, { "ce_loss_13": 2.591219651699066, "ce_loss_26": 2.044692638516426, "ce_loss_39": 1.8468156188726426, "ce_loss_52": 1.452781331539154, "ce_loss_7": 2.932692265510559, "epoch": 0.575, "grad_norm": 14.536189899304834, "kl_loss_13": 2345.4, "kl_loss_26": 1189.2, "kl_loss_39": 775.0, "kl_loss_7": 3054.8, "learning_rate": 0.00038984473360672965, "loss": 3631.3, "step": 5750 }, { "ce_loss_13": 2.555169379711151, "ce_loss_26": 2.0133784860372543, "ce_loss_39": 1.8234185576438904, "ce_loss_52": 1.4436144948005676, "ce_loss_7": 2.896002060174942, "epoch": 0.576, "grad_norm": 15.34263044515968, "kl_loss_13": 2285.0, "kl_loss_26": 1136.7, "kl_loss_39": 742.35, "kl_loss_7": 2993.2, "learning_rate": 0.0003882976125260229, "loss": 3658.4, "step": 5760 }, { "ce_loss_13": 2.502971774339676, "ce_loss_26": 1.9493688374757767, "ce_loss_39": 1.7570990473031998, "ce_loss_52": 1.3885455280542374, "ce_loss_7": 2.8447738111019136, "epoch": 0.577, "grad_norm": 14.660193254554198, "kl_loss_13": 2295.0, "kl_loss_26": 1140.5, "kl_loss_39": 734.0, "kl_loss_7": 3013.6, "learning_rate": 0.00038675161628711776, "loss": 3632.8, "step": 5770 }, { "ce_loss_13": 2.5624574303627012, "ce_loss_26": 2.0051666617393495, "ce_loss_39": 1.8087779253721237, "ce_loss_52": 1.41810100376606, "ce_loss_7": 2.895737165212631, "epoch": 0.578, "grad_norm": 14.482088330386649, "kl_loss_13": 2343.4, "kl_loss_26": 1176.1, "kl_loss_39": 766.9, "kl_loss_7": 3047.6, "learning_rate": 0.0003852067604581794, "loss": 3602.85, "step": 5780 }, { "ce_loss_13": 2.5270320236682893, "ce_loss_26": 1.9888224333524704, "ce_loss_39": 1.803659090399742, "ce_loss_52": 1.428774857521057, "ce_loss_7": 2.8705179512500765, "epoch": 0.579, "grad_norm": 14.979884111866252, "kl_loss_13": 2273.2, "kl_loss_26": 1136.8, "kl_loss_39": 739.9, "kl_loss_7": 2982.8, "learning_rate": 0.0003836630605958888, "loss": 3603.35, "step": 5790 }, { "ce_loss_13": 2.5794149696826936, "ce_loss_26": 2.0252732813358305, "ce_loss_39": 1.8308797210454941, "ce_loss_52": 1.4384155124425888, "ce_loss_7": 2.9223524034023285, "epoch": 0.58, "grad_norm": 14.8686548037009, "kl_loss_13": 2322.0, "kl_loss_26": 1158.7, "kl_loss_39": 758.3, "kl_loss_7": 3044.0, "learning_rate": 0.0003821205322452863, "loss": 3636.15, "step": 5800 }, { "ce_loss_13": 2.6002571165561674, "ce_loss_26": 2.0497290968894957, "ce_loss_39": 1.8579988300800323, "ce_loss_52": 1.4732319116592407, "ce_loss_7": 2.9367463052272798, "epoch": 0.581, "grad_norm": 15.52768956484863, "kl_loss_13": 2303.2, "kl_loss_26": 1143.4, "kl_loss_39": 743.0, "kl_loss_7": 3018.4, "learning_rate": 0.0003805791909396155, "loss": 3651.1, "step": 5810 }, { "ce_loss_13": 2.5309071093797684, "ce_loss_26": 1.9803194522857666, "ce_loss_39": 1.7861102789640426, "ce_loss_52": 1.410745631158352, "ce_loss_7": 2.8677128195762633, "epoch": 0.582, "grad_norm": 14.66111038706468, "kl_loss_13": 2302.0, "kl_loss_26": 1148.3, "kl_loss_39": 748.9, "kl_loss_7": 3008.0, "learning_rate": 0.0003790390522001662, "loss": 3564.65, "step": 5820 }, { "ce_loss_13": 2.5017096638679504, "ce_loss_26": 1.94625324010849, "ce_loss_39": 1.749154046177864, "ce_loss_52": 1.3770527362823486, "ce_loss_7": 2.8349112212657928, "epoch": 0.583, "grad_norm": 14.629798672555188, "kl_loss_13": 2294.6, "kl_loss_26": 1141.8, "kl_loss_39": 731.9, "kl_loss_7": 3003.2, "learning_rate": 0.0003775001315361183, "loss": 3613.35, "step": 5830 }, { "ce_loss_13": 2.5965979039669036, "ce_loss_26": 2.0513822197914124, "ce_loss_39": 1.8568279683589934, "ce_loss_52": 1.486306893825531, "ce_loss_7": 2.932318705320358, "epoch": 0.584, "grad_norm": 15.483089802029363, "kl_loss_13": 2285.6, "kl_loss_26": 1133.6, "kl_loss_39": 729.5, "kl_loss_7": 2989.2, "learning_rate": 0.0003759624444443858, "loss": 3579.55, "step": 5840 }, { "ce_loss_13": 2.584410917758942, "ce_loss_26": 2.022917777299881, "ce_loss_39": 1.8257231026887895, "ce_loss_52": 1.437566375732422, "ce_loss_7": 2.9283429443836213, "epoch": 0.585, "grad_norm": 15.042900713620126, "kl_loss_13": 2353.8, "kl_loss_26": 1173.8, "kl_loss_39": 763.0, "kl_loss_7": 3075.6, "learning_rate": 0.00037442600640946044, "loss": 3619.6, "step": 5850 }, { "ce_loss_13": 2.5062096178531648, "ce_loss_26": 1.9529441505670548, "ce_loss_39": 1.756454050540924, "ce_loss_52": 1.3877734661102294, "ce_loss_7": 2.852471035718918, "epoch": 0.586, "grad_norm": 15.637281763013858, "kl_loss_13": 2307.4, "kl_loss_26": 1145.7, "kl_loss_39": 736.7, "kl_loss_7": 3028.0, "learning_rate": 0.00037289083290325663, "loss": 3605.0, "step": 5860 }, { "ce_loss_13": 2.550817745923996, "ce_loss_26": 2.0112617135047914, "ce_loss_39": 1.8207211345434189, "ce_loss_52": 1.4542410910129546, "ce_loss_7": 2.8887066781520843, "epoch": 0.587, "grad_norm": 14.48910345968502, "kl_loss_13": 2241.0, "kl_loss_26": 1095.1, "kl_loss_39": 709.9, "kl_loss_7": 2950.4, "learning_rate": 0.0003713569393849543, "loss": 3628.55, "step": 5870 }, { "ce_loss_13": 2.5198557287454606, "ce_loss_26": 1.9734882295131684, "ce_loss_39": 1.7831378549337387, "ce_loss_52": 1.4195168539881706, "ce_loss_7": 2.8633585631847382, "epoch": 0.588, "grad_norm": 14.606482629979089, "kl_loss_13": 2270.0, "kl_loss_26": 1121.9, "kl_loss_39": 724.8, "kl_loss_7": 2990.8, "learning_rate": 0.00036982434130084397, "loss": 3605.15, "step": 5880 }, { "ce_loss_13": 2.5002260982990263, "ce_loss_26": 1.9510222643613815, "ce_loss_39": 1.760866141319275, "ce_loss_52": 1.3993624940514564, "ce_loss_7": 2.8445383846759795, "epoch": 0.589, "grad_norm": 15.157934768830916, "kl_loss_13": 2246.4, "kl_loss_26": 1094.8, "kl_loss_39": 699.8, "kl_loss_7": 2967.6, "learning_rate": 0.00036829305408417166, "loss": 3580.45, "step": 5890 }, { "ce_loss_13": 2.4792177438735963, "ce_loss_26": 1.9329254776239395, "ce_loss_39": 1.7395812034606934, "ce_loss_52": 1.3731168687343598, "ce_loss_7": 2.8229923218488695, "epoch": 0.59, "grad_norm": 14.656512940113272, "kl_loss_13": 2271.0, "kl_loss_26": 1112.9, "kl_loss_39": 720.45, "kl_loss_7": 2995.2, "learning_rate": 0.0003667630931549826, "loss": 3601.65, "step": 5900 }, { "ce_loss_13": 2.6107949793338774, "ce_loss_26": 2.0506039649248122, "ce_loss_39": 1.8547100484371186, "ce_loss_52": 1.4670722007751464, "ce_loss_7": 2.947498029470444, "epoch": 0.591, "grad_norm": 15.535800254061792, "kl_loss_13": 2341.4, "kl_loss_26": 1168.3, "kl_loss_39": 753.2, "kl_loss_7": 3065.2, "learning_rate": 0.00036523447391996613, "loss": 3580.55, "step": 5910 }, { "ce_loss_13": 2.5217359244823454, "ce_loss_26": 1.988610166311264, "ce_loss_39": 1.8043963432312011, "ce_loss_52": 1.4320271372795106, "ce_loss_7": 2.8523899018764496, "epoch": 0.592, "grad_norm": 15.144566757103092, "kl_loss_13": 2242.6, "kl_loss_26": 1108.7, "kl_loss_39": 722.4, "kl_loss_7": 2935.6, "learning_rate": 0.00036370721177230114, "loss": 3609.65, "step": 5920 }, { "ce_loss_13": 2.5657771229743958, "ce_loss_26": 2.024441570043564, "ce_loss_39": 1.8324565082788467, "ce_loss_52": 1.4527549773454667, "ce_loss_7": 2.9029002487659454, "epoch": 0.593, "grad_norm": 14.305280391890491, "kl_loss_13": 2257.4, "kl_loss_26": 1128.5, "kl_loss_39": 727.25, "kl_loss_7": 2971.6, "learning_rate": 0.00036218132209150044, "loss": 3561.25, "step": 5930 }, { "ce_loss_13": 2.5320515751838686, "ce_loss_26": 1.993978601694107, "ce_loss_39": 1.7951736986637115, "ce_loss_52": 1.4332455009222032, "ce_loss_7": 2.8734976410865785, "epoch": 0.594, "grad_norm": 14.731628028248535, "kl_loss_13": 2276.0, "kl_loss_26": 1134.7, "kl_loss_39": 725.65, "kl_loss_7": 2984.8, "learning_rate": 0.0003606568202432562, "loss": 3568.15, "step": 5940 }, { "ce_loss_13": 2.467684972286224, "ce_loss_26": 1.9258863091468812, "ce_loss_39": 1.7374547556042672, "ce_loss_52": 1.3822436913847924, "ce_loss_7": 2.8027748644351957, "epoch": 0.595, "grad_norm": 13.952374257617315, "kl_loss_13": 2221.6, "kl_loss_26": 1099.3, "kl_loss_39": 702.85, "kl_loss_7": 2916.8, "learning_rate": 0.0003591337215792851, "loss": 3573.15, "step": 5950 }, { "ce_loss_13": 2.5612458407878878, "ce_loss_26": 2.005467265844345, "ce_loss_39": 1.8046582967042923, "ce_loss_52": 1.4227981299161911, "ce_loss_7": 2.9120794773101806, "epoch": 0.596, "grad_norm": 14.684621517642583, "kl_loss_13": 2333.2, "kl_loss_26": 1167.6, "kl_loss_39": 756.5, "kl_loss_7": 3067.6, "learning_rate": 0.00035761204143717383, "loss": 3598.3, "step": 5960 }, { "ce_loss_13": 2.539260357618332, "ce_loss_26": 1.997820395231247, "ce_loss_39": 1.7995415717363357, "ce_loss_52": 1.4228885769844055, "ce_loss_7": 2.8742454588413238, "epoch": 0.597, "grad_norm": 14.834521968922566, "kl_loss_13": 2285.6, "kl_loss_26": 1143.0, "kl_loss_39": 733.0, "kl_loss_7": 2992.4, "learning_rate": 0.0003560917951402245, "loss": 3549.75, "step": 5970 }, { "ce_loss_13": 2.514337483048439, "ce_loss_26": 1.9719054281711579, "ce_loss_39": 1.7803026676177978, "ce_loss_52": 1.4242254197597504, "ce_loss_7": 2.8526304841041563, "epoch": 0.598, "grad_norm": 15.184376306356116, "kl_loss_13": 2262.4, "kl_loss_26": 1100.6, "kl_loss_39": 700.55, "kl_loss_7": 2977.6, "learning_rate": 0.00035457299799730046, "loss": 3595.65, "step": 5980 }, { "ce_loss_13": 2.5518115133047106, "ce_loss_26": 2.011714455485344, "ce_loss_39": 1.8179199546575546, "ce_loss_52": 1.4457479059696197, "ce_loss_7": 2.903665816783905, "epoch": 0.599, "grad_norm": 17.70688578121031, "kl_loss_13": 2293.4, "kl_loss_26": 1146.1, "kl_loss_39": 736.9, "kl_loss_7": 3022.8, "learning_rate": 0.0003530556653026721, "loss": 3553.45, "step": 5990 }, { "ce_loss_13": 2.5623465538024903, "ce_loss_26": 2.0235190600156785, "ce_loss_39": 1.8284282714128495, "ce_loss_52": 1.4427544534206391, "ce_loss_7": 2.8994126319885254, "epoch": 0.6, "grad_norm": 14.620466600171055, "kl_loss_13": 2310.4, "kl_loss_26": 1152.8, "kl_loss_39": 754.8, "kl_loss_7": 3022.0, "learning_rate": 0.00035153981233586274, "loss": 3592.9, "step": 6000 }, { "ce_loss_13": 2.592492914199829, "ce_loss_26": 2.0241310060024262, "ce_loss_39": 1.8282486945390701, "ce_loss_52": 1.4501359939575196, "ce_loss_7": 2.934645599126816, "epoch": 0.601, "grad_norm": 15.179282109125554, "kl_loss_13": 2355.8, "kl_loss_26": 1159.4, "kl_loss_39": 745.9, "kl_loss_7": 3082.4, "learning_rate": 0.00035002545436149473, "loss": 3551.8, "step": 6010 }, { "ce_loss_13": 2.4968257695436478, "ce_loss_26": 1.950178360939026, "ce_loss_39": 1.7565111339092254, "ce_loss_52": 1.3940304026007653, "ce_loss_7": 2.8332657337188722, "epoch": 0.602, "grad_norm": 15.101566569350041, "kl_loss_13": 2271.8, "kl_loss_26": 1114.8, "kl_loss_39": 717.3, "kl_loss_7": 2986.8, "learning_rate": 0.0003485126066291364, "loss": 3553.3, "step": 6020 }, { "ce_loss_13": 2.5295323967933654, "ce_loss_26": 1.9963963776826859, "ce_loss_39": 1.8044916093349457, "ce_loss_52": 1.446289749443531, "ce_loss_7": 2.8674661338329317, "epoch": 0.603, "grad_norm": 14.29880611806114, "kl_loss_13": 2227.0, "kl_loss_26": 1097.1, "kl_loss_39": 702.25, "kl_loss_7": 2938.0, "learning_rate": 0.0003470012843731476, "loss": 3534.85, "step": 6030 }, { "ce_loss_13": 2.498071011900902, "ce_loss_26": 1.9493587136268615, "ce_loss_39": 1.76174655854702, "ce_loss_52": 1.407905325293541, "ce_loss_7": 2.847205549478531, "epoch": 0.604, "grad_norm": 14.483859928846364, "kl_loss_13": 2244.8, "kl_loss_26": 1096.9, "kl_loss_39": 701.8, "kl_loss_7": 2975.2, "learning_rate": 0.00034549150281252633, "loss": 3514.35, "step": 6040 }, { "ce_loss_13": 2.4966412246227265, "ce_loss_26": 1.9553426146507262, "ce_loss_39": 1.7613533914089203, "ce_loss_52": 1.4072722673416138, "ce_loss_7": 2.8383829057216645, "epoch": 0.605, "grad_norm": 14.78851376475336, "kl_loss_13": 2235.6, "kl_loss_26": 1090.3, "kl_loss_39": 695.0, "kl_loss_7": 2954.0, "learning_rate": 0.0003439832771507565, "loss": 3563.65, "step": 6050 }, { "ce_loss_13": 2.502397668361664, "ce_loss_26": 1.9571218103170396, "ce_loss_39": 1.7665416598320007, "ce_loss_52": 1.412816160917282, "ce_loss_7": 2.8454441905021666, "epoch": 0.606, "grad_norm": 15.4871318885597, "kl_loss_13": 2242.0, "kl_loss_26": 1096.6, "kl_loss_39": 696.5, "kl_loss_7": 2962.4, "learning_rate": 0.0003424766225756537, "loss": 3510.25, "step": 6060 }, { "ce_loss_13": 2.5223917841911314, "ce_loss_26": 1.9723280429840089, "ce_loss_39": 1.7729612857103347, "ce_loss_52": 1.4048843801021575, "ce_loss_7": 2.8645106673240663, "epoch": 0.607, "grad_norm": 15.573595841239559, "kl_loss_13": 2300.6, "kl_loss_26": 1138.2, "kl_loss_39": 727.85, "kl_loss_7": 3019.6, "learning_rate": 0.00034097155425921255, "loss": 3527.0, "step": 6070 }, { "ce_loss_13": 2.491760790348053, "ce_loss_26": 1.9636650770902633, "ce_loss_39": 1.7728582590818405, "ce_loss_52": 1.4259491577744483, "ce_loss_7": 2.821820729970932, "epoch": 0.608, "grad_norm": 14.653123046442355, "kl_loss_13": 2168.6, "kl_loss_26": 1063.1, "kl_loss_39": 679.3, "kl_loss_7": 2864.4, "learning_rate": 0.0003394680873574546, "loss": 3528.9, "step": 6080 }, { "ce_loss_13": 2.5085246324539185, "ce_loss_26": 1.9797437161207199, "ce_loss_39": 1.7878784984350204, "ce_loss_52": 1.4316603004932404, "ce_loss_7": 2.847382205724716, "epoch": 0.609, "grad_norm": 14.913942335206821, "kl_loss_13": 2211.2, "kl_loss_26": 1091.8, "kl_loss_39": 696.95, "kl_loss_7": 2922.0, "learning_rate": 0.0003379662370102747, "loss": 3549.95, "step": 6090 }, { "ce_loss_13": 2.4759395986795427, "ce_loss_26": 1.949927881360054, "ce_loss_39": 1.7604204803705215, "ce_loss_52": 1.41001408547163, "ce_loss_7": 2.8064420104026793, "epoch": 0.61, "grad_norm": 14.676610718900648, "kl_loss_13": 2185.6, "kl_loss_26": 1077.2, "kl_loss_39": 688.0, "kl_loss_7": 2875.6, "learning_rate": 0.0003364660183412892, "loss": 3507.8, "step": 6100 }, { "ce_loss_13": 2.531206899881363, "ce_loss_26": 1.9836675137281419, "ce_loss_39": 1.7869503110647202, "ce_loss_52": 1.432834729552269, "ce_loss_7": 2.873502719402313, "epoch": 0.611, "grad_norm": 14.829577774305747, "kl_loss_13": 2255.0, "kl_loss_26": 1106.2, "kl_loss_39": 703.75, "kl_loss_7": 2966.4, "learning_rate": 0.0003349674464576834, "loss": 3495.25, "step": 6110 }, { "ce_loss_13": 2.514421796798706, "ce_loss_26": 1.9697007417678833, "ce_loss_39": 1.780946272611618, "ce_loss_52": 1.4202388614416122, "ce_loss_7": 2.8420049071311952, "epoch": 0.612, "grad_norm": 14.8111788086498, "kl_loss_13": 2230.2, "kl_loss_26": 1101.1, "kl_loss_39": 705.2, "kl_loss_7": 2920.0, "learning_rate": 0.00033347053645005966, "loss": 3492.65, "step": 6120 }, { "ce_loss_13": 2.537186449766159, "ce_loss_26": 2.004375171661377, "ce_loss_39": 1.8157330989837646, "ce_loss_52": 1.451986312866211, "ce_loss_7": 2.878436690568924, "epoch": 0.613, "grad_norm": 15.401404522918961, "kl_loss_13": 2242.0, "kl_loss_26": 1113.3, "kl_loss_39": 718.55, "kl_loss_7": 2945.6, "learning_rate": 0.00033197530339228485, "loss": 3459.1, "step": 6130 }, { "ce_loss_13": 2.506669583916664, "ce_loss_26": 1.9629664570093155, "ce_loss_39": 1.78002208173275, "ce_loss_52": 1.4117600202560425, "ce_loss_7": 2.847608286142349, "epoch": 0.614, "grad_norm": 15.390199704579972, "kl_loss_13": 2259.8, "kl_loss_26": 1112.7, "kl_loss_39": 727.8, "kl_loss_7": 2986.8, "learning_rate": 0.00033048176234133967, "loss": 3537.45, "step": 6140 }, { "ce_loss_13": 2.5588534235954286, "ce_loss_26": 2.024568209052086, "ce_loss_39": 1.8284731358289719, "ce_loss_52": 1.4587910890579223, "ce_loss_7": 2.8921659886837006, "epoch": 0.615, "grad_norm": 14.629003698363993, "kl_loss_13": 2259.8, "kl_loss_26": 1126.9, "kl_loss_39": 724.0, "kl_loss_7": 2958.8, "learning_rate": 0.0003289899283371657, "loss": 3536.8, "step": 6150 }, { "ce_loss_13": 2.4743064284324645, "ce_loss_26": 1.9476153373718261, "ce_loss_39": 1.7604322880506516, "ce_loss_52": 1.4059417188167571, "ce_loss_7": 2.7994522780179976, "epoch": 0.616, "grad_norm": 15.287517673195092, "kl_loss_13": 2207.6, "kl_loss_26": 1094.2, "kl_loss_39": 708.05, "kl_loss_7": 2892.4, "learning_rate": 0.0003274998164025148, "loss": 3522.0, "step": 6160 }, { "ce_loss_13": 2.61488196849823, "ce_loss_26": 2.067342773079872, "ce_loss_39": 1.8675355583429336, "ce_loss_52": 1.4927641093730926, "ce_loss_7": 2.948505789041519, "epoch": 0.617, "grad_norm": 14.482727532648099, "kl_loss_13": 2288.0, "kl_loss_26": 1139.7, "kl_loss_39": 729.35, "kl_loss_7": 2993.6, "learning_rate": 0.0003260114415427975, "loss": 3494.95, "step": 6170 }, { "ce_loss_13": 2.5525584638118746, "ce_loss_26": 1.9941669285297394, "ce_loss_39": 1.7988057792186738, "ce_loss_52": 1.4276691943407058, "ce_loss_7": 2.888332962989807, "epoch": 0.618, "grad_norm": 15.194650585699007, "kl_loss_13": 2319.2, "kl_loss_26": 1148.8, "kl_loss_39": 742.3, "kl_loss_7": 3022.8, "learning_rate": 0.0003245248187459323, "loss": 3535.8, "step": 6180 }, { "ce_loss_13": 2.5258300840854644, "ce_loss_26": 1.9903143167495727, "ce_loss_39": 1.8010086834430694, "ce_loss_52": 1.4478329718112946, "ce_loss_7": 2.86256263256073, "epoch": 0.619, "grad_norm": 14.652051426511987, "kl_loss_13": 2228.6, "kl_loss_26": 1088.0, "kl_loss_39": 694.0, "kl_loss_7": 2940.4, "learning_rate": 0.00032303996298219416, "loss": 3513.7, "step": 6190 }, { "ce_loss_13": 2.5733933985233306, "ce_loss_26": 2.0336378514766693, "ce_loss_39": 1.8403378069400786, "ce_loss_52": 1.4681322902441025, "ce_loss_7": 2.920198345184326, "epoch": 0.62, "grad_norm": 15.056132330426859, "kl_loss_13": 2280.4, "kl_loss_26": 1131.3, "kl_loss_39": 733.75, "kl_loss_7": 3002.8, "learning_rate": 0.00032155688920406414, "loss": 3507.7, "step": 6200 }, { "ce_loss_13": 2.5029452949762345, "ce_loss_26": 1.9643093675374985, "ce_loss_39": 1.7774474427103997, "ce_loss_52": 1.4176854699850083, "ce_loss_7": 2.823590323328972, "epoch": 0.621, "grad_norm": 14.49052860339056, "kl_loss_13": 2220.0, "kl_loss_26": 1086.1, "kl_loss_39": 699.45, "kl_loss_7": 2902.8, "learning_rate": 0.0003200756123460788, "loss": 3535.45, "step": 6210 }, { "ce_loss_13": 2.489814931154251, "ce_loss_26": 1.9501473933458329, "ce_loss_39": 1.7617685228586197, "ce_loss_52": 1.41129230260849, "ce_loss_7": 2.816985684633255, "epoch": 0.622, "grad_norm": 14.505478323074753, "kl_loss_13": 2218.6, "kl_loss_26": 1080.9, "kl_loss_39": 689.55, "kl_loss_7": 2918.0, "learning_rate": 0.00031859614732467957, "loss": 3488.95, "step": 6220 }, { "ce_loss_13": 2.5316161155700683, "ce_loss_26": 2.0058958530426025, "ce_loss_39": 1.8137048929929733, "ce_loss_52": 1.461652959883213, "ce_loss_7": 2.8647646605968475, "epoch": 0.623, "grad_norm": 13.917606472624449, "kl_loss_13": 2204.8, "kl_loss_26": 1085.2, "kl_loss_39": 685.1, "kl_loss_7": 2902.4, "learning_rate": 0.00031711850903806275, "loss": 3465.2, "step": 6230 }, { "ce_loss_13": 2.500165891647339, "ce_loss_26": 1.9614384204149247, "ce_loss_39": 1.771432462334633, "ce_loss_52": 1.4076189696788788, "ce_loss_7": 2.832933169603348, "epoch": 0.624, "grad_norm": 14.223738912104434, "kl_loss_13": 2258.6, "kl_loss_26": 1125.7, "kl_loss_39": 725.1, "kl_loss_7": 2956.8, "learning_rate": 0.0003156427123660297, "loss": 3486.3, "step": 6240 }, { "ce_loss_13": 2.5448601841926575, "ce_loss_26": 2.000015211105347, "ce_loss_39": 1.8015096932649612, "ce_loss_52": 1.440242400765419, "ce_loss_7": 2.878038114309311, "epoch": 0.625, "grad_norm": 14.471917182701668, "kl_loss_13": 2262.4, "kl_loss_26": 1124.4, "kl_loss_39": 714.0, "kl_loss_7": 2962.4, "learning_rate": 0.0003141687721698363, "loss": 3490.15, "step": 6250 }, { "ce_loss_13": 2.5199947118759156, "ce_loss_26": 1.986537829041481, "ce_loss_39": 1.796593463420868, "ce_loss_52": 1.4425756543874741, "ce_loss_7": 2.854223221540451, "epoch": 0.626, "grad_norm": 14.607284979944513, "kl_loss_13": 2212.8, "kl_loss_26": 1085.5, "kl_loss_39": 686.9, "kl_loss_7": 2911.6, "learning_rate": 0.00031269670329204396, "loss": 3493.3, "step": 6260 }, { "ce_loss_13": 2.5326361417770387, "ce_loss_26": 2.002932494878769, "ce_loss_39": 1.8195136040449142, "ce_loss_52": 1.4736543655395509, "ce_loss_7": 2.8653872847557067, "epoch": 0.627, "grad_norm": 13.814637143502924, "kl_loss_13": 2172.2, "kl_loss_26": 1054.9, "kl_loss_39": 669.6, "kl_loss_7": 2869.6, "learning_rate": 0.00031122652055637015, "loss": 3492.5, "step": 6270 }, { "ce_loss_13": 2.482167053222656, "ce_loss_26": 1.9512667179107666, "ce_loss_39": 1.7665533930063249, "ce_loss_52": 1.42098408639431, "ce_loss_7": 2.8168558061122893, "epoch": 0.628, "grad_norm": 16.02072851141737, "kl_loss_13": 2193.0, "kl_loss_26": 1069.4, "kl_loss_39": 682.35, "kl_loss_7": 2894.0, "learning_rate": 0.0003097582387675385, "loss": 3459.75, "step": 6280 }, { "ce_loss_13": 2.47237606048584, "ce_loss_26": 1.95036241710186, "ce_loss_39": 1.764494326710701, "ce_loss_52": 1.4227848395705223, "ce_loss_7": 2.8021757781505583, "epoch": 0.629, "grad_norm": 15.255934367745192, "kl_loss_13": 2167.4, "kl_loss_26": 1053.3, "kl_loss_39": 664.9, "kl_loss_7": 2858.8, "learning_rate": 0.00030829187271113034, "loss": 3446.7, "step": 6290 }, { "ce_loss_13": 2.501003822684288, "ce_loss_26": 1.958929392695427, "ce_loss_39": 1.76513631939888, "ce_loss_52": 1.4046356767416, "ce_loss_7": 2.8350019991397857, "epoch": 0.63, "grad_norm": 14.88714427947574, "kl_loss_13": 2270.2, "kl_loss_26": 1123.2, "kl_loss_39": 716.8, "kl_loss_7": 2970.0, "learning_rate": 0.00030682743715343565, "loss": 3508.45, "step": 6300 }, { "ce_loss_13": 2.5800569117069245, "ce_loss_26": 2.0348214149475097, "ce_loss_39": 1.8403811991214751, "ce_loss_52": 1.476933541893959, "ce_loss_7": 2.9134635806083677, "epoch": 0.631, "grad_norm": 14.94722074912583, "kl_loss_13": 2245.0, "kl_loss_26": 1107.2, "kl_loss_39": 706.25, "kl_loss_7": 2952.8, "learning_rate": 0.0003053649468413043, "loss": 3499.45, "step": 6310 }, { "ce_loss_13": 2.5208792209625246, "ce_loss_26": 1.9822687000036239, "ce_loss_39": 1.7960426419973374, "ce_loss_52": 1.4455793976783753, "ce_loss_7": 2.8597992181777956, "epoch": 0.632, "grad_norm": 15.453003389066977, "kl_loss_13": 2216.2, "kl_loss_26": 1070.1, "kl_loss_39": 678.0, "kl_loss_7": 2927.6, "learning_rate": 0.00030390441650199725, "loss": 3483.5, "step": 6320 }, { "ce_loss_13": 2.441950124502182, "ce_loss_26": 1.91085105240345, "ce_loss_39": 1.7248677492141724, "ce_loss_52": 1.3885301351547241, "ce_loss_7": 2.777034705877304, "epoch": 0.633, "grad_norm": 14.901626155092913, "kl_loss_13": 2155.8, "kl_loss_26": 1040.1, "kl_loss_39": 653.2, "kl_loss_7": 2859.2, "learning_rate": 0.00030244586084303903, "loss": 3433.35, "step": 6330 }, { "ce_loss_13": 2.4565594136714934, "ce_loss_26": 1.9154207110404968, "ce_loss_39": 1.7311667442321776, "ce_loss_52": 1.3880270063877105, "ce_loss_7": 2.79474156498909, "epoch": 0.634, "grad_norm": 15.212752240316364, "kl_loss_13": 2198.2, "kl_loss_26": 1053.8, "kl_loss_39": 669.5, "kl_loss_7": 2908.0, "learning_rate": 0.00030098929455206903, "loss": 3450.2, "step": 6340 }, { "ce_loss_13": 2.4875996589660643, "ce_loss_26": 1.9456024587154388, "ce_loss_39": 1.7510357975959778, "ce_loss_52": 1.4120293408632278, "ce_loss_7": 2.8184066474437715, "epoch": 0.635, "grad_norm": 14.444701123760842, "kl_loss_13": 2190.4, "kl_loss_26": 1070.0, "kl_loss_39": 670.4, "kl_loss_7": 2884.8, "learning_rate": 0.00029953473229669324, "loss": 3500.6, "step": 6350 }, { "ce_loss_13": 2.505690813064575, "ce_loss_26": 1.968365904688835, "ce_loss_39": 1.7826423317193985, "ce_loss_52": 1.4394667357206345, "ce_loss_7": 2.8453324735164642, "epoch": 0.636, "grad_norm": 14.480279054372499, "kl_loss_13": 2207.2, "kl_loss_26": 1070.3, "kl_loss_39": 680.95, "kl_loss_7": 2914.4, "learning_rate": 0.00029808218872433767, "loss": 3473.05, "step": 6360 }, { "ce_loss_13": 2.462022843956947, "ce_loss_26": 1.9284056276082993, "ce_loss_39": 1.7475204050540925, "ce_loss_52": 1.3994116008281707, "ce_loss_7": 2.799399584531784, "epoch": 0.637, "grad_norm": 14.854420396233579, "kl_loss_13": 2187.4, "kl_loss_26": 1057.9, "kl_loss_39": 676.9, "kl_loss_7": 2908.4, "learning_rate": 0.0002966316784621, "loss": 3431.55, "step": 6370 }, { "ce_loss_13": 2.46474946141243, "ce_loss_26": 1.9243933081626892, "ce_loss_39": 1.7351078271865845, "ce_loss_52": 1.3951061010360717, "ce_loss_7": 2.809561550617218, "epoch": 0.638, "grad_norm": 14.398656186824772, "kl_loss_13": 2201.4, "kl_loss_26": 1064.6, "kl_loss_39": 673.7, "kl_loss_7": 2913.6, "learning_rate": 0.0002951832161166024, "loss": 3433.1, "step": 6380 }, { "ce_loss_13": 2.524071788787842, "ce_loss_26": 1.9882585108280182, "ce_loss_39": 1.8004582822322845, "ce_loss_52": 1.457192499935627, "ce_loss_7": 2.859678488969803, "epoch": 0.639, "grad_norm": 14.823787609750735, "kl_loss_13": 2198.8, "kl_loss_26": 1062.9, "kl_loss_39": 671.2, "kl_loss_7": 2895.2, "learning_rate": 0.0002937368162738445, "loss": 3448.55, "step": 6390 }, { "ce_loss_13": 2.476853275299072, "ce_loss_26": 1.940243661403656, "ce_loss_39": 1.7542554527521133, "ce_loss_52": 1.4153559118509293, "ce_loss_7": 2.8181783974170687, "epoch": 0.64, "grad_norm": 14.674283953178037, "kl_loss_13": 2177.8, "kl_loss_26": 1053.1, "kl_loss_39": 664.6, "kl_loss_7": 2899.6, "learning_rate": 0.0002922924934990568, "loss": 3441.6, "step": 6400 }, { "ce_loss_13": 2.4689641296863556, "ce_loss_26": 1.933935484290123, "ce_loss_39": 1.7450189381837844, "ce_loss_52": 1.3959352299571037, "ce_loss_7": 2.8043021619319917, "epoch": 0.641, "grad_norm": 16.188741684715673, "kl_loss_13": 2210.6, "kl_loss_26": 1080.2, "kl_loss_39": 681.3, "kl_loss_7": 2920.8, "learning_rate": 0.0002908502623365536, "loss": 3439.95, "step": 6410 }, { "ce_loss_13": 2.51960112452507, "ce_loss_26": 1.9860825181007384, "ce_loss_39": 1.792076262831688, "ce_loss_52": 1.4388326108455658, "ce_loss_7": 2.8559226214885713, "epoch": 0.642, "grad_norm": 15.164079412983817, "kl_loss_13": 2205.0, "kl_loss_26": 1076.5, "kl_loss_39": 686.15, "kl_loss_7": 2899.6, "learning_rate": 0.0002894101373095867, "loss": 3403.75, "step": 6420 }, { "ce_loss_13": 2.57558217048645, "ce_loss_26": 2.037161833047867, "ce_loss_39": 1.8481904029846192, "ce_loss_52": 1.4971486061811448, "ce_loss_7": 2.9037492871284485, "epoch": 0.643, "grad_norm": 14.617370011749491, "kl_loss_13": 2241.0, "kl_loss_26": 1099.6, "kl_loss_39": 706.2, "kl_loss_7": 2940.0, "learning_rate": 0.00028797213292019926, "loss": 3465.25, "step": 6430 }, { "ce_loss_13": 2.4703142285346984, "ce_loss_26": 1.9478118807077407, "ce_loss_39": 1.7643914371728897, "ce_loss_52": 1.4303042769432068, "ce_loss_7": 2.8028221487998963, "epoch": 0.644, "grad_norm": 14.268057235198288, "kl_loss_13": 2150.2, "kl_loss_26": 1039.5, "kl_loss_39": 657.25, "kl_loss_7": 2845.2, "learning_rate": 0.0002865362636490791, "loss": 3397.05, "step": 6440 }, { "ce_loss_13": 2.5057600528001784, "ce_loss_26": 1.963181382417679, "ce_loss_39": 1.766686275601387, "ce_loss_52": 1.4219027027487754, "ce_loss_7": 2.842684781551361, "epoch": 0.645, "grad_norm": 15.007302910220881, "kl_loss_13": 2227.2, "kl_loss_26": 1085.9, "kl_loss_39": 688.6, "kl_loss_7": 2943.2, "learning_rate": 0.0002851025439554142, "loss": 3420.9, "step": 6450 }, { "ce_loss_13": 2.5524505376815796, "ce_loss_26": 2.00695119202137, "ce_loss_39": 1.812732595205307, "ce_loss_52": 1.4619058847427369, "ce_loss_7": 2.89662281870842, "epoch": 0.646, "grad_norm": 14.944231437877365, "kl_loss_13": 2231.2, "kl_loss_26": 1087.1, "kl_loss_39": 688.9, "kl_loss_7": 2952.8, "learning_rate": 0.00028367098827674573, "loss": 3473.25, "step": 6460 }, { "ce_loss_13": 2.5118141055107115, "ce_loss_26": 1.9752016961574554, "ce_loss_39": 1.790860089659691, "ce_loss_52": 1.4514876693487166, "ce_loss_7": 2.8457858681678774, "epoch": 0.647, "grad_norm": 14.47327693725919, "kl_loss_13": 2178.4, "kl_loss_26": 1057.8, "kl_loss_39": 666.1, "kl_loss_7": 2874.4, "learning_rate": 0.00028224161102882397, "loss": 3430.95, "step": 6470 }, { "ce_loss_13": 2.4975059896707537, "ce_loss_26": 1.9533880710601808, "ce_loss_39": 1.7617440074682236, "ce_loss_52": 1.414775413274765, "ce_loss_7": 2.8368520498275758, "epoch": 0.648, "grad_norm": 14.67215053951943, "kl_loss_13": 2218.6, "kl_loss_26": 1078.5, "kl_loss_39": 690.45, "kl_loss_7": 2930.0, "learning_rate": 0.00028081442660546124, "loss": 3435.85, "step": 6480 }, { "ce_loss_13": 2.4587242364883424, "ce_loss_26": 1.9304294764995575, "ce_loss_39": 1.7412324339151382, "ce_loss_52": 1.4029302895069122, "ce_loss_7": 2.7982348799705505, "epoch": 0.649, "grad_norm": 14.728330622094298, "kl_loss_13": 2170.4, "kl_loss_26": 1055.8, "kl_loss_39": 664.85, "kl_loss_7": 2878.8, "learning_rate": 0.0002793894493783892, "loss": 3431.05, "step": 6490 }, { "ce_loss_13": 2.5393730461597444, "ce_loss_26": 1.9960223108530044, "ce_loss_39": 1.808261874318123, "ce_loss_52": 1.45538187623024, "ce_loss_7": 2.8760022819042206, "epoch": 0.65, "grad_norm": 15.357970116880674, "kl_loss_13": 2229.4, "kl_loss_26": 1089.7, "kl_loss_39": 695.95, "kl_loss_7": 2930.8, "learning_rate": 0.0002779666936971129, "loss": 3429.5, "step": 6500 }, { "ce_loss_13": 2.496321311593056, "ce_loss_26": 1.9730540215969086, "ce_loss_39": 1.7846842855215073, "ce_loss_52": 1.444144432246685, "ce_loss_7": 2.8269869565963743, "epoch": 0.651, "grad_norm": 14.142211217794582, "kl_loss_13": 2159.4, "kl_loss_26": 1043.4, "kl_loss_39": 660.95, "kl_loss_7": 2856.0, "learning_rate": 0.00027654617388876614, "loss": 3409.65, "step": 6510 }, { "ce_loss_13": 2.4942274272441862, "ce_loss_26": 1.9708451181650162, "ce_loss_39": 1.781627294421196, "ce_loss_52": 1.436317929625511, "ce_loss_7": 2.824290210008621, "epoch": 0.652, "grad_norm": 14.257728738894219, "kl_loss_13": 2158.0, "kl_loss_26": 1061.3, "kl_loss_39": 676.9, "kl_loss_7": 2841.6, "learning_rate": 0.0002751279042579672, "loss": 3420.3, "step": 6520 }, { "ce_loss_13": 2.474119412899017, "ce_loss_26": 1.9305396527051926, "ce_loss_39": 1.745158138871193, "ce_loss_52": 1.4086509764194488, "ce_loss_7": 2.8088379979133604, "epoch": 0.653, "grad_norm": 14.091894330635501, "kl_loss_13": 2189.8, "kl_loss_26": 1060.2, "kl_loss_39": 671.45, "kl_loss_7": 2896.4, "learning_rate": 0.00027371189908667604, "loss": 3430.2, "step": 6530 }, { "ce_loss_13": 2.5130710184574125, "ce_loss_26": 1.9766233384609222, "ce_loss_39": 1.7931175470352172, "ce_loss_52": 1.4404253482818603, "ce_loss_7": 2.8409298956394196, "epoch": 0.654, "grad_norm": 14.81675411336707, "kl_loss_13": 2196.2, "kl_loss_26": 1079.4, "kl_loss_39": 687.65, "kl_loss_7": 2891.2, "learning_rate": 0.00027229817263404863, "loss": 3395.3, "step": 6540 }, { "ce_loss_13": 2.489423853158951, "ce_loss_26": 1.9515893071889878, "ce_loss_39": 1.7590265810489654, "ce_loss_52": 1.4197510361671448, "ce_loss_7": 2.832774597406387, "epoch": 0.655, "grad_norm": 14.759093026127282, "kl_loss_13": 2178.4, "kl_loss_26": 1050.2, "kl_loss_39": 655.0, "kl_loss_7": 2893.2, "learning_rate": 0.0002708867391362948, "loss": 3416.7, "step": 6550 }, { "ce_loss_13": 2.5174727141857147, "ce_loss_26": 1.9717289686203003, "ce_loss_39": 1.7858693569898605, "ce_loss_52": 1.4473539382219314, "ce_loss_7": 2.8465367794036864, "epoch": 0.656, "grad_norm": 14.064047100581472, "kl_loss_13": 2182.8, "kl_loss_26": 1041.7, "kl_loss_39": 656.75, "kl_loss_7": 2877.6, "learning_rate": 0.0002694776128065345, "loss": 3397.05, "step": 6560 }, { "ce_loss_13": 2.528963714838028, "ce_loss_26": 1.981925156712532, "ce_loss_39": 1.7907918602228166, "ce_loss_52": 1.4499590158462525, "ce_loss_7": 2.871799385547638, "epoch": 0.657, "grad_norm": 14.569880458498675, "kl_loss_13": 2200.6, "kl_loss_26": 1059.1, "kl_loss_39": 660.35, "kl_loss_7": 2915.2, "learning_rate": 0.00026807080783465374, "loss": 3393.8, "step": 6570 }, { "ce_loss_13": 2.5217245757579803, "ce_loss_26": 1.9828781098127366, "ce_loss_39": 1.7871235221624375, "ce_loss_52": 1.4383462622761727, "ce_loss_7": 2.864704269170761, "epoch": 0.658, "grad_norm": 14.405026774063144, "kl_loss_13": 2246.6, "kl_loss_26": 1094.4, "kl_loss_39": 694.55, "kl_loss_7": 2961.2, "learning_rate": 0.00026666633838716316, "loss": 3410.55, "step": 6580 }, { "ce_loss_13": 2.525897091627121, "ce_loss_26": 1.9987964391708375, "ce_loss_39": 1.815827977657318, "ce_loss_52": 1.4701957792043685, "ce_loss_7": 2.8572112381458283, "epoch": 0.659, "grad_norm": 14.76642356275535, "kl_loss_13": 2192.0, "kl_loss_26": 1080.0, "kl_loss_39": 689.6, "kl_loss_7": 2880.4, "learning_rate": 0.00026526421860705474, "loss": 3403.15, "step": 6590 }, { "ce_loss_13": 2.516835355758667, "ce_loss_26": 1.9933927595615386, "ce_loss_39": 1.8040386736392975, "ce_loss_52": 1.4699463561177253, "ce_loss_7": 2.839303117990494, "epoch": 0.66, "grad_norm": 15.026727729458214, "kl_loss_13": 2153.2, "kl_loss_26": 1047.1, "kl_loss_39": 659.7, "kl_loss_7": 2838.8, "learning_rate": 0.0002638644626136587, "loss": 3420.9, "step": 6600 }, { "ce_loss_13": 2.5133367598056795, "ce_loss_26": 1.9757141143083572, "ce_loss_39": 1.7787566870450973, "ce_loss_52": 1.4421678900718689, "ce_loss_7": 2.84059277176857, "epoch": 0.661, "grad_norm": 14.33502800394329, "kl_loss_13": 2167.2, "kl_loss_26": 1050.2, "kl_loss_39": 655.95, "kl_loss_7": 2853.6, "learning_rate": 0.00026246708450250255, "loss": 3363.15, "step": 6610 }, { "ce_loss_13": 2.530620867013931, "ce_loss_26": 2.015302965044975, "ce_loss_39": 1.8304377377033234, "ce_loss_52": 1.4897648423910141, "ce_loss_7": 2.84687961935997, "epoch": 0.662, "grad_norm": 14.322777446508148, "kl_loss_13": 2141.4, "kl_loss_26": 1047.1, "kl_loss_39": 661.0, "kl_loss_7": 2818.4, "learning_rate": 0.00026107209834516854, "loss": 3368.65, "step": 6620 }, { "ce_loss_13": 2.5134909957647324, "ce_loss_26": 1.964612963795662, "ce_loss_39": 1.7641142904758453, "ce_loss_52": 1.4144739270210267, "ce_loss_7": 2.851569724082947, "epoch": 0.663, "grad_norm": 14.388583697005986, "kl_loss_13": 2236.4, "kl_loss_26": 1088.4, "kl_loss_39": 685.8, "kl_loss_7": 2945.2, "learning_rate": 0.0002596795181891514, "loss": 3390.15, "step": 6630 }, { "ce_loss_13": 2.4643958449363708, "ce_loss_26": 1.9360562086105346, "ce_loss_39": 1.7430761098861693, "ce_loss_52": 1.4107858330011367, "ce_loss_7": 2.7886133015155794, "epoch": 0.664, "grad_norm": 14.667067788196036, "kl_loss_13": 2160.8, "kl_loss_26": 1055.2, "kl_loss_39": 663.05, "kl_loss_7": 2836.8, "learning_rate": 0.000258289358057718, "loss": 3433.7, "step": 6640 }, { "ce_loss_13": 2.470621481537819, "ce_loss_26": 1.932977157831192, "ce_loss_39": 1.7458938509225845, "ce_loss_52": 1.40322026014328, "ce_loss_7": 2.8033271014690397, "epoch": 0.665, "grad_norm": 14.559695450600435, "kl_loss_13": 2196.2, "kl_loss_26": 1066.5, "kl_loss_39": 673.3, "kl_loss_7": 2893.6, "learning_rate": 0.0002569016319497657, "loss": 3385.35, "step": 6650 }, { "ce_loss_13": 2.523782452940941, "ce_loss_26": 1.9726400285959245, "ce_loss_39": 1.783732882142067, "ce_loss_52": 1.441526584327221, "ce_loss_7": 2.8546798706054686, "epoch": 0.666, "grad_norm": 14.338494844564005, "kl_loss_13": 2201.0, "kl_loss_26": 1066.9, "kl_loss_39": 675.1, "kl_loss_7": 2913.6, "learning_rate": 0.00025551635383968066, "loss": 3431.65, "step": 6660 }, { "ce_loss_13": 2.496166667342186, "ce_loss_26": 1.9616001814603805, "ce_loss_39": 1.7780775994062423, "ce_loss_52": 1.4437817305326461, "ce_loss_7": 2.8353283524513246, "epoch": 0.667, "grad_norm": 14.333332930511547, "kl_loss_13": 2162.8, "kl_loss_26": 1041.8, "kl_loss_39": 658.3, "kl_loss_7": 2867.2, "learning_rate": 0.00025413353767719804, "loss": 3373.9, "step": 6670 }, { "ce_loss_13": 2.4899742186069487, "ce_loss_26": 1.9639368683099747, "ce_loss_39": 1.7742518305778503, "ce_loss_52": 1.4562912076711654, "ce_loss_7": 2.813701218366623, "epoch": 0.668, "grad_norm": 15.020866565536496, "kl_loss_13": 2118.8, "kl_loss_26": 1008.9, "kl_loss_39": 626.15, "kl_loss_7": 2799.6, "learning_rate": 0.0002527531973872617, "loss": 3354.0, "step": 6680 }, { "ce_loss_13": 2.4495032489299775, "ce_loss_26": 1.9232689619064331, "ce_loss_39": 1.7322240889072418, "ce_loss_52": 1.4083685100078582, "ce_loss_7": 2.7809501469135283, "epoch": 0.669, "grad_norm": 15.157984637483661, "kl_loss_13": 2152.4, "kl_loss_26": 1034.8, "kl_loss_39": 641.05, "kl_loss_7": 2845.2, "learning_rate": 0.0002513753468698826, "loss": 3397.05, "step": 6690 }, { "ce_loss_13": 2.5416204214096068, "ce_loss_26": 1.9909723430871964, "ce_loss_39": 1.797818985581398, "ce_loss_52": 1.4566215574741364, "ce_loss_7": 2.8839422285556795, "epoch": 0.67, "grad_norm": 14.731917297604895, "kl_loss_13": 2207.8, "kl_loss_26": 1067.0, "kl_loss_39": 664.6, "kl_loss_7": 2918.4, "learning_rate": 0.0002500000000000001, "loss": 3410.05, "step": 6700 }, { "ce_loss_13": 2.46220725774765, "ce_loss_26": 1.9426458358764649, "ce_loss_39": 1.7631336867809295, "ce_loss_52": 1.4324473321437836, "ce_loss_7": 2.7896072566509247, "epoch": 0.671, "grad_norm": 14.394157937336324, "kl_loss_13": 2145.4, "kl_loss_26": 1030.2, "kl_loss_39": 651.6, "kl_loss_7": 2836.8, "learning_rate": 0.0002486271706273421, "loss": 3349.6, "step": 6710 }, { "ce_loss_13": 2.4807921826839445, "ce_loss_26": 1.9570556044578553, "ce_loss_39": 1.776718083024025, "ce_loss_52": 1.4477659314870834, "ce_loss_7": 2.8088565468788147, "epoch": 0.672, "grad_norm": 14.57538335602299, "kl_loss_13": 2109.6, "kl_loss_26": 1009.8, "kl_loss_39": 634.25, "kl_loss_7": 2806.0, "learning_rate": 0.0002472568725762853, "loss": 3376.2, "step": 6720 }, { "ce_loss_13": 2.4802849024534224, "ce_loss_26": 1.9443901777267456, "ce_loss_39": 1.7523796886205674, "ce_loss_52": 1.4139477282762527, "ce_loss_7": 2.8129481852054594, "epoch": 0.673, "grad_norm": 14.144296062088605, "kl_loss_13": 2194.4, "kl_loss_26": 1069.6, "kl_loss_39": 674.95, "kl_loss_7": 2888.8, "learning_rate": 0.00024588911964571554, "loss": 3364.55, "step": 6730 }, { "ce_loss_13": 2.5132571697235107, "ce_loss_26": 1.9828839927911759, "ce_loss_39": 1.7922901511192322, "ce_loss_52": 1.4625934183597564, "ce_loss_7": 2.841163671016693, "epoch": 0.674, "grad_norm": 14.199249331732203, "kl_loss_13": 2159.4, "kl_loss_26": 1039.5, "kl_loss_39": 646.65, "kl_loss_7": 2846.0, "learning_rate": 0.00024452392560888974, "loss": 3361.05, "step": 6740 }, { "ce_loss_13": 2.4865836411714555, "ce_loss_26": 1.954461258649826, "ce_loss_39": 1.7649456202983855, "ce_loss_52": 1.4221005111932754, "ce_loss_7": 2.821400898694992, "epoch": 0.675, "grad_norm": 14.682119193582665, "kl_loss_13": 2206.2, "kl_loss_26": 1077.5, "kl_loss_39": 678.9, "kl_loss_7": 2902.4, "learning_rate": 0.00024316130421329695, "loss": 3347.95, "step": 6750 }, { "ce_loss_13": 2.474187096953392, "ce_loss_26": 1.9446223825216293, "ce_loss_39": 1.7641993075609208, "ce_loss_52": 1.436264917254448, "ce_loss_7": 2.8015049755573274, "epoch": 0.676, "grad_norm": 14.72420694586694, "kl_loss_13": 2155.0, "kl_loss_26": 1030.2, "kl_loss_39": 646.8, "kl_loss_7": 2843.6, "learning_rate": 0.00024180126918051909, "loss": 3348.9, "step": 6760 }, { "ce_loss_13": 2.480317395925522, "ce_loss_26": 1.956321433186531, "ce_loss_39": 1.7687535285949707, "ce_loss_52": 1.4266707986593246, "ce_loss_7": 2.808481311798096, "epoch": 0.677, "grad_norm": 15.416504395614744, "kl_loss_13": 2171.4, "kl_loss_26": 1058.8, "kl_loss_39": 666.6, "kl_loss_7": 2869.2, "learning_rate": 0.00024044383420609406, "loss": 3413.1, "step": 6770 }, { "ce_loss_13": 2.5013114362955093, "ce_loss_26": 1.9759581625461577, "ce_loss_39": 1.7926720827817917, "ce_loss_52": 1.4599666327238083, "ce_loss_7": 2.8250863194465636, "epoch": 0.678, "grad_norm": 13.962008513939274, "kl_loss_13": 2133.8, "kl_loss_26": 1039.8, "kl_loss_39": 651.85, "kl_loss_7": 2824.8, "learning_rate": 0.00023908901295937712, "loss": 3372.05, "step": 6780 }, { "ce_loss_13": 2.489407476782799, "ce_loss_26": 1.9609563022851944, "ce_loss_39": 1.778808832168579, "ce_loss_52": 1.454407089948654, "ce_loss_7": 2.8136882543563844, "epoch": 0.679, "grad_norm": 14.19915367627698, "kl_loss_13": 2110.2, "kl_loss_26": 1016.4, "kl_loss_39": 637.05, "kl_loss_7": 2792.0, "learning_rate": 0.00023773681908340283, "loss": 3384.7, "step": 6790 }, { "ce_loss_13": 2.4634098410606384, "ce_loss_26": 1.9384621411561966, "ce_loss_39": 1.7507896840572357, "ce_loss_52": 1.4146029382944107, "ce_loss_7": 2.7943048059940336, "epoch": 0.68, "grad_norm": 14.861512399701681, "kl_loss_13": 2166.4, "kl_loss_26": 1051.5, "kl_loss_39": 658.6, "kl_loss_7": 2860.8, "learning_rate": 0.00023638726619474876, "loss": 3356.85, "step": 6800 }, { "ce_loss_13": 2.5783134520053865, "ce_loss_26": 2.045197767019272, "ce_loss_39": 1.8584345400333404, "ce_loss_52": 1.5217636466026305, "ce_loss_7": 2.9070691764354706, "epoch": 0.681, "grad_norm": 14.552198364638281, "kl_loss_13": 2176.2, "kl_loss_26": 1054.5, "kl_loss_39": 659.7, "kl_loss_7": 2877.6, "learning_rate": 0.0002350403678833976, "loss": 3347.55, "step": 6810 }, { "ce_loss_13": 2.4692390322685243, "ce_loss_26": 1.943667185306549, "ce_loss_39": 1.7523112028837204, "ce_loss_52": 1.426128900051117, "ce_loss_7": 2.7978816986083985, "epoch": 0.682, "grad_norm": 14.998678580001265, "kl_loss_13": 2167.2, "kl_loss_26": 1050.9, "kl_loss_39": 652.95, "kl_loss_7": 2852.4, "learning_rate": 0.00023369613771260007, "loss": 3369.6, "step": 6820 }, { "ce_loss_13": 2.4953604638576508, "ce_loss_26": 1.981321769952774, "ce_loss_39": 1.7999033033847809, "ce_loss_52": 1.4693025022745132, "ce_loss_7": 2.826812982559204, "epoch": 0.683, "grad_norm": 14.167664016838927, "kl_loss_13": 2131.6, "kl_loss_26": 1041.6, "kl_loss_39": 654.4, "kl_loss_7": 2822.8, "learning_rate": 0.00023235458921873925, "loss": 3334.7, "step": 6830 }, { "ce_loss_13": 2.5015017211437227, "ce_loss_26": 1.9650517791509627, "ce_loss_39": 1.7717696577310562, "ce_loss_52": 1.4357108920812607, "ce_loss_7": 2.827932006120682, "epoch": 0.684, "grad_norm": 14.585579391353733, "kl_loss_13": 2162.0, "kl_loss_26": 1051.0, "kl_loss_39": 656.3, "kl_loss_7": 2848.0, "learning_rate": 0.0002310157359111938, "loss": 3348.15, "step": 6840 }, { "ce_loss_13": 2.426406466960907, "ce_loss_26": 1.9015664726495742, "ce_loss_39": 1.718049594759941, "ce_loss_52": 1.3935224622488023, "ce_loss_7": 2.756322818994522, "epoch": 0.685, "grad_norm": 15.055484269746147, "kl_loss_13": 2134.6, "kl_loss_26": 1022.1, "kl_loss_39": 635.6, "kl_loss_7": 2830.0, "learning_rate": 0.0002296795912722014, "loss": 3335.95, "step": 6850 }, { "ce_loss_13": 2.414138987660408, "ce_loss_26": 1.8910569071769714, "ce_loss_39": 1.708191841840744, "ce_loss_52": 1.383391012251377, "ce_loss_7": 2.747501391172409, "epoch": 0.686, "grad_norm": 14.125926922706771, "kl_loss_13": 2113.8, "kl_loss_26": 1014.7, "kl_loss_39": 637.05, "kl_loss_7": 2808.8, "learning_rate": 0.0002283461687567236, "loss": 3303.65, "step": 6860 }, { "ce_loss_13": 2.464204970002174, "ce_loss_26": 1.9375032573938369, "ce_loss_39": 1.7498446986079217, "ce_loss_52": 1.4226357489824295, "ce_loss_7": 2.7877202153205873, "epoch": 0.687, "grad_norm": 14.738238275957917, "kl_loss_13": 2139.2, "kl_loss_26": 1028.9, "kl_loss_39": 644.15, "kl_loss_7": 2826.0, "learning_rate": 0.00022701548179231045, "loss": 3307.2, "step": 6870 }, { "ce_loss_13": 2.494007241725922, "ce_loss_26": 1.9751898407936097, "ce_loss_39": 1.7829045623540878, "ce_loss_52": 1.4496880739927291, "ce_loss_7": 2.8286533296108245, "epoch": 0.688, "grad_norm": 13.852391362837148, "kl_loss_13": 2141.2, "kl_loss_26": 1042.5, "kl_loss_39": 651.3, "kl_loss_7": 2830.8, "learning_rate": 0.00022568754377896516, "loss": 3367.25, "step": 6880 }, { "ce_loss_13": 2.4991670876741408, "ce_loss_26": 1.9593406468629837, "ce_loss_39": 1.7670493572950363, "ce_loss_52": 1.426390787959099, "ce_loss_7": 2.8318077862262725, "epoch": 0.689, "grad_norm": 14.446284686187164, "kl_loss_13": 2202.8, "kl_loss_26": 1070.1, "kl_loss_39": 671.2, "kl_loss_7": 2903.6, "learning_rate": 0.00022436236808900844, "loss": 3351.3, "step": 6890 }, { "ce_loss_13": 2.5084181427955627, "ce_loss_26": 1.9811445116996764, "ce_loss_39": 1.7889297604560852, "ce_loss_52": 1.4664145559072495, "ce_loss_7": 2.8357039868831633, "epoch": 0.69, "grad_norm": 14.484209525578123, "kl_loss_13": 2151.6, "kl_loss_26": 1045.5, "kl_loss_39": 652.25, "kl_loss_7": 2853.6, "learning_rate": 0.00022303996806694487, "loss": 3356.65, "step": 6900 }, { "ce_loss_13": 2.514096361398697, "ce_loss_26": 1.9786822557449342, "ce_loss_39": 1.7935864567756652, "ce_loss_52": 1.4548332244157791, "ce_loss_7": 2.839564120769501, "epoch": 0.691, "grad_norm": 13.76338668087359, "kl_loss_13": 2184.2, "kl_loss_26": 1055.1, "kl_loss_39": 667.4, "kl_loss_7": 2879.6, "learning_rate": 0.00022172035702932823, "loss": 3337.1, "step": 6910 }, { "ce_loss_13": 2.4764732241630556, "ce_loss_26": 1.95515196621418, "ce_loss_39": 1.772997224330902, "ce_loss_52": 1.4487987339496613, "ce_loss_7": 2.8033831179142, "epoch": 0.692, "grad_norm": 14.586346705034499, "kl_loss_13": 2110.2, "kl_loss_26": 1014.4, "kl_loss_39": 638.2, "kl_loss_7": 2793.2, "learning_rate": 0.00022040354826462666, "loss": 3310.75, "step": 6920 }, { "ce_loss_13": 2.468746620416641, "ce_loss_26": 1.9468185782432557, "ce_loss_39": 1.7679857224225999, "ce_loss_52": 1.4481467604637146, "ce_loss_7": 2.789846181869507, "epoch": 0.693, "grad_norm": 15.005402062047136, "kl_loss_13": 2109.6, "kl_loss_26": 1011.9, "kl_loss_39": 635.85, "kl_loss_7": 2789.6, "learning_rate": 0.0002190895550330899, "loss": 3354.5, "step": 6930 }, { "ce_loss_13": 2.4501267641782762, "ce_loss_26": 1.9237078607082367, "ce_loss_39": 1.735903450846672, "ce_loss_52": 1.413103035092354, "ce_loss_7": 2.7897567749023438, "epoch": 0.694, "grad_norm": 14.894293084006021, "kl_loss_13": 2115.4, "kl_loss_26": 1009.8, "kl_loss_39": 623.75, "kl_loss_7": 2825.6, "learning_rate": 0.00021777839056661552, "loss": 3328.85, "step": 6940 }, { "ce_loss_13": 2.5025814145803453, "ce_loss_26": 1.9697064816951753, "ce_loss_39": 1.7858674556016922, "ce_loss_52": 1.459896171092987, "ce_loss_7": 2.8366158485412596, "epoch": 0.695, "grad_norm": 14.62978428769736, "kl_loss_13": 2143.8, "kl_loss_26": 1026.4, "kl_loss_39": 644.0, "kl_loss_7": 2842.8, "learning_rate": 0.0002164700680686147, "loss": 3339.6, "step": 6950 }, { "ce_loss_13": 2.480437287688255, "ce_loss_26": 1.9569555580615998, "ce_loss_39": 1.7707047134637832, "ce_loss_52": 1.4560858264565468, "ce_loss_7": 2.8063796043395994, "epoch": 0.696, "grad_norm": 14.305917479002375, "kl_loss_13": 2104.0, "kl_loss_26": 1003.9, "kl_loss_39": 621.6, "kl_loss_7": 2791.2, "learning_rate": 0.0002151646007138806, "loss": 3346.15, "step": 6960 }, { "ce_loss_13": 2.4904795557260515, "ce_loss_26": 1.9593748480081559, "ce_loss_39": 1.7725317537784577, "ce_loss_52": 1.4421725705266, "ce_loss_7": 2.82717769742012, "epoch": 0.697, "grad_norm": 14.55516871467818, "kl_loss_13": 2171.2, "kl_loss_26": 1046.7, "kl_loss_39": 655.3, "kl_loss_7": 2878.4, "learning_rate": 0.00021386200164845526, "loss": 3321.35, "step": 6970 }, { "ce_loss_13": 2.4793561339378356, "ce_loss_26": 1.9439466089010238, "ce_loss_39": 1.7569547444581985, "ce_loss_52": 1.4289204239845277, "ce_loss_7": 2.807218599319458, "epoch": 0.698, "grad_norm": 13.778785355416604, "kl_loss_13": 2152.0, "kl_loss_26": 1037.7, "kl_loss_39": 642.8, "kl_loss_7": 2848.0, "learning_rate": 0.0002125622839894964, "loss": 3315.5, "step": 6980 }, { "ce_loss_13": 2.585531139373779, "ce_loss_26": 2.031143417954445, "ce_loss_39": 1.8274976074695588, "ce_loss_52": 1.4753503799438477, "ce_loss_7": 2.921118849515915, "epoch": 0.699, "grad_norm": 14.664216613597723, "kl_loss_13": 2260.4, "kl_loss_26": 1103.8, "kl_loss_39": 685.9, "kl_loss_7": 2968.0, "learning_rate": 0.00021126546082514663, "loss": 3365.55, "step": 6990 }, { "ce_loss_13": 2.478432095050812, "ce_loss_26": 1.950699546933174, "ce_loss_39": 1.7660550504922867, "ce_loss_52": 1.4418641477823257, "ce_loss_7": 2.802116149663925, "epoch": 0.7, "grad_norm": 14.414270184643762, "kl_loss_13": 2128.4, "kl_loss_26": 1022.4, "kl_loss_39": 639.9, "kl_loss_7": 2808.8, "learning_rate": 0.00020997154521440098, "loss": 3312.0, "step": 7000 }, { "ce_loss_13": 2.4484674006700518, "ce_loss_26": 1.9334596753120423, "ce_loss_39": 1.754175427556038, "ce_loss_52": 1.4356836065649987, "ce_loss_7": 2.7715867519378663, "epoch": 0.701, "grad_norm": 15.009606673101777, "kl_loss_13": 2096.6, "kl_loss_26": 1005.6, "kl_loss_39": 630.9, "kl_loss_7": 2776.0, "learning_rate": 0.0002086805501869749, "loss": 3296.9, "step": 7010 }, { "ce_loss_13": 2.500411355495453, "ce_loss_26": 1.9781237423419953, "ce_loss_39": 1.7995324105024337, "ce_loss_52": 1.473289003968239, "ce_loss_7": 2.8238317251205443, "epoch": 0.702, "grad_norm": 14.811490845229788, "kl_loss_13": 2108.4, "kl_loss_26": 1018.1, "kl_loss_39": 642.65, "kl_loss_7": 2786.4, "learning_rate": 0.0002073924887431744, "loss": 3301.85, "step": 7020 }, { "ce_loss_13": 2.4331007301807404, "ce_loss_26": 1.925421604514122, "ce_loss_39": 1.740053552389145, "ce_loss_52": 1.4260162442922593, "ce_loss_7": 2.750682008266449, "epoch": 0.703, "grad_norm": 14.093530596736626, "kl_loss_13": 2071.0, "kl_loss_26": 996.4, "kl_loss_39": 613.7, "kl_loss_7": 2741.2, "learning_rate": 0.00020610737385376348, "loss": 3303.45, "step": 7030 }, { "ce_loss_13": 2.438492274284363, "ce_loss_26": 1.917963182926178, "ce_loss_39": 1.731092056632042, "ce_loss_52": 1.4118753850460053, "ce_loss_7": 2.7640158772468566, "epoch": 0.704, "grad_norm": 14.575552477174394, "kl_loss_13": 2113.4, "kl_loss_26": 1019.3, "kl_loss_39": 632.85, "kl_loss_7": 2804.0, "learning_rate": 0.00020482521845983521, "loss": 3301.55, "step": 7040 }, { "ce_loss_13": 2.4910335719585417, "ce_loss_26": 1.9715039610862732, "ce_loss_39": 1.7800425946712495, "ce_loss_52": 1.4519392430782319, "ce_loss_7": 2.819239354133606, "epoch": 0.705, "grad_norm": 14.820051388586238, "kl_loss_13": 2136.2, "kl_loss_26": 1040.9, "kl_loss_39": 651.75, "kl_loss_7": 2824.0, "learning_rate": 0.00020354603547267987, "loss": 3316.6, "step": 7050 }, { "ce_loss_13": 2.4318030804395674, "ce_loss_26": 1.910761234164238, "ce_loss_39": 1.7284663796424866, "ce_loss_52": 1.4129984229803085, "ce_loss_7": 2.7625936210155486, "epoch": 0.706, "grad_norm": 14.59547057670379, "kl_loss_13": 2113.8, "kl_loss_26": 1004.1, "kl_loss_39": 622.2, "kl_loss_7": 2816.4, "learning_rate": 0.00020226983777365604, "loss": 3284.95, "step": 7060 }, { "ce_loss_13": 2.4749036192893983, "ce_loss_26": 1.9450800210237502, "ce_loss_39": 1.761537629365921, "ce_loss_52": 1.4373356252908707, "ce_loss_7": 2.809742730855942, "epoch": 0.707, "grad_norm": 14.651682120589488, "kl_loss_13": 2148.8, "kl_loss_26": 1038.1, "kl_loss_39": 647.05, "kl_loss_7": 2860.4, "learning_rate": 0.00020099663821406056, "loss": 3330.65, "step": 7070 }, { "ce_loss_13": 2.500520494580269, "ce_loss_26": 1.9711394160985947, "ce_loss_39": 1.7850598603487016, "ce_loss_52": 1.4572303384542464, "ce_loss_7": 2.822962099313736, "epoch": 0.708, "grad_norm": 14.695952402949361, "kl_loss_13": 2140.6, "kl_loss_26": 1032.0, "kl_loss_39": 642.6, "kl_loss_7": 2828.8, "learning_rate": 0.00019972644961499853, "loss": 3310.1, "step": 7080 }, { "ce_loss_13": 2.4471381455659866, "ce_loss_26": 1.9142519533634186, "ce_loss_39": 1.7273518294095993, "ce_loss_52": 1.4090154066681861, "ce_loss_7": 2.775345432758331, "epoch": 0.709, "grad_norm": 14.907893630817398, "kl_loss_13": 2145.0, "kl_loss_26": 1028.7, "kl_loss_39": 636.3, "kl_loss_7": 2839.2, "learning_rate": 0.00019845928476725522, "loss": 3284.4, "step": 7090 }, { "ce_loss_13": 2.484838107228279, "ce_loss_26": 1.9752304345369338, "ce_loss_39": 1.7935984045267106, "ce_loss_52": 1.4694376409053802, "ce_loss_7": 2.8051605463027953, "epoch": 0.71, "grad_norm": 14.813324344167642, "kl_loss_13": 2100.4, "kl_loss_26": 1009.5, "kl_loss_39": 629.85, "kl_loss_7": 2784.0, "learning_rate": 0.00019719515643116677, "loss": 3271.1, "step": 7100 }, { "ce_loss_13": 2.449986720085144, "ce_loss_26": 1.915557289123535, "ce_loss_39": 1.7244810461997986, "ce_loss_52": 1.4009377419948579, "ce_loss_7": 2.7815246999263765, "epoch": 0.711, "grad_norm": 14.72405028446814, "kl_loss_13": 2128.8, "kl_loss_26": 1018.3, "kl_loss_39": 632.1, "kl_loss_7": 2824.4, "learning_rate": 0.0001959340773364911, "loss": 3301.5, "step": 7110 }, { "ce_loss_13": 2.4507795870304108, "ce_loss_26": 1.9264422208070755, "ce_loss_39": 1.7369968056678773, "ce_loss_52": 1.4171953916549682, "ce_loss_7": 2.778002160787582, "epoch": 0.712, "grad_norm": 15.123641060610607, "kl_loss_13": 2145.2, "kl_loss_26": 1034.6, "kl_loss_39": 640.6, "kl_loss_7": 2832.0, "learning_rate": 0.0001946760601822809, "loss": 3307.65, "step": 7120 }, { "ce_loss_13": 2.4649185329675674, "ce_loss_26": 1.9448591649532319, "ce_loss_39": 1.7617656499147416, "ce_loss_52": 1.4421708196401597, "ce_loss_7": 2.7942136943340303, "epoch": 0.713, "grad_norm": 13.86141587264665, "kl_loss_13": 2099.6, "kl_loss_26": 996.3, "kl_loss_39": 613.9, "kl_loss_7": 2784.0, "learning_rate": 0.00019342111763675512, "loss": 3264.15, "step": 7130 }, { "ce_loss_13": 2.431650939583778, "ce_loss_26": 1.8971330910921096, "ce_loss_39": 1.7134798288345336, "ce_loss_52": 1.3959884241223335, "ce_loss_7": 2.7688992261886596, "epoch": 0.714, "grad_norm": 14.868179084191116, "kl_loss_13": 2103.6, "kl_loss_26": 997.3, "kl_loss_39": 614.55, "kl_loss_7": 2798.8, "learning_rate": 0.00019216926233717085, "loss": 3302.05, "step": 7140 }, { "ce_loss_13": 2.4574439406394957, "ce_loss_26": 1.9289735972881317, "ce_loss_39": 1.738691231608391, "ce_loss_52": 1.4203062415122987, "ce_loss_7": 2.7880250751972198, "epoch": 0.715, "grad_norm": 14.757879306344181, "kl_loss_13": 2133.4, "kl_loss_26": 1021.4, "kl_loss_39": 631.75, "kl_loss_7": 2823.2, "learning_rate": 0.00019092050688969737, "loss": 3296.5, "step": 7150 }, { "ce_loss_13": 2.4601316511631013, "ce_loss_26": 1.9434845715761184, "ce_loss_39": 1.7585901826620103, "ce_loss_52": 1.4390017569065094, "ce_loss_7": 2.7778116285800936, "epoch": 0.716, "grad_norm": 13.991843131427743, "kl_loss_13": 2085.4, "kl_loss_26": 1007.0, "kl_loss_39": 627.45, "kl_loss_7": 2755.6, "learning_rate": 0.00018967486386928817, "loss": 3286.15, "step": 7160 }, { "ce_loss_13": 2.451919847726822, "ce_loss_26": 1.9279222816228867, "ce_loss_39": 1.7440420866012574, "ce_loss_52": 1.4374898225069046, "ce_loss_7": 2.784500467777252, "epoch": 0.717, "grad_norm": 14.5708304909804, "kl_loss_13": 2095.4, "kl_loss_26": 992.5, "kl_loss_39": 611.75, "kl_loss_7": 2794.0, "learning_rate": 0.00018843234581955443, "loss": 3292.25, "step": 7170 }, { "ce_loss_13": 2.4709593683481215, "ce_loss_26": 1.9460813373327255, "ce_loss_39": 1.7575767368078232, "ce_loss_52": 1.4326383203268052, "ce_loss_7": 2.7951516568660737, "epoch": 0.718, "grad_norm": 14.981137787748375, "kl_loss_13": 2117.8, "kl_loss_26": 1019.7, "kl_loss_39": 633.35, "kl_loss_7": 2803.2, "learning_rate": 0.00018719296525263924, "loss": 3299.6, "step": 7180 }, { "ce_loss_13": 2.4041130542755127, "ce_loss_26": 1.8861528187990189, "ce_loss_39": 1.7035977393388748, "ce_loss_52": 1.4066498517990111, "ce_loss_7": 2.735060691833496, "epoch": 0.719, "grad_norm": 14.986994612654895, "kl_loss_13": 2054.2, "kl_loss_26": 961.6, "kl_loss_39": 585.6, "kl_loss_7": 2750.4, "learning_rate": 0.0001859567346490913, "loss": 3264.25, "step": 7190 }, { "ce_loss_13": 2.521838116645813, "ce_loss_26": 2.004297485947609, "ce_loss_39": 1.810739102959633, "ce_loss_52": 1.4783420652151107, "ce_loss_7": 2.849927377700806, "epoch": 0.72, "grad_norm": 14.181310648182276, "kl_loss_13": 2154.6, "kl_loss_26": 1052.9, "kl_loss_39": 657.35, "kl_loss_7": 2848.4, "learning_rate": 0.0001847236664577389, "loss": 3278.0, "step": 7200 }, { "ce_loss_13": 2.40320103764534, "ce_loss_26": 1.8900187402963637, "ce_loss_39": 1.7100308045744896, "ce_loss_52": 1.3999869018793105, "ce_loss_7": 2.736201885342598, "epoch": 0.721, "grad_norm": 14.793709205482683, "kl_loss_13": 2080.8, "kl_loss_26": 990.7, "kl_loss_39": 610.95, "kl_loss_7": 2780.0, "learning_rate": 0.00018349377309556487, "loss": 3283.25, "step": 7210 }, { "ce_loss_13": 2.441250967979431, "ce_loss_26": 1.9231963992118835, "ce_loss_39": 1.7388009175658226, "ce_loss_52": 1.4282974660396577, "ce_loss_7": 2.7717535465955736, "epoch": 0.722, "grad_norm": 15.59238941344996, "kl_loss_13": 2104.0, "kl_loss_26": 997.6, "kl_loss_39": 616.85, "kl_loss_7": 2801.2, "learning_rate": 0.00018226706694758193, "loss": 3263.75, "step": 7220 }, { "ce_loss_13": 2.495049071311951, "ce_loss_26": 1.973162716627121, "ce_loss_39": 1.7896722644567489, "ce_loss_52": 1.4697123229503632, "ce_loss_7": 2.8224462032318116, "epoch": 0.723, "grad_norm": 13.997878236797012, "kl_loss_13": 2123.2, "kl_loss_26": 1009.4, "kl_loss_39": 628.15, "kl_loss_7": 2823.2, "learning_rate": 0.0001810435603667075, "loss": 3267.75, "step": 7230 }, { "ce_loss_13": 2.4492597192525865, "ce_loss_26": 1.9282636791467667, "ce_loss_39": 1.7473824605345727, "ce_loss_52": 1.4372958570718766, "ce_loss_7": 2.782766741514206, "epoch": 0.724, "grad_norm": 14.73683414882718, "kl_loss_13": 2088.6, "kl_loss_26": 990.1, "kl_loss_39": 613.75, "kl_loss_7": 2799.6, "learning_rate": 0.0001798232656736389, "loss": 3246.35, "step": 7240 }, { "ce_loss_13": 2.514770272374153, "ce_loss_26": 1.9697564780712127, "ce_loss_39": 1.7806446701288223, "ce_loss_52": 1.456220605969429, "ce_loss_7": 2.8567338407039644, "epoch": 0.725, "grad_norm": 14.87142240672514, "kl_loss_13": 2179.8, "kl_loss_26": 1037.2, "kl_loss_39": 644.4, "kl_loss_7": 2889.2, "learning_rate": 0.0001786061951567303, "loss": 3273.6, "step": 7250 }, { "ce_loss_13": 2.4067626029253004, "ce_loss_26": 1.8963259696960448, "ce_loss_39": 1.7143886119127274, "ce_loss_52": 1.4022117048501967, "ce_loss_7": 2.7348236978054046, "epoch": 0.726, "grad_norm": 14.076850795507209, "kl_loss_13": 2076.6, "kl_loss_26": 994.5, "kl_loss_39": 623.5, "kl_loss_7": 2766.8, "learning_rate": 0.00017739236107186857, "loss": 3281.2, "step": 7260 }, { "ce_loss_13": 2.4501163721084596, "ce_loss_26": 1.926158633828163, "ce_loss_39": 1.7434884279966354, "ce_loss_52": 1.4286866545677186, "ce_loss_7": 2.777343970537186, "epoch": 0.727, "grad_norm": 13.813498461115062, "kl_loss_13": 2114.6, "kl_loss_26": 1012.1, "kl_loss_39": 626.6, "kl_loss_7": 2806.0, "learning_rate": 0.00017618177564234904, "loss": 3264.1, "step": 7270 }, { "ce_loss_13": 2.4412575274705888, "ce_loss_26": 1.9107532769441604, "ce_loss_39": 1.7259235098958015, "ce_loss_52": 1.4076966106891633, "ce_loss_7": 2.774235662817955, "epoch": 0.728, "grad_norm": 14.801740311988109, "kl_loss_13": 2113.4, "kl_loss_26": 1003.6, "kl_loss_39": 619.55, "kl_loss_7": 2807.4, "learning_rate": 0.00017497445105875377, "loss": 3298.7, "step": 7280 }, { "ce_loss_13": 2.445681685209274, "ce_loss_26": 1.9345449537038804, "ce_loss_39": 1.7532619833946228, "ce_loss_52": 1.442472691833973, "ce_loss_7": 2.764800661802292, "epoch": 0.729, "grad_norm": 14.618830047757731, "kl_loss_13": 2063.8, "kl_loss_26": 980.7, "kl_loss_39": 603.55, "kl_loss_7": 2742.0, "learning_rate": 0.000173770399478828, "loss": 3226.7, "step": 7290 }, { "ce_loss_13": 2.4301975846290587, "ce_loss_26": 1.9138565450906753, "ce_loss_39": 1.736141037940979, "ce_loss_52": 1.427780945599079, "ce_loss_7": 2.760072636604309, "epoch": 0.73, "grad_norm": 14.242974774472335, "kl_loss_13": 2095.4, "kl_loss_26": 990.5, "kl_loss_39": 610.6, "kl_loss_7": 2791.2, "learning_rate": 0.0001725696330273575, "loss": 3260.65, "step": 7300 }, { "ce_loss_13": 2.4727762907743456, "ce_loss_26": 1.9557204306125642, "ce_loss_39": 1.766261911392212, "ce_loss_52": 1.4398792043328286, "ce_loss_7": 2.79817710518837, "epoch": 0.731, "grad_norm": 14.566153451338561, "kl_loss_13": 2113.6, "kl_loss_26": 1018.5, "kl_loss_39": 628.8, "kl_loss_7": 2796.8, "learning_rate": 0.00017137216379604724, "loss": 3240.75, "step": 7310 }, { "ce_loss_13": 2.490224635601044, "ce_loss_26": 1.954663872718811, "ce_loss_39": 1.7594738394021987, "ce_loss_52": 1.4360380351543427, "ce_loss_7": 2.8263917326927186, "epoch": 0.732, "grad_norm": 13.205540898906253, "kl_loss_13": 2161.8, "kl_loss_26": 1044.6, "kl_loss_39": 637.45, "kl_loss_7": 2862.4, "learning_rate": 0.00017017800384339925, "loss": 3258.4, "step": 7320 }, { "ce_loss_13": 2.4344683617353438, "ce_loss_26": 1.9195960253477096, "ce_loss_39": 1.7325531929731368, "ce_loss_52": 1.419457183778286, "ce_loss_7": 2.7598242580890657, "epoch": 0.733, "grad_norm": 14.107781745249417, "kl_loss_13": 2087.4, "kl_loss_26": 1001.6, "kl_loss_39": 618.5, "kl_loss_7": 2774.8, "learning_rate": 0.00016898716519459073, "loss": 3316.4, "step": 7330 }, { "ce_loss_13": 2.4717041492462157, "ce_loss_26": 1.9320402562618255, "ce_loss_39": 1.730445721745491, "ce_loss_52": 1.3991459339857102, "ce_loss_7": 2.811204159259796, "epoch": 0.734, "grad_norm": 14.159198716486541, "kl_loss_13": 2200.8, "kl_loss_26": 1069.2, "kl_loss_39": 657.15, "kl_loss_7": 2902.8, "learning_rate": 0.00016779965984135375, "loss": 3266.3, "step": 7340 }, { "ce_loss_13": 2.4648273169994352, "ce_loss_26": 1.9446689933538437, "ce_loss_39": 1.7665399879217147, "ce_loss_52": 1.4552808463573457, "ce_loss_7": 2.7818954586982727, "epoch": 0.735, "grad_norm": 13.974138676843918, "kl_loss_13": 2093.0, "kl_loss_26": 999.2, "kl_loss_39": 622.1, "kl_loss_7": 2762.4, "learning_rate": 0.00016661549974185424, "loss": 3232.6, "step": 7350 }, { "ce_loss_13": 2.497272843122482, "ce_loss_26": 1.9733565777540207, "ce_loss_39": 1.7902508676052094, "ce_loss_52": 1.4602745115756988, "ce_loss_7": 2.8227945923805238, "epoch": 0.736, "grad_norm": 15.105414614283358, "kl_loss_13": 2153.6, "kl_loss_26": 1043.2, "kl_loss_39": 652.45, "kl_loss_7": 2843.6, "learning_rate": 0.00016543469682057105, "loss": 3314.1, "step": 7360 }, { "ce_loss_13": 2.4817920327186584, "ce_loss_26": 1.9674188673496247, "ce_loss_39": 1.788041964173317, "ce_loss_52": 1.4778558552265166, "ce_loss_7": 2.801541256904602, "epoch": 0.737, "grad_norm": 14.089468466172162, "kl_loss_13": 2075.4, "kl_loss_26": 985.2, "kl_loss_39": 606.6, "kl_loss_7": 2750.8, "learning_rate": 0.00016425726296817632, "loss": 3279.5, "step": 7370 }, { "ce_loss_13": 2.4628233551979064, "ce_loss_26": 1.944148001074791, "ce_loss_39": 1.7584406644105912, "ce_loss_52": 1.440820676088333, "ce_loss_7": 2.7982202231884004, "epoch": 0.738, "grad_norm": 14.250790129395915, "kl_loss_13": 2096.0, "kl_loss_26": 994.4, "kl_loss_39": 612.4, "kl_loss_7": 2800.4, "learning_rate": 0.00016308321004141607, "loss": 3270.5, "step": 7380 }, { "ce_loss_13": 2.4311512380838396, "ce_loss_26": 1.910204255580902, "ce_loss_39": 1.7292486786842347, "ce_loss_52": 1.4260056450963021, "ce_loss_7": 2.7644225537776945, "epoch": 0.739, "grad_norm": 14.26013452282849, "kl_loss_13": 2064.2, "kl_loss_26": 971.8, "kl_loss_39": 596.1, "kl_loss_7": 2766.0, "learning_rate": 0.00016191254986299043, "loss": 3267.55, "step": 7390 }, { "ce_loss_13": 2.3748191058635713, "ce_loss_26": 1.8720220893621444, "ce_loss_39": 1.695397737622261, "ce_loss_52": 1.3954048216342927, "ce_loss_7": 2.6974743723869326, "epoch": 0.74, "grad_norm": 14.042172223471859, "kl_loss_13": 2036.0, "kl_loss_26": 961.8, "kl_loss_39": 589.15, "kl_loss_7": 2719.6, "learning_rate": 0.00016074529422143398, "loss": 3237.3, "step": 7400 }, { "ce_loss_13": 2.504778665304184, "ce_loss_26": 1.9736295342445374, "ce_loss_39": 1.778898686170578, "ce_loss_52": 1.458841660618782, "ce_loss_7": 2.830203241109848, "epoch": 0.741, "grad_norm": 14.817704298873846, "kl_loss_13": 2137.6, "kl_loss_26": 1026.7, "kl_loss_39": 628.9, "kl_loss_7": 2830.4, "learning_rate": 0.0001595814548709983, "loss": 3256.85, "step": 7410 }, { "ce_loss_13": 2.457485032081604, "ce_loss_26": 1.955030158162117, "ce_loss_39": 1.7744358479976654, "ce_loss_52": 1.4638055652379989, "ce_loss_7": 2.7708797633647917, "epoch": 0.742, "grad_norm": 13.847929994452544, "kl_loss_13": 2053.2, "kl_loss_26": 989.3, "kl_loss_39": 610.65, "kl_loss_7": 2726.4, "learning_rate": 0.00015842104353153285, "loss": 3240.25, "step": 7420 }, { "ce_loss_13": 2.5232761919498445, "ce_loss_26": 1.9848549604415893, "ce_loss_39": 1.7907847046852112, "ce_loss_52": 1.473637193441391, "ce_loss_7": 2.8558732092380525, "epoch": 0.743, "grad_norm": 14.575648272616709, "kl_loss_13": 2149.6, "kl_loss_26": 1018.4, "kl_loss_39": 623.4, "kl_loss_7": 2838.8, "learning_rate": 0.0001572640718883667, "loss": 3254.8, "step": 7430 }, { "ce_loss_13": 2.4647684305906297, "ce_loss_26": 1.9386949807405471, "ce_loss_39": 1.7500061064958572, "ce_loss_52": 1.4324503019452095, "ce_loss_7": 2.7884095788002012, "epoch": 0.744, "grad_norm": 14.394764150644365, "kl_loss_13": 2108.4, "kl_loss_26": 1000.0, "kl_loss_39": 616.55, "kl_loss_7": 2791.2, "learning_rate": 0.0001561105515921915, "loss": 3224.3, "step": 7440 }, { "ce_loss_13": 2.441458174586296, "ce_loss_26": 1.9374703764915466, "ce_loss_39": 1.7534300208091735, "ce_loss_52": 1.4378804206848144, "ce_loss_7": 2.7636309385299684, "epoch": 0.745, "grad_norm": 14.678295349282738, "kl_loss_13": 2068.2, "kl_loss_26": 997.1, "kl_loss_39": 620.85, "kl_loss_7": 2745.6, "learning_rate": 0.0001549604942589441, "loss": 3227.25, "step": 7450 }, { "ce_loss_13": 2.4308183819055555, "ce_loss_26": 1.9100747764110566, "ce_loss_39": 1.7246440201997757, "ce_loss_52": 1.409556159377098, "ce_loss_7": 2.7695399791002275, "epoch": 0.746, "grad_norm": 14.694656979242655, "kl_loss_13": 2094.8, "kl_loss_26": 993.0, "kl_loss_39": 612.55, "kl_loss_7": 2804.8, "learning_rate": 0.00015381391146968864, "loss": 3249.4, "step": 7460 }, { "ce_loss_13": 2.462578612565994, "ce_loss_26": 1.940753996372223, "ce_loss_39": 1.7554692894220352, "ce_loss_52": 1.441029006242752, "ce_loss_7": 2.785427051782608, "epoch": 0.747, "grad_norm": 14.412450315252437, "kl_loss_13": 2103.6, "kl_loss_26": 1003.3, "kl_loss_39": 620.4, "kl_loss_7": 2793.2, "learning_rate": 0.00015267081477050133, "loss": 3242.1, "step": 7470 }, { "ce_loss_13": 2.436378574371338, "ce_loss_26": 1.9284409761428833, "ce_loss_39": 1.7525635540485383, "ce_loss_52": 1.443886636197567, "ce_loss_7": 2.762845513224602, "epoch": 0.748, "grad_norm": 14.082696745240801, "kl_loss_13": 2056.6, "kl_loss_26": 983.1, "kl_loss_39": 607.7, "kl_loss_7": 2742.6, "learning_rate": 0.00015153121567235335, "loss": 3260.75, "step": 7480 }, { "ce_loss_13": 2.4219354510307314, "ce_loss_26": 1.9070833683013917, "ce_loss_39": 1.727812445163727, "ce_loss_52": 1.4214952304959296, "ce_loss_7": 2.751905006170273, "epoch": 0.749, "grad_norm": 14.604071674011259, "kl_loss_13": 2073.4, "kl_loss_26": 983.2, "kl_loss_39": 609.55, "kl_loss_7": 2766.8, "learning_rate": 0.00015039512565099468, "loss": 3240.15, "step": 7490 }, { "ce_loss_13": 2.4254602432250976, "ce_loss_26": 1.9135964632034301, "ce_loss_39": 1.7295757800340652, "ce_loss_52": 1.423691214621067, "ce_loss_7": 2.7512109965085982, "epoch": 0.75, "grad_norm": 13.87053452241645, "kl_loss_13": 2059.4, "kl_loss_26": 978.6, "kl_loss_39": 597.45, "kl_loss_7": 2750.4, "learning_rate": 0.00014926255614683932, "loss": 3260.75, "step": 7500 }, { "ce_loss_13": 2.44639810025692, "ce_loss_26": 1.93405482172966, "ce_loss_39": 1.7547091454267503, "ce_loss_52": 1.4416350960731505, "ce_loss_7": 2.768189311027527, "epoch": 0.751, "grad_norm": 14.071002297078877, "kl_loss_13": 2088.8, "kl_loss_26": 995.6, "kl_loss_39": 616.7, "kl_loss_7": 2768.8, "learning_rate": 0.0001481335185648498, "loss": 3269.45, "step": 7510 }, { "ce_loss_13": 2.496452784538269, "ce_loss_26": 1.9704186409711837, "ce_loss_39": 1.784249845147133, "ce_loss_52": 1.4717927530407906, "ce_loss_7": 2.8235138654708862, "epoch": 0.752, "grad_norm": 14.017187066675143, "kl_loss_13": 2081.6, "kl_loss_26": 989.5, "kl_loss_39": 608.0, "kl_loss_7": 2766.4, "learning_rate": 0.0001470080242744218, "loss": 3222.85, "step": 7520 }, { "ce_loss_13": 2.4962650299072267, "ce_loss_26": 1.98213948905468, "ce_loss_39": 1.7954594939947128, "ce_loss_52": 1.480476987361908, "ce_loss_7": 2.827346932888031, "epoch": 0.753, "grad_norm": 14.186670012527646, "kl_loss_13": 2094.8, "kl_loss_26": 998.5, "kl_loss_39": 611.15, "kl_loss_7": 2785.6, "learning_rate": 0.0001458860846092705, "loss": 3232.0, "step": 7530 }, { "ce_loss_13": 2.4669371783733367, "ce_loss_26": 1.9375512719154357, "ce_loss_39": 1.7506729423999787, "ce_loss_52": 1.4334673672914504, "ce_loss_7": 2.7915061593055723, "epoch": 0.754, "grad_norm": 14.32315966365105, "kl_loss_13": 2109.0, "kl_loss_26": 996.8, "kl_loss_39": 613.7, "kl_loss_7": 2794.0, "learning_rate": 0.00014476771086731566, "loss": 3264.6, "step": 7540 }, { "ce_loss_13": 2.4759989261627195, "ce_loss_26": 1.9555140793323518, "ce_loss_39": 1.7739178657531738, "ce_loss_52": 1.4679069191217422, "ce_loss_7": 2.795573103427887, "epoch": 0.755, "grad_norm": 14.011488201466985, "kl_loss_13": 2070.6, "kl_loss_26": 974.7, "kl_loss_39": 593.9, "kl_loss_7": 2754.8, "learning_rate": 0.00014365291431056872, "loss": 3256.8, "step": 7550 }, { "ce_loss_13": 2.424694412946701, "ce_loss_26": 1.903841146826744, "ce_loss_39": 1.7249469131231308, "ce_loss_52": 1.4185307189822196, "ce_loss_7": 2.7595690310001375, "epoch": 0.756, "grad_norm": 14.79387989637837, "kl_loss_13": 2093.2, "kl_loss_26": 989.9, "kl_loss_39": 610.5, "kl_loss_7": 2791.6, "learning_rate": 0.00014254170616501827, "loss": 3235.5, "step": 7560 }, { "ce_loss_13": 2.4660239934921266, "ce_loss_26": 1.9451529324054717, "ce_loss_39": 1.7564547389745713, "ce_loss_52": 1.4434847444295884, "ce_loss_7": 2.7976350009441378, "epoch": 0.757, "grad_norm": 14.844517268447264, "kl_loss_13": 2085.6, "kl_loss_26": 981.2, "kl_loss_39": 602.45, "kl_loss_7": 2776.4, "learning_rate": 0.0001414340976205183, "loss": 3204.2, "step": 7570 }, { "ce_loss_13": 2.4295350134372713, "ce_loss_26": 1.921643227338791, "ce_loss_39": 1.738620987534523, "ce_loss_52": 1.4355527609586716, "ce_loss_7": 2.7507594347000124, "epoch": 0.758, "grad_norm": 14.398235424743639, "kl_loss_13": 2045.8, "kl_loss_26": 965.6, "kl_loss_39": 594.6, "kl_loss_7": 2732.4, "learning_rate": 0.00014033009983067452, "loss": 3240.7, "step": 7580 }, { "ce_loss_13": 2.4676227152347563, "ce_loss_26": 1.9401549130678177, "ce_loss_39": 1.7562287330627442, "ce_loss_52": 1.4347741633653641, "ce_loss_7": 2.8093821585178373, "epoch": 0.759, "grad_norm": 13.736433284616705, "kl_loss_13": 2138.8, "kl_loss_26": 1028.1, "kl_loss_39": 641.7, "kl_loss_7": 2851.2, "learning_rate": 0.00013922972391273224, "loss": 3240.15, "step": 7590 }, { "ce_loss_13": 2.491154599189758, "ce_loss_26": 1.9668046951293945, "ce_loss_39": 1.7748177736997603, "ce_loss_52": 1.4488209426403045, "ce_loss_7": 2.8212892413139343, "epoch": 0.76, "grad_norm": 14.65229593288616, "kl_loss_13": 2140.6, "kl_loss_26": 1032.9, "kl_loss_39": 637.7, "kl_loss_7": 2829.6, "learning_rate": 0.0001381329809474649, "loss": 3239.9, "step": 7600 }, { "ce_loss_13": 2.3942853659391403, "ce_loss_26": 1.892151090502739, "ce_loss_39": 1.7125076562166215, "ce_loss_52": 1.4143452048301697, "ce_loss_7": 2.720611757040024, "epoch": 0.761, "grad_norm": 13.295903354405345, "kl_loss_13": 2008.0, "kl_loss_26": 952.3, "kl_loss_39": 583.5, "kl_loss_7": 2686.4, "learning_rate": 0.0001370398819790621, "loss": 3228.6, "step": 7610 }, { "ce_loss_13": 2.48261901140213, "ce_loss_26": 1.966281446814537, "ce_loss_39": 1.7803379833698272, "ce_loss_52": 1.4673886984586715, "ce_loss_7": 2.805577594041824, "epoch": 0.762, "grad_norm": 14.322747311567188, "kl_loss_13": 2093.4, "kl_loss_26": 1000.3, "kl_loss_39": 610.3, "kl_loss_7": 2776.0, "learning_rate": 0.00013595043801501794, "loss": 3201.5, "step": 7620 }, { "ce_loss_13": 2.443099784851074, "ce_loss_26": 1.9284921824932098, "ce_loss_39": 1.7422718316316606, "ce_loss_52": 1.435012650489807, "ce_loss_7": 2.7720457434654238, "epoch": 0.763, "grad_norm": 14.405471822802745, "kl_loss_13": 2082.6, "kl_loss_26": 994.1, "kl_loss_39": 608.55, "kl_loss_7": 2773.2, "learning_rate": 0.00013486466002602133, "loss": 3225.725, "step": 7630 }, { "ce_loss_13": 2.37467542886734, "ce_loss_26": 1.8506677508354188, "ce_loss_39": 1.6710956811904907, "ce_loss_52": 1.3840662211179733, "ce_loss_7": 2.7066759169101715, "epoch": 0.764, "grad_norm": 13.948958944433121, "kl_loss_13": 2038.6, "kl_loss_26": 946.7, "kl_loss_39": 574.05, "kl_loss_7": 2722.4, "learning_rate": 0.00013378255894584462, "loss": 3167.8, "step": 7640 }, { "ce_loss_13": 2.446861132979393, "ce_loss_26": 1.934667894244194, "ce_loss_39": 1.7494839936494828, "ce_loss_52": 1.4413674265146255, "ce_loss_7": 2.7717737197875976, "epoch": 0.765, "grad_norm": 14.489695554621445, "kl_loss_13": 2087.6, "kl_loss_26": 996.2, "kl_loss_39": 612.05, "kl_loss_7": 2779.4, "learning_rate": 0.0001327041456712334, "loss": 3229.05, "step": 7650 }, { "ce_loss_13": 2.514678430557251, "ce_loss_26": 1.9890475004911423, "ce_loss_39": 1.8049181282520295, "ce_loss_52": 1.4897184193134307, "ce_loss_7": 2.8373226463794707, "epoch": 0.766, "grad_norm": 13.809410965696319, "kl_loss_13": 2109.8, "kl_loss_26": 1013.7, "kl_loss_39": 623.8, "kl_loss_7": 2784.8, "learning_rate": 0.00013162943106179747, "loss": 3248.2, "step": 7660 }, { "ce_loss_13": 2.4804063200950623, "ce_loss_26": 1.9478756994009019, "ce_loss_39": 1.765100008249283, "ce_loss_52": 1.446313591301441, "ce_loss_7": 2.814295369386673, "epoch": 0.767, "grad_norm": 14.599429508154355, "kl_loss_13": 2147.0, "kl_loss_26": 1023.2, "kl_loss_39": 636.45, "kl_loss_7": 2854.0, "learning_rate": 0.00013055842593990132, "loss": 3217.4, "step": 7670 }, { "ce_loss_13": 2.4887916058301927, "ce_loss_26": 1.9737455695867538, "ce_loss_39": 1.7851827770471573, "ce_loss_52": 1.4613285958766937, "ce_loss_7": 2.8164610981941225, "epoch": 0.768, "grad_norm": 14.229376114254006, "kl_loss_13": 2127.6, "kl_loss_26": 1032.8, "kl_loss_39": 640.5, "kl_loss_7": 2811.2, "learning_rate": 0.00012949114109055414, "loss": 3223.675, "step": 7680 }, { "ce_loss_13": 2.389929732680321, "ce_loss_26": 1.8889827966690063, "ce_loss_39": 1.7090455144643784, "ce_loss_52": 1.4120649307966233, "ce_loss_7": 2.710143965482712, "epoch": 0.769, "grad_norm": 13.823270022584358, "kl_loss_13": 2025.6, "kl_loss_26": 958.6, "kl_loss_39": 589.95, "kl_loss_7": 2700.0, "learning_rate": 0.00012842758726130281, "loss": 3247.75, "step": 7690 }, { "ce_loss_13": 2.444611003994942, "ce_loss_26": 1.9318826824426651, "ce_loss_39": 1.7527276873588562, "ce_loss_52": 1.4516576603055, "ce_loss_7": 2.7657779157161713, "epoch": 0.77, "grad_norm": 14.431273089148151, "kl_loss_13": 2049.2, "kl_loss_26": 963.2, "kl_loss_39": 589.25, "kl_loss_7": 2726.0, "learning_rate": 0.00012736777516212267, "loss": 3216.75, "step": 7700 }, { "ce_loss_13": 2.441952568292618, "ce_loss_26": 1.9228222370147705, "ce_loss_39": 1.7338123947381974, "ce_loss_52": 1.4193892806768418, "ce_loss_7": 2.7718379318714144, "epoch": 0.771, "grad_norm": 13.661830756949985, "kl_loss_13": 2115.2, "kl_loss_26": 1008.4, "kl_loss_39": 620.0, "kl_loss_7": 2804.8, "learning_rate": 0.00012631171546530968, "loss": 3199.55, "step": 7710 }, { "ce_loss_13": 2.4535767167806624, "ce_loss_26": 1.9207569301128387, "ce_loss_39": 1.7325547844171525, "ce_loss_52": 1.4173025369644165, "ce_loss_7": 2.7801734030246736, "epoch": 0.772, "grad_norm": 14.176576196561767, "kl_loss_13": 2111.8, "kl_loss_26": 1005.2, "kl_loss_39": 618.15, "kl_loss_7": 2793.6, "learning_rate": 0.00012525941880537307, "loss": 3214.15, "step": 7720 }, { "ce_loss_13": 2.4484546184539795, "ce_loss_26": 1.9310883104801178, "ce_loss_39": 1.7474435329437257, "ce_loss_52": 1.4392137452960014, "ce_loss_7": 2.779611772298813, "epoch": 0.773, "grad_norm": 14.626780180795521, "kl_loss_13": 2095.4, "kl_loss_26": 994.8, "kl_loss_39": 607.9, "kl_loss_7": 2786.8, "learning_rate": 0.00012421089577892869, "loss": 3191.6, "step": 7730 }, { "ce_loss_13": 2.463806739449501, "ce_loss_26": 1.9203673034906388, "ce_loss_39": 1.7268804877996444, "ce_loss_52": 1.4067875519394875, "ce_loss_7": 2.797054660320282, "epoch": 0.774, "grad_norm": 14.221427151080144, "kl_loss_13": 2151.0, "kl_loss_26": 1032.4, "kl_loss_39": 637.25, "kl_loss_7": 2842.0, "learning_rate": 0.0001231661569445919, "loss": 3214.8, "step": 7740 }, { "ce_loss_13": 2.4840691089630127, "ce_loss_26": 1.9805045217275619, "ce_loss_39": 1.7966417849063874, "ce_loss_52": 1.4883142501115798, "ce_loss_7": 2.8029735326766967, "epoch": 0.775, "grad_norm": 14.614162489546528, "kl_loss_13": 2069.8, "kl_loss_26": 990.4, "kl_loss_39": 609.05, "kl_loss_7": 2743.2, "learning_rate": 0.00012212521282287093, "loss": 3200.5, "step": 7750 }, { "ce_loss_13": 2.4842973172664644, "ce_loss_26": 1.9586603373289109, "ce_loss_39": 1.767923679947853, "ce_loss_52": 1.450934961438179, "ce_loss_7": 2.815416473150253, "epoch": 0.776, "grad_norm": 14.872662321169154, "kl_loss_13": 2137.2, "kl_loss_26": 1028.5, "kl_loss_39": 636.95, "kl_loss_7": 2826.4, "learning_rate": 0.00012108807389606158, "loss": 3221.25, "step": 7760 }, { "ce_loss_13": 2.430084604024887, "ce_loss_26": 1.9105432122945785, "ce_loss_39": 1.7237805485725404, "ce_loss_52": 1.419256439805031, "ce_loss_7": 2.7609162449836733, "epoch": 0.777, "grad_norm": 14.122349060255786, "kl_loss_13": 2075.2, "kl_loss_26": 984.9, "kl_loss_39": 602.95, "kl_loss_7": 2769.2, "learning_rate": 0.00012005475060814159, "loss": 3219.35, "step": 7770 }, { "ce_loss_13": 2.4920803755521774, "ce_loss_26": 1.977930763363838, "ce_loss_39": 1.792539432644844, "ce_loss_52": 1.484375348687172, "ce_loss_7": 2.818430072069168, "epoch": 0.778, "grad_norm": 14.838228117967187, "kl_loss_13": 2079.2, "kl_loss_26": 984.5, "kl_loss_39": 603.55, "kl_loss_7": 2766.8, "learning_rate": 0.00011902525336466464, "loss": 3193.3, "step": 7780 }, { "ce_loss_13": 2.4326795816421507, "ce_loss_26": 1.9094915211200714, "ce_loss_39": 1.7265714228153228, "ce_loss_52": 1.4181353628635407, "ce_loss_7": 2.757854151725769, "epoch": 0.779, "grad_norm": 13.98078513544715, "kl_loss_13": 2078.4, "kl_loss_26": 991.0, "kl_loss_39": 610.15, "kl_loss_7": 2758.8, "learning_rate": 0.00011799959253265668, "loss": 3208.85, "step": 7790 }, { "ce_loss_13": 2.42523156106472, "ce_loss_26": 1.9246951520442963, "ce_loss_39": 1.7428549587726594, "ce_loss_52": 1.4467457503080368, "ce_loss_7": 2.7435911536216735, "epoch": 0.78, "grad_norm": 13.811296718067993, "kl_loss_13": 2022.0, "kl_loss_26": 950.5, "kl_loss_39": 578.7, "kl_loss_7": 2701.6, "learning_rate": 0.00011697777844051105, "loss": 3204.7, "step": 7800 }, { "ce_loss_13": 2.496878683567047, "ce_loss_26": 1.9677572190761565, "ce_loss_39": 1.7739976853132249, "ce_loss_52": 1.459720864892006, "ce_loss_7": 2.8327562749385833, "epoch": 0.781, "grad_norm": 14.08625224164901, "kl_loss_13": 2120.4, "kl_loss_26": 1008.7, "kl_loss_39": 615.4, "kl_loss_7": 2822.4, "learning_rate": 0.00011595982137788402, "loss": 3198.55, "step": 7810 }, { "ce_loss_13": 2.4920260161161423, "ce_loss_26": 1.9670526027679442, "ce_loss_39": 1.783473041653633, "ce_loss_52": 1.4704290598630905, "ce_loss_7": 2.8202459871768952, "epoch": 0.782, "grad_norm": 14.200442027165447, "kl_loss_13": 2103.8, "kl_loss_26": 1004.1, "kl_loss_39": 624.45, "kl_loss_7": 2800.4, "learning_rate": 0.00011494573159559212, "loss": 3223.6, "step": 7820 }, { "ce_loss_13": 2.4327739059925078, "ce_loss_26": 1.9302410751581192, "ce_loss_39": 1.7492202669382095, "ce_loss_52": 1.4421478152275085, "ce_loss_7": 2.7496786177158357, "epoch": 0.783, "grad_norm": 13.561882382659508, "kl_loss_13": 2046.4, "kl_loss_26": 985.3, "kl_loss_39": 604.05, "kl_loss_7": 2712.8, "learning_rate": 0.00011393551930550828, "loss": 3172.1, "step": 7830 }, { "ce_loss_13": 2.456186518073082, "ce_loss_26": 1.9265149384737015, "ce_loss_39": 1.7347608864307404, "ce_loss_52": 1.4248275607824326, "ce_loss_7": 2.7833105325698853, "epoch": 0.784, "grad_norm": 14.152082617728679, "kl_loss_13": 2109.0, "kl_loss_26": 1006.1, "kl_loss_39": 611.45, "kl_loss_7": 2800.8, "learning_rate": 0.00011292919468045875, "loss": 3208.05, "step": 7840 }, { "ce_loss_13": 2.4463070958852766, "ce_loss_26": 1.9245142936706543, "ce_loss_39": 1.736380136013031, "ce_loss_52": 1.4392430812120438, "ce_loss_7": 2.776514196395874, "epoch": 0.785, "grad_norm": 13.366541015007158, "kl_loss_13": 2080.4, "kl_loss_26": 983.3, "kl_loss_39": 597.9, "kl_loss_7": 2776.4, "learning_rate": 0.00011192676785412154, "loss": 3185.35, "step": 7850 }, { "ce_loss_13": 2.421262636780739, "ce_loss_26": 1.9249890923500061, "ce_loss_39": 1.7500147104263306, "ce_loss_52": 1.4561963319778441, "ce_loss_7": 2.741842967271805, "epoch": 0.786, "grad_norm": 15.122779815741898, "kl_loss_13": 1987.4, "kl_loss_26": 937.4, "kl_loss_39": 570.05, "kl_loss_7": 2649.2, "learning_rate": 0.00011092824892092374, "loss": 3155.1, "step": 7860 }, { "ce_loss_13": 2.528256595134735, "ce_loss_26": 2.005132633447647, "ce_loss_39": 1.8220074683427812, "ce_loss_52": 1.49842167198658, "ce_loss_7": 2.8524305701255797, "epoch": 0.787, "grad_norm": 14.392088598180058, "kl_loss_13": 2135.0, "kl_loss_26": 1024.4, "kl_loss_39": 633.4, "kl_loss_7": 2828.0, "learning_rate": 0.0001099336479359398, "loss": 3228.45, "step": 7870 }, { "ce_loss_13": 2.4351921498775484, "ce_loss_26": 1.9118896454572678, "ce_loss_39": 1.7326159566640853, "ce_loss_52": 1.43462935090065, "ce_loss_7": 2.7571564972400666, "epoch": 0.788, "grad_norm": 14.109231297319436, "kl_loss_13": 2083.4, "kl_loss_26": 971.8, "kl_loss_39": 593.25, "kl_loss_7": 2767.2, "learning_rate": 0.00010894297491479043, "loss": 3224.35, "step": 7880 }, { "ce_loss_13": 2.4037249386310577, "ce_loss_26": 1.8971479564905167, "ce_loss_39": 1.713542652130127, "ce_loss_52": 1.416736051440239, "ce_loss_7": 2.7244735300540923, "epoch": 0.789, "grad_norm": 14.646464194445937, "kl_loss_13": 2027.0, "kl_loss_26": 959.6, "kl_loss_39": 587.05, "kl_loss_7": 2694.8, "learning_rate": 0.00010795623983354214, "loss": 3163.9, "step": 7890 }, { "ce_loss_13": 2.462240958213806, "ce_loss_26": 1.9522877007722854, "ce_loss_39": 1.7622255086898804, "ce_loss_52": 1.447747752070427, "ce_loss_7": 2.781180214881897, "epoch": 0.79, "grad_norm": 14.486453301055112, "kl_loss_13": 2093.8, "kl_loss_26": 999.0, "kl_loss_39": 616.7, "kl_loss_7": 2768.0, "learning_rate": 0.00010697345262860636, "loss": 3189.25, "step": 7900 }, { "ce_loss_13": 2.443204700946808, "ce_loss_26": 1.9252238601446152, "ce_loss_39": 1.7538990557193757, "ce_loss_52": 1.4549198508262635, "ce_loss_7": 2.762816107273102, "epoch": 0.791, "grad_norm": 14.756973656564183, "kl_loss_13": 2043.6, "kl_loss_26": 953.1, "kl_loss_39": 582.75, "kl_loss_7": 2730.4, "learning_rate": 0.00010599462319663906, "loss": 3189.25, "step": 7910 }, { "ce_loss_13": 2.4734450757503508, "ce_loss_26": 1.9476019829511642, "ce_loss_39": 1.7569423377513886, "ce_loss_52": 1.448093169927597, "ce_loss_7": 2.7963216602802277, "epoch": 0.792, "grad_norm": 14.148651159008981, "kl_loss_13": 2096.8, "kl_loss_26": 997.8, "kl_loss_39": 608.35, "kl_loss_7": 2784.0, "learning_rate": 0.00010501976139444191, "loss": 3199.1, "step": 7920 }, { "ce_loss_13": 2.460918265581131, "ce_loss_26": 1.9421131610870361, "ce_loss_39": 1.7556595474481582, "ce_loss_52": 1.4451121121644974, "ce_loss_7": 2.7843388080596925, "epoch": 0.793, "grad_norm": 14.590509352597412, "kl_loss_13": 2099.4, "kl_loss_26": 995.3, "kl_loss_39": 605.2, "kl_loss_7": 2780.8, "learning_rate": 0.0001040488770388625, "loss": 3203.35, "step": 7930 }, { "ce_loss_13": 2.379360908269882, "ce_loss_26": 1.8618569314479827, "ce_loss_39": 1.6797795861959457, "ce_loss_52": 1.3791978135704994, "ce_loss_7": 2.708191817998886, "epoch": 0.794, "grad_norm": 14.16954347608881, "kl_loss_13": 2053.0, "kl_loss_26": 969.8, "kl_loss_39": 592.9, "kl_loss_7": 2740.8, "learning_rate": 0.00010308197990669538, "loss": 3181.45, "step": 7940 }, { "ce_loss_13": 2.4218691647052766, "ce_loss_26": 1.9039832711219788, "ce_loss_39": 1.7168581753969192, "ce_loss_52": 1.414185357093811, "ce_loss_7": 2.7429304718971252, "epoch": 0.795, "grad_norm": 13.813266382907072, "kl_loss_13": 2088.6, "kl_loss_26": 990.3, "kl_loss_39": 604.5, "kl_loss_7": 2776.4, "learning_rate": 0.0001021190797345839, "loss": 3178.1, "step": 7950 }, { "ce_loss_13": 2.487806275486946, "ce_loss_26": 1.9796326756477356, "ce_loss_39": 1.7957882821559905, "ce_loss_52": 1.484969075024128, "ce_loss_7": 2.8111346662044525, "epoch": 0.796, "grad_norm": 14.164838891881972, "kl_loss_13": 2047.4, "kl_loss_26": 971.5, "kl_loss_39": 591.45, "kl_loss_7": 2717.2, "learning_rate": 0.00010116018621892236, "loss": 3174.95, "step": 7960 }, { "ce_loss_13": 2.398695731163025, "ce_loss_26": 1.8858011841773987, "ce_loss_39": 1.7020757973194123, "ce_loss_52": 1.4106510564684869, "ce_loss_7": 2.721856439113617, "epoch": 0.797, "grad_norm": 14.161604323369735, "kl_loss_13": 2034.0, "kl_loss_26": 953.8, "kl_loss_39": 575.1, "kl_loss_7": 2712.8, "learning_rate": 0.00010020530901575753, "loss": 3177.95, "step": 7970 }, { "ce_loss_13": 2.427662065625191, "ce_loss_26": 1.921607220172882, "ce_loss_39": 1.741393145918846, "ce_loss_52": 1.438681322336197, "ce_loss_7": 2.746969664096832, "epoch": 0.798, "grad_norm": 14.770550097448835, "kl_loss_13": 2042.0, "kl_loss_26": 965.9, "kl_loss_39": 592.1, "kl_loss_7": 2705.2, "learning_rate": 9.925445774069231e-05, "loss": 3170.6, "step": 7980 }, { "ce_loss_13": 2.4366293847560883, "ce_loss_26": 1.9163338214159011, "ce_loss_39": 1.7320117831230164, "ce_loss_52": 1.4146809190511704, "ce_loss_7": 2.772968965768814, "epoch": 0.799, "grad_norm": 13.97785133684068, "kl_loss_13": 2098.2, "kl_loss_26": 1008.8, "kl_loss_39": 619.4, "kl_loss_7": 2795.6, "learning_rate": 9.830764196878872e-05, "loss": 3210.25, "step": 7990 }, { "ce_loss_13": 2.519176536798477, "ce_loss_26": 2.0015997767448424, "ce_loss_39": 1.8160304486751557, "ce_loss_52": 1.4840710669755937, "ce_loss_7": 2.842372918128967, "epoch": 0.8, "grad_norm": 13.949092199595771, "kl_loss_13": 2133.0, "kl_loss_26": 1036.3, "kl_loss_39": 652.95, "kl_loss_7": 2818.8, "learning_rate": 9.736487123447069e-05, "loss": 3181.95, "step": 8000 }, { "ce_loss_13": 2.4726769655942915, "ce_loss_26": 1.9741164237260818, "ce_loss_39": 1.790221494436264, "ce_loss_52": 1.482371485233307, "ce_loss_7": 2.799379500746727, "epoch": 0.801, "grad_norm": 13.707660286112029, "kl_loss_13": 2038.0, "kl_loss_26": 977.9, "kl_loss_39": 599.8, "kl_loss_7": 2716.8, "learning_rate": 9.642615503142926e-05, "loss": 3173.65, "step": 8010 }, { "ce_loss_13": 2.4063422054052355, "ce_loss_26": 1.904910859465599, "ce_loss_39": 1.7236665695905686, "ce_loss_52": 1.4340474352240562, "ce_loss_7": 2.726147544384003, "epoch": 0.802, "grad_norm": 14.759158260471319, "kl_loss_13": 2004.0, "kl_loss_26": 947.5, "kl_loss_39": 570.8, "kl_loss_7": 2678.8, "learning_rate": 9.549150281252633e-05, "loss": 3210.3, "step": 8020 }, { "ce_loss_13": 2.439296191930771, "ce_loss_26": 1.930625182390213, "ce_loss_39": 1.7471411645412445, "ce_loss_52": 1.4468423128128052, "ce_loss_7": 2.759518486261368, "epoch": 0.803, "grad_norm": 14.138193136913905, "kl_loss_13": 2029.8, "kl_loss_26": 959.0, "kl_loss_39": 580.8, "kl_loss_7": 2710.0, "learning_rate": 9.4560923989699e-05, "loss": 3188.85, "step": 8030 }, { "ce_loss_13": 2.383785030245781, "ce_loss_26": 1.8730993419885635, "ce_loss_39": 1.689489060640335, "ce_loss_52": 1.3906694814562797, "ce_loss_7": 2.715326648950577, "epoch": 0.804, "grad_norm": 14.525642481529378, "kl_loss_13": 2063.0, "kl_loss_26": 966.1, "kl_loss_39": 591.6, "kl_loss_7": 2756.4, "learning_rate": 9.363442793386607e-05, "loss": 3171.0, "step": 8040 }, { "ce_loss_13": 2.436983805894852, "ce_loss_26": 1.9301572561264038, "ce_loss_39": 1.7478452265262603, "ce_loss_52": 1.4431490540504455, "ce_loss_7": 2.7593387603759765, "epoch": 0.805, "grad_norm": 14.17334080276183, "kl_loss_13": 2038.0, "kl_loss_26": 967.0, "kl_loss_39": 590.6, "kl_loss_7": 2714.4, "learning_rate": 9.271202397483213e-05, "loss": 3157.4, "step": 8050 }, { "ce_loss_13": 2.4742675691843035, "ce_loss_26": 1.9660158514976502, "ce_loss_39": 1.7886695712804794, "ce_loss_52": 1.4793777346611023, "ce_loss_7": 2.799517345428467, "epoch": 0.806, "grad_norm": 14.807557523499636, "kl_loss_13": 2047.0, "kl_loss_26": 977.2, "kl_loss_39": 606.0, "kl_loss_7": 2719.6, "learning_rate": 9.179372140119524e-05, "loss": 3197.5, "step": 8060 }, { "ce_loss_13": 2.404207941889763, "ce_loss_26": 1.8871434926986694, "ce_loss_39": 1.7082382440567017, "ce_loss_52": 1.4106020584702492, "ce_loss_7": 2.7318927943706512, "epoch": 0.807, "grad_norm": 14.241838480847589, "kl_loss_13": 2054.2, "kl_loss_26": 966.9, "kl_loss_39": 589.9, "kl_loss_7": 2745.6, "learning_rate": 9.087952946025175e-05, "loss": 3174.15, "step": 8070 }, { "ce_loss_13": 2.405471110343933, "ce_loss_26": 1.9090194314718247, "ce_loss_39": 1.7282894462347032, "ce_loss_52": 1.4298398733139037, "ce_loss_7": 2.7295302629470823, "epoch": 0.808, "grad_norm": 14.246105109452872, "kl_loss_13": 2023.6, "kl_loss_26": 956.4, "kl_loss_39": 585.5, "kl_loss_7": 2704.0, "learning_rate": 8.996945735790446e-05, "loss": 3220.95, "step": 8080 }, { "ce_loss_13": 2.4230452179908752, "ce_loss_26": 1.9108994454145432, "ce_loss_39": 1.7252773225307465, "ce_loss_52": 1.4297530561685563, "ce_loss_7": 2.7479528963565825, "epoch": 0.809, "grad_norm": 14.241539859828608, "kl_loss_13": 2056.4, "kl_loss_26": 974.5, "kl_loss_39": 592.4, "kl_loss_7": 2738.0, "learning_rate": 8.906351425856951e-05, "loss": 3187.2, "step": 8090 }, { "ce_loss_13": 2.5193986773490904, "ce_loss_26": 2.004713475704193, "ce_loss_39": 1.8214319556951524, "ce_loss_52": 1.5133182466030122, "ce_loss_7": 2.836967188119888, "epoch": 0.81, "grad_norm": 13.856573501498156, "kl_loss_13": 2059.0, "kl_loss_26": 983.5, "kl_loss_39": 604.75, "kl_loss_7": 2738.8, "learning_rate": 8.816170928508365e-05, "loss": 3199.5, "step": 8100 }, { "ce_loss_13": 2.463338887691498, "ce_loss_26": 1.9509768843650819, "ce_loss_39": 1.7714523404836655, "ce_loss_52": 1.4671026438474655, "ce_loss_7": 2.7907890677452087, "epoch": 0.811, "grad_norm": 14.442792993010306, "kl_loss_13": 2039.6, "kl_loss_26": 963.7, "kl_loss_39": 594.85, "kl_loss_7": 2727.2, "learning_rate": 8.7264051518613e-05, "loss": 3182.6, "step": 8110 }, { "ce_loss_13": 2.35435933470726, "ce_loss_26": 1.8515879094600678, "ce_loss_39": 1.675445196032524, "ce_loss_52": 1.3868303269147872, "ce_loss_7": 2.6736503660678865, "epoch": 0.812, "grad_norm": 15.186200546439618, "kl_loss_13": 2007.0, "kl_loss_26": 943.9, "kl_loss_39": 575.7, "kl_loss_7": 2679.2, "learning_rate": 8.637054999856148e-05, "loss": 3182.4, "step": 8120 }, { "ce_loss_13": 2.4665314495563506, "ce_loss_26": 1.9507400870323182, "ce_loss_39": 1.7698681026697158, "ce_loss_52": 1.463287603855133, "ce_loss_7": 2.790166562795639, "epoch": 0.813, "grad_norm": 14.702441450046226, "kl_loss_13": 2081.6, "kl_loss_26": 994.3, "kl_loss_39": 606.6, "kl_loss_7": 2757.2, "learning_rate": 8.548121372247918e-05, "loss": 3195.8, "step": 8130 }, { "ce_loss_13": 2.4175081342458724, "ce_loss_26": 1.894961017370224, "ce_loss_39": 1.7168689727783204, "ce_loss_52": 1.4184614822268486, "ce_loss_7": 2.74496705532074, "epoch": 0.814, "grad_norm": 14.150480690009331, "kl_loss_13": 2042.4, "kl_loss_26": 956.7, "kl_loss_39": 584.15, "kl_loss_7": 2725.2, "learning_rate": 8.459605164597267e-05, "loss": 3148.95, "step": 8140 }, { "ce_loss_13": 2.4137402385473252, "ce_loss_26": 1.907509195804596, "ce_loss_39": 1.7280682563781737, "ce_loss_52": 1.4408938705921173, "ce_loss_7": 2.734111136198044, "epoch": 0.815, "grad_norm": 14.607407219816483, "kl_loss_13": 2031.8, "kl_loss_26": 952.0, "kl_loss_39": 576.85, "kl_loss_7": 2711.2, "learning_rate": 8.371507268261436e-05, "loss": 3141.15, "step": 8150 }, { "ce_loss_13": 2.45993629693985, "ce_loss_26": 1.9489454805850983, "ce_loss_39": 1.7687684744596481, "ce_loss_52": 1.4672614842653275, "ce_loss_7": 2.7731840908527374, "epoch": 0.816, "grad_norm": 13.941573996490533, "kl_loss_13": 2049.4, "kl_loss_26": 971.7, "kl_loss_39": 590.15, "kl_loss_7": 2723.6, "learning_rate": 8.283828570385238e-05, "loss": 3167.65, "step": 8160 }, { "ce_loss_13": 2.457903391122818, "ce_loss_26": 1.9481427311897277, "ce_loss_39": 1.7692535519599915, "ce_loss_52": 1.473311385512352, "ce_loss_7": 2.7900636374950407, "epoch": 0.817, "grad_norm": 13.820607552825622, "kl_loss_13": 2034.8, "kl_loss_26": 960.7, "kl_loss_39": 581.95, "kl_loss_7": 2732.8, "learning_rate": 8.196569953892202e-05, "loss": 3175.55, "step": 8170 }, { "ce_loss_13": 2.424106791615486, "ce_loss_26": 1.9179951936006545, "ce_loss_39": 1.7393042415380477, "ce_loss_52": 1.4501032710075379, "ce_loss_7": 2.7422122418880464, "epoch": 0.818, "grad_norm": 13.95954126780279, "kl_loss_13": 2017.4, "kl_loss_26": 939.3, "kl_loss_39": 570.5, "kl_loss_7": 2684.4, "learning_rate": 8.109732297475635e-05, "loss": 3172.8, "step": 8180 }, { "ce_loss_13": 2.4480927348136903, "ce_loss_26": 1.9403320997953415, "ce_loss_39": 1.7644436001777648, "ce_loss_52": 1.4613411754369736, "ce_loss_7": 2.767207592725754, "epoch": 0.819, "grad_norm": 15.077790655269643, "kl_loss_13": 2034.2, "kl_loss_26": 959.3, "kl_loss_39": 589.55, "kl_loss_7": 2707.6, "learning_rate": 8.023316475589754e-05, "loss": 3151.8, "step": 8190 }, { "ce_loss_13": 2.389411324262619, "ce_loss_26": 1.8819621950387955, "ce_loss_39": 1.7063862174749374, "ce_loss_52": 1.4187449038028717, "ce_loss_7": 2.7186341762542723, "epoch": 0.82, "grad_norm": 14.069790186558153, "kl_loss_13": 2015.6, "kl_loss_26": 934.0, "kl_loss_39": 570.4, "kl_loss_7": 2711.2, "learning_rate": 7.937323358440934e-05, "loss": 3158.45, "step": 8200 }, { "ce_loss_13": 2.463495451211929, "ce_loss_26": 1.9460216015577316, "ce_loss_39": 1.7633485794067383, "ce_loss_52": 1.459425413608551, "ce_loss_7": 2.7843497574329374, "epoch": 0.821, "grad_norm": 14.129619264599885, "kl_loss_13": 2034.2, "kl_loss_26": 962.3, "kl_loss_39": 588.55, "kl_loss_7": 2710.0, "learning_rate": 7.851753811978923e-05, "loss": 3172.55, "step": 8210 }, { "ce_loss_13": 2.3558076560497283, "ce_loss_26": 1.8546594500541687, "ce_loss_39": 1.6799951493740082, "ce_loss_52": 1.392863529920578, "ce_loss_7": 2.6725959718227386, "epoch": 0.822, "grad_norm": 13.287512232663138, "kl_loss_13": 1979.6, "kl_loss_26": 919.4, "kl_loss_39": 562.4, "kl_loss_7": 2652.0, "learning_rate": 7.766608697888095e-05, "loss": 3151.0, "step": 8220 }, { "ce_loss_13": 2.4153092801570892, "ce_loss_26": 1.9061992377042771, "ce_loss_39": 1.7255131870508194, "ce_loss_52": 1.4212424442172051, "ce_loss_7": 2.748347020149231, "epoch": 0.823, "grad_norm": 14.43300896159423, "kl_loss_13": 2076.6, "kl_loss_26": 977.9, "kl_loss_39": 599.3, "kl_loss_7": 2767.2, "learning_rate": 7.681888873578785e-05, "loss": 3171.1, "step": 8230 }, { "ce_loss_13": 2.4028283417224885, "ce_loss_26": 1.9037913769483565, "ce_loss_39": 1.729577499628067, "ce_loss_52": 1.4313182592391969, "ce_loss_7": 2.7214892983436583, "epoch": 0.824, "grad_norm": 14.091655498494992, "kl_loss_13": 1997.8, "kl_loss_26": 945.5, "kl_loss_39": 582.95, "kl_loss_7": 2668.0, "learning_rate": 7.597595192178702e-05, "loss": 3129.45, "step": 8240 }, { "ce_loss_13": 2.385217198729515, "ce_loss_26": 1.8788419783115387, "ce_loss_39": 1.7016818612813949, "ce_loss_52": 1.4092606633901597, "ce_loss_7": 2.714561605453491, "epoch": 0.825, "grad_norm": 14.092565500396708, "kl_loss_13": 2015.4, "kl_loss_26": 950.7, "kl_loss_39": 578.35, "kl_loss_7": 2700.8, "learning_rate": 7.513728502524286e-05, "loss": 3103.55, "step": 8250 }, { "ce_loss_13": 2.411863788962364, "ce_loss_26": 1.8953818708658219, "ce_loss_39": 1.712015947699547, "ce_loss_52": 1.4228723630309106, "ce_loss_7": 2.7335788309574127, "epoch": 0.826, "grad_norm": 14.683616091837887, "kl_loss_13": 2027.2, "kl_loss_26": 946.2, "kl_loss_39": 569.45, "kl_loss_7": 2700.8, "learning_rate": 7.430289649152156e-05, "loss": 3186.45, "step": 8260 }, { "ce_loss_13": 2.4488519340753556, "ce_loss_26": 1.9516287744045258, "ce_loss_39": 1.7749818950891494, "ce_loss_52": 1.479728889465332, "ce_loss_7": 2.7712768018245697, "epoch": 0.827, "grad_norm": 13.955547255495208, "kl_loss_13": 1991.0, "kl_loss_26": 940.5, "kl_loss_39": 573.4, "kl_loss_7": 2658.4, "learning_rate": 7.347279472290646e-05, "loss": 3163.475, "step": 8270 }, { "ce_loss_13": 2.3786238610744475, "ce_loss_26": 1.8683661013841628, "ce_loss_39": 1.690899032354355, "ce_loss_52": 1.4009160608053208, "ce_loss_7": 2.705441731214523, "epoch": 0.828, "grad_norm": 14.039626698915084, "kl_loss_13": 2016.2, "kl_loss_26": 938.6, "kl_loss_39": 571.15, "kl_loss_7": 2694.4, "learning_rate": 7.264698807851328e-05, "loss": 3118.5, "step": 8280 }, { "ce_loss_13": 2.466960498690605, "ce_loss_26": 1.946975302696228, "ce_loss_39": 1.7688733905553817, "ce_loss_52": 1.4637437134981155, "ce_loss_7": 2.8006490588188173, "epoch": 0.829, "grad_norm": 14.106474347437752, "kl_loss_13": 2090.4, "kl_loss_26": 984.1, "kl_loss_39": 609.1, "kl_loss_7": 2790.4, "learning_rate": 7.182548487420554e-05, "loss": 3184.4, "step": 8290 }, { "ce_loss_13": 2.5116629540920257, "ce_loss_26": 1.9985181391239166, "ce_loss_39": 1.8124066442251205, "ce_loss_52": 1.497176530957222, "ce_loss_7": 2.8335696399211883, "epoch": 0.83, "grad_norm": 14.139665843791814, "kl_loss_13": 2098.8, "kl_loss_26": 1001.4, "kl_loss_39": 614.35, "kl_loss_7": 2780.6, "learning_rate": 7.100829338251146e-05, "loss": 3198.35, "step": 8300 }, { "ce_loss_13": 2.4611269533634186, "ce_loss_26": 1.9527796864509583, "ce_loss_39": 1.7704098969697952, "ce_loss_52": 1.462582492828369, "ce_loss_7": 2.7841490387916563, "epoch": 0.831, "grad_norm": 13.992596562086698, "kl_loss_13": 2062.6, "kl_loss_26": 988.0, "kl_loss_39": 605.35, "kl_loss_7": 2743.2, "learning_rate": 7.019542183254046e-05, "loss": 3175.4, "step": 8310 }, { "ce_loss_13": 2.4340964376926424, "ce_loss_26": 1.9217961221933364, "ce_loss_39": 1.7397069931030273, "ce_loss_52": 1.4361872345209121, "ce_loss_7": 2.759904479980469, "epoch": 0.832, "grad_norm": 14.479445741622119, "kl_loss_13": 2039.6, "kl_loss_26": 971.9, "kl_loss_39": 599.4, "kl_loss_7": 2716.0, "learning_rate": 6.938687840989971e-05, "loss": 3159.45, "step": 8320 }, { "ce_loss_13": 2.4414653837680818, "ce_loss_26": 1.928215390443802, "ce_loss_39": 1.7369425565004348, "ce_loss_52": 1.4351924806833267, "ce_loss_7": 2.76216436624527, "epoch": 0.833, "grad_norm": 14.95621914365995, "kl_loss_13": 2059.8, "kl_loss_26": 983.2, "kl_loss_39": 594.6, "kl_loss_7": 2738.8, "learning_rate": 6.858267125661271e-05, "loss": 3174.0, "step": 8330 }, { "ce_loss_13": 2.400873589515686, "ce_loss_26": 1.90281642973423, "ce_loss_39": 1.7204748094081879, "ce_loss_52": 1.4244474336504935, "ce_loss_7": 2.7152935564517975, "epoch": 0.834, "grad_norm": 14.103644864595271, "kl_loss_13": 2033.8, "kl_loss_26": 973.5, "kl_loss_39": 591.7, "kl_loss_7": 2704.8, "learning_rate": 6.778280847103668e-05, "loss": 3170.05, "step": 8340 }, { "ce_loss_13": 2.374238893389702, "ce_loss_26": 1.8607898473739624, "ce_loss_39": 1.6831828862428666, "ce_loss_52": 1.396483090519905, "ce_loss_7": 2.704102611541748, "epoch": 0.835, "grad_norm": 14.599308778967789, "kl_loss_13": 2031.4, "kl_loss_26": 934.1, "kl_loss_39": 566.7, "kl_loss_7": 2719.2, "learning_rate": 6.698729810778065e-05, "loss": 3150.95, "step": 8350 }, { "ce_loss_13": 2.4648724853992463, "ce_loss_26": 1.9494601666927338, "ce_loss_39": 1.7671974629163743, "ce_loss_52": 1.461349506676197, "ce_loss_7": 2.794687694311142, "epoch": 0.836, "grad_norm": 14.476811719238219, "kl_loss_13": 2060.2, "kl_loss_26": 979.5, "kl_loss_39": 601.55, "kl_loss_7": 2750.8, "learning_rate": 6.619614817762538e-05, "loss": 3140.75, "step": 8360 }, { "ce_loss_13": 2.4076102912425994, "ce_loss_26": 1.9041061371564865, "ce_loss_39": 1.7215475410223007, "ce_loss_52": 1.426313552260399, "ce_loss_7": 2.729493075609207, "epoch": 0.837, "grad_norm": 14.66675091557762, "kl_loss_13": 2014.4, "kl_loss_26": 958.9, "kl_loss_39": 585.55, "kl_loss_7": 2689.6, "learning_rate": 6.540936664744196e-05, "loss": 3161.6, "step": 8370 }, { "ce_loss_13": 2.4151067316532133, "ce_loss_26": 1.9024922668933868, "ce_loss_39": 1.7205139189958571, "ce_loss_52": 1.4315023928880692, "ce_loss_7": 2.733920103311539, "epoch": 0.838, "grad_norm": 13.748708793526905, "kl_loss_13": 2020.6, "kl_loss_26": 957.3, "kl_loss_39": 574.45, "kl_loss_7": 2690.8, "learning_rate": 6.462696144011149e-05, "loss": 3148.0, "step": 8380 }, { "ce_loss_13": 2.4298708856105806, "ce_loss_26": 1.9152013957500458, "ce_loss_39": 1.737311202287674, "ce_loss_52": 1.441886842250824, "ce_loss_7": 2.757741445302963, "epoch": 0.839, "grad_norm": 14.560412200339597, "kl_loss_13": 2019.6, "kl_loss_26": 946.5, "kl_loss_39": 576.85, "kl_loss_7": 2709.2, "learning_rate": 6.384894043444567e-05, "loss": 3144.45, "step": 8390 }, { "ce_loss_13": 2.4371220886707308, "ce_loss_26": 1.9178409904241562, "ce_loss_39": 1.7318467199802399, "ce_loss_52": 1.427680206298828, "ce_loss_7": 2.7716069161891936, "epoch": 0.84, "grad_norm": 13.228002541168403, "kl_loss_13": 2067.8, "kl_loss_26": 990.0, "kl_loss_39": 597.05, "kl_loss_7": 2766.8, "learning_rate": 6.307531146510753e-05, "loss": 3145.15, "step": 8400 }, { "ce_loss_13": 2.4668938338756563, "ce_loss_26": 1.9483750283718109, "ce_loss_39": 1.7615112096071244, "ce_loss_52": 1.4629206866025926, "ce_loss_7": 2.7949269711971283, "epoch": 0.841, "grad_norm": 14.638289805261575, "kl_loss_13": 2070.8, "kl_loss_26": 979.1, "kl_loss_39": 588.85, "kl_loss_7": 2755.2, "learning_rate": 6.230608232253226e-05, "loss": 3135.55, "step": 8410 }, { "ce_loss_13": 2.498942193388939, "ce_loss_26": 1.9719986289739608, "ce_loss_39": 1.7773171186447143, "ce_loss_52": 1.4589012682437896, "ce_loss_7": 2.8206138908863068, "epoch": 0.842, "grad_norm": 14.547037792354297, "kl_loss_13": 2143.2, "kl_loss_26": 1027.6, "kl_loss_39": 626.5, "kl_loss_7": 2830.0, "learning_rate": 6.154126075284855e-05, "loss": 3179.05, "step": 8420 }, { "ce_loss_13": 2.339446923136711, "ce_loss_26": 1.836980375647545, "ce_loss_39": 1.66352079808712, "ce_loss_52": 1.3777535080909729, "ce_loss_7": 2.658161628246307, "epoch": 0.843, "grad_norm": 13.822498461755101, "kl_loss_13": 2006.0, "kl_loss_26": 933.6, "kl_loss_39": 565.3, "kl_loss_7": 2679.6, "learning_rate": 6.078085445780129e-05, "loss": 3158.075, "step": 8430 }, { "ce_loss_13": 2.4486551761627195, "ce_loss_26": 1.9329589813947679, "ce_loss_39": 1.7519038885831832, "ce_loss_52": 1.4464032799005508, "ce_loss_7": 2.776008838415146, "epoch": 0.844, "grad_norm": 13.676549255063142, "kl_loss_13": 2050.2, "kl_loss_26": 971.9, "kl_loss_39": 595.05, "kl_loss_7": 2741.2, "learning_rate": 6.002487109467347e-05, "loss": 3155.95, "step": 8440 }, { "ce_loss_13": 2.467064255475998, "ce_loss_26": 1.9566147327423096, "ce_loss_39": 1.7722377121448516, "ce_loss_52": 1.4749930799007416, "ce_loss_7": 2.7902898490428925, "epoch": 0.845, "grad_norm": 15.195098706872598, "kl_loss_13": 2033.8, "kl_loss_26": 959.4, "kl_loss_39": 581.0, "kl_loss_7": 2712.0, "learning_rate": 5.927331827620902e-05, "loss": 3169.7, "step": 8450 }, { "ce_loss_13": 2.389095312356949, "ce_loss_26": 1.8802162408828735, "ce_loss_39": 1.7042785853147506, "ce_loss_52": 1.4126853346824646, "ce_loss_7": 2.714692497253418, "epoch": 0.846, "grad_norm": 14.567703807315205, "kl_loss_13": 2016.0, "kl_loss_26": 940.7, "kl_loss_39": 574.25, "kl_loss_7": 2700.8, "learning_rate": 5.852620357053651e-05, "loss": 3111.0, "step": 8460 }, { "ce_loss_13": 2.4798492193222046, "ce_loss_26": 1.9612985998392105, "ce_loss_39": 1.7813422173261642, "ce_loss_52": 1.4634388938546181, "ce_loss_7": 2.8045433819293977, "epoch": 0.847, "grad_norm": 13.718944289269317, "kl_loss_13": 2114.4, "kl_loss_26": 1008.1, "kl_loss_39": 624.7, "kl_loss_7": 2801.2, "learning_rate": 5.778353450109286e-05, "loss": 3195.2, "step": 8470 }, { "ce_loss_13": 2.38386265039444, "ce_loss_26": 1.8872032672166825, "ce_loss_39": 1.710950767993927, "ce_loss_52": 1.432640826702118, "ce_loss_7": 2.694201183319092, "epoch": 0.848, "grad_norm": 14.259266485735452, "kl_loss_13": 1968.6, "kl_loss_26": 914.3, "kl_loss_39": 549.1, "kl_loss_7": 2623.2, "learning_rate": 5.7045318546547206e-05, "loss": 3137.025, "step": 8480 }, { "ce_loss_13": 2.427330991625786, "ce_loss_26": 1.910456082224846, "ce_loss_39": 1.7292529791593552, "ce_loss_52": 1.4334994465112687, "ce_loss_7": 2.7538663387298583, "epoch": 0.849, "grad_norm": 13.656054783413236, "kl_loss_13": 2033.4, "kl_loss_26": 954.9, "kl_loss_39": 577.85, "kl_loss_7": 2719.6, "learning_rate": 5.631156314072605e-05, "loss": 3150.65, "step": 8490 }, { "ce_loss_13": 2.471989703178406, "ce_loss_26": 1.9505236119031906, "ce_loss_39": 1.7668047726154328, "ce_loss_52": 1.4608478724956513, "ce_loss_7": 2.8063031315803526, "epoch": 0.85, "grad_norm": 13.681266151650567, "kl_loss_13": 2080.4, "kl_loss_26": 982.8, "kl_loss_39": 600.35, "kl_loss_7": 2780.8, "learning_rate": 5.5582275672538315e-05, "loss": 3137.95, "step": 8500 }, { "ce_loss_13": 2.4605359852313997, "ce_loss_26": 1.939817100763321, "ce_loss_39": 1.753285875916481, "ce_loss_52": 1.4498123317956924, "ce_loss_7": 2.7863478004932403, "epoch": 0.851, "grad_norm": 14.038129063761419, "kl_loss_13": 2055.6, "kl_loss_26": 978.1, "kl_loss_39": 593.3, "kl_loss_7": 2740.0, "learning_rate": 5.4857463485900484e-05, "loss": 3144.75, "step": 8510 }, { "ce_loss_13": 2.450565594434738, "ce_loss_26": 1.9354894876480102, "ce_loss_39": 1.7570757120847702, "ce_loss_52": 1.458202052116394, "ce_loss_7": 2.770776855945587, "epoch": 0.852, "grad_norm": 13.945411873449233, "kl_loss_13": 2028.8, "kl_loss_26": 956.4, "kl_loss_39": 584.4, "kl_loss_7": 2704.4, "learning_rate": 5.413713387966329e-05, "loss": 3147.35, "step": 8520 }, { "ce_loss_13": 2.384511134028435, "ce_loss_26": 1.885845959186554, "ce_loss_39": 1.7117790162563324, "ce_loss_52": 1.4215580940246582, "ce_loss_7": 2.6971611440181733, "epoch": 0.853, "grad_norm": 14.280539701015499, "kl_loss_13": 1983.0, "kl_loss_26": 937.9, "kl_loss_39": 571.6, "kl_loss_7": 2646.0, "learning_rate": 5.34212941075381e-05, "loss": 3138.7, "step": 8530 }, { "ce_loss_13": 2.4187089085578917, "ce_loss_26": 1.9180882632732392, "ce_loss_39": 1.7486219108104706, "ce_loss_52": 1.4668725609779358, "ce_loss_7": 2.7367011964321137, "epoch": 0.854, "grad_norm": 13.894895708827713, "kl_loss_13": 1988.6, "kl_loss_26": 918.2, "kl_loss_39": 556.1, "kl_loss_7": 2660.4, "learning_rate": 5.270995137802315e-05, "loss": 3116.45, "step": 8540 }, { "ce_loss_13": 2.415088692307472, "ce_loss_26": 1.9041693419218064, "ce_loss_39": 1.7267356216907501, "ce_loss_52": 1.4279317557811737, "ce_loss_7": 2.7469893753528596, "epoch": 0.855, "grad_norm": 14.31134935015859, "kl_loss_13": 2040.4, "kl_loss_26": 959.5, "kl_loss_39": 585.9, "kl_loss_7": 2737.2, "learning_rate": 5.2003112854332125e-05, "loss": 3108.3, "step": 8550 }, { "ce_loss_13": 2.461696755886078, "ce_loss_26": 1.9468512892723084, "ce_loss_39": 1.7631896048784257, "ce_loss_52": 1.4654896438121796, "ce_loss_7": 2.780461609363556, "epoch": 0.856, "grad_norm": 14.365329511248612, "kl_loss_13": 2070.0, "kl_loss_26": 974.7, "kl_loss_39": 589.2, "kl_loss_7": 2744.0, "learning_rate": 5.130078565432089e-05, "loss": 3173.9, "step": 8560 }, { "ce_loss_13": 2.4338246136903763, "ce_loss_26": 1.9260531306266784, "ce_loss_39": 1.746686053276062, "ce_loss_52": 1.4506986886262894, "ce_loss_7": 2.7584114193916323, "epoch": 0.857, "grad_norm": 13.58696871299719, "kl_loss_13": 2040.8, "kl_loss_26": 958.4, "kl_loss_39": 587.6, "kl_loss_7": 2720.8, "learning_rate": 5.060297685041659e-05, "loss": 3124.45, "step": 8570 }, { "ce_loss_13": 2.465153419971466, "ce_loss_26": 1.9324041992425918, "ce_loss_39": 1.747180885076523, "ce_loss_52": 1.4378513038158416, "ce_loss_7": 2.8079187512397765, "epoch": 0.858, "grad_norm": 13.69346881706034, "kl_loss_13": 2127.2, "kl_loss_26": 1005.7, "kl_loss_39": 610.45, "kl_loss_7": 2842.8, "learning_rate": 4.99096934695461e-05, "loss": 3135.55, "step": 8580 }, { "ce_loss_13": 2.428412067890167, "ce_loss_26": 1.9264894247055053, "ce_loss_39": 1.748258227109909, "ce_loss_52": 1.4425264418125152, "ce_loss_7": 2.7523947954177856, "epoch": 0.859, "grad_norm": 14.416520228353955, "kl_loss_13": 2024.4, "kl_loss_26": 961.5, "kl_loss_39": 593.95, "kl_loss_7": 2704.0, "learning_rate": 4.922094249306558e-05, "loss": 3141.65, "step": 8590 }, { "ce_loss_13": 2.379640507698059, "ce_loss_26": 1.8703870117664336, "ce_loss_39": 1.6925265491008759, "ce_loss_52": 1.4014200061559676, "ce_loss_7": 2.7008225679397584, "epoch": 0.86, "grad_norm": 14.548164181811538, "kl_loss_13": 2019.0, "kl_loss_26": 952.8, "kl_loss_39": 580.7, "kl_loss_7": 2704.4, "learning_rate": 4.853673085668947e-05, "loss": 3164.35, "step": 8600 }, { "ce_loss_13": 2.3882300436496733, "ce_loss_26": 1.8816026329994202, "ce_loss_39": 1.701096272468567, "ce_loss_52": 1.4169353902339936, "ce_loss_7": 2.711241126060486, "epoch": 0.861, "grad_norm": 13.997789256280235, "kl_loss_13": 2014.0, "kl_loss_26": 941.1, "kl_loss_39": 561.45, "kl_loss_7": 2697.6, "learning_rate": 4.78570654504214e-05, "loss": 3160.75, "step": 8610 }, { "ce_loss_13": 2.4367538392543793, "ce_loss_26": 1.9072220534086228, "ce_loss_39": 1.724021741747856, "ce_loss_52": 1.4287808299064637, "ce_loss_7": 2.7643966376781464, "epoch": 0.862, "grad_norm": 13.935763093108008, "kl_loss_13": 2067.0, "kl_loss_26": 962.6, "kl_loss_39": 576.05, "kl_loss_7": 2765.6, "learning_rate": 4.7181953118484556e-05, "loss": 3127.35, "step": 8620 }, { "ce_loss_13": 2.424758407473564, "ce_loss_26": 1.9169064074754716, "ce_loss_39": 1.7346068799495697, "ce_loss_52": 1.4445551723241805, "ce_loss_7": 2.7455954015254975, "epoch": 0.863, "grad_norm": 14.314671068665351, "kl_loss_13": 2012.8, "kl_loss_26": 950.4, "kl_loss_39": 570.95, "kl_loss_7": 2679.2, "learning_rate": 4.651140065925269e-05, "loss": 3115.55, "step": 8630 }, { "ce_loss_13": 2.5092544078826906, "ce_loss_26": 1.9829395413398743, "ce_loss_39": 1.7923680394887924, "ce_loss_52": 1.481997686624527, "ce_loss_7": 2.838895618915558, "epoch": 0.864, "grad_norm": 14.264233585666156, "kl_loss_13": 2110.2, "kl_loss_26": 1005.9, "kl_loss_39": 616.9, "kl_loss_7": 2805.2, "learning_rate": 4.58454148251814e-05, "loss": 3142.0, "step": 8640 }, { "ce_loss_13": 2.43146056830883, "ce_loss_26": 1.9165301382541657, "ce_loss_39": 1.7361394971609116, "ce_loss_52": 1.4455705016851426, "ce_loss_7": 2.7573634922504424, "epoch": 0.865, "grad_norm": 13.3838976309547, "kl_loss_13": 2018.4, "kl_loss_26": 941.1, "kl_loss_39": 567.4, "kl_loss_7": 2702.0, "learning_rate": 4.518400232274078e-05, "loss": 3128.55, "step": 8650 }, { "ce_loss_13": 2.4056933134794236, "ce_loss_26": 1.8982498347759247, "ce_loss_39": 1.7155311942100524, "ce_loss_52": 1.4283919543027879, "ce_loss_7": 2.7339151203632355, "epoch": 0.866, "grad_norm": 13.776729544594009, "kl_loss_13": 2029.0, "kl_loss_26": 958.5, "kl_loss_39": 575.6, "kl_loss_7": 2712.0, "learning_rate": 4.452716981234745e-05, "loss": 3168.35, "step": 8660 }, { "ce_loss_13": 2.4304875314235685, "ce_loss_26": 1.9086535692214965, "ce_loss_39": 1.731420534849167, "ce_loss_52": 1.4311698615550994, "ce_loss_7": 2.76586651802063, "epoch": 0.867, "grad_norm": 14.03220486850696, "kl_loss_13": 2043.6, "kl_loss_26": 956.8, "kl_loss_39": 585.05, "kl_loss_7": 2738.0, "learning_rate": 4.3874923908297335e-05, "loss": 3147.75, "step": 8670 }, { "ce_loss_13": 2.422931173443794, "ce_loss_26": 1.8919979512691498, "ce_loss_39": 1.7088686615228652, "ce_loss_52": 1.4077896371483802, "ce_loss_7": 2.757069969177246, "epoch": 0.868, "grad_norm": 14.116898056864903, "kl_loss_13": 2080.8, "kl_loss_26": 974.7, "kl_loss_39": 600.2, "kl_loss_7": 2778.4, "learning_rate": 4.322727117869951e-05, "loss": 3132.8, "step": 8680 }, { "ce_loss_13": 2.4079675406217573, "ce_loss_26": 1.9030864268541337, "ce_loss_39": 1.7309907704591752, "ce_loss_52": 1.4429899513721467, "ce_loss_7": 2.725788599252701, "epoch": 0.869, "grad_norm": 13.611192303364593, "kl_loss_13": 1991.0, "kl_loss_26": 927.4, "kl_loss_39": 558.5, "kl_loss_7": 2668.4, "learning_rate": 4.2584218145409916e-05, "loss": 3135.35, "step": 8690 }, { "ce_loss_13": 2.3869937509298325, "ce_loss_26": 1.8738444805145265, "ce_loss_39": 1.6931981056928636, "ce_loss_52": 1.400150865316391, "ce_loss_7": 2.709615921974182, "epoch": 0.87, "grad_norm": 14.271447400785455, "kl_loss_13": 2015.8, "kl_loss_26": 944.7, "kl_loss_39": 573.1, "kl_loss_7": 2692.8, "learning_rate": 4.194577128396521e-05, "loss": 3114.95, "step": 8700 }, { "ce_loss_13": 2.465124714374542, "ce_loss_26": 1.961993396282196, "ce_loss_39": 1.7869856834411622, "ce_loss_52": 1.4856192290782928, "ce_loss_7": 2.785760098695755, "epoch": 0.871, "grad_norm": 13.70337275168833, "kl_loss_13": 2015.8, "kl_loss_26": 955.1, "kl_loss_39": 582.9, "kl_loss_7": 2693.6, "learning_rate": 4.1311937023518264e-05, "loss": 3146.8, "step": 8710 }, { "ce_loss_13": 2.432892268896103, "ce_loss_26": 1.9152618199586868, "ce_loss_39": 1.7295002430677413, "ce_loss_52": 1.4359665989875794, "ce_loss_7": 2.7681704640388487, "epoch": 0.872, "grad_norm": 14.713342569921842, "kl_loss_13": 2035.8, "kl_loss_26": 958.4, "kl_loss_39": 578.45, "kl_loss_7": 2721.6, "learning_rate": 4.0682721746773344e-05, "loss": 3128.35, "step": 8720 }, { "ce_loss_13": 2.3963799655437468, "ce_loss_26": 1.8889498293399811, "ce_loss_39": 1.7117068350315094, "ce_loss_52": 1.4276205718517303, "ce_loss_7": 2.7207881271839143, "epoch": 0.873, "grad_norm": 14.269783754551202, "kl_loss_13": 2001.0, "kl_loss_26": 932.4, "kl_loss_39": 561.8, "kl_loss_7": 2688.8, "learning_rate": 4.0058131789920904e-05, "loss": 3131.65, "step": 8730 }, { "ce_loss_13": 2.418975955247879, "ce_loss_26": 1.8960406243801118, "ce_loss_39": 1.7133139997720719, "ce_loss_52": 1.4180105909705163, "ce_loss_7": 2.7614206850528715, "epoch": 0.874, "grad_norm": 14.114592900051933, "kl_loss_13": 2079.4, "kl_loss_26": 975.5, "kl_loss_39": 598.35, "kl_loss_7": 2790.4, "learning_rate": 3.9438173442575e-05, "loss": 3100.7, "step": 8740 }, { "ce_loss_13": 2.4537671864032746, "ce_loss_26": 1.946785607933998, "ce_loss_39": 1.7634260147809981, "ce_loss_52": 1.4690344750881195, "ce_loss_7": 2.775768506526947, "epoch": 0.875, "grad_norm": 14.35027266854894, "kl_loss_13": 2027.0, "kl_loss_26": 955.8, "kl_loss_39": 572.25, "kl_loss_7": 2707.2, "learning_rate": 3.882285294770937e-05, "loss": 3145.7, "step": 8750 }, { "ce_loss_13": 2.4146986842155456, "ce_loss_26": 1.8880977869033813, "ce_loss_39": 1.7063632160425186, "ce_loss_52": 1.403985047340393, "ce_loss_7": 2.741646242141724, "epoch": 0.876, "grad_norm": 14.018669111873198, "kl_loss_13": 2049.4, "kl_loss_26": 969.5, "kl_loss_39": 590.2, "kl_loss_7": 2731.6, "learning_rate": 3.821217650159453e-05, "loss": 3139.0, "step": 8760 }, { "ce_loss_13": 2.3532112538814545, "ce_loss_26": 1.8646484702825545, "ce_loss_39": 1.6983740404248238, "ce_loss_52": 1.4196315869688987, "ce_loss_7": 2.666028293967247, "epoch": 0.877, "grad_norm": 13.69993271737963, "kl_loss_13": 1919.2, "kl_loss_26": 890.8, "kl_loss_39": 542.85, "kl_loss_7": 2582.0, "learning_rate": 3.760615025373543e-05, "loss": 3109.35, "step": 8770 }, { "ce_loss_13": 2.4570236086845396, "ce_loss_26": 1.938426810503006, "ce_loss_39": 1.7564183056354523, "ce_loss_52": 1.453689630329609, "ce_loss_7": 2.787124717235565, "epoch": 0.878, "grad_norm": 14.764457343001293, "kl_loss_13": 2048.4, "kl_loss_26": 962.8, "kl_loss_39": 585.3, "kl_loss_7": 2732.8, "learning_rate": 3.700478030680987e-05, "loss": 3153.7, "step": 8780 }, { "ce_loss_13": 2.4401866495609283, "ce_loss_26": 1.9257715612649917, "ce_loss_39": 1.7440615922212601, "ce_loss_52": 1.4473457425832748, "ce_loss_7": 2.76454553604126, "epoch": 0.879, "grad_norm": 13.624797595400036, "kl_loss_13": 2038.4, "kl_loss_26": 958.5, "kl_loss_39": 579.85, "kl_loss_7": 2717.6, "learning_rate": 3.6408072716606344e-05, "loss": 3158.95, "step": 8790 }, { "ce_loss_13": 2.3960734605789185, "ce_loss_26": 1.8874068677425384, "ce_loss_39": 1.7073772728443146, "ce_loss_52": 1.4144951313734055, "ce_loss_7": 2.7204393565654756, "epoch": 0.88, "grad_norm": 13.565627756275363, "kl_loss_13": 2008.0, "kl_loss_26": 944.3, "kl_loss_39": 574.6, "kl_loss_7": 2694.0, "learning_rate": 3.5816033491963716e-05, "loss": 3127.45, "step": 8800 }, { "ce_loss_13": 2.426618826389313, "ce_loss_26": 1.921605721116066, "ce_loss_39": 1.7422660619020462, "ce_loss_52": 1.4409180462360383, "ce_loss_7": 2.7458843529224395, "epoch": 0.881, "grad_norm": 14.680252482648882, "kl_loss_13": 2012.6, "kl_loss_26": 955.2, "kl_loss_39": 587.45, "kl_loss_7": 2678.0, "learning_rate": 3.522866859471047e-05, "loss": 3106.075, "step": 8810 }, { "ce_loss_13": 2.4589924097061155, "ce_loss_26": 1.9394809186458588, "ce_loss_39": 1.7563898861408234, "ce_loss_52": 1.459865990281105, "ce_loss_7": 2.780824285745621, "epoch": 0.882, "grad_norm": 13.410206661523853, "kl_loss_13": 2065.4, "kl_loss_26": 975.8, "kl_loss_39": 594.35, "kl_loss_7": 2745.6, "learning_rate": 3.46459839396045e-05, "loss": 3162.5, "step": 8820 }, { "ce_loss_13": 2.444491392374039, "ce_loss_26": 1.919321459531784, "ce_loss_39": 1.7317459166049958, "ce_loss_52": 1.4241959005594254, "ce_loss_7": 2.7687501907348633, "epoch": 0.883, "grad_norm": 13.604154476294898, "kl_loss_13": 2078.6, "kl_loss_26": 980.1, "kl_loss_39": 596.05, "kl_loss_7": 2764.8, "learning_rate": 3.406798539427386e-05, "loss": 3137.15, "step": 8830 }, { "ce_loss_13": 2.4456652402877808, "ce_loss_26": 1.9403656631708146, "ce_loss_39": 1.7609369516372682, "ce_loss_52": 1.4744407176971435, "ce_loss_7": 2.762556844949722, "epoch": 0.884, "grad_norm": 14.0628693009416, "kl_loss_13": 1994.0, "kl_loss_26": 932.0, "kl_loss_39": 557.15, "kl_loss_7": 2667.2, "learning_rate": 3.349467877915746e-05, "loss": 3099.2, "step": 8840 }, { "ce_loss_13": 2.4576884746551513, "ce_loss_26": 1.9445500463247298, "ce_loss_39": 1.764642345905304, "ce_loss_52": 1.4638898521661758, "ce_loss_7": 2.7794412195682527, "epoch": 0.885, "grad_norm": 13.731348532638435, "kl_loss_13": 2048.2, "kl_loss_26": 971.4, "kl_loss_39": 596.3, "kl_loss_7": 2722.8, "learning_rate": 3.292606986744667e-05, "loss": 3152.675, "step": 8850 }, { "ce_loss_13": 2.4683742761611938, "ce_loss_26": 1.9468118786811828, "ce_loss_39": 1.7659433901309967, "ce_loss_52": 1.4723648518323897, "ce_loss_7": 2.794690328836441, "epoch": 0.886, "grad_norm": 14.768179589766811, "kl_loss_13": 2053.6, "kl_loss_26": 960.6, "kl_loss_39": 581.1, "kl_loss_7": 2740.0, "learning_rate": 3.23621643850267e-05, "loss": 3135.9, "step": 8860 }, { "ce_loss_13": 2.3716426849365235, "ce_loss_26": 1.8652270317077637, "ce_loss_39": 1.6864412546157836, "ce_loss_52": 1.4016476958990096, "ce_loss_7": 2.6954882085323333, "epoch": 0.887, "grad_norm": 13.861337667665934, "kl_loss_13": 1995.8, "kl_loss_26": 932.7, "kl_loss_39": 558.6, "kl_loss_7": 2668.8, "learning_rate": 3.180296801041971e-05, "loss": 3116.05, "step": 8870 }, { "ce_loss_13": 2.406647819280624, "ce_loss_26": 1.8954071879386902, "ce_loss_39": 1.7191426277160644, "ce_loss_52": 1.4318138241767884, "ce_loss_7": 2.7277746230363844, "epoch": 0.888, "grad_norm": 14.18696677740657, "kl_loss_13": 1999.0, "kl_loss_26": 937.5, "kl_loss_39": 568.1, "kl_loss_7": 2671.2, "learning_rate": 3.124848637472688e-05, "loss": 3120.85, "step": 8880 }, { "ce_loss_13": 2.4183617502450945, "ce_loss_26": 1.9087469071149825, "ce_loss_39": 1.731605476140976, "ce_loss_52": 1.4397760301828384, "ce_loss_7": 2.7426804542541503, "epoch": 0.889, "grad_norm": 14.005614784571057, "kl_loss_13": 2023.4, "kl_loss_26": 938.6, "kl_loss_39": 572.0, "kl_loss_7": 2704.0, "learning_rate": 3.069872506157212e-05, "loss": 3140.35, "step": 8890 }, { "ce_loss_13": 2.3538796246051787, "ce_loss_26": 1.8491210967302323, "ce_loss_39": 1.6793664902448655, "ce_loss_52": 1.4003751114010812, "ce_loss_7": 2.6818648397922518, "epoch": 0.89, "grad_norm": 13.507964025411896, "kl_loss_13": 1969.8, "kl_loss_26": 909.9, "kl_loss_39": 551.1, "kl_loss_7": 2655.2, "learning_rate": 3.0153689607045842e-05, "loss": 3115.9, "step": 8900 }, { "ce_loss_13": 2.3964832425117493, "ce_loss_26": 1.8831844747066497, "ce_loss_39": 1.7046507805585862, "ce_loss_52": 1.4187346428632737, "ce_loss_7": 2.72187722325325, "epoch": 0.891, "grad_norm": 14.258655617102612, "kl_loss_13": 2013.6, "kl_loss_26": 929.9, "kl_loss_39": 559.3, "kl_loss_7": 2705.2, "learning_rate": 2.9613385499648926e-05, "loss": 3133.85, "step": 8910 }, { "ce_loss_13": 2.3858442664146424, "ce_loss_26": 1.8839757442474365, "ce_loss_39": 1.709816351532936, "ce_loss_52": 1.430666272342205, "ce_loss_7": 2.6985366463661196, "epoch": 0.892, "grad_norm": 14.05296041871021, "kl_loss_13": 1977.6, "kl_loss_26": 918.7, "kl_loss_39": 552.4, "kl_loss_7": 2640.0, "learning_rate": 2.9077818180237692e-05, "loss": 3161.85, "step": 8920 }, { "ce_loss_13": 2.4120604634284972, "ce_loss_26": 1.9131643176078796, "ce_loss_39": 1.741806897521019, "ce_loss_52": 1.4555314972996711, "ce_loss_7": 2.7308483004570006, "epoch": 0.893, "grad_norm": 14.216852449574784, "kl_loss_13": 1989.0, "kl_loss_26": 931.9, "kl_loss_39": 565.55, "kl_loss_7": 2658.8, "learning_rate": 2.8546993041969172e-05, "loss": 3113.0, "step": 8930 }, { "ce_loss_13": 2.417782390117645, "ce_loss_26": 1.9075176060199737, "ce_loss_39": 1.7230383425951004, "ce_loss_52": 1.428551298379898, "ce_loss_7": 2.737876206636429, "epoch": 0.894, "grad_norm": 13.81624953017635, "kl_loss_13": 2024.2, "kl_loss_26": 956.0, "kl_loss_39": 579.25, "kl_loss_7": 2694.8, "learning_rate": 2.802091543024671e-05, "loss": 3118.3, "step": 8940 }, { "ce_loss_13": 2.4281566560268404, "ce_loss_26": 1.9209083169698715, "ce_loss_39": 1.7325717121362687, "ce_loss_52": 1.4276321291923524, "ce_loss_7": 2.7494910418987275, "epoch": 0.895, "grad_norm": 14.667622613471202, "kl_loss_13": 2068.0, "kl_loss_26": 993.0, "kl_loss_39": 599.95, "kl_loss_7": 2738.4, "learning_rate": 2.7499590642665774e-05, "loss": 3152.3, "step": 8950 }, { "ce_loss_13": 2.4123401612043383, "ce_loss_26": 1.9057573080062866, "ce_loss_39": 1.725106343626976, "ce_loss_52": 1.43211932182312, "ce_loss_7": 2.7356060326099394, "epoch": 0.896, "grad_norm": 13.84770444101258, "kl_loss_13": 2014.8, "kl_loss_26": 952.6, "kl_loss_39": 575.75, "kl_loss_7": 2696.2, "learning_rate": 2.6983023928961405e-05, "loss": 3131.5, "step": 8960 }, { "ce_loss_13": 2.3709884881973267, "ce_loss_26": 1.868075394630432, "ce_loss_39": 1.6942670613527298, "ce_loss_52": 1.3973538905382157, "ce_loss_7": 2.69813577234745, "epoch": 0.897, "grad_norm": 14.584597117704368, "kl_loss_13": 2004.8, "kl_loss_26": 940.7, "kl_loss_39": 576.3, "kl_loss_7": 2687.2, "learning_rate": 2.6471220490954628e-05, "loss": 3144.15, "step": 8970 }, { "ce_loss_13": 2.421136862039566, "ce_loss_26": 1.9213113605976104, "ce_loss_39": 1.7483066588640213, "ce_loss_52": 1.4667307168245316, "ce_loss_7": 2.7391393184661865, "epoch": 0.898, "grad_norm": 14.053042951615785, "kl_loss_13": 1968.0, "kl_loss_26": 914.9, "kl_loss_39": 554.05, "kl_loss_7": 2636.4, "learning_rate": 2.596418548250029e-05, "loss": 3077.8, "step": 8980 }, { "ce_loss_13": 2.383489468693733, "ce_loss_26": 1.8838744014501572, "ce_loss_39": 1.7122972816228867, "ce_loss_52": 1.4258252471685409, "ce_loss_7": 2.7002600908279417, "epoch": 0.899, "grad_norm": 13.786718301064743, "kl_loss_13": 1996.2, "kl_loss_26": 931.7, "kl_loss_39": 571.05, "kl_loss_7": 2666.4, "learning_rate": 2.5461924009435368e-05, "loss": 3075.25, "step": 8990 }, { "ce_loss_13": 2.3905730485916137, "ce_loss_26": 1.8808085292577743, "ce_loss_39": 1.7040841788053513, "ce_loss_52": 1.4255578130483628, "ce_loss_7": 2.7172752916812897, "epoch": 0.9, "grad_norm": 14.142930179347214, "kl_loss_13": 1983.0, "kl_loss_26": 912.3, "kl_loss_39": 546.25, "kl_loss_7": 2670.0, "learning_rate": 2.4964441129527336e-05, "loss": 3116.85, "step": 9000 }, { "ce_loss_13": 2.43511378467083, "ce_loss_26": 1.926012173295021, "ce_loss_39": 1.7415268182754517, "ce_loss_52": 1.4408889025449754, "ce_loss_7": 2.761294722557068, "epoch": 0.901, "grad_norm": 13.928144692729873, "kl_loss_13": 2050.6, "kl_loss_26": 980.3, "kl_loss_39": 592.65, "kl_loss_7": 2732.0, "learning_rate": 2.4471741852423235e-05, "loss": 3132.7, "step": 9010 }, { "ce_loss_13": 2.3582999795675277, "ce_loss_26": 1.8586651980876923, "ce_loss_39": 1.6817226380109787, "ce_loss_52": 1.3925694867968559, "ce_loss_7": 2.6693267047405245, "epoch": 0.902, "grad_norm": 14.116392246566738, "kl_loss_13": 1978.6, "kl_loss_26": 929.3, "kl_loss_39": 571.5, "kl_loss_7": 2638.8, "learning_rate": 2.3983831139599287e-05, "loss": 3114.75, "step": 9020 }, { "ce_loss_13": 2.418634516000748, "ce_loss_26": 1.9078166902065277, "ce_loss_39": 1.7249888181686401, "ce_loss_52": 1.4335400015115738, "ce_loss_7": 2.7412546992301943, "epoch": 0.903, "grad_norm": 13.637306812880459, "kl_loss_13": 2036.0, "kl_loss_26": 950.4, "kl_loss_39": 575.5, "kl_loss_7": 2710.0, "learning_rate": 2.3500713904311022e-05, "loss": 3133.35, "step": 9030 }, { "ce_loss_13": 2.3931309431791306, "ce_loss_26": 1.877245968580246, "ce_loss_39": 1.7018247723579407, "ce_loss_52": 1.4155418664216994, "ce_loss_7": 2.7214196979999543, "epoch": 0.904, "grad_norm": 14.785773850168622, "kl_loss_13": 2014.2, "kl_loss_26": 931.6, "kl_loss_39": 566.7, "kl_loss_7": 2697.2, "learning_rate": 2.3022395011543685e-05, "loss": 3107.4, "step": 9040 }, { "ce_loss_13": 2.4141552269458773, "ce_loss_26": 1.9003157913684845, "ce_loss_39": 1.7119427561759948, "ce_loss_52": 1.4218156844377519, "ce_loss_7": 2.738201731443405, "epoch": 0.905, "grad_norm": 15.004045135842484, "kl_loss_13": 2046.4, "kl_loss_26": 956.8, "kl_loss_39": 574.8, "kl_loss_7": 2728.0, "learning_rate": 2.2548879277963063e-05, "loss": 3131.35, "step": 9050 }, { "ce_loss_13": 2.4368073105812074, "ce_loss_26": 1.9358865648508072, "ce_loss_39": 1.758334356546402, "ce_loss_52": 1.4668044418096542, "ce_loss_7": 2.7515600681304933, "epoch": 0.906, "grad_norm": 14.409368784921233, "kl_loss_13": 1999.0, "kl_loss_26": 944.8, "kl_loss_39": 573.2, "kl_loss_7": 2666.4, "learning_rate": 2.208017147186736e-05, "loss": 3129.15, "step": 9060 }, { "ce_loss_13": 2.4492080837488173, "ce_loss_26": 1.9512592017650605, "ce_loss_39": 1.7678290545940398, "ce_loss_52": 1.4713279128074646, "ce_loss_7": 2.772859865427017, "epoch": 0.907, "grad_norm": 14.409868370647818, "kl_loss_13": 2027.6, "kl_loss_26": 962.9, "kl_loss_39": 580.75, "kl_loss_7": 2708.4, "learning_rate": 2.1616276313139227e-05, "loss": 3136.45, "step": 9070 }, { "ce_loss_13": 2.3629566222429275, "ce_loss_26": 1.866456887125969, "ce_loss_39": 1.692075565457344, "ce_loss_52": 1.4036286368966102, "ce_loss_7": 2.681915229558945, "epoch": 0.908, "grad_norm": 13.121489500516578, "kl_loss_13": 1981.6, "kl_loss_26": 934.0, "kl_loss_39": 568.15, "kl_loss_7": 2653.6, "learning_rate": 2.1157198473197415e-05, "loss": 3145.35, "step": 9080 }, { "ce_loss_13": 2.4373748511075974, "ce_loss_26": 1.9266161501407624, "ce_loss_39": 1.7425854057073593, "ce_loss_52": 1.4446089684963226, "ce_loss_7": 2.7682818710803985, "epoch": 0.909, "grad_norm": 14.230511024408004, "kl_loss_13": 2056.6, "kl_loss_26": 973.2, "kl_loss_39": 588.0, "kl_loss_7": 2750.4, "learning_rate": 2.0702942574950812e-05, "loss": 3127.5, "step": 9090 }, { "ce_loss_13": 2.4287337332963945, "ce_loss_26": 1.9172603338956833, "ce_loss_39": 1.7392981857061387, "ce_loss_52": 1.4425375372171403, "ce_loss_7": 2.7510289788246154, "epoch": 0.91, "grad_norm": 14.057406508919666, "kl_loss_13": 2046.6, "kl_loss_26": 964.4, "kl_loss_39": 593.1, "kl_loss_7": 2722.4, "learning_rate": 2.025351319275137e-05, "loss": 3121.6, "step": 9100 }, { "ce_loss_13": 2.4320779502391816, "ce_loss_26": 1.9104785054922104, "ce_loss_39": 1.7267427280545236, "ce_loss_52": 1.437354525923729, "ce_loss_7": 2.7588888108730316, "epoch": 0.911, "grad_norm": 14.001627192514695, "kl_loss_13": 2041.6, "kl_loss_26": 960.0, "kl_loss_39": 579.65, "kl_loss_7": 2727.2, "learning_rate": 1.9808914852347816e-05, "loss": 3132.0, "step": 9110 }, { "ce_loss_13": 2.4577192962169647, "ce_loss_26": 1.9406163454055787, "ce_loss_39": 1.7553843706846237, "ce_loss_52": 1.4540565699338912, "ce_loss_7": 2.7937385022640226, "epoch": 0.912, "grad_norm": 14.03191291932469, "kl_loss_13": 2055.4, "kl_loss_26": 970.4, "kl_loss_39": 590.75, "kl_loss_7": 2749.6, "learning_rate": 1.9369152030840554e-05, "loss": 3137.8, "step": 9120 }, { "ce_loss_13": 2.37432479262352, "ce_loss_26": 1.874118760228157, "ce_loss_39": 1.6963829159736634, "ce_loss_52": 1.4185152500867844, "ce_loss_7": 2.692077511548996, "epoch": 0.913, "grad_norm": 14.634301127161054, "kl_loss_13": 1966.6, "kl_loss_26": 913.3, "kl_loss_39": 546.15, "kl_loss_7": 2640.0, "learning_rate": 1.893422915663645e-05, "loss": 3130.35, "step": 9130 }, { "ce_loss_13": 2.485106924176216, "ce_loss_26": 1.9777852237224578, "ce_loss_39": 1.7995707392692566, "ce_loss_52": 1.4930359899997712, "ce_loss_7": 2.800886517763138, "epoch": 0.914, "grad_norm": 13.987291564294976, "kl_loss_13": 2050.4, "kl_loss_26": 985.4, "kl_loss_39": 605.6, "kl_loss_7": 2732.4, "learning_rate": 1.850415060940386e-05, "loss": 3103.7, "step": 9140 }, { "ce_loss_13": 2.4284686923027037, "ce_loss_26": 1.928224155306816, "ce_loss_39": 1.7489481002092362, "ce_loss_52": 1.4568757116794586, "ce_loss_7": 2.7469853341579435, "epoch": 0.915, "grad_norm": 14.256565267478091, "kl_loss_13": 2014.8, "kl_loss_26": 950.5, "kl_loss_39": 576.35, "kl_loss_7": 2683.2, "learning_rate": 1.8078920720028978e-05, "loss": 3089.85, "step": 9150 }, { "ce_loss_13": 2.397159770131111, "ce_loss_26": 1.9007071822881698, "ce_loss_39": 1.7246300727128983, "ce_loss_52": 1.4446155533194542, "ce_loss_7": 2.7181631565093993, "epoch": 0.916, "grad_norm": 14.893106945439357, "kl_loss_13": 1959.0, "kl_loss_26": 905.8, "kl_loss_39": 547.35, "kl_loss_7": 2633.6, "learning_rate": 1.765854377057219e-05, "loss": 3113.25, "step": 9160 }, { "ce_loss_13": 2.391612654924393, "ce_loss_26": 1.8863595336675645, "ce_loss_39": 1.7089181810617446, "ce_loss_52": 1.4133323535323143, "ce_loss_7": 2.7117814838886263, "epoch": 0.917, "grad_norm": 13.727089751993333, "kl_loss_13": 2026.6, "kl_loss_26": 952.4, "kl_loss_39": 579.55, "kl_loss_7": 2699.2, "learning_rate": 1.724302399422456e-05, "loss": 3114.9, "step": 9170 }, { "ce_loss_13": 2.3898652464151384, "ce_loss_26": 1.8965242326259613, "ce_loss_39": 1.7190593391656876, "ce_loss_52": 1.4282272264361382, "ce_loss_7": 2.70514101088047, "epoch": 0.918, "grad_norm": 14.466760453933187, "kl_loss_13": 1964.6, "kl_loss_26": 920.2, "kl_loss_39": 557.4, "kl_loss_7": 2637.2, "learning_rate": 1.683236557526574e-05, "loss": 3117.7, "step": 9180 }, { "ce_loss_13": 2.363905116915703, "ce_loss_26": 1.8732207268476486, "ce_loss_39": 1.6985140055418015, "ce_loss_52": 1.4114336684346198, "ce_loss_7": 2.6808120787143705, "epoch": 0.919, "grad_norm": 13.740003598916001, "kl_loss_13": 1964.6, "kl_loss_26": 921.6, "kl_loss_39": 557.8, "kl_loss_7": 2632.0, "learning_rate": 1.6426572649021475e-05, "loss": 3121.6, "step": 9190 }, { "ce_loss_13": 2.4235911548137663, "ce_loss_26": 1.911160832643509, "ce_loss_39": 1.7327239394187928, "ce_loss_52": 1.4489689737558364, "ce_loss_7": 2.740164947509766, "epoch": 0.92, "grad_norm": 14.26925795970607, "kl_loss_13": 2008.2, "kl_loss_26": 933.2, "kl_loss_39": 559.8, "kl_loss_7": 2678.0, "learning_rate": 1.6025649301821876e-05, "loss": 3113.6, "step": 9200 }, { "ce_loss_13": 2.4683689922094345, "ce_loss_26": 1.9488540649414063, "ce_loss_39": 1.7631026744842528, "ce_loss_52": 1.4620952308177948, "ce_loss_7": 2.794441765546799, "epoch": 0.921, "grad_norm": 14.076823047271347, "kl_loss_13": 2051.4, "kl_loss_26": 970.8, "kl_loss_39": 588.7, "kl_loss_7": 2729.2, "learning_rate": 1.5629599570960716e-05, "loss": 3104.8, "step": 9210 }, { "ce_loss_13": 2.3585807204246523, "ce_loss_26": 1.8531203657388686, "ce_loss_39": 1.6820847302675248, "ce_loss_52": 1.4014029562473298, "ce_loss_7": 2.6779536455869675, "epoch": 0.922, "grad_norm": 13.947276602522905, "kl_loss_13": 1987.2, "kl_loss_26": 913.9, "kl_loss_39": 557.25, "kl_loss_7": 2664.4, "learning_rate": 1.5238427444654367e-05, "loss": 3096.2, "step": 9220 }, { "ce_loss_13": 2.3690007477998734, "ce_loss_26": 1.8584237039089202, "ce_loss_39": 1.6857000291347504, "ce_loss_52": 1.3957198202610015, "ce_loss_7": 2.697771596908569, "epoch": 0.923, "grad_norm": 13.90974821940606, "kl_loss_13": 2016.4, "kl_loss_26": 937.5, "kl_loss_39": 574.65, "kl_loss_7": 2710.0, "learning_rate": 1.4852136862001764e-05, "loss": 3120.95, "step": 9230 }, { "ce_loss_13": 2.386936154961586, "ce_loss_26": 1.8795882225036622, "ce_loss_39": 1.7048650175333022, "ce_loss_52": 1.4252549767494203, "ce_loss_7": 2.7094414860010145, "epoch": 0.924, "grad_norm": 14.075158349443141, "kl_loss_13": 1994.4, "kl_loss_26": 926.6, "kl_loss_39": 558.55, "kl_loss_7": 2674.0, "learning_rate": 1.4470731712944884e-05, "loss": 3095.9, "step": 9240 }, { "ce_loss_13": 2.480703926086426, "ce_loss_26": 1.9693680822849273, "ce_loss_39": 1.7812021166086196, "ce_loss_52": 1.4721150636672973, "ce_loss_7": 2.808130669593811, "epoch": 0.925, "grad_norm": 13.823292097408816, "kl_loss_13": 2057.0, "kl_loss_26": 989.9, "kl_loss_39": 602.9, "kl_loss_7": 2740.4, "learning_rate": 1.4094215838229174e-05, "loss": 3114.1, "step": 9250 }, { "ce_loss_13": 2.415296331048012, "ce_loss_26": 1.9087094902992248, "ce_loss_39": 1.7277994453907013, "ce_loss_52": 1.4381081372499467, "ce_loss_7": 2.738765448331833, "epoch": 0.926, "grad_norm": 14.170973390394844, "kl_loss_13": 2029.4, "kl_loss_26": 949.8, "kl_loss_39": 570.55, "kl_loss_7": 2714.0, "learning_rate": 1.372259302936546e-05, "loss": 3105.95, "step": 9260 }, { "ce_loss_13": 2.348677235841751, "ce_loss_26": 1.8576316490769387, "ce_loss_39": 1.6828459605574608, "ce_loss_52": 1.3921316027641297, "ce_loss_7": 2.6635967582464217, "epoch": 0.927, "grad_norm": 14.119512693749174, "kl_loss_13": 1987.8, "kl_loss_26": 934.2, "kl_loss_39": 566.95, "kl_loss_7": 2660.0, "learning_rate": 1.3355867028591206e-05, "loss": 3097.55, "step": 9270 }, { "ce_loss_13": 2.3999607056379317, "ce_loss_26": 1.8902852058410644, "ce_loss_39": 1.7127097964286804, "ce_loss_52": 1.4248219341039658, "ce_loss_7": 2.7230198085308075, "epoch": 0.928, "grad_norm": 14.447160188213138, "kl_loss_13": 1992.2, "kl_loss_26": 927.2, "kl_loss_39": 561.6, "kl_loss_7": 2679.2, "learning_rate": 1.2994041528833267e-05, "loss": 3090.65, "step": 9280 }, { "ce_loss_13": 2.502937263250351, "ce_loss_26": 1.9786556929349899, "ce_loss_39": 1.7848447173833848, "ce_loss_52": 1.4703152477741241, "ce_loss_7": 2.837230235338211, "epoch": 0.929, "grad_norm": 13.618067390259025, "kl_loss_13": 2119.0, "kl_loss_26": 1012.1, "kl_loss_39": 614.9, "kl_loss_7": 2810.8, "learning_rate": 1.2637120173670358e-05, "loss": 3139.05, "step": 9290 }, { "ce_loss_13": 2.4561884820461275, "ce_loss_26": 1.9343136429786683, "ce_loss_39": 1.747347640991211, "ce_loss_52": 1.4338387340307235, "ce_loss_7": 2.7828237235546114, "epoch": 0.93, "grad_norm": 14.04284428447472, "kl_loss_13": 2098.8, "kl_loss_26": 1005.3, "kl_loss_39": 621.35, "kl_loss_7": 2786.4, "learning_rate": 1.2285106557296478e-05, "loss": 3143.8, "step": 9300 }, { "ce_loss_13": 2.3713702976703646, "ce_loss_26": 1.868654829263687, "ce_loss_39": 1.6906698912382125, "ce_loss_52": 1.409242296218872, "ce_loss_7": 2.7044317960739135, "epoch": 0.931, "grad_norm": 14.323129542469303, "kl_loss_13": 2006.0, "kl_loss_26": 926.0, "kl_loss_39": 564.0, "kl_loss_7": 2676.4, "learning_rate": 1.1938004224484989e-05, "loss": 3116.75, "step": 9310 }, { "ce_loss_13": 2.428625673055649, "ce_loss_26": 1.9227415055036545, "ce_loss_39": 1.7408353060483932, "ce_loss_52": 1.4440293073654176, "ce_loss_7": 2.7480487704277037, "epoch": 0.932, "grad_norm": 13.286547502225938, "kl_loss_13": 2016.8, "kl_loss_26": 948.6, "kl_loss_39": 573.45, "kl_loss_7": 2695.6, "learning_rate": 1.1595816670552429e-05, "loss": 3098.9, "step": 9320 }, { "ce_loss_13": 2.3783950984477995, "ce_loss_26": 1.871030893921852, "ce_loss_39": 1.6935174107551574, "ce_loss_52": 1.4064364448189735, "ce_loss_7": 2.709024131298065, "epoch": 0.933, "grad_norm": 13.906014402683894, "kl_loss_13": 2025.8, "kl_loss_26": 943.2, "kl_loss_39": 573.45, "kl_loss_7": 2710.0, "learning_rate": 1.1258547341323699e-05, "loss": 3112.75, "step": 9330 }, { "ce_loss_13": 2.418292981386185, "ce_loss_26": 1.9166706264019013, "ce_loss_39": 1.7358173072338103, "ce_loss_52": 1.4464493066072464, "ce_loss_7": 2.7368280410766603, "epoch": 0.934, "grad_norm": 13.814674823123692, "kl_loss_13": 2007.6, "kl_loss_26": 936.8, "kl_loss_39": 570.05, "kl_loss_7": 2680.4, "learning_rate": 1.0926199633097156e-05, "loss": 3089.9, "step": 9340 }, { "ce_loss_13": 2.421270787715912, "ce_loss_26": 1.9069751173257827, "ce_loss_39": 1.7246074616909026, "ce_loss_52": 1.4261210292577744, "ce_loss_7": 2.7438779413700103, "epoch": 0.935, "grad_norm": 14.043137766458962, "kl_loss_13": 2039.4, "kl_loss_26": 967.3, "kl_loss_39": 585.6, "kl_loss_7": 2722.8, "learning_rate": 1.0598776892610684e-05, "loss": 3103.7, "step": 9350 }, { "ce_loss_13": 2.4761788189411162, "ce_loss_26": 1.9670240104198455, "ce_loss_39": 1.7913133591413497, "ce_loss_52": 1.5009067565202714, "ce_loss_7": 2.7962326526641847, "epoch": 0.936, "grad_norm": 13.716488729093573, "kl_loss_13": 2008.0, "kl_loss_26": 945.5, "kl_loss_39": 573.65, "kl_loss_7": 2685.6, "learning_rate": 1.0276282417007399e-05, "loss": 3106.15, "step": 9360 }, { "ce_loss_13": 2.418415975570679, "ce_loss_26": 1.916797822713852, "ce_loss_39": 1.741264235973358, "ce_loss_52": 1.452787458896637, "ce_loss_7": 2.744573098421097, "epoch": 0.937, "grad_norm": 13.43858517345926, "kl_loss_13": 1999.2, "kl_loss_26": 936.2, "kl_loss_39": 569.25, "kl_loss_7": 2682.0, "learning_rate": 9.958719453803277e-06, "loss": 3109.9, "step": 9370 }, { "ce_loss_13": 2.3895679712295532, "ce_loss_26": 1.8777837812900544, "ce_loss_39": 1.6948266059160233, "ce_loss_52": 1.4046757638454437, "ce_loss_7": 2.7213546216487883, "epoch": 0.938, "grad_norm": 14.159970740647127, "kl_loss_13": 2015.8, "kl_loss_26": 945.0, "kl_loss_39": 569.8, "kl_loss_7": 2706.4, "learning_rate": 9.646091200853802e-06, "loss": 3110.975, "step": 9380 }, { "ce_loss_13": 2.389327567815781, "ce_loss_26": 1.8952437072992325, "ce_loss_39": 1.7221183687448502, "ce_loss_52": 1.4358048617839814, "ce_loss_7": 2.7049793720245363, "epoch": 0.939, "grad_norm": 13.344604295880838, "kl_loss_13": 1964.8, "kl_loss_26": 921.8, "kl_loss_39": 560.1, "kl_loss_7": 2638.0, "learning_rate": 9.338400806321978e-06, "loss": 3087.6, "step": 9390 }, { "ce_loss_13": 2.4069793194532396, "ce_loss_26": 1.8967778533697128, "ce_loss_39": 1.7197368562221527, "ce_loss_52": 1.436858707666397, "ce_loss_7": 2.7245797514915466, "epoch": 0.94, "grad_norm": 13.71832049440062, "kl_loss_13": 2002.0, "kl_loss_26": 937.3, "kl_loss_39": 566.65, "kl_loss_7": 2677.2, "learning_rate": 9.035651368646646e-06, "loss": 3124.5, "step": 9400 }, { "ce_loss_13": 2.3785787016153335, "ce_loss_26": 1.8779745906591416, "ce_loss_39": 1.7003175497055054, "ce_loss_52": 1.4152926355600357, "ce_loss_7": 2.6983864098787307, "epoch": 0.941, "grad_norm": 14.37750392989465, "kl_loss_13": 1983.0, "kl_loss_26": 932.4, "kl_loss_39": 563.55, "kl_loss_7": 2649.2, "learning_rate": 8.737845936511335e-06, "loss": 3126.15, "step": 9410 }, { "ce_loss_13": 2.4120055079460143, "ce_loss_26": 1.898421436548233, "ce_loss_39": 1.7264380306005478, "ce_loss_52": 1.4368474900722503, "ce_loss_7": 2.737997555732727, "epoch": 0.942, "grad_norm": 13.918132865929852, "kl_loss_13": 2037.6, "kl_loss_26": 952.9, "kl_loss_39": 578.25, "kl_loss_7": 2716.0, "learning_rate": 8.444987508813451e-06, "loss": 3086.75, "step": 9420 }, { "ce_loss_13": 2.4210041254758834, "ce_loss_26": 1.9090048849582673, "ce_loss_39": 1.732128456234932, "ce_loss_52": 1.4372442662715912, "ce_loss_7": 2.743331879377365, "epoch": 0.943, "grad_norm": 13.763467997379733, "kl_loss_13": 2025.8, "kl_loss_26": 953.0, "kl_loss_39": 582.75, "kl_loss_7": 2709.6, "learning_rate": 8.157079034633974e-06, "loss": 3102.0, "step": 9430 }, { "ce_loss_13": 2.390827241539955, "ce_loss_26": 1.8993241131305694, "ce_loss_39": 1.720950961112976, "ce_loss_52": 1.4380876436829566, "ce_loss_7": 2.708938491344452, "epoch": 0.944, "grad_norm": 13.65095927372199, "kl_loss_13": 1961.4, "kl_loss_26": 921.2, "kl_loss_39": 551.1, "kl_loss_7": 2626.8, "learning_rate": 7.874123413208145e-06, "loss": 3097.4, "step": 9440 }, { "ce_loss_13": 2.377914309501648, "ce_loss_26": 1.8749269813299179, "ce_loss_39": 1.699908110499382, "ce_loss_52": 1.4185950323939323, "ce_loss_7": 2.700740724802017, "epoch": 0.945, "grad_norm": 13.057374043926089, "kl_loss_13": 2000.2, "kl_loss_26": 931.2, "kl_loss_39": 558.45, "kl_loss_7": 2681.6, "learning_rate": 7.59612349389599e-06, "loss": 3113.0, "step": 9450 }, { "ce_loss_13": 2.428489762544632, "ce_loss_26": 1.9228288322687148, "ce_loss_39": 1.7434315174818038, "ce_loss_52": 1.4502023369073869, "ce_loss_7": 2.745371562242508, "epoch": 0.946, "grad_norm": 13.467471129956634, "kl_loss_13": 2003.8, "kl_loss_26": 954.8, "kl_loss_39": 579.9, "kl_loss_7": 2674.8, "learning_rate": 7.323082076153509e-06, "loss": 3110.35, "step": 9460 }, { "ce_loss_13": 2.3993911921977995, "ce_loss_26": 1.8969668239355086, "ce_loss_39": 1.7201234728097916, "ce_loss_52": 1.4261724770069122, "ce_loss_7": 2.723515260219574, "epoch": 0.947, "grad_norm": 14.123562965002943, "kl_loss_13": 1995.2, "kl_loss_26": 940.5, "kl_loss_39": 572.85, "kl_loss_7": 2665.2, "learning_rate": 7.055001909504755e-06, "loss": 3103.5, "step": 9470 }, { "ce_loss_13": 2.371204599738121, "ce_loss_26": 1.8650381177663804, "ce_loss_39": 1.6901476740837098, "ce_loss_52": 1.4073528528213501, "ce_loss_7": 2.69525728225708, "epoch": 0.948, "grad_norm": 13.7395501584386, "kl_loss_13": 2001.2, "kl_loss_26": 926.3, "kl_loss_39": 559.35, "kl_loss_7": 2675.2, "learning_rate": 6.791885693514133e-06, "loss": 3117.9, "step": 9480 }, { "ce_loss_13": 2.3874946534633636, "ce_loss_26": 1.8710165858268737, "ce_loss_39": 1.6918294131755829, "ce_loss_52": 1.4066254168748855, "ce_loss_7": 2.718043899536133, "epoch": 0.949, "grad_norm": 14.444149745541107, "kl_loss_13": 2026.6, "kl_loss_26": 936.5, "kl_loss_39": 563.35, "kl_loss_7": 2720.4, "learning_rate": 6.533736077758867e-06, "loss": 3144.35, "step": 9490 }, { "ce_loss_13": 2.3802697598934173, "ce_loss_26": 1.8696208387613296, "ce_loss_39": 1.6913905203342439, "ce_loss_52": 1.4053042978048325, "ce_loss_7": 2.706346648931503, "epoch": 0.95, "grad_norm": 13.683323615533283, "kl_loss_13": 2001.6, "kl_loss_26": 931.6, "kl_loss_39": 565.45, "kl_loss_7": 2685.6, "learning_rate": 6.2805556618028556e-06, "loss": 3132.15, "step": 9500 }, { "ce_loss_13": 2.45430488884449, "ce_loss_26": 1.9479888796806335, "ce_loss_39": 1.7709876328706742, "ce_loss_52": 1.4815271288156509, "ce_loss_7": 2.7643910527229307, "epoch": 0.951, "grad_norm": 14.559144572280042, "kl_loss_13": 2000.2, "kl_loss_26": 946.3, "kl_loss_39": 575.65, "kl_loss_7": 2660.0, "learning_rate": 6.032346995169968e-06, "loss": 3124.85, "step": 9510 }, { "ce_loss_13": 2.4814863801002502, "ce_loss_26": 1.976859924197197, "ce_loss_39": 1.795655995607376, "ce_loss_52": 1.4911505609750748, "ce_loss_7": 2.796732819080353, "epoch": 0.952, "grad_norm": 14.307827869099665, "kl_loss_13": 2052.0, "kl_loss_26": 982.8, "kl_loss_39": 599.65, "kl_loss_7": 2722.8, "learning_rate": 5.789112577318789e-06, "loss": 3131.25, "step": 9520 }, { "ce_loss_13": 2.370393967628479, "ce_loss_26": 1.8703551948070527, "ce_loss_39": 1.6911178916692733, "ce_loss_52": 1.397288253903389, "ce_loss_7": 2.6921759128570555, "epoch": 0.953, "grad_norm": 13.532268906242434, "kl_loss_13": 2004.2, "kl_loss_26": 951.2, "kl_loss_39": 578.45, "kl_loss_7": 2674.4, "learning_rate": 5.550854857617194e-06, "loss": 3093.1, "step": 9530 }, { "ce_loss_13": 2.3659786969423293, "ce_loss_26": 1.855891814827919, "ce_loss_39": 1.6768086194992065, "ce_loss_52": 1.3908632963895797, "ce_loss_7": 2.6853357315063477, "epoch": 0.954, "grad_norm": 14.753075788642166, "kl_loss_13": 2010.2, "kl_loss_26": 935.5, "kl_loss_39": 563.55, "kl_loss_7": 2681.6, "learning_rate": 5.317576235317756e-06, "loss": 3120.8, "step": 9540 }, { "ce_loss_13": 2.4337273120880125, "ce_loss_26": 1.9248733311891555, "ce_loss_39": 1.7499325275421143, "ce_loss_52": 1.4674480736255646, "ce_loss_7": 2.7530871987342835, "epoch": 0.955, "grad_norm": 13.311149092546296, "kl_loss_13": 1991.6, "kl_loss_26": 925.5, "kl_loss_39": 562.15, "kl_loss_7": 2663.6, "learning_rate": 5.089279059533658e-06, "loss": 3080.15, "step": 9550 }, { "ce_loss_13": 2.477786514163017, "ce_loss_26": 1.9535741955041885, "ce_loss_39": 1.7720552951097488, "ce_loss_52": 1.468274374306202, "ce_loss_7": 2.806012988090515, "epoch": 0.956, "grad_norm": 13.557904970493434, "kl_loss_13": 2082.8, "kl_loss_26": 980.6, "kl_loss_39": 601.5, "kl_loss_7": 2773.6, "learning_rate": 4.865965629214819e-06, "loss": 3106.7, "step": 9560 }, { "ce_loss_13": 2.4565936863422393, "ce_loss_26": 1.9573397368192673, "ce_loss_39": 1.773080477118492, "ce_loss_52": 1.4730938911437987, "ce_loss_7": 2.7732574224472044, "epoch": 0.957, "grad_norm": 14.224632183872556, "kl_loss_13": 2023.4, "kl_loss_26": 967.5, "kl_loss_39": 591.9, "kl_loss_7": 2693.2, "learning_rate": 4.6476381931251366e-06, "loss": 3121.6, "step": 9570 }, { "ce_loss_13": 2.3847708880901335, "ce_loss_26": 1.8858718812465667, "ce_loss_39": 1.7032645136117934, "ce_loss_52": 1.4209994703531266, "ce_loss_7": 2.708657431602478, "epoch": 0.958, "grad_norm": 13.699065805322247, "kl_loss_13": 1983.8, "kl_loss_26": 921.9, "kl_loss_39": 551.45, "kl_loss_7": 2664.4, "learning_rate": 4.434298949819449e-06, "loss": 3097.9, "step": 9580 }, { "ce_loss_13": 2.417462554574013, "ce_loss_26": 1.9159747958183289, "ce_loss_39": 1.7397230744361878, "ce_loss_52": 1.4456302881240846, "ce_loss_7": 2.7354709684848784, "epoch": 0.959, "grad_norm": 13.043144407859455, "kl_loss_13": 1990.0, "kl_loss_26": 941.8, "kl_loss_39": 574.8, "kl_loss_7": 2662.4, "learning_rate": 4.2259500476214406e-06, "loss": 3095.8, "step": 9590 }, { "ce_loss_13": 2.427861177921295, "ce_loss_26": 1.9228525012731552, "ce_loss_39": 1.7392638593912124, "ce_loss_52": 1.445976984500885, "ce_loss_7": 2.7459777116775514, "epoch": 0.96, "grad_norm": 13.619745852010974, "kl_loss_13": 2021.4, "kl_loss_26": 954.5, "kl_loss_39": 577.9, "kl_loss_7": 2698.8, "learning_rate": 4.02259358460233e-06, "loss": 3122.9, "step": 9600 }, { "ce_loss_13": 2.46402502655983, "ce_loss_26": 1.9587235629558564, "ce_loss_39": 1.7794159650802612, "ce_loss_52": 1.4795134991407395, "ce_loss_7": 2.7920902401208876, "epoch": 0.961, "grad_norm": 13.957824607337622, "kl_loss_13": 2034.8, "kl_loss_26": 964.3, "kl_loss_39": 588.35, "kl_loss_7": 2719.6, "learning_rate": 3.8242316085594916e-06, "loss": 3106.9, "step": 9610 }, { "ce_loss_13": 2.4026571094989775, "ce_loss_26": 1.8842627108097076, "ce_loss_39": 1.6962791502475738, "ce_loss_52": 1.4012041926383971, "ce_loss_7": 2.7310706257820128, "epoch": 0.962, "grad_norm": 13.847893098332637, "kl_loss_13": 2054.8, "kl_loss_26": 965.5, "kl_loss_39": 579.7, "kl_loss_7": 2743.2, "learning_rate": 3.630866116995757e-06, "loss": 3149.6, "step": 9620 }, { "ce_loss_13": 2.3699823945760725, "ce_loss_26": 1.8775872141122818, "ce_loss_39": 1.7078934848308562, "ce_loss_52": 1.427069191634655, "ce_loss_7": 2.68268860578537, "epoch": 0.963, "grad_norm": 14.006927808523123, "kl_loss_13": 1949.4, "kl_loss_26": 909.6, "kl_loss_39": 550.8, "kl_loss_7": 2613.6, "learning_rate": 3.4424990570994797e-06, "loss": 3088.75, "step": 9630 }, { "ce_loss_13": 2.4296331614255906, "ce_loss_26": 1.923089200258255, "ce_loss_39": 1.7481043189764023, "ce_loss_52": 1.458402395248413, "ce_loss_7": 2.7494013249874114, "epoch": 0.964, "grad_norm": 13.971614829348757, "kl_loss_13": 1997.2, "kl_loss_26": 933.6, "kl_loss_39": 570.15, "kl_loss_7": 2669.6, "learning_rate": 3.2591323257248896e-06, "loss": 3114.2, "step": 9640 }, { "ce_loss_13": 2.4253605216741563, "ce_loss_26": 1.920077031850815, "ce_loss_39": 1.7424649715423584, "ce_loss_52": 1.4570231169462204, "ce_loss_7": 2.744251537322998, "epoch": 0.965, "grad_norm": 13.868560987861438, "kl_loss_13": 2000.2, "kl_loss_26": 937.9, "kl_loss_39": 567.9, "kl_loss_7": 2672.0, "learning_rate": 3.0807677693729385e-06, "loss": 3117.15, "step": 9650 }, { "ce_loss_13": 2.445168226957321, "ce_loss_26": 1.9308179676532746, "ce_loss_39": 1.7556968212127686, "ce_loss_52": 1.462259876728058, "ce_loss_7": 2.7607338547706606, "epoch": 0.966, "grad_norm": 13.868630790191208, "kl_loss_13": 2030.0, "kl_loss_26": 955.1, "kl_loss_39": 582.55, "kl_loss_7": 2710.4, "learning_rate": 2.9074071841727055e-06, "loss": 3136.15, "step": 9660 }, { "ce_loss_13": 2.3746090680360794, "ce_loss_26": 1.8696930974721908, "ce_loss_39": 1.6932952284812928, "ce_loss_52": 1.4037827536463738, "ce_loss_7": 2.6961917489767075, "epoch": 0.967, "grad_norm": 13.741681034653173, "kl_loss_13": 2003.4, "kl_loss_26": 943.6, "kl_loss_39": 570.8, "kl_loss_7": 2675.6, "learning_rate": 2.739052315863355e-06, "loss": 3123.325, "step": 9670 }, { "ce_loss_13": 2.4609276592731475, "ce_loss_26": 1.9465218961238862, "ce_loss_39": 1.755302396416664, "ce_loss_52": 1.4513660728931428, "ce_loss_7": 2.783074140548706, "epoch": 0.968, "grad_norm": 13.849931878044563, "kl_loss_13": 2075.0, "kl_loss_26": 988.8, "kl_loss_39": 596.8, "kl_loss_7": 2755.6, "learning_rate": 2.5757048597765396e-06, "loss": 3108.55, "step": 9680 }, { "ce_loss_13": 2.3671315789222716, "ce_loss_26": 1.8652487874031067, "ce_loss_39": 1.6895152300596237, "ce_loss_52": 1.4115738093852996, "ce_loss_7": 2.6859952569007874, "epoch": 0.969, "grad_norm": 14.271644916152136, "kl_loss_13": 1975.0, "kl_loss_26": 910.5, "kl_loss_39": 544.7, "kl_loss_7": 2649.2, "learning_rate": 2.417366460819359e-06, "loss": 3094.15, "step": 9690 }, { "ce_loss_13": 2.4030214190483092, "ce_loss_26": 1.8979378938674927, "ce_loss_39": 1.7221683353185653, "ce_loss_52": 1.4381350710988046, "ce_loss_7": 2.723929351568222, "epoch": 0.97, "grad_norm": 13.991735266099393, "kl_loss_13": 1985.0, "kl_loss_26": 927.7, "kl_loss_39": 554.55, "kl_loss_7": 2656.0, "learning_rate": 2.2640387134577057e-06, "loss": 3121.05, "step": 9700 }, { "ce_loss_13": 2.388926792144775, "ce_loss_26": 1.8821999937295915, "ce_loss_39": 1.7065987050533296, "ce_loss_52": 1.427994754910469, "ce_loss_7": 2.7048760533332823, "epoch": 0.971, "grad_norm": 14.179862402525806, "kl_loss_13": 1973.6, "kl_loss_26": 911.0, "kl_loss_39": 548.95, "kl_loss_7": 2646.4, "learning_rate": 2.115723161700278e-06, "loss": 3136.0, "step": 9710 }, { "ce_loss_13": 2.448368564248085, "ce_loss_26": 1.9373161673545838, "ce_loss_39": 1.7496683716773986, "ce_loss_52": 1.4501112252473831, "ce_loss_7": 2.7681332349777223, "epoch": 0.972, "grad_norm": 13.312305733104958, "kl_loss_13": 2063.8, "kl_loss_26": 986.5, "kl_loss_39": 593.8, "kl_loss_7": 2737.6, "learning_rate": 1.9724212990830937e-06, "loss": 3096.45, "step": 9720 }, { "ce_loss_13": 2.40165196955204, "ce_loss_26": 1.9097970753908158, "ce_loss_39": 1.732149314880371, "ce_loss_52": 1.4418910443782806, "ce_loss_7": 2.7182520925998688, "epoch": 0.973, "grad_norm": 13.186413860645287, "kl_loss_13": 1986.6, "kl_loss_26": 939.2, "kl_loss_39": 575.8, "kl_loss_7": 2649.6, "learning_rate": 1.8341345686543331e-06, "loss": 3096.7, "step": 9730 }, { "ce_loss_13": 2.4751327097415925, "ce_loss_26": 1.9718121886253357, "ce_loss_39": 1.7938049882650375, "ce_loss_52": 1.510325726866722, "ce_loss_7": 2.790462166070938, "epoch": 0.974, "grad_norm": 13.542368115510277, "kl_loss_13": 1991.0, "kl_loss_26": 919.4, "kl_loss_39": 555.25, "kl_loss_7": 2656.0, "learning_rate": 1.7008643629596864e-06, "loss": 3139.35, "step": 9740 }, { "ce_loss_13": 2.448350805044174, "ce_loss_26": 1.9393187165260315, "ce_loss_39": 1.756307190656662, "ce_loss_52": 1.4625631257891656, "ce_loss_7": 2.7839869439601896, "epoch": 0.975, "grad_norm": 14.153531935288711, "kl_loss_13": 2030.2, "kl_loss_26": 953.5, "kl_loss_39": 573.5, "kl_loss_7": 2731.6, "learning_rate": 1.5726120240288633e-06, "loss": 3091.65, "step": 9750 }, { "ce_loss_13": 2.493607670068741, "ce_loss_26": 1.9704292267560959, "ce_loss_39": 1.7810406684875488, "ce_loss_52": 1.4734347879886627, "ce_loss_7": 2.8225875020027162, "epoch": 0.976, "grad_norm": 13.917924014106907, "kl_loss_13": 2092.8, "kl_loss_26": 995.9, "kl_loss_39": 599.65, "kl_loss_7": 2777.6, "learning_rate": 1.4493788433612708e-06, "loss": 3106.15, "step": 9760 }, { "ce_loss_13": 2.389453822374344, "ce_loss_26": 1.889643257856369, "ce_loss_39": 1.7158276617527009, "ce_loss_52": 1.4275161743164062, "ce_loss_7": 2.7086060285568236, "epoch": 0.977, "grad_norm": 13.614062458586098, "kl_loss_13": 1971.2, "kl_loss_26": 924.0, "kl_loss_39": 561.0, "kl_loss_7": 2646.0, "learning_rate": 1.3311660619138578e-06, "loss": 3083.9, "step": 9770 }, { "ce_loss_13": 2.387269985675812, "ce_loss_26": 1.8734027475118638, "ce_loss_39": 1.6912487357854844, "ce_loss_52": 1.4031418770551682, "ce_loss_7": 2.7191080808639527, "epoch": 0.978, "grad_norm": 14.36678395992836, "kl_loss_13": 2023.2, "kl_loss_26": 943.6, "kl_loss_39": 565.3, "kl_loss_7": 2722.0, "learning_rate": 1.2179748700879012e-06, "loss": 3100.55, "step": 9780 }, { "ce_loss_13": 2.3631068110466003, "ce_loss_26": 1.861675202846527, "ce_loss_39": 1.6827853351831437, "ce_loss_52": 1.3983743026852609, "ce_loss_7": 2.6796926259994507, "epoch": 0.979, "grad_norm": 14.06766234963321, "kl_loss_13": 1993.6, "kl_loss_26": 930.1, "kl_loss_39": 561.8, "kl_loss_7": 2668.4, "learning_rate": 1.1098064077174619e-06, "loss": 3119.95, "step": 9790 }, { "ce_loss_13": 2.4609683632850645, "ce_loss_26": 1.948616126179695, "ce_loss_39": 1.7647934973239898, "ce_loss_52": 1.454009547829628, "ce_loss_7": 2.785689663887024, "epoch": 0.98, "grad_norm": 13.39999724987333, "kl_loss_13": 2070.0, "kl_loss_26": 989.6, "kl_loss_39": 607.8, "kl_loss_7": 2749.6, "learning_rate": 1.006661764057837e-06, "loss": 3101.0, "step": 9800 }, { "ce_loss_13": 2.3849492847919462, "ce_loss_26": 1.8622053205966949, "ce_loss_39": 1.6866901487112045, "ce_loss_52": 1.3904988124966622, "ce_loss_7": 2.713945233821869, "epoch": 0.981, "grad_norm": 13.776704297392317, "kl_loss_13": 2060.2, "kl_loss_26": 959.2, "kl_loss_39": 584.2, "kl_loss_7": 2753.2, "learning_rate": 9.085419777743465e-07, "loss": 3145.375, "step": 9810 }, { "ce_loss_13": 2.423547920584679, "ce_loss_26": 1.9212449431419372, "ce_loss_39": 1.7484615802764893, "ce_loss_52": 1.4525489255785942, "ce_loss_7": 2.7415110945701597, "epoch": 0.982, "grad_norm": 13.800146507088886, "kl_loss_13": 2026.2, "kl_loss_26": 956.5, "kl_loss_39": 585.85, "kl_loss_7": 2700.8, "learning_rate": 8.15448036932176e-07, "loss": 3140.075, "step": 9820 }, { "ce_loss_13": 2.4265142381191254, "ce_loss_26": 1.9230076640844345, "ce_loss_39": 1.7403117150068284, "ce_loss_52": 1.44933120906353, "ce_loss_7": 2.743628019094467, "epoch": 0.983, "grad_norm": 13.731982955757704, "kl_loss_13": 2035.6, "kl_loss_26": 967.5, "kl_loss_39": 589.65, "kl_loss_7": 2710.4, "learning_rate": 7.273808789862724e-07, "loss": 3097.325, "step": 9830 }, { "ce_loss_13": 2.432727184891701, "ce_loss_26": 1.9222358494997025, "ce_loss_39": 1.7392481476068498, "ce_loss_52": 1.4496447369456291, "ce_loss_7": 2.7563742280006407, "epoch": 0.984, "grad_norm": 14.26920700265725, "kl_loss_13": 2031.0, "kl_loss_26": 951.8, "kl_loss_39": 579.1, "kl_loss_7": 2718.0, "learning_rate": 6.443413907720186e-07, "loss": 3091.6, "step": 9840 }, { "ce_loss_13": 2.3425186455249785, "ce_loss_26": 1.8573100596666337, "ce_loss_39": 1.6887821286916733, "ce_loss_52": 1.3970566481351852, "ce_loss_7": 2.6601881802082064, "epoch": 0.985, "grad_norm": 14.017763045011195, "kl_loss_13": 1953.8, "kl_loss_26": 920.7, "kl_loss_39": 567.3, "kl_loss_7": 2625.2, "learning_rate": 5.663304084960185e-07, "loss": 3110.05, "step": 9850 }, { "ce_loss_13": 2.3719563096761704, "ce_loss_26": 1.8737152755260467, "ce_loss_39": 1.698423257470131, "ce_loss_52": 1.4208501130342484, "ce_loss_7": 2.6962190210819243, "epoch": 0.986, "grad_norm": 14.355657718500265, "kl_loss_13": 1944.8, "kl_loss_26": 901.7, "kl_loss_39": 537.7, "kl_loss_7": 2617.8, "learning_rate": 4.933487177280482e-07, "loss": 3084.175, "step": 9860 }, { "ce_loss_13": 2.4431921422481535, "ce_loss_26": 1.9350731909275054, "ce_loss_39": 1.7554681122303009, "ce_loss_52": 1.4598491072654725, "ce_loss_7": 2.761775279045105, "epoch": 0.987, "grad_norm": 14.504857971707292, "kl_loss_13": 2021.8, "kl_loss_26": 959.6, "kl_loss_39": 582.75, "kl_loss_7": 2686.8, "learning_rate": 4.2539705339295075e-07, "loss": 3095.3, "step": 9870 }, { "ce_loss_13": 2.3912162601947786, "ce_loss_26": 1.8867509424686433, "ce_loss_39": 1.7147331923246383, "ce_loss_52": 1.4274780035018921, "ce_loss_7": 2.7152205407619476, "epoch": 0.988, "grad_norm": 13.898651253722077, "kl_loss_13": 1978.6, "kl_loss_26": 918.4, "kl_loss_39": 560.0, "kl_loss_7": 2653.6, "learning_rate": 3.6247609976319816e-07, "loss": 3111.5, "step": 9880 }, { "ce_loss_13": 2.4592268586158754, "ce_loss_26": 1.9460572868585586, "ce_loss_39": 1.7624834805727005, "ce_loss_52": 1.4708713114261627, "ce_loss_7": 2.780032974481583, "epoch": 0.989, "grad_norm": 13.162597089199776, "kl_loss_13": 2017.4, "kl_loss_26": 950.2, "kl_loss_39": 574.95, "kl_loss_7": 2680.0, "learning_rate": 3.0458649045211895e-07, "loss": 3104.75, "step": 9890 }, { "ce_loss_13": 2.318621850013733, "ce_loss_26": 1.8226055085659028, "ce_loss_39": 1.6485673993825913, "ce_loss_52": 1.3747537702322006, "ce_loss_7": 2.6328552305698394, "epoch": 0.99, "grad_norm": 14.177492293626315, "kl_loss_13": 1957.6, "kl_loss_26": 901.5, "kl_loss_39": 543.2, "kl_loss_7": 2623.6, "learning_rate": 2.517288084074587e-07, "loss": 3097.8, "step": 9900 }, { "ce_loss_13": 2.450270253419876, "ce_loss_26": 1.9614087045192719, "ce_loss_39": 1.820476683974266, "ce_loss_52": 1.4863917350769043, "ce_loss_7": 2.776776838302612, "epoch": 0.991, "grad_norm": 13.637096409193225, "kl_loss_13": 2041.0, "kl_loss_26": 993.3, "kl_loss_39": 622.6, "kl_loss_7": 2724.0, "learning_rate": 2.0390358590538505e-07, "loss": 3133.55, "step": 9910 }, { "ce_loss_13": 2.4164200723171234, "ce_loss_26": 1.9075238525867462, "ce_loss_39": 1.7269282668828965, "ce_loss_52": 1.4252729326486588, "ce_loss_7": 2.7410891175270082, "epoch": 0.992, "grad_norm": 14.06088505281845, "kl_loss_13": 2041.6, "kl_loss_26": 973.6, "kl_loss_39": 593.7, "kl_loss_7": 2722.8, "learning_rate": 1.61111304545436e-07, "loss": 3101.25, "step": 9920 }, { "ce_loss_13": 2.4260194152593613, "ce_loss_26": 1.9236795336008072, "ce_loss_39": 1.7412341982126236, "ce_loss_52": 1.445477369427681, "ce_loss_7": 2.750396305322647, "epoch": 0.993, "grad_norm": 13.813190658092928, "kl_loss_13": 2043.4, "kl_loss_26": 963.8, "kl_loss_39": 591.1, "kl_loss_7": 2727.2, "learning_rate": 1.2335239524541298e-07, "loss": 3113.2, "step": 9930 }, { "ce_loss_13": 2.4003034621477126, "ce_loss_26": 1.8995478272438049, "ce_loss_39": 1.7172971665859222, "ce_loss_52": 1.4267783105373382, "ce_loss_7": 2.730258399248123, "epoch": 0.994, "grad_norm": 14.088698553106907, "kl_loss_13": 2017.2, "kl_loss_26": 955.6, "kl_loss_39": 580.2, "kl_loss_7": 2698.8, "learning_rate": 9.06272382371065e-08, "loss": 3112.45, "step": 9940 }, { "ce_loss_13": 2.3436710268259047, "ce_loss_26": 1.860148760676384, "ce_loss_39": 1.6837237626314163, "ce_loss_52": 1.405136799812317, "ce_loss_7": 2.659210926294327, "epoch": 0.995, "grad_norm": 13.69734193648067, "kl_loss_13": 1922.2, "kl_loss_26": 893.6, "kl_loss_39": 537.7, "kl_loss_7": 2583.2, "learning_rate": 6.293616306246586e-08, "loss": 3122.45, "step": 9950 }, { "ce_loss_13": 2.4028525710105897, "ce_loss_26": 1.9053959518671035, "ce_loss_39": 1.734974354505539, "ce_loss_52": 1.4462752103805543, "ce_loss_7": 2.7197851181030273, "epoch": 0.996, "grad_norm": 14.052327245568536, "kl_loss_13": 1985.8, "kl_loss_26": 933.9, "kl_loss_39": 572.05, "kl_loss_7": 2659.2, "learning_rate": 4.027944857032395e-08, "loss": 3115.75, "step": 9960 }, { "ce_loss_13": 2.3983709454536437, "ce_loss_26": 1.897152093052864, "ce_loss_39": 1.7151427894830704, "ce_loss_52": 1.4271936371922493, "ce_loss_7": 2.7171947032213213, "epoch": 0.997, "grad_norm": 13.53314291583234, "kl_loss_13": 1999.0, "kl_loss_26": 942.7, "kl_loss_39": 568.8, "kl_loss_7": 2670.0, "learning_rate": 2.265732291356626e-08, "loss": 3094.175, "step": 9970 }, { "ce_loss_13": 2.3423147082328795, "ce_loss_26": 1.8376368135213852, "ce_loss_39": 1.6704195857048034, "ce_loss_52": 1.3995309814810752, "ce_loss_7": 2.6571378737688063, "epoch": 0.998, "grad_norm": 13.139494465989356, "kl_loss_13": 1954.0, "kl_loss_26": 894.2, "kl_loss_39": 540.1, "kl_loss_7": 2622.6, "learning_rate": 1.0069963546743833e-08, "loss": 3091.8, "step": 9980 }, { "ce_loss_13": 2.371971958875656, "ce_loss_26": 1.8749190032482148, "ce_loss_39": 1.7022694885730743, "ce_loss_52": 1.4237666621804237, "ce_loss_7": 2.694683998823166, "epoch": 0.999, "grad_norm": 13.948245708314273, "kl_loss_13": 1952.8, "kl_loss_26": 907.5, "kl_loss_39": 547.75, "kl_loss_7": 2620.8, "learning_rate": 2.517497224463483e-09, "loss": 3089.0, "step": 9990 }, { "ce_loss_13": 2.4080661326646804, "ce_loss_26": 1.896571347117424, "ce_loss_39": 1.7096484139561654, "ce_loss_52": 1.4170419454574585, "ce_loss_7": 2.7374835878610613, "epoch": 1.0, "grad_norm": 13.857134097975196, "kl_loss_13": 2044.2, "kl_loss_26": 961.3, "kl_loss_39": 577.05, "kl_loss_7": 2731.6, "learning_rate": 0.0, "loss": 3103.8, "step": 10000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0167830278176768e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }