diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16049 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "ce_loss_13": 11.511599779129028, + "ce_loss_26": 11.188396453857422, + "ce_loss_39": 11.169448137283325, + "ce_loss_52": 1.3891706466674805, + "ce_loss_7": 11.556999206542969, + "epoch": 0.0001, + "grad_norm": 28.059961985369828, + "kl_loss_13": 20896.0, + "kl_loss_26": 20192.0, + "kl_loss_39": 20192.0, + "kl_loss_7": 20960.0, + "learning_rate": 1e-05, + "loss": 41080.0, + "step": 1 + }, + { + "ce_loss_13": 11.506269454956055, + "ce_loss_26": 11.177568621105618, + "ce_loss_39": 11.177141269048056, + "ce_loss_52": 1.458960132466422, + "ce_loss_7": 11.548744599024454, + "epoch": 0.001, + "grad_norm": 28.667146352003318, + "kl_loss_13": 20782.222222222223, + "kl_loss_26": 20106.666666666668, + "kl_loss_39": 20110.222222222223, + "kl_loss_7": 20867.555555555555, + "learning_rate": 0.0001, + "loss": 41008.8889, + "step": 10 + }, + { + "ce_loss_13": 11.456571412086486, + "ce_loss_26": 11.158088731765748, + "ce_loss_39": 11.15653133392334, + "ce_loss_52": 1.435088688135147, + "ce_loss_7": 11.476131820678711, + "epoch": 0.002, + "grad_norm": 35.67456270110165, + "kl_loss_13": 20723.2, + "kl_loss_26": 20118.4, + "kl_loss_39": 20115.2, + "kl_loss_7": 20764.8, + "learning_rate": 0.0002, + "loss": 40904.0, + "step": 20 + }, + { + "ce_loss_13": 11.150281167030334, + "ce_loss_26": 11.01296763420105, + "ce_loss_39": 11.044581699371339, + "ce_loss_52": 1.4344331562519073, + "ce_loss_7": 11.054779505729675, + "epoch": 0.003, + "grad_norm": 54.04879245830703, + "kl_loss_13": 20108.8, + "kl_loss_26": 19840.0, + "kl_loss_39": 19907.2, + "kl_loss_7": 19920.0, + "learning_rate": 0.0003, + "loss": 39847.2, + "step": 30 + }, + { + "ce_loss_13": 10.505089902877808, + "ce_loss_26": 10.497783017158508, + "ce_loss_39": 10.527814579010009, + "ce_loss_52": 1.460255417227745, + "ce_loss_7": 10.453347158432006, + "epoch": 0.004, + "grad_norm": 29.567872258029254, + "kl_loss_13": 18694.4, + "kl_loss_26": 18688.0, + "kl_loss_39": 18755.2, + "kl_loss_7": 18588.8, + "learning_rate": 0.0004, + "loss": 37436.0, + "step": 40 + }, + { + "ce_loss_13": 10.321261882781982, + "ce_loss_26": 10.244042158126831, + "ce_loss_39": 10.236308455467224, + "ce_loss_52": 1.463668829202652, + "ce_loss_7": 10.305184721946716, + "epoch": 0.005, + "grad_norm": 37.9866452371617, + "kl_loss_13": 18329.6, + "kl_loss_26": 18163.2, + "kl_loss_39": 18140.8, + "kl_loss_7": 18288.0, + "learning_rate": 0.0005, + "loss": 36555.2, + "step": 50 + }, + { + "ce_loss_13": 10.226529097557068, + "ce_loss_26": 10.111042308807374, + "ce_loss_39": 10.11395993232727, + "ce_loss_52": 1.4317695140838622, + "ce_loss_7": 10.210216856002807, + "epoch": 0.006, + "grad_norm": 47.58894649950276, + "kl_loss_13": 18208.0, + "kl_loss_26": 17974.4, + "kl_loss_39": 17980.8, + "kl_loss_7": 18166.4, + "learning_rate": 0.0006, + "loss": 36044.0, + "step": 60 + }, + { + "ce_loss_13": 10.142269968986511, + "ce_loss_26": 10.005614733695984, + "ce_loss_39": 10.006310772895812, + "ce_loss_52": 1.3979130625724792, + "ce_loss_7": 10.13636019229889, + "epoch": 0.007, + "grad_norm": 55.16387671378209, + "kl_loss_13": 18057.6, + "kl_loss_26": 17772.8, + "kl_loss_39": 17792.0, + "kl_loss_7": 18048.0, + "learning_rate": 0.0007, + "loss": 35718.4, + "step": 70 + }, + { + "ce_loss_13": 10.032484984397888, + "ce_loss_26": 9.872331905364991, + "ce_loss_39": 9.881066274642944, + "ce_loss_52": 1.4247985988855363, + "ce_loss_7": 10.02949812412262, + "epoch": 0.008, + "grad_norm": 59.28947925840698, + "kl_loss_13": 17811.2, + "kl_loss_26": 17488.0, + "kl_loss_39": 17500.8, + "kl_loss_7": 17808.0, + "learning_rate": 0.0008, + "loss": 35334.4, + "step": 80 + }, + { + "ce_loss_13": 9.942931509017944, + "ce_loss_26": 9.76176996231079, + "ce_loss_39": 9.775496363639832, + "ce_loss_52": 1.4258457243442535, + "ce_loss_7": 9.945418453216552, + "epoch": 0.009, + "grad_norm": 55.94798234885439, + "kl_loss_13": 17600.0, + "kl_loss_26": 17222.4, + "kl_loss_39": 17257.6, + "kl_loss_7": 17600.0, + "learning_rate": 0.0009000000000000001, + "loss": 34900.0, + "step": 90 + }, + { + "ce_loss_13": 9.852771949768066, + "ce_loss_26": 9.661247444152831, + "ce_loss_39": 9.673552298545838, + "ce_loss_52": 1.438367447257042, + "ce_loss_7": 9.860591006278991, + "epoch": 0.01, + "grad_norm": 53.3090381296634, + "kl_loss_13": 17385.6, + "kl_loss_26": 16992.0, + "kl_loss_39": 17024.0, + "kl_loss_7": 17398.4, + "learning_rate": 0.001, + "loss": 34482.8, + "step": 100 + }, + { + "ce_loss_13": 9.76314606666565, + "ce_loss_26": 9.563278603553773, + "ce_loss_39": 9.578891181945801, + "ce_loss_52": 1.412995059788227, + "ce_loss_7": 9.781555676460266, + "epoch": 0.011, + "grad_norm": 53.230976887502294, + "kl_loss_13": 17251.2, + "kl_loss_26": 16836.8, + "kl_loss_39": 16870.4, + "kl_loss_7": 17305.6, + "learning_rate": 0.0009999974825027757, + "loss": 34052.4, + "step": 110 + }, + { + "ce_loss_13": 9.681499814987182, + "ce_loss_26": 9.470890092849732, + "ce_loss_39": 9.48718273639679, + "ce_loss_52": 1.4235966846346855, + "ce_loss_7": 9.706467342376708, + "epoch": 0.012, + "grad_norm": 53.526819502242695, + "kl_loss_13": 17049.6, + "kl_loss_26": 16612.8, + "kl_loss_39": 16648.0, + "kl_loss_7": 17100.8, + "learning_rate": 0.0009999899300364532, + "loss": 33698.0, + "step": 120 + }, + { + "ce_loss_13": 9.590748715400697, + "ce_loss_26": 9.367487025260925, + "ce_loss_39": 9.386657476425171, + "ce_loss_52": 1.4183751314878463, + "ce_loss_7": 9.621446299552918, + "epoch": 0.013, + "grad_norm": 52.25839955403129, + "kl_loss_13": 16867.2, + "kl_loss_26": 16417.6, + "kl_loss_39": 16448.0, + "kl_loss_7": 16940.8, + "learning_rate": 0.0009999773426770863, + "loss": 33311.6, + "step": 130 + }, + { + "ce_loss_13": 9.527826118469239, + "ce_loss_26": 9.299377870559692, + "ce_loss_39": 9.321026277542114, + "ce_loss_52": 1.445027893781662, + "ce_loss_7": 9.561844515800477, + "epoch": 0.014, + "grad_norm": 52.41222674765903, + "kl_loss_13": 16692.8, + "kl_loss_26": 16227.2, + "kl_loss_39": 16273.6, + "kl_loss_7": 16777.6, + "learning_rate": 0.0009999597205514296, + "loss": 33030.8, + "step": 140 + }, + { + "ce_loss_13": 9.486352849006654, + "ce_loss_26": 9.252249264717102, + "ce_loss_39": 9.267914438247681, + "ce_loss_52": 1.4420335739850998, + "ce_loss_7": 9.524769949913026, + "epoch": 0.015, + "grad_norm": 53.790180993856175, + "kl_loss_13": 16592.0, + "kl_loss_26": 16104.0, + "kl_loss_39": 16132.8, + "kl_loss_7": 16657.6, + "learning_rate": 0.0009999370638369377, + "loss": 32769.2, + "step": 150 + }, + { + "ce_loss_13": 9.392394828796387, + "ce_loss_26": 9.15080394744873, + "ce_loss_39": 9.170540618896485, + "ce_loss_52": 1.423890632390976, + "ce_loss_7": 9.436835885047913, + "epoch": 0.016, + "grad_norm": 52.65338822593253, + "kl_loss_13": 16464.0, + "kl_loss_26": 15963.2, + "kl_loss_39": 16003.2, + "kl_loss_7": 16563.2, + "learning_rate": 0.000999909372761763, + "loss": 32427.6, + "step": 160 + }, + { + "ce_loss_13": 9.328035354614258, + "ce_loss_26": 9.082435154914856, + "ce_loss_39": 9.1047847032547, + "ce_loss_52": 1.4349344044923782, + "ce_loss_7": 9.380868554115295, + "epoch": 0.017, + "grad_norm": 52.40583142267758, + "kl_loss_13": 16296.0, + "kl_loss_26": 15777.6, + "kl_loss_39": 15832.0, + "kl_loss_7": 16404.8, + "learning_rate": 0.0009998766476047546, + "loss": 32178.8, + "step": 170 + }, + { + "ce_loss_13": 9.262916254997254, + "ce_loss_26": 9.01283278465271, + "ce_loss_39": 9.035380673408508, + "ce_loss_52": 1.3936711609363557, + "ce_loss_7": 9.322635316848755, + "epoch": 0.018, + "grad_norm": 52.346136942448574, + "kl_loss_13": 16240.0, + "kl_loss_26": 15716.8, + "kl_loss_39": 15769.6, + "kl_loss_7": 16363.2, + "learning_rate": 0.0009998388886954545, + "loss": 31844.4, + "step": 180 + }, + { + "ce_loss_13": 9.195696568489074, + "ce_loss_26": 8.94514548778534, + "ce_loss_39": 8.967138314247132, + "ce_loss_52": 1.4523959368467332, + "ce_loss_7": 9.263990116119384, + "epoch": 0.019, + "grad_norm": 51.44547467780483, + "kl_loss_13": 15985.6, + "kl_loss_26": 15464.0, + "kl_loss_39": 15515.2, + "kl_loss_7": 16132.8, + "learning_rate": 0.0009997960964140947, + "loss": 31580.0, + "step": 190 + }, + { + "ce_loss_13": 9.109345388412475, + "ce_loss_26": 8.856351280212403, + "ce_loss_39": 8.882033634185792, + "ce_loss_52": 1.425847691297531, + "ce_loss_7": 9.182338738441468, + "epoch": 0.02, + "grad_norm": 51.67086637607359, + "kl_loss_13": 15867.2, + "kl_loss_26": 15332.8, + "kl_loss_39": 15393.6, + "kl_loss_7": 16011.2, + "learning_rate": 0.0009997482711915926, + "loss": 31312.8, + "step": 200 + }, + { + "ce_loss_13": 9.036305499076843, + "ce_loss_26": 8.778529453277589, + "ce_loss_39": 8.803540563583374, + "ce_loss_52": 1.4626984983682632, + "ce_loss_7": 9.118139266967773, + "epoch": 0.021, + "grad_norm": 50.078507295298536, + "kl_loss_13": 15654.4, + "kl_loss_26": 15108.8, + "kl_loss_39": 15168.0, + "kl_loss_7": 15828.8, + "learning_rate": 0.0009996954135095479, + "loss": 31012.0, + "step": 210 + }, + { + "ce_loss_13": 8.98859736919403, + "ce_loss_26": 8.726609206199646, + "ce_loss_39": 8.75291087627411, + "ce_loss_52": 1.4173608794808388, + "ce_loss_7": 9.076500582695008, + "epoch": 0.022, + "grad_norm": 50.694312927544594, + "kl_loss_13": 15625.6, + "kl_loss_26": 15073.6, + "kl_loss_39": 15126.4, + "kl_loss_7": 15811.2, + "learning_rate": 0.0009996375239002368, + "loss": 30754.8, + "step": 220 + }, + { + "ce_loss_13": 8.965013265609741, + "ce_loss_26": 8.698115158081055, + "ce_loss_39": 8.71800787448883, + "ce_loss_52": 1.4263556391000747, + "ce_loss_7": 9.061096882820129, + "epoch": 0.023, + "grad_norm": 50.98814656536181, + "kl_loss_13": 15547.2, + "kl_loss_26": 14985.6, + "kl_loss_39": 15028.8, + "kl_loss_7": 15752.0, + "learning_rate": 0.0009995746029466072, + "loss": 30513.6, + "step": 230 + }, + { + "ce_loss_13": 8.891163158416749, + "ce_loss_26": 8.611021280288696, + "ce_loss_39": 8.63300838470459, + "ce_loss_52": 1.42348592877388, + "ce_loss_7": 8.998197555541992, + "epoch": 0.024, + "grad_norm": 51.614487435815626, + "kl_loss_13": 15393.6, + "kl_loss_26": 14806.4, + "kl_loss_39": 14848.0, + "kl_loss_7": 15619.2, + "learning_rate": 0.0009995066512822719, + "loss": 30248.4, + "step": 240 + }, + { + "ce_loss_13": 8.831620502471925, + "ce_loss_26": 8.545269632339478, + "ce_loss_39": 8.565073847770691, + "ce_loss_52": 1.452064010500908, + "ce_loss_7": 8.941647911071778, + "epoch": 0.025, + "grad_norm": 50.247038771654694, + "kl_loss_13": 15233.6, + "kl_loss_26": 14628.8, + "kl_loss_39": 14667.2, + "kl_loss_7": 15462.4, + "learning_rate": 0.000999433669591504, + "loss": 29955.6, + "step": 250 + }, + { + "ce_loss_13": 8.755429339408874, + "ce_loss_26": 8.469949841499329, + "ce_loss_39": 8.486004614830017, + "ce_loss_52": 1.4328533172607423, + "ce_loss_7": 8.874198198318481, + "epoch": 0.026, + "grad_norm": 49.63162227981652, + "kl_loss_13": 15075.2, + "kl_loss_26": 14473.6, + "kl_loss_39": 14508.8, + "kl_loss_7": 15326.4, + "learning_rate": 0.000999355658609228, + "loss": 29717.2, + "step": 260 + }, + { + "ce_loss_13": 8.697371244430542, + "ce_loss_26": 8.401416063308716, + "ce_loss_39": 8.412619948387146, + "ce_loss_52": 1.4436978071928024, + "ce_loss_7": 8.819016146659852, + "epoch": 0.027, + "grad_norm": 51.15355718783705, + "kl_loss_13": 14974.4, + "kl_loss_26": 14347.2, + "kl_loss_39": 14377.6, + "kl_loss_7": 15235.2, + "learning_rate": 0.0009992726191210138, + "loss": 29500.4, + "step": 270 + }, + { + "ce_loss_13": 8.66446521282196, + "ce_loss_26": 8.36073157787323, + "ce_loss_39": 8.373045516014098, + "ce_loss_52": 1.4337088972330094, + "ce_loss_7": 8.793687105178833, + "epoch": 0.028, + "grad_norm": 50.66919780267985, + "kl_loss_13": 14888.0, + "kl_loss_26": 14254.4, + "kl_loss_39": 14280.0, + "kl_loss_7": 15168.0, + "learning_rate": 0.0009991845519630679, + "loss": 29316.8, + "step": 280 + }, + { + "ce_loss_13": 8.604181599617004, + "ce_loss_26": 8.292751049995422, + "ce_loss_39": 8.304360592365265, + "ce_loss_52": 1.4289869368076324, + "ce_loss_7": 8.740529561042786, + "epoch": 0.029, + "grad_norm": 49.40131878460191, + "kl_loss_13": 14785.6, + "kl_loss_26": 14123.2, + "kl_loss_39": 14147.2, + "kl_loss_7": 15065.6, + "learning_rate": 0.0009990914580222257, + "loss": 29034.0, + "step": 290 + }, + { + "ce_loss_13": 8.560984683036803, + "ce_loss_26": 8.244250774383545, + "ce_loss_39": 8.251447391510009, + "ce_loss_52": 1.4629653513431549, + "ce_loss_7": 8.694539451599121, + "epoch": 0.03, + "grad_norm": 49.35953544498034, + "kl_loss_13": 14673.6, + "kl_loss_26": 13996.8, + "kl_loss_39": 14014.4, + "kl_loss_7": 14955.2, + "learning_rate": 0.0009989933382359422, + "loss": 28794.4, + "step": 300 + }, + { + "ce_loss_13": 8.470300602912904, + "ce_loss_26": 8.14130541086197, + "ce_loss_39": 8.148625028133392, + "ce_loss_52": 1.4505603075027467, + "ce_loss_7": 8.614710068702697, + "epoch": 0.031, + "grad_norm": 49.67315029837232, + "kl_loss_13": 14492.8, + "kl_loss_26": 13798.4, + "kl_loss_39": 13812.8, + "kl_loss_7": 14800.0, + "learning_rate": 0.0009988901935922825, + "loss": 28550.8, + "step": 310 + }, + { + "ce_loss_13": 8.449520301818847, + "ce_loss_26": 8.124438393115998, + "ce_loss_39": 8.127473723888397, + "ce_loss_52": 1.4619350016117096, + "ce_loss_7": 8.594109392166137, + "epoch": 0.032, + "grad_norm": 49.62699299914443, + "kl_loss_13": 14432.0, + "kl_loss_26": 13737.6, + "kl_loss_39": 13745.6, + "kl_loss_7": 14737.6, + "learning_rate": 0.0009987820251299122, + "loss": 28340.4, + "step": 320 + }, + { + "ce_loss_13": 8.411065363883973, + "ce_loss_26": 8.065169262886048, + "ce_loss_39": 8.066172111034394, + "ce_loss_52": 1.4555475383996963, + "ce_loss_7": 8.563976049423218, + "epoch": 0.033, + "grad_norm": 48.32599858014616, + "kl_loss_13": 14328.0, + "kl_loss_26": 13596.8, + "kl_loss_39": 13601.6, + "kl_loss_7": 14651.2, + "learning_rate": 0.0009986688339380862, + "loss": 28064.0, + "step": 330 + }, + { + "ce_loss_13": 8.342015504837036, + "ce_loss_26": 7.984185111522675, + "ce_loss_39": 7.983271932601928, + "ce_loss_52": 1.4297153055667877, + "ce_loss_7": 8.503781509399413, + "epoch": 0.034, + "grad_norm": 49.50369957464881, + "kl_loss_13": 14252.8, + "kl_loss_26": 13504.0, + "kl_loss_39": 13500.8, + "kl_loss_7": 14590.4, + "learning_rate": 0.0009985506211566387, + "loss": 27837.2, + "step": 340 + }, + { + "ce_loss_13": 8.30728188753128, + "ce_loss_26": 7.943272864818573, + "ce_loss_39": 7.946760308742523, + "ce_loss_52": 1.4311424046754837, + "ce_loss_7": 8.472229743003846, + "epoch": 0.035, + "grad_norm": 49.2761544590419, + "kl_loss_13": 14160.0, + "kl_loss_26": 13390.4, + "kl_loss_39": 13393.6, + "kl_loss_7": 14508.8, + "learning_rate": 0.0009984273879759713, + "loss": 27625.6, + "step": 350 + }, + { + "ce_loss_13": 8.231205070018769, + "ce_loss_26": 7.874649381637573, + "ce_loss_39": 7.870936000347138, + "ce_loss_52": 1.4489013850688934, + "ce_loss_7": 8.397671937942505, + "epoch": 0.036, + "grad_norm": 49.68016023572489, + "kl_loss_13": 13990.4, + "kl_loss_26": 13236.8, + "kl_loss_39": 13232.0, + "kl_loss_7": 14340.8, + "learning_rate": 0.0009982991356370402, + "loss": 27384.8, + "step": 360 + }, + { + "ce_loss_13": 8.171039760112762, + "ce_loss_26": 7.7931832551956175, + "ce_loss_39": 7.793972992897034, + "ce_loss_52": 1.4122098296880723, + "ce_loss_7": 8.344593167304993, + "epoch": 0.037, + "grad_norm": 48.18494426167262, + "kl_loss_13": 13920.0, + "kl_loss_26": 13128.0, + "kl_loss_39": 13129.6, + "kl_loss_7": 14291.2, + "learning_rate": 0.0009981658654313456, + "loss": 27248.4, + "step": 370 + }, + { + "ce_loss_13": 8.160165214538575, + "ce_loss_26": 7.7771016478538515, + "ce_loss_39": 7.771613931655883, + "ce_loss_52": 1.4869922280311585, + "ce_loss_7": 8.338345170021057, + "epoch": 0.038, + "grad_norm": 48.80785692423277, + "kl_loss_13": 13795.2, + "kl_loss_26": 12974.4, + "kl_loss_39": 12960.0, + "kl_loss_7": 14164.8, + "learning_rate": 0.000998027578700917, + "loss": 26976.4, + "step": 380 + }, + { + "ce_loss_13": 8.052374148368836, + "ce_loss_26": 7.658347594738006, + "ce_loss_39": 7.6583909630775455, + "ce_loss_52": 1.4154588401317596, + "ce_loss_7": 8.239335989952087, + "epoch": 0.039, + "grad_norm": 48.38129929768143, + "kl_loss_13": 13680.0, + "kl_loss_26": 12851.2, + "kl_loss_39": 12844.8, + "kl_loss_7": 14078.4, + "learning_rate": 0.0009978842768382998, + "loss": 26719.2, + "step": 390 + }, + { + "ce_loss_13": 8.022306847572327, + "ce_loss_26": 7.623832786083222, + "ce_loss_39": 7.616167056560516, + "ce_loss_52": 1.4509258031845094, + "ce_loss_7": 8.209609961509704, + "epoch": 0.04, + "grad_norm": 48.47003917091678, + "kl_loss_13": 13532.8, + "kl_loss_26": 12684.8, + "kl_loss_39": 12668.8, + "kl_loss_7": 13918.4, + "learning_rate": 0.0009977359612865424, + "loss": 26536.4, + "step": 400 + }, + { + "ce_loss_13": 8.007824766635895, + "ce_loss_26": 7.614881563186645, + "ce_loss_39": 7.602893710136414, + "ce_loss_52": 1.4645162731409074, + "ce_loss_7": 8.19339075088501, + "epoch": 0.041, + "grad_norm": 48.05902996814503, + "kl_loss_13": 13486.4, + "kl_loss_26": 12646.4, + "kl_loss_39": 12627.2, + "kl_loss_7": 13886.4, + "learning_rate": 0.0009975826335391806, + "loss": 26319.6, + "step": 410 + }, + { + "ce_loss_13": 7.89500185251236, + "ce_loss_26": 7.4844276905059814, + "ce_loss_39": 7.470773124694825, + "ce_loss_52": 1.3909434020519256, + "ce_loss_7": 8.089212799072266, + "epoch": 0.042, + "grad_norm": 47.6532900353141, + "kl_loss_13": 13393.6, + "kl_loss_26": 12528.0, + "kl_loss_39": 12504.0, + "kl_loss_7": 13800.0, + "learning_rate": 0.0009974242951402235, + "loss": 26051.2, + "step": 420 + }, + { + "ce_loss_13": 7.848755323886872, + "ce_loss_26": 7.436672508716583, + "ce_loss_39": 7.420604693889618, + "ce_loss_52": 1.4549538046121597, + "ce_loss_7": 8.053676557540893, + "epoch": 0.043, + "grad_norm": 47.461866848016506, + "kl_loss_13": 13187.2, + "kl_loss_26": 12316.8, + "kl_loss_39": 12278.4, + "kl_loss_7": 13619.2, + "learning_rate": 0.0009972609476841367, + "loss": 25819.2, + "step": 430 + }, + { + "ce_loss_13": 7.824773061275482, + "ce_loss_26": 7.38158438205719, + "ce_loss_39": 7.365956795215607, + "ce_loss_52": 1.4214935347437858, + "ce_loss_7": 8.031265962123872, + "epoch": 0.044, + "grad_norm": 47.54154044179159, + "kl_loss_13": 13169.6, + "kl_loss_26": 12241.6, + "kl_loss_39": 12208.0, + "kl_loss_7": 13604.8, + "learning_rate": 0.0009970925928158272, + "loss": 25669.2, + "step": 440 + }, + { + "ce_loss_13": 7.781041419506073, + "ce_loss_26": 7.349016737937927, + "ce_loss_39": 7.328068232536316, + "ce_loss_52": 1.4457789659500122, + "ce_loss_7": 7.986689484119415, + "epoch": 0.045, + "grad_norm": 47.28488093165922, + "kl_loss_13": 13065.6, + "kl_loss_26": 12150.4, + "kl_loss_39": 12107.2, + "kl_loss_7": 13499.2, + "learning_rate": 0.000996919232230627, + "loss": 25413.2, + "step": 450 + }, + { + "ce_loss_13": 7.707467567920685, + "ce_loss_26": 7.259365463256836, + "ce_loss_39": 7.2402653932571415, + "ce_loss_52": 1.4384218811988831, + "ce_loss_7": 7.9230645298957825, + "epoch": 0.046, + "grad_norm": 47.47875544223923, + "kl_loss_13": 12928.0, + "kl_loss_26": 11974.4, + "kl_loss_39": 11940.8, + "kl_loss_7": 13380.8, + "learning_rate": 0.0009967408676742752, + "loss": 25149.6, + "step": 460 + }, + { + "ce_loss_13": 7.682056641578674, + "ce_loss_26": 7.239385926723481, + "ce_loss_39": 7.214358115196228, + "ce_loss_52": 1.4293440610170365, + "ce_loss_7": 7.903320550918579, + "epoch": 0.047, + "grad_norm": 47.616227706878504, + "kl_loss_13": 12899.2, + "kl_loss_26": 11950.4, + "kl_loss_39": 11904.0, + "kl_loss_7": 13360.0, + "learning_rate": 0.0009965575009429006, + "loss": 24954.0, + "step": 470 + }, + { + "ce_loss_13": 7.673471140861511, + "ce_loss_26": 7.22241450548172, + "ce_loss_39": 7.191142916679382, + "ce_loss_52": 1.4720373705029488, + "ce_loss_7": 7.897368836402893, + "epoch": 0.048, + "grad_norm": 47.35612609507448, + "kl_loss_13": 12763.2, + "kl_loss_26": 11816.0, + "kl_loss_39": 11748.8, + "kl_loss_7": 13236.8, + "learning_rate": 0.0009963691338830043, + "loss": 24784.4, + "step": 480 + }, + { + "ce_loss_13": 7.61794912815094, + "ce_loss_26": 7.163714337348938, + "ce_loss_39": 7.132104587554932, + "ce_loss_52": 1.4696507424116134, + "ce_loss_7": 7.839430296421051, + "epoch": 0.049, + "grad_norm": 46.6485917664728, + "kl_loss_13": 12664.0, + "kl_loss_26": 11691.2, + "kl_loss_39": 11627.2, + "kl_loss_7": 13139.2, + "learning_rate": 0.0009961757683914405, + "loss": 24543.2, + "step": 490 + }, + { + "ce_loss_13": 7.507795846462249, + "ce_loss_26": 7.035580575466156, + "ce_loss_39": 7.005417144298553, + "ce_loss_52": 1.4070568919181823, + "ce_loss_7": 7.743252336978912, + "epoch": 0.05, + "grad_norm": 46.67870253681081, + "kl_loss_13": 12545.6, + "kl_loss_26": 11542.4, + "kl_loss_39": 11480.0, + "kl_loss_7": 13043.2, + "learning_rate": 0.0009959774064153978, + "loss": 24344.0, + "step": 500 + }, + { + "ce_loss_13": 7.495815181732178, + "ce_loss_26": 7.0229366540908815, + "ce_loss_39": 6.990859532356263, + "ce_loss_52": 1.4116702109575272, + "ce_loss_7": 7.736506867408752, + "epoch": 0.051, + "grad_norm": 46.06552599373074, + "kl_loss_13": 12505.6, + "kl_loss_26": 11507.2, + "kl_loss_39": 11433.6, + "kl_loss_7": 13009.6, + "learning_rate": 0.0009957740499523787, + "loss": 24160.0, + "step": 510 + }, + { + "ce_loss_13": 7.443893933296204, + "ce_loss_26": 6.961520659923553, + "ce_loss_39": 6.922756457328797, + "ce_loss_52": 1.4424341320991516, + "ce_loss_7": 7.6887711644172665, + "epoch": 0.052, + "grad_norm": 46.57676837877325, + "kl_loss_13": 12337.6, + "kl_loss_26": 11308.8, + "kl_loss_39": 11225.6, + "kl_loss_7": 12851.2, + "learning_rate": 0.0009955657010501807, + "loss": 23900.8, + "step": 520 + }, + { + "ce_loss_13": 7.388968002796173, + "ce_loss_26": 6.916476762294769, + "ce_loss_39": 6.874182558059692, + "ce_loss_52": 1.4647160589694976, + "ce_loss_7": 7.6311492919921875, + "epoch": 0.053, + "grad_norm": 46.39702705707381, + "kl_loss_13": 12211.2, + "kl_loss_26": 11196.8, + "kl_loss_39": 11105.6, + "kl_loss_7": 12718.4, + "learning_rate": 0.000995352361806875, + "loss": 23724.4, + "step": 530 + }, + { + "ce_loss_13": 7.399884676933288, + "ce_loss_26": 6.893076729774475, + "ce_loss_39": 6.849111843109131, + "ce_loss_52": 1.4278682440519332, + "ce_loss_7": 7.656503355503082, + "epoch": 0.054, + "grad_norm": 45.552992496751486, + "kl_loss_13": 12289.6, + "kl_loss_26": 11217.6, + "kl_loss_39": 11131.2, + "kl_loss_7": 12824.0, + "learning_rate": 0.0009951340343707852, + "loss": 23503.2, + "step": 540 + }, + { + "ce_loss_13": 7.299449789524078, + "ce_loss_26": 6.799772572517395, + "ce_loss_39": 6.753718996047974, + "ce_loss_52": 1.447503750026226, + "ce_loss_7": 7.554943478107452, + "epoch": 0.055, + "grad_norm": 45.64724152253333, + "kl_loss_13": 12067.2, + "kl_loss_26": 10992.0, + "kl_loss_39": 10900.8, + "kl_loss_7": 12608.0, + "learning_rate": 0.0009949107209404665, + "loss": 23326.0, + "step": 550 + }, + { + "ce_loss_13": 7.293551552295685, + "ce_loss_26": 6.792212247848511, + "ce_loss_39": 6.740794622898102, + "ce_loss_52": 1.4703042089939118, + "ce_loss_7": 7.558422005176544, + "epoch": 0.056, + "grad_norm": 45.60106893131463, + "kl_loss_13": 11990.4, + "kl_loss_26": 10931.2, + "kl_loss_39": 10820.8, + "kl_loss_7": 12540.8, + "learning_rate": 0.0009946824237646824, + "loss": 23102.4, + "step": 560 + }, + { + "ce_loss_13": 7.1714133501052855, + "ce_loss_26": 6.658501255512237, + "ce_loss_39": 6.615163576602936, + "ce_loss_52": 1.4395473554730416, + "ce_loss_7": 7.438856053352356, + "epoch": 0.057, + "grad_norm": 45.02028770939811, + "kl_loss_13": 11819.2, + "kl_loss_26": 10716.8, + "kl_loss_39": 10633.6, + "kl_loss_7": 12379.2, + "learning_rate": 0.0009944491451423828, + "loss": 22860.0, + "step": 570 + }, + { + "ce_loss_13": 7.202235555648803, + "ce_loss_26": 6.668069064617157, + "ce_loss_39": 6.622809886932373, + "ce_loss_52": 1.4502787292003632, + "ce_loss_7": 7.474415194988251, + "epoch": 0.058, + "grad_norm": 45.87654136914787, + "kl_loss_13": 11832.0, + "kl_loss_26": 10697.6, + "kl_loss_39": 10606.4, + "kl_loss_7": 12409.6, + "learning_rate": 0.0009942108874226813, + "loss": 22680.4, + "step": 580 + }, + { + "ce_loss_13": 7.099943065643311, + "ce_loss_26": 6.574034261703491, + "ce_loss_39": 6.518663537502289, + "ce_loss_52": 1.4500610083341599, + "ce_loss_7": 7.373777639865875, + "epoch": 0.059, + "grad_norm": 45.75547743089401, + "kl_loss_13": 11633.6, + "kl_loss_26": 10512.0, + "kl_loss_39": 10403.2, + "kl_loss_7": 12201.6, + "learning_rate": 0.00099396765300483, + "loss": 22462.8, + "step": 590 + }, + { + "ce_loss_13": 7.097463607788086, + "ce_loss_26": 6.57372156381607, + "ce_loss_39": 6.511255967617035, + "ce_loss_52": 1.480376410484314, + "ce_loss_7": 7.372353208065033, + "epoch": 0.06, + "grad_norm": 45.20967365382369, + "kl_loss_13": 11558.4, + "kl_loss_26": 10451.2, + "kl_loss_39": 10324.8, + "kl_loss_7": 12145.6, + "learning_rate": 0.0009937194443381972, + "loss": 22282.4, + "step": 600 + }, + { + "ce_loss_13": 7.056500816345215, + "ce_loss_26": 6.520907533168793, + "ce_loss_39": 6.457597935199738, + "ce_loss_52": 1.4534808412194251, + "ce_loss_7": 7.339275515079498, + "epoch": 0.061, + "grad_norm": 44.17007512472295, + "kl_loss_13": 11520.0, + "kl_loss_26": 10382.4, + "kl_loss_39": 10249.6, + "kl_loss_7": 12110.4, + "learning_rate": 0.0009934662639222412, + "loss": 22080.0, + "step": 610 + }, + { + "ce_loss_13": 6.959627556800842, + "ce_loss_26": 6.4156983375549315, + "ce_loss_39": 6.353180265426635, + "ce_loss_52": 1.4930268943309783, + "ce_loss_7": 7.244034695625305, + "epoch": 0.062, + "grad_norm": 43.92075543466765, + "kl_loss_13": 11259.2, + "kl_loss_26": 10102.4, + "kl_loss_39": 9971.2, + "kl_loss_7": 11859.2, + "learning_rate": 0.000993208114306486, + "loss": 21799.2, + "step": 620 + }, + { + "ce_loss_13": 6.937919509410858, + "ce_loss_26": 6.402250957489014, + "ce_loss_39": 6.331100332736969, + "ce_loss_52": 1.4531458377838136, + "ce_loss_7": 7.226283407211303, + "epoch": 0.063, + "grad_norm": 44.52659706916058, + "kl_loss_13": 11259.2, + "kl_loss_26": 10128.0, + "kl_loss_39": 9988.8, + "kl_loss_7": 11881.6, + "learning_rate": 0.0009929449980904952, + "loss": 21667.2, + "step": 630 + }, + { + "ce_loss_13": 6.914422643184662, + "ce_loss_26": 6.355719900131225, + "ce_loss_39": 6.2839093685150145, + "ce_loss_52": 1.463460123538971, + "ce_loss_7": 7.208615565299988, + "epoch": 0.064, + "grad_norm": 44.241917484883416, + "kl_loss_13": 11203.2, + "kl_loss_26": 10004.8, + "kl_loss_39": 9865.6, + "kl_loss_7": 11827.2, + "learning_rate": 0.0009926769179238466, + "loss": 21450.4, + "step": 640 + }, + { + "ce_loss_13": 6.814666819572449, + "ce_loss_26": 6.240503942966461, + "ce_loss_39": 6.164217627048492, + "ce_loss_52": 1.4213469997048378, + "ce_loss_7": 7.121113920211792, + "epoch": 0.065, + "grad_norm": 45.45585410762684, + "kl_loss_13": 11097.6, + "kl_loss_26": 9875.2, + "kl_loss_39": 9726.4, + "kl_loss_7": 11742.4, + "learning_rate": 0.000992403876506104, + "loss": 21273.2, + "step": 650 + }, + { + "ce_loss_13": 6.807473576068878, + "ce_loss_26": 6.237039804458618, + "ce_loss_39": 6.164605820178986, + "ce_loss_52": 1.4794408291578294, + "ce_loss_7": 7.109469771385193, + "epoch": 0.066, + "grad_norm": 43.77904042873825, + "kl_loss_13": 10964.8, + "kl_loss_26": 9745.6, + "kl_loss_39": 9593.6, + "kl_loss_7": 11603.2, + "learning_rate": 0.0009921258765867918, + "loss": 21034.4, + "step": 660 + }, + { + "ce_loss_13": 6.720256412029267, + "ce_loss_26": 6.124040985107422, + "ce_loss_39": 6.048683619499206, + "ce_loss_52": 1.4370630145072938, + "ce_loss_7": 7.032277429103852, + "epoch": 0.067, + "grad_norm": 44.21280182860459, + "kl_loss_13": 10864.0, + "kl_loss_26": 9596.8, + "kl_loss_39": 9446.4, + "kl_loss_7": 11528.0, + "learning_rate": 0.0009918429209653662, + "loss": 20815.6, + "step": 670 + }, + { + "ce_loss_13": 6.73115086555481, + "ce_loss_26": 6.149888730049133, + "ce_loss_39": 6.072007644176483, + "ce_loss_52": 1.4543532699346542, + "ce_loss_7": 7.039518296718597, + "epoch": 0.068, + "grad_norm": 43.58133426683343, + "kl_loss_13": 10844.8, + "kl_loss_26": 9603.2, + "kl_loss_39": 9433.6, + "kl_loss_7": 11494.4, + "learning_rate": 0.0009915550124911866, + "loss": 20688.4, + "step": 680 + }, + { + "ce_loss_13": 6.683139646053315, + "ce_loss_26": 6.099281096458435, + "ce_loss_39": 6.017751622200012, + "ce_loss_52": 1.4289966225624084, + "ce_loss_7": 6.9959977746009825, + "epoch": 0.069, + "grad_norm": 43.03707399207988, + "kl_loss_13": 10817.6, + "kl_loss_26": 9577.6, + "kl_loss_39": 9414.4, + "kl_loss_7": 11472.0, + "learning_rate": 0.0009912621540634887, + "loss": 20494.0, + "step": 690 + }, + { + "ce_loss_13": 6.5575969338417055, + "ce_loss_26": 5.94709130525589, + "ce_loss_39": 5.865298080444336, + "ce_loss_52": 1.3811550110578537, + "ce_loss_7": 6.883859884738922, + "epoch": 0.07, + "grad_norm": 43.657034485471186, + "kl_loss_13": 10611.2, + "kl_loss_26": 9316.8, + "kl_loss_39": 9148.8, + "kl_loss_7": 11299.2, + "learning_rate": 0.0009909643486313534, + "loss": 20224.4, + "step": 700 + }, + { + "ce_loss_13": 6.581148624420166, + "ce_loss_26": 5.951541697978973, + "ce_loss_39": 5.867393767833709, + "ce_loss_52": 1.417145846784115, + "ce_loss_7": 6.908858215808868, + "epoch": 0.071, + "grad_norm": 42.31273006993064, + "kl_loss_13": 10628.8, + "kl_loss_26": 9294.4, + "kl_loss_39": 9120.0, + "kl_loss_7": 11320.0, + "learning_rate": 0.000990661599193678, + "loss": 20075.6, + "step": 710 + }, + { + "ce_loss_13": 6.503521502017975, + "ce_loss_26": 5.871239483356476, + "ce_loss_39": 5.7897450685501095, + "ce_loss_52": 1.4011695250868796, + "ce_loss_7": 6.844956791400909, + "epoch": 0.072, + "grad_norm": 42.36356368480549, + "kl_loss_13": 10488.0, + "kl_loss_26": 9147.2, + "kl_loss_39": 8979.2, + "kl_loss_7": 11206.4, + "learning_rate": 0.0009903539087991462, + "loss": 19811.6, + "step": 720 + }, + { + "ce_loss_13": 6.489633810520172, + "ce_loss_26": 5.87660802602768, + "ce_loss_39": 5.779063713550568, + "ce_loss_52": 1.439223274588585, + "ce_loss_7": 6.819329023361206, + "epoch": 0.073, + "grad_norm": 42.98993238073801, + "kl_loss_13": 10366.4, + "kl_loss_26": 9057.6, + "kl_loss_39": 8861.6, + "kl_loss_7": 11059.2, + "learning_rate": 0.0009900412805461966, + "loss": 19744.8, + "step": 730 + }, + { + "ce_loss_13": 6.4397171378135685, + "ce_loss_26": 5.814687025547028, + "ce_loss_39": 5.716581547260285, + "ce_loss_52": 1.4390251755714416, + "ce_loss_7": 6.779693508148194, + "epoch": 0.074, + "grad_norm": 42.877595561482536, + "kl_loss_13": 10267.2, + "kl_loss_26": 8939.2, + "kl_loss_39": 8734.4, + "kl_loss_7": 10982.4, + "learning_rate": 0.0009897237175829927, + "loss": 19478.8, + "step": 740 + }, + { + "ce_loss_13": 6.3779888391494755, + "ce_loss_26": 5.756749665737152, + "ce_loss_39": 5.652812826633453, + "ce_loss_52": 1.4100830882787705, + "ce_loss_7": 6.712257170677185, + "epoch": 0.075, + "grad_norm": 43.56161359476007, + "kl_loss_13": 10203.2, + "kl_loss_26": 8863.2, + "kl_loss_39": 8649.6, + "kl_loss_7": 10920.0, + "learning_rate": 0.0009894012231073895, + "loss": 19311.6, + "step": 750 + }, + { + "ce_loss_13": 6.351921963691711, + "ce_loss_26": 5.711096298694611, + "ce_loss_39": 5.613580751419067, + "ce_loss_52": 1.4703039675951004, + "ce_loss_7": 6.6900406837463375, + "epoch": 0.076, + "grad_norm": 41.581645996763, + "kl_loss_13": 10056.0, + "kl_loss_26": 8678.4, + "kl_loss_39": 8485.6, + "kl_loss_7": 10764.8, + "learning_rate": 0.0009890738003669028, + "loss": 19128.0, + "step": 760 + }, + { + "ce_loss_13": 6.329116785526276, + "ce_loss_26": 5.685759162902832, + "ce_loss_39": 5.58324785232544, + "ce_loss_52": 1.4396527051925658, + "ce_loss_7": 6.677184915542602, + "epoch": 0.077, + "grad_norm": 40.86594229703089, + "kl_loss_13": 10036.8, + "kl_loss_26": 8680.0, + "kl_loss_39": 8467.2, + "kl_loss_7": 10760.0, + "learning_rate": 0.0009887414526586764, + "loss": 18930.4, + "step": 770 + }, + { + "ce_loss_13": 6.279877305030823, + "ce_loss_26": 5.617447376251221, + "ce_loss_39": 5.507227098941803, + "ce_loss_52": 1.4374216616153717, + "ce_loss_7": 6.634308731555938, + "epoch": 0.078, + "grad_norm": 41.180826238519536, + "kl_loss_13": 9923.2, + "kl_loss_26": 8513.6, + "kl_loss_39": 8292.8, + "kl_loss_7": 10667.2, + "learning_rate": 0.0009884041833294476, + "loss": 18733.6, + "step": 780 + }, + { + "ce_loss_13": 6.212144470214843, + "ce_loss_26": 5.565514934062958, + "ce_loss_39": 5.445651924610138, + "ce_loss_52": 1.4184710115194321, + "ce_loss_7": 6.563052010536194, + "epoch": 0.079, + "grad_norm": 41.51169269505913, + "kl_loss_13": 9840.0, + "kl_loss_26": 8459.2, + "kl_loss_39": 8207.2, + "kl_loss_7": 10576.0, + "learning_rate": 0.000988061995775515, + "loss": 18618.8, + "step": 790 + }, + { + "ce_loss_13": 6.177972686290741, + "ce_loss_26": 5.5426277875900265, + "ce_loss_39": 5.430073320865631, + "ce_loss_52": 1.4582359090447425, + "ce_loss_7": 6.532871425151825, + "epoch": 0.08, + "grad_norm": 41.06171415513337, + "kl_loss_13": 9713.6, + "kl_loss_26": 8348.0, + "kl_loss_39": 8122.4, + "kl_loss_7": 10464.0, + "learning_rate": 0.0009877148934427035, + "loss": 18370.0, + "step": 800 + }, + { + "ce_loss_13": 6.174833989143371, + "ce_loss_26": 5.505520594120026, + "ce_loss_39": 5.391082692146301, + "ce_loss_52": 1.4291342854499818, + "ce_loss_7": 6.535899603366852, + "epoch": 0.081, + "grad_norm": 40.55915083586062, + "kl_loss_13": 9748.8, + "kl_loss_26": 8332.0, + "kl_loss_39": 8094.4, + "kl_loss_7": 10502.4, + "learning_rate": 0.0009873628798263297, + "loss": 18197.2, + "step": 810 + }, + { + "ce_loss_13": 6.106976389884949, + "ce_loss_26": 5.425193250179291, + "ce_loss_39": 5.297831201553345, + "ce_loss_52": 1.4520869970321655, + "ce_loss_7": 6.4676952958106995, + "epoch": 0.082, + "grad_norm": 39.176828574493044, + "kl_loss_13": 9564.8, + "kl_loss_26": 8108.0, + "kl_loss_39": 7852.0, + "kl_loss_7": 10324.8, + "learning_rate": 0.0009870059584711668, + "loss": 17988.4, + "step": 820 + }, + { + "ce_loss_13": 6.029178476333618, + "ce_loss_26": 5.369020164012909, + "ce_loss_39": 5.247223997116089, + "ce_loss_52": 1.4342376589775085, + "ce_loss_7": 6.38949601650238, + "epoch": 0.083, + "grad_norm": 41.3023886018674, + "kl_loss_13": 9422.4, + "kl_loss_26": 8008.8, + "kl_loss_39": 7756.0, + "kl_loss_7": 10184.0, + "learning_rate": 0.000986644132971409, + "loss": 17788.4, + "step": 830 + }, + { + "ce_loss_13": 6.009692323207855, + "ce_loss_26": 5.3266006231307985, + "ce_loss_39": 5.202202546596527, + "ce_loss_52": 1.4376018613576889, + "ce_loss_7": 6.372254419326782, + "epoch": 0.084, + "grad_norm": 39.84971146906691, + "kl_loss_13": 9387.2, + "kl_loss_26": 7916.8, + "kl_loss_39": 7663.2, + "kl_loss_7": 10155.2, + "learning_rate": 0.0009862774069706345, + "loss": 17687.8, + "step": 840 + }, + { + "ce_loss_13": 5.948546409606934, + "ce_loss_26": 5.290343832969666, + "ce_loss_39": 5.16569093465805, + "ce_loss_52": 1.4315639585256577, + "ce_loss_7": 6.303727805614471, + "epoch": 0.085, + "grad_norm": 38.79997549953815, + "kl_loss_13": 9260.8, + "kl_loss_26": 7848.0, + "kl_loss_39": 7593.6, + "kl_loss_7": 10009.6, + "learning_rate": 0.000985905784161771, + "loss": 17478.4, + "step": 850 + }, + { + "ce_loss_13": 5.976463770866394, + "ce_loss_26": 5.285696280002594, + "ce_loss_39": 5.158673858642578, + "ce_loss_52": 1.4285172358155251, + "ce_loss_7": 6.345685577392578, + "epoch": 0.086, + "grad_norm": 39.11215287158734, + "kl_loss_13": 9323.2, + "kl_loss_26": 7843.2, + "kl_loss_39": 7588.0, + "kl_loss_7": 10100.8, + "learning_rate": 0.000985529268287055, + "loss": 17353.8, + "step": 860 + }, + { + "ce_loss_13": 5.890017306804657, + "ce_loss_26": 5.188625490665435, + "ce_loss_39": 5.061676156520844, + "ce_loss_52": 1.427770259976387, + "ce_loss_7": 6.267517876625061, + "epoch": 0.087, + "grad_norm": 38.38012767193544, + "kl_loss_13": 9177.6, + "kl_loss_26": 7678.4, + "kl_loss_39": 7415.2, + "kl_loss_7": 9971.2, + "learning_rate": 0.0009851478631379982, + "loss": 17143.4, + "step": 870 + }, + { + "ce_loss_13": 5.8172935247421265, + "ce_loss_26": 5.092825090885162, + "ce_loss_39": 4.964837598800659, + "ce_loss_52": 1.3596146881580353, + "ce_loss_7": 6.200971674919129, + "epoch": 0.088, + "grad_norm": 38.67673909990335, + "kl_loss_13": 9150.4, + "kl_loss_26": 7612.8, + "kl_loss_39": 7350.4, + "kl_loss_7": 9947.2, + "learning_rate": 0.0009847615725553456, + "loss": 17046.8, + "step": 880 + }, + { + "ce_loss_13": 5.872656679153442, + "ce_loss_26": 5.153263211250305, + "ce_loss_39": 5.00926034450531, + "ce_loss_52": 1.4231197819113732, + "ce_loss_7": 6.253647100925446, + "epoch": 0.089, + "grad_norm": 38.12938597789528, + "kl_loss_13": 9128.0, + "kl_loss_26": 7595.2, + "kl_loss_39": 7309.6, + "kl_loss_7": 9923.2, + "learning_rate": 0.0009843704004290394, + "loss": 16917.8, + "step": 890 + }, + { + "ce_loss_13": 5.798008918762207, + "ce_loss_26": 5.091720676422119, + "ce_loss_39": 4.938081228733063, + "ce_loss_52": 1.4382835403084755, + "ce_loss_7": 6.171303284168244, + "epoch": 0.09, + "grad_norm": 37.16474091329711, + "kl_loss_13": 8936.0, + "kl_loss_26": 7427.2, + "kl_loss_39": 7121.6, + "kl_loss_7": 9726.4, + "learning_rate": 0.0009839743506981783, + "loss": 16656.0, + "step": 900 + }, + { + "ce_loss_13": 5.82405720949173, + "ce_loss_26": 5.096203732490539, + "ce_loss_39": 4.965065968036652, + "ce_loss_52": 1.4636528208851813, + "ce_loss_7": 6.201912236213684, + "epoch": 0.091, + "grad_norm": 36.36947394693034, + "kl_loss_13": 8939.2, + "kl_loss_26": 7354.4, + "kl_loss_39": 7080.0, + "kl_loss_7": 9747.2, + "learning_rate": 0.0009835734273509786, + "loss": 16529.0, + "step": 910 + }, + { + "ce_loss_13": 5.734641706943512, + "ce_loss_26": 5.011995434761047, + "ce_loss_39": 4.85785391330719, + "ce_loss_52": 1.4457294046878815, + "ce_loss_7": 6.11446977853775, + "epoch": 0.092, + "grad_norm": 36.64530545748263, + "kl_loss_13": 8840.8, + "kl_loss_26": 7284.8, + "kl_loss_39": 6967.2, + "kl_loss_7": 9636.8, + "learning_rate": 0.0009831676344247342, + "loss": 16343.8, + "step": 920 + }, + { + "ce_loss_13": 5.681211936473846, + "ce_loss_26": 4.933374917507171, + "ce_loss_39": 4.788007187843323, + "ce_loss_52": 1.3833691507577897, + "ce_loss_7": 6.066722130775451, + "epoch": 0.093, + "grad_norm": 37.601960547328346, + "kl_loss_13": 8819.2, + "kl_loss_26": 7232.8, + "kl_loss_39": 6926.4, + "kl_loss_7": 9620.8, + "learning_rate": 0.0009827569760057755, + "loss": 16262.8, + "step": 930 + }, + { + "ce_loss_13": 5.685943353176117, + "ce_loss_26": 4.954251933097839, + "ce_loss_39": 4.7964679479599, + "ce_loss_52": 1.4205988943576813, + "ce_loss_7": 6.062719237804413, + "epoch": 0.094, + "grad_norm": 34.90292075240395, + "kl_loss_13": 8707.2, + "kl_loss_26": 7154.4, + "kl_loss_39": 6841.6, + "kl_loss_7": 9496.0, + "learning_rate": 0.000982341456229428, + "loss": 16011.8, + "step": 940 + }, + { + "ce_loss_13": 5.640336573123932, + "ce_loss_26": 4.914572691917419, + "ce_loss_39": 4.756339108943939, + "ce_loss_52": 1.46774483025074, + "ce_loss_7": 6.0160892605781555, + "epoch": 0.095, + "grad_norm": 35.81614055285293, + "kl_loss_13": 8553.6, + "kl_loss_26": 6999.2, + "kl_loss_39": 6671.2, + "kl_loss_7": 9348.8, + "learning_rate": 0.000981921079279971, + "loss": 15864.8, + "step": 950 + }, + { + "ce_loss_13": 5.622566449642181, + "ce_loss_26": 4.879516458511352, + "ce_loss_39": 4.717178559303283, + "ce_loss_52": 1.4239765584468842, + "ce_loss_7": 5.998941457271576, + "epoch": 0.096, + "grad_norm": 35.60374709072334, + "kl_loss_13": 8597.6, + "kl_loss_26": 7007.2, + "kl_loss_39": 6670.4, + "kl_loss_7": 9384.0, + "learning_rate": 0.0009814958493905962, + "loss": 15764.6, + "step": 960 + }, + { + "ce_loss_13": 5.552615082263946, + "ce_loss_26": 4.812614411115646, + "ce_loss_39": 4.667081838846206, + "ce_loss_52": 1.4328487768769265, + "ce_loss_7": 5.935272622108459, + "epoch": 0.097, + "grad_norm": 34.10755627830643, + "kl_loss_13": 8454.4, + "kl_loss_26": 6880.8, + "kl_loss_39": 6574.4, + "kl_loss_7": 9254.4, + "learning_rate": 0.0009810657708433637, + "loss": 15541.8, + "step": 970 + }, + { + "ce_loss_13": 5.534706914424897, + "ce_loss_26": 4.787809383869171, + "ce_loss_39": 4.631017792224884, + "ce_loss_52": 1.4364599764347077, + "ce_loss_7": 5.910705745220184, + "epoch": 0.098, + "grad_norm": 33.30412559481911, + "kl_loss_13": 8426.4, + "kl_loss_26": 6818.4, + "kl_loss_39": 6496.0, + "kl_loss_7": 9220.8, + "learning_rate": 0.0009806308479691594, + "loss": 15486.2, + "step": 980 + }, + { + "ce_loss_13": 5.455054485797882, + "ce_loss_26": 4.712761473655701, + "ce_loss_39": 4.549750781059265, + "ce_loss_52": 1.442040067911148, + "ce_loss_7": 5.841645193099976, + "epoch": 0.099, + "grad_norm": 34.061312435089185, + "kl_loss_13": 8225.6, + "kl_loss_26": 6628.8, + "kl_loss_39": 6296.8, + "kl_loss_7": 9036.8, + "learning_rate": 0.0009801910851476522, + "loss": 15382.2, + "step": 990 + }, + { + "ce_loss_13": 5.4525358319282535, + "ce_loss_26": 4.71786208152771, + "ce_loss_39": 4.559128785133362, + "ce_loss_52": 1.4428577244281768, + "ce_loss_7": 5.826938045024872, + "epoch": 0.1, + "grad_norm": 33.6953047069211, + "kl_loss_13": 8196.0, + "kl_loss_26": 6628.8, + "kl_loss_39": 6297.6, + "kl_loss_7": 8995.2, + "learning_rate": 0.0009797464868072487, + "loss": 15156.4, + "step": 1000 + }, + { + "ce_loss_13": 5.445610964298249, + "ce_loss_26": 4.6739885926246645, + "ce_loss_39": 4.508283615112305, + "ce_loss_52": 1.4168697059154511, + "ce_loss_7": 5.834221661090851, + "epoch": 0.101, + "grad_norm": 32.70197394198742, + "kl_loss_13": 8233.6, + "kl_loss_26": 6591.2, + "kl_loss_39": 6246.4, + "kl_loss_7": 9051.2, + "learning_rate": 0.0009792970574250492, + "loss": 14993.6, + "step": 1010 + }, + { + "ce_loss_13": 5.368366336822509, + "ce_loss_26": 4.5666680335998535, + "ce_loss_39": 4.400501304864884, + "ce_loss_52": 1.3814861461520196, + "ce_loss_7": 5.762167453765869, + "epoch": 0.102, + "grad_norm": 32.28847151546708, + "kl_loss_13": 8145.6, + "kl_loss_26": 6444.0, + "kl_loss_39": 6111.2, + "kl_loss_7": 8969.6, + "learning_rate": 0.0009788428015268028, + "loss": 14902.6, + "step": 1020 + }, + { + "ce_loss_13": 5.416829228401184, + "ce_loss_26": 4.672497856616974, + "ce_loss_39": 4.5039793968200685, + "ce_loss_52": 1.4590631812810897, + "ce_loss_7": 5.7889638304710385, + "epoch": 0.103, + "grad_norm": 32.94054692999689, + "kl_loss_13": 8076.0, + "kl_loss_26": 6480.0, + "kl_loss_39": 6138.4, + "kl_loss_7": 8867.2, + "learning_rate": 0.0009783837236868609, + "loss": 14715.4, + "step": 1030 + }, + { + "ce_loss_13": 5.323912274837494, + "ce_loss_26": 4.551275789737701, + "ce_loss_39": 4.376495039463043, + "ce_loss_52": 1.4376014918088913, + "ce_loss_7": 5.712179851531983, + "epoch": 0.104, + "grad_norm": 32.580444775774126, + "kl_loss_13": 7934.4, + "kl_loss_26": 6285.6, + "kl_loss_39": 5924.0, + "kl_loss_7": 8752.0, + "learning_rate": 0.0009779198285281327, + "loss": 14586.6, + "step": 1040 + }, + { + "ce_loss_13": 5.36049393415451, + "ce_loss_26": 4.592142331600189, + "ce_loss_39": 4.4142293453216555, + "ce_loss_52": 1.4498057544231415, + "ce_loss_7": 5.73596283197403, + "epoch": 0.105, + "grad_norm": 31.80216030064833, + "kl_loss_13": 7976.8, + "kl_loss_26": 6341.6, + "kl_loss_39": 5988.8, + "kl_loss_7": 8771.2, + "learning_rate": 0.0009774511207220368, + "loss": 14415.8, + "step": 1050 + }, + { + "ce_loss_13": 5.3308478713035585, + "ce_loss_26": 4.5750040173530575, + "ce_loss_39": 4.39155302643776, + "ce_loss_52": 1.4785875469446181, + "ce_loss_7": 5.713631689548492, + "epoch": 0.106, + "grad_norm": 31.180344664143906, + "kl_loss_13": 7883.2, + "kl_loss_26": 6267.2, + "kl_loss_39": 5887.2, + "kl_loss_7": 8687.2, + "learning_rate": 0.0009769776049884564, + "loss": 14270.6, + "step": 1060 + }, + { + "ce_loss_13": 5.3476661205291744, + "ce_loss_26": 4.561805117130279, + "ce_loss_39": 4.391524451971054, + "ce_loss_52": 1.4550457745790482, + "ce_loss_7": 5.725461614131928, + "epoch": 0.107, + "grad_norm": 30.9396118714078, + "kl_loss_13": 7968.0, + "kl_loss_26": 6308.0, + "kl_loss_39": 5942.4, + "kl_loss_7": 8754.4, + "learning_rate": 0.0009764992860956889, + "loss": 14268.8, + "step": 1070 + }, + { + "ce_loss_13": 5.248398435115814, + "ce_loss_26": 4.472868782281876, + "ce_loss_39": 4.294939804077148, + "ce_loss_52": 1.4237870454788208, + "ce_loss_7": 5.6272268176078795, + "epoch": 0.108, + "grad_norm": 30.45094259940441, + "kl_loss_13": 7834.4, + "kl_loss_26": 6169.6, + "kl_loss_39": 5812.0, + "kl_loss_7": 8620.8, + "learning_rate": 0.0009760161688604008, + "loss": 14058.8, + "step": 1080 + }, + { + "ce_loss_13": 5.167715132236481, + "ce_loss_26": 4.421313828229904, + "ce_loss_39": 4.241793435811997, + "ce_loss_52": 1.4617454051971435, + "ce_loss_7": 5.536553728580475, + "epoch": 0.109, + "grad_norm": 30.37346177682711, + "kl_loss_13": 7596.8, + "kl_loss_26": 5987.2, + "kl_loss_39": 5622.4, + "kl_loss_7": 8367.2, + "learning_rate": 0.0009755282581475768, + "loss": 13946.4, + "step": 1090 + }, + { + "ce_loss_13": 5.21688460111618, + "ce_loss_26": 4.442314791679382, + "ce_loss_39": 4.254946118593216, + "ce_loss_52": 1.4511815324425696, + "ce_loss_7": 5.593706953525543, + "epoch": 0.11, + "grad_norm": 30.665307755441862, + "kl_loss_13": 7699.2, + "kl_loss_26": 6030.4, + "kl_loss_39": 5652.0, + "kl_loss_7": 8485.6, + "learning_rate": 0.0009750355588704727, + "loss": 13825.8, + "step": 1100 + }, + { + "ce_loss_13": 5.112356352806091, + "ce_loss_26": 4.310530138015747, + "ce_loss_39": 4.125328695774078, + "ce_loss_52": 1.4114144504070283, + "ce_loss_7": 5.487049925327301, + "epoch": 0.111, + "grad_norm": 29.407721301071028, + "kl_loss_13": 7569.6, + "kl_loss_26": 5844.8, + "kl_loss_39": 5470.4, + "kl_loss_7": 8359.2, + "learning_rate": 0.0009745380759905647, + "loss": 13627.8, + "step": 1110 + }, + { + "ce_loss_13": 5.087459588050843, + "ce_loss_26": 4.285146009922028, + "ce_loss_39": 4.101419150829315, + "ce_loss_52": 1.3823944509029389, + "ce_loss_7": 5.47238245010376, + "epoch": 0.112, + "grad_norm": 28.691648596064702, + "kl_loss_13": 7563.2, + "kl_loss_26": 5855.2, + "kl_loss_39": 5476.0, + "kl_loss_7": 8378.4, + "learning_rate": 0.0009740358145174998, + "loss": 13629.4, + "step": 1120 + }, + { + "ce_loss_13": 5.085049080848694, + "ce_loss_26": 4.283704102039337, + "ce_loss_39": 4.095863288640976, + "ce_loss_52": 1.4323463156819343, + "ce_loss_7": 5.455269980430603, + "epoch": 0.113, + "grad_norm": 28.708307757554874, + "kl_loss_13": 7468.0, + "kl_loss_26": 5755.2, + "kl_loss_39": 5364.0, + "kl_loss_7": 8244.8, + "learning_rate": 0.0009735287795090455, + "loss": 13475.0, + "step": 1130 + }, + { + "ce_loss_13": 4.9704699397087095, + "ce_loss_26": 4.164930063486099, + "ce_loss_39": 3.981805819272995, + "ce_loss_52": 1.3983336806297302, + "ce_loss_7": 5.351285874843597, + "epoch": 0.114, + "grad_norm": 30.611214200807577, + "kl_loss_13": 7311.2, + "kl_loss_26": 5585.6, + "kl_loss_39": 5199.2, + "kl_loss_7": 8099.2, + "learning_rate": 0.0009730169760710386, + "loss": 13288.2, + "step": 1140 + }, + { + "ce_loss_13": 5.094941341876984, + "ce_loss_26": 4.275070035457611, + "ce_loss_39": 4.084649866819381, + "ce_loss_52": 1.4357529014348984, + "ce_loss_7": 5.482879185676575, + "epoch": 0.115, + "grad_norm": 29.988760771554233, + "kl_loss_13": 7468.8, + "kl_loss_26": 5715.2, + "kl_loss_39": 5326.4, + "kl_loss_7": 8275.2, + "learning_rate": 0.0009725004093573342, + "loss": 13196.6, + "step": 1150 + }, + { + "ce_loss_13": 4.942017900943756, + "ce_loss_26": 4.151496112346649, + "ce_loss_39": 3.957593894004822, + "ce_loss_52": 1.4098813980817795, + "ce_loss_7": 5.323493349552154, + "epoch": 0.116, + "grad_norm": 30.06505974205765, + "kl_loss_13": 7237.6, + "kl_loss_26": 5534.4, + "kl_loss_39": 5135.2, + "kl_loss_7": 8035.2, + "learning_rate": 0.0009719790845697534, + "loss": 13084.4, + "step": 1160 + }, + { + "ce_loss_13": 4.974001240730286, + "ce_loss_26": 4.16838675737381, + "ce_loss_39": 3.968938571214676, + "ce_loss_52": 1.4311222642660142, + "ce_loss_7": 5.3611521363258365, + "epoch": 0.117, + "grad_norm": 28.226442547632384, + "kl_loss_13": 7241.6, + "kl_loss_26": 5536.8, + "kl_loss_39": 5121.6, + "kl_loss_7": 8045.6, + "learning_rate": 0.0009714530069580309, + "loss": 12959.6, + "step": 1170 + }, + { + "ce_loss_13": 4.905927586555481, + "ce_loss_26": 4.065518736839294, + "ce_loss_39": 3.8746193647384644, + "ce_loss_52": 1.3948067665100097, + "ce_loss_7": 5.290950679779053, + "epoch": 0.118, + "grad_norm": 26.7183550844667, + "kl_loss_13": 7163.2, + "kl_loss_26": 5388.0, + "kl_loss_39": 4992.8, + "kl_loss_7": 7959.2, + "learning_rate": 0.0009709221818197624, + "loss": 12883.0, + "step": 1180 + }, + { + "ce_loss_13": 4.902862447500229, + "ce_loss_26": 4.10335453748703, + "ce_loss_39": 3.920765632390976, + "ce_loss_52": 1.4242859303951263, + "ce_loss_7": 5.280882668495178, + "epoch": 0.119, + "grad_norm": 27.541041012815914, + "kl_loss_13": 7096.0, + "kl_loss_26": 5372.8, + "kl_loss_39": 4987.2, + "kl_loss_7": 7895.2, + "learning_rate": 0.0009703866145003512, + "loss": 12755.6, + "step": 1190 + }, + { + "ce_loss_13": 4.91794501543045, + "ce_loss_26": 4.094452971220017, + "ce_loss_39": 3.892131644487381, + "ce_loss_52": 1.4217353582382202, + "ce_loss_7": 5.298339033126831, + "epoch": 0.12, + "grad_norm": 27.647397498896083, + "kl_loss_13": 7171.2, + "kl_loss_26": 5395.2, + "kl_loss_39": 4985.6, + "kl_loss_7": 7958.4, + "learning_rate": 0.0009698463103929542, + "loss": 12646.8, + "step": 1200 + }, + { + "ce_loss_13": 4.933221316337585, + "ce_loss_26": 4.129461044073105, + "ce_loss_39": 3.92621054649353, + "ce_loss_52": 1.4751009970903397, + "ce_loss_7": 5.315613615512848, + "epoch": 0.121, + "grad_norm": 26.42127738230431, + "kl_loss_13": 7043.2, + "kl_loss_26": 5317.6, + "kl_loss_39": 4899.2, + "kl_loss_7": 7841.6, + "learning_rate": 0.0009693012749384279, + "loss": 12515.8, + "step": 1210 + }, + { + "ce_loss_13": 4.865577363967896, + "ce_loss_26": 4.064575934410096, + "ce_loss_39": 3.863145834207535, + "ce_loss_52": 1.4422448396682739, + "ce_loss_7": 5.235701704025269, + "epoch": 0.122, + "grad_norm": 25.567127248660423, + "kl_loss_13": 6988.0, + "kl_loss_26": 5271.2, + "kl_loss_39": 4855.2, + "kl_loss_7": 7760.8, + "learning_rate": 0.0009687515136252732, + "loss": 12484.2, + "step": 1220 + }, + { + "ce_loss_13": 4.875257205963135, + "ce_loss_26": 4.068110597133637, + "ce_loss_39": 3.866461306810379, + "ce_loss_52": 1.4406520485877992, + "ce_loss_7": 5.254658281803131, + "epoch": 0.123, + "grad_norm": 25.60032726683598, + "kl_loss_13": 7019.2, + "kl_loss_26": 5276.8, + "kl_loss_39": 4864.0, + "kl_loss_7": 7809.6, + "learning_rate": 0.0009681970319895803, + "loss": 12358.8, + "step": 1230 + }, + { + "ce_loss_13": 4.856750476360321, + "ce_loss_26": 4.064332664012909, + "ce_loss_39": 3.865360552072525, + "ce_loss_52": 1.4734540313482285, + "ce_loss_7": 5.232936894893646, + "epoch": 0.124, + "grad_norm": 27.54111241294886, + "kl_loss_13": 6928.8, + "kl_loss_26": 5224.8, + "kl_loss_39": 4806.4, + "kl_loss_7": 7701.6, + "learning_rate": 0.0009676378356149733, + "loss": 12219.2, + "step": 1240 + }, + { + "ce_loss_13": 4.714985811710358, + "ce_loss_26": 3.8911590695381166, + "ce_loss_39": 3.6897071480751036, + "ce_loss_52": 1.4211883068084716, + "ce_loss_7": 5.092134392261505, + "epoch": 0.125, + "grad_norm": 26.159823932609697, + "kl_loss_13": 6763.2, + "kl_loss_26": 5001.6, + "kl_loss_39": 4584.8, + "kl_loss_7": 7549.6, + "learning_rate": 0.0009670739301325534, + "loss": 12043.8, + "step": 1250 + }, + { + "ce_loss_13": 4.746155381202698, + "ce_loss_26": 3.9151339173316955, + "ce_loss_39": 3.717781513929367, + "ce_loss_52": 1.3889067679643632, + "ce_loss_7": 5.118369615077972, + "epoch": 0.126, + "grad_norm": 27.210751367526278, + "kl_loss_13": 6825.6, + "kl_loss_26": 5060.0, + "kl_loss_39": 4660.0, + "kl_loss_7": 7612.0, + "learning_rate": 0.0009665053212208426, + "loss": 12020.6, + "step": 1260 + }, + { + "ce_loss_13": 4.729644465446472, + "ce_loss_26": 3.8812398612499237, + "ce_loss_39": 3.686249554157257, + "ce_loss_52": 1.421858811378479, + "ce_loss_7": 5.109574091434479, + "epoch": 0.127, + "grad_norm": 24.75325655489569, + "kl_loss_13": 6772.8, + "kl_loss_26": 4955.6, + "kl_loss_39": 4553.2, + "kl_loss_7": 7572.8, + "learning_rate": 0.0009659320146057262, + "loss": 11949.0, + "step": 1270 + }, + { + "ce_loss_13": 4.706896722316742, + "ce_loss_26": 3.884850525856018, + "ce_loss_39": 3.679063153266907, + "ce_loss_52": 1.4093473598361015, + "ce_loss_7": 5.0908261895179745, + "epoch": 0.128, + "grad_norm": 25.78294847436066, + "kl_loss_13": 6728.0, + "kl_loss_26": 4984.8, + "kl_loss_39": 4562.4, + "kl_loss_7": 7525.6, + "learning_rate": 0.0009653540160603955, + "loss": 11920.8, + "step": 1280 + }, + { + "ce_loss_13": 4.68510691523552, + "ce_loss_26": 3.886442297697067, + "ce_loss_39": 3.682328295707703, + "ce_loss_52": 1.4646018967032433, + "ce_loss_7": 5.063843679428101, + "epoch": 0.129, + "grad_norm": 24.870529388647757, + "kl_loss_13": 6587.2, + "kl_loss_26": 4871.2, + "kl_loss_39": 4451.6, + "kl_loss_7": 7384.8, + "learning_rate": 0.0009647713314052896, + "loss": 11716.1, + "step": 1290 + }, + { + "ce_loss_13": 4.6914472579956055, + "ce_loss_26": 3.8814328253269195, + "ce_loss_39": 3.6700519025325775, + "ce_loss_52": 1.428275752067566, + "ce_loss_7": 5.065358865261078, + "epoch": 0.13, + "grad_norm": 24.506148720784267, + "kl_loss_13": 6617.6, + "kl_loss_26": 4886.4, + "kl_loss_39": 4454.8, + "kl_loss_7": 7397.6, + "learning_rate": 0.0009641839665080363, + "loss": 11621.0, + "step": 1300 + }, + { + "ce_loss_13": 4.666101861000061, + "ce_loss_26": 3.8607113540172575, + "ce_loss_39": 3.6500234425067903, + "ce_loss_52": 1.4570627421140672, + "ce_loss_7": 5.0361028671264645, + "epoch": 0.131, + "grad_norm": 23.043331928969636, + "kl_loss_13": 6562.4, + "kl_loss_26": 4820.0, + "kl_loss_39": 4389.2, + "kl_loss_7": 7340.0, + "learning_rate": 0.0009635919272833937, + "loss": 11547.2, + "step": 1310 + }, + { + "ce_loss_13": 4.57597508430481, + "ce_loss_26": 3.757897812128067, + "ce_loss_39": 3.541293317079544, + "ce_loss_52": 1.417707359790802, + "ce_loss_7": 4.957513523101807, + "epoch": 0.132, + "grad_norm": 23.874048124377556, + "kl_loss_13": 6413.6, + "kl_loss_26": 4680.0, + "kl_loss_39": 4232.4, + "kl_loss_7": 7214.4, + "learning_rate": 0.0009629952196931902, + "loss": 11465.6, + "step": 1320 + }, + { + "ce_loss_13": 4.599424958229065, + "ce_loss_26": 3.7814753651618958, + "ce_loss_39": 3.5756381869316103, + "ce_loss_52": 1.435066069662571, + "ce_loss_7": 4.962808167934417, + "epoch": 0.133, + "grad_norm": 27.163150075748632, + "kl_loss_13": 6461.6, + "kl_loss_26": 4719.2, + "kl_loss_39": 4284.8, + "kl_loss_7": 7222.4, + "learning_rate": 0.0009623938497462645, + "loss": 11415.0, + "step": 1330 + }, + { + "ce_loss_13": 4.593182015419006, + "ce_loss_26": 3.749741864204407, + "ce_loss_39": 3.543072348833084, + "ce_loss_52": 1.4210506305098534, + "ce_loss_7": 4.98070273399353, + "epoch": 0.134, + "grad_norm": 23.74960078438379, + "kl_loss_13": 6465.6, + "kl_loss_26": 4674.0, + "kl_loss_39": 4258.0, + "kl_loss_7": 7272.8, + "learning_rate": 0.0009617878234984055, + "loss": 11286.2, + "step": 1340 + }, + { + "ce_loss_13": 4.5947358965873715, + "ce_loss_26": 3.779453754425049, + "ce_loss_39": 3.554258805513382, + "ce_loss_52": 1.440669310092926, + "ce_loss_7": 4.96803480386734, + "epoch": 0.135, + "grad_norm": 24.32368060127727, + "kl_loss_13": 6402.4, + "kl_loss_26": 4660.8, + "kl_loss_39": 4202.0, + "kl_loss_7": 7178.4, + "learning_rate": 0.0009611771470522907, + "loss": 11138.4, + "step": 1350 + }, + { + "ce_loss_13": 4.540663009881973, + "ce_loss_26": 3.7292558193206786, + "ce_loss_39": 3.5131251573562623, + "ce_loss_52": 1.4146902561187744, + "ce_loss_7": 4.918925869464874, + "epoch": 0.136, + "grad_norm": 23.85034387543973, + "kl_loss_13": 6383.2, + "kl_loss_26": 4642.0, + "kl_loss_39": 4189.2, + "kl_loss_7": 7181.6, + "learning_rate": 0.0009605618265574251, + "loss": 11195.2, + "step": 1360 + }, + { + "ce_loss_13": 4.595166695117951, + "ce_loss_26": 3.7860535979270935, + "ce_loss_39": 3.5680083632469177, + "ce_loss_52": 1.4854394227266312, + "ce_loss_7": 4.9739551663398744, + "epoch": 0.137, + "grad_norm": 23.647990920122997, + "kl_loss_13": 6340.0, + "kl_loss_26": 4612.0, + "kl_loss_39": 4162.0, + "kl_loss_7": 7127.2, + "learning_rate": 0.0009599418682100792, + "loss": 11028.6, + "step": 1370 + }, + { + "ce_loss_13": 4.481674873828888, + "ce_loss_26": 3.655723828077316, + "ce_loss_39": 3.4441386282444, + "ce_loss_52": 1.402228906750679, + "ce_loss_7": 4.8588902950286865, + "epoch": 0.138, + "grad_norm": 23.628756362977956, + "kl_loss_13": 6284.8, + "kl_loss_26": 4506.8, + "kl_loss_39": 4074.4, + "kl_loss_7": 7073.6, + "learning_rate": 0.0009593172782532268, + "loss": 10976.4, + "step": 1380 + }, + { + "ce_loss_13": 4.446731185913086, + "ce_loss_26": 3.6488849222660065, + "ce_loss_39": 3.4283725559711455, + "ce_loss_52": 1.425352481007576, + "ce_loss_7": 4.823254930973053, + "epoch": 0.139, + "grad_norm": 23.454937110465252, + "kl_loss_13": 6190.4, + "kl_loss_26": 4472.0, + "kl_loss_39": 4019.6, + "kl_loss_7": 6972.0, + "learning_rate": 0.0009586880629764817, + "loss": 10856.2, + "step": 1390 + }, + { + "ce_loss_13": 4.481068539619446, + "ce_loss_26": 3.6307739317417145, + "ce_loss_39": 3.4139047265052795, + "ce_loss_52": 1.3980510637164116, + "ce_loss_7": 4.877132707834244, + "epoch": 0.14, + "grad_norm": 24.701369227992412, + "kl_loss_13": 6300.0, + "kl_loss_26": 4496.4, + "kl_loss_39": 4045.6, + "kl_loss_7": 7120.0, + "learning_rate": 0.0009580542287160348, + "loss": 10848.4, + "step": 1400 + }, + { + "ce_loss_13": 4.481135439872742, + "ce_loss_26": 3.684358072280884, + "ce_loss_39": 3.454758608341217, + "ce_loss_52": 1.459119439125061, + "ce_loss_7": 4.855645072460175, + "epoch": 0.141, + "grad_norm": 24.081142665128635, + "kl_loss_13": 6146.4, + "kl_loss_26": 4455.2, + "kl_loss_39": 3985.2, + "kl_loss_7": 6936.0, + "learning_rate": 0.0009574157818545901, + "loss": 10711.8, + "step": 1410 + }, + { + "ce_loss_13": 4.440946173667908, + "ce_loss_26": 3.6337361335754395, + "ce_loss_39": 3.404258185625076, + "ce_loss_52": 1.411030687391758, + "ce_loss_7": 4.816100597381592, + "epoch": 0.142, + "grad_norm": 22.547443199755673, + "kl_loss_13": 6183.2, + "kl_loss_26": 4461.2, + "kl_loss_39": 3991.6, + "kl_loss_7": 6976.0, + "learning_rate": 0.0009567727288213005, + "loss": 10724.8, + "step": 1420 + }, + { + "ce_loss_13": 4.457812869548798, + "ce_loss_26": 3.6864565014839172, + "ce_loss_39": 3.452153670787811, + "ce_loss_52": 1.4788728266954423, + "ce_loss_7": 4.832965791225433, + "epoch": 0.143, + "grad_norm": 22.827900847688525, + "kl_loss_13": 6096.8, + "kl_loss_26": 4430.4, + "kl_loss_39": 3959.2, + "kl_loss_7": 6881.6, + "learning_rate": 0.0009561250760917027, + "loss": 10616.2, + "step": 1430 + }, + { + "ce_loss_13": 4.386147284507752, + "ce_loss_26": 3.586784356832504, + "ce_loss_39": 3.356312555074692, + "ce_loss_52": 1.4125748693943023, + "ce_loss_7": 4.7688825011253355, + "epoch": 0.144, + "grad_norm": 22.45947089806503, + "kl_loss_13": 6064.0, + "kl_loss_26": 4354.0, + "kl_loss_39": 3886.8, + "kl_loss_7": 6859.2, + "learning_rate": 0.0009554728301876525, + "loss": 10473.0, + "step": 1440 + }, + { + "ce_loss_13": 4.411167800426483, + "ce_loss_26": 3.590187501907349, + "ce_loss_39": 3.3659981071949003, + "ce_loss_52": 1.4227360993623734, + "ce_loss_7": 4.80015469789505, + "epoch": 0.145, + "grad_norm": 22.134471970609756, + "kl_loss_13": 6080.0, + "kl_loss_26": 4321.2, + "kl_loss_39": 3868.4, + "kl_loss_7": 6892.0, + "learning_rate": 0.0009548159976772592, + "loss": 10449.2, + "step": 1450 + }, + { + "ce_loss_13": 4.304250085353852, + "ce_loss_26": 3.5215473413467406, + "ce_loss_39": 3.3020472466945647, + "ce_loss_52": 1.455188724398613, + "ce_loss_7": 4.682382225990295, + "epoch": 0.146, + "grad_norm": 23.296949813068704, + "kl_loss_13": 5828.8, + "kl_loss_26": 4153.6, + "kl_loss_39": 3701.2, + "kl_loss_7": 6626.4, + "learning_rate": 0.0009541545851748186, + "loss": 10336.2, + "step": 1460 + }, + { + "ce_loss_13": 4.340453952550888, + "ce_loss_26": 3.527716559171677, + "ce_loss_39": 3.297034960985184, + "ce_loss_52": 1.4175047695636749, + "ce_loss_7": 4.720600801706314, + "epoch": 0.147, + "grad_norm": 23.69989338145452, + "kl_loss_13": 5933.6, + "kl_loss_26": 4197.2, + "kl_loss_39": 3732.0, + "kl_loss_7": 6732.8, + "learning_rate": 0.0009534885993407473, + "loss": 10320.4, + "step": 1470 + }, + { + "ce_loss_13": 4.316350519657135, + "ce_loss_26": 3.522084206342697, + "ce_loss_39": 3.293704879283905, + "ce_loss_52": 1.4351435631513596, + "ce_loss_7": 4.694415915012359, + "epoch": 0.148, + "grad_norm": 23.155946924290426, + "kl_loss_13": 5859.2, + "kl_loss_26": 4177.2, + "kl_loss_39": 3708.0, + "kl_loss_7": 6649.6, + "learning_rate": 0.0009528180468815154, + "loss": 10227.4, + "step": 1480 + }, + { + "ce_loss_13": 4.34768191576004, + "ce_loss_26": 3.5612153470516206, + "ce_loss_39": 3.3256225168704985, + "ce_loss_52": 1.4718306064605713, + "ce_loss_7": 4.73325879573822, + "epoch": 0.149, + "grad_norm": 22.828288355166595, + "kl_loss_13": 5840.8, + "kl_loss_26": 4173.2, + "kl_loss_39": 3699.2, + "kl_loss_7": 6654.4, + "learning_rate": 0.0009521429345495787, + "loss": 10213.0, + "step": 1490 + }, + { + "ce_loss_13": 4.285507726669311, + "ce_loss_26": 3.494914507865906, + "ce_loss_39": 3.2638841211795806, + "ce_loss_52": 1.4394250243902207, + "ce_loss_7": 4.674430012702942, + "epoch": 0.15, + "grad_norm": 22.358102006612796, + "kl_loss_13": 5802.4, + "kl_loss_26": 4114.4, + "kl_loss_39": 3639.2, + "kl_loss_7": 6619.2, + "learning_rate": 0.0009514632691433108, + "loss": 10144.0, + "step": 1500 + }, + { + "ce_loss_13": 4.274980753660202, + "ce_loss_26": 3.4573559522628785, + "ce_loss_39": 3.2235658168792725, + "ce_loss_52": 1.4003299355506897, + "ce_loss_7": 4.676058840751648, + "epoch": 0.151, + "grad_norm": 21.54131953317247, + "kl_loss_13": 5876.0, + "kl_loss_26": 4120.4, + "kl_loss_39": 3643.6, + "kl_loss_7": 6711.2, + "learning_rate": 0.0009507790575069346, + "loss": 10084.8, + "step": 1510 + }, + { + "ce_loss_13": 4.239670622348785, + "ce_loss_26": 3.4528944075107573, + "ce_loss_39": 3.221757102012634, + "ce_loss_52": 1.4378845229744912, + "ce_loss_7": 4.625989091396332, + "epoch": 0.152, + "grad_norm": 20.883193615641517, + "kl_loss_13": 5700.0, + "kl_loss_26": 4031.2, + "kl_loss_39": 3560.0, + "kl_loss_7": 6514.4, + "learning_rate": 0.0009500903065304539, + "loss": 9981.0, + "step": 1520 + }, + { + "ce_loss_13": 4.250718909502029, + "ce_loss_26": 3.4539793133735657, + "ce_loss_39": 3.2287534534931184, + "ce_loss_52": 1.450673970580101, + "ce_loss_7": 4.637939321994781, + "epoch": 0.153, + "grad_norm": 21.937587942461658, + "kl_loss_13": 5717.6, + "kl_loss_26": 4016.0, + "kl_loss_39": 3551.6, + "kl_loss_7": 6520.0, + "learning_rate": 0.0009493970231495835, + "loss": 9886.2, + "step": 1530 + }, + { + "ce_loss_13": 4.213818311691284, + "ce_loss_26": 3.4225172460079194, + "ce_loss_39": 3.192109799385071, + "ce_loss_52": 1.4253905564546585, + "ce_loss_7": 4.591876769065857, + "epoch": 0.154, + "grad_norm": 22.11614774484019, + "kl_loss_13": 5669.6, + "kl_loss_26": 3997.6, + "kl_loss_39": 3517.6, + "kl_loss_7": 6459.2, + "learning_rate": 0.0009486992143456792, + "loss": 9848.6, + "step": 1540 + }, + { + "ce_loss_13": 4.188841539621353, + "ce_loss_26": 3.3936797797679903, + "ce_loss_39": 3.15527623295784, + "ce_loss_52": 1.4304118230938911, + "ce_loss_7": 4.582345807552338, + "epoch": 0.155, + "grad_norm": 23.528038258157196, + "kl_loss_13": 5588.0, + "kl_loss_26": 3908.0, + "kl_loss_39": 3429.6, + "kl_loss_7": 6408.8, + "learning_rate": 0.0009479968871456679, + "loss": 9804.0, + "step": 1550 + }, + { + "ce_loss_13": 4.199311399459839, + "ce_loss_26": 3.388694739341736, + "ce_loss_39": 3.161331224441528, + "ce_loss_52": 1.424024721980095, + "ce_loss_7": 4.598053079843521, + "epoch": 0.156, + "grad_norm": 20.794486798088236, + "kl_loss_13": 5643.2, + "kl_loss_26": 3922.0, + "kl_loss_39": 3462.8, + "kl_loss_7": 6473.6, + "learning_rate": 0.0009472900486219768, + "loss": 9758.3, + "step": 1560 + }, + { + "ce_loss_13": 4.16922065615654, + "ce_loss_26": 3.3762763381004333, + "ce_loss_39": 3.142381691932678, + "ce_loss_52": 1.4278530597686767, + "ce_loss_7": 4.5707217931747435, + "epoch": 0.157, + "grad_norm": 21.684568107604626, + "kl_loss_13": 5582.4, + "kl_loss_26": 3884.4, + "kl_loss_39": 3405.2, + "kl_loss_7": 6420.0, + "learning_rate": 0.000946578705892462, + "loss": 9625.8, + "step": 1570 + }, + { + "ce_loss_13": 4.178904807567596, + "ce_loss_26": 3.382037007808685, + "ce_loss_39": 3.137607681751251, + "ce_loss_52": 1.4311140328645706, + "ce_loss_7": 4.566913962364197, + "epoch": 0.158, + "grad_norm": 21.97647697577533, + "kl_loss_13": 5572.8, + "kl_loss_26": 3885.6, + "kl_loss_39": 3393.6, + "kl_loss_7": 6394.4, + "learning_rate": 0.0009458628661203367, + "loss": 9608.1, + "step": 1580 + }, + { + "ce_loss_13": 4.179209893941879, + "ce_loss_26": 3.378256690502167, + "ce_loss_39": 3.13810538649559, + "ce_loss_52": 1.418858152627945, + "ce_loss_7": 4.562736237049103, + "epoch": 0.159, + "grad_norm": 20.949460387099393, + "kl_loss_13": 5612.0, + "kl_loss_26": 3914.4, + "kl_loss_39": 3428.8, + "kl_loss_7": 6420.0, + "learning_rate": 0.0009451425365140996, + "loss": 9608.8, + "step": 1590 + }, + { + "ce_loss_13": 4.1623717725276945, + "ce_loss_26": 3.376670056581497, + "ce_loss_39": 3.1331222474575045, + "ce_loss_52": 1.434694454073906, + "ce_loss_7": 4.556069934368134, + "epoch": 0.16, + "grad_norm": 20.884587914746188, + "kl_loss_13": 5573.6, + "kl_loss_26": 3878.8, + "kl_loss_39": 3382.4, + "kl_loss_7": 6402.4, + "learning_rate": 0.0009444177243274617, + "loss": 9535.6, + "step": 1600 + }, + { + "ce_loss_13": 4.082578724622726, + "ce_loss_26": 3.289813929796219, + "ce_loss_39": 3.0515677452087404, + "ce_loss_52": 1.4236672833561896, + "ce_loss_7": 4.478320574760437, + "epoch": 0.161, + "grad_norm": 20.592533194999756, + "kl_loss_13": 5423.2, + "kl_loss_26": 3736.8, + "kl_loss_39": 3249.6, + "kl_loss_7": 6253.6, + "learning_rate": 0.0009436884368592739, + "loss": 9466.0, + "step": 1610 + }, + { + "ce_loss_13": 4.142659282684326, + "ce_loss_26": 3.3751652896404267, + "ce_loss_39": 3.137872564792633, + "ce_loss_52": 1.481352651119232, + "ce_loss_7": 4.527895116806031, + "epoch": 0.162, + "grad_norm": 21.486710542968336, + "kl_loss_13": 5416.8, + "kl_loss_26": 3772.8, + "kl_loss_39": 3282.8, + "kl_loss_7": 6228.8, + "learning_rate": 0.0009429546814534529, + "loss": 9367.9, + "step": 1620 + }, + { + "ce_loss_13": 4.141797959804535, + "ce_loss_26": 3.344935214519501, + "ce_loss_39": 3.1015843570232393, + "ce_loss_52": 1.4449981674551964, + "ce_loss_7": 4.53586882352829, + "epoch": 0.163, + "grad_norm": 22.257565389083407, + "kl_loss_13": 5484.8, + "kl_loss_26": 3776.8, + "kl_loss_39": 3282.8, + "kl_loss_7": 6309.6, + "learning_rate": 0.0009422164654989072, + "loss": 9391.3, + "step": 1630 + }, + { + "ce_loss_13": 4.131260120868683, + "ce_loss_26": 3.3283946096897123, + "ce_loss_39": 3.08265677690506, + "ce_loss_52": 1.446463230252266, + "ce_loss_7": 4.523310673236847, + "epoch": 0.164, + "grad_norm": 20.40514368262374, + "kl_loss_13": 5457.6, + "kl_loss_26": 3766.0, + "kl_loss_39": 3265.6, + "kl_loss_7": 6288.8, + "learning_rate": 0.0009414737964294635, + "loss": 9297.4, + "step": 1640 + }, + { + "ce_loss_13": 4.055157667398452, + "ce_loss_26": 3.2639363288879393, + "ce_loss_39": 3.0281569600105285, + "ce_loss_52": 1.451804205775261, + "ce_loss_7": 4.447070962190628, + "epoch": 0.165, + "grad_norm": 21.902920394223948, + "kl_loss_13": 5323.2, + "kl_loss_26": 3634.0, + "kl_loss_39": 3150.0, + "kl_loss_7": 6143.2, + "learning_rate": 0.000940726681723791, + "loss": 9207.4, + "step": 1650 + }, + { + "ce_loss_13": 3.9808266043663023, + "ce_loss_26": 3.1883151113986967, + "ce_loss_39": 2.9586188077926634, + "ce_loss_52": 1.4088758006691933, + "ce_loss_7": 4.372277349233627, + "epoch": 0.166, + "grad_norm": 21.183658337296244, + "kl_loss_13": 5256.0, + "kl_loss_26": 3567.6, + "kl_loss_39": 3100.8, + "kl_loss_7": 6079.2, + "learning_rate": 0.0009399751289053266, + "loss": 9204.0, + "step": 1660 + }, + { + "ce_loss_13": 4.0190062642097475, + "ce_loss_26": 3.22450470328331, + "ce_loss_39": 2.987401658296585, + "ce_loss_52": 1.3997518077492714, + "ce_loss_7": 4.41674884557724, + "epoch": 0.167, + "grad_norm": 21.809854839151214, + "kl_loss_13": 5346.4, + "kl_loss_26": 3648.8, + "kl_loss_39": 3164.8, + "kl_loss_7": 6175.2, + "learning_rate": 0.0009392191455421988, + "loss": 9183.3, + "step": 1670 + }, + { + "ce_loss_13": 3.9647205591201784, + "ce_loss_26": 3.1920640766620636, + "ce_loss_39": 2.951520323753357, + "ce_loss_52": 1.385107731819153, + "ce_loss_7": 4.359467995166779, + "epoch": 0.168, + "grad_norm": 20.368238654152925, + "kl_loss_13": 5256.8, + "kl_loss_26": 3602.8, + "kl_loss_39": 3118.0, + "kl_loss_7": 6080.8, + "learning_rate": 0.0009384587392471515, + "loss": 9080.3, + "step": 1680 + }, + { + "ce_loss_13": 3.9925671815872192, + "ce_loss_26": 3.2080724120140074, + "ce_loss_39": 2.9695263385772703, + "ce_loss_52": 1.4156971365213393, + "ce_loss_7": 4.380452990531921, + "epoch": 0.169, + "grad_norm": 20.999177946491915, + "kl_loss_13": 5268.0, + "kl_loss_26": 3596.4, + "kl_loss_39": 3100.8, + "kl_loss_7": 6080.8, + "learning_rate": 0.0009376939176774678, + "loss": 8989.5, + "step": 1690 + }, + { + "ce_loss_13": 4.0180779755115505, + "ce_loss_26": 3.258928191661835, + "ce_loss_39": 3.0135749876499176, + "ce_loss_52": 1.450287464261055, + "ce_loss_7": 4.408803248405457, + "epoch": 0.17, + "grad_norm": 19.79885149243747, + "kl_loss_13": 5224.8, + "kl_loss_26": 3590.8, + "kl_loss_39": 3092.0, + "kl_loss_7": 6036.8, + "learning_rate": 0.0009369246885348925, + "loss": 8994.3, + "step": 1700 + }, + { + "ce_loss_13": 4.005008333921433, + "ce_loss_26": 3.2108667314052584, + "ce_loss_39": 2.9691853642463686, + "ce_loss_52": 1.4231860041618347, + "ce_loss_7": 4.408407872915268, + "epoch": 0.171, + "grad_norm": 20.19997348661953, + "kl_loss_13": 5275.2, + "kl_loss_26": 3580.8, + "kl_loss_39": 3088.8, + "kl_loss_7": 6116.8, + "learning_rate": 0.0009361510595655545, + "loss": 9032.7, + "step": 1710 + }, + { + "ce_loss_13": 4.0295430123806, + "ce_loss_26": 3.269349628686905, + "ce_loss_39": 3.0168069303035736, + "ce_loss_52": 1.4558824241161346, + "ce_loss_7": 4.418658912181854, + "epoch": 0.172, + "grad_norm": 20.29063277895484, + "kl_loss_13": 5251.2, + "kl_loss_26": 3624.8, + "kl_loss_39": 3109.2, + "kl_loss_7": 6066.4, + "learning_rate": 0.0009353730385598887, + "loss": 8917.6, + "step": 1720 + }, + { + "ce_loss_13": 3.904751992225647, + "ce_loss_26": 3.117345708608627, + "ce_loss_39": 2.8736896753311156, + "ce_loss_52": 1.404754376411438, + "ce_loss_7": 4.301223260164261, + "epoch": 0.173, + "grad_norm": 21.33168840347063, + "kl_loss_13": 5095.2, + "kl_loss_26": 3414.0, + "kl_loss_39": 2926.0, + "kl_loss_7": 5920.8, + "learning_rate": 0.0009345906333525581, + "loss": 8827.0, + "step": 1730 + }, + { + "ce_loss_13": 3.943844336271286, + "ce_loss_26": 3.192184156179428, + "ce_loss_39": 2.938348424434662, + "ce_loss_52": 1.4280656158924103, + "ce_loss_7": 4.340347635746002, + "epoch": 0.174, + "grad_norm": 20.78517763533083, + "kl_loss_13": 5121.6, + "kl_loss_26": 3512.0, + "kl_loss_39": 2997.2, + "kl_loss_7": 5951.2, + "learning_rate": 0.0009338038518223745, + "loss": 8776.4, + "step": 1740 + }, + { + "ce_loss_13": 3.9746175587177275, + "ce_loss_26": 3.217360532283783, + "ce_loss_39": 2.9738565742969514, + "ce_loss_52": 1.460440719127655, + "ce_loss_7": 4.36298366189003, + "epoch": 0.175, + "grad_norm": 22.95282400415446, + "kl_loss_13": 5116.0, + "kl_loss_26": 3500.0, + "kl_loss_39": 2996.0, + "kl_loss_7": 5937.6, + "learning_rate": 0.0009330127018922195, + "loss": 8715.7, + "step": 1750 + }, + { + "ce_loss_13": 3.903409707546234, + "ce_loss_26": 3.1333308279514314, + "ce_loss_39": 2.8973484218120573, + "ce_loss_52": 1.4389021694660187, + "ce_loss_7": 4.297668445110321, + "epoch": 0.176, + "grad_norm": 19.71888822868113, + "kl_loss_13": 5043.2, + "kl_loss_26": 3399.6, + "kl_loss_39": 2906.0, + "kl_loss_7": 5868.8, + "learning_rate": 0.0009322171915289634, + "loss": 8660.5, + "step": 1760 + }, + { + "ce_loss_13": 3.9436737656593324, + "ce_loss_26": 3.180408328771591, + "ce_loss_39": 2.933422142267227, + "ce_loss_52": 1.468785560131073, + "ce_loss_7": 4.335923504829407, + "epoch": 0.177, + "grad_norm": 21.100233895265884, + "kl_loss_13": 5036.8, + "kl_loss_26": 3416.8, + "kl_loss_39": 2893.6, + "kl_loss_7": 5853.6, + "learning_rate": 0.0009314173287433873, + "loss": 8685.1, + "step": 1770 + }, + { + "ce_loss_13": 4.000654596090317, + "ce_loss_26": 3.24809735417366, + "ce_loss_39": 2.9819031238555906, + "ce_loss_52": 1.4765232503414154, + "ce_loss_7": 4.392659711837768, + "epoch": 0.178, + "grad_norm": 20.346113365710192, + "kl_loss_13": 5141.6, + "kl_loss_26": 3518.0, + "kl_loss_39": 2990.4, + "kl_loss_7": 5964.0, + "learning_rate": 0.0009306131215901003, + "loss": 8673.2, + "step": 1780 + }, + { + "ce_loss_13": 3.9246813535690306, + "ce_loss_26": 3.1745048224925996, + "ce_loss_39": 2.9245960414409637, + "ce_loss_52": 1.4693263441324234, + "ce_loss_7": 4.317492133378982, + "epoch": 0.179, + "grad_norm": 19.397137651046872, + "kl_loss_13": 5018.4, + "kl_loss_26": 3389.2, + "kl_loss_39": 2878.0, + "kl_loss_7": 5844.8, + "learning_rate": 0.0009298045781674596, + "loss": 8564.1, + "step": 1790 + }, + { + "ce_loss_13": 3.910848397016525, + "ce_loss_26": 3.146669828891754, + "ce_loss_39": 2.8859269857406615, + "ce_loss_52": 1.419717761874199, + "ce_loss_7": 4.3105459094047545, + "epoch": 0.18, + "grad_norm": 19.504781875531744, + "kl_loss_13": 5040.0, + "kl_loss_26": 3411.2, + "kl_loss_39": 2894.0, + "kl_loss_7": 5888.0, + "learning_rate": 0.0009289917066174886, + "loss": 8563.1, + "step": 1800 + }, + { + "ce_loss_13": 3.886237096786499, + "ce_loss_26": 3.0992377579212187, + "ce_loss_39": 2.8514576256275177, + "ce_loss_52": 1.4184779956936837, + "ce_loss_7": 4.274789291620254, + "epoch": 0.181, + "grad_norm": 19.60587995762198, + "kl_loss_13": 5042.4, + "kl_loss_26": 3362.0, + "kl_loss_39": 2856.4, + "kl_loss_7": 5855.2, + "learning_rate": 0.0009281745151257945, + "loss": 8453.1, + "step": 1810 + }, + { + "ce_loss_13": 3.9144074499607084, + "ce_loss_26": 3.1662435114383696, + "ce_loss_39": 2.9089259922504427, + "ce_loss_52": 1.478528293967247, + "ce_loss_7": 4.301838612556457, + "epoch": 0.182, + "grad_norm": 19.79968756805323, + "kl_loss_13": 4923.2, + "kl_loss_26": 3333.2, + "kl_loss_39": 2811.6, + "kl_loss_7": 5736.8, + "learning_rate": 0.0009273530119214868, + "loss": 8471.9, + "step": 1820 + }, + { + "ce_loss_13": 3.8238776862621306, + "ce_loss_26": 3.058783656358719, + "ce_loss_39": 2.807573360204697, + "ce_loss_52": 1.4178009316325189, + "ce_loss_7": 4.227353280782699, + "epoch": 0.183, + "grad_norm": 19.471751859426742, + "kl_loss_13": 4899.2, + "kl_loss_26": 3273.6, + "kl_loss_39": 2762.8, + "kl_loss_7": 5738.4, + "learning_rate": 0.0009265272052770935, + "loss": 8399.4, + "step": 1830 + }, + { + "ce_loss_13": 3.836485821008682, + "ce_loss_26": 3.0666845202445985, + "ce_loss_39": 2.818044346570969, + "ce_loss_52": 1.410616011917591, + "ce_loss_7": 4.236015152931214, + "epoch": 0.184, + "grad_norm": 19.102126667670856, + "kl_loss_13": 4940.0, + "kl_loss_26": 3286.0, + "kl_loss_39": 2780.4, + "kl_loss_7": 5788.0, + "learning_rate": 0.0009256971035084784, + "loss": 8347.7, + "step": 1840 + }, + { + "ce_loss_13": 3.8049618661403657, + "ce_loss_26": 3.0603095471858976, + "ce_loss_39": 2.8144713938236237, + "ce_loss_52": 1.4259676218032837, + "ce_loss_7": 4.199950724840164, + "epoch": 0.185, + "grad_norm": 19.382557477849222, + "kl_loss_13": 4835.6, + "kl_loss_26": 3257.6, + "kl_loss_39": 2762.0, + "kl_loss_7": 5657.6, + "learning_rate": 0.0009248627149747573, + "loss": 8313.5, + "step": 1850 + }, + { + "ce_loss_13": 3.839466482400894, + "ce_loss_26": 3.065267437696457, + "ce_loss_39": 2.81703776717186, + "ce_loss_52": 1.4297384396195412, + "ce_loss_7": 4.240725481510163, + "epoch": 0.186, + "grad_norm": 20.053275602042234, + "kl_loss_13": 4908.4, + "kl_loss_26": 3269.6, + "kl_loss_39": 2759.6, + "kl_loss_7": 5741.6, + "learning_rate": 0.0009240240480782129, + "loss": 8305.0, + "step": 1860 + }, + { + "ce_loss_13": 3.8183856308460236, + "ce_loss_26": 3.070716941356659, + "ce_loss_39": 2.807391846179962, + "ce_loss_52": 1.4359196320176124, + "ce_loss_7": 4.214242458343506, + "epoch": 0.187, + "grad_norm": 19.191079413729856, + "kl_loss_13": 4847.2, + "kl_loss_26": 3263.6, + "kl_loss_39": 2733.2, + "kl_loss_7": 5672.8, + "learning_rate": 0.0009231811112642122, + "loss": 8227.6, + "step": 1870 + }, + { + "ce_loss_13": 3.779190129041672, + "ce_loss_26": 3.0389082789421082, + "ce_loss_39": 2.783122771978378, + "ce_loss_52": 1.4208515673875808, + "ce_loss_7": 4.162911784648895, + "epoch": 0.188, + "grad_norm": 20.43241639662848, + "kl_loss_13": 4795.2, + "kl_loss_26": 3215.2, + "kl_loss_39": 2693.2, + "kl_loss_7": 5606.4, + "learning_rate": 0.0009223339130211192, + "loss": 8213.8, + "step": 1880 + }, + { + "ce_loss_13": 3.708034944534302, + "ce_loss_26": 2.9702564030885696, + "ce_loss_39": 2.735187420248985, + "ce_loss_52": 1.409775149822235, + "ce_loss_7": 4.099964368343353, + "epoch": 0.189, + "grad_norm": 19.723419677891812, + "kl_loss_13": 4691.2, + "kl_loss_26": 3118.0, + "kl_loss_39": 2625.8, + "kl_loss_7": 5516.0, + "learning_rate": 0.0009214824618802108, + "loss": 8146.0, + "step": 1890 + }, + { + "ce_loss_13": 3.848232001066208, + "ce_loss_26": 3.06461501121521, + "ce_loss_39": 2.81026514172554, + "ce_loss_52": 1.4419742107391358, + "ce_loss_7": 4.244399529695511, + "epoch": 0.19, + "grad_norm": 21.429961437314283, + "kl_loss_13": 4923.2, + "kl_loss_26": 3246.4, + "kl_loss_39": 2725.6, + "kl_loss_7": 5755.2, + "learning_rate": 0.0009206267664155906, + "loss": 8168.2, + "step": 1900 + }, + { + "ce_loss_13": 3.74611656665802, + "ce_loss_26": 2.995819491147995, + "ce_loss_39": 2.7504830598831176, + "ce_loss_52": 1.429488417506218, + "ce_loss_7": 4.137593048810959, + "epoch": 0.191, + "grad_norm": 20.703417767001277, + "kl_loss_13": 4708.8, + "kl_loss_26": 3114.8, + "kl_loss_39": 2616.8, + "kl_loss_7": 5532.0, + "learning_rate": 0.0009197668352441024, + "loss": 8113.4, + "step": 1910 + }, + { + "ce_loss_13": 3.7616052985191346, + "ce_loss_26": 3.0063544154167174, + "ce_loss_39": 2.7453058779239656, + "ce_loss_52": 1.4119072929024696, + "ce_loss_7": 4.15754896402359, + "epoch": 0.192, + "grad_norm": 19.851460642991412, + "kl_loss_13": 4782.4, + "kl_loss_26": 3173.6, + "kl_loss_39": 2644.0, + "kl_loss_7": 5609.6, + "learning_rate": 0.0009189026770252437, + "loss": 8087.1, + "step": 1920 + }, + { + "ce_loss_13": 3.7914208650588987, + "ce_loss_26": 3.0324925601482393, + "ce_loss_39": 2.7728191137313845, + "ce_loss_52": 1.4355733066797256, + "ce_loss_7": 4.178089827299118, + "epoch": 0.193, + "grad_norm": 19.2014881795966, + "kl_loss_13": 4800.8, + "kl_loss_26": 3194.0, + "kl_loss_39": 2659.2, + "kl_loss_7": 5608.8, + "learning_rate": 0.000918034300461078, + "loss": 8051.8, + "step": 1930 + }, + { + "ce_loss_13": 3.7237455368041994, + "ce_loss_26": 2.979181283712387, + "ce_loss_39": 2.7309202194213866, + "ce_loss_52": 1.4160432904958724, + "ce_loss_7": 4.128066539764404, + "epoch": 0.194, + "grad_norm": 20.3086236795729, + "kl_loss_13": 4720.0, + "kl_loss_26": 3114.8, + "kl_loss_39": 2602.4, + "kl_loss_7": 5556.0, + "learning_rate": 0.0009171617142961477, + "loss": 8041.9, + "step": 1940 + }, + { + "ce_loss_13": 3.749431645870209, + "ce_loss_26": 2.9937612235546114, + "ce_loss_39": 2.744140648841858, + "ce_loss_52": 1.4348472714424134, + "ce_loss_7": 4.156274873018265, + "epoch": 0.195, + "grad_norm": 19.294577091174897, + "kl_loss_13": 4722.4, + "kl_loss_26": 3116.0, + "kl_loss_39": 2605.2, + "kl_loss_7": 5567.2, + "learning_rate": 0.0009162849273173857, + "loss": 7982.1, + "step": 1950 + }, + { + "ce_loss_13": 3.7092731952667237, + "ce_loss_26": 2.9811393320560455, + "ce_loss_39": 2.733800619840622, + "ce_loss_52": 1.4478029429912567, + "ce_loss_7": 4.0951203346252445, + "epoch": 0.196, + "grad_norm": 18.879569020896366, + "kl_loss_13": 4628.0, + "kl_loss_26": 3074.0, + "kl_loss_39": 2569.2, + "kl_loss_7": 5440.8, + "learning_rate": 0.0009154039483540273, + "loss": 7938.2, + "step": 1960 + }, + { + "ce_loss_13": 3.816760164499283, + "ce_loss_26": 3.0473806083202364, + "ce_loss_39": 2.7949241638183593, + "ce_loss_52": 1.466755247116089, + "ce_loss_7": 4.206719404458999, + "epoch": 0.197, + "grad_norm": 18.719442415203407, + "kl_loss_13": 4774.4, + "kl_loss_26": 3153.6, + "kl_loss_39": 2628.8, + "kl_loss_7": 5592.0, + "learning_rate": 0.0009145187862775209, + "loss": 7902.9, + "step": 1970 + }, + { + "ce_loss_13": 3.6841754376888276, + "ce_loss_26": 2.9468030989170075, + "ce_loss_39": 2.694840121269226, + "ce_loss_52": 1.4269890293478966, + "ce_loss_7": 4.081214648485184, + "epoch": 0.198, + "grad_norm": 18.93967966275207, + "kl_loss_13": 4605.6, + "kl_loss_26": 3030.4, + "kl_loss_39": 2512.8, + "kl_loss_7": 5443.2, + "learning_rate": 0.0009136294500014386, + "loss": 7824.0, + "step": 1980 + }, + { + "ce_loss_13": 3.7859152793884276, + "ce_loss_26": 3.025045871734619, + "ce_loss_39": 2.761893022060394, + "ce_loss_52": 1.439668196439743, + "ce_loss_7": 4.175345808267593, + "epoch": 0.199, + "grad_norm": 18.767073263034167, + "kl_loss_13": 4776.0, + "kl_loss_26": 3164.0, + "kl_loss_39": 2630.4, + "kl_loss_7": 5584.0, + "learning_rate": 0.000912735948481387, + "loss": 7845.9, + "step": 1990 + }, + { + "ce_loss_13": 3.7032747983932497, + "ce_loss_26": 2.965251809358597, + "ce_loss_39": 2.708036279678345, + "ce_loss_52": 1.4405365601181983, + "ce_loss_7": 4.098276823759079, + "epoch": 0.2, + "grad_norm": 18.50836033098629, + "kl_loss_13": 4630.0, + "kl_loss_26": 3052.8, + "kl_loss_39": 2530.4, + "kl_loss_7": 5457.6, + "learning_rate": 0.0009118382907149164, + "loss": 7748.9, + "step": 2000 + }, + { + "ce_loss_13": 3.717062991857529, + "ce_loss_26": 2.979129308462143, + "ce_loss_39": 2.7225220024585726, + "ce_loss_52": 1.45234707146883, + "ce_loss_7": 4.110658597946167, + "epoch": 0.201, + "grad_norm": 19.844351529751968, + "kl_loss_13": 4629.6, + "kl_loss_26": 3045.6, + "kl_loss_39": 2525.2, + "kl_loss_7": 5464.0, + "learning_rate": 0.0009109364857414306, + "loss": 7809.6, + "step": 2010 + }, + { + "ce_loss_13": 3.7279494285583494, + "ce_loss_26": 2.9849789261817934, + "ce_loss_39": 2.7267052114009855, + "ce_loss_52": 1.4486449271440507, + "ce_loss_7": 4.1131413102149965, + "epoch": 0.202, + "grad_norm": 19.33420875029095, + "kl_loss_13": 4639.2, + "kl_loss_26": 3061.2, + "kl_loss_39": 2533.6, + "kl_loss_7": 5456.0, + "learning_rate": 0.0009100305426420956, + "loss": 7708.0, + "step": 2020 + }, + { + "ce_loss_13": 3.643654578924179, + "ce_loss_26": 2.9251492261886596, + "ce_loss_39": 2.667558515071869, + "ce_loss_52": 1.4118730872869492, + "ce_loss_7": 4.034631943702697, + "epoch": 0.203, + "grad_norm": 19.14819354564296, + "kl_loss_13": 4553.6, + "kl_loss_26": 3016.0, + "kl_loss_39": 2492.0, + "kl_loss_7": 5371.2, + "learning_rate": 0.0009091204705397484, + "loss": 7699.0, + "step": 2030 + }, + { + "ce_loss_13": 3.706267160177231, + "ce_loss_26": 2.971902164816856, + "ce_loss_39": 2.7119477689266205, + "ce_loss_52": 1.4564015328884126, + "ce_loss_7": 4.097871041297912, + "epoch": 0.204, + "grad_norm": 18.950983085399912, + "kl_loss_13": 4556.8, + "kl_loss_26": 2972.2, + "kl_loss_39": 2454.6, + "kl_loss_7": 5380.0, + "learning_rate": 0.0009082062785988049, + "loss": 7671.5, + "step": 2040 + }, + { + "ce_loss_13": 3.6526230454444883, + "ce_loss_26": 2.905240607261658, + "ce_loss_39": 2.654486656188965, + "ce_loss_52": 1.4092363178730012, + "ce_loss_7": 4.043174755573273, + "epoch": 0.205, + "grad_norm": 20.137932882736667, + "kl_loss_13": 4559.6, + "kl_loss_26": 2969.2, + "kl_loss_39": 2459.2, + "kl_loss_7": 5384.0, + "learning_rate": 0.0009072879760251679, + "loss": 7662.0, + "step": 2050 + }, + { + "ce_loss_13": 3.578403168916702, + "ce_loss_26": 2.8548426389694215, + "ce_loss_39": 2.615563529729843, + "ce_loss_52": 1.4107020199298859, + "ce_loss_7": 3.97204332947731, + "epoch": 0.206, + "grad_norm": 19.383234658877402, + "kl_loss_13": 4444.0, + "kl_loss_26": 2894.0, + "kl_loss_39": 2391.4, + "kl_loss_7": 5266.4, + "learning_rate": 0.0009063655720661341, + "loss": 7643.5, + "step": 2060 + }, + { + "ce_loss_13": 3.5892180263996125, + "ce_loss_26": 2.855096530914307, + "ce_loss_39": 2.6004712164402006, + "ce_loss_52": 1.4145165607333183, + "ce_loss_7": 3.983754909038544, + "epoch": 0.207, + "grad_norm": 19.811430759479535, + "kl_loss_13": 4467.2, + "kl_loss_26": 2904.0, + "kl_loss_39": 2385.0, + "kl_loss_7": 5291.2, + "learning_rate": 0.000905439076010301, + "loss": 7534.0, + "step": 2070 + }, + { + "ce_loss_13": 3.6425350308418274, + "ce_loss_26": 2.9249331414699555, + "ce_loss_39": 2.6687968969345093, + "ce_loss_52": 1.4602822691202164, + "ce_loss_7": 4.025002205371857, + "epoch": 0.208, + "grad_norm": 19.741988709477923, + "kl_loss_13": 4438.4, + "kl_loss_26": 2910.0, + "kl_loss_39": 2394.8, + "kl_loss_7": 5248.8, + "learning_rate": 0.0009045084971874737, + "loss": 7505.8, + "step": 2080 + }, + { + "ce_loss_13": 3.553661996126175, + "ce_loss_26": 2.8095623433589934, + "ce_loss_39": 2.561682888865471, + "ce_loss_52": 1.3855161294341087, + "ce_loss_7": 3.9535735607147218, + "epoch": 0.209, + "grad_norm": 18.406387003890178, + "kl_loss_13": 4454.8, + "kl_loss_26": 2860.4, + "kl_loss_39": 2354.2, + "kl_loss_7": 5295.2, + "learning_rate": 0.0009035738449685707, + "loss": 7532.3, + "step": 2090 + }, + { + "ce_loss_13": 3.659823089838028, + "ce_loss_26": 2.919918876886368, + "ce_loss_39": 2.67503222823143, + "ce_loss_52": 1.4627915531396867, + "ce_loss_7": 4.0562332451343535, + "epoch": 0.21, + "grad_norm": 19.907425841295773, + "kl_loss_13": 4488.8, + "kl_loss_26": 2909.2, + "kl_loss_39": 2407.0, + "kl_loss_7": 5320.8, + "learning_rate": 0.0009026351287655293, + "loss": 7517.4, + "step": 2100 + }, + { + "ce_loss_13": 3.630769556760788, + "ce_loss_26": 2.9068243861198426, + "ce_loss_39": 2.642093613743782, + "ce_loss_52": 1.4367542505264281, + "ce_loss_7": 4.022511690855026, + "epoch": 0.211, + "grad_norm": 18.196990652810108, + "kl_loss_13": 4459.2, + "kl_loss_26": 2926.8, + "kl_loss_39": 2395.2, + "kl_loss_7": 5276.0, + "learning_rate": 0.0009016923580312113, + "loss": 7412.6, + "step": 2110 + }, + { + "ce_loss_13": 3.682375580072403, + "ce_loss_26": 2.9616983354091646, + "ce_loss_39": 2.6908247590065004, + "ce_loss_52": 1.482297733426094, + "ce_loss_7": 4.079386693239212, + "epoch": 0.212, + "grad_norm": 20.88180475150302, + "kl_loss_13": 4465.6, + "kl_loss_26": 2920.8, + "kl_loss_39": 2382.8, + "kl_loss_7": 5297.6, + "learning_rate": 0.0009007455422593077, + "loss": 7402.8, + "step": 2120 + }, + { + "ce_loss_13": 3.5886090993881226, + "ce_loss_26": 2.8697936654090883, + "ce_loss_39": 2.6118789970874787, + "ce_loss_52": 1.4324451625347137, + "ce_loss_7": 3.989769661426544, + "epoch": 0.213, + "grad_norm": 21.410550894381295, + "kl_loss_13": 4406.0, + "kl_loss_26": 2862.8, + "kl_loss_39": 2337.2, + "kl_loss_7": 5236.0, + "learning_rate": 0.0008997946909842425, + "loss": 7376.6, + "step": 2130 + }, + { + "ce_loss_13": 3.5265793919563295, + "ce_loss_26": 2.8049169957637785, + "ce_loss_39": 2.5549218744039535, + "ce_loss_52": 1.4088917583227158, + "ce_loss_7": 3.920357757806778, + "epoch": 0.214, + "grad_norm": 18.12930136088373, + "kl_loss_13": 4307.2, + "kl_loss_26": 2766.8, + "kl_loss_39": 2250.8, + "kl_loss_7": 5132.8, + "learning_rate": 0.0008988398137810777, + "loss": 7289.0, + "step": 2140 + }, + { + "ce_loss_13": 3.493118005990982, + "ce_loss_26": 2.7619090020656585, + "ce_loss_39": 2.523107871413231, + "ce_loss_52": 1.3855519428849221, + "ce_loss_7": 3.8948814868927, + "epoch": 0.215, + "grad_norm": 19.085499107664276, + "kl_loss_13": 4317.6, + "kl_loss_26": 2765.2, + "kl_loss_39": 2265.6, + "kl_loss_7": 5148.8, + "learning_rate": 0.0008978809202654162, + "loss": 7322.3, + "step": 2150 + }, + { + "ce_loss_13": 3.5010905504226684, + "ce_loss_26": 2.7818336695432664, + "ce_loss_39": 2.5317496716976167, + "ce_loss_52": 1.410066269338131, + "ce_loss_7": 3.8946242213249205, + "epoch": 0.216, + "grad_norm": 18.12712360145358, + "kl_loss_13": 4244.8, + "kl_loss_26": 2719.8, + "kl_loss_39": 2212.8, + "kl_loss_7": 5073.6, + "learning_rate": 0.0008969180200933046, + "loss": 7287.8, + "step": 2160 + }, + { + "ce_loss_13": 3.583745849132538, + "ce_loss_26": 2.8594263792037964, + "ce_loss_39": 2.5962479442358015, + "ce_loss_52": 1.4458883255720139, + "ce_loss_7": 3.976805257797241, + "epoch": 0.217, + "grad_norm": 17.565470208207127, + "kl_loss_13": 4359.6, + "kl_loss_26": 2818.4, + "kl_loss_39": 2285.8, + "kl_loss_7": 5177.6, + "learning_rate": 0.0008959511229611376, + "loss": 7240.9, + "step": 2170 + }, + { + "ce_loss_13": 3.580790603160858, + "ce_loss_26": 2.859365826845169, + "ce_loss_39": 2.5985568940639494, + "ce_loss_52": 1.4648242503404618, + "ce_loss_7": 3.9631645143032075, + "epoch": 0.218, + "grad_norm": 18.371856212121592, + "kl_loss_13": 4316.0, + "kl_loss_26": 2762.4, + "kl_loss_39": 2248.2, + "kl_loss_7": 5127.2, + "learning_rate": 0.0008949802386055581, + "loss": 7227.3, + "step": 2180 + }, + { + "ce_loss_13": 3.549471515417099, + "ce_loss_26": 2.814970576763153, + "ce_loss_39": 2.561775863170624, + "ce_loss_52": 1.4231164067983628, + "ce_loss_7": 3.941384530067444, + "epoch": 0.219, + "grad_norm": 17.91149753664061, + "kl_loss_13": 4321.2, + "kl_loss_26": 2784.4, + "kl_loss_39": 2263.6, + "kl_loss_7": 5152.0, + "learning_rate": 0.0008940053768033609, + "loss": 7238.8, + "step": 2190 + }, + { + "ce_loss_13": 3.5581556379795076, + "ce_loss_26": 2.825929582118988, + "ce_loss_39": 2.570690780878067, + "ce_loss_52": 1.4427952721714974, + "ce_loss_7": 3.9514447808265687, + "epoch": 0.22, + "grad_norm": 19.745379707547666, + "kl_loss_13": 4304.8, + "kl_loss_26": 2754.0, + "kl_loss_39": 2234.6, + "kl_loss_7": 5132.8, + "learning_rate": 0.0008930265473713938, + "loss": 7236.5, + "step": 2200 + }, + { + "ce_loss_13": 3.522547519207001, + "ce_loss_26": 2.7868906617164613, + "ce_loss_39": 2.524843490123749, + "ce_loss_52": 1.3902505502104758, + "ce_loss_7": 3.9295816838741304, + "epoch": 0.221, + "grad_norm": 18.58799040514992, + "kl_loss_13": 4349.2, + "kl_loss_26": 2792.0, + "kl_loss_39": 2262.0, + "kl_loss_7": 5198.4, + "learning_rate": 0.0008920437601664579, + "loss": 7187.9, + "step": 2210 + }, + { + "ce_loss_13": 3.5173128962516786, + "ce_loss_26": 2.804593563079834, + "ce_loss_39": 2.557006138563156, + "ce_loss_52": 1.4581168740987778, + "ce_loss_7": 3.910993677377701, + "epoch": 0.222, + "grad_norm": 18.95814843355098, + "kl_loss_13": 4224.4, + "kl_loss_26": 2677.2, + "kl_loss_39": 2165.2, + "kl_loss_7": 5053.6, + "learning_rate": 0.0008910570250852097, + "loss": 7168.4, + "step": 2220 + }, + { + "ce_loss_13": 3.4375841438770296, + "ce_loss_26": 2.7280281484127045, + "ce_loss_39": 2.47977514564991, + "ce_loss_52": 1.389390866458416, + "ce_loss_7": 3.838480031490326, + "epoch": 0.223, + "grad_norm": 19.685999540006744, + "kl_loss_13": 4179.6, + "kl_loss_26": 2668.0, + "kl_loss_39": 2152.8, + "kl_loss_7": 5011.2, + "learning_rate": 0.0008900663520640604, + "loss": 7080.8, + "step": 2230 + }, + { + "ce_loss_13": 3.5171928703784943, + "ce_loss_26": 2.8187219202518463, + "ce_loss_39": 2.5553694486618044, + "ce_loss_52": 1.4541724801063538, + "ce_loss_7": 3.8974815249443053, + "epoch": 0.224, + "grad_norm": 26.45112973751635, + "kl_loss_13": 4224.4, + "kl_loss_26": 2710.0, + "kl_loss_39": 2180.4, + "kl_loss_7": 5080.0, + "learning_rate": 0.0008890717510790764, + "loss": 7105.5, + "step": 2240 + }, + { + "ce_loss_13": 3.5361606895923616, + "ce_loss_26": 2.8268598556518554, + "ce_loss_39": 2.5735931187868117, + "ce_loss_52": 1.451829667389393, + "ce_loss_7": 3.924015772342682, + "epoch": 0.225, + "grad_norm": 20.75511317660791, + "kl_loss_13": 4247.2, + "kl_loss_26": 2732.8, + "kl_loss_39": 2219.0, + "kl_loss_7": 5061.6, + "learning_rate": 0.0008880732321458784, + "loss": 7074.7, + "step": 2250 + }, + { + "ce_loss_13": 3.4491190731525423, + "ce_loss_26": 2.7567967534065247, + "ce_loss_39": 2.5041564613580705, + "ce_loss_52": 1.4379881560802459, + "ce_loss_7": 3.8363168060779573, + "epoch": 0.226, + "grad_norm": 18.822666188140545, + "kl_loss_13": 4102.8, + "kl_loss_26": 2621.6, + "kl_loss_39": 2107.6, + "kl_loss_7": 4917.6, + "learning_rate": 0.0008870708053195413, + "loss": 7003.4, + "step": 2260 + }, + { + "ce_loss_13": 3.4790355801582336, + "ce_loss_26": 2.765997165441513, + "ce_loss_39": 2.5106064915657043, + "ce_loss_52": 1.4210210233926772, + "ce_loss_7": 3.8746932446956635, + "epoch": 0.227, + "grad_norm": 19.45926661054129, + "kl_loss_13": 4188.4, + "kl_loss_26": 2681.2, + "kl_loss_39": 2162.6, + "kl_loss_7": 5009.6, + "learning_rate": 0.0008860644806944918, + "loss": 7002.8, + "step": 2270 + }, + { + "ce_loss_13": 3.589535415172577, + "ce_loss_26": 2.870089566707611, + "ce_loss_39": 2.5925551772117617, + "ce_loss_52": 1.4516636282205582, + "ce_loss_7": 3.9855311453342437, + "epoch": 0.228, + "grad_norm": 18.527281302332295, + "kl_loss_13": 4335.2, + "kl_loss_26": 2802.8, + "kl_loss_39": 2252.6, + "kl_loss_7": 5164.0, + "learning_rate": 0.0008850542684044079, + "loss": 7072.1, + "step": 2280 + }, + { + "ce_loss_13": 3.4438597559928894, + "ce_loss_26": 2.74727184176445, + "ce_loss_39": 2.4982927203178407, + "ce_loss_52": 1.435218185186386, + "ce_loss_7": 3.8294356882572176, + "epoch": 0.229, + "grad_norm": 18.07396826505032, + "kl_loss_13": 4082.8, + "kl_loss_26": 2585.6, + "kl_loss_39": 2087.0, + "kl_loss_7": 4892.8, + "learning_rate": 0.0008840401786221159, + "loss": 6974.9, + "step": 2290 + }, + { + "ce_loss_13": 3.476649820804596, + "ce_loss_26": 2.7891372203826905, + "ce_loss_39": 2.5390550673007963, + "ce_loss_52": 1.4611460983753204, + "ce_loss_7": 3.854980993270874, + "epoch": 0.23, + "grad_norm": 18.716359933554248, + "kl_loss_13": 4118.8, + "kl_loss_26": 2644.2, + "kl_loss_39": 2128.6, + "kl_loss_7": 4946.0, + "learning_rate": 0.000883022221559489, + "loss": 6901.3, + "step": 2300 + }, + { + "ce_loss_13": 3.467282909154892, + "ce_loss_26": 2.7604516625404356, + "ce_loss_39": 2.5003271818161013, + "ce_loss_52": 1.4461873590946197, + "ce_loss_7": 3.856851851940155, + "epoch": 0.231, + "grad_norm": 20.721250836400138, + "kl_loss_13": 4114.0, + "kl_loss_26": 2616.8, + "kl_loss_39": 2092.6, + "kl_loss_7": 4926.4, + "learning_rate": 0.0008820004074673434, + "loss": 6876.9, + "step": 2310 + }, + { + "ce_loss_13": 3.4266963064670564, + "ce_loss_26": 2.719339656829834, + "ce_loss_39": 2.4688956409692766, + "ce_loss_52": 1.4164521768689156, + "ce_loss_7": 3.8023972034454347, + "epoch": 0.232, + "grad_norm": 17.616553712409548, + "kl_loss_13": 4090.4, + "kl_loss_26": 2589.2, + "kl_loss_39": 2065.4, + "kl_loss_7": 4891.2, + "learning_rate": 0.0008809747466353355, + "loss": 6907.6, + "step": 2320 + }, + { + "ce_loss_13": 3.536805588006973, + "ce_loss_26": 2.8070395588874817, + "ce_loss_39": 2.5501783430576324, + "ce_loss_52": 1.4646585762500763, + "ce_loss_7": 3.9247563600540163, + "epoch": 0.233, + "grad_norm": 17.658525951814237, + "kl_loss_13": 4218.8, + "kl_loss_26": 2666.0, + "kl_loss_39": 2147.2, + "kl_loss_7": 5044.0, + "learning_rate": 0.0008799452493918585, + "loss": 6862.9, + "step": 2330 + }, + { + "ce_loss_13": 3.3707460284233095, + "ce_loss_26": 2.680304506421089, + "ce_loss_39": 2.4279863387346268, + "ce_loss_52": 1.4292149528861047, + "ce_loss_7": 3.7509031653404237, + "epoch": 0.234, + "grad_norm": 18.293581891673337, + "kl_loss_13": 3979.6, + "kl_loss_26": 2481.4, + "kl_loss_39": 1967.6, + "kl_loss_7": 4792.0, + "learning_rate": 0.0008789119261039385, + "loss": 6860.7, + "step": 2340 + }, + { + "ce_loss_13": 3.411047804355621, + "ce_loss_26": 2.7024976193904875, + "ce_loss_39": 2.4417600989341737, + "ce_loss_52": 1.404353639483452, + "ce_loss_7": 3.799528968334198, + "epoch": 0.235, + "grad_norm": 19.60391257341661, + "kl_loss_13": 4078.4, + "kl_loss_26": 2576.8, + "kl_loss_39": 2050.4, + "kl_loss_7": 4901.6, + "learning_rate": 0.0008778747871771292, + "loss": 6770.8, + "step": 2350 + }, + { + "ce_loss_13": 3.4045680582523348, + "ce_loss_26": 2.6985227525234223, + "ce_loss_39": 2.451327767968178, + "ce_loss_52": 1.4205562889575958, + "ce_loss_7": 3.7882447242736816, + "epoch": 0.236, + "grad_norm": 18.739033338884607, + "kl_loss_13": 4038.4, + "kl_loss_26": 2533.2, + "kl_loss_39": 2034.8, + "kl_loss_7": 4858.4, + "learning_rate": 0.0008768338430554083, + "loss": 6755.4, + "step": 2360 + }, + { + "ce_loss_13": 3.3626140534877775, + "ce_loss_26": 2.65916622877121, + "ce_loss_39": 2.4009654462337493, + "ce_loss_52": 1.3916789084672927, + "ce_loss_7": 3.749758929014206, + "epoch": 0.237, + "grad_norm": 20.138759568479667, + "kl_loss_13": 4022.0, + "kl_loss_26": 2521.4, + "kl_loss_39": 1993.2, + "kl_loss_7": 4829.6, + "learning_rate": 0.0008757891042210713, + "loss": 6791.4, + "step": 2370 + }, + { + "ce_loss_13": 3.381242650747299, + "ce_loss_26": 2.68309933245182, + "ce_loss_39": 2.433691692352295, + "ce_loss_52": 1.4095703065395355, + "ce_loss_7": 3.772293299436569, + "epoch": 0.238, + "grad_norm": 17.323586780740413, + "kl_loss_13": 4016.4, + "kl_loss_26": 2531.4, + "kl_loss_39": 2016.2, + "kl_loss_7": 4839.6, + "learning_rate": 0.0008747405811946271, + "loss": 6729.9, + "step": 2380 + }, + { + "ce_loss_13": 3.418868046998978, + "ce_loss_26": 2.7223224580287932, + "ce_loss_39": 2.4728009045124053, + "ce_loss_52": 1.446861308813095, + "ce_loss_7": 3.7994930267333986, + "epoch": 0.239, + "grad_norm": 17.668482550134954, + "kl_loss_13": 4034.0, + "kl_loss_26": 2556.0, + "kl_loss_39": 2037.4, + "kl_loss_7": 4842.4, + "learning_rate": 0.0008736882845346905, + "loss": 6764.7, + "step": 2390 + }, + { + "ce_loss_13": 3.407726752758026, + "ce_loss_26": 2.714660122990608, + "ce_loss_39": 2.4702648639678957, + "ce_loss_52": 1.4476186811923981, + "ce_loss_7": 3.7862841546535493, + "epoch": 0.24, + "grad_norm": 20.313458924733165, + "kl_loss_13": 3988.4, + "kl_loss_26": 2507.2, + "kl_loss_39": 2008.2, + "kl_loss_7": 4794.4, + "learning_rate": 0.0008726322248378774, + "loss": 6720.4, + "step": 2400 + }, + { + "ce_loss_13": 3.4095008313655852, + "ce_loss_26": 2.69794414639473, + "ce_loss_39": 2.44133580327034, + "ce_loss_52": 1.426843424141407, + "ce_loss_7": 3.8018106520175934, + "epoch": 0.241, + "grad_norm": 17.847824779807837, + "kl_loss_13": 4039.6, + "kl_loss_26": 2535.6, + "kl_loss_39": 2007.8, + "kl_loss_7": 4864.0, + "learning_rate": 0.0008715724127386971, + "loss": 6713.1, + "step": 2410 + }, + { + "ce_loss_13": 3.370608961582184, + "ce_loss_26": 2.688097137212753, + "ce_loss_39": 2.435939407348633, + "ce_loss_52": 1.4344248950481415, + "ce_loss_7": 3.745537704229355, + "epoch": 0.242, + "grad_norm": 18.03045500704746, + "kl_loss_13": 3943.6, + "kl_loss_26": 2497.6, + "kl_loss_39": 1983.4, + "kl_loss_7": 4736.8, + "learning_rate": 0.0008705088589094458, + "loss": 6611.0, + "step": 2420 + }, + { + "ce_loss_13": 3.4596225798130034, + "ce_loss_26": 2.7608898997306826, + "ce_loss_39": 2.5022788047790527, + "ce_loss_52": 1.4623101890087127, + "ce_loss_7": 3.839513373374939, + "epoch": 0.243, + "grad_norm": 18.190444229647287, + "kl_loss_13": 4058.0, + "kl_loss_26": 2575.6, + "kl_loss_39": 2054.6, + "kl_loss_7": 4862.4, + "learning_rate": 0.0008694415740600988, + "loss": 6638.0, + "step": 2430 + }, + { + "ce_loss_13": 3.370687645673752, + "ce_loss_26": 2.6674872994422913, + "ce_loss_39": 2.414305740594864, + "ce_loss_52": 1.4324225425720214, + "ce_loss_7": 3.767412984371185, + "epoch": 0.244, + "grad_norm": 18.848120466041497, + "kl_loss_13": 3940.4, + "kl_loss_26": 2449.4, + "kl_loss_39": 1932.2, + "kl_loss_7": 4769.6, + "learning_rate": 0.0008683705689382025, + "loss": 6641.0, + "step": 2440 + }, + { + "ce_loss_13": 3.347389942407608, + "ce_loss_26": 2.6603663861751556, + "ce_loss_39": 2.4186645448207855, + "ce_loss_52": 1.452422297000885, + "ce_loss_7": 3.731181102991104, + "epoch": 0.245, + "grad_norm": 17.05503964147942, + "kl_loss_13": 3888.8, + "kl_loss_26": 2418.6, + "kl_loss_39": 1913.8, + "kl_loss_7": 4694.4, + "learning_rate": 0.0008672958543287666, + "loss": 6617.1, + "step": 2450 + }, + { + "ce_loss_13": 3.341637074947357, + "ce_loss_26": 2.653514164686203, + "ce_loss_39": 2.4030585259199144, + "ce_loss_52": 1.4203058749437332, + "ce_loss_7": 3.7326664865016936, + "epoch": 0.246, + "grad_norm": 18.028238041954314, + "kl_loss_13": 3894.0, + "kl_loss_26": 2435.6, + "kl_loss_39": 1932.4, + "kl_loss_7": 4711.2, + "learning_rate": 0.0008662174410541554, + "loss": 6537.2, + "step": 2460 + }, + { + "ce_loss_13": 3.343936342000961, + "ce_loss_26": 2.6602561354637144, + "ce_loss_39": 2.4062414824962617, + "ce_loss_52": 1.4341940209269524, + "ce_loss_7": 3.7255902886390686, + "epoch": 0.247, + "grad_norm": 17.615419947143096, + "kl_loss_13": 3890.8, + "kl_loss_26": 2433.8, + "kl_loss_39": 1921.6, + "kl_loss_7": 4692.0, + "learning_rate": 0.0008651353399739787, + "loss": 6499.5, + "step": 2470 + }, + { + "ce_loss_13": 3.36987144947052, + "ce_loss_26": 2.6651594936847687, + "ce_loss_39": 2.41037335395813, + "ce_loss_52": 1.4215908780694009, + "ce_loss_7": 3.75968217253685, + "epoch": 0.248, + "grad_norm": 19.448152371934484, + "kl_loss_13": 3954.4, + "kl_loss_26": 2461.0, + "kl_loss_39": 1945.4, + "kl_loss_7": 4776.0, + "learning_rate": 0.0008640495619849821, + "loss": 6570.0, + "step": 2480 + }, + { + "ce_loss_13": 3.3932483792304993, + "ce_loss_26": 2.69803272485733, + "ce_loss_39": 2.448850151896477, + "ce_loss_52": 1.4729512989521027, + "ce_loss_7": 3.7767464458942412, + "epoch": 0.249, + "grad_norm": 17.76914722627013, + "kl_loss_13": 3932.8, + "kl_loss_26": 2449.6, + "kl_loss_39": 1942.6, + "kl_loss_7": 4743.2, + "learning_rate": 0.0008629601180209381, + "loss": 6472.8, + "step": 2490 + }, + { + "ce_loss_13": 3.3404706001281737, + "ce_loss_26": 2.6527740180492403, + "ce_loss_39": 2.403418445587158, + "ce_loss_52": 1.4374166071414947, + "ce_loss_7": 3.7279320538043974, + "epoch": 0.25, + "grad_norm": 18.043688934070087, + "kl_loss_13": 3897.6, + "kl_loss_26": 2421.6, + "kl_loss_39": 1902.0, + "kl_loss_7": 4716.0, + "learning_rate": 0.000861867019052535, + "loss": 6482.2, + "step": 2500 + }, + { + "ce_loss_13": 3.408062273263931, + "ce_loss_26": 2.704035770893097, + "ce_loss_39": 2.46220483481884, + "ce_loss_52": 1.4713353991508484, + "ce_loss_7": 3.795337921380997, + "epoch": 0.251, + "grad_norm": 19.728713333582007, + "kl_loss_13": 3944.4, + "kl_loss_26": 2444.8, + "kl_loss_39": 1942.4, + "kl_loss_7": 4760.4, + "learning_rate": 0.0008607702760872678, + "loss": 6463.8, + "step": 2510 + }, + { + "ce_loss_13": 3.388230836391449, + "ce_loss_26": 2.687982529401779, + "ce_loss_39": 2.429553496837616, + "ce_loss_52": 1.4563929110765457, + "ce_loss_7": 3.781431978940964, + "epoch": 0.252, + "grad_norm": 18.472247546339297, + "kl_loss_13": 3948.8, + "kl_loss_26": 2449.6, + "kl_loss_39": 1930.0, + "kl_loss_7": 4769.6, + "learning_rate": 0.0008596699001693256, + "loss": 6470.9, + "step": 2520 + }, + { + "ce_loss_13": 3.3468190252780916, + "ce_loss_26": 2.6537194311618806, + "ce_loss_39": 2.38609040081501, + "ce_loss_52": 1.4265122324228288, + "ce_loss_7": 3.7305682718753816, + "epoch": 0.253, + "grad_norm": 19.078325524688942, + "kl_loss_13": 3872.0, + "kl_loss_26": 2409.4, + "kl_loss_39": 1875.8, + "kl_loss_7": 4679.2, + "learning_rate": 0.0008585659023794818, + "loss": 6413.9, + "step": 2530 + }, + { + "ce_loss_13": 3.3305428862571715, + "ce_loss_26": 2.6303874611854554, + "ce_loss_39": 2.3649342864751817, + "ce_loss_52": 1.4202288419008255, + "ce_loss_7": 3.712818431854248, + "epoch": 0.254, + "grad_norm": 17.612892401319467, + "kl_loss_13": 3873.6, + "kl_loss_26": 2396.4, + "kl_loss_39": 1864.0, + "kl_loss_7": 4688.8, + "learning_rate": 0.0008574582938349817, + "loss": 6377.5, + "step": 2540 + }, + { + "ce_loss_13": 3.3927013695240023, + "ce_loss_26": 2.7204831540584564, + "ce_loss_39": 2.464686703681946, + "ce_loss_52": 1.487898689508438, + "ce_loss_7": 3.7826764941215516, + "epoch": 0.255, + "grad_norm": 19.269838722570377, + "kl_loss_13": 3875.2, + "kl_loss_26": 2432.8, + "kl_loss_39": 1907.8, + "kl_loss_7": 4701.6, + "learning_rate": 0.0008563470856894315, + "loss": 6365.3, + "step": 2550 + }, + { + "ce_loss_13": 3.366223245859146, + "ce_loss_26": 2.6793350696563722, + "ce_loss_39": 2.4282671988010405, + "ce_loss_52": 1.475812730193138, + "ce_loss_7": 3.7593341052532194, + "epoch": 0.256, + "grad_norm": 17.67137481592248, + "kl_loss_13": 3846.8, + "kl_loss_26": 2394.4, + "kl_loss_39": 1881.4, + "kl_loss_7": 4661.6, + "learning_rate": 0.0008552322891326845, + "loss": 6381.8, + "step": 2560 + }, + { + "ce_loss_13": 3.348971825838089, + "ce_loss_26": 2.6535234093666076, + "ce_loss_39": 2.396692654490471, + "ce_loss_52": 1.4521390795707703, + "ce_loss_7": 3.737420654296875, + "epoch": 0.257, + "grad_norm": 17.922791060277248, + "kl_loss_13": 3853.2, + "kl_loss_26": 2382.8, + "kl_loss_39": 1860.0, + "kl_loss_7": 4671.6, + "learning_rate": 0.0008541139153907296, + "loss": 6320.6, + "step": 2570 + }, + { + "ce_loss_13": 3.3175282776355743, + "ce_loss_26": 2.62116519510746, + "ce_loss_39": 2.373508110642433, + "ce_loss_52": 1.4496644467115403, + "ce_loss_7": 3.7005242466926576, + "epoch": 0.258, + "grad_norm": 17.919048466696356, + "kl_loss_13": 3806.0, + "kl_loss_26": 2325.8, + "kl_loss_39": 1815.6, + "kl_loss_7": 4616.8, + "learning_rate": 0.0008529919757255782, + "loss": 6324.1, + "step": 2580 + }, + { + "ce_loss_13": 3.3524767458438873, + "ce_loss_26": 2.672022157907486, + "ce_loss_39": 2.42145474255085, + "ce_loss_52": 1.490525448322296, + "ce_loss_7": 3.731249511241913, + "epoch": 0.259, + "grad_norm": 17.94332341656099, + "kl_loss_13": 3786.8, + "kl_loss_26": 2328.6, + "kl_loss_39": 1819.0, + "kl_loss_7": 4580.4, + "learning_rate": 0.0008518664814351503, + "loss": 6266.2, + "step": 2590 + }, + { + "ce_loss_13": 3.2249048352241516, + "ce_loss_26": 2.526710030436516, + "ce_loss_39": 2.288080096244812, + "ce_loss_52": 1.3951421514153481, + "ce_loss_7": 3.6117550313472746, + "epoch": 0.26, + "grad_norm": 17.453584143766424, + "kl_loss_13": 3747.6, + "kl_loss_26": 2273.8, + "kl_loss_39": 1787.0, + "kl_loss_7": 4560.8, + "learning_rate": 0.0008507374438531607, + "loss": 6263.6, + "step": 2600 + }, + { + "ce_loss_13": 3.385346031188965, + "ce_loss_26": 2.689622291922569, + "ce_loss_39": 2.437500995397568, + "ce_loss_52": 1.4798223778605462, + "ce_loss_7": 3.780226916074753, + "epoch": 0.261, + "grad_norm": 17.77067876803282, + "kl_loss_13": 3881.6, + "kl_loss_26": 2399.4, + "kl_loss_39": 1889.6, + "kl_loss_7": 4706.4, + "learning_rate": 0.0008496048743490053, + "loss": 6251.8, + "step": 2610 + }, + { + "ce_loss_13": 3.253863149881363, + "ce_loss_26": 2.5867208421230314, + "ce_loss_39": 2.3403410583734514, + "ce_loss_52": 1.4355336636304856, + "ce_loss_7": 3.628329038619995, + "epoch": 0.262, + "grad_norm": 18.13099048831459, + "kl_loss_13": 3714.0, + "kl_loss_26": 2278.6, + "kl_loss_39": 1781.2, + "kl_loss_7": 4501.2, + "learning_rate": 0.0008484687843276469, + "loss": 6230.8, + "step": 2620 + }, + { + "ce_loss_13": 3.308898413181305, + "ce_loss_26": 2.6307075321674347, + "ce_loss_39": 2.381870651245117, + "ce_loss_52": 1.4658461689949036, + "ce_loss_7": 3.686649763584137, + "epoch": 0.263, + "grad_norm": 17.246075540125826, + "kl_loss_13": 3767.2, + "kl_loss_26": 2322.6, + "kl_loss_39": 1819.4, + "kl_loss_7": 4560.0, + "learning_rate": 0.0008473291852294987, + "loss": 6261.4, + "step": 2630 + }, + { + "ce_loss_13": 3.301677256822586, + "ce_loss_26": 2.623979777097702, + "ce_loss_39": 2.3711125582456587, + "ce_loss_52": 1.4476893723011017, + "ce_loss_7": 3.686248630285263, + "epoch": 0.264, + "grad_norm": 18.725565567002363, + "kl_loss_13": 3772.0, + "kl_loss_26": 2323.8, + "kl_loss_39": 1813.4, + "kl_loss_7": 4567.6, + "learning_rate": 0.0008461860885303114, + "loss": 6186.3, + "step": 2640 + }, + { + "ce_loss_13": 3.3081180572509767, + "ce_loss_26": 2.5993283450603486, + "ce_loss_39": 2.346868970990181, + "ce_loss_52": 1.4209152534604073, + "ce_loss_7": 3.700265485048294, + "epoch": 0.265, + "grad_norm": 16.39724372868559, + "kl_loss_13": 3844.8, + "kl_loss_26": 2352.2, + "kl_loss_39": 1835.0, + "kl_loss_7": 4663.2, + "learning_rate": 0.000845039505741056, + "loss": 6224.4, + "step": 2650 + }, + { + "ce_loss_13": 3.2767783522605898, + "ce_loss_26": 2.610367941856384, + "ce_loss_39": 2.3734692215919493, + "ce_loss_52": 1.4838453635573388, + "ce_loss_7": 3.6544183135032653, + "epoch": 0.266, + "grad_norm": 18.646431501601995, + "kl_loss_13": 3666.4, + "kl_loss_26": 2232.8, + "kl_loss_39": 1743.2, + "kl_loss_7": 4455.2, + "learning_rate": 0.0008438894484078086, + "loss": 6164.1, + "step": 2660 + }, + { + "ce_loss_13": 3.187056082487106, + "ce_loss_26": 2.5067271828651427, + "ce_loss_39": 2.265036514401436, + "ce_loss_52": 1.3943608120083808, + "ce_loss_7": 3.5653835415840147, + "epoch": 0.267, + "grad_norm": 17.226899122841342, + "kl_loss_13": 3695.2, + "kl_loss_26": 2239.8, + "kl_loss_39": 1740.2, + "kl_loss_7": 4491.6, + "learning_rate": 0.0008427359281116334, + "loss": 6116.4, + "step": 2670 + }, + { + "ce_loss_13": 3.2635743618011475, + "ce_loss_26": 2.5878145933151244, + "ce_loss_39": 2.344261533021927, + "ce_loss_52": 1.436160460114479, + "ce_loss_7": 3.642155331373215, + "epoch": 0.268, + "grad_norm": 17.358976250170773, + "kl_loss_13": 3715.6, + "kl_loss_26": 2286.6, + "kl_loss_39": 1782.2, + "kl_loss_7": 4510.8, + "learning_rate": 0.0008415789564684673, + "loss": 6120.1, + "step": 2680 + }, + { + "ce_loss_13": 3.217154061794281, + "ce_loss_26": 2.55003065764904, + "ce_loss_39": 2.3082681566476824, + "ce_loss_52": 1.4278038635849952, + "ce_loss_7": 3.602716547250748, + "epoch": 0.269, + "grad_norm": 17.314540535001434, + "kl_loss_13": 3655.2, + "kl_loss_26": 2225.8, + "kl_loss_39": 1731.0, + "kl_loss_7": 4458.8, + "learning_rate": 0.0008404185451290017, + "loss": 6184.5, + "step": 2690 + }, + { + "ce_loss_13": 3.24159716963768, + "ce_loss_26": 2.54996337890625, + "ce_loss_39": 2.3031594485044478, + "ce_loss_52": 1.4209882378578187, + "ce_loss_7": 3.6337377846241, + "epoch": 0.27, + "grad_norm": 17.492742712849175, + "kl_loss_13": 3726.4, + "kl_loss_26": 2257.8, + "kl_loss_39": 1756.2, + "kl_loss_7": 4549.6, + "learning_rate": 0.0008392547057785661, + "loss": 6062.5, + "step": 2700 + }, + { + "ce_loss_13": 3.227788990736008, + "ce_loss_26": 2.5358716517686846, + "ce_loss_39": 2.285151606798172, + "ce_loss_52": 1.4203684866428374, + "ce_loss_7": 3.6037886083126067, + "epoch": 0.271, + "grad_norm": 17.53543108395465, + "kl_loss_13": 3669.6, + "kl_loss_26": 2215.6, + "kl_loss_39": 1712.0, + "kl_loss_7": 4470.8, + "learning_rate": 0.0008380874501370098, + "loss": 6120.8, + "step": 2710 + }, + { + "ce_loss_13": 3.1636491239070894, + "ce_loss_26": 2.5045939922332763, + "ce_loss_39": 2.2759897857904434, + "ce_loss_52": 1.4353285342454911, + "ce_loss_7": 3.5319547176361086, + "epoch": 0.272, + "grad_norm": 18.586839239007904, + "kl_loss_13": 3539.2, + "kl_loss_26": 2133.8, + "kl_loss_39": 1666.0, + "kl_loss_7": 4328.8, + "learning_rate": 0.0008369167899585841, + "loss": 6051.0, + "step": 2720 + }, + { + "ce_loss_13": 3.221328115463257, + "ce_loss_26": 2.535122260451317, + "ce_loss_39": 2.2940177261829375, + "ce_loss_52": 1.419254219532013, + "ce_loss_7": 3.6125965118408203, + "epoch": 0.273, + "grad_norm": 17.658632608609814, + "kl_loss_13": 3688.0, + "kl_loss_26": 2233.2, + "kl_loss_39": 1733.0, + "kl_loss_7": 4510.0, + "learning_rate": 0.0008357427370318238, + "loss": 6045.8, + "step": 2730 + }, + { + "ce_loss_13": 3.2239345014095306, + "ce_loss_26": 2.556156021356583, + "ce_loss_39": 2.3110478937625887, + "ce_loss_52": 1.4514835059642792, + "ce_loss_7": 3.6023066878318786, + "epoch": 0.274, + "grad_norm": 18.16156755157877, + "kl_loss_13": 3623.2, + "kl_loss_26": 2184.8, + "kl_loss_39": 1687.0, + "kl_loss_7": 4421.6, + "learning_rate": 0.0008345653031794292, + "loss": 6081.8, + "step": 2740 + }, + { + "ce_loss_13": 3.248935067653656, + "ce_loss_26": 2.571800184249878, + "ce_loss_39": 2.3257440716028213, + "ce_loss_52": 1.4555991351604463, + "ce_loss_7": 3.6242997109889985, + "epoch": 0.275, + "grad_norm": 17.486859359677037, + "kl_loss_13": 3626.4, + "kl_loss_26": 2208.0, + "kl_loss_39": 1712.0, + "kl_loss_7": 4417.6, + "learning_rate": 0.0008333845002581458, + "loss": 5996.5, + "step": 2750 + }, + { + "ce_loss_13": 3.2696946620941163, + "ce_loss_26": 2.5927825689315798, + "ce_loss_39": 2.3452927708625793, + "ce_loss_52": 1.4668353974819184, + "ce_loss_7": 3.635032969713211, + "epoch": 0.276, + "grad_norm": 17.043532441273413, + "kl_loss_13": 3660.4, + "kl_loss_26": 2235.2, + "kl_loss_39": 1729.2, + "kl_loss_7": 4435.2, + "learning_rate": 0.0008322003401586462, + "loss": 5989.9, + "step": 2760 + }, + { + "ce_loss_13": 3.187331736087799, + "ce_loss_26": 2.537285569310188, + "ce_loss_39": 2.296916127204895, + "ce_loss_52": 1.4428926169872285, + "ce_loss_7": 3.55471009016037, + "epoch": 0.277, + "grad_norm": 18.300410919099367, + "kl_loss_13": 3558.4, + "kl_loss_26": 2165.8, + "kl_loss_39": 1682.4, + "kl_loss_7": 4338.0, + "learning_rate": 0.0008310128348054094, + "loss": 5970.6, + "step": 2770 + }, + { + "ce_loss_13": 3.2006444096565247, + "ce_loss_26": 2.5205237597227095, + "ce_loss_39": 2.282972750067711, + "ce_loss_52": 1.4260737136006356, + "ce_loss_7": 3.5821855068206787, + "epoch": 0.278, + "grad_norm": 17.549429821325518, + "kl_loss_13": 3649.2, + "kl_loss_26": 2206.8, + "kl_loss_39": 1708.8, + "kl_loss_7": 4441.2, + "learning_rate": 0.0008298219961566008, + "loss": 5976.9, + "step": 2780 + }, + { + "ce_loss_13": 3.1831609129905702, + "ce_loss_26": 2.4997926205396652, + "ce_loss_39": 2.2601052969694138, + "ce_loss_52": 1.403978604078293, + "ce_loss_7": 3.553097301721573, + "epoch": 0.279, + "grad_norm": 17.798788155683486, + "kl_loss_13": 3626.8, + "kl_loss_26": 2180.8, + "kl_loss_39": 1685.6, + "kl_loss_7": 4416.0, + "learning_rate": 0.0008286278362039527, + "loss": 5901.3, + "step": 2790 + }, + { + "ce_loss_13": 3.2061972856521606, + "ce_loss_26": 2.5348829567432403, + "ce_loss_39": 2.29689359664917, + "ce_loss_52": 1.4524233996868134, + "ce_loss_7": 3.576086735725403, + "epoch": 0.28, + "grad_norm": 17.920526019853103, + "kl_loss_13": 3566.0, + "kl_loss_26": 2145.8, + "kl_loss_39": 1659.0, + "kl_loss_7": 4341.6, + "learning_rate": 0.0008274303669726426, + "loss": 5875.7, + "step": 2800 + }, + { + "ce_loss_13": 3.2895997047424315, + "ce_loss_26": 2.6019150614738464, + "ce_loss_39": 2.3554980546236037, + "ce_loss_52": 1.4626183837652207, + "ce_loss_7": 3.680211156606674, + "epoch": 0.281, + "grad_norm": 18.339396286679403, + "kl_loss_13": 3746.0, + "kl_loss_26": 2276.4, + "kl_loss_39": 1760.4, + "kl_loss_7": 4566.0, + "learning_rate": 0.0008262296005211721, + "loss": 5938.8, + "step": 2810 + }, + { + "ce_loss_13": 3.1840124845504763, + "ce_loss_26": 2.5105081349611282, + "ce_loss_39": 2.2711715549230576, + "ce_loss_52": 1.4384785890579224, + "ce_loss_7": 3.5735177397727966, + "epoch": 0.282, + "grad_norm": 17.54448826978921, + "kl_loss_13": 3568.4, + "kl_loss_26": 2126.0, + "kl_loss_39": 1635.8, + "kl_loss_7": 4378.4, + "learning_rate": 0.0008250255489412463, + "loss": 5922.5, + "step": 2820 + }, + { + "ce_loss_13": 3.2147216141223907, + "ce_loss_26": 2.5306088238954545, + "ce_loss_39": 2.2879262059926986, + "ce_loss_52": 1.4329912930727005, + "ce_loss_7": 3.6062001168727873, + "epoch": 0.283, + "grad_norm": 16.224521909183913, + "kl_loss_13": 3634.4, + "kl_loss_26": 2182.2, + "kl_loss_39": 1686.4, + "kl_loss_7": 4456.0, + "learning_rate": 0.0008238182243576511, + "loss": 5870.9, + "step": 2830 + }, + { + "ce_loss_13": 3.204808014631271, + "ce_loss_26": 2.529933416843414, + "ce_loss_39": 2.2772836655378343, + "ce_loss_52": 1.4382712185382842, + "ce_loss_7": 3.593477213382721, + "epoch": 0.284, + "grad_norm": 17.85408893335792, + "kl_loss_13": 3598.4, + "kl_loss_26": 2175.4, + "kl_loss_39": 1665.8, + "kl_loss_7": 4413.6, + "learning_rate": 0.0008226076389281315, + "loss": 5857.1, + "step": 2840 + }, + { + "ce_loss_13": 3.137856882810593, + "ce_loss_26": 2.4886497616767884, + "ce_loss_39": 2.2476108491420748, + "ce_loss_52": 1.4388306617736817, + "ce_loss_7": 3.5120018839836122, + "epoch": 0.285, + "grad_norm": 17.527104476469322, + "kl_loss_13": 3468.4, + "kl_loss_26": 2085.8, + "kl_loss_39": 1594.4, + "kl_loss_7": 4250.0, + "learning_rate": 0.0008213938048432696, + "loss": 5806.8, + "step": 2850 + }, + { + "ce_loss_13": 3.134007251262665, + "ce_loss_26": 2.4605695813894273, + "ce_loss_39": 2.2274533331394197, + "ce_loss_52": 1.4070687741041183, + "ce_loss_7": 3.51087743639946, + "epoch": 0.286, + "grad_norm": 16.81428229570175, + "kl_loss_13": 3532.8, + "kl_loss_26": 2101.0, + "kl_loss_39": 1616.2, + "kl_loss_7": 4327.2, + "learning_rate": 0.0008201767343263612, + "loss": 5809.7, + "step": 2860 + }, + { + "ce_loss_13": 3.1623673915863035, + "ce_loss_26": 2.497657111287117, + "ce_loss_39": 2.2517473757267, + "ce_loss_52": 1.4269344687461853, + "ce_loss_7": 3.543222689628601, + "epoch": 0.287, + "grad_norm": 16.70636713163243, + "kl_loss_13": 3541.2, + "kl_loss_26": 2128.4, + "kl_loss_39": 1624.8, + "kl_loss_7": 4340.0, + "learning_rate": 0.0008189564396332927, + "loss": 5789.5, + "step": 2870 + }, + { + "ce_loss_13": 3.1600242078304293, + "ce_loss_26": 2.4901143670082093, + "ce_loss_39": 2.262405735254288, + "ce_loss_52": 1.447944176197052, + "ce_loss_7": 3.5380080163478853, + "epoch": 0.288, + "grad_norm": 17.918386343579222, + "kl_loss_13": 3481.6, + "kl_loss_26": 2059.8, + "kl_loss_39": 1588.8, + "kl_loss_7": 4287.6, + "learning_rate": 0.0008177329330524181, + "loss": 5812.1, + "step": 2880 + }, + { + "ce_loss_13": 3.174392342567444, + "ce_loss_26": 2.5159962266683578, + "ce_loss_39": 2.2794058710336684, + "ce_loss_52": 1.444807243347168, + "ce_loss_7": 3.5484564363956452, + "epoch": 0.289, + "grad_norm": 19.196721110708168, + "kl_loss_13": 3507.2, + "kl_loss_26": 2102.6, + "kl_loss_39": 1624.0, + "kl_loss_7": 4290.8, + "learning_rate": 0.0008165062269044352, + "loss": 5808.4, + "step": 2890 + }, + { + "ce_loss_13": 3.1698272943496706, + "ce_loss_26": 2.506984257698059, + "ce_loss_39": 2.2710763216018677, + "ce_loss_52": 1.4491374969482422, + "ce_loss_7": 3.553741979598999, + "epoch": 0.29, + "grad_norm": 16.91235339946444, + "kl_loss_13": 3511.2, + "kl_loss_26": 2109.2, + "kl_loss_39": 1627.2, + "kl_loss_7": 4317.6, + "learning_rate": 0.0008152763335422613, + "loss": 5792.2, + "step": 2900 + }, + { + "ce_loss_13": 3.115260285139084, + "ce_loss_26": 2.4620468825101853, + "ce_loss_39": 2.2224705785512926, + "ce_loss_52": 1.4216517835855484, + "ce_loss_7": 3.49349564909935, + "epoch": 0.291, + "grad_norm": 19.52390820528971, + "kl_loss_13": 3444.4, + "kl_loss_26": 2058.8, + "kl_loss_39": 1575.2, + "kl_loss_7": 4239.2, + "learning_rate": 0.0008140432653509088, + "loss": 5744.6, + "step": 2910 + }, + { + "ce_loss_13": 3.094448319077492, + "ce_loss_26": 2.4242703199386595, + "ce_loss_39": 2.1858216524124146, + "ce_loss_52": 1.4000759646296501, + "ce_loss_7": 3.4697431921958923, + "epoch": 0.292, + "grad_norm": 16.436813971821554, + "kl_loss_13": 3432.0, + "kl_loss_26": 2027.8, + "kl_loss_39": 1551.4, + "kl_loss_7": 4216.4, + "learning_rate": 0.0008128070347473608, + "loss": 5696.7, + "step": 2920 + }, + { + "ce_loss_13": 3.1132335126399995, + "ce_loss_26": 2.4610098242759704, + "ce_loss_39": 2.2259542405605317, + "ce_loss_52": 1.4294333100318908, + "ce_loss_7": 3.48697971701622, + "epoch": 0.293, + "grad_norm": 16.527383629623685, + "kl_loss_13": 3448.4, + "kl_loss_26": 2043.0, + "kl_loss_39": 1562.4, + "kl_loss_7": 4237.2, + "learning_rate": 0.0008115676541804455, + "loss": 5734.0, + "step": 2930 + }, + { + "ce_loss_13": 3.0772423684597014, + "ce_loss_26": 2.413277891278267, + "ce_loss_39": 2.179169711470604, + "ce_loss_52": 1.3933877736330031, + "ce_loss_7": 3.4470324754714965, + "epoch": 0.294, + "grad_norm": 17.18835839034016, + "kl_loss_13": 3431.6, + "kl_loss_26": 2023.0, + "kl_loss_39": 1542.8, + "kl_loss_7": 4223.6, + "learning_rate": 0.0008103251361307119, + "loss": 5705.55, + "step": 2940 + }, + { + "ce_loss_13": 3.093912643194199, + "ce_loss_26": 2.4372138679027557, + "ce_loss_39": 2.2068597853183745, + "ce_loss_52": 1.4339108556509017, + "ce_loss_7": 3.4713816404342652, + "epoch": 0.295, + "grad_norm": 16.78852919489637, + "kl_loss_13": 3395.2, + "kl_loss_26": 1998.6, + "kl_loss_39": 1533.6, + "kl_loss_7": 4186.4, + "learning_rate": 0.0008090794931103026, + "loss": 5641.3, + "step": 2950 + }, + { + "ce_loss_13": 3.1308993637561797, + "ce_loss_26": 2.4708112478256226, + "ce_loss_39": 2.22915124297142, + "ce_loss_52": 1.4374129235744477, + "ce_loss_7": 3.4991187393665313, + "epoch": 0.296, + "grad_norm": 16.768783545647338, + "kl_loss_13": 3436.4, + "kl_loss_26": 2035.0, + "kl_loss_39": 1548.6, + "kl_loss_7": 4216.8, + "learning_rate": 0.0008078307376628291, + "loss": 5645.6, + "step": 2960 + }, + { + "ce_loss_13": 3.0990765929222106, + "ce_loss_26": 2.452856171131134, + "ce_loss_39": 2.21165874004364, + "ce_loss_52": 1.4263496309518815, + "ce_loss_7": 3.472361743450165, + "epoch": 0.297, + "grad_norm": 16.652386900592916, + "kl_loss_13": 3401.6, + "kl_loss_26": 2024.0, + "kl_loss_39": 1532.8, + "kl_loss_7": 4181.6, + "learning_rate": 0.000806578882363245, + "loss": 5645.6, + "step": 2970 + }, + { + "ce_loss_13": 3.092843067646027, + "ce_loss_26": 2.423402965068817, + "ce_loss_39": 2.1854313611984253, + "ce_loss_52": 1.4033612102270125, + "ce_loss_7": 3.474502944946289, + "epoch": 0.298, + "grad_norm": 17.707727518863294, + "kl_loss_13": 3432.4, + "kl_loss_26": 2016.2, + "kl_loss_39": 1530.6, + "kl_loss_7": 4229.2, + "learning_rate": 0.0008053239398177191, + "loss": 5651.6, + "step": 2980 + }, + { + "ce_loss_13": 3.0975674211978914, + "ce_loss_26": 2.4374621868133546, + "ce_loss_39": 2.209611228108406, + "ce_loss_52": 1.4303795397281647, + "ce_loss_7": 3.475492590665817, + "epoch": 0.299, + "grad_norm": 17.746850084884183, + "kl_loss_13": 3416.8, + "kl_loss_26": 2002.4, + "kl_loss_39": 1530.8, + "kl_loss_7": 4214.8, + "learning_rate": 0.0008040659226635089, + "loss": 5630.2, + "step": 2990 + }, + { + "ce_loss_13": 3.089507430791855, + "ce_loss_26": 2.417942848801613, + "ce_loss_39": 2.1790529817342756, + "ce_loss_52": 1.3988826781511308, + "ce_loss_7": 3.468764144182205, + "epoch": 0.3, + "grad_norm": 17.0619103802906, + "kl_loss_13": 3422.4, + "kl_loss_26": 2019.4, + "kl_loss_39": 1533.2, + "kl_loss_7": 4215.2, + "learning_rate": 0.0008028048435688333, + "loss": 5562.8, + "step": 3000 + }, + { + "ce_loss_13": 3.1326099216938017, + "ce_loss_26": 2.471994733810425, + "ce_loss_39": 2.2380873382091524, + "ce_loss_52": 1.458441223204136, + "ce_loss_7": 3.502814435958862, + "epoch": 0.301, + "grad_norm": 17.222204651935485, + "kl_loss_13": 3436.8, + "kl_loss_26": 2039.6, + "kl_loss_39": 1559.8, + "kl_loss_7": 4214.8, + "learning_rate": 0.0008015407152327448, + "loss": 5664.1, + "step": 3010 + }, + { + "ce_loss_13": 3.1731098294258118, + "ce_loss_26": 2.504976212978363, + "ce_loss_39": 2.260189512372017, + "ce_loss_52": 1.4524748474359512, + "ce_loss_7": 3.555993539094925, + "epoch": 0.302, + "grad_norm": 16.72113952146357, + "kl_loss_13": 3518.4, + "kl_loss_26": 2087.0, + "kl_loss_39": 1585.0, + "kl_loss_7": 4313.2, + "learning_rate": 0.0008002735503850016, + "loss": 5589.3, + "step": 3020 + }, + { + "ce_loss_13": 3.118546891212463, + "ce_loss_26": 2.445932698249817, + "ce_loss_39": 2.2121699869632723, + "ce_loss_52": 1.4456641212105752, + "ce_loss_7": 3.491698741912842, + "epoch": 0.303, + "grad_norm": 16.572401194454876, + "kl_loss_13": 3394.8, + "kl_loss_26": 1984.4, + "kl_loss_39": 1511.2, + "kl_loss_7": 4167.2, + "learning_rate": 0.0007990033617859396, + "loss": 5580.5, + "step": 3030 + }, + { + "ce_loss_13": 3.10419015288353, + "ce_loss_26": 2.447344717383385, + "ce_loss_39": 2.212172231078148, + "ce_loss_52": 1.435505247116089, + "ce_loss_7": 3.477084743976593, + "epoch": 0.304, + "grad_norm": 18.041012229096975, + "kl_loss_13": 3420.8, + "kl_loss_26": 2017.0, + "kl_loss_39": 1525.6, + "kl_loss_7": 4208.8, + "learning_rate": 0.000797730162226344, + "loss": 5556.8, + "step": 3040 + }, + { + "ce_loss_13": 3.0508037239313124, + "ce_loss_26": 2.3910141468048094, + "ce_loss_39": 2.161831411719322, + "ce_loss_52": 1.3925497516989709, + "ce_loss_7": 3.434678375720978, + "epoch": 0.305, + "grad_norm": 18.2507403656971, + "kl_loss_13": 3360.8, + "kl_loss_26": 1961.2, + "kl_loss_39": 1491.3, + "kl_loss_7": 4164.0, + "learning_rate": 0.0007964539645273203, + "loss": 5538.7, + "step": 3050 + }, + { + "ce_loss_13": 3.143118643760681, + "ce_loss_26": 2.503050500154495, + "ce_loss_39": 2.269149711728096, + "ce_loss_52": 1.4877858996391295, + "ce_loss_7": 3.510826712846756, + "epoch": 0.306, + "grad_norm": 17.20244258760552, + "kl_loss_13": 3390.4, + "kl_loss_26": 2023.0, + "kl_loss_39": 1542.0, + "kl_loss_7": 4166.8, + "learning_rate": 0.000795174781540165, + "loss": 5547.7, + "step": 3060 + }, + { + "ce_loss_13": 3.0940939664840696, + "ce_loss_26": 2.4377844393253327, + "ce_loss_39": 2.2110770642757416, + "ce_loss_52": 1.453443130850792, + "ce_loss_7": 3.461427628993988, + "epoch": 0.307, + "grad_norm": 16.19915727951882, + "kl_loss_13": 3353.6, + "kl_loss_26": 1960.2, + "kl_loss_39": 1487.5, + "kl_loss_7": 4131.2, + "learning_rate": 0.0007938926261462366, + "loss": 5534.3, + "step": 3070 + }, + { + "ce_loss_13": 3.099123537540436, + "ce_loss_26": 2.428412067890167, + "ce_loss_39": 2.1952255785465242, + "ce_loss_52": 1.4312659561634065, + "ce_loss_7": 3.477120190858841, + "epoch": 0.308, + "grad_norm": 16.90807792363964, + "kl_loss_13": 3392.0, + "kl_loss_26": 1979.6, + "kl_loss_39": 1498.8, + "kl_loss_7": 4186.8, + "learning_rate": 0.0007926075112568258, + "loss": 5523.9, + "step": 3080 + }, + { + "ce_loss_13": 3.0900339841842652, + "ce_loss_26": 2.429807424545288, + "ce_loss_39": 2.1938013613224028, + "ce_loss_52": 1.4448837220668793, + "ce_loss_7": 3.460611253976822, + "epoch": 0.309, + "grad_norm": 17.278764306039758, + "kl_loss_13": 3363.2, + "kl_loss_26": 1959.6, + "kl_loss_39": 1476.2, + "kl_loss_7": 4153.6, + "learning_rate": 0.0007913194498130252, + "loss": 5481.8, + "step": 3090 + }, + { + "ce_loss_13": 3.0705978155136107, + "ce_loss_26": 2.415473333001137, + "ce_loss_39": 2.1859600633382796, + "ce_loss_52": 1.4387344419956207, + "ce_loss_7": 3.4460779249668123, + "epoch": 0.31, + "grad_norm": 17.744380665090848, + "kl_loss_13": 3328.8, + "kl_loss_26": 1937.2, + "kl_loss_39": 1461.6, + "kl_loss_7": 4109.6, + "learning_rate": 0.0007900284547855992, + "loss": 5494.5, + "step": 3100 + }, + { + "ce_loss_13": 3.1120304703712462, + "ce_loss_26": 2.437400758266449, + "ce_loss_39": 2.1967616409063337, + "ce_loss_52": 1.4447504609823227, + "ce_loss_7": 3.4870758295059203, + "epoch": 0.311, + "grad_norm": 17.196298155702294, + "kl_loss_13": 3414.8, + "kl_loss_26": 1997.2, + "kl_loss_39": 1507.4, + "kl_loss_7": 4197.6, + "learning_rate": 0.0007887345391748532, + "loss": 5492.4, + "step": 3110 + }, + { + "ce_loss_13": 3.0714461147785186, + "ce_loss_26": 2.410064917802811, + "ce_loss_39": 2.176686418056488, + "ce_loss_52": 1.4264434427022934, + "ce_loss_7": 3.440903478860855, + "epoch": 0.312, + "grad_norm": 17.274723589437542, + "kl_loss_13": 3352.8, + "kl_loss_26": 1965.6, + "kl_loss_39": 1480.0, + "kl_loss_7": 4134.0, + "learning_rate": 0.0007874377160105036, + "loss": 5478.5, + "step": 3120 + }, + { + "ce_loss_13": 3.0896170139312744, + "ce_loss_26": 2.440164825320244, + "ce_loss_39": 2.2094400197267534, + "ce_loss_52": 1.4495170325040818, + "ce_loss_7": 3.468176656961441, + "epoch": 0.313, + "grad_norm": 17.297623478505614, + "kl_loss_13": 3375.2, + "kl_loss_26": 1976.4, + "kl_loss_39": 1503.8, + "kl_loss_7": 4173.6, + "learning_rate": 0.0007861379983515449, + "loss": 5461.8, + "step": 3130 + }, + { + "ce_loss_13": 3.074652445316315, + "ce_loss_26": 2.4188932478427887, + "ce_loss_39": 2.1893287271261217, + "ce_loss_52": 1.440689930319786, + "ce_loss_7": 3.452376401424408, + "epoch": 0.314, + "grad_norm": 18.06264725868681, + "kl_loss_13": 3317.2, + "kl_loss_26": 1935.6, + "kl_loss_39": 1465.8, + "kl_loss_7": 4117.6, + "learning_rate": 0.0007848353992861195, + "loss": 5464.6, + "step": 3140 + }, + { + "ce_loss_13": 3.078055852651596, + "ce_loss_26": 2.420557659864426, + "ce_loss_39": 2.188734245300293, + "ce_loss_52": 1.4398296728730202, + "ce_loss_7": 3.457850754261017, + "epoch": 0.315, + "grad_norm": 17.00206014366557, + "kl_loss_13": 3318.4, + "kl_loss_26": 1927.6, + "kl_loss_39": 1458.6, + "kl_loss_7": 4114.0, + "learning_rate": 0.0007835299319313853, + "loss": 5381.3, + "step": 3150 + }, + { + "ce_loss_13": 3.059806948900223, + "ce_loss_26": 2.3743002265691757, + "ce_loss_39": 2.137469917535782, + "ce_loss_52": 1.3903418719768523, + "ce_loss_7": 3.4285161972045897, + "epoch": 0.316, + "grad_norm": 17.46330734643716, + "kl_loss_13": 3354.4, + "kl_loss_26": 1935.6, + "kl_loss_39": 1455.2, + "kl_loss_7": 4136.8, + "learning_rate": 0.0007822216094333848, + "loss": 5407.6, + "step": 3160 + }, + { + "ce_loss_13": 3.0990252554416657, + "ce_loss_26": 2.4341968923807142, + "ce_loss_39": 2.1952569454908373, + "ce_loss_52": 1.439093704521656, + "ce_loss_7": 3.4743688821792604, + "epoch": 0.317, + "grad_norm": 18.51989341882003, + "kl_loss_13": 3380.4, + "kl_loss_26": 1988.0, + "kl_loss_39": 1492.0, + "kl_loss_7": 4166.4, + "learning_rate": 0.0007809104449669101, + "loss": 5410.7, + "step": 3170 + }, + { + "ce_loss_13": 3.041397601366043, + "ce_loss_26": 2.382528102397919, + "ce_loss_39": 2.1511587262153626, + "ce_loss_52": 1.435599946975708, + "ce_loss_7": 3.4155047237873077, + "epoch": 0.318, + "grad_norm": 17.0483845332083, + "kl_loss_13": 3256.8, + "kl_loss_26": 1868.0, + "kl_loss_39": 1392.8, + "kl_loss_7": 4038.0, + "learning_rate": 0.0007795964517353734, + "loss": 5354.9, + "step": 3180 + }, + { + "ce_loss_13": 3.0867488861083983, + "ce_loss_26": 2.425813916325569, + "ce_loss_39": 2.1947717368602753, + "ce_loss_52": 1.4569276213645934, + "ce_loss_7": 3.4604012250900267, + "epoch": 0.319, + "grad_norm": 16.586058341187346, + "kl_loss_13": 3328.8, + "kl_loss_26": 1930.2, + "kl_loss_39": 1449.8, + "kl_loss_7": 4120.8, + "learning_rate": 0.000778279642970672, + "loss": 5344.7, + "step": 3190 + }, + { + "ce_loss_13": 3.0399708569049837, + "ce_loss_26": 2.3785893470048904, + "ce_loss_39": 2.138497656583786, + "ce_loss_52": 1.4135656535625458, + "ce_loss_7": 3.409178429841995, + "epoch": 0.32, + "grad_norm": 17.983547533058992, + "kl_loss_13": 3304.0, + "kl_loss_26": 1914.8, + "kl_loss_39": 1428.2, + "kl_loss_7": 4088.0, + "learning_rate": 0.0007769600319330552, + "loss": 5362.0, + "step": 3200 + }, + { + "ce_loss_13": 3.1184182286262514, + "ce_loss_26": 2.4707882523536684, + "ce_loss_39": 2.233083599805832, + "ce_loss_52": 1.4817634999752045, + "ce_loss_7": 3.4818074285984038, + "epoch": 0.321, + "grad_norm": 16.87308179941231, + "kl_loss_13": 3315.6, + "kl_loss_26": 1955.6, + "kl_loss_39": 1470.6, + "kl_loss_7": 4081.2, + "learning_rate": 0.0007756376319109917, + "loss": 5372.8, + "step": 3210 + }, + { + "ce_loss_13": 3.065703272819519, + "ce_loss_26": 2.4234554558992385, + "ce_loss_39": 2.1948377937078476, + "ce_loss_52": 1.4451974362134934, + "ce_loss_7": 3.4377165257930757, + "epoch": 0.322, + "grad_norm": 17.206055177859767, + "kl_loss_13": 3288.4, + "kl_loss_26": 1929.2, + "kl_loss_39": 1461.2, + "kl_loss_7": 4058.8, + "learning_rate": 0.0007743124562210351, + "loss": 5338.3, + "step": 3220 + }, + { + "ce_loss_13": 3.0654458463191987, + "ce_loss_26": 2.4097089529037476, + "ce_loss_39": 2.1828925907611847, + "ce_loss_52": 1.4646209165453912, + "ce_loss_7": 3.438837933540344, + "epoch": 0.323, + "grad_norm": 16.43379975440051, + "kl_loss_13": 3246.8, + "kl_loss_26": 1868.4, + "kl_loss_39": 1403.6, + "kl_loss_7": 4034.4, + "learning_rate": 0.0007729845182076895, + "loss": 5337.95, + "step": 3230 + }, + { + "ce_loss_13": 3.019025903940201, + "ce_loss_26": 2.3812606751918795, + "ce_loss_39": 2.1557460606098173, + "ce_loss_52": 1.4495598763227462, + "ce_loss_7": 3.390954166650772, + "epoch": 0.324, + "grad_norm": 17.50409394447208, + "kl_loss_13": 3213.2, + "kl_loss_26": 1857.6, + "kl_loss_39": 1393.2, + "kl_loss_7": 3999.2, + "learning_rate": 0.0007716538312432765, + "loss": 5323.8, + "step": 3240 + }, + { + "ce_loss_13": 3.0274185359478, + "ce_loss_26": 2.3673853039741517, + "ce_loss_39": 2.1314821422100065, + "ce_loss_52": 1.4110743701457977, + "ce_loss_7": 3.3962999522686004, + "epoch": 0.325, + "grad_norm": 17.627956174954214, + "kl_loss_13": 3281.2, + "kl_loss_26": 1900.8, + "kl_loss_39": 1419.0, + "kl_loss_7": 4056.8, + "learning_rate": 0.0007703204087277988, + "loss": 5310.9, + "step": 3250 + }, + { + "ce_loss_13": 2.995425891876221, + "ce_loss_26": 2.3437940657138823, + "ce_loss_39": 2.1112417429685593, + "ce_loss_52": 1.3956570625305176, + "ce_loss_7": 3.3635079681873323, + "epoch": 0.326, + "grad_norm": 17.092527088154757, + "kl_loss_13": 3270.8, + "kl_loss_26": 1881.4, + "kl_loss_39": 1402.8, + "kl_loss_7": 4054.4, + "learning_rate": 0.0007689842640888063, + "loss": 5291.9, + "step": 3260 + }, + { + "ce_loss_13": 3.0584332168102266, + "ce_loss_26": 2.4099347323179243, + "ce_loss_39": 2.182457607984543, + "ce_loss_52": 1.4547152355313302, + "ce_loss_7": 3.4290026843547823, + "epoch": 0.327, + "grad_norm": 17.31361789139729, + "kl_loss_13": 3256.0, + "kl_loss_26": 1887.8, + "kl_loss_39": 1426.4, + "kl_loss_7": 4026.4, + "learning_rate": 0.0007676454107812607, + "loss": 5264.3, + "step": 3270 + }, + { + "ce_loss_13": 3.002471148967743, + "ce_loss_26": 2.365675774216652, + "ce_loss_39": 2.1432500898838045, + "ce_loss_52": 1.4313921973109245, + "ce_loss_7": 3.3743964791297913, + "epoch": 0.328, + "grad_norm": 15.793768401685108, + "kl_loss_13": 3248.4, + "kl_loss_26": 1868.8, + "kl_loss_39": 1414.0, + "kl_loss_7": 4030.8, + "learning_rate": 0.0007663038622873999, + "loss": 5285.1, + "step": 3280 + }, + { + "ce_loss_13": 3.0830911457538606, + "ce_loss_26": 2.4269310742616654, + "ce_loss_39": 2.2025650680065154, + "ce_loss_52": 1.4690157890319824, + "ce_loss_7": 3.454869121313095, + "epoch": 0.329, + "grad_norm": 17.14503971571328, + "kl_loss_13": 3293.2, + "kl_loss_26": 1903.8, + "kl_loss_39": 1434.0, + "kl_loss_7": 4075.6, + "learning_rate": 0.0007649596321166025, + "loss": 5253.65, + "step": 3290 + }, + { + "ce_loss_13": 2.9723230481147764, + "ce_loss_26": 2.332389995455742, + "ce_loss_39": 2.1094966679811478, + "ce_loss_52": 1.4339970767498016, + "ce_loss_7": 3.333187943696976, + "epoch": 0.33, + "grad_norm": 16.633040610576913, + "kl_loss_13": 3118.0, + "kl_loss_26": 1779.4, + "kl_loss_39": 1323.0, + "kl_loss_7": 3879.6, + "learning_rate": 0.0007636127338052513, + "loss": 5233.1, + "step": 3300 + }, + { + "ce_loss_13": 2.9914496004581452, + "ce_loss_26": 2.3316532552242277, + "ce_loss_39": 2.0988477796316145, + "ce_loss_52": 1.400558878481388, + "ce_loss_7": 3.363678741455078, + "epoch": 0.331, + "grad_norm": 17.26677566687906, + "kl_loss_13": 3257.2, + "kl_loss_26": 1858.4, + "kl_loss_39": 1375.2, + "kl_loss_7": 4042.8, + "learning_rate": 0.0007622631809165971, + "loss": 5196.15, + "step": 3310 + }, + { + "ce_loss_13": 3.064756464958191, + "ce_loss_26": 2.422107365727425, + "ce_loss_39": 2.192085716128349, + "ce_loss_52": 1.4793070062994957, + "ce_loss_7": 3.430086314678192, + "epoch": 0.332, + "grad_norm": 17.330178950251707, + "kl_loss_13": 3243.6, + "kl_loss_26": 1878.8, + "kl_loss_39": 1409.8, + "kl_loss_7": 4016.4, + "learning_rate": 0.000760910987040623, + "loss": 5231.55, + "step": 3320 + }, + { + "ce_loss_13": 2.9637326538562774, + "ce_loss_26": 2.3162154614925385, + "ce_loss_39": 2.097182759642601, + "ce_loss_52": 1.4222271725535394, + "ce_loss_7": 3.332917684316635, + "epoch": 0.333, + "grad_norm": 17.085141982864975, + "kl_loss_13": 3141.2, + "kl_loss_26": 1768.8, + "kl_loss_39": 1318.3, + "kl_loss_7": 3920.0, + "learning_rate": 0.000759556165793906, + "loss": 5154.65, + "step": 3330 + }, + { + "ce_loss_13": 3.029485374689102, + "ce_loss_26": 2.3887200921773912, + "ce_loss_39": 2.1607041716575623, + "ce_loss_52": 1.4696623742580415, + "ce_loss_7": 3.3922773957252503, + "epoch": 0.334, + "grad_norm": 15.502678294546826, + "kl_loss_13": 3185.2, + "kl_loss_26": 1826.8, + "kl_loss_39": 1360.6, + "kl_loss_7": 3948.8, + "learning_rate": 0.000758198730819481, + "loss": 5180.15, + "step": 3340 + }, + { + "ce_loss_13": 3.03846270442009, + "ce_loss_26": 2.378660023212433, + "ce_loss_39": 2.1477699905633925, + "ce_loss_52": 1.4347332805395125, + "ce_loss_7": 3.4101031959056853, + "epoch": 0.335, + "grad_norm": 16.024720398541927, + "kl_loss_13": 3270.4, + "kl_loss_26": 1883.0, + "kl_loss_39": 1406.3, + "kl_loss_7": 4056.4, + "learning_rate": 0.0007568386957867032, + "loss": 5194.3, + "step": 3350 + }, + { + "ce_loss_13": 3.008663833141327, + "ce_loss_26": 2.3700191140174867, + "ce_loss_39": 2.1397975504398348, + "ce_loss_52": 1.4545943021774292, + "ce_loss_7": 3.3673401892185213, + "epoch": 0.336, + "grad_norm": 16.053283356705972, + "kl_loss_13": 3181.6, + "kl_loss_26": 1820.2, + "kl_loss_39": 1348.6, + "kl_loss_7": 3942.0, + "learning_rate": 0.0007554760743911103, + "loss": 5153.55, + "step": 3360 + }, + { + "ce_loss_13": 2.9829940140247344, + "ce_loss_26": 2.3409414261579515, + "ce_loss_39": 2.1123215198516845, + "ce_loss_52": 1.435232725739479, + "ce_loss_7": 3.355481207370758, + "epoch": 0.337, + "grad_norm": 16.67714591780792, + "kl_loss_13": 3152.4, + "kl_loss_26": 1792.8, + "kl_loss_39": 1321.8, + "kl_loss_7": 3938.0, + "learning_rate": 0.0007541108803542846, + "loss": 5142.8, + "step": 3370 + }, + { + "ce_loss_13": 3.027516704797745, + "ce_loss_26": 2.386495107412338, + "ce_loss_39": 2.1608838021755217, + "ce_loss_52": 1.4647808492183685, + "ce_loss_7": 3.3936978697776796, + "epoch": 0.338, + "grad_norm": 16.879054589986723, + "kl_loss_13": 3186.4, + "kl_loss_26": 1828.6, + "kl_loss_39": 1351.9, + "kl_loss_7": 3957.6, + "learning_rate": 0.0007527431274237149, + "loss": 5169.0, + "step": 3380 + }, + { + "ce_loss_13": 2.9946301877498627, + "ce_loss_26": 2.3546741545200347, + "ce_loss_39": 2.1238624840974807, + "ce_loss_52": 1.4416128873825074, + "ce_loss_7": 3.3628919243812563, + "epoch": 0.339, + "grad_norm": 18.526031342338438, + "kl_loss_13": 3157.6, + "kl_loss_26": 1810.6, + "kl_loss_39": 1339.1, + "kl_loss_7": 3932.0, + "learning_rate": 0.0007513728293726579, + "loss": 5107.45, + "step": 3390 + }, + { + "ce_loss_13": 2.975279802083969, + "ce_loss_26": 2.325047069787979, + "ce_loss_39": 2.1066873967647552, + "ce_loss_52": 1.4382890224456788, + "ce_loss_7": 3.3415417432785035, + "epoch": 0.34, + "grad_norm": 17.516441880993753, + "kl_loss_13": 3144.4, + "kl_loss_26": 1778.8, + "kl_loss_39": 1326.8, + "kl_loss_7": 3920.0, + "learning_rate": 0.00075, + "loss": 5107.0, + "step": 3400 + }, + { + "ce_loss_13": 2.9631440460681917, + "ce_loss_26": 2.318173348903656, + "ce_loss_39": 2.0847090512514113, + "ce_loss_52": 1.4200364857912064, + "ce_loss_7": 3.3265232741832733, + "epoch": 0.341, + "grad_norm": 16.134403370320147, + "kl_loss_13": 3139.2, + "kl_loss_26": 1766.8, + "kl_loss_39": 1296.8, + "kl_loss_7": 3908.8, + "learning_rate": 0.0007486246531301177, + "loss": 5097.65, + "step": 3410 + }, + { + "ce_loss_13": 2.99331476688385, + "ce_loss_26": 2.3580960750579836, + "ce_loss_39": 2.1382109016180038, + "ce_loss_52": 1.4535871922969819, + "ce_loss_7": 3.35606609582901, + "epoch": 0.342, + "grad_norm": 16.427769884918277, + "kl_loss_13": 3148.4, + "kl_loss_26": 1796.6, + "kl_loss_39": 1349.0, + "kl_loss_7": 3921.2, + "learning_rate": 0.0007472468026127384, + "loss": 5139.75, + "step": 3420 + }, + { + "ce_loss_13": 2.921882951259613, + "ce_loss_26": 2.2855687588453293, + "ce_loss_39": 2.067648893594742, + "ce_loss_52": 1.4116598561406135, + "ce_loss_7": 3.2927843034267426, + "epoch": 0.343, + "grad_norm": 16.808808535347794, + "kl_loss_13": 3095.8, + "kl_loss_26": 1756.2, + "kl_loss_39": 1301.9, + "kl_loss_7": 3878.0, + "learning_rate": 0.000745866462322802, + "loss": 5051.15, + "step": 3430 + }, + { + "ce_loss_13": 3.04699621796608, + "ce_loss_26": 2.399735540151596, + "ce_loss_39": 2.176995486021042, + "ce_loss_52": 1.496598380804062, + "ce_loss_7": 3.418045401573181, + "epoch": 0.344, + "grad_norm": 16.406856004421037, + "kl_loss_13": 3162.0, + "kl_loss_26": 1792.4, + "kl_loss_39": 1329.5, + "kl_loss_7": 3935.6, + "learning_rate": 0.0007444836461603195, + "loss": 5107.85, + "step": 3440 + }, + { + "ce_loss_13": 2.930357199907303, + "ce_loss_26": 2.3015194088220596, + "ce_loss_39": 2.078587147593498, + "ce_loss_52": 1.414345271885395, + "ce_loss_7": 3.2914562046527864, + "epoch": 0.345, + "grad_norm": 16.994168943169583, + "kl_loss_13": 3103.6, + "kl_loss_26": 1763.0, + "kl_loss_39": 1304.8, + "kl_loss_7": 3859.2, + "learning_rate": 0.0007430983680502344, + "loss": 5063.6, + "step": 3450 + }, + { + "ce_loss_13": 2.946500468254089, + "ce_loss_26": 2.3075433492660524, + "ce_loss_39": 2.08428935110569, + "ce_loss_52": 1.4254867061972618, + "ce_loss_7": 3.309797298908234, + "epoch": 0.346, + "grad_norm": 16.30494022816115, + "kl_loss_13": 3098.0, + "kl_loss_26": 1752.4, + "kl_loss_39": 1293.6, + "kl_loss_7": 3865.2, + "learning_rate": 0.0007417106419422819, + "loss": 5025.2, + "step": 3460 + }, + { + "ce_loss_13": 2.935671639442444, + "ce_loss_26": 2.2941204428672792, + "ce_loss_39": 2.071269851922989, + "ce_loss_52": 1.4085116267204285, + "ce_loss_7": 3.3008416891098022, + "epoch": 0.347, + "grad_norm": 17.30594652330039, + "kl_loss_13": 3130.4, + "kl_loss_26": 1779.4, + "kl_loss_39": 1316.9, + "kl_loss_7": 3898.4, + "learning_rate": 0.0007403204818108486, + "loss": 5043.95, + "step": 3470 + }, + { + "ce_loss_13": 2.934108853340149, + "ce_loss_26": 2.292804607748985, + "ce_loss_39": 2.0716417878866196, + "ce_loss_52": 1.4156519144773483, + "ce_loss_7": 3.2983541190624237, + "epoch": 0.348, + "grad_norm": 16.793884082475312, + "kl_loss_13": 3072.0, + "kl_loss_26": 1727.6, + "kl_loss_39": 1271.6, + "kl_loss_7": 3841.6, + "learning_rate": 0.0007389279016548316, + "loss": 5016.3, + "step": 3480 + }, + { + "ce_loss_13": 2.8679963111877442, + "ce_loss_26": 2.2394334375858307, + "ce_loss_39": 2.0239282071590425, + "ce_loss_52": 1.3932767808437347, + "ce_loss_7": 3.234779417514801, + "epoch": 0.349, + "grad_norm": 15.946114217155069, + "kl_loss_13": 3023.6, + "kl_loss_26": 1691.8, + "kl_loss_39": 1246.9, + "kl_loss_7": 3788.8, + "learning_rate": 0.0007375329154974975, + "loss": 5018.15, + "step": 3490 + }, + { + "ce_loss_13": 2.9285870611667635, + "ce_loss_26": 2.287761977314949, + "ce_loss_39": 2.063358634710312, + "ce_loss_52": 1.4101893305778503, + "ce_loss_7": 3.298641562461853, + "epoch": 0.35, + "grad_norm": 16.997559975693584, + "kl_loss_13": 3086.0, + "kl_loss_26": 1746.0, + "kl_loss_39": 1280.7, + "kl_loss_7": 3868.8, + "learning_rate": 0.0007361355373863414, + "loss": 5018.5, + "step": 3500 + }, + { + "ce_loss_13": 2.931669169664383, + "ce_loss_26": 2.288780450820923, + "ce_loss_39": 2.0723241955041884, + "ce_loss_52": 1.4271863222122192, + "ce_loss_7": 3.2927916407585145, + "epoch": 0.351, + "grad_norm": 15.980672586392506, + "kl_loss_13": 3077.6, + "kl_loss_26": 1726.6, + "kl_loss_39": 1278.3, + "kl_loss_7": 3843.6, + "learning_rate": 0.0007347357813929454, + "loss": 4989.7, + "step": 3510 + }, + { + "ce_loss_13": 2.928689205646515, + "ce_loss_26": 2.2940947294235228, + "ce_loss_39": 2.0670880317687987, + "ce_loss_52": 1.4149780303239823, + "ce_loss_7": 3.293194830417633, + "epoch": 0.352, + "grad_norm": 16.33808700723808, + "kl_loss_13": 3074.8, + "kl_loss_26": 1738.8, + "kl_loss_39": 1273.7, + "kl_loss_7": 3840.0, + "learning_rate": 0.0007333336616128369, + "loss": 4986.35, + "step": 3520 + }, + { + "ce_loss_13": 2.940303909778595, + "ce_loss_26": 2.294164848327637, + "ce_loss_39": 2.0657031387090683, + "ce_loss_52": 1.4308805465698242, + "ce_loss_7": 3.3036282479763033, + "epoch": 0.353, + "grad_norm": 16.29503578207681, + "kl_loss_13": 3067.6, + "kl_loss_26": 1716.2, + "kl_loss_39": 1247.9, + "kl_loss_7": 3828.0, + "learning_rate": 0.0007319291921653463, + "loss": 4998.85, + "step": 3530 + }, + { + "ce_loss_13": 2.916954427957535, + "ce_loss_26": 2.2819162607192993, + "ce_loss_39": 2.0701166808605196, + "ce_loss_52": 1.4166931748390197, + "ce_loss_7": 3.2759189188480375, + "epoch": 0.354, + "grad_norm": 17.822669536905938, + "kl_loss_13": 3073.6, + "kl_loss_26": 1723.6, + "kl_loss_39": 1286.2, + "kl_loss_7": 3837.2, + "learning_rate": 0.0007305223871934656, + "loss": 4995.55, + "step": 3540 + }, + { + "ce_loss_13": 2.975371015071869, + "ce_loss_26": 2.3552831768989564, + "ce_loss_39": 2.1292835503816603, + "ce_loss_52": 1.4775378912687303, + "ce_loss_7": 3.3292273938655854, + "epoch": 0.355, + "grad_norm": 16.58742338810906, + "kl_loss_13": 3054.4, + "kl_loss_26": 1739.2, + "kl_loss_39": 1279.1, + "kl_loss_7": 3808.8, + "learning_rate": 0.0007291132608637052, + "loss": 4945.1, + "step": 3550 + }, + { + "ce_loss_13": 2.972325986623764, + "ce_loss_26": 2.3392420560121536, + "ce_loss_39": 2.1207040429115294, + "ce_loss_52": 1.475459137558937, + "ce_loss_7": 3.3383385837078094, + "epoch": 0.356, + "grad_norm": 16.383650902730043, + "kl_loss_13": 3066.4, + "kl_loss_26": 1712.2, + "kl_loss_39": 1260.8, + "kl_loss_7": 3838.8, + "learning_rate": 0.0007277018273659516, + "loss": 4963.15, + "step": 3560 + }, + { + "ce_loss_13": 3.0111460268497465, + "ce_loss_26": 2.3827701687812803, + "ce_loss_39": 2.1591351449489595, + "ce_loss_52": 1.4967001289129258, + "ce_loss_7": 3.3728690683841704, + "epoch": 0.357, + "grad_norm": 16.824381349061323, + "kl_loss_13": 3101.2, + "kl_loss_26": 1770.6, + "kl_loss_39": 1310.2, + "kl_loss_7": 3856.8, + "learning_rate": 0.0007262881009133242, + "loss": 4952.95, + "step": 3570 + }, + { + "ce_loss_13": 2.9141285896301268, + "ce_loss_26": 2.2805556029081346, + "ce_loss_39": 2.065216201543808, + "ce_loss_52": 1.4257875666022302, + "ce_loss_7": 3.282390242815018, + "epoch": 0.358, + "grad_norm": 18.058728297237614, + "kl_loss_13": 3046.4, + "kl_loss_26": 1695.0, + "kl_loss_39": 1247.4, + "kl_loss_7": 3819.6, + "learning_rate": 0.0007248720957420329, + "loss": 4964.9, + "step": 3580 + }, + { + "ce_loss_13": 2.8828001439571382, + "ce_loss_26": 2.253911817073822, + "ce_loss_39": 2.030216920375824, + "ce_loss_52": 1.40321164727211, + "ce_loss_7": 3.251304441690445, + "epoch": 0.359, + "grad_norm": 16.573678980597606, + "kl_loss_13": 3047.2, + "kl_loss_26": 1698.2, + "kl_loss_39": 1242.3, + "kl_loss_7": 3817.6, + "learning_rate": 0.0007234538261112341, + "loss": 4895.95, + "step": 3590 + }, + { + "ce_loss_13": 2.9461396992206574, + "ce_loss_26": 2.3021911144256593, + "ce_loss_39": 2.0822067111730576, + "ce_loss_52": 1.448357391357422, + "ce_loss_7": 3.3077784180641174, + "epoch": 0.36, + "grad_norm": 17.017357286641733, + "kl_loss_13": 3060.4, + "kl_loss_26": 1708.8, + "kl_loss_39": 1253.4, + "kl_loss_7": 3833.6, + "learning_rate": 0.0007220333063028871, + "loss": 4918.35, + "step": 3600 + }, + { + "ce_loss_13": 2.846253049373627, + "ce_loss_26": 2.224426531791687, + "ce_loss_39": 2.0076546490192415, + "ce_loss_52": 1.395447552204132, + "ce_loss_7": 3.2072394728660583, + "epoch": 0.361, + "grad_norm": 15.74697405086667, + "kl_loss_13": 2978.0, + "kl_loss_26": 1658.4, + "kl_loss_39": 1211.2, + "kl_loss_7": 3738.4, + "learning_rate": 0.0007206105506216106, + "loss": 4871.3, + "step": 3610 + }, + { + "ce_loss_13": 3.0099994122982023, + "ce_loss_26": 2.373449808359146, + "ce_loss_39": 2.1529267936944962, + "ce_loss_52": 1.4900053232908248, + "ce_loss_7": 3.3753599405288695, + "epoch": 0.362, + "grad_norm": 16.970809605735944, + "kl_loss_13": 3087.6, + "kl_loss_26": 1745.2, + "kl_loss_39": 1286.7, + "kl_loss_7": 3866.8, + "learning_rate": 0.0007191855733945387, + "loss": 4947.8, + "step": 3620 + }, + { + "ce_loss_13": 2.937187296152115, + "ce_loss_26": 2.322328266501427, + "ce_loss_39": 2.105859735608101, + "ce_loss_52": 1.474419781565666, + "ce_loss_7": 3.2971576511859895, + "epoch": 0.363, + "grad_norm": 17.141982755892812, + "kl_loss_13": 3009.6, + "kl_loss_26": 1696.8, + "kl_loss_39": 1241.1, + "kl_loss_7": 3762.4, + "learning_rate": 0.0007177583889711762, + "loss": 4882.15, + "step": 3630 + }, + { + "ce_loss_13": 2.902718555927277, + "ce_loss_26": 2.260812908411026, + "ce_loss_39": 2.042543429136276, + "ce_loss_52": 1.4226751655340195, + "ce_loss_7": 3.2698469936847685, + "epoch": 0.364, + "grad_norm": 17.153862070969048, + "kl_loss_13": 3018.8, + "kl_loss_26": 1673.0, + "kl_loss_39": 1219.0, + "kl_loss_7": 3784.4, + "learning_rate": 0.0007163290117232541, + "loss": 4884.0, + "step": 3640 + }, + { + "ce_loss_13": 2.9109850347042086, + "ce_loss_26": 2.297417125105858, + "ce_loss_39": 2.077428176999092, + "ce_loss_52": 1.4550551682710648, + "ce_loss_7": 3.268283462524414, + "epoch": 0.365, + "grad_norm": 16.42744245211514, + "kl_loss_13": 2985.2, + "kl_loss_26": 1679.2, + "kl_loss_39": 1227.9, + "kl_loss_7": 3734.0, + "learning_rate": 0.0007148974560445859, + "loss": 4868.65, + "step": 3650 + }, + { + "ce_loss_13": 2.9199238896369932, + "ce_loss_26": 2.2848848432302473, + "ce_loss_39": 2.060741201043129, + "ce_loss_52": 1.4278603106737138, + "ce_loss_7": 3.2834209561347962, + "epoch": 0.366, + "grad_norm": 16.404556741779928, + "kl_loss_13": 3024.0, + "kl_loss_26": 1686.2, + "kl_loss_39": 1230.9, + "kl_loss_7": 3786.0, + "learning_rate": 0.0007134637363509209, + "loss": 4839.5, + "step": 3660 + }, + { + "ce_loss_13": 2.9712482690811157, + "ce_loss_26": 2.3368860691785813, + "ce_loss_39": 2.104892411828041, + "ce_loss_52": 1.4633448541164398, + "ce_loss_7": 3.332030898332596, + "epoch": 0.367, + "grad_norm": 15.961228476827497, + "kl_loss_13": 3092.4, + "kl_loss_26": 1760.2, + "kl_loss_39": 1277.5, + "kl_loss_7": 3848.0, + "learning_rate": 0.0007120278670798009, + "loss": 4858.55, + "step": 3670 + }, + { + "ce_loss_13": 2.951517391204834, + "ce_loss_26": 2.3281659215688704, + "ce_loss_39": 2.0995417445898057, + "ce_loss_52": 1.4656393617391585, + "ce_loss_7": 3.2964209616184235, + "epoch": 0.368, + "grad_norm": 16.089022609349872, + "kl_loss_13": 3003.6, + "kl_loss_26": 1696.8, + "kl_loss_39": 1232.8, + "kl_loss_7": 3745.6, + "learning_rate": 0.0007105898626904133, + "loss": 4774.9, + "step": 3680 + }, + { + "ce_loss_13": 2.870139628648758, + "ce_loss_26": 2.2511734038591387, + "ce_loss_39": 2.0341389745473863, + "ce_loss_52": 1.4250996381044387, + "ce_loss_7": 3.2268544733524323, + "epoch": 0.369, + "grad_norm": 15.247673028968622, + "kl_loss_13": 2967.2, + "kl_loss_26": 1653.8, + "kl_loss_39": 1205.5, + "kl_loss_7": 3723.2, + "learning_rate": 0.0007091497376634463, + "loss": 4807.45, + "step": 3690 + }, + { + "ce_loss_13": 2.8762976706027983, + "ce_loss_26": 2.256538024544716, + "ce_loss_39": 2.043423393368721, + "ce_loss_52": 1.4497251689434052, + "ce_loss_7": 3.2377980053424835, + "epoch": 0.37, + "grad_norm": 16.15904103093409, + "kl_loss_13": 2914.4, + "kl_loss_26": 1609.7, + "kl_loss_39": 1170.3, + "kl_loss_7": 3672.0, + "learning_rate": 0.0007077075065009433, + "loss": 4822.75, + "step": 3700 + }, + { + "ce_loss_13": 2.865807980298996, + "ce_loss_26": 2.2327334135770798, + "ce_loss_39": 2.012790763378143, + "ce_loss_52": 1.4004584282636643, + "ce_loss_7": 3.233772474527359, + "epoch": 0.371, + "grad_norm": 15.511174434634698, + "kl_loss_13": 2980.0, + "kl_loss_26": 1666.4, + "kl_loss_39": 1214.3, + "kl_loss_7": 3742.4, + "learning_rate": 0.0007062631837261557, + "loss": 4816.1, + "step": 3710 + }, + { + "ce_loss_13": 2.903226691484451, + "ce_loss_26": 2.2818103432655334, + "ce_loss_39": 2.059009611606598, + "ce_loss_52": 1.456637406349182, + "ce_loss_7": 3.263377320766449, + "epoch": 0.372, + "grad_norm": 17.120548608123716, + "kl_loss_13": 2952.8, + "kl_loss_26": 1642.0, + "kl_loss_39": 1187.9, + "kl_loss_7": 3710.8, + "learning_rate": 0.0007048167838833977, + "loss": 4745.55, + "step": 3720 + }, + { + "ce_loss_13": 2.900358548760414, + "ce_loss_26": 2.2638369113206864, + "ce_loss_39": 2.043374678492546, + "ce_loss_52": 1.4358570337295533, + "ce_loss_7": 3.272378832101822, + "epoch": 0.373, + "grad_norm": 15.762139849070088, + "kl_loss_13": 2995.6, + "kl_loss_26": 1646.4, + "kl_loss_39": 1202.7, + "kl_loss_7": 3778.4, + "learning_rate": 0.0007033683215379002, + "loss": 4819.05, + "step": 3730 + }, + { + "ce_loss_13": 2.891742479801178, + "ce_loss_26": 2.2577997177839277, + "ce_loss_39": 2.042544272542, + "ce_loss_52": 1.4357560023665428, + "ce_loss_7": 3.2664382100105285, + "epoch": 0.374, + "grad_norm": 17.991228767593586, + "kl_loss_13": 3005.6, + "kl_loss_26": 1661.4, + "kl_loss_39": 1210.7, + "kl_loss_7": 3790.4, + "learning_rate": 0.0007019178112756625, + "loss": 4801.4, + "step": 3740 + }, + { + "ce_loss_13": 2.937167102098465, + "ce_loss_26": 2.3048900216817856, + "ce_loss_39": 2.077365005016327, + "ce_loss_52": 1.4518427148461341, + "ce_loss_7": 3.2986050605773927, + "epoch": 0.375, + "grad_norm": 17.06397612135392, + "kl_loss_13": 3048.4, + "kl_loss_26": 1714.2, + "kl_loss_39": 1240.0, + "kl_loss_7": 3808.4, + "learning_rate": 0.0007004652677033068, + "loss": 4778.45, + "step": 3750 + }, + { + "ce_loss_13": 2.953932785987854, + "ce_loss_26": 2.3320761770009995, + "ce_loss_39": 2.1045148581266404, + "ce_loss_52": 1.472703790664673, + "ce_loss_7": 3.3274633824825286, + "epoch": 0.376, + "grad_norm": 16.845736377094994, + "kl_loss_13": 3032.0, + "kl_loss_26": 1703.8, + "kl_loss_39": 1244.7, + "kl_loss_7": 3816.0, + "learning_rate": 0.0006990107054479312, + "loss": 4794.6, + "step": 3760 + }, + { + "ce_loss_13": 2.8548416674137114, + "ce_loss_26": 2.240122190117836, + "ce_loss_39": 2.0189033895730972, + "ce_loss_52": 1.4262803480029107, + "ce_loss_7": 3.208429366350174, + "epoch": 0.377, + "grad_norm": 16.84130111884451, + "kl_loss_13": 2924.4, + "kl_loss_26": 1609.6, + "kl_loss_39": 1161.2, + "kl_loss_7": 3672.0, + "learning_rate": 0.000697554139156961, + "loss": 4779.2, + "step": 3770 + }, + { + "ce_loss_13": 2.972896063327789, + "ce_loss_26": 2.335559439659119, + "ce_loss_39": 2.111876127123833, + "ce_loss_52": 1.4984043270349503, + "ce_loss_7": 3.330926328897476, + "epoch": 0.378, + "grad_norm": 17.969038221722915, + "kl_loss_13": 3002.8, + "kl_loss_26": 1674.0, + "kl_loss_39": 1211.2, + "kl_loss_7": 3762.0, + "learning_rate": 0.0006960955834980027, + "loss": 4732.4, + "step": 3780 + }, + { + "ce_loss_13": 2.863754612207413, + "ce_loss_26": 2.228693225979805, + "ce_loss_39": 2.0101536750793456, + "ce_loss_52": 1.4073660969734192, + "ce_loss_7": 3.2303711056709288, + "epoch": 0.379, + "grad_norm": 15.796823584167846, + "kl_loss_13": 2960.8, + "kl_loss_26": 1639.0, + "kl_loss_39": 1188.6, + "kl_loss_7": 3734.4, + "learning_rate": 0.0006946350531586958, + "loss": 4740.55, + "step": 3790 + }, + { + "ce_loss_13": 2.819410902261734, + "ce_loss_26": 2.200511318445206, + "ce_loss_39": 1.9842332571744918, + "ce_loss_52": 1.400177489221096, + "ce_loss_7": 3.1923243761062623, + "epoch": 0.38, + "grad_norm": 17.863959287343352, + "kl_loss_13": 2930.0, + "kl_loss_26": 1613.6, + "kl_loss_39": 1162.1, + "kl_loss_7": 3705.2, + "learning_rate": 0.0006931725628465643, + "loss": 4745.35, + "step": 3800 + }, + { + "ce_loss_13": 2.845439475774765, + "ce_loss_26": 2.2171025544404985, + "ce_loss_39": 1.9986167669296264, + "ce_loss_52": 1.4112813830375672, + "ce_loss_7": 3.2001422882080077, + "epoch": 0.381, + "grad_norm": 15.509448386002845, + "kl_loss_13": 2924.0, + "kl_loss_26": 1603.8, + "kl_loss_39": 1151.4, + "kl_loss_7": 3677.6, + "learning_rate": 0.0006917081272888696, + "loss": 4686.25, + "step": 3810 + }, + { + "ce_loss_13": 2.875427797436714, + "ce_loss_26": 2.2557172268629073, + "ce_loss_39": 2.0311311304569246, + "ce_loss_52": 1.4279655352234841, + "ce_loss_7": 3.230677658319473, + "epoch": 0.382, + "grad_norm": 17.274488302565285, + "kl_loss_13": 2934.0, + "kl_loss_26": 1621.0, + "kl_loss_39": 1159.3, + "kl_loss_7": 3683.6, + "learning_rate": 0.0006902417612324615, + "loss": 4684.7, + "step": 3820 + }, + { + "ce_loss_13": 2.9117272198200226, + "ce_loss_26": 2.261174875497818, + "ce_loss_39": 2.036722195148468, + "ce_loss_52": 1.4152167439460754, + "ce_loss_7": 3.282198351621628, + "epoch": 0.383, + "grad_norm": 17.87083708364157, + "kl_loss_13": 3095.2, + "kl_loss_26": 1720.4, + "kl_loss_39": 1253.4, + "kl_loss_7": 3865.2, + "learning_rate": 0.00068877347944363, + "loss": 4739.15, + "step": 3830 + }, + { + "ce_loss_13": 2.8889047384262083, + "ce_loss_26": 2.2653014570474626, + "ce_loss_39": 2.0420874893665313, + "ce_loss_52": 1.4475852727890015, + "ce_loss_7": 3.253549599647522, + "epoch": 0.384, + "grad_norm": 15.6987701916489, + "kl_loss_13": 2966.0, + "kl_loss_26": 1638.2, + "kl_loss_39": 1187.2, + "kl_loss_7": 3729.2, + "learning_rate": 0.0006873032967079561, + "loss": 4730.9, + "step": 3840 + }, + { + "ce_loss_13": 2.9057071805000305, + "ce_loss_26": 2.2790849953889847, + "ce_loss_39": 2.0592786610126494, + "ce_loss_52": 1.452454286813736, + "ce_loss_7": 3.266382873058319, + "epoch": 0.385, + "grad_norm": 15.755925332297407, + "kl_loss_13": 2962.0, + "kl_loss_26": 1636.4, + "kl_loss_39": 1179.7, + "kl_loss_7": 3722.8, + "learning_rate": 0.0006858312278301637, + "loss": 4713.7, + "step": 3850 + }, + { + "ce_loss_13": 2.8342252016067504, + "ce_loss_26": 2.2319850236177445, + "ce_loss_39": 2.022706937789917, + "ce_loss_52": 1.4418139278888702, + "ce_loss_7": 3.186972415447235, + "epoch": 0.386, + "grad_norm": 17.081089442059948, + "kl_loss_13": 2855.2, + "kl_loss_26": 1568.0, + "kl_loss_39": 1131.2, + "kl_loss_7": 3603.2, + "learning_rate": 0.0006843572876339704, + "loss": 4675.25, + "step": 3860 + }, + { + "ce_loss_13": 2.7886572241783143, + "ce_loss_26": 2.173486915230751, + "ce_loss_39": 1.9662895441055297, + "ce_loss_52": 1.3961340665817261, + "ce_loss_7": 3.1484048068523407, + "epoch": 0.387, + "grad_norm": 18.57744828916969, + "kl_loss_13": 2842.0, + "kl_loss_26": 1551.8, + "kl_loss_39": 1125.9, + "kl_loss_7": 3587.2, + "learning_rate": 0.0006828814909619373, + "loss": 4659.8, + "step": 3870 + }, + { + "ce_loss_13": 2.84233677983284, + "ce_loss_26": 2.2270043969154356, + "ce_loss_39": 2.011353349685669, + "ce_loss_52": 1.44394671022892, + "ce_loss_7": 3.189998263120651, + "epoch": 0.388, + "grad_norm": 17.116859396660736, + "kl_loss_13": 2866.4, + "kl_loss_26": 1581.4, + "kl_loss_39": 1130.5, + "kl_loss_7": 3602.4, + "learning_rate": 0.0006814038526753205, + "loss": 4652.3, + "step": 3880 + }, + { + "ce_loss_13": 2.8899350225925446, + "ce_loss_26": 2.268605652451515, + "ce_loss_39": 2.047902289032936, + "ce_loss_52": 1.462986382842064, + "ce_loss_7": 3.2532753586769103, + "epoch": 0.389, + "grad_norm": 16.277065053757138, + "kl_loss_13": 2901.6, + "kl_loss_26": 1603.8, + "kl_loss_39": 1148.8, + "kl_loss_7": 3655.2, + "learning_rate": 0.0006799243876539213, + "loss": 4644.45, + "step": 3890 + }, + { + "ce_loss_13": 2.852635699510574, + "ce_loss_26": 2.225254198908806, + "ce_loss_39": 2.00534345805645, + "ce_loss_52": 1.420480152964592, + "ce_loss_7": 3.217593324184418, + "epoch": 0.39, + "grad_norm": 17.575618857452827, + "kl_loss_13": 2895.2, + "kl_loss_26": 1582.6, + "kl_loss_39": 1134.8, + "kl_loss_7": 3662.8, + "learning_rate": 0.0006784431107959359, + "loss": 4640.8, + "step": 3900 + }, + { + "ce_loss_13": 2.9095449209213258, + "ce_loss_26": 2.288859358429909, + "ce_loss_39": 2.069254148006439, + "ce_loss_52": 1.4762457937002182, + "ce_loss_7": 3.2724156618118285, + "epoch": 0.391, + "grad_norm": 15.314925266098216, + "kl_loss_13": 2939.6, + "kl_loss_26": 1620.2, + "kl_loss_39": 1162.8, + "kl_loss_7": 3702.8, + "learning_rate": 0.0006769600370178059, + "loss": 4625.75, + "step": 3910 + }, + { + "ce_loss_13": 2.79736613035202, + "ce_loss_26": 2.1872033685445786, + "ce_loss_39": 1.9660126984119415, + "ce_loss_52": 1.3993165016174316, + "ce_loss_7": 3.152447110414505, + "epoch": 0.392, + "grad_norm": 15.234701615575748, + "kl_loss_13": 2856.0, + "kl_loss_26": 1574.6, + "kl_loss_39": 1119.8, + "kl_loss_7": 3607.6, + "learning_rate": 0.0006754751812540679, + "loss": 4587.85, + "step": 3920 + }, + { + "ce_loss_13": 2.8410171031951905, + "ce_loss_26": 2.2249913841485975, + "ce_loss_39": 2.0135372936725617, + "ce_loss_52": 1.4371111243963242, + "ce_loss_7": 3.2084967494010925, + "epoch": 0.393, + "grad_norm": 16.62173105303993, + "kl_loss_13": 2885.6, + "kl_loss_26": 1588.2, + "kl_loss_39": 1146.8, + "kl_loss_7": 3644.4, + "learning_rate": 0.0006739885584572025, + "loss": 4635.2, + "step": 3930 + }, + { + "ce_loss_13": 2.7806951224803926, + "ce_loss_26": 2.1756977647542954, + "ce_loss_39": 1.96949442923069, + "ce_loss_52": 1.415724617242813, + "ce_loss_7": 3.1287400901317595, + "epoch": 0.394, + "grad_norm": 15.878619218635833, + "kl_loss_13": 2836.2, + "kl_loss_26": 1541.8, + "kl_loss_39": 1104.9, + "kl_loss_7": 3581.6, + "learning_rate": 0.0006725001835974853, + "loss": 4637.75, + "step": 3940 + }, + { + "ce_loss_13": 2.85609056353569, + "ce_loss_26": 2.228466436266899, + "ce_loss_39": 2.011217701435089, + "ce_loss_52": 1.4336451053619386, + "ce_loss_7": 3.212037581205368, + "epoch": 0.395, + "grad_norm": 15.588059225669095, + "kl_loss_13": 2892.8, + "kl_loss_26": 1574.8, + "kl_loss_39": 1125.7, + "kl_loss_7": 3657.6, + "learning_rate": 0.0006710100716628344, + "loss": 4584.95, + "step": 3950 + }, + { + "ce_loss_13": 2.820618736743927, + "ce_loss_26": 2.1797895193099976, + "ce_loss_39": 1.9612275928258895, + "ce_loss_52": 1.3932116001844406, + "ce_loss_7": 3.1924599528312685, + "epoch": 0.396, + "grad_norm": 14.878251588185849, + "kl_loss_13": 2911.2, + "kl_loss_26": 1556.2, + "kl_loss_39": 1114.5, + "kl_loss_7": 3694.0, + "learning_rate": 0.0006695182376586602, + "loss": 4607.1, + "step": 3960 + }, + { + "ce_loss_13": 2.7754017412662506, + "ce_loss_26": 2.1572470903396606, + "ce_loss_39": 1.9344938546419144, + "ce_loss_52": 1.3711352616548538, + "ce_loss_7": 3.1346897959709166, + "epoch": 0.397, + "grad_norm": 15.39943522658609, + "kl_loss_13": 2875.2, + "kl_loss_26": 1575.1, + "kl_loss_39": 1124.5, + "kl_loss_7": 3635.6, + "learning_rate": 0.000668024696607715, + "loss": 4546.3, + "step": 3970 + }, + { + "ce_loss_13": 2.7410697996616364, + "ce_loss_26": 2.1528750866651536, + "ce_loss_39": 1.944345197081566, + "ce_loss_52": 1.4029324680566788, + "ce_loss_7": 3.0945769369602205, + "epoch": 0.398, + "grad_norm": 16.69493947597699, + "kl_loss_13": 2742.0, + "kl_loss_26": 1499.6, + "kl_loss_39": 1066.1, + "kl_loss_7": 3478.0, + "learning_rate": 0.0006665294635499404, + "loss": 4509.25, + "step": 3980 + }, + { + "ce_loss_13": 2.7935349524021147, + "ce_loss_26": 2.191756248474121, + "ce_loss_39": 1.9830526530742645, + "ce_loss_52": 1.4325652569532394, + "ce_loss_7": 3.150054842233658, + "epoch": 0.399, + "grad_norm": 15.984763021073704, + "kl_loss_13": 2764.0, + "kl_loss_26": 1503.8, + "kl_loss_39": 1075.4, + "kl_loss_7": 3508.8, + "learning_rate": 0.0006650325535423167, + "loss": 4542.85, + "step": 3990 + }, + { + "ce_loss_13": 2.7841295659542085, + "ce_loss_26": 2.175816202163696, + "ce_loss_39": 1.9610484838485718, + "ce_loss_52": 1.3994766443967819, + "ce_loss_7": 3.1450257122516634, + "epoch": 0.4, + "grad_norm": 16.383690879711693, + "kl_loss_13": 2832.8, + "kl_loss_26": 1534.6, + "kl_loss_39": 1101.1, + "kl_loss_7": 3587.2, + "learning_rate": 0.0006635339816587109, + "loss": 4584.95, + "step": 4000 + }, + { + "ce_loss_13": 2.937473142147064, + "ce_loss_26": 2.298046553134918, + "ce_loss_39": 2.071186339855194, + "ce_loss_52": 1.4680579513311387, + "ce_loss_7": 3.2991883754730225, + "epoch": 0.401, + "grad_norm": 16.69896458470603, + "kl_loss_13": 2974.0, + "kl_loss_26": 1650.8, + "kl_loss_39": 1187.0, + "kl_loss_7": 3734.8, + "learning_rate": 0.0006620337629897252, + "loss": 4574.8, + "step": 4010 + }, + { + "ce_loss_13": 2.803048574924469, + "ce_loss_26": 2.1910858035087584, + "ce_loss_39": 1.977920189499855, + "ce_loss_52": 1.4274337738752365, + "ce_loss_7": 3.1627039849758147, + "epoch": 0.402, + "grad_norm": 15.21058574655926, + "kl_loss_13": 2803.0, + "kl_loss_26": 1508.9, + "kl_loss_39": 1074.1, + "kl_loss_7": 3558.8, + "learning_rate": 0.0006605319126425454, + "loss": 4546.4, + "step": 4020 + }, + { + "ce_loss_13": 2.8307320177555084, + "ce_loss_26": 2.208324944972992, + "ce_loss_39": 1.9950761079788208, + "ce_loss_52": 1.435056920349598, + "ce_loss_7": 3.19031218290329, + "epoch": 0.403, + "grad_norm": 14.837343102998657, + "kl_loss_13": 2876.0, + "kl_loss_26": 1550.3, + "kl_loss_39": 1112.9, + "kl_loss_7": 3638.4, + "learning_rate": 0.0006590284457407876, + "loss": 4535.35, + "step": 4030 + }, + { + "ce_loss_13": 2.8277206301689146, + "ce_loss_26": 2.2229607343673705, + "ce_loss_39": 2.0126491367816923, + "ce_loss_52": 1.465662133693695, + "ce_loss_7": 3.178615337610245, + "epoch": 0.404, + "grad_norm": 15.868817769840305, + "kl_loss_13": 2801.6, + "kl_loss_26": 1514.6, + "kl_loss_39": 1078.5, + "kl_loss_7": 3548.0, + "learning_rate": 0.0006575233774243465, + "loss": 4524.1, + "step": 4040 + }, + { + "ce_loss_13": 2.741392558813095, + "ce_loss_26": 2.1182916700839995, + "ce_loss_39": 1.9061576217412948, + "ce_loss_52": 1.3709532082080842, + "ce_loss_7": 3.1065491139888763, + "epoch": 0.405, + "grad_norm": 16.502947013390255, + "kl_loss_13": 2798.4, + "kl_loss_26": 1495.2, + "kl_loss_39": 1058.1, + "kl_loss_7": 3565.6, + "learning_rate": 0.0006560167228492435, + "loss": 4528.6, + "step": 4050 + }, + { + "ce_loss_13": 2.8996002614498138, + "ce_loss_26": 2.271700030565262, + "ce_loss_39": 2.045673191547394, + "ce_loss_52": 1.4674718797206878, + "ce_loss_7": 3.2622067093849183, + "epoch": 0.406, + "grad_norm": 15.215707475527795, + "kl_loss_13": 2900.0, + "kl_loss_26": 1589.6, + "kl_loss_39": 1131.4, + "kl_loss_7": 3660.8, + "learning_rate": 0.0006545084971874737, + "loss": 4547.15, + "step": 4060 + }, + { + "ce_loss_13": 2.8251163959503174, + "ce_loss_26": 2.1874846637248995, + "ce_loss_39": 1.9672167718410491, + "ce_loss_52": 1.4135777831077576, + "ce_loss_7": 3.1873776078224183, + "epoch": 0.407, + "grad_norm": 15.755939255613459, + "kl_loss_13": 2866.0, + "kl_loss_26": 1547.6, + "kl_loss_39": 1092.9, + "kl_loss_7": 3627.6, + "learning_rate": 0.0006529987156268526, + "loss": 4503.1, + "step": 4070 + }, + { + "ce_loss_13": 2.7349390149116517, + "ce_loss_26": 2.1141091108322145, + "ce_loss_39": 1.909931591153145, + "ce_loss_52": 1.3686757802963256, + "ce_loss_7": 3.0966077923774717, + "epoch": 0.408, + "grad_norm": 15.787212276524022, + "kl_loss_13": 2801.6, + "kl_loss_26": 1509.4, + "kl_loss_39": 1071.0, + "kl_loss_7": 3562.8, + "learning_rate": 0.0006514873933708637, + "loss": 4534.05, + "step": 4080 + }, + { + "ce_loss_13": 2.742733418941498, + "ce_loss_26": 2.1391125679016114, + "ce_loss_39": 1.9272442519664765, + "ce_loss_52": 1.387654460966587, + "ce_loss_7": 3.0977914452552797, + "epoch": 0.409, + "grad_norm": 15.727797591546214, + "kl_loss_13": 2755.6, + "kl_loss_26": 1488.4, + "kl_loss_39": 1050.3, + "kl_loss_7": 3508.0, + "learning_rate": 0.0006499745456385053, + "loss": 4444.65, + "step": 4090 + }, + { + "ce_loss_13": 2.7960755199193956, + "ce_loss_26": 2.184322661161423, + "ce_loss_39": 1.9677571415901185, + "ce_loss_52": 1.4271342948079109, + "ce_loss_7": 3.1514409124851226, + "epoch": 0.41, + "grad_norm": 15.52426613691677, + "kl_loss_13": 2809.8, + "kl_loss_26": 1518.3, + "kl_loss_39": 1075.2, + "kl_loss_7": 3551.6, + "learning_rate": 0.0006484601876641375, + "loss": 4500.65, + "step": 4100 + }, + { + "ce_loss_13": 2.8776713728904726, + "ce_loss_26": 2.257500499486923, + "ce_loss_39": 2.0303492128849028, + "ce_loss_52": 1.4582158356904984, + "ce_loss_7": 3.2387999415397646, + "epoch": 0.411, + "grad_norm": 15.93298743678484, + "kl_loss_13": 2878.8, + "kl_loss_26": 1576.0, + "kl_loss_39": 1115.6, + "kl_loss_7": 3640.8, + "learning_rate": 0.000646944334697328, + "loss": 4470.55, + "step": 4110 + }, + { + "ce_loss_13": 2.802631789445877, + "ce_loss_26": 2.2029493927955626, + "ce_loss_39": 2.001139259338379, + "ce_loss_52": 1.4623139530420304, + "ce_loss_7": 3.155901437997818, + "epoch": 0.412, + "grad_norm": 14.691054390726734, + "kl_loss_13": 2720.8, + "kl_loss_26": 1465.2, + "kl_loss_39": 1041.4, + "kl_loss_7": 3462.4, + "learning_rate": 0.0006454270020026995, + "loss": 4502.65, + "step": 4120 + }, + { + "ce_loss_13": 2.8162184596061706, + "ce_loss_26": 2.1934009909629824, + "ce_loss_39": 1.979950374364853, + "ce_loss_52": 1.4344559267163277, + "ce_loss_7": 3.1758966505527497, + "epoch": 0.413, + "grad_norm": 16.25780643806628, + "kl_loss_13": 2816.0, + "kl_loss_26": 1518.6, + "kl_loss_39": 1077.6, + "kl_loss_7": 3573.6, + "learning_rate": 0.0006439082048597755, + "loss": 4487.45, + "step": 4130 + }, + { + "ce_loss_13": 2.787912631034851, + "ce_loss_26": 2.1966257959604265, + "ce_loss_39": 1.9914580851793289, + "ce_loss_52": 1.4511510521173476, + "ce_loss_7": 3.1392914772033693, + "epoch": 0.414, + "grad_norm": 17.37704963704925, + "kl_loss_13": 2734.8, + "kl_loss_26": 1487.2, + "kl_loss_39": 1057.4, + "kl_loss_7": 3474.0, + "learning_rate": 0.0006423879585628261, + "loss": 4448.15, + "step": 4140 + }, + { + "ce_loss_13": 2.817258411645889, + "ce_loss_26": 2.1947576314210893, + "ce_loss_39": 1.9762789696455, + "ce_loss_52": 1.433014589548111, + "ce_loss_7": 3.182687884569168, + "epoch": 0.415, + "grad_norm": 15.35502556975723, + "kl_loss_13": 2826.8, + "kl_loss_26": 1522.0, + "kl_loss_39": 1072.5, + "kl_loss_7": 3595.2, + "learning_rate": 0.0006408662784207149, + "loss": 4433.75, + "step": 4150 + }, + { + "ce_loss_13": 2.817685341835022, + "ce_loss_26": 2.2071537256240843, + "ce_loss_39": 1.9907894372940063, + "ce_loss_52": 1.4230278193950654, + "ce_loss_7": 3.1795800507068632, + "epoch": 0.416, + "grad_norm": 15.573867614749913, + "kl_loss_13": 2866.0, + "kl_loss_26": 1558.6, + "kl_loss_39": 1107.2, + "kl_loss_7": 3632.0, + "learning_rate": 0.0006393431797567439, + "loss": 4436.3, + "step": 4160 + }, + { + "ce_loss_13": 2.819452613592148, + "ce_loss_26": 2.213544499874115, + "ce_loss_39": 1.9934939831495284, + "ce_loss_52": 1.4420817136764525, + "ce_loss_7": 3.1729123532772063, + "epoch": 0.417, + "grad_norm": 15.840337845359416, + "kl_loss_13": 2809.4, + "kl_loss_26": 1533.4, + "kl_loss_39": 1076.0, + "kl_loss_7": 3544.0, + "learning_rate": 0.0006378186779084996, + "loss": 4429.6, + "step": 4170 + }, + { + "ce_loss_13": 2.797993552684784, + "ce_loss_26": 2.2015393495559694, + "ce_loss_39": 1.986987265944481, + "ce_loss_52": 1.446770191192627, + "ce_loss_7": 3.145763796567917, + "epoch": 0.418, + "grad_norm": 16.258575254109445, + "kl_loss_13": 2768.4, + "kl_loss_26": 1520.4, + "kl_loss_39": 1076.5, + "kl_loss_7": 3502.0, + "learning_rate": 0.0006362927882276989, + "loss": 4452.8, + "step": 4180 + }, + { + "ce_loss_13": 2.809996685385704, + "ce_loss_26": 2.1883741706609725, + "ce_loss_39": 1.972084966301918, + "ce_loss_52": 1.4272316336631774, + "ce_loss_7": 3.1641923069953917, + "epoch": 0.419, + "grad_norm": 17.021132117568744, + "kl_loss_13": 2806.8, + "kl_loss_26": 1522.7, + "kl_loss_39": 1076.0, + "kl_loss_7": 3556.4, + "learning_rate": 0.000634765526080034, + "loss": 4434.25, + "step": 4190 + }, + { + "ce_loss_13": 2.7747348487377166, + "ce_loss_26": 2.1618224531412125, + "ce_loss_39": 1.9505164802074433, + "ce_loss_52": 1.4064817115664483, + "ce_loss_7": 3.1292604207992554, + "epoch": 0.42, + "grad_norm": 15.556302486325128, + "kl_loss_13": 2777.6, + "kl_loss_26": 1495.4, + "kl_loss_39": 1055.2, + "kl_loss_7": 3523.2, + "learning_rate": 0.0006332369068450174, + "loss": 4413.55, + "step": 4200 + }, + { + "ce_loss_13": 2.748269832134247, + "ce_loss_26": 2.145698443055153, + "ce_loss_39": 1.935601145029068, + "ce_loss_52": 1.4105115324258803, + "ce_loss_7": 3.1001435458660125, + "epoch": 0.421, + "grad_norm": 15.348610438295403, + "kl_loss_13": 2742.8, + "kl_loss_26": 1480.0, + "kl_loss_39": 1039.8, + "kl_loss_7": 3490.0, + "learning_rate": 0.0006317069459158283, + "loss": 4363.8, + "step": 4210 + }, + { + "ce_loss_13": 2.7747100263834, + "ce_loss_26": 2.16818388402462, + "ce_loss_39": 1.9505507349967957, + "ce_loss_52": 1.4193186193704606, + "ce_loss_7": 3.136371600627899, + "epoch": 0.422, + "grad_norm": 16.358740351868324, + "kl_loss_13": 2764.6, + "kl_loss_26": 1481.3, + "kl_loss_39": 1040.1, + "kl_loss_7": 3516.8, + "learning_rate": 0.0006301756586991561, + "loss": 4421.65, + "step": 4220 + }, + { + "ce_loss_13": 2.8185549050569536, + "ce_loss_26": 2.226038011908531, + "ce_loss_39": 2.013939729332924, + "ce_loss_52": 1.4788149103522301, + "ce_loss_7": 3.1706756830215452, + "epoch": 0.423, + "grad_norm": 14.82164626530813, + "kl_loss_13": 2758.0, + "kl_loss_26": 1495.6, + "kl_loss_39": 1059.3, + "kl_loss_7": 3503.2, + "learning_rate": 0.0006286430606150459, + "loss": 4398.35, + "step": 4230 + }, + { + "ce_loss_13": 2.7891676902770994, + "ce_loss_26": 2.1986444026231764, + "ce_loss_39": 1.9819349884986877, + "ce_loss_52": 1.4562569051980971, + "ce_loss_7": 3.1411671698093415, + "epoch": 0.424, + "grad_norm": 15.535941880253773, + "kl_loss_13": 2717.2, + "kl_loss_26": 1468.6, + "kl_loss_39": 1020.2, + "kl_loss_7": 3457.2, + "learning_rate": 0.0006271091670967436, + "loss": 4370.45, + "step": 4240 + }, + { + "ce_loss_13": 2.8151471495628355, + "ce_loss_26": 2.204052150249481, + "ce_loss_39": 1.9960095703601837, + "ce_loss_52": 1.45780867934227, + "ce_loss_7": 3.1626071453094484, + "epoch": 0.425, + "grad_norm": 16.39177349451075, + "kl_loss_13": 2749.2, + "kl_loss_26": 1471.2, + "kl_loss_39": 1041.9, + "kl_loss_7": 3492.8, + "learning_rate": 0.0006255739935905395, + "loss": 4354.95, + "step": 4250 + }, + { + "ce_loss_13": 2.7719932794570923, + "ce_loss_26": 2.1723096281290055, + "ce_loss_39": 1.9554951965808869, + "ce_loss_52": 1.4198345810174942, + "ce_loss_7": 3.134742945432663, + "epoch": 0.426, + "grad_norm": 17.215386749382045, + "kl_loss_13": 2775.6, + "kl_loss_26": 1506.6, + "kl_loss_39": 1055.7, + "kl_loss_7": 3532.8, + "learning_rate": 0.0006240375555556145, + "loss": 4360.8, + "step": 4260 + }, + { + "ce_loss_13": 2.7217872977256774, + "ce_loss_26": 2.1173421651124955, + "ce_loss_39": 1.9085008651018143, + "ce_loss_52": 1.400168927013874, + "ce_loss_7": 3.0799288749694824, + "epoch": 0.427, + "grad_norm": 15.867423276307166, + "kl_loss_13": 2701.0, + "kl_loss_26": 1432.6, + "kl_loss_39": 996.7, + "kl_loss_7": 3452.8, + "learning_rate": 0.000622499868463882, + "loss": 4320.5, + "step": 4270 + }, + { + "ce_loss_13": 2.7815617978572846, + "ce_loss_26": 2.1786680042743685, + "ce_loss_39": 1.9648784220218658, + "ce_loss_52": 1.4438522070646287, + "ce_loss_7": 3.1414669275283815, + "epoch": 0.428, + "grad_norm": 16.86028992899928, + "kl_loss_13": 2733.2, + "kl_loss_26": 1463.0, + "kl_loss_39": 1028.0, + "kl_loss_7": 3484.8, + "learning_rate": 0.0006209609477998338, + "loss": 4348.9, + "step": 4280 + }, + { + "ce_loss_13": 2.8184913277626036, + "ce_loss_26": 2.213253751397133, + "ce_loss_39": 1.986603057384491, + "ce_loss_52": 1.4555893182754516, + "ce_loss_7": 3.1685641705989838, + "epoch": 0.429, + "grad_norm": 15.40477364702056, + "kl_loss_13": 2779.6, + "kl_loss_26": 1503.0, + "kl_loss_39": 1049.7, + "kl_loss_7": 3514.4, + "learning_rate": 0.0006194208090603844, + "loss": 4374.7, + "step": 4290 + }, + { + "ce_loss_13": 2.726405268907547, + "ce_loss_26": 2.1394855052232744, + "ce_loss_39": 1.9364097625017167, + "ce_loss_52": 1.4365313708782197, + "ce_loss_7": 3.0755336761474608, + "epoch": 0.43, + "grad_norm": 14.784393649721942, + "kl_loss_13": 2680.0, + "kl_loss_26": 1434.2, + "kl_loss_39": 1002.6, + "kl_loss_7": 3415.2, + "learning_rate": 0.0006178794677547138, + "loss": 4325.15, + "step": 4300 + }, + { + "ce_loss_13": 2.78907487988472, + "ce_loss_26": 2.1874548703432084, + "ce_loss_39": 1.9674001038074493, + "ce_loss_52": 1.4388054758310318, + "ce_loss_7": 3.1580194234848022, + "epoch": 0.431, + "grad_norm": 15.540150658114959, + "kl_loss_13": 2772.4, + "kl_loss_26": 1489.8, + "kl_loss_39": 1036.1, + "kl_loss_7": 3534.8, + "learning_rate": 0.0006163369394041111, + "loss": 4337.1, + "step": 4310 + }, + { + "ce_loss_13": 2.7502326130867005, + "ce_loss_26": 2.1552721470594407, + "ce_loss_39": 1.9502787470817566, + "ce_loss_52": 1.4348126232624054, + "ce_loss_7": 3.1085386633872987, + "epoch": 0.432, + "grad_norm": 15.900486211327715, + "kl_loss_13": 2709.8, + "kl_loss_26": 1438.4, + "kl_loss_39": 1010.4, + "kl_loss_7": 3455.6, + "learning_rate": 0.0006147932395418205, + "loss": 4308.0, + "step": 4320 + }, + { + "ce_loss_13": 2.7637496650218965, + "ce_loss_26": 2.1625583559274673, + "ce_loss_39": 1.947121372818947, + "ce_loss_52": 1.4198297888040543, + "ce_loss_7": 3.1241161942481996, + "epoch": 0.433, + "grad_norm": 16.260371827994177, + "kl_loss_13": 2733.2, + "kl_loss_26": 1467.8, + "kl_loss_39": 1033.8, + "kl_loss_7": 3485.2, + "learning_rate": 0.0006132483837128823, + "loss": 4327.3, + "step": 4330 + }, + { + "ce_loss_13": 2.780191105604172, + "ce_loss_26": 2.1823483228683473, + "ce_loss_39": 1.9749175161123276, + "ce_loss_52": 1.4566338241100312, + "ce_loss_7": 3.142491656541824, + "epoch": 0.434, + "grad_norm": 16.173065879753995, + "kl_loss_13": 2713.6, + "kl_loss_26": 1446.4, + "kl_loss_39": 1012.3, + "kl_loss_7": 3465.6, + "learning_rate": 0.0006117023874739772, + "loss": 4346.0, + "step": 4340 + }, + { + "ce_loss_13": 2.756999599933624, + "ce_loss_26": 2.151958614587784, + "ce_loss_39": 1.9352585464715957, + "ce_loss_52": 1.4167816311120986, + "ce_loss_7": 3.1229954183101656, + "epoch": 0.435, + "grad_norm": 16.656646084830363, + "kl_loss_13": 2759.6, + "kl_loss_26": 1478.0, + "kl_loss_39": 1029.0, + "kl_loss_7": 3524.0, + "learning_rate": 0.0006101552663932703, + "loss": 4336.25, + "step": 4350 + }, + { + "ce_loss_13": 2.774202525615692, + "ce_loss_26": 2.172477602958679, + "ce_loss_39": 1.9620429188013078, + "ce_loss_52": 1.43767509162426, + "ce_loss_7": 3.1362563192844393, + "epoch": 0.436, + "grad_norm": 16.067338284310296, + "kl_loss_13": 2744.4, + "kl_loss_26": 1472.4, + "kl_loss_39": 1033.1, + "kl_loss_7": 3493.6, + "learning_rate": 0.0006086070360502539, + "loss": 4296.35, + "step": 4360 + }, + { + "ce_loss_13": 2.787889677286148, + "ce_loss_26": 2.208648791909218, + "ce_loss_39": 1.999781733751297, + "ce_loss_52": 1.4855108827352523, + "ce_loss_7": 3.1249564945697785, + "epoch": 0.437, + "grad_norm": 15.78991831926034, + "kl_loss_13": 2690.0, + "kl_loss_26": 1446.0, + "kl_loss_39": 1014.8, + "kl_loss_7": 3419.2, + "learning_rate": 0.0006070577120355903, + "loss": 4280.75, + "step": 4370 + }, + { + "ce_loss_13": 2.8026595056056975, + "ce_loss_26": 2.207309713959694, + "ce_loss_39": 2.0008264780044556, + "ce_loss_52": 1.4935471057891845, + "ce_loss_7": 3.1493531346321104, + "epoch": 0.438, + "grad_norm": 15.837154081953376, + "kl_loss_13": 2679.6, + "kl_loss_26": 1429.2, + "kl_loss_39": 1001.1, + "kl_loss_7": 3413.2, + "learning_rate": 0.0006055073099509549, + "loss": 4296.35, + "step": 4380 + }, + { + "ce_loss_13": 2.755897510051727, + "ce_loss_26": 2.1693040400743486, + "ce_loss_39": 1.9623985677957534, + "ce_loss_52": 1.4462745368480683, + "ce_loss_7": 3.1059607326984406, + "epoch": 0.439, + "grad_norm": 15.629443906703631, + "kl_loss_13": 2694.4, + "kl_loss_26": 1446.6, + "kl_loss_39": 1012.9, + "kl_loss_7": 3427.6, + "learning_rate": 0.0006039558454088796, + "loss": 4277.25, + "step": 4390 + }, + { + "ce_loss_13": 2.7673678040504455, + "ce_loss_26": 2.159538361430168, + "ce_loss_39": 1.9505891352891922, + "ce_loss_52": 1.4304020568728446, + "ce_loss_7": 3.1244628012180327, + "epoch": 0.44, + "grad_norm": 15.403089942991496, + "kl_loss_13": 2740.4, + "kl_loss_26": 1465.4, + "kl_loss_39": 1024.9, + "kl_loss_7": 3482.4, + "learning_rate": 0.0006024033340325954, + "loss": 4300.2, + "step": 4400 + }, + { + "ce_loss_13": 2.7479640781879424, + "ce_loss_26": 2.1436998754739762, + "ce_loss_39": 1.9348597198724746, + "ce_loss_52": 1.4162754774093629, + "ce_loss_7": 3.1057413816452026, + "epoch": 0.441, + "grad_norm": 16.11204916554698, + "kl_loss_13": 2726.0, + "kl_loss_26": 1457.7, + "kl_loss_39": 1024.6, + "kl_loss_7": 3474.8, + "learning_rate": 0.0006008497914558743, + "loss": 4264.9, + "step": 4410 + }, + { + "ce_loss_13": 2.781216788291931, + "ce_loss_26": 2.1821627736091616, + "ce_loss_39": 1.9793646305799484, + "ce_loss_52": 1.456499743461609, + "ce_loss_7": 3.1415066480636598, + "epoch": 0.442, + "grad_norm": 15.839943843413481, + "kl_loss_13": 2703.2, + "kl_loss_26": 1453.2, + "kl_loss_39": 1022.0, + "kl_loss_7": 3462.8, + "learning_rate": 0.0005992952333228728, + "loss": 4320.7, + "step": 4420 + }, + { + "ce_loss_13": 2.6314639270305635, + "ce_loss_26": 2.0378955364227296, + "ce_loss_39": 1.841288161277771, + "ce_loss_52": 1.367735171318054, + "ce_loss_7": 2.983852916955948, + "epoch": 0.443, + "grad_norm": 15.715297314159198, + "kl_loss_13": 2586.8, + "kl_loss_26": 1338.4, + "kl_loss_39": 929.4, + "kl_loss_7": 3328.4, + "learning_rate": 0.0005977396752879741, + "loss": 4224.0, + "step": 4430 + }, + { + "ce_loss_13": 2.747091996669769, + "ce_loss_26": 2.1425569266080857, + "ce_loss_39": 1.9297908574342728, + "ce_loss_52": 1.4299270451068877, + "ce_loss_7": 3.0992358028888702, + "epoch": 0.444, + "grad_norm": 15.44431585804275, + "kl_loss_13": 2688.8, + "kl_loss_26": 1428.0, + "kl_loss_39": 986.6, + "kl_loss_7": 3434.0, + "learning_rate": 0.0005961831330156305, + "loss": 4224.4, + "step": 4440 + }, + { + "ce_loss_13": 2.7738942086696623, + "ce_loss_26": 2.1665944904088974, + "ce_loss_39": 1.9503348082304002, + "ce_loss_52": 1.4405199617147446, + "ce_loss_7": 3.131233388185501, + "epoch": 0.445, + "grad_norm": 15.683988530213393, + "kl_loss_13": 2701.2, + "kl_loss_26": 1441.4, + "kl_loss_39": 999.9, + "kl_loss_7": 3451.6, + "learning_rate": 0.0005946256221802051, + "loss": 4233.55, + "step": 4450 + }, + { + "ce_loss_13": 2.6961427688598634, + "ce_loss_26": 2.1098335653543474, + "ce_loss_39": 1.8978259444236756, + "ce_loss_52": 1.415482410788536, + "ce_loss_7": 3.044248181581497, + "epoch": 0.446, + "grad_norm": 15.256632143150593, + "kl_loss_13": 2612.8, + "kl_loss_26": 1380.2, + "kl_loss_39": 951.7, + "kl_loss_7": 3350.8, + "learning_rate": 0.0005930671584658151, + "loss": 4214.65, + "step": 4460 + }, + { + "ce_loss_13": 2.732464927434921, + "ce_loss_26": 2.141967472434044, + "ce_loss_39": 1.929695299267769, + "ce_loss_52": 1.4190144926309585, + "ce_loss_7": 3.091973352432251, + "epoch": 0.447, + "grad_norm": 16.38296166656899, + "kl_loss_13": 2676.8, + "kl_loss_26": 1435.6, + "kl_loss_39": 1000.5, + "kl_loss_7": 3428.4, + "learning_rate": 0.0005915077575661722, + "loss": 4280.4, + "step": 4470 + }, + { + "ce_loss_13": 2.683997756242752, + "ce_loss_26": 2.091261792182922, + "ce_loss_39": 1.8827648997306823, + "ce_loss_52": 1.3911016047000886, + "ce_loss_7": 3.0366858661174776, + "epoch": 0.448, + "grad_norm": 15.184401397346244, + "kl_loss_13": 2642.4, + "kl_loss_26": 1399.3, + "kl_loss_39": 968.5, + "kl_loss_7": 3392.0, + "learning_rate": 0.000589947435184427, + "loss": 4194.3, + "step": 4480 + }, + { + "ce_loss_13": 2.7240218341350557, + "ce_loss_26": 2.128333044052124, + "ce_loss_39": 1.924179795384407, + "ce_loss_52": 1.4498969972133637, + "ce_loss_7": 3.0761671125888825, + "epoch": 0.449, + "grad_norm": 17.07642900561258, + "kl_loss_13": 2604.8, + "kl_loss_26": 1344.6, + "kl_loss_39": 924.8, + "kl_loss_7": 3354.0, + "learning_rate": 0.0005883862070330078, + "loss": 4206.7, + "step": 4490 + }, + { + "ce_loss_13": 2.7219722032547, + "ce_loss_26": 2.1390156149864197, + "ce_loss_39": 1.9286952793598175, + "ce_loss_52": 1.4294085174798965, + "ce_loss_7": 3.0803197801113127, + "epoch": 0.45, + "grad_norm": 15.274632326953679, + "kl_loss_13": 2621.6, + "kl_loss_26": 1403.4, + "kl_loss_39": 973.4, + "kl_loss_7": 3369.2, + "learning_rate": 0.0005868240888334653, + "loss": 4211.9, + "step": 4500 + }, + { + "ce_loss_13": 2.6810890555381777, + "ce_loss_26": 2.105925416946411, + "ce_loss_39": 1.9109346747398377, + "ce_loss_52": 1.4293138086795807, + "ce_loss_7": 3.033176803588867, + "epoch": 0.451, + "grad_norm": 17.03243058089965, + "kl_loss_13": 2608.2, + "kl_loss_26": 1375.3, + "kl_loss_39": 955.3, + "kl_loss_7": 3345.2, + "learning_rate": 0.0005852610963163119, + "loss": 4209.7, + "step": 4510 + }, + { + "ce_loss_13": 2.689431291818619, + "ce_loss_26": 2.1146853864192963, + "ce_loss_39": 1.9112016946077346, + "ce_loss_52": 1.4318486779928208, + "ce_loss_7": 3.0395568013191223, + "epoch": 0.452, + "grad_norm": 15.510330374157597, + "kl_loss_13": 2583.2, + "kl_loss_26": 1365.8, + "kl_loss_39": 947.8, + "kl_loss_7": 3313.2, + "learning_rate": 0.0005836972452208654, + "loss": 4185.25, + "step": 4520 + }, + { + "ce_loss_13": 2.7475471079349516, + "ce_loss_26": 2.157953730225563, + "ce_loss_39": 1.9457335144281387, + "ce_loss_52": 1.4404390811920167, + "ce_loss_7": 3.104820030927658, + "epoch": 0.453, + "grad_norm": 15.220972226004102, + "kl_loss_13": 2688.4, + "kl_loss_26": 1444.2, + "kl_loss_39": 1006.8, + "kl_loss_7": 3431.6, + "learning_rate": 0.0005821325512950885, + "loss": 4222.6, + "step": 4530 + }, + { + "ce_loss_13": 2.7701157510280607, + "ce_loss_26": 2.1823565661907196, + "ce_loss_39": 1.9763484060764314, + "ce_loss_52": 1.4842363893985748, + "ce_loss_7": 3.1242611587047575, + "epoch": 0.454, + "grad_norm": 16.181871779695452, + "kl_loss_13": 2641.6, + "kl_loss_26": 1398.0, + "kl_loss_39": 968.2, + "kl_loss_7": 3387.6, + "learning_rate": 0.0005805670302954321, + "loss": 4206.95, + "step": 4540 + }, + { + "ce_loss_13": 2.69073800444603, + "ce_loss_26": 2.1018889248371124, + "ce_loss_39": 1.8928476065397262, + "ce_loss_52": 1.4156933531165123, + "ce_loss_7": 3.051577550172806, + "epoch": 0.455, + "grad_norm": 15.802548274169151, + "kl_loss_13": 2629.2, + "kl_loss_26": 1374.2, + "kl_loss_39": 934.4, + "kl_loss_7": 3382.8, + "learning_rate": 0.000579000697986675, + "loss": 4173.65, + "step": 4550 + }, + { + "ce_loss_13": 2.734744447469711, + "ce_loss_26": 2.139357805252075, + "ce_loss_39": 1.9327150255441665, + "ce_loss_52": 1.44863750487566, + "ce_loss_7": 3.083251416683197, + "epoch": 0.456, + "grad_norm": 15.332335326805197, + "kl_loss_13": 2646.0, + "kl_loss_26": 1390.4, + "kl_loss_39": 959.2, + "kl_loss_7": 3391.2, + "learning_rate": 0.0005774335701417662, + "loss": 4177.45, + "step": 4560 + }, + { + "ce_loss_13": 2.696203714609146, + "ce_loss_26": 2.102033945918083, + "ce_loss_39": 1.8971556156873703, + "ce_loss_52": 1.4197801396250724, + "ce_loss_7": 3.047564595937729, + "epoch": 0.457, + "grad_norm": 16.076096882060348, + "kl_loss_13": 2600.0, + "kl_loss_26": 1362.8, + "kl_loss_39": 939.0, + "kl_loss_7": 3341.2, + "learning_rate": 0.0005758656625416658, + "loss": 4183.3, + "step": 4570 + }, + { + "ce_loss_13": 2.7472688376903536, + "ce_loss_26": 2.1439451813697814, + "ce_loss_39": 1.9378813654184341, + "ce_loss_52": 1.4498099207878112, + "ce_loss_7": 3.1056803286075594, + "epoch": 0.458, + "grad_norm": 15.434602661166036, + "kl_loss_13": 2667.6, + "kl_loss_26": 1394.6, + "kl_loss_39": 964.2, + "kl_loss_7": 3421.6, + "learning_rate": 0.0005742969909751859, + "loss": 4202.65, + "step": 4580 + }, + { + "ce_loss_13": 2.819844591617584, + "ce_loss_26": 2.217494735121727, + "ce_loss_39": 1.995268750190735, + "ce_loss_52": 1.4828792631626129, + "ce_loss_7": 3.183567076921463, + "epoch": 0.459, + "grad_norm": 15.16665840440692, + "kl_loss_13": 2715.2, + "kl_loss_26": 1447.8, + "kl_loss_39": 1000.9, + "kl_loss_7": 3469.2, + "learning_rate": 0.0005727275712388318, + "loss": 4159.15, + "step": 4590 + }, + { + "ce_loss_13": 2.757504242658615, + "ce_loss_26": 2.1517707139253615, + "ce_loss_39": 1.9407849818468095, + "ce_loss_52": 1.4452633827924728, + "ce_loss_7": 3.1194639682769774, + "epoch": 0.46, + "grad_norm": 16.189362396808324, + "kl_loss_13": 2690.8, + "kl_loss_26": 1421.0, + "kl_loss_39": 976.6, + "kl_loss_7": 3439.6, + "learning_rate": 0.0005711574191366427, + "loss": 4126.7, + "step": 4600 + }, + { + "ce_loss_13": 2.7185553312301636, + "ce_loss_26": 2.140464088320732, + "ce_loss_39": 1.9312822461128234, + "ce_loss_52": 1.4499182224273681, + "ce_loss_7": 3.074033808708191, + "epoch": 0.461, + "grad_norm": 15.796779340482095, + "kl_loss_13": 2590.8, + "kl_loss_26": 1373.5, + "kl_loss_39": 946.6, + "kl_loss_7": 3326.0, + "learning_rate": 0.0005695865504800327, + "loss": 4117.15, + "step": 4610 + }, + { + "ce_loss_13": 2.689697802066803, + "ce_loss_26": 2.1229261219501496, + "ce_loss_39": 1.9236579477787017, + "ce_loss_52": 1.4474295616149901, + "ce_loss_7": 3.0403923749923707, + "epoch": 0.462, + "grad_norm": 15.469933015809259, + "kl_loss_13": 2559.6, + "kl_loss_26": 1353.2, + "kl_loss_39": 936.3, + "kl_loss_7": 3287.2, + "learning_rate": 0.0005680149810876322, + "loss": 4141.45, + "step": 4620 + }, + { + "ce_loss_13": 2.709194713830948, + "ce_loss_26": 2.1067664295434954, + "ce_loss_39": 1.8838362753391267, + "ce_loss_52": 1.4009573340415955, + "ce_loss_7": 3.070843666791916, + "epoch": 0.463, + "grad_norm": 15.475096434174118, + "kl_loss_13": 2674.8, + "kl_loss_26": 1399.4, + "kl_loss_39": 943.3, + "kl_loss_7": 3422.8, + "learning_rate": 0.0005664427267851271, + "loss": 4160.8, + "step": 4630 + }, + { + "ce_loss_13": 2.7120527923107147, + "ce_loss_26": 2.120649069547653, + "ce_loss_39": 1.9163700252771378, + "ce_loss_52": 1.4380100429058076, + "ce_loss_7": 3.061056911945343, + "epoch": 0.464, + "grad_norm": 15.555501351653449, + "kl_loss_13": 2608.0, + "kl_loss_26": 1354.6, + "kl_loss_39": 932.5, + "kl_loss_7": 3340.4, + "learning_rate": 0.0005648698034051009, + "loss": 4170.2, + "step": 4640 + }, + { + "ce_loss_13": 2.7389404594898226, + "ce_loss_26": 2.1454448729753492, + "ce_loss_39": 1.9379454165697099, + "ce_loss_52": 1.45494404733181, + "ce_loss_7": 3.090787374973297, + "epoch": 0.465, + "grad_norm": 17.172011566290227, + "kl_loss_13": 2617.0, + "kl_loss_26": 1379.3, + "kl_loss_39": 945.6, + "kl_loss_7": 3354.0, + "learning_rate": 0.0005632962267868747, + "loss": 4137.2, + "step": 4650 + }, + { + "ce_loss_13": 2.618588683009148, + "ce_loss_26": 2.0511282205581667, + "ce_loss_39": 1.857603308558464, + "ce_loss_52": 1.3924044981598853, + "ce_loss_7": 2.9628873229026795, + "epoch": 0.466, + "grad_norm": 15.078077143393564, + "kl_loss_13": 2528.4, + "kl_loss_26": 1330.4, + "kl_loss_39": 924.6, + "kl_loss_7": 3258.4, + "learning_rate": 0.0005617220127763474, + "loss": 4108.7, + "step": 4660 + }, + { + "ce_loss_13": 2.706065672636032, + "ce_loss_26": 2.128128296136856, + "ce_loss_39": 1.921997308731079, + "ce_loss_52": 1.4441686987876892, + "ce_loss_7": 3.059876149892807, + "epoch": 0.467, + "grad_norm": 16.456195061500843, + "kl_loss_13": 2576.0, + "kl_loss_26": 1356.8, + "kl_loss_39": 938.3, + "kl_loss_7": 3319.6, + "learning_rate": 0.0005601471772258368, + "loss": 4092.5, + "step": 4670 + }, + { + "ce_loss_13": 2.691108763217926, + "ce_loss_26": 2.1213403046131134, + "ce_loss_39": 1.9077781707048416, + "ce_loss_52": 1.4377064436674118, + "ce_loss_7": 3.0406983733177184, + "epoch": 0.468, + "grad_norm": 15.284887521485958, + "kl_loss_13": 2584.8, + "kl_loss_26": 1368.0, + "kl_loss_39": 933.0, + "kl_loss_7": 3316.4, + "learning_rate": 0.0005585717359939192, + "loss": 4090.9, + "step": 4680 + }, + { + "ce_loss_13": 2.715133213996887, + "ce_loss_26": 2.1481328904628754, + "ce_loss_39": 1.938890340924263, + "ce_loss_52": 1.456964261829853, + "ce_loss_7": 3.063201904296875, + "epoch": 0.469, + "grad_norm": 14.95117298176539, + "kl_loss_13": 2573.6, + "kl_loss_26": 1372.5, + "kl_loss_39": 948.0, + "kl_loss_7": 3294.4, + "learning_rate": 0.0005569957049452703, + "loss": 4067.75, + "step": 4690 + }, + { + "ce_loss_13": 2.7355633437633515, + "ce_loss_26": 2.130109578371048, + "ce_loss_39": 1.9153006434440614, + "ce_loss_52": 1.4125859558582305, + "ce_loss_7": 3.0963422894477843, + "epoch": 0.47, + "grad_norm": 15.35914349074289, + "kl_loss_13": 2726.0, + "kl_loss_26": 1448.2, + "kl_loss_39": 1002.1, + "kl_loss_7": 3478.0, + "learning_rate": 0.0005554190999505056, + "loss": 4157.55, + "step": 4700 + }, + { + "ce_loss_13": 2.6879218101501463, + "ce_loss_26": 2.098815104365349, + "ce_loss_39": 1.8952988266944886, + "ce_loss_52": 1.4328953355550766, + "ce_loss_7": 3.040910530090332, + "epoch": 0.471, + "grad_norm": 15.95882741217506, + "kl_loss_13": 2568.0, + "kl_loss_26": 1323.6, + "kl_loss_39": 901.3, + "kl_loss_7": 3319.6, + "learning_rate": 0.0005538419368860196, + "loss": 4062.85, + "step": 4710 + }, + { + "ce_loss_13": 2.6794900715351107, + "ce_loss_26": 2.0905598402023315, + "ce_loss_39": 1.8867737114429475, + "ce_loss_52": 1.4186928808689117, + "ce_loss_7": 3.0324371635913847, + "epoch": 0.472, + "grad_norm": 15.674607259246525, + "kl_loss_13": 2569.4, + "kl_loss_26": 1340.6, + "kl_loss_39": 914.1, + "kl_loss_7": 3303.6, + "learning_rate": 0.0005522642316338268, + "loss": 4084.1, + "step": 4720 + }, + { + "ce_loss_13": 2.7094511866569517, + "ce_loss_26": 2.1240269035100936, + "ce_loss_39": 1.9192228257656097, + "ce_loss_52": 1.4589810997247696, + "ce_loss_7": 3.0527816653251647, + "epoch": 0.473, + "grad_norm": 15.84409146313606, + "kl_loss_13": 2555.6, + "kl_loss_26": 1330.2, + "kl_loss_39": 901.4, + "kl_loss_7": 3270.0, + "learning_rate": 0.0005506860000814017, + "loss": 4024.65, + "step": 4730 + }, + { + "ce_loss_13": 2.686307519674301, + "ce_loss_26": 2.1099446028470994, + "ce_loss_39": 1.9098408967256546, + "ce_loss_52": 1.4574729681015015, + "ce_loss_7": 3.03258957862854, + "epoch": 0.474, + "grad_norm": 16.290287645699628, + "kl_loss_13": 2538.4, + "kl_loss_26": 1308.0, + "kl_loss_39": 889.7, + "kl_loss_7": 3278.4, + "learning_rate": 0.0005491072581215186, + "loss": 4058.25, + "step": 4740 + }, + { + "ce_loss_13": 2.685838830471039, + "ce_loss_26": 2.097555673122406, + "ce_loss_39": 1.8953343421220779, + "ce_loss_52": 1.4276258319616317, + "ce_loss_7": 3.042680394649506, + "epoch": 0.475, + "grad_norm": 15.834434583556519, + "kl_loss_13": 2590.0, + "kl_loss_26": 1354.4, + "kl_loss_39": 931.0, + "kl_loss_7": 3334.4, + "learning_rate": 0.0005475280216520913, + "loss": 4057.7, + "step": 4750 + }, + { + "ce_loss_13": 2.645804923772812, + "ce_loss_26": 2.073691374063492, + "ce_loss_39": 1.872296717762947, + "ce_loss_52": 1.4167594254016875, + "ce_loss_7": 2.992863970994949, + "epoch": 0.476, + "grad_norm": 15.93243827944326, + "kl_loss_13": 2532.4, + "kl_loss_26": 1326.8, + "kl_loss_39": 900.3, + "kl_loss_7": 3268.8, + "learning_rate": 0.0005459483065760138, + "loss": 4104.4, + "step": 4760 + }, + { + "ce_loss_13": 2.7042051672935488, + "ce_loss_26": 2.1117112547159196, + "ce_loss_39": 1.9040750682353973, + "ce_loss_52": 1.4314228266477584, + "ce_loss_7": 3.0566958367824553, + "epoch": 0.477, + "grad_norm": 15.353551244219883, + "kl_loss_13": 2608.4, + "kl_loss_26": 1361.8, + "kl_loss_39": 937.4, + "kl_loss_7": 3352.4, + "learning_rate": 0.0005443681288009991, + "loss": 4078.7, + "step": 4770 + }, + { + "ce_loss_13": 2.6856160342693327, + "ce_loss_26": 2.0829177469015123, + "ce_loss_39": 1.8763625353574753, + "ce_loss_52": 1.4030324995517731, + "ce_loss_7": 3.0448363423347473, + "epoch": 0.478, + "grad_norm": 16.14315974332923, + "kl_loss_13": 2646.4, + "kl_loss_26": 1377.9, + "kl_loss_39": 941.4, + "kl_loss_7": 3398.4, + "learning_rate": 0.0005427875042394199, + "loss": 4031.6, + "step": 4780 + }, + { + "ce_loss_13": 2.6821600914001467, + "ce_loss_26": 2.1105621844530105, + "ce_loss_39": 1.9080501794815063, + "ce_loss_52": 1.4579481482505798, + "ce_loss_7": 3.0244390606880187, + "epoch": 0.479, + "grad_norm": 16.41928903258253, + "kl_loss_13": 2508.6, + "kl_loss_26": 1304.7, + "kl_loss_39": 890.8, + "kl_loss_7": 3233.6, + "learning_rate": 0.0005412064488081482, + "loss": 4041.85, + "step": 4790 + }, + { + "ce_loss_13": 2.644778722524643, + "ce_loss_26": 2.0624290674924852, + "ce_loss_39": 1.8686836928129196, + "ce_loss_52": 1.4204004764556886, + "ce_loss_7": 2.987608629465103, + "epoch": 0.48, + "grad_norm": 15.406655412683705, + "kl_loss_13": 2521.2, + "kl_loss_26": 1293.1, + "kl_loss_39": 883.5, + "kl_loss_7": 3258.4, + "learning_rate": 0.0005396249784283942, + "loss": 4018.65, + "step": 4800 + }, + { + "ce_loss_13": 2.6721664726734162, + "ce_loss_26": 2.0903854191303255, + "ce_loss_39": 1.8859550595283507, + "ce_loss_52": 1.4339767321944237, + "ce_loss_7": 3.022903573513031, + "epoch": 0.481, + "grad_norm": 15.533152676819311, + "kl_loss_13": 2549.2, + "kl_loss_26": 1311.5, + "kl_loss_39": 887.7, + "kl_loss_7": 3276.8, + "learning_rate": 0.0005380431090255476, + "loss": 4094.5, + "step": 4810 + }, + { + "ce_loss_13": 2.7146060168743134, + "ce_loss_26": 2.138300836086273, + "ce_loss_39": 1.9245162457227707, + "ce_loss_52": 1.4378316938877105, + "ce_loss_7": 3.0600010454654694, + "epoch": 0.482, + "grad_norm": 15.889585422309347, + "kl_loss_13": 2616.4, + "kl_loss_26": 1396.0, + "kl_loss_39": 954.3, + "kl_loss_7": 3337.2, + "learning_rate": 0.0005364608565290155, + "loss": 4019.85, + "step": 4820 + }, + { + "ce_loss_13": 2.7267942845821382, + "ce_loss_26": 2.1315987795591353, + "ce_loss_39": 1.92288878262043, + "ce_loss_52": 1.4603519141674042, + "ce_loss_7": 3.089752674102783, + "epoch": 0.483, + "grad_norm": 14.942603293515566, + "kl_loss_13": 2598.4, + "kl_loss_26": 1350.0, + "kl_loss_39": 913.7, + "kl_loss_7": 3351.2, + "learning_rate": 0.0005348782368720626, + "loss": 4054.35, + "step": 4830 + }, + { + "ce_loss_13": 2.702422133088112, + "ce_loss_26": 2.1349568367004395, + "ce_loss_39": 1.923803049325943, + "ce_loss_52": 1.4461625874042512, + "ce_loss_7": 3.0548571348190308, + "epoch": 0.484, + "grad_norm": 14.924067089524062, + "kl_loss_13": 2579.8, + "kl_loss_26": 1357.8, + "kl_loss_39": 926.9, + "kl_loss_7": 3324.0, + "learning_rate": 0.000533295265991652, + "loss": 4024.65, + "step": 4840 + }, + { + "ce_loss_13": 2.6280339270830155, + "ce_loss_26": 2.0501014798879624, + "ce_loss_39": 1.8476706713438034, + "ce_loss_52": 1.3977838337421418, + "ce_loss_7": 2.9806483924388885, + "epoch": 0.485, + "grad_norm": 16.07575861381767, + "kl_loss_13": 2515.0, + "kl_loss_26": 1293.9, + "kl_loss_39": 881.9, + "kl_loss_7": 3258.8, + "learning_rate": 0.0005317119598282822, + "loss": 4003.05, + "step": 4850 + }, + { + "ce_loss_13": 2.7017304062843324, + "ce_loss_26": 2.123658448457718, + "ce_loss_39": 1.9226309835910798, + "ce_loss_52": 1.4685954213142396, + "ce_loss_7": 3.052913784980774, + "epoch": 0.486, + "grad_norm": 14.261884924612158, + "kl_loss_13": 2552.0, + "kl_loss_26": 1330.6, + "kl_loss_39": 901.6, + "kl_loss_7": 3291.2, + "learning_rate": 0.0005301283343258293, + "loss": 4032.7, + "step": 4860 + }, + { + "ce_loss_13": 2.663911575078964, + "ce_loss_26": 2.0811035096645356, + "ce_loss_39": 1.8746151685714723, + "ce_loss_52": 1.4212765499949456, + "ce_loss_7": 3.0172334790229796, + "epoch": 0.487, + "grad_norm": 15.979242608660392, + "kl_loss_13": 2526.0, + "kl_loss_26": 1301.0, + "kl_loss_39": 883.9, + "kl_loss_7": 3276.4, + "learning_rate": 0.000528544405431384, + "loss": 4020.25, + "step": 4870 + }, + { + "ce_loss_13": 2.660174161195755, + "ce_loss_26": 2.0942499101161958, + "ce_loss_39": 1.8920150458812715, + "ce_loss_52": 1.4465530335903167, + "ce_loss_7": 3.0074438989162444, + "epoch": 0.488, + "grad_norm": 14.78308909898239, + "kl_loss_13": 2500.8, + "kl_loss_26": 1294.5, + "kl_loss_39": 873.6, + "kl_loss_7": 3228.0, + "learning_rate": 0.000526960189095093, + "loss": 4016.5, + "step": 4880 + }, + { + "ce_loss_13": 2.635312020778656, + "ce_loss_26": 2.0795453995466233, + "ce_loss_39": 1.8807054102420806, + "ce_loss_52": 1.4317258328199387, + "ce_loss_7": 2.9746360957622526, + "epoch": 0.489, + "grad_norm": 15.033016045239172, + "kl_loss_13": 2473.6, + "kl_loss_26": 1289.8, + "kl_loss_39": 880.7, + "kl_loss_7": 3185.2, + "learning_rate": 0.0005253757012699972, + "loss": 3996.8, + "step": 4890 + }, + { + "ce_loss_13": 2.68422954082489, + "ce_loss_26": 2.1027563750743865, + "ce_loss_39": 1.8974193513393403, + "ce_loss_52": 1.4426774829626083, + "ce_loss_7": 3.037678909301758, + "epoch": 0.49, + "grad_norm": 15.631967370179172, + "kl_loss_13": 2553.2, + "kl_loss_26": 1323.6, + "kl_loss_39": 894.5, + "kl_loss_7": 3295.2, + "learning_rate": 0.0005237909579118712, + "loss": 3967.65, + "step": 4900 + }, + { + "ce_loss_13": 2.680465018749237, + "ce_loss_26": 2.1046013057231905, + "ce_loss_39": 1.9009331673383714, + "ce_loss_52": 1.44480240046978, + "ce_loss_7": 3.036197912693024, + "epoch": 0.491, + "grad_norm": 15.759372326211485, + "kl_loss_13": 2524.0, + "kl_loss_26": 1301.7, + "kl_loss_39": 885.8, + "kl_loss_7": 3274.8, + "learning_rate": 0.0005222059749790631, + "loss": 3979.5, + "step": 4910 + }, + { + "ce_loss_13": 2.6961126804351805, + "ce_loss_26": 2.1182867020368574, + "ce_loss_39": 1.9074458956718445, + "ce_loss_52": 1.456271693110466, + "ce_loss_7": 3.044221115112305, + "epoch": 0.492, + "grad_norm": 16.34015666907617, + "kl_loss_13": 2536.4, + "kl_loss_26": 1322.4, + "kl_loss_39": 892.3, + "kl_loss_7": 3262.8, + "learning_rate": 0.0005206207684323337, + "loss": 3964.95, + "step": 4920 + }, + { + "ce_loss_13": 2.6250401854515077, + "ce_loss_26": 2.041203039884567, + "ce_loss_39": 1.8369982630014419, + "ce_loss_52": 1.4043618232011794, + "ce_loss_7": 2.976498603820801, + "epoch": 0.493, + "grad_norm": 15.643386607908784, + "kl_loss_13": 2504.8, + "kl_loss_26": 1279.6, + "kl_loss_39": 857.1, + "kl_loss_7": 3243.6, + "learning_rate": 0.000519035354234695, + "loss": 3956.4, + "step": 4930 + }, + { + "ce_loss_13": 2.731994906067848, + "ce_loss_26": 2.151404523849487, + "ce_loss_39": 1.9346221089363098, + "ce_loss_52": 1.4679641619324684, + "ce_loss_7": 3.0894507080316544, + "epoch": 0.494, + "grad_norm": 15.972342532535162, + "kl_loss_13": 2584.2, + "kl_loss_26": 1354.3, + "kl_loss_39": 912.1, + "kl_loss_7": 3338.8, + "learning_rate": 0.0005174497483512506, + "loss": 3986.95, + "step": 4940 + }, + { + "ce_loss_13": 2.692367374897003, + "ce_loss_26": 2.122538897395134, + "ce_loss_39": 1.9124073147773744, + "ce_loss_52": 1.452483794093132, + "ce_loss_7": 3.042392200231552, + "epoch": 0.495, + "grad_norm": 15.983001798116502, + "kl_loss_13": 2533.4, + "kl_loss_26": 1327.0, + "kl_loss_39": 895.7, + "kl_loss_7": 3252.0, + "learning_rate": 0.0005158639667490339, + "loss": 3967.55, + "step": 4950 + }, + { + "ce_loss_13": 2.6044699877500532, + "ce_loss_26": 2.0326590865850447, + "ce_loss_39": 1.8286783695220947, + "ce_loss_52": 1.3870023548603059, + "ce_loss_7": 2.947771966457367, + "epoch": 0.496, + "grad_norm": 15.42571652932802, + "kl_loss_13": 2500.2, + "kl_loss_26": 1296.2, + "kl_loss_39": 875.7, + "kl_loss_7": 3230.8, + "learning_rate": 0.0005142780253968481, + "loss": 3955.8, + "step": 4960 + }, + { + "ce_loss_13": 2.642567253112793, + "ce_loss_26": 2.0776680946350097, + "ce_loss_39": 1.875117465853691, + "ce_loss_52": 1.4392555862665177, + "ce_loss_7": 2.991294425725937, + "epoch": 0.497, + "grad_norm": 15.451081778073426, + "kl_loss_13": 2501.6, + "kl_loss_26": 1291.4, + "kl_loss_39": 867.8, + "kl_loss_7": 3244.8, + "learning_rate": 0.0005126919402651053, + "loss": 3945.75, + "step": 4970 + }, + { + "ce_loss_13": 2.6330737471580505, + "ce_loss_26": 2.0630074977874755, + "ce_loss_39": 1.856469190120697, + "ce_loss_52": 1.4164980471134185, + "ce_loss_7": 2.9769130408763886, + "epoch": 0.498, + "grad_norm": 15.34176013733142, + "kl_loss_13": 2503.2, + "kl_loss_26": 1294.2, + "kl_loss_39": 874.8, + "kl_loss_7": 3228.4, + "learning_rate": 0.0005111057273256647, + "loss": 3917.2, + "step": 4980 + }, + { + "ce_loss_13": 2.6576701521873476, + "ce_loss_26": 2.089341068267822, + "ce_loss_39": 1.8844720661640166, + "ce_loss_52": 1.4477695405483246, + "ce_loss_7": 3.0070637345314024, + "epoch": 0.499, + "grad_norm": 15.00660182184455, + "kl_loss_13": 2473.8, + "kl_loss_26": 1283.9, + "kl_loss_39": 853.9, + "kl_loss_7": 3207.6, + "learning_rate": 0.0005095194025516733, + "loss": 3923.7, + "step": 4990 + }, + { + "ce_loss_13": 2.697542816400528, + "ce_loss_26": 2.125366801023483, + "ce_loss_39": 1.9231987714767456, + "ce_loss_52": 1.4726029485464096, + "ce_loss_7": 3.0440734326839447, + "epoch": 0.5, + "grad_norm": 15.160859466153447, + "kl_loss_13": 2511.6, + "kl_loss_26": 1303.5, + "kl_loss_39": 880.0, + "kl_loss_7": 3249.2, + "learning_rate": 0.000507932981917404, + "loss": 3938.75, + "step": 5000 + }, + { + "ce_loss_13": 2.554905018210411, + "ce_loss_26": 1.9959456473588943, + "ce_loss_39": 1.8001023352146148, + "ce_loss_52": 1.370650653541088, + "ce_loss_7": 2.8987653195858, + "epoch": 0.501, + "grad_norm": 17.14356490933402, + "kl_loss_13": 2439.4, + "kl_loss_26": 1259.5, + "kl_loss_39": 849.2, + "kl_loss_7": 3158.0, + "learning_rate": 0.0005063464813980949, + "loss": 3915.65, + "step": 5010 + }, + { + "ce_loss_13": 2.5977672755718233, + "ce_loss_26": 2.0211563646793365, + "ce_loss_39": 1.8262152045965194, + "ce_loss_52": 1.3998217657208443, + "ce_loss_7": 2.950869935750961, + "epoch": 0.502, + "grad_norm": 14.97441402973133, + "kl_loss_13": 2466.8, + "kl_loss_26": 1257.6, + "kl_loss_39": 848.9, + "kl_loss_7": 3202.0, + "learning_rate": 0.0005047599169697884, + "loss": 3931.7, + "step": 5020 + }, + { + "ce_loss_13": 2.6368628799915315, + "ce_loss_26": 2.0560896009206773, + "ce_loss_39": 1.8569422334432601, + "ce_loss_52": 1.429998092353344, + "ce_loss_7": 2.981409990787506, + "epoch": 0.503, + "grad_norm": 15.444575545537123, + "kl_loss_13": 2486.4, + "kl_loss_26": 1264.6, + "kl_loss_39": 845.9, + "kl_loss_7": 3216.4, + "learning_rate": 0.000503173304609171, + "loss": 3936.6, + "step": 5030 + }, + { + "ce_loss_13": 2.694787061214447, + "ce_loss_26": 2.10608988404274, + "ce_loss_39": 1.8957834452390672, + "ce_loss_52": 1.4505297511816024, + "ce_loss_7": 3.0503607213497164, + "epoch": 0.504, + "grad_norm": 15.80764020207202, + "kl_loss_13": 2558.4, + "kl_loss_26": 1318.1, + "kl_loss_39": 884.7, + "kl_loss_7": 3306.4, + "learning_rate": 0.0005015866602934111, + "loss": 3939.75, + "step": 5040 + }, + { + "ce_loss_13": 2.621226805448532, + "ce_loss_26": 2.0584420263767242, + "ce_loss_39": 1.861902078986168, + "ce_loss_52": 1.4422300636768342, + "ce_loss_7": 2.951041603088379, + "epoch": 0.505, + "grad_norm": 14.881966197673897, + "kl_loss_13": 2426.4, + "kl_loss_26": 1238.3, + "kl_loss_39": 828.1, + "kl_loss_7": 3127.6, + "learning_rate": 0.0005, + "loss": 3934.2, + "step": 5050 + }, + { + "ce_loss_13": 2.660506749153137, + "ce_loss_26": 2.098774325847626, + "ce_loss_39": 1.886313620209694, + "ce_loss_52": 1.4474970057606698, + "ce_loss_7": 3.011888575553894, + "epoch": 0.506, + "grad_norm": 14.941506448796416, + "kl_loss_13": 2498.2, + "kl_loss_26": 1299.9, + "kl_loss_39": 867.9, + "kl_loss_7": 3221.2, + "learning_rate": 0.0004984133397065889, + "loss": 3903.9, + "step": 5060 + }, + { + "ce_loss_13": 2.626861798763275, + "ce_loss_26": 2.0604827493429183, + "ce_loss_39": 1.8599964112043381, + "ce_loss_52": 1.4386487394571303, + "ce_loss_7": 2.975832390785217, + "epoch": 0.507, + "grad_norm": 15.44952616226393, + "kl_loss_13": 2452.8, + "kl_loss_26": 1250.8, + "kl_loss_39": 833.3, + "kl_loss_7": 3188.4, + "learning_rate": 0.0004968266953908291, + "loss": 3885.3, + "step": 5070 + }, + { + "ce_loss_13": 2.562318778038025, + "ce_loss_26": 1.9999902278184891, + "ce_loss_39": 1.8035208880901337, + "ce_loss_52": 1.3915461212396623, + "ce_loss_7": 2.904841202497482, + "epoch": 0.508, + "grad_norm": 14.83407474509182, + "kl_loss_13": 2407.2, + "kl_loss_26": 1226.2, + "kl_loss_39": 815.3, + "kl_loss_7": 3134.0, + "learning_rate": 0.0004952400830302117, + "loss": 3881.25, + "step": 5080 + }, + { + "ce_loss_13": 2.5728674054145815, + "ce_loss_26": 2.013489532470703, + "ce_loss_39": 1.8151986598968506, + "ce_loss_52": 1.3948280960321426, + "ce_loss_7": 2.9155173718929293, + "epoch": 0.509, + "grad_norm": 14.64837783343134, + "kl_loss_13": 2420.0, + "kl_loss_26": 1236.4, + "kl_loss_39": 829.6, + "kl_loss_7": 3145.6, + "learning_rate": 0.0004936535186019053, + "loss": 3875.9, + "step": 5090 + }, + { + "ce_loss_13": 2.657623714208603, + "ce_loss_26": 2.0687634259462357, + "ce_loss_39": 1.8608120799064636, + "ce_loss_52": 1.418680590391159, + "ce_loss_7": 3.0090177237987517, + "epoch": 0.51, + "grad_norm": 15.50653814601008, + "kl_loss_13": 2529.6, + "kl_loss_26": 1302.4, + "kl_loss_39": 874.3, + "kl_loss_7": 3273.2, + "learning_rate": 0.000492067018082596, + "loss": 3924.45, + "step": 5100 + }, + { + "ce_loss_13": 2.651608294248581, + "ce_loss_26": 2.0792359739542006, + "ce_loss_39": 1.8647643625736237, + "ce_loss_52": 1.4323984265327454, + "ce_loss_7": 3.0009018778800964, + "epoch": 0.511, + "grad_norm": 14.358667358457254, + "kl_loss_13": 2496.8, + "kl_loss_26": 1292.0, + "kl_loss_39": 854.6, + "kl_loss_7": 3224.0, + "learning_rate": 0.0004904805974483267, + "loss": 3865.45, + "step": 5110 + }, + { + "ce_loss_13": 2.641629362106323, + "ce_loss_26": 2.072116160392761, + "ce_loss_39": 1.8716523438692092, + "ce_loss_52": 1.4481347769498825, + "ce_loss_7": 2.989072245359421, + "epoch": 0.512, + "grad_norm": 15.321352631891271, + "kl_loss_13": 2423.6, + "kl_loss_26": 1237.5, + "kl_loss_39": 826.9, + "kl_loss_7": 3150.8, + "learning_rate": 0.0004888942726743353, + "loss": 3863.15, + "step": 5120 + }, + { + "ce_loss_13": 2.6105270087718964, + "ce_loss_26": 2.041230320930481, + "ce_loss_39": 1.8405257403850555, + "ce_loss_52": 1.4164400100708008, + "ce_loss_7": 2.9590699791908266, + "epoch": 0.513, + "grad_norm": 15.778047819453661, + "kl_loss_13": 2444.0, + "kl_loss_26": 1244.1, + "kl_loss_39": 835.9, + "kl_loss_7": 3179.2, + "learning_rate": 0.0004873080597348947, + "loss": 3860.6, + "step": 5130 + }, + { + "ce_loss_13": 2.7020871877670287, + "ce_loss_26": 2.1217857897281647, + "ce_loss_39": 1.910724088549614, + "ce_loss_52": 1.4596379309892655, + "ce_loss_7": 3.0534912407398225, + "epoch": 0.514, + "grad_norm": 15.798738758695562, + "kl_loss_13": 2550.4, + "kl_loss_26": 1321.6, + "kl_loss_39": 886.4, + "kl_loss_7": 3290.0, + "learning_rate": 0.0004857219746031519, + "loss": 3877.4, + "step": 5140 + }, + { + "ce_loss_13": 2.629297113418579, + "ce_loss_26": 2.065913289785385, + "ce_loss_39": 1.8652951270341873, + "ce_loss_52": 1.4406857430934905, + "ce_loss_7": 2.975607615709305, + "epoch": 0.515, + "grad_norm": 15.843426047808201, + "kl_loss_13": 2425.2, + "kl_loss_26": 1235.3, + "kl_loss_39": 819.8, + "kl_loss_7": 3155.2, + "learning_rate": 0.0004841360332509663, + "loss": 3881.85, + "step": 5150 + }, + { + "ce_loss_13": 2.6566128492355348, + "ce_loss_26": 2.0851503133773805, + "ce_loss_39": 1.879001685976982, + "ce_loss_52": 1.4529996007680892, + "ce_loss_7": 3.001719295978546, + "epoch": 0.516, + "grad_norm": 15.230952623778938, + "kl_loss_13": 2453.2, + "kl_loss_26": 1247.2, + "kl_loss_39": 825.4, + "kl_loss_7": 3185.6, + "learning_rate": 0.0004825502516487497, + "loss": 3877.3, + "step": 5160 + }, + { + "ce_loss_13": 2.6446830928325653, + "ce_loss_26": 2.087994411587715, + "ce_loss_39": 1.8902768969535828, + "ce_loss_52": 1.4724875479936599, + "ce_loss_7": 2.994678741693497, + "epoch": 0.517, + "grad_norm": 16.39635866458352, + "kl_loss_13": 2415.8, + "kl_loss_26": 1232.4, + "kl_loss_39": 823.7, + "kl_loss_7": 3135.2, + "learning_rate": 0.00048096464576530507, + "loss": 3828.0, + "step": 5170 + }, + { + "ce_loss_13": 2.5944429993629456, + "ce_loss_26": 2.036814641952515, + "ce_loss_39": 1.841314834356308, + "ce_loss_52": 1.4209656611084938, + "ce_loss_7": 2.9290528297424316, + "epoch": 0.518, + "grad_norm": 14.479752914773284, + "kl_loss_13": 2424.2, + "kl_loss_26": 1242.2, + "kl_loss_39": 829.4, + "kl_loss_7": 3150.4, + "learning_rate": 0.00047937923156766646, + "loss": 3845.8, + "step": 5180 + }, + { + "ce_loss_13": 2.660686820745468, + "ce_loss_26": 2.094280996918678, + "ce_loss_39": 1.8889294683933258, + "ce_loss_52": 1.4619006276130677, + "ce_loss_7": 3.008899760246277, + "epoch": 0.519, + "grad_norm": 16.01644389337825, + "kl_loss_13": 2475.6, + "kl_loss_26": 1273.8, + "kl_loss_39": 850.7, + "kl_loss_7": 3204.4, + "learning_rate": 0.00047779402502093696, + "loss": 3846.45, + "step": 5190 + }, + { + "ce_loss_13": 2.5972321003675463, + "ce_loss_26": 2.0274816364049912, + "ce_loss_39": 1.8247474491596223, + "ce_loss_52": 1.4065502554178237, + "ce_loss_7": 2.9485019743442535, + "epoch": 0.52, + "grad_norm": 14.98957972258473, + "kl_loss_13": 2450.8, + "kl_loss_26": 1255.3, + "kl_loss_39": 832.3, + "kl_loss_7": 3178.4, + "learning_rate": 0.0004762090420881289, + "loss": 3895.0, + "step": 5200 + }, + { + "ce_loss_13": 2.665889722108841, + "ce_loss_26": 2.110061451792717, + "ce_loss_39": 1.9084287703037262, + "ce_loss_52": 1.4813150823116303, + "ce_loss_7": 3.0068661749362944, + "epoch": 0.521, + "grad_norm": 15.21430509367723, + "kl_loss_13": 2448.0, + "kl_loss_26": 1268.2, + "kl_loss_39": 849.3, + "kl_loss_7": 3164.4, + "learning_rate": 0.00047462429873000296, + "loss": 3816.95, + "step": 5210 + }, + { + "ce_loss_13": 2.655850923061371, + "ce_loss_26": 2.085476315021515, + "ce_loss_39": 1.874689555168152, + "ce_loss_52": 1.446556892991066, + "ce_loss_7": 3.0054982542991637, + "epoch": 0.522, + "grad_norm": 16.427127136749267, + "kl_loss_13": 2452.4, + "kl_loss_26": 1264.0, + "kl_loss_39": 836.0, + "kl_loss_7": 3188.0, + "learning_rate": 0.0004730398109049071, + "loss": 3838.95, + "step": 5220 + }, + { + "ce_loss_13": 2.677028661966324, + "ce_loss_26": 2.119970577955246, + "ce_loss_39": 1.92548668384552, + "ce_loss_52": 1.4896603375673294, + "ce_loss_7": 3.008550250530243, + "epoch": 0.523, + "grad_norm": 16.0659576233147, + "kl_loss_13": 2444.4, + "kl_loss_26": 1258.1, + "kl_loss_39": 844.1, + "kl_loss_7": 3161.6, + "learning_rate": 0.000471455594568616, + "loss": 3864.55, + "step": 5230 + }, + { + "ce_loss_13": 2.6650006234645844, + "ce_loss_26": 2.085580566525459, + "ce_loss_39": 1.884287601709366, + "ce_loss_52": 1.453607079386711, + "ce_loss_7": 3.0236206233501433, + "epoch": 0.524, + "grad_norm": 14.653166262246215, + "kl_loss_13": 2489.6, + "kl_loss_26": 1272.8, + "kl_loss_39": 845.7, + "kl_loss_7": 3232.4, + "learning_rate": 0.00046987166567417086, + "loss": 3878.7, + "step": 5240 + }, + { + "ce_loss_13": 2.584680277109146, + "ce_loss_26": 2.0209302097558974, + "ce_loss_39": 1.8149988621473312, + "ce_loss_52": 1.390892931818962, + "ce_loss_7": 2.9321685075759887, + "epoch": 0.525, + "grad_norm": 15.168241253700586, + "kl_loss_13": 2428.4, + "kl_loss_26": 1252.2, + "kl_loss_39": 829.9, + "kl_loss_7": 3154.8, + "learning_rate": 0.00046828804017171776, + "loss": 3851.45, + "step": 5250 + }, + { + "ce_loss_13": 2.6199812322854994, + "ce_loss_26": 2.072703015804291, + "ce_loss_39": 1.8745949417352676, + "ce_loss_52": 1.4573716089129447, + "ce_loss_7": 2.9569752633571627, + "epoch": 0.526, + "grad_norm": 15.019656924741254, + "kl_loss_13": 2412.4, + "kl_loss_26": 1238.7, + "kl_loss_39": 825.7, + "kl_loss_7": 3130.0, + "learning_rate": 0.00046670473400834805, + "loss": 3822.4, + "step": 5260 + }, + { + "ce_loss_13": 2.614406701922417, + "ce_loss_26": 2.038132056593895, + "ce_loss_39": 1.835758489370346, + "ce_loss_52": 1.4097227707505227, + "ce_loss_7": 2.953414297103882, + "epoch": 0.527, + "grad_norm": 15.237091081862786, + "kl_loss_13": 2460.0, + "kl_loss_26": 1259.0, + "kl_loss_39": 841.0, + "kl_loss_7": 3178.4, + "learning_rate": 0.00046512176312793734, + "loss": 3820.5, + "step": 5270 + }, + { + "ce_loss_13": 2.5675832986831666, + "ce_loss_26": 2.0170306622982026, + "ce_loss_39": 1.8148203998804093, + "ce_loss_52": 1.4015038818120957, + "ce_loss_7": 2.9166265070438384, + "epoch": 0.528, + "grad_norm": 15.622184104337453, + "kl_loss_13": 2384.4, + "kl_loss_26": 1219.7, + "kl_loss_39": 809.7, + "kl_loss_7": 3111.2, + "learning_rate": 0.00046353914347098467, + "loss": 3805.5, + "step": 5280 + }, + { + "ce_loss_13": 2.600316107273102, + "ce_loss_26": 2.038938209414482, + "ce_loss_39": 1.8384071439504623, + "ce_loss_52": 1.4146052584052087, + "ce_loss_7": 2.9424943923950195, + "epoch": 0.529, + "grad_norm": 16.18662441031771, + "kl_loss_13": 2437.8, + "kl_loss_26": 1252.1, + "kl_loss_39": 829.2, + "kl_loss_7": 3150.0, + "learning_rate": 0.0004619568909744524, + "loss": 3784.9, + "step": 5290 + }, + { + "ce_loss_13": 2.6181087523698805, + "ce_loss_26": 2.0588752537965775, + "ce_loss_39": 1.85763358771801, + "ce_loss_52": 1.4420970767736434, + "ce_loss_7": 2.962410408258438, + "epoch": 0.53, + "grad_norm": 15.067130011925716, + "kl_loss_13": 2406.6, + "kl_loss_26": 1231.8, + "kl_loss_39": 819.2, + "kl_loss_7": 3132.4, + "learning_rate": 0.00046037502157160573, + "loss": 3815.9, + "step": 5300 + }, + { + "ce_loss_13": 2.5704480171203614, + "ce_loss_26": 2.007723420858383, + "ce_loss_39": 1.8157259494066238, + "ce_loss_52": 1.4100188240408897, + "ce_loss_7": 2.9128104388713836, + "epoch": 0.531, + "grad_norm": 14.940801357367253, + "kl_loss_13": 2375.2, + "kl_loss_26": 1192.5, + "kl_loss_39": 790.2, + "kl_loss_7": 3097.2, + "learning_rate": 0.00045879355119185207, + "loss": 3774.75, + "step": 5310 + }, + { + "ce_loss_13": 2.6117980778217316, + "ce_loss_26": 2.049151787161827, + "ce_loss_39": 1.8556257247924806, + "ce_loss_52": 1.4484239488840103, + "ce_loss_7": 2.9571595907211305, + "epoch": 0.532, + "grad_norm": 15.084958990702525, + "kl_loss_13": 2412.8, + "kl_loss_26": 1214.0, + "kl_loss_39": 801.8, + "kl_loss_7": 3144.4, + "learning_rate": 0.0004572124957605803, + "loss": 3796.3, + "step": 5320 + }, + { + "ce_loss_13": 2.6047778964042663, + "ce_loss_26": 2.0361655563116074, + "ce_loss_39": 1.829242792725563, + "ce_loss_52": 1.4240625560283662, + "ce_loss_7": 2.9437944889068604, + "epoch": 0.533, + "grad_norm": 14.875930516711946, + "kl_loss_13": 2409.0, + "kl_loss_26": 1224.1, + "kl_loss_39": 807.1, + "kl_loss_7": 3128.8, + "learning_rate": 0.00045563187119900103, + "loss": 3772.25, + "step": 5330 + }, + { + "ce_loss_13": 2.5524469763040543, + "ce_loss_26": 1.9957795530557632, + "ce_loss_39": 1.7972583919763565, + "ce_loss_52": 1.3942344322800637, + "ce_loss_7": 2.899323457479477, + "epoch": 0.534, + "grad_norm": 14.888961320169138, + "kl_loss_13": 2374.0, + "kl_loss_26": 1190.2, + "kl_loss_39": 785.5, + "kl_loss_7": 3107.6, + "learning_rate": 0.00045405169342398633, + "loss": 3809.2, + "step": 5340 + }, + { + "ce_loss_13": 2.6405540108680725, + "ce_loss_26": 2.080012783408165, + "ce_loss_39": 1.876559317111969, + "ce_loss_52": 1.462306337058544, + "ce_loss_7": 2.981167531013489, + "epoch": 0.535, + "grad_norm": 14.563184685424444, + "kl_loss_13": 2406.4, + "kl_loss_26": 1229.1, + "kl_loss_39": 816.6, + "kl_loss_7": 3130.4, + "learning_rate": 0.0004524719783479088, + "loss": 3797.75, + "step": 5350 + }, + { + "ce_loss_13": 2.645623618364334, + "ce_loss_26": 2.0881788969039916, + "ce_loss_39": 1.888492900133133, + "ce_loss_52": 1.4696861803531647, + "ce_loss_7": 2.994585871696472, + "epoch": 0.536, + "grad_norm": 15.457746807868878, + "kl_loss_13": 2427.2, + "kl_loss_26": 1243.7, + "kl_loss_39": 825.7, + "kl_loss_7": 3155.2, + "learning_rate": 0.00045089274187848144, + "loss": 3837.25, + "step": 5360 + }, + { + "ce_loss_13": 2.607480788230896, + "ce_loss_26": 2.0540437757968903, + "ce_loss_39": 1.854740971326828, + "ce_loss_52": 1.4391999766230583, + "ce_loss_7": 2.955876570940018, + "epoch": 0.537, + "grad_norm": 15.654126474702101, + "kl_loss_13": 2406.0, + "kl_loss_26": 1230.1, + "kl_loss_39": 814.4, + "kl_loss_7": 3128.8, + "learning_rate": 0.00044931399991859835, + "loss": 3791.2, + "step": 5370 + }, + { + "ce_loss_13": 2.5806866496801377, + "ce_loss_26": 2.0246922999620436, + "ce_loss_39": 1.8295573323965073, + "ce_loss_52": 1.433498626947403, + "ce_loss_7": 2.932585430145264, + "epoch": 0.538, + "grad_norm": 15.050745413645357, + "kl_loss_13": 2377.8, + "kl_loss_26": 1193.3, + "kl_loss_39": 790.4, + "kl_loss_7": 3116.4, + "learning_rate": 0.00044773576836617336, + "loss": 3771.2, + "step": 5380 + }, + { + "ce_loss_13": 2.5772230982780457, + "ce_loss_26": 2.0075444668531417, + "ce_loss_39": 1.8076122283935547, + "ce_loss_52": 1.3957727670669555, + "ce_loss_7": 2.9269763708114622, + "epoch": 0.539, + "grad_norm": 14.739351688834923, + "kl_loss_13": 2418.2, + "kl_loss_26": 1217.6, + "kl_loss_39": 806.9, + "kl_loss_7": 3154.0, + "learning_rate": 0.00044615806311398056, + "loss": 3764.85, + "step": 5390 + }, + { + "ce_loss_13": 2.6164508730173113, + "ce_loss_26": 2.0489900171756745, + "ce_loss_39": 1.8477211087942123, + "ce_loss_52": 1.4378462180495262, + "ce_loss_7": 2.9610529631376266, + "epoch": 0.54, + "grad_norm": 15.257899541784896, + "kl_loss_13": 2422.8, + "kl_loss_26": 1222.4, + "kl_loss_39": 806.9, + "kl_loss_7": 3149.2, + "learning_rate": 0.00044458090004949454, + "loss": 3789.8, + "step": 5400 + }, + { + "ce_loss_13": 2.639938807487488, + "ce_loss_26": 2.078041157126427, + "ce_loss_39": 1.8733548551797867, + "ce_loss_52": 1.4663413792848587, + "ce_loss_7": 2.978548914194107, + "epoch": 0.541, + "grad_norm": 15.643709958040844, + "kl_loss_13": 2399.4, + "kl_loss_26": 1217.8, + "kl_loss_39": 804.5, + "kl_loss_7": 3111.2, + "learning_rate": 0.0004430042950547297, + "loss": 3775.4, + "step": 5410 + }, + { + "ce_loss_13": 2.6770710349082947, + "ce_loss_26": 2.110589724779129, + "ce_loss_39": 1.9134115755558014, + "ce_loss_52": 1.5031546354293823, + "ce_loss_7": 3.0216497242450715, + "epoch": 0.542, + "grad_norm": 14.919746196312753, + "kl_loss_13": 2411.6, + "kl_loss_26": 1216.6, + "kl_loss_39": 809.1, + "kl_loss_7": 3147.6, + "learning_rate": 0.0004414282640060809, + "loss": 3768.95, + "step": 5420 + }, + { + "ce_loss_13": 2.6296884536743166, + "ce_loss_26": 2.0768262147903442, + "ce_loss_39": 1.8766031116247177, + "ce_loss_52": 1.4701504305005073, + "ce_loss_7": 2.9714462876319887, + "epoch": 0.543, + "grad_norm": 14.109291634946612, + "kl_loss_13": 2361.6, + "kl_loss_26": 1207.6, + "kl_loss_39": 794.5, + "kl_loss_7": 3073.6, + "learning_rate": 0.0004398528227741633, + "loss": 3734.5, + "step": 5430 + }, + { + "ce_loss_13": 2.5891071379184725, + "ce_loss_26": 2.024920642375946, + "ce_loss_39": 1.8224879026412963, + "ce_loss_52": 1.4225929498672485, + "ce_loss_7": 2.939086824655533, + "epoch": 0.544, + "grad_norm": 15.182283952769808, + "kl_loss_13": 2384.4, + "kl_loss_26": 1201.0, + "kl_loss_39": 785.9, + "kl_loss_7": 3118.0, + "learning_rate": 0.00043827798722365264, + "loss": 3724.5, + "step": 5440 + }, + { + "ce_loss_13": 2.521664083003998, + "ce_loss_26": 1.9635723412036896, + "ce_loss_39": 1.776253479719162, + "ce_loss_52": 1.3777382284402848, + "ce_loss_7": 2.8684565305709837, + "epoch": 0.545, + "grad_norm": 14.678349850224247, + "kl_loss_13": 2357.4, + "kl_loss_26": 1171.5, + "kl_loss_39": 776.8, + "kl_loss_7": 3081.2, + "learning_rate": 0.00043670377321312535, + "loss": 3748.6, + "step": 5450 + }, + { + "ce_loss_13": 2.6282208263874054, + "ce_loss_26": 2.063512918353081, + "ce_loss_39": 1.8621816724538802, + "ce_loss_52": 1.4465220913290977, + "ce_loss_7": 2.9661788761615755, + "epoch": 0.546, + "grad_norm": 14.95327030090302, + "kl_loss_13": 2410.4, + "kl_loss_26": 1226.2, + "kl_loss_39": 814.8, + "kl_loss_7": 3130.0, + "learning_rate": 0.0004351301965948991, + "loss": 3750.5, + "step": 5460 + }, + { + "ce_loss_13": 2.6448668360710146, + "ce_loss_26": 2.092715525627136, + "ce_loss_39": 1.8841570675373078, + "ce_loss_52": 1.465877577662468, + "ce_loss_7": 2.983266705274582, + "epoch": 0.547, + "grad_norm": 14.464098395004918, + "kl_loss_13": 2427.2, + "kl_loss_26": 1254.5, + "kl_loss_39": 827.1, + "kl_loss_7": 3138.8, + "learning_rate": 0.000433557273214873, + "loss": 3760.7, + "step": 5470 + }, + { + "ce_loss_13": 2.5597914129495623, + "ce_loss_26": 2.0002847105264663, + "ce_loss_39": 1.8126664906740189, + "ce_loss_52": 1.4252239495515824, + "ce_loss_7": 2.89780033826828, + "epoch": 0.548, + "grad_norm": 14.511201984799017, + "kl_loss_13": 2336.4, + "kl_loss_26": 1154.4, + "kl_loss_39": 760.7, + "kl_loss_7": 3046.4, + "learning_rate": 0.000431985018912368, + "loss": 3744.2, + "step": 5480 + }, + { + "ce_loss_13": 2.556107610464096, + "ce_loss_26": 2.0089694380760195, + "ce_loss_39": 1.8094982028007507, + "ce_loss_52": 1.4210678458213806, + "ce_loss_7": 2.8992061018943787, + "epoch": 0.549, + "grad_norm": 14.88322732328834, + "kl_loss_13": 2332.4, + "kl_loss_26": 1182.7, + "kl_loss_39": 771.7, + "kl_loss_7": 3056.4, + "learning_rate": 0.0004304134495199674, + "loss": 3725.95, + "step": 5490 + }, + { + "ce_loss_13": 2.5557271778583526, + "ce_loss_26": 2.015131750702858, + "ce_loss_39": 1.8132081747055053, + "ce_loss_52": 1.4253456503152848, + "ce_loss_7": 2.8894282221794128, + "epoch": 0.55, + "grad_norm": 14.509766958879414, + "kl_loss_13": 2342.4, + "kl_loss_26": 1192.8, + "kl_loss_39": 776.15, + "kl_loss_7": 3041.2, + "learning_rate": 0.0004288425808633575, + "loss": 3690.5, + "step": 5500 + }, + { + "ce_loss_13": 2.6498306572437285, + "ce_loss_26": 2.0864265114068985, + "ce_loss_39": 1.884456393122673, + "ce_loss_52": 1.4724615901708602, + "ce_loss_7": 2.994233113527298, + "epoch": 0.551, + "grad_norm": 15.223257504626806, + "kl_loss_13": 2419.8, + "kl_loss_26": 1227.5, + "kl_loss_39": 805.6, + "kl_loss_7": 3137.2, + "learning_rate": 0.0004272724287611684, + "loss": 3719.95, + "step": 5510 + }, + { + "ce_loss_13": 2.6063999772071837, + "ce_loss_26": 2.0453016996383666, + "ce_loss_39": 1.8452126443386079, + "ce_loss_52": 1.4395337477326393, + "ce_loss_7": 2.9420916736125946, + "epoch": 0.552, + "grad_norm": 14.707511636553823, + "kl_loss_13": 2389.2, + "kl_loss_26": 1204.1, + "kl_loss_39": 791.2, + "kl_loss_7": 3102.4, + "learning_rate": 0.00042570300902481425, + "loss": 3704.35, + "step": 5520 + }, + { + "ce_loss_13": 2.563195550441742, + "ce_loss_26": 1.9973567068576812, + "ce_loss_39": 1.7899492472410201, + "ce_loss_52": 1.3841874808073045, + "ce_loss_7": 2.9130555033683776, + "epoch": 0.553, + "grad_norm": 14.788339735855649, + "kl_loss_13": 2417.4, + "kl_loss_26": 1232.3, + "kl_loss_39": 809.2, + "kl_loss_7": 3150.4, + "learning_rate": 0.00042413433745833423, + "loss": 3716.65, + "step": 5530 + }, + { + "ce_loss_13": 2.5711400628089907, + "ce_loss_26": 2.0185573011636735, + "ce_loss_39": 1.8240427374839783, + "ce_loss_52": 1.4254193544387816, + "ce_loss_7": 2.9173736214637755, + "epoch": 0.554, + "grad_norm": 14.905176252765152, + "kl_loss_13": 2345.2, + "kl_loss_26": 1182.6, + "kl_loss_39": 781.0, + "kl_loss_7": 3069.2, + "learning_rate": 0.0004225664298582339, + "loss": 3692.3, + "step": 5540 + }, + { + "ce_loss_13": 2.5974901139736177, + "ce_loss_26": 2.046999195218086, + "ce_loss_39": 1.8451066941022873, + "ce_loss_52": 1.436999562382698, + "ce_loss_7": 2.9430564284324645, + "epoch": 0.555, + "grad_norm": 16.142335203190324, + "kl_loss_13": 2402.8, + "kl_loss_26": 1219.2, + "kl_loss_39": 797.9, + "kl_loss_7": 3125.6, + "learning_rate": 0.000420999302013325, + "loss": 3720.0, + "step": 5550 + }, + { + "ce_loss_13": 2.5756935298442842, + "ce_loss_26": 2.00499467253685, + "ce_loss_39": 1.8017524302005767, + "ce_loss_52": 1.4018217638134955, + "ce_loss_7": 2.9280818104743958, + "epoch": 0.556, + "grad_norm": 15.838882130746192, + "kl_loss_13": 2404.2, + "kl_loss_26": 1195.7, + "kl_loss_39": 787.5, + "kl_loss_7": 3148.8, + "learning_rate": 0.000419432969704568, + "loss": 3744.35, + "step": 5560 + }, + { + "ce_loss_13": 2.6485643923282622, + "ce_loss_26": 2.0993672519922257, + "ce_loss_39": 1.8959094911813736, + "ce_loss_52": 1.4813728883862496, + "ce_loss_7": 2.994894337654114, + "epoch": 0.557, + "grad_norm": 14.30299821768554, + "kl_loss_13": 2404.0, + "kl_loss_26": 1234.2, + "kl_loss_39": 810.35, + "kl_loss_7": 3130.8, + "learning_rate": 0.00041786744870491154, + "loss": 3698.4, + "step": 5570 + }, + { + "ce_loss_13": 2.6594822227954866, + "ce_loss_26": 2.0962711691856386, + "ce_loss_39": 1.8888987362384797, + "ce_loss_52": 1.477835801243782, + "ce_loss_7": 3.003460741043091, + "epoch": 0.558, + "grad_norm": 14.927142462067188, + "kl_loss_13": 2421.4, + "kl_loss_26": 1232.3, + "kl_loss_39": 804.5, + "kl_loss_7": 3142.8, + "learning_rate": 0.0004163027547791347, + "loss": 3696.4, + "step": 5580 + }, + { + "ce_loss_13": 2.6033630073070526, + "ce_loss_26": 2.052795875072479, + "ce_loss_39": 1.8524764776229858, + "ce_loss_52": 1.4589103490114212, + "ce_loss_7": 2.9362357556819916, + "epoch": 0.559, + "grad_norm": 15.035071624412899, + "kl_loss_13": 2332.2, + "kl_loss_26": 1173.9, + "kl_loss_39": 769.3, + "kl_loss_7": 3041.6, + "learning_rate": 0.0004147389036836881, + "loss": 3676.85, + "step": 5590 + }, + { + "ce_loss_13": 2.5536737203598023, + "ce_loss_26": 2.0111388891935347, + "ce_loss_39": 1.8144014358520508, + "ce_loss_52": 1.4328835308551788, + "ce_loss_7": 2.88922523856163, + "epoch": 0.56, + "grad_norm": 14.534788987143044, + "kl_loss_13": 2316.6, + "kl_loss_26": 1164.9, + "kl_loss_39": 758.2, + "kl_loss_7": 3024.4, + "learning_rate": 0.00041317591116653486, + "loss": 3694.55, + "step": 5600 + }, + { + "ce_loss_13": 2.606983852386475, + "ce_loss_26": 2.032891970872879, + "ce_loss_39": 1.8340051174163818, + "ce_loss_52": 1.4310238301753997, + "ce_loss_7": 2.9573661506175997, + "epoch": 0.561, + "grad_norm": 16.216124841765133, + "kl_loss_13": 2427.2, + "kl_loss_26": 1205.0, + "kl_loss_39": 794.3, + "kl_loss_7": 3156.0, + "learning_rate": 0.0004116137929669921, + "loss": 3679.35, + "step": 5610 + }, + { + "ce_loss_13": 2.5370231360197066, + "ce_loss_26": 1.9963056713342666, + "ce_loss_39": 1.803014099597931, + "ce_loss_52": 1.4229481190443038, + "ce_loss_7": 2.8732429146766663, + "epoch": 0.562, + "grad_norm": 15.330886130898222, + "kl_loss_13": 2277.4, + "kl_loss_26": 1124.3, + "kl_loss_39": 729.3, + "kl_loss_7": 2988.8, + "learning_rate": 0.00041005256481557305, + "loss": 3673.5, + "step": 5620 + }, + { + "ce_loss_13": 2.6017677545547486, + "ce_loss_26": 2.0595314621925356, + "ce_loss_39": 1.8631951808929443, + "ce_loss_52": 1.4627325683832169, + "ce_loss_7": 2.9314393043518066, + "epoch": 0.563, + "grad_norm": 14.465862339922571, + "kl_loss_13": 2332.2, + "kl_loss_26": 1190.7, + "kl_loss_39": 781.3, + "kl_loss_7": 3032.0, + "learning_rate": 0.00040849224243382767, + "loss": 3672.9, + "step": 5630 + }, + { + "ce_loss_13": 2.5594456523656843, + "ce_loss_26": 2.00023832321167, + "ce_loss_39": 1.8007198423147202, + "ce_loss_52": 1.4131150737404823, + "ce_loss_7": 2.904243141412735, + "epoch": 0.564, + "grad_norm": 15.017201171045896, + "kl_loss_13": 2360.2, + "kl_loss_26": 1180.4, + "kl_loss_39": 768.1, + "kl_loss_7": 3075.6, + "learning_rate": 0.000406932841534185, + "loss": 3693.75, + "step": 5640 + }, + { + "ce_loss_13": 2.595109748840332, + "ce_loss_26": 2.045265626907349, + "ce_loss_39": 1.85684574842453, + "ce_loss_52": 1.4735014230012893, + "ce_loss_7": 2.943417179584503, + "epoch": 0.565, + "grad_norm": 15.364099861784982, + "kl_loss_13": 2322.0, + "kl_loss_26": 1155.7, + "kl_loss_39": 755.4, + "kl_loss_7": 3048.4, + "learning_rate": 0.0004053743778197951, + "loss": 3668.9, + "step": 5650 + }, + { + "ce_loss_13": 2.582511156797409, + "ce_loss_26": 2.0224455118179323, + "ce_loss_39": 1.8295785069465638, + "ce_loss_52": 1.4373956888914108, + "ce_loss_7": 2.9102243304252626, + "epoch": 0.566, + "grad_norm": 14.693957764502153, + "kl_loss_13": 2342.4, + "kl_loss_26": 1176.8, + "kl_loss_39": 776.5, + "kl_loss_7": 3046.4, + "learning_rate": 0.0004038168669843697, + "loss": 3650.65, + "step": 5660 + }, + { + "ce_loss_13": 2.603584831953049, + "ce_loss_26": 2.04022336602211, + "ce_loss_39": 1.8417048037052155, + "ce_loss_52": 1.447445745766163, + "ce_loss_7": 2.9460166096687317, + "epoch": 0.567, + "grad_norm": 15.203721551974317, + "kl_loss_13": 2379.4, + "kl_loss_26": 1187.3, + "kl_loss_39": 777.4, + "kl_loss_7": 3104.8, + "learning_rate": 0.000402260324712026, + "loss": 3688.75, + "step": 5670 + }, + { + "ce_loss_13": 2.526816266775131, + "ce_loss_26": 1.9893273174762727, + "ce_loss_39": 1.793796670436859, + "ce_loss_52": 1.4289553046226502, + "ce_loss_7": 2.8634801030159, + "epoch": 0.568, + "grad_norm": 14.842310660649245, + "kl_loss_13": 2254.2, + "kl_loss_26": 1116.4, + "kl_loss_39": 714.95, + "kl_loss_7": 2954.8, + "learning_rate": 0.00040070476667712743, + "loss": 3637.75, + "step": 5680 + }, + { + "ce_loss_13": 2.615302687883377, + "ce_loss_26": 2.059708908200264, + "ce_loss_39": 1.8555728137493133, + "ce_loss_52": 1.4557225406169891, + "ce_loss_7": 2.9577668845653533, + "epoch": 0.569, + "grad_norm": 14.742701130409761, + "kl_loss_13": 2387.6, + "kl_loss_26": 1214.9, + "kl_loss_39": 792.5, + "kl_loss_7": 3105.2, + "learning_rate": 0.0003991502085441259, + "loss": 3676.05, + "step": 5690 + }, + { + "ce_loss_13": 2.5645705699920653, + "ce_loss_26": 2.007223817706108, + "ce_loss_39": 1.8173159271478654, + "ce_loss_52": 1.4376816004514694, + "ce_loss_7": 2.895137590169907, + "epoch": 0.57, + "grad_norm": 15.460594692772787, + "kl_loss_13": 2314.8, + "kl_loss_26": 1152.0, + "kl_loss_39": 753.2, + "kl_loss_7": 3024.8, + "learning_rate": 0.0003975966659674047, + "loss": 3621.95, + "step": 5700 + }, + { + "ce_loss_13": 2.559953585267067, + "ce_loss_26": 2.032286322116852, + "ce_loss_39": 1.845168125629425, + "ce_loss_52": 1.461988940834999, + "ce_loss_7": 2.9015457332134247, + "epoch": 0.571, + "grad_norm": 15.171579029800053, + "kl_loss_13": 2278.8, + "kl_loss_26": 1159.2, + "kl_loss_39": 758.6, + "kl_loss_7": 2982.0, + "learning_rate": 0.0003960441545911204, + "loss": 3675.95, + "step": 5710 + }, + { + "ce_loss_13": 2.6008632302284242, + "ce_loss_26": 2.043924775719643, + "ce_loss_39": 1.8485127180814742, + "ce_loss_52": 1.4697089165449142, + "ce_loss_7": 2.9355547785758973, + "epoch": 0.572, + "grad_norm": 14.834558653485171, + "kl_loss_13": 2319.4, + "kl_loss_26": 1152.8, + "kl_loss_39": 750.0, + "kl_loss_7": 3027.6, + "learning_rate": 0.0003944926900490452, + "loss": 3638.65, + "step": 5720 + }, + { + "ce_loss_13": 2.532833296060562, + "ce_loss_26": 1.9711334377527236, + "ce_loss_39": 1.77955681681633, + "ce_loss_52": 1.4007374957203864, + "ce_loss_7": 2.8781362950801848, + "epoch": 0.573, + "grad_norm": 16.10932164493431, + "kl_loss_13": 2337.8, + "kl_loss_26": 1151.9, + "kl_loss_39": 757.3, + "kl_loss_7": 3064.8, + "learning_rate": 0.0003929422879644099, + "loss": 3650.2, + "step": 5730 + }, + { + "ce_loss_13": 2.5908755481243135, + "ce_loss_26": 2.0414842426776887, + "ce_loss_39": 1.8504431873559952, + "ce_loss_52": 1.4606543123722076, + "ce_loss_7": 2.926515054702759, + "epoch": 0.574, + "grad_norm": 14.72871950232802, + "kl_loss_13": 2333.4, + "kl_loss_26": 1164.1, + "kl_loss_39": 763.5, + "kl_loss_7": 3044.4, + "learning_rate": 0.0003913929639497462, + "loss": 3615.45, + "step": 5740 + }, + { + "ce_loss_13": 2.591219651699066, + "ce_loss_26": 2.044692638516426, + "ce_loss_39": 1.8468156188726426, + "ce_loss_52": 1.452781331539154, + "ce_loss_7": 2.932692265510559, + "epoch": 0.575, + "grad_norm": 14.536189899304834, + "kl_loss_13": 2345.4, + "kl_loss_26": 1189.2, + "kl_loss_39": 775.0, + "kl_loss_7": 3054.8, + "learning_rate": 0.00038984473360672965, + "loss": 3631.3, + "step": 5750 + }, + { + "ce_loss_13": 2.555169379711151, + "ce_loss_26": 2.0133784860372543, + "ce_loss_39": 1.8234185576438904, + "ce_loss_52": 1.4436144948005676, + "ce_loss_7": 2.896002060174942, + "epoch": 0.576, + "grad_norm": 15.34263044515968, + "kl_loss_13": 2285.0, + "kl_loss_26": 1136.7, + "kl_loss_39": 742.35, + "kl_loss_7": 2993.2, + "learning_rate": 0.0003882976125260229, + "loss": 3658.4, + "step": 5760 + }, + { + "ce_loss_13": 2.502971774339676, + "ce_loss_26": 1.9493688374757767, + "ce_loss_39": 1.7570990473031998, + "ce_loss_52": 1.3885455280542374, + "ce_loss_7": 2.8447738111019136, + "epoch": 0.577, + "grad_norm": 14.660193254554198, + "kl_loss_13": 2295.0, + "kl_loss_26": 1140.5, + "kl_loss_39": 734.0, + "kl_loss_7": 3013.6, + "learning_rate": 0.00038675161628711776, + "loss": 3632.8, + "step": 5770 + }, + { + "ce_loss_13": 2.5624574303627012, + "ce_loss_26": 2.0051666617393495, + "ce_loss_39": 1.8087779253721237, + "ce_loss_52": 1.41810100376606, + "ce_loss_7": 2.895737165212631, + "epoch": 0.578, + "grad_norm": 14.482088330386649, + "kl_loss_13": 2343.4, + "kl_loss_26": 1176.1, + "kl_loss_39": 766.9, + "kl_loss_7": 3047.6, + "learning_rate": 0.0003852067604581794, + "loss": 3602.85, + "step": 5780 + }, + { + "ce_loss_13": 2.5270320236682893, + "ce_loss_26": 1.9888224333524704, + "ce_loss_39": 1.803659090399742, + "ce_loss_52": 1.428774857521057, + "ce_loss_7": 2.8705179512500765, + "epoch": 0.579, + "grad_norm": 14.979884111866252, + "kl_loss_13": 2273.2, + "kl_loss_26": 1136.8, + "kl_loss_39": 739.9, + "kl_loss_7": 2982.8, + "learning_rate": 0.0003836630605958888, + "loss": 3603.35, + "step": 5790 + }, + { + "ce_loss_13": 2.5794149696826936, + "ce_loss_26": 2.0252732813358305, + "ce_loss_39": 1.8308797210454941, + "ce_loss_52": 1.4384155124425888, + "ce_loss_7": 2.9223524034023285, + "epoch": 0.58, + "grad_norm": 14.8686548037009, + "kl_loss_13": 2322.0, + "kl_loss_26": 1158.7, + "kl_loss_39": 758.3, + "kl_loss_7": 3044.0, + "learning_rate": 0.0003821205322452863, + "loss": 3636.15, + "step": 5800 + }, + { + "ce_loss_13": 2.6002571165561674, + "ce_loss_26": 2.0497290968894957, + "ce_loss_39": 1.8579988300800323, + "ce_loss_52": 1.4732319116592407, + "ce_loss_7": 2.9367463052272798, + "epoch": 0.581, + "grad_norm": 15.52768956484863, + "kl_loss_13": 2303.2, + "kl_loss_26": 1143.4, + "kl_loss_39": 743.0, + "kl_loss_7": 3018.4, + "learning_rate": 0.0003805791909396155, + "loss": 3651.1, + "step": 5810 + }, + { + "ce_loss_13": 2.5309071093797684, + "ce_loss_26": 1.9803194522857666, + "ce_loss_39": 1.7861102789640426, + "ce_loss_52": 1.410745631158352, + "ce_loss_7": 2.8677128195762633, + "epoch": 0.582, + "grad_norm": 14.66111038706468, + "kl_loss_13": 2302.0, + "kl_loss_26": 1148.3, + "kl_loss_39": 748.9, + "kl_loss_7": 3008.0, + "learning_rate": 0.0003790390522001662, + "loss": 3564.65, + "step": 5820 + }, + { + "ce_loss_13": 2.5017096638679504, + "ce_loss_26": 1.94625324010849, + "ce_loss_39": 1.749154046177864, + "ce_loss_52": 1.3770527362823486, + "ce_loss_7": 2.8349112212657928, + "epoch": 0.583, + "grad_norm": 14.629798672555188, + "kl_loss_13": 2294.6, + "kl_loss_26": 1141.8, + "kl_loss_39": 731.9, + "kl_loss_7": 3003.2, + "learning_rate": 0.0003775001315361183, + "loss": 3613.35, + "step": 5830 + }, + { + "ce_loss_13": 2.5965979039669036, + "ce_loss_26": 2.0513822197914124, + "ce_loss_39": 1.8568279683589934, + "ce_loss_52": 1.486306893825531, + "ce_loss_7": 2.932318705320358, + "epoch": 0.584, + "grad_norm": 15.483089802029363, + "kl_loss_13": 2285.6, + "kl_loss_26": 1133.6, + "kl_loss_39": 729.5, + "kl_loss_7": 2989.2, + "learning_rate": 0.0003759624444443858, + "loss": 3579.55, + "step": 5840 + }, + { + "ce_loss_13": 2.584410917758942, + "ce_loss_26": 2.022917777299881, + "ce_loss_39": 1.8257231026887895, + "ce_loss_52": 1.437566375732422, + "ce_loss_7": 2.9283429443836213, + "epoch": 0.585, + "grad_norm": 15.042900713620126, + "kl_loss_13": 2353.8, + "kl_loss_26": 1173.8, + "kl_loss_39": 763.0, + "kl_loss_7": 3075.6, + "learning_rate": 0.00037442600640946044, + "loss": 3619.6, + "step": 5850 + }, + { + "ce_loss_13": 2.5062096178531648, + "ce_loss_26": 1.9529441505670548, + "ce_loss_39": 1.756454050540924, + "ce_loss_52": 1.3877734661102294, + "ce_loss_7": 2.852471035718918, + "epoch": 0.586, + "grad_norm": 15.637281763013858, + "kl_loss_13": 2307.4, + "kl_loss_26": 1145.7, + "kl_loss_39": 736.7, + "kl_loss_7": 3028.0, + "learning_rate": 0.00037289083290325663, + "loss": 3605.0, + "step": 5860 + }, + { + "ce_loss_13": 2.550817745923996, + "ce_loss_26": 2.0112617135047914, + "ce_loss_39": 1.8207211345434189, + "ce_loss_52": 1.4542410910129546, + "ce_loss_7": 2.8887066781520843, + "epoch": 0.587, + "grad_norm": 14.48910345968502, + "kl_loss_13": 2241.0, + "kl_loss_26": 1095.1, + "kl_loss_39": 709.9, + "kl_loss_7": 2950.4, + "learning_rate": 0.0003713569393849543, + "loss": 3628.55, + "step": 5870 + }, + { + "ce_loss_13": 2.5198557287454606, + "ce_loss_26": 1.9734882295131684, + "ce_loss_39": 1.7831378549337387, + "ce_loss_52": 1.4195168539881706, + "ce_loss_7": 2.8633585631847382, + "epoch": 0.588, + "grad_norm": 14.606482629979089, + "kl_loss_13": 2270.0, + "kl_loss_26": 1121.9, + "kl_loss_39": 724.8, + "kl_loss_7": 2990.8, + "learning_rate": 0.00036982434130084397, + "loss": 3605.15, + "step": 5880 + }, + { + "ce_loss_13": 2.5002260982990263, + "ce_loss_26": 1.9510222643613815, + "ce_loss_39": 1.760866141319275, + "ce_loss_52": 1.3993624940514564, + "ce_loss_7": 2.8445383846759795, + "epoch": 0.589, + "grad_norm": 15.157934768830916, + "kl_loss_13": 2246.4, + "kl_loss_26": 1094.8, + "kl_loss_39": 699.8, + "kl_loss_7": 2967.6, + "learning_rate": 0.00036829305408417166, + "loss": 3580.45, + "step": 5890 + }, + { + "ce_loss_13": 2.4792177438735963, + "ce_loss_26": 1.9329254776239395, + "ce_loss_39": 1.7395812034606934, + "ce_loss_52": 1.3731168687343598, + "ce_loss_7": 2.8229923218488695, + "epoch": 0.59, + "grad_norm": 14.656512940113272, + "kl_loss_13": 2271.0, + "kl_loss_26": 1112.9, + "kl_loss_39": 720.45, + "kl_loss_7": 2995.2, + "learning_rate": 0.0003667630931549826, + "loss": 3601.65, + "step": 5900 + }, + { + "ce_loss_13": 2.6107949793338774, + "ce_loss_26": 2.0506039649248122, + "ce_loss_39": 1.8547100484371186, + "ce_loss_52": 1.4670722007751464, + "ce_loss_7": 2.947498029470444, + "epoch": 0.591, + "grad_norm": 15.535800254061792, + "kl_loss_13": 2341.4, + "kl_loss_26": 1168.3, + "kl_loss_39": 753.2, + "kl_loss_7": 3065.2, + "learning_rate": 0.00036523447391996613, + "loss": 3580.55, + "step": 5910 + }, + { + "ce_loss_13": 2.5217359244823454, + "ce_loss_26": 1.988610166311264, + "ce_loss_39": 1.8043963432312011, + "ce_loss_52": 1.4320271372795106, + "ce_loss_7": 2.8523899018764496, + "epoch": 0.592, + "grad_norm": 15.144566757103092, + "kl_loss_13": 2242.6, + "kl_loss_26": 1108.7, + "kl_loss_39": 722.4, + "kl_loss_7": 2935.6, + "learning_rate": 0.00036370721177230114, + "loss": 3609.65, + "step": 5920 + }, + { + "ce_loss_13": 2.5657771229743958, + "ce_loss_26": 2.024441570043564, + "ce_loss_39": 1.8324565082788467, + "ce_loss_52": 1.4527549773454667, + "ce_loss_7": 2.9029002487659454, + "epoch": 0.593, + "grad_norm": 14.305280391890491, + "kl_loss_13": 2257.4, + "kl_loss_26": 1128.5, + "kl_loss_39": 727.25, + "kl_loss_7": 2971.6, + "learning_rate": 0.00036218132209150044, + "loss": 3561.25, + "step": 5930 + }, + { + "ce_loss_13": 2.5320515751838686, + "ce_loss_26": 1.993978601694107, + "ce_loss_39": 1.7951736986637115, + "ce_loss_52": 1.4332455009222032, + "ce_loss_7": 2.8734976410865785, + "epoch": 0.594, + "grad_norm": 14.731628028248535, + "kl_loss_13": 2276.0, + "kl_loss_26": 1134.7, + "kl_loss_39": 725.65, + "kl_loss_7": 2984.8, + "learning_rate": 0.0003606568202432562, + "loss": 3568.15, + "step": 5940 + }, + { + "ce_loss_13": 2.467684972286224, + "ce_loss_26": 1.9258863091468812, + "ce_loss_39": 1.7374547556042672, + "ce_loss_52": 1.3822436913847924, + "ce_loss_7": 2.8027748644351957, + "epoch": 0.595, + "grad_norm": 13.952374257617315, + "kl_loss_13": 2221.6, + "kl_loss_26": 1099.3, + "kl_loss_39": 702.85, + "kl_loss_7": 2916.8, + "learning_rate": 0.0003591337215792851, + "loss": 3573.15, + "step": 5950 + }, + { + "ce_loss_13": 2.5612458407878878, + "ce_loss_26": 2.005467265844345, + "ce_loss_39": 1.8046582967042923, + "ce_loss_52": 1.4227981299161911, + "ce_loss_7": 2.9120794773101806, + "epoch": 0.596, + "grad_norm": 14.684621517642583, + "kl_loss_13": 2333.2, + "kl_loss_26": 1167.6, + "kl_loss_39": 756.5, + "kl_loss_7": 3067.6, + "learning_rate": 0.00035761204143717383, + "loss": 3598.3, + "step": 5960 + }, + { + "ce_loss_13": 2.539260357618332, + "ce_loss_26": 1.997820395231247, + "ce_loss_39": 1.7995415717363357, + "ce_loss_52": 1.4228885769844055, + "ce_loss_7": 2.8742454588413238, + "epoch": 0.597, + "grad_norm": 14.834521968922566, + "kl_loss_13": 2285.6, + "kl_loss_26": 1143.0, + "kl_loss_39": 733.0, + "kl_loss_7": 2992.4, + "learning_rate": 0.0003560917951402245, + "loss": 3549.75, + "step": 5970 + }, + { + "ce_loss_13": 2.514337483048439, + "ce_loss_26": 1.9719054281711579, + "ce_loss_39": 1.7803026676177978, + "ce_loss_52": 1.4242254197597504, + "ce_loss_7": 2.8526304841041563, + "epoch": 0.598, + "grad_norm": 15.184376306356116, + "kl_loss_13": 2262.4, + "kl_loss_26": 1100.6, + "kl_loss_39": 700.55, + "kl_loss_7": 2977.6, + "learning_rate": 0.00035457299799730046, + "loss": 3595.65, + "step": 5980 + }, + { + "ce_loss_13": 2.5518115133047106, + "ce_loss_26": 2.011714455485344, + "ce_loss_39": 1.8179199546575546, + "ce_loss_52": 1.4457479059696197, + "ce_loss_7": 2.903665816783905, + "epoch": 0.599, + "grad_norm": 17.70688578121031, + "kl_loss_13": 2293.4, + "kl_loss_26": 1146.1, + "kl_loss_39": 736.9, + "kl_loss_7": 3022.8, + "learning_rate": 0.0003530556653026721, + "loss": 3553.45, + "step": 5990 + }, + { + "ce_loss_13": 2.5623465538024903, + "ce_loss_26": 2.0235190600156785, + "ce_loss_39": 1.8284282714128495, + "ce_loss_52": 1.4427544534206391, + "ce_loss_7": 2.8994126319885254, + "epoch": 0.6, + "grad_norm": 14.620466600171055, + "kl_loss_13": 2310.4, + "kl_loss_26": 1152.8, + "kl_loss_39": 754.8, + "kl_loss_7": 3022.0, + "learning_rate": 0.00035153981233586274, + "loss": 3592.9, + "step": 6000 + }, + { + "ce_loss_13": 2.592492914199829, + "ce_loss_26": 2.0241310060024262, + "ce_loss_39": 1.8282486945390701, + "ce_loss_52": 1.4501359939575196, + "ce_loss_7": 2.934645599126816, + "epoch": 0.601, + "grad_norm": 15.179282109125554, + "kl_loss_13": 2355.8, + "kl_loss_26": 1159.4, + "kl_loss_39": 745.9, + "kl_loss_7": 3082.4, + "learning_rate": 0.00035002545436149473, + "loss": 3551.8, + "step": 6010 + }, + { + "ce_loss_13": 2.4968257695436478, + "ce_loss_26": 1.950178360939026, + "ce_loss_39": 1.7565111339092254, + "ce_loss_52": 1.3940304026007653, + "ce_loss_7": 2.8332657337188722, + "epoch": 0.602, + "grad_norm": 15.101566569350041, + "kl_loss_13": 2271.8, + "kl_loss_26": 1114.8, + "kl_loss_39": 717.3, + "kl_loss_7": 2986.8, + "learning_rate": 0.0003485126066291364, + "loss": 3553.3, + "step": 6020 + }, + { + "ce_loss_13": 2.5295323967933654, + "ce_loss_26": 1.9963963776826859, + "ce_loss_39": 1.8044916093349457, + "ce_loss_52": 1.446289749443531, + "ce_loss_7": 2.8674661338329317, + "epoch": 0.603, + "grad_norm": 14.29880611806114, + "kl_loss_13": 2227.0, + "kl_loss_26": 1097.1, + "kl_loss_39": 702.25, + "kl_loss_7": 2938.0, + "learning_rate": 0.0003470012843731476, + "loss": 3534.85, + "step": 6030 + }, + { + "ce_loss_13": 2.498071011900902, + "ce_loss_26": 1.9493587136268615, + "ce_loss_39": 1.76174655854702, + "ce_loss_52": 1.407905325293541, + "ce_loss_7": 2.847205549478531, + "epoch": 0.604, + "grad_norm": 14.483859928846364, + "kl_loss_13": 2244.8, + "kl_loss_26": 1096.9, + "kl_loss_39": 701.8, + "kl_loss_7": 2975.2, + "learning_rate": 0.00034549150281252633, + "loss": 3514.35, + "step": 6040 + }, + { + "ce_loss_13": 2.4966412246227265, + "ce_loss_26": 1.9553426146507262, + "ce_loss_39": 1.7613533914089203, + "ce_loss_52": 1.4072722673416138, + "ce_loss_7": 2.8383829057216645, + "epoch": 0.605, + "grad_norm": 14.78851376475336, + "kl_loss_13": 2235.6, + "kl_loss_26": 1090.3, + "kl_loss_39": 695.0, + "kl_loss_7": 2954.0, + "learning_rate": 0.0003439832771507565, + "loss": 3563.65, + "step": 6050 + }, + { + "ce_loss_13": 2.502397668361664, + "ce_loss_26": 1.9571218103170396, + "ce_loss_39": 1.7665416598320007, + "ce_loss_52": 1.412816160917282, + "ce_loss_7": 2.8454441905021666, + "epoch": 0.606, + "grad_norm": 15.4871318885597, + "kl_loss_13": 2242.0, + "kl_loss_26": 1096.6, + "kl_loss_39": 696.5, + "kl_loss_7": 2962.4, + "learning_rate": 0.0003424766225756537, + "loss": 3510.25, + "step": 6060 + }, + { + "ce_loss_13": 2.5223917841911314, + "ce_loss_26": 1.9723280429840089, + "ce_loss_39": 1.7729612857103347, + "ce_loss_52": 1.4048843801021575, + "ce_loss_7": 2.8645106673240663, + "epoch": 0.607, + "grad_norm": 15.573595841239559, + "kl_loss_13": 2300.6, + "kl_loss_26": 1138.2, + "kl_loss_39": 727.85, + "kl_loss_7": 3019.6, + "learning_rate": 0.00034097155425921255, + "loss": 3527.0, + "step": 6070 + }, + { + "ce_loss_13": 2.491760790348053, + "ce_loss_26": 1.9636650770902633, + "ce_loss_39": 1.7728582590818405, + "ce_loss_52": 1.4259491577744483, + "ce_loss_7": 2.821820729970932, + "epoch": 0.608, + "grad_norm": 14.653123046442355, + "kl_loss_13": 2168.6, + "kl_loss_26": 1063.1, + "kl_loss_39": 679.3, + "kl_loss_7": 2864.4, + "learning_rate": 0.0003394680873574546, + "loss": 3528.9, + "step": 6080 + }, + { + "ce_loss_13": 2.5085246324539185, + "ce_loss_26": 1.9797437161207199, + "ce_loss_39": 1.7878784984350204, + "ce_loss_52": 1.4316603004932404, + "ce_loss_7": 2.847382205724716, + "epoch": 0.609, + "grad_norm": 14.913942335206821, + "kl_loss_13": 2211.2, + "kl_loss_26": 1091.8, + "kl_loss_39": 696.95, + "kl_loss_7": 2922.0, + "learning_rate": 0.0003379662370102747, + "loss": 3549.95, + "step": 6090 + }, + { + "ce_loss_13": 2.4759395986795427, + "ce_loss_26": 1.949927881360054, + "ce_loss_39": 1.7604204803705215, + "ce_loss_52": 1.41001408547163, + "ce_loss_7": 2.8064420104026793, + "epoch": 0.61, + "grad_norm": 14.676610718900648, + "kl_loss_13": 2185.6, + "kl_loss_26": 1077.2, + "kl_loss_39": 688.0, + "kl_loss_7": 2875.6, + "learning_rate": 0.0003364660183412892, + "loss": 3507.8, + "step": 6100 + }, + { + "ce_loss_13": 2.531206899881363, + "ce_loss_26": 1.9836675137281419, + "ce_loss_39": 1.7869503110647202, + "ce_loss_52": 1.432834729552269, + "ce_loss_7": 2.873502719402313, + "epoch": 0.611, + "grad_norm": 14.829577774305747, + "kl_loss_13": 2255.0, + "kl_loss_26": 1106.2, + "kl_loss_39": 703.75, + "kl_loss_7": 2966.4, + "learning_rate": 0.0003349674464576834, + "loss": 3495.25, + "step": 6110 + }, + { + "ce_loss_13": 2.514421796798706, + "ce_loss_26": 1.9697007417678833, + "ce_loss_39": 1.780946272611618, + "ce_loss_52": 1.4202388614416122, + "ce_loss_7": 2.8420049071311952, + "epoch": 0.612, + "grad_norm": 14.8111788086498, + "kl_loss_13": 2230.2, + "kl_loss_26": 1101.1, + "kl_loss_39": 705.2, + "kl_loss_7": 2920.0, + "learning_rate": 0.00033347053645005966, + "loss": 3492.65, + "step": 6120 + }, + { + "ce_loss_13": 2.537186449766159, + "ce_loss_26": 2.004375171661377, + "ce_loss_39": 1.8157330989837646, + "ce_loss_52": 1.451986312866211, + "ce_loss_7": 2.878436690568924, + "epoch": 0.613, + "grad_norm": 15.401404522918961, + "kl_loss_13": 2242.0, + "kl_loss_26": 1113.3, + "kl_loss_39": 718.55, + "kl_loss_7": 2945.6, + "learning_rate": 0.00033197530339228485, + "loss": 3459.1, + "step": 6130 + }, + { + "ce_loss_13": 2.506669583916664, + "ce_loss_26": 1.9629664570093155, + "ce_loss_39": 1.78002208173275, + "ce_loss_52": 1.4117600202560425, + "ce_loss_7": 2.847608286142349, + "epoch": 0.614, + "grad_norm": 15.390199704579972, + "kl_loss_13": 2259.8, + "kl_loss_26": 1112.7, + "kl_loss_39": 727.8, + "kl_loss_7": 2986.8, + "learning_rate": 0.00033048176234133967, + "loss": 3537.45, + "step": 6140 + }, + { + "ce_loss_13": 2.5588534235954286, + "ce_loss_26": 2.024568209052086, + "ce_loss_39": 1.8284731358289719, + "ce_loss_52": 1.4587910890579223, + "ce_loss_7": 2.8921659886837006, + "epoch": 0.615, + "grad_norm": 14.629003698363993, + "kl_loss_13": 2259.8, + "kl_loss_26": 1126.9, + "kl_loss_39": 724.0, + "kl_loss_7": 2958.8, + "learning_rate": 0.0003289899283371657, + "loss": 3536.8, + "step": 6150 + }, + { + "ce_loss_13": 2.4743064284324645, + "ce_loss_26": 1.9476153373718261, + "ce_loss_39": 1.7604322880506516, + "ce_loss_52": 1.4059417188167571, + "ce_loss_7": 2.7994522780179976, + "epoch": 0.616, + "grad_norm": 15.287517673195092, + "kl_loss_13": 2207.6, + "kl_loss_26": 1094.2, + "kl_loss_39": 708.05, + "kl_loss_7": 2892.4, + "learning_rate": 0.0003274998164025148, + "loss": 3522.0, + "step": 6160 + }, + { + "ce_loss_13": 2.61488196849823, + "ce_loss_26": 2.067342773079872, + "ce_loss_39": 1.8675355583429336, + "ce_loss_52": 1.4927641093730926, + "ce_loss_7": 2.948505789041519, + "epoch": 0.617, + "grad_norm": 14.482727532648099, + "kl_loss_13": 2288.0, + "kl_loss_26": 1139.7, + "kl_loss_39": 729.35, + "kl_loss_7": 2993.6, + "learning_rate": 0.0003260114415427975, + "loss": 3494.95, + "step": 6170 + }, + { + "ce_loss_13": 2.5525584638118746, + "ce_loss_26": 1.9941669285297394, + "ce_loss_39": 1.7988057792186738, + "ce_loss_52": 1.4276691943407058, + "ce_loss_7": 2.888332962989807, + "epoch": 0.618, + "grad_norm": 15.194650585699007, + "kl_loss_13": 2319.2, + "kl_loss_26": 1148.8, + "kl_loss_39": 742.3, + "kl_loss_7": 3022.8, + "learning_rate": 0.0003245248187459323, + "loss": 3535.8, + "step": 6180 + }, + { + "ce_loss_13": 2.5258300840854644, + "ce_loss_26": 1.9903143167495727, + "ce_loss_39": 1.8010086834430694, + "ce_loss_52": 1.4478329718112946, + "ce_loss_7": 2.86256263256073, + "epoch": 0.619, + "grad_norm": 14.652051426511987, + "kl_loss_13": 2228.6, + "kl_loss_26": 1088.0, + "kl_loss_39": 694.0, + "kl_loss_7": 2940.4, + "learning_rate": 0.00032303996298219416, + "loss": 3513.7, + "step": 6190 + }, + { + "ce_loss_13": 2.5733933985233306, + "ce_loss_26": 2.0336378514766693, + "ce_loss_39": 1.8403378069400786, + "ce_loss_52": 1.4681322902441025, + "ce_loss_7": 2.920198345184326, + "epoch": 0.62, + "grad_norm": 15.056132330426859, + "kl_loss_13": 2280.4, + "kl_loss_26": 1131.3, + "kl_loss_39": 733.75, + "kl_loss_7": 3002.8, + "learning_rate": 0.00032155688920406414, + "loss": 3507.7, + "step": 6200 + }, + { + "ce_loss_13": 2.5029452949762345, + "ce_loss_26": 1.9643093675374985, + "ce_loss_39": 1.7774474427103997, + "ce_loss_52": 1.4176854699850083, + "ce_loss_7": 2.823590323328972, + "epoch": 0.621, + "grad_norm": 14.49052860339056, + "kl_loss_13": 2220.0, + "kl_loss_26": 1086.1, + "kl_loss_39": 699.45, + "kl_loss_7": 2902.8, + "learning_rate": 0.0003200756123460788, + "loss": 3535.45, + "step": 6210 + }, + { + "ce_loss_13": 2.489814931154251, + "ce_loss_26": 1.9501473933458329, + "ce_loss_39": 1.7617685228586197, + "ce_loss_52": 1.41129230260849, + "ce_loss_7": 2.816985684633255, + "epoch": 0.622, + "grad_norm": 14.505478323074753, + "kl_loss_13": 2218.6, + "kl_loss_26": 1080.9, + "kl_loss_39": 689.55, + "kl_loss_7": 2918.0, + "learning_rate": 0.00031859614732467957, + "loss": 3488.95, + "step": 6220 + }, + { + "ce_loss_13": 2.5316161155700683, + "ce_loss_26": 2.0058958530426025, + "ce_loss_39": 1.8137048929929733, + "ce_loss_52": 1.461652959883213, + "ce_loss_7": 2.8647646605968475, + "epoch": 0.623, + "grad_norm": 13.917606472624449, + "kl_loss_13": 2204.8, + "kl_loss_26": 1085.2, + "kl_loss_39": 685.1, + "kl_loss_7": 2902.4, + "learning_rate": 0.00031711850903806275, + "loss": 3465.2, + "step": 6230 + }, + { + "ce_loss_13": 2.500165891647339, + "ce_loss_26": 1.9614384204149247, + "ce_loss_39": 1.771432462334633, + "ce_loss_52": 1.4076189696788788, + "ce_loss_7": 2.832933169603348, + "epoch": 0.624, + "grad_norm": 14.223738912104434, + "kl_loss_13": 2258.6, + "kl_loss_26": 1125.7, + "kl_loss_39": 725.1, + "kl_loss_7": 2956.8, + "learning_rate": 0.0003156427123660297, + "loss": 3486.3, + "step": 6240 + }, + { + "ce_loss_13": 2.5448601841926575, + "ce_loss_26": 2.000015211105347, + "ce_loss_39": 1.8015096932649612, + "ce_loss_52": 1.440242400765419, + "ce_loss_7": 2.878038114309311, + "epoch": 0.625, + "grad_norm": 14.471917182701668, + "kl_loss_13": 2262.4, + "kl_loss_26": 1124.4, + "kl_loss_39": 714.0, + "kl_loss_7": 2962.4, + "learning_rate": 0.0003141687721698363, + "loss": 3490.15, + "step": 6250 + }, + { + "ce_loss_13": 2.5199947118759156, + "ce_loss_26": 1.986537829041481, + "ce_loss_39": 1.796593463420868, + "ce_loss_52": 1.4425756543874741, + "ce_loss_7": 2.854223221540451, + "epoch": 0.626, + "grad_norm": 14.607284979944513, + "kl_loss_13": 2212.8, + "kl_loss_26": 1085.5, + "kl_loss_39": 686.9, + "kl_loss_7": 2911.6, + "learning_rate": 0.00031269670329204396, + "loss": 3493.3, + "step": 6260 + }, + { + "ce_loss_13": 2.5326361417770387, + "ce_loss_26": 2.002932494878769, + "ce_loss_39": 1.8195136040449142, + "ce_loss_52": 1.4736543655395509, + "ce_loss_7": 2.8653872847557067, + "epoch": 0.627, + "grad_norm": 13.814637143502924, + "kl_loss_13": 2172.2, + "kl_loss_26": 1054.9, + "kl_loss_39": 669.6, + "kl_loss_7": 2869.6, + "learning_rate": 0.00031122652055637015, + "loss": 3492.5, + "step": 6270 + }, + { + "ce_loss_13": 2.482167053222656, + "ce_loss_26": 1.9512667179107666, + "ce_loss_39": 1.7665533930063249, + "ce_loss_52": 1.42098408639431, + "ce_loss_7": 2.8168558061122893, + "epoch": 0.628, + "grad_norm": 16.02072851141737, + "kl_loss_13": 2193.0, + "kl_loss_26": 1069.4, + "kl_loss_39": 682.35, + "kl_loss_7": 2894.0, + "learning_rate": 0.0003097582387675385, + "loss": 3459.75, + "step": 6280 + }, + { + "ce_loss_13": 2.47237606048584, + "ce_loss_26": 1.95036241710186, + "ce_loss_39": 1.764494326710701, + "ce_loss_52": 1.4227848395705223, + "ce_loss_7": 2.8021757781505583, + "epoch": 0.629, + "grad_norm": 15.255934367745192, + "kl_loss_13": 2167.4, + "kl_loss_26": 1053.3, + "kl_loss_39": 664.9, + "kl_loss_7": 2858.8, + "learning_rate": 0.00030829187271113034, + "loss": 3446.7, + "step": 6290 + }, + { + "ce_loss_13": 2.501003822684288, + "ce_loss_26": 1.958929392695427, + "ce_loss_39": 1.76513631939888, + "ce_loss_52": 1.4046356767416, + "ce_loss_7": 2.8350019991397857, + "epoch": 0.63, + "grad_norm": 14.88714427947574, + "kl_loss_13": 2270.2, + "kl_loss_26": 1123.2, + "kl_loss_39": 716.8, + "kl_loss_7": 2970.0, + "learning_rate": 0.00030682743715343565, + "loss": 3508.45, + "step": 6300 + }, + { + "ce_loss_13": 2.5800569117069245, + "ce_loss_26": 2.0348214149475097, + "ce_loss_39": 1.8403811991214751, + "ce_loss_52": 1.476933541893959, + "ce_loss_7": 2.9134635806083677, + "epoch": 0.631, + "grad_norm": 14.94722074912583, + "kl_loss_13": 2245.0, + "kl_loss_26": 1107.2, + "kl_loss_39": 706.25, + "kl_loss_7": 2952.8, + "learning_rate": 0.0003053649468413043, + "loss": 3499.45, + "step": 6310 + }, + { + "ce_loss_13": 2.5208792209625246, + "ce_loss_26": 1.9822687000036239, + "ce_loss_39": 1.7960426419973374, + "ce_loss_52": 1.4455793976783753, + "ce_loss_7": 2.8597992181777956, + "epoch": 0.632, + "grad_norm": 15.453003389066977, + "kl_loss_13": 2216.2, + "kl_loss_26": 1070.1, + "kl_loss_39": 678.0, + "kl_loss_7": 2927.6, + "learning_rate": 0.00030390441650199725, + "loss": 3483.5, + "step": 6320 + }, + { + "ce_loss_13": 2.441950124502182, + "ce_loss_26": 1.91085105240345, + "ce_loss_39": 1.7248677492141724, + "ce_loss_52": 1.3885301351547241, + "ce_loss_7": 2.777034705877304, + "epoch": 0.633, + "grad_norm": 14.901626155092913, + "kl_loss_13": 2155.8, + "kl_loss_26": 1040.1, + "kl_loss_39": 653.2, + "kl_loss_7": 2859.2, + "learning_rate": 0.00030244586084303903, + "loss": 3433.35, + "step": 6330 + }, + { + "ce_loss_13": 2.4565594136714934, + "ce_loss_26": 1.9154207110404968, + "ce_loss_39": 1.7311667442321776, + "ce_loss_52": 1.3880270063877105, + "ce_loss_7": 2.79474156498909, + "epoch": 0.634, + "grad_norm": 15.212752240316364, + "kl_loss_13": 2198.2, + "kl_loss_26": 1053.8, + "kl_loss_39": 669.5, + "kl_loss_7": 2908.0, + "learning_rate": 0.00030098929455206903, + "loss": 3450.2, + "step": 6340 + }, + { + "ce_loss_13": 2.4875996589660643, + "ce_loss_26": 1.9456024587154388, + "ce_loss_39": 1.7510357975959778, + "ce_loss_52": 1.4120293408632278, + "ce_loss_7": 2.8184066474437715, + "epoch": 0.635, + "grad_norm": 14.444701123760842, + "kl_loss_13": 2190.4, + "kl_loss_26": 1070.0, + "kl_loss_39": 670.4, + "kl_loss_7": 2884.8, + "learning_rate": 0.00029953473229669324, + "loss": 3500.6, + "step": 6350 + }, + { + "ce_loss_13": 2.505690813064575, + "ce_loss_26": 1.968365904688835, + "ce_loss_39": 1.7826423317193985, + "ce_loss_52": 1.4394667357206345, + "ce_loss_7": 2.8453324735164642, + "epoch": 0.636, + "grad_norm": 14.480279054372499, + "kl_loss_13": 2207.2, + "kl_loss_26": 1070.3, + "kl_loss_39": 680.95, + "kl_loss_7": 2914.4, + "learning_rate": 0.00029808218872433767, + "loss": 3473.05, + "step": 6360 + }, + { + "ce_loss_13": 2.462022843956947, + "ce_loss_26": 1.9284056276082993, + "ce_loss_39": 1.7475204050540925, + "ce_loss_52": 1.3994116008281707, + "ce_loss_7": 2.799399584531784, + "epoch": 0.637, + "grad_norm": 14.854420396233579, + "kl_loss_13": 2187.4, + "kl_loss_26": 1057.9, + "kl_loss_39": 676.9, + "kl_loss_7": 2908.4, + "learning_rate": 0.0002966316784621, + "loss": 3431.55, + "step": 6370 + }, + { + "ce_loss_13": 2.46474946141243, + "ce_loss_26": 1.9243933081626892, + "ce_loss_39": 1.7351078271865845, + "ce_loss_52": 1.3951061010360717, + "ce_loss_7": 2.809561550617218, + "epoch": 0.638, + "grad_norm": 14.398656186824772, + "kl_loss_13": 2201.4, + "kl_loss_26": 1064.6, + "kl_loss_39": 673.7, + "kl_loss_7": 2913.6, + "learning_rate": 0.0002951832161166024, + "loss": 3433.1, + "step": 6380 + }, + { + "ce_loss_13": 2.524071788787842, + "ce_loss_26": 1.9882585108280182, + "ce_loss_39": 1.8004582822322845, + "ce_loss_52": 1.457192499935627, + "ce_loss_7": 2.859678488969803, + "epoch": 0.639, + "grad_norm": 14.823787609750735, + "kl_loss_13": 2198.8, + "kl_loss_26": 1062.9, + "kl_loss_39": 671.2, + "kl_loss_7": 2895.2, + "learning_rate": 0.0002937368162738445, + "loss": 3448.55, + "step": 6390 + }, + { + "ce_loss_13": 2.476853275299072, + "ce_loss_26": 1.940243661403656, + "ce_loss_39": 1.7542554527521133, + "ce_loss_52": 1.4153559118509293, + "ce_loss_7": 2.8181783974170687, + "epoch": 0.64, + "grad_norm": 14.674283953178037, + "kl_loss_13": 2177.8, + "kl_loss_26": 1053.1, + "kl_loss_39": 664.6, + "kl_loss_7": 2899.6, + "learning_rate": 0.0002922924934990568, + "loss": 3441.6, + "step": 6400 + }, + { + "ce_loss_13": 2.4689641296863556, + "ce_loss_26": 1.933935484290123, + "ce_loss_39": 1.7450189381837844, + "ce_loss_52": 1.3959352299571037, + "ce_loss_7": 2.8043021619319917, + "epoch": 0.641, + "grad_norm": 16.188741684715673, + "kl_loss_13": 2210.6, + "kl_loss_26": 1080.2, + "kl_loss_39": 681.3, + "kl_loss_7": 2920.8, + "learning_rate": 0.0002908502623365536, + "loss": 3439.95, + "step": 6410 + }, + { + "ce_loss_13": 2.51960112452507, + "ce_loss_26": 1.9860825181007384, + "ce_loss_39": 1.792076262831688, + "ce_loss_52": 1.4388326108455658, + "ce_loss_7": 2.8559226214885713, + "epoch": 0.642, + "grad_norm": 15.164079412983817, + "kl_loss_13": 2205.0, + "kl_loss_26": 1076.5, + "kl_loss_39": 686.15, + "kl_loss_7": 2899.6, + "learning_rate": 0.0002894101373095867, + "loss": 3403.75, + "step": 6420 + }, + { + "ce_loss_13": 2.57558217048645, + "ce_loss_26": 2.037161833047867, + "ce_loss_39": 1.8481904029846192, + "ce_loss_52": 1.4971486061811448, + "ce_loss_7": 2.9037492871284485, + "epoch": 0.643, + "grad_norm": 14.617370011749491, + "kl_loss_13": 2241.0, + "kl_loss_26": 1099.6, + "kl_loss_39": 706.2, + "kl_loss_7": 2940.0, + "learning_rate": 0.00028797213292019926, + "loss": 3465.25, + "step": 6430 + }, + { + "ce_loss_13": 2.4703142285346984, + "ce_loss_26": 1.9478118807077407, + "ce_loss_39": 1.7643914371728897, + "ce_loss_52": 1.4303042769432068, + "ce_loss_7": 2.8028221487998963, + "epoch": 0.644, + "grad_norm": 14.268057235198288, + "kl_loss_13": 2150.2, + "kl_loss_26": 1039.5, + "kl_loss_39": 657.25, + "kl_loss_7": 2845.2, + "learning_rate": 0.0002865362636490791, + "loss": 3397.05, + "step": 6440 + }, + { + "ce_loss_13": 2.5057600528001784, + "ce_loss_26": 1.963181382417679, + "ce_loss_39": 1.766686275601387, + "ce_loss_52": 1.4219027027487754, + "ce_loss_7": 2.842684781551361, + "epoch": 0.645, + "grad_norm": 15.007302910220881, + "kl_loss_13": 2227.2, + "kl_loss_26": 1085.9, + "kl_loss_39": 688.6, + "kl_loss_7": 2943.2, + "learning_rate": 0.0002851025439554142, + "loss": 3420.9, + "step": 6450 + }, + { + "ce_loss_13": 2.5524505376815796, + "ce_loss_26": 2.00695119202137, + "ce_loss_39": 1.812732595205307, + "ce_loss_52": 1.4619058847427369, + "ce_loss_7": 2.89662281870842, + "epoch": 0.646, + "grad_norm": 14.944231437877365, + "kl_loss_13": 2231.2, + "kl_loss_26": 1087.1, + "kl_loss_39": 688.9, + "kl_loss_7": 2952.8, + "learning_rate": 0.00028367098827674573, + "loss": 3473.25, + "step": 6460 + }, + { + "ce_loss_13": 2.5118141055107115, + "ce_loss_26": 1.9752016961574554, + "ce_loss_39": 1.790860089659691, + "ce_loss_52": 1.4514876693487166, + "ce_loss_7": 2.8457858681678774, + "epoch": 0.647, + "grad_norm": 14.47327693725919, + "kl_loss_13": 2178.4, + "kl_loss_26": 1057.8, + "kl_loss_39": 666.1, + "kl_loss_7": 2874.4, + "learning_rate": 0.00028224161102882397, + "loss": 3430.95, + "step": 6470 + }, + { + "ce_loss_13": 2.4975059896707537, + "ce_loss_26": 1.9533880710601808, + "ce_loss_39": 1.7617440074682236, + "ce_loss_52": 1.414775413274765, + "ce_loss_7": 2.8368520498275758, + "epoch": 0.648, + "grad_norm": 14.67215053951943, + "kl_loss_13": 2218.6, + "kl_loss_26": 1078.5, + "kl_loss_39": 690.45, + "kl_loss_7": 2930.0, + "learning_rate": 0.00028081442660546124, + "loss": 3435.85, + "step": 6480 + }, + { + "ce_loss_13": 2.4587242364883424, + "ce_loss_26": 1.9304294764995575, + "ce_loss_39": 1.7412324339151382, + "ce_loss_52": 1.4029302895069122, + "ce_loss_7": 2.7982348799705505, + "epoch": 0.649, + "grad_norm": 14.728330622094298, + "kl_loss_13": 2170.4, + "kl_loss_26": 1055.8, + "kl_loss_39": 664.85, + "kl_loss_7": 2878.8, + "learning_rate": 0.0002793894493783892, + "loss": 3431.05, + "step": 6490 + }, + { + "ce_loss_13": 2.5393730461597444, + "ce_loss_26": 1.9960223108530044, + "ce_loss_39": 1.808261874318123, + "ce_loss_52": 1.45538187623024, + "ce_loss_7": 2.8760022819042206, + "epoch": 0.65, + "grad_norm": 15.357970116880674, + "kl_loss_13": 2229.4, + "kl_loss_26": 1089.7, + "kl_loss_39": 695.95, + "kl_loss_7": 2930.8, + "learning_rate": 0.0002779666936971129, + "loss": 3429.5, + "step": 6500 + }, + { + "ce_loss_13": 2.496321311593056, + "ce_loss_26": 1.9730540215969086, + "ce_loss_39": 1.7846842855215073, + "ce_loss_52": 1.444144432246685, + "ce_loss_7": 2.8269869565963743, + "epoch": 0.651, + "grad_norm": 14.142211217794582, + "kl_loss_13": 2159.4, + "kl_loss_26": 1043.4, + "kl_loss_39": 660.95, + "kl_loss_7": 2856.0, + "learning_rate": 0.00027654617388876614, + "loss": 3409.65, + "step": 6510 + }, + { + "ce_loss_13": 2.4942274272441862, + "ce_loss_26": 1.9708451181650162, + "ce_loss_39": 1.781627294421196, + "ce_loss_52": 1.436317929625511, + "ce_loss_7": 2.824290210008621, + "epoch": 0.652, + "grad_norm": 14.257728738894219, + "kl_loss_13": 2158.0, + "kl_loss_26": 1061.3, + "kl_loss_39": 676.9, + "kl_loss_7": 2841.6, + "learning_rate": 0.0002751279042579672, + "loss": 3420.3, + "step": 6520 + }, + { + "ce_loss_13": 2.474119412899017, + "ce_loss_26": 1.9305396527051926, + "ce_loss_39": 1.745158138871193, + "ce_loss_52": 1.4086509764194488, + "ce_loss_7": 2.8088379979133604, + "epoch": 0.653, + "grad_norm": 14.091894330635501, + "kl_loss_13": 2189.8, + "kl_loss_26": 1060.2, + "kl_loss_39": 671.45, + "kl_loss_7": 2896.4, + "learning_rate": 0.00027371189908667604, + "loss": 3430.2, + "step": 6530 + }, + { + "ce_loss_13": 2.5130710184574125, + "ce_loss_26": 1.9766233384609222, + "ce_loss_39": 1.7931175470352172, + "ce_loss_52": 1.4404253482818603, + "ce_loss_7": 2.8409298956394196, + "epoch": 0.654, + "grad_norm": 14.81675411336707, + "kl_loss_13": 2196.2, + "kl_loss_26": 1079.4, + "kl_loss_39": 687.65, + "kl_loss_7": 2891.2, + "learning_rate": 0.00027229817263404863, + "loss": 3395.3, + "step": 6540 + }, + { + "ce_loss_13": 2.489423853158951, + "ce_loss_26": 1.9515893071889878, + "ce_loss_39": 1.7590265810489654, + "ce_loss_52": 1.4197510361671448, + "ce_loss_7": 2.832774597406387, + "epoch": 0.655, + "grad_norm": 14.759093026127282, + "kl_loss_13": 2178.4, + "kl_loss_26": 1050.2, + "kl_loss_39": 655.0, + "kl_loss_7": 2893.2, + "learning_rate": 0.0002708867391362948, + "loss": 3416.7, + "step": 6550 + }, + { + "ce_loss_13": 2.5174727141857147, + "ce_loss_26": 1.9717289686203003, + "ce_loss_39": 1.7858693569898605, + "ce_loss_52": 1.4473539382219314, + "ce_loss_7": 2.8465367794036864, + "epoch": 0.656, + "grad_norm": 14.064047100581472, + "kl_loss_13": 2182.8, + "kl_loss_26": 1041.7, + "kl_loss_39": 656.75, + "kl_loss_7": 2877.6, + "learning_rate": 0.0002694776128065345, + "loss": 3397.05, + "step": 6560 + }, + { + "ce_loss_13": 2.528963714838028, + "ce_loss_26": 1.981925156712532, + "ce_loss_39": 1.7907918602228166, + "ce_loss_52": 1.4499590158462525, + "ce_loss_7": 2.871799385547638, + "epoch": 0.657, + "grad_norm": 14.569880458498675, + "kl_loss_13": 2200.6, + "kl_loss_26": 1059.1, + "kl_loss_39": 660.35, + "kl_loss_7": 2915.2, + "learning_rate": 0.00026807080783465374, + "loss": 3393.8, + "step": 6570 + }, + { + "ce_loss_13": 2.5217245757579803, + "ce_loss_26": 1.9828781098127366, + "ce_loss_39": 1.7871235221624375, + "ce_loss_52": 1.4383462622761727, + "ce_loss_7": 2.864704269170761, + "epoch": 0.658, + "grad_norm": 14.405026774063144, + "kl_loss_13": 2246.6, + "kl_loss_26": 1094.4, + "kl_loss_39": 694.55, + "kl_loss_7": 2961.2, + "learning_rate": 0.00026666633838716316, + "loss": 3410.55, + "step": 6580 + }, + { + "ce_loss_13": 2.525897091627121, + "ce_loss_26": 1.9987964391708375, + "ce_loss_39": 1.815827977657318, + "ce_loss_52": 1.4701957792043685, + "ce_loss_7": 2.8572112381458283, + "epoch": 0.659, + "grad_norm": 14.76642356275535, + "kl_loss_13": 2192.0, + "kl_loss_26": 1080.0, + "kl_loss_39": 689.6, + "kl_loss_7": 2880.4, + "learning_rate": 0.00026526421860705474, + "loss": 3403.15, + "step": 6590 + }, + { + "ce_loss_13": 2.516835355758667, + "ce_loss_26": 1.9933927595615386, + "ce_loss_39": 1.8040386736392975, + "ce_loss_52": 1.4699463561177253, + "ce_loss_7": 2.839303117990494, + "epoch": 0.66, + "grad_norm": 15.026727729458214, + "kl_loss_13": 2153.2, + "kl_loss_26": 1047.1, + "kl_loss_39": 659.7, + "kl_loss_7": 2838.8, + "learning_rate": 0.0002638644626136587, + "loss": 3420.9, + "step": 6600 + }, + { + "ce_loss_13": 2.5133367598056795, + "ce_loss_26": 1.9757141143083572, + "ce_loss_39": 1.7787566870450973, + "ce_loss_52": 1.4421678900718689, + "ce_loss_7": 2.84059277176857, + "epoch": 0.661, + "grad_norm": 14.33502800394329, + "kl_loss_13": 2167.2, + "kl_loss_26": 1050.2, + "kl_loss_39": 655.95, + "kl_loss_7": 2853.6, + "learning_rate": 0.00026246708450250255, + "loss": 3363.15, + "step": 6610 + }, + { + "ce_loss_13": 2.530620867013931, + "ce_loss_26": 2.015302965044975, + "ce_loss_39": 1.8304377377033234, + "ce_loss_52": 1.4897648423910141, + "ce_loss_7": 2.84687961935997, + "epoch": 0.662, + "grad_norm": 14.322777446508148, + "kl_loss_13": 2141.4, + "kl_loss_26": 1047.1, + "kl_loss_39": 661.0, + "kl_loss_7": 2818.4, + "learning_rate": 0.00026107209834516854, + "loss": 3368.65, + "step": 6620 + }, + { + "ce_loss_13": 2.5134909957647324, + "ce_loss_26": 1.964612963795662, + "ce_loss_39": 1.7641142904758453, + "ce_loss_52": 1.4144739270210267, + "ce_loss_7": 2.851569724082947, + "epoch": 0.663, + "grad_norm": 14.388583697005986, + "kl_loss_13": 2236.4, + "kl_loss_26": 1088.4, + "kl_loss_39": 685.8, + "kl_loss_7": 2945.2, + "learning_rate": 0.0002596795181891514, + "loss": 3390.15, + "step": 6630 + }, + { + "ce_loss_13": 2.4643958449363708, + "ce_loss_26": 1.9360562086105346, + "ce_loss_39": 1.7430761098861693, + "ce_loss_52": 1.4107858330011367, + "ce_loss_7": 2.7886133015155794, + "epoch": 0.664, + "grad_norm": 14.667067788196036, + "kl_loss_13": 2160.8, + "kl_loss_26": 1055.2, + "kl_loss_39": 663.05, + "kl_loss_7": 2836.8, + "learning_rate": 0.000258289358057718, + "loss": 3433.7, + "step": 6640 + }, + { + "ce_loss_13": 2.470621481537819, + "ce_loss_26": 1.932977157831192, + "ce_loss_39": 1.7458938509225845, + "ce_loss_52": 1.40322026014328, + "ce_loss_7": 2.8033271014690397, + "epoch": 0.665, + "grad_norm": 14.559695450600435, + "kl_loss_13": 2196.2, + "kl_loss_26": 1066.5, + "kl_loss_39": 673.3, + "kl_loss_7": 2893.6, + "learning_rate": 0.0002569016319497657, + "loss": 3385.35, + "step": 6650 + }, + { + "ce_loss_13": 2.523782452940941, + "ce_loss_26": 1.9726400285959245, + "ce_loss_39": 1.783732882142067, + "ce_loss_52": 1.441526584327221, + "ce_loss_7": 2.8546798706054686, + "epoch": 0.666, + "grad_norm": 14.338494844564005, + "kl_loss_13": 2201.0, + "kl_loss_26": 1066.9, + "kl_loss_39": 675.1, + "kl_loss_7": 2913.6, + "learning_rate": 0.00025551635383968066, + "loss": 3431.65, + "step": 6660 + }, + { + "ce_loss_13": 2.496166667342186, + "ce_loss_26": 1.9616001814603805, + "ce_loss_39": 1.7780775994062423, + "ce_loss_52": 1.4437817305326461, + "ce_loss_7": 2.8353283524513246, + "epoch": 0.667, + "grad_norm": 14.333332930511547, + "kl_loss_13": 2162.8, + "kl_loss_26": 1041.8, + "kl_loss_39": 658.3, + "kl_loss_7": 2867.2, + "learning_rate": 0.00025413353767719804, + "loss": 3373.9, + "step": 6670 + }, + { + "ce_loss_13": 2.4899742186069487, + "ce_loss_26": 1.9639368683099747, + "ce_loss_39": 1.7742518305778503, + "ce_loss_52": 1.4562912076711654, + "ce_loss_7": 2.813701218366623, + "epoch": 0.668, + "grad_norm": 15.020866565536496, + "kl_loss_13": 2118.8, + "kl_loss_26": 1008.9, + "kl_loss_39": 626.15, + "kl_loss_7": 2799.6, + "learning_rate": 0.0002527531973872617, + "loss": 3354.0, + "step": 6680 + }, + { + "ce_loss_13": 2.4495032489299775, + "ce_loss_26": 1.9232689619064331, + "ce_loss_39": 1.7322240889072418, + "ce_loss_52": 1.4083685100078582, + "ce_loss_7": 2.7809501469135283, + "epoch": 0.669, + "grad_norm": 15.157984637483661, + "kl_loss_13": 2152.4, + "kl_loss_26": 1034.8, + "kl_loss_39": 641.05, + "kl_loss_7": 2845.2, + "learning_rate": 0.0002513753468698826, + "loss": 3397.05, + "step": 6690 + }, + { + "ce_loss_13": 2.5416204214096068, + "ce_loss_26": 1.9909723430871964, + "ce_loss_39": 1.797818985581398, + "ce_loss_52": 1.4566215574741364, + "ce_loss_7": 2.8839422285556795, + "epoch": 0.67, + "grad_norm": 14.731917297604895, + "kl_loss_13": 2207.8, + "kl_loss_26": 1067.0, + "kl_loss_39": 664.6, + "kl_loss_7": 2918.4, + "learning_rate": 0.0002500000000000001, + "loss": 3410.05, + "step": 6700 + }, + { + "ce_loss_13": 2.46220725774765, + "ce_loss_26": 1.9426458358764649, + "ce_loss_39": 1.7631336867809295, + "ce_loss_52": 1.4324473321437836, + "ce_loss_7": 2.7896072566509247, + "epoch": 0.671, + "grad_norm": 14.394157937336324, + "kl_loss_13": 2145.4, + "kl_loss_26": 1030.2, + "kl_loss_39": 651.6, + "kl_loss_7": 2836.8, + "learning_rate": 0.0002486271706273421, + "loss": 3349.6, + "step": 6710 + }, + { + "ce_loss_13": 2.4807921826839445, + "ce_loss_26": 1.9570556044578553, + "ce_loss_39": 1.776718083024025, + "ce_loss_52": 1.4477659314870834, + "ce_loss_7": 2.8088565468788147, + "epoch": 0.672, + "grad_norm": 14.57538335602299, + "kl_loss_13": 2109.6, + "kl_loss_26": 1009.8, + "kl_loss_39": 634.25, + "kl_loss_7": 2806.0, + "learning_rate": 0.0002472568725762853, + "loss": 3376.2, + "step": 6720 + }, + { + "ce_loss_13": 2.4802849024534224, + "ce_loss_26": 1.9443901777267456, + "ce_loss_39": 1.7523796886205674, + "ce_loss_52": 1.4139477282762527, + "ce_loss_7": 2.8129481852054594, + "epoch": 0.673, + "grad_norm": 14.144296062088605, + "kl_loss_13": 2194.4, + "kl_loss_26": 1069.6, + "kl_loss_39": 674.95, + "kl_loss_7": 2888.8, + "learning_rate": 0.00024588911964571554, + "loss": 3364.55, + "step": 6730 + }, + { + "ce_loss_13": 2.5132571697235107, + "ce_loss_26": 1.9828839927911759, + "ce_loss_39": 1.7922901511192322, + "ce_loss_52": 1.4625934183597564, + "ce_loss_7": 2.841163671016693, + "epoch": 0.674, + "grad_norm": 14.199249331732203, + "kl_loss_13": 2159.4, + "kl_loss_26": 1039.5, + "kl_loss_39": 646.65, + "kl_loss_7": 2846.0, + "learning_rate": 0.00024452392560888974, + "loss": 3361.05, + "step": 6740 + }, + { + "ce_loss_13": 2.4865836411714555, + "ce_loss_26": 1.954461258649826, + "ce_loss_39": 1.7649456202983855, + "ce_loss_52": 1.4221005111932754, + "ce_loss_7": 2.821400898694992, + "epoch": 0.675, + "grad_norm": 14.682119193582665, + "kl_loss_13": 2206.2, + "kl_loss_26": 1077.5, + "kl_loss_39": 678.9, + "kl_loss_7": 2902.4, + "learning_rate": 0.00024316130421329695, + "loss": 3347.95, + "step": 6750 + }, + { + "ce_loss_13": 2.474187096953392, + "ce_loss_26": 1.9446223825216293, + "ce_loss_39": 1.7641993075609208, + "ce_loss_52": 1.436264917254448, + "ce_loss_7": 2.8015049755573274, + "epoch": 0.676, + "grad_norm": 14.72420694586694, + "kl_loss_13": 2155.0, + "kl_loss_26": 1030.2, + "kl_loss_39": 646.8, + "kl_loss_7": 2843.6, + "learning_rate": 0.00024180126918051909, + "loss": 3348.9, + "step": 6760 + }, + { + "ce_loss_13": 2.480317395925522, + "ce_loss_26": 1.956321433186531, + "ce_loss_39": 1.7687535285949707, + "ce_loss_52": 1.4266707986593246, + "ce_loss_7": 2.808481311798096, + "epoch": 0.677, + "grad_norm": 15.416504395614744, + "kl_loss_13": 2171.4, + "kl_loss_26": 1058.8, + "kl_loss_39": 666.6, + "kl_loss_7": 2869.2, + "learning_rate": 0.00024044383420609406, + "loss": 3413.1, + "step": 6770 + }, + { + "ce_loss_13": 2.5013114362955093, + "ce_loss_26": 1.9759581625461577, + "ce_loss_39": 1.7926720827817917, + "ce_loss_52": 1.4599666327238083, + "ce_loss_7": 2.8250863194465636, + "epoch": 0.678, + "grad_norm": 13.962008513939274, + "kl_loss_13": 2133.8, + "kl_loss_26": 1039.8, + "kl_loss_39": 651.85, + "kl_loss_7": 2824.8, + "learning_rate": 0.00023908901295937712, + "loss": 3372.05, + "step": 6780 + }, + { + "ce_loss_13": 2.489407476782799, + "ce_loss_26": 1.9609563022851944, + "ce_loss_39": 1.778808832168579, + "ce_loss_52": 1.454407089948654, + "ce_loss_7": 2.8136882543563844, + "epoch": 0.679, + "grad_norm": 14.19915367627698, + "kl_loss_13": 2110.2, + "kl_loss_26": 1016.4, + "kl_loss_39": 637.05, + "kl_loss_7": 2792.0, + "learning_rate": 0.00023773681908340283, + "loss": 3384.7, + "step": 6790 + }, + { + "ce_loss_13": 2.4634098410606384, + "ce_loss_26": 1.9384621411561966, + "ce_loss_39": 1.7507896840572357, + "ce_loss_52": 1.4146029382944107, + "ce_loss_7": 2.7943048059940336, + "epoch": 0.68, + "grad_norm": 14.861512399701681, + "kl_loss_13": 2166.4, + "kl_loss_26": 1051.5, + "kl_loss_39": 658.6, + "kl_loss_7": 2860.8, + "learning_rate": 0.00023638726619474876, + "loss": 3356.85, + "step": 6800 + }, + { + "ce_loss_13": 2.5783134520053865, + "ce_loss_26": 2.045197767019272, + "ce_loss_39": 1.8584345400333404, + "ce_loss_52": 1.5217636466026305, + "ce_loss_7": 2.9070691764354706, + "epoch": 0.681, + "grad_norm": 14.552198364638281, + "kl_loss_13": 2176.2, + "kl_loss_26": 1054.5, + "kl_loss_39": 659.7, + "kl_loss_7": 2877.6, + "learning_rate": 0.0002350403678833976, + "loss": 3347.55, + "step": 6810 + }, + { + "ce_loss_13": 2.4692390322685243, + "ce_loss_26": 1.943667185306549, + "ce_loss_39": 1.7523112028837204, + "ce_loss_52": 1.426128900051117, + "ce_loss_7": 2.7978816986083985, + "epoch": 0.682, + "grad_norm": 14.998678580001265, + "kl_loss_13": 2167.2, + "kl_loss_26": 1050.9, + "kl_loss_39": 652.95, + "kl_loss_7": 2852.4, + "learning_rate": 0.00023369613771260007, + "loss": 3369.6, + "step": 6820 + }, + { + "ce_loss_13": 2.4953604638576508, + "ce_loss_26": 1.981321769952774, + "ce_loss_39": 1.7999033033847809, + "ce_loss_52": 1.4693025022745132, + "ce_loss_7": 2.826812982559204, + "epoch": 0.683, + "grad_norm": 14.167664016838927, + "kl_loss_13": 2131.6, + "kl_loss_26": 1041.6, + "kl_loss_39": 654.4, + "kl_loss_7": 2822.8, + "learning_rate": 0.00023235458921873925, + "loss": 3334.7, + "step": 6830 + }, + { + "ce_loss_13": 2.5015017211437227, + "ce_loss_26": 1.9650517791509627, + "ce_loss_39": 1.7717696577310562, + "ce_loss_52": 1.4357108920812607, + "ce_loss_7": 2.827932006120682, + "epoch": 0.684, + "grad_norm": 14.585579391353733, + "kl_loss_13": 2162.0, + "kl_loss_26": 1051.0, + "kl_loss_39": 656.3, + "kl_loss_7": 2848.0, + "learning_rate": 0.0002310157359111938, + "loss": 3348.15, + "step": 6840 + }, + { + "ce_loss_13": 2.426406466960907, + "ce_loss_26": 1.9015664726495742, + "ce_loss_39": 1.718049594759941, + "ce_loss_52": 1.3935224622488023, + "ce_loss_7": 2.756322818994522, + "epoch": 0.685, + "grad_norm": 15.055484269746147, + "kl_loss_13": 2134.6, + "kl_loss_26": 1022.1, + "kl_loss_39": 635.6, + "kl_loss_7": 2830.0, + "learning_rate": 0.0002296795912722014, + "loss": 3335.95, + "step": 6850 + }, + { + "ce_loss_13": 2.414138987660408, + "ce_loss_26": 1.8910569071769714, + "ce_loss_39": 1.708191841840744, + "ce_loss_52": 1.383391012251377, + "ce_loss_7": 2.747501391172409, + "epoch": 0.686, + "grad_norm": 14.125926922706771, + "kl_loss_13": 2113.8, + "kl_loss_26": 1014.7, + "kl_loss_39": 637.05, + "kl_loss_7": 2808.8, + "learning_rate": 0.0002283461687567236, + "loss": 3303.65, + "step": 6860 + }, + { + "ce_loss_13": 2.464204970002174, + "ce_loss_26": 1.9375032573938369, + "ce_loss_39": 1.7498446986079217, + "ce_loss_52": 1.4226357489824295, + "ce_loss_7": 2.7877202153205873, + "epoch": 0.687, + "grad_norm": 14.738238275957917, + "kl_loss_13": 2139.2, + "kl_loss_26": 1028.9, + "kl_loss_39": 644.15, + "kl_loss_7": 2826.0, + "learning_rate": 0.00022701548179231045, + "loss": 3307.2, + "step": 6870 + }, + { + "ce_loss_13": 2.494007241725922, + "ce_loss_26": 1.9751898407936097, + "ce_loss_39": 1.7829045623540878, + "ce_loss_52": 1.4496880739927291, + "ce_loss_7": 2.8286533296108245, + "epoch": 0.688, + "grad_norm": 13.852391362837148, + "kl_loss_13": 2141.2, + "kl_loss_26": 1042.5, + "kl_loss_39": 651.3, + "kl_loss_7": 2830.8, + "learning_rate": 0.00022568754377896516, + "loss": 3367.25, + "step": 6880 + }, + { + "ce_loss_13": 2.4991670876741408, + "ce_loss_26": 1.9593406468629837, + "ce_loss_39": 1.7670493572950363, + "ce_loss_52": 1.426390787959099, + "ce_loss_7": 2.8318077862262725, + "epoch": 0.689, + "grad_norm": 14.446284686187164, + "kl_loss_13": 2202.8, + "kl_loss_26": 1070.1, + "kl_loss_39": 671.2, + "kl_loss_7": 2903.6, + "learning_rate": 0.00022436236808900844, + "loss": 3351.3, + "step": 6890 + }, + { + "ce_loss_13": 2.5084181427955627, + "ce_loss_26": 1.9811445116996764, + "ce_loss_39": 1.7889297604560852, + "ce_loss_52": 1.4664145559072495, + "ce_loss_7": 2.8357039868831633, + "epoch": 0.69, + "grad_norm": 14.484209525578123, + "kl_loss_13": 2151.6, + "kl_loss_26": 1045.5, + "kl_loss_39": 652.25, + "kl_loss_7": 2853.6, + "learning_rate": 0.00022303996806694487, + "loss": 3356.65, + "step": 6900 + }, + { + "ce_loss_13": 2.514096361398697, + "ce_loss_26": 1.9786822557449342, + "ce_loss_39": 1.7935864567756652, + "ce_loss_52": 1.4548332244157791, + "ce_loss_7": 2.839564120769501, + "epoch": 0.691, + "grad_norm": 13.76338668087359, + "kl_loss_13": 2184.2, + "kl_loss_26": 1055.1, + "kl_loss_39": 667.4, + "kl_loss_7": 2879.6, + "learning_rate": 0.00022172035702932823, + "loss": 3337.1, + "step": 6910 + }, + { + "ce_loss_13": 2.4764732241630556, + "ce_loss_26": 1.95515196621418, + "ce_loss_39": 1.772997224330902, + "ce_loss_52": 1.4487987339496613, + "ce_loss_7": 2.8033831179142, + "epoch": 0.692, + "grad_norm": 14.586346705034499, + "kl_loss_13": 2110.2, + "kl_loss_26": 1014.4, + "kl_loss_39": 638.2, + "kl_loss_7": 2793.2, + "learning_rate": 0.00022040354826462666, + "loss": 3310.75, + "step": 6920 + }, + { + "ce_loss_13": 2.468746620416641, + "ce_loss_26": 1.9468185782432557, + "ce_loss_39": 1.7679857224225999, + "ce_loss_52": 1.4481467604637146, + "ce_loss_7": 2.789846181869507, + "epoch": 0.693, + "grad_norm": 15.005402062047136, + "kl_loss_13": 2109.6, + "kl_loss_26": 1011.9, + "kl_loss_39": 635.85, + "kl_loss_7": 2789.6, + "learning_rate": 0.0002190895550330899, + "loss": 3354.5, + "step": 6930 + }, + { + "ce_loss_13": 2.4501267641782762, + "ce_loss_26": 1.9237078607082367, + "ce_loss_39": 1.735903450846672, + "ce_loss_52": 1.413103035092354, + "ce_loss_7": 2.7897567749023438, + "epoch": 0.694, + "grad_norm": 14.894293084006021, + "kl_loss_13": 2115.4, + "kl_loss_26": 1009.8, + "kl_loss_39": 623.75, + "kl_loss_7": 2825.6, + "learning_rate": 0.00021777839056661552, + "loss": 3328.85, + "step": 6940 + }, + { + "ce_loss_13": 2.5025814145803453, + "ce_loss_26": 1.9697064816951753, + "ce_loss_39": 1.7858674556016922, + "ce_loss_52": 1.459896171092987, + "ce_loss_7": 2.8366158485412596, + "epoch": 0.695, + "grad_norm": 14.62978428769736, + "kl_loss_13": 2143.8, + "kl_loss_26": 1026.4, + "kl_loss_39": 644.0, + "kl_loss_7": 2842.8, + "learning_rate": 0.0002164700680686147, + "loss": 3339.6, + "step": 6950 + }, + { + "ce_loss_13": 2.480437287688255, + "ce_loss_26": 1.9569555580615998, + "ce_loss_39": 1.7707047134637832, + "ce_loss_52": 1.4560858264565468, + "ce_loss_7": 2.8063796043395994, + "epoch": 0.696, + "grad_norm": 14.305917479002375, + "kl_loss_13": 2104.0, + "kl_loss_26": 1003.9, + "kl_loss_39": 621.6, + "kl_loss_7": 2791.2, + "learning_rate": 0.0002151646007138806, + "loss": 3346.15, + "step": 6960 + }, + { + "ce_loss_13": 2.4904795557260515, + "ce_loss_26": 1.9593748480081559, + "ce_loss_39": 1.7725317537784577, + "ce_loss_52": 1.4421725705266, + "ce_loss_7": 2.82717769742012, + "epoch": 0.697, + "grad_norm": 14.55516871467818, + "kl_loss_13": 2171.2, + "kl_loss_26": 1046.7, + "kl_loss_39": 655.3, + "kl_loss_7": 2878.4, + "learning_rate": 0.00021386200164845526, + "loss": 3321.35, + "step": 6970 + }, + { + "ce_loss_13": 2.4793561339378356, + "ce_loss_26": 1.9439466089010238, + "ce_loss_39": 1.7569547444581985, + "ce_loss_52": 1.4289204239845277, + "ce_loss_7": 2.807218599319458, + "epoch": 0.698, + "grad_norm": 13.778785355416604, + "kl_loss_13": 2152.0, + "kl_loss_26": 1037.7, + "kl_loss_39": 642.8, + "kl_loss_7": 2848.0, + "learning_rate": 0.0002125622839894964, + "loss": 3315.5, + "step": 6980 + }, + { + "ce_loss_13": 2.585531139373779, + "ce_loss_26": 2.031143417954445, + "ce_loss_39": 1.8274976074695588, + "ce_loss_52": 1.4753503799438477, + "ce_loss_7": 2.921118849515915, + "epoch": 0.699, + "grad_norm": 14.664216613597723, + "kl_loss_13": 2260.4, + "kl_loss_26": 1103.8, + "kl_loss_39": 685.9, + "kl_loss_7": 2968.0, + "learning_rate": 0.00021126546082514663, + "loss": 3365.55, + "step": 6990 + }, + { + "ce_loss_13": 2.478432095050812, + "ce_loss_26": 1.950699546933174, + "ce_loss_39": 1.7660550504922867, + "ce_loss_52": 1.4418641477823257, + "ce_loss_7": 2.802116149663925, + "epoch": 0.7, + "grad_norm": 14.414270184643762, + "kl_loss_13": 2128.4, + "kl_loss_26": 1022.4, + "kl_loss_39": 639.9, + "kl_loss_7": 2808.8, + "learning_rate": 0.00020997154521440098, + "loss": 3312.0, + "step": 7000 + }, + { + "ce_loss_13": 2.4484674006700518, + "ce_loss_26": 1.9334596753120423, + "ce_loss_39": 1.754175427556038, + "ce_loss_52": 1.4356836065649987, + "ce_loss_7": 2.7715867519378663, + "epoch": 0.701, + "grad_norm": 15.009606673101777, + "kl_loss_13": 2096.6, + "kl_loss_26": 1005.6, + "kl_loss_39": 630.9, + "kl_loss_7": 2776.0, + "learning_rate": 0.0002086805501869749, + "loss": 3296.9, + "step": 7010 + }, + { + "ce_loss_13": 2.500411355495453, + "ce_loss_26": 1.9781237423419953, + "ce_loss_39": 1.7995324105024337, + "ce_loss_52": 1.473289003968239, + "ce_loss_7": 2.8238317251205443, + "epoch": 0.702, + "grad_norm": 14.811490845229788, + "kl_loss_13": 2108.4, + "kl_loss_26": 1018.1, + "kl_loss_39": 642.65, + "kl_loss_7": 2786.4, + "learning_rate": 0.0002073924887431744, + "loss": 3301.85, + "step": 7020 + }, + { + "ce_loss_13": 2.4331007301807404, + "ce_loss_26": 1.925421604514122, + "ce_loss_39": 1.740053552389145, + "ce_loss_52": 1.4260162442922593, + "ce_loss_7": 2.750682008266449, + "epoch": 0.703, + "grad_norm": 14.093530596736626, + "kl_loss_13": 2071.0, + "kl_loss_26": 996.4, + "kl_loss_39": 613.7, + "kl_loss_7": 2741.2, + "learning_rate": 0.00020610737385376348, + "loss": 3303.45, + "step": 7030 + }, + { + "ce_loss_13": 2.438492274284363, + "ce_loss_26": 1.917963182926178, + "ce_loss_39": 1.731092056632042, + "ce_loss_52": 1.4118753850460053, + "ce_loss_7": 2.7640158772468566, + "epoch": 0.704, + "grad_norm": 14.575552477174394, + "kl_loss_13": 2113.4, + "kl_loss_26": 1019.3, + "kl_loss_39": 632.85, + "kl_loss_7": 2804.0, + "learning_rate": 0.00020482521845983521, + "loss": 3301.55, + "step": 7040 + }, + { + "ce_loss_13": 2.4910335719585417, + "ce_loss_26": 1.9715039610862732, + "ce_loss_39": 1.7800425946712495, + "ce_loss_52": 1.4519392430782319, + "ce_loss_7": 2.819239354133606, + "epoch": 0.705, + "grad_norm": 14.820051388586238, + "kl_loss_13": 2136.2, + "kl_loss_26": 1040.9, + "kl_loss_39": 651.75, + "kl_loss_7": 2824.0, + "learning_rate": 0.00020354603547267987, + "loss": 3316.6, + "step": 7050 + }, + { + "ce_loss_13": 2.4318030804395674, + "ce_loss_26": 1.910761234164238, + "ce_loss_39": 1.7284663796424866, + "ce_loss_52": 1.4129984229803085, + "ce_loss_7": 2.7625936210155486, + "epoch": 0.706, + "grad_norm": 14.59547057670379, + "kl_loss_13": 2113.8, + "kl_loss_26": 1004.1, + "kl_loss_39": 622.2, + "kl_loss_7": 2816.4, + "learning_rate": 0.00020226983777365604, + "loss": 3284.95, + "step": 7060 + }, + { + "ce_loss_13": 2.4749036192893983, + "ce_loss_26": 1.9450800210237502, + "ce_loss_39": 1.761537629365921, + "ce_loss_52": 1.4373356252908707, + "ce_loss_7": 2.809742730855942, + "epoch": 0.707, + "grad_norm": 14.651682120589488, + "kl_loss_13": 2148.8, + "kl_loss_26": 1038.1, + "kl_loss_39": 647.05, + "kl_loss_7": 2860.4, + "learning_rate": 0.00020099663821406056, + "loss": 3330.65, + "step": 7070 + }, + { + "ce_loss_13": 2.500520494580269, + "ce_loss_26": 1.9711394160985947, + "ce_loss_39": 1.7850598603487016, + "ce_loss_52": 1.4572303384542464, + "ce_loss_7": 2.822962099313736, + "epoch": 0.708, + "grad_norm": 14.695952402949361, + "kl_loss_13": 2140.6, + "kl_loss_26": 1032.0, + "kl_loss_39": 642.6, + "kl_loss_7": 2828.8, + "learning_rate": 0.00019972644961499853, + "loss": 3310.1, + "step": 7080 + }, + { + "ce_loss_13": 2.4471381455659866, + "ce_loss_26": 1.9142519533634186, + "ce_loss_39": 1.7273518294095993, + "ce_loss_52": 1.4090154066681861, + "ce_loss_7": 2.775345432758331, + "epoch": 0.709, + "grad_norm": 14.907893630817398, + "kl_loss_13": 2145.0, + "kl_loss_26": 1028.7, + "kl_loss_39": 636.3, + "kl_loss_7": 2839.2, + "learning_rate": 0.00019845928476725522, + "loss": 3284.4, + "step": 7090 + }, + { + "ce_loss_13": 2.484838107228279, + "ce_loss_26": 1.9752304345369338, + "ce_loss_39": 1.7935984045267106, + "ce_loss_52": 1.4694376409053802, + "ce_loss_7": 2.8051605463027953, + "epoch": 0.71, + "grad_norm": 14.813324344167642, + "kl_loss_13": 2100.4, + "kl_loss_26": 1009.5, + "kl_loss_39": 629.85, + "kl_loss_7": 2784.0, + "learning_rate": 0.00019719515643116677, + "loss": 3271.1, + "step": 7100 + }, + { + "ce_loss_13": 2.449986720085144, + "ce_loss_26": 1.915557289123535, + "ce_loss_39": 1.7244810461997986, + "ce_loss_52": 1.4009377419948579, + "ce_loss_7": 2.7815246999263765, + "epoch": 0.711, + "grad_norm": 14.72405028446814, + "kl_loss_13": 2128.8, + "kl_loss_26": 1018.3, + "kl_loss_39": 632.1, + "kl_loss_7": 2824.4, + "learning_rate": 0.0001959340773364911, + "loss": 3301.5, + "step": 7110 + }, + { + "ce_loss_13": 2.4507795870304108, + "ce_loss_26": 1.9264422208070755, + "ce_loss_39": 1.7369968056678773, + "ce_loss_52": 1.4171953916549682, + "ce_loss_7": 2.778002160787582, + "epoch": 0.712, + "grad_norm": 15.123641060610607, + "kl_loss_13": 2145.2, + "kl_loss_26": 1034.6, + "kl_loss_39": 640.6, + "kl_loss_7": 2832.0, + "learning_rate": 0.0001946760601822809, + "loss": 3307.65, + "step": 7120 + }, + { + "ce_loss_13": 2.4649185329675674, + "ce_loss_26": 1.9448591649532319, + "ce_loss_39": 1.7617656499147416, + "ce_loss_52": 1.4421708196401597, + "ce_loss_7": 2.7942136943340303, + "epoch": 0.713, + "grad_norm": 13.86141587264665, + "kl_loss_13": 2099.6, + "kl_loss_26": 996.3, + "kl_loss_39": 613.9, + "kl_loss_7": 2784.0, + "learning_rate": 0.00019342111763675512, + "loss": 3264.15, + "step": 7130 + }, + { + "ce_loss_13": 2.431650939583778, + "ce_loss_26": 1.8971330910921096, + "ce_loss_39": 1.7134798288345336, + "ce_loss_52": 1.3959884241223335, + "ce_loss_7": 2.7688992261886596, + "epoch": 0.714, + "grad_norm": 14.868179084191116, + "kl_loss_13": 2103.6, + "kl_loss_26": 997.3, + "kl_loss_39": 614.55, + "kl_loss_7": 2798.8, + "learning_rate": 0.00019216926233717085, + "loss": 3302.05, + "step": 7140 + }, + { + "ce_loss_13": 2.4574439406394957, + "ce_loss_26": 1.9289735972881317, + "ce_loss_39": 1.738691231608391, + "ce_loss_52": 1.4203062415122987, + "ce_loss_7": 2.7880250751972198, + "epoch": 0.715, + "grad_norm": 14.757879306344181, + "kl_loss_13": 2133.4, + "kl_loss_26": 1021.4, + "kl_loss_39": 631.75, + "kl_loss_7": 2823.2, + "learning_rate": 0.00019092050688969737, + "loss": 3296.5, + "step": 7150 + }, + { + "ce_loss_13": 2.4601316511631013, + "ce_loss_26": 1.9434845715761184, + "ce_loss_39": 1.7585901826620103, + "ce_loss_52": 1.4390017569065094, + "ce_loss_7": 2.7778116285800936, + "epoch": 0.716, + "grad_norm": 13.991843131427743, + "kl_loss_13": 2085.4, + "kl_loss_26": 1007.0, + "kl_loss_39": 627.45, + "kl_loss_7": 2755.6, + "learning_rate": 0.00018967486386928817, + "loss": 3286.15, + "step": 7160 + }, + { + "ce_loss_13": 2.451919847726822, + "ce_loss_26": 1.9279222816228867, + "ce_loss_39": 1.7440420866012574, + "ce_loss_52": 1.4374898225069046, + "ce_loss_7": 2.784500467777252, + "epoch": 0.717, + "grad_norm": 14.5708304909804, + "kl_loss_13": 2095.4, + "kl_loss_26": 992.5, + "kl_loss_39": 611.75, + "kl_loss_7": 2794.0, + "learning_rate": 0.00018843234581955443, + "loss": 3292.25, + "step": 7170 + }, + { + "ce_loss_13": 2.4709593683481215, + "ce_loss_26": 1.9460813373327255, + "ce_loss_39": 1.7575767368078232, + "ce_loss_52": 1.4326383203268052, + "ce_loss_7": 2.7951516568660737, + "epoch": 0.718, + "grad_norm": 14.981137787748375, + "kl_loss_13": 2117.8, + "kl_loss_26": 1019.7, + "kl_loss_39": 633.35, + "kl_loss_7": 2803.2, + "learning_rate": 0.00018719296525263924, + "loss": 3299.6, + "step": 7180 + }, + { + "ce_loss_13": 2.4041130542755127, + "ce_loss_26": 1.8861528187990189, + "ce_loss_39": 1.7035977393388748, + "ce_loss_52": 1.4066498517990111, + "ce_loss_7": 2.735060691833496, + "epoch": 0.719, + "grad_norm": 14.986994612654895, + "kl_loss_13": 2054.2, + "kl_loss_26": 961.6, + "kl_loss_39": 585.6, + "kl_loss_7": 2750.4, + "learning_rate": 0.0001859567346490913, + "loss": 3264.25, + "step": 7190 + }, + { + "ce_loss_13": 2.521838116645813, + "ce_loss_26": 2.004297485947609, + "ce_loss_39": 1.810739102959633, + "ce_loss_52": 1.4783420652151107, + "ce_loss_7": 2.849927377700806, + "epoch": 0.72, + "grad_norm": 14.181310648182276, + "kl_loss_13": 2154.6, + "kl_loss_26": 1052.9, + "kl_loss_39": 657.35, + "kl_loss_7": 2848.4, + "learning_rate": 0.0001847236664577389, + "loss": 3278.0, + "step": 7200 + }, + { + "ce_loss_13": 2.40320103764534, + "ce_loss_26": 1.8900187402963637, + "ce_loss_39": 1.7100308045744896, + "ce_loss_52": 1.3999869018793105, + "ce_loss_7": 2.736201885342598, + "epoch": 0.721, + "grad_norm": 14.793709205482683, + "kl_loss_13": 2080.8, + "kl_loss_26": 990.7, + "kl_loss_39": 610.95, + "kl_loss_7": 2780.0, + "learning_rate": 0.00018349377309556487, + "loss": 3283.25, + "step": 7210 + }, + { + "ce_loss_13": 2.441250967979431, + "ce_loss_26": 1.9231963992118835, + "ce_loss_39": 1.7388009175658226, + "ce_loss_52": 1.4282974660396577, + "ce_loss_7": 2.7717535465955736, + "epoch": 0.722, + "grad_norm": 15.59238941344996, + "kl_loss_13": 2104.0, + "kl_loss_26": 997.6, + "kl_loss_39": 616.85, + "kl_loss_7": 2801.2, + "learning_rate": 0.00018226706694758193, + "loss": 3263.75, + "step": 7220 + }, + { + "ce_loss_13": 2.495049071311951, + "ce_loss_26": 1.973162716627121, + "ce_loss_39": 1.7896722644567489, + "ce_loss_52": 1.4697123229503632, + "ce_loss_7": 2.8224462032318116, + "epoch": 0.723, + "grad_norm": 13.997878236797012, + "kl_loss_13": 2123.2, + "kl_loss_26": 1009.4, + "kl_loss_39": 628.15, + "kl_loss_7": 2823.2, + "learning_rate": 0.0001810435603667075, + "loss": 3267.75, + "step": 7230 + }, + { + "ce_loss_13": 2.4492597192525865, + "ce_loss_26": 1.9282636791467667, + "ce_loss_39": 1.7473824605345727, + "ce_loss_52": 1.4372958570718766, + "ce_loss_7": 2.782766741514206, + "epoch": 0.724, + "grad_norm": 14.73683414882718, + "kl_loss_13": 2088.6, + "kl_loss_26": 990.1, + "kl_loss_39": 613.75, + "kl_loss_7": 2799.6, + "learning_rate": 0.0001798232656736389, + "loss": 3246.35, + "step": 7240 + }, + { + "ce_loss_13": 2.514770272374153, + "ce_loss_26": 1.9697564780712127, + "ce_loss_39": 1.7806446701288223, + "ce_loss_52": 1.456220605969429, + "ce_loss_7": 2.8567338407039644, + "epoch": 0.725, + "grad_norm": 14.87142240672514, + "kl_loss_13": 2179.8, + "kl_loss_26": 1037.2, + "kl_loss_39": 644.4, + "kl_loss_7": 2889.2, + "learning_rate": 0.0001786061951567303, + "loss": 3273.6, + "step": 7250 + }, + { + "ce_loss_13": 2.4067626029253004, + "ce_loss_26": 1.8963259696960448, + "ce_loss_39": 1.7143886119127274, + "ce_loss_52": 1.4022117048501967, + "ce_loss_7": 2.7348236978054046, + "epoch": 0.726, + "grad_norm": 14.076850795507209, + "kl_loss_13": 2076.6, + "kl_loss_26": 994.5, + "kl_loss_39": 623.5, + "kl_loss_7": 2766.8, + "learning_rate": 0.00017739236107186857, + "loss": 3281.2, + "step": 7260 + }, + { + "ce_loss_13": 2.4501163721084596, + "ce_loss_26": 1.926158633828163, + "ce_loss_39": 1.7434884279966354, + "ce_loss_52": 1.4286866545677186, + "ce_loss_7": 2.777343970537186, + "epoch": 0.727, + "grad_norm": 13.813498461115062, + "kl_loss_13": 2114.6, + "kl_loss_26": 1012.1, + "kl_loss_39": 626.6, + "kl_loss_7": 2806.0, + "learning_rate": 0.00017618177564234904, + "loss": 3264.1, + "step": 7270 + }, + { + "ce_loss_13": 2.4412575274705888, + "ce_loss_26": 1.9107532769441604, + "ce_loss_39": 1.7259235098958015, + "ce_loss_52": 1.4076966106891633, + "ce_loss_7": 2.774235662817955, + "epoch": 0.728, + "grad_norm": 14.801740311988109, + "kl_loss_13": 2113.4, + "kl_loss_26": 1003.6, + "kl_loss_39": 619.55, + "kl_loss_7": 2807.4, + "learning_rate": 0.00017497445105875377, + "loss": 3298.7, + "step": 7280 + }, + { + "ce_loss_13": 2.445681685209274, + "ce_loss_26": 1.9345449537038804, + "ce_loss_39": 1.7532619833946228, + "ce_loss_52": 1.442472691833973, + "ce_loss_7": 2.764800661802292, + "epoch": 0.729, + "grad_norm": 14.618830047757731, + "kl_loss_13": 2063.8, + "kl_loss_26": 980.7, + "kl_loss_39": 603.55, + "kl_loss_7": 2742.0, + "learning_rate": 0.000173770399478828, + "loss": 3226.7, + "step": 7290 + }, + { + "ce_loss_13": 2.4301975846290587, + "ce_loss_26": 1.9138565450906753, + "ce_loss_39": 1.736141037940979, + "ce_loss_52": 1.427780945599079, + "ce_loss_7": 2.760072636604309, + "epoch": 0.73, + "grad_norm": 14.242974774472335, + "kl_loss_13": 2095.4, + "kl_loss_26": 990.5, + "kl_loss_39": 610.6, + "kl_loss_7": 2791.2, + "learning_rate": 0.0001725696330273575, + "loss": 3260.65, + "step": 7300 + }, + { + "ce_loss_13": 2.4727762907743456, + "ce_loss_26": 1.9557204306125642, + "ce_loss_39": 1.766261911392212, + "ce_loss_52": 1.4398792043328286, + "ce_loss_7": 2.79817710518837, + "epoch": 0.731, + "grad_norm": 14.566153451338561, + "kl_loss_13": 2113.6, + "kl_loss_26": 1018.5, + "kl_loss_39": 628.8, + "kl_loss_7": 2796.8, + "learning_rate": 0.00017137216379604724, + "loss": 3240.75, + "step": 7310 + }, + { + "ce_loss_13": 2.490224635601044, + "ce_loss_26": 1.954663872718811, + "ce_loss_39": 1.7594738394021987, + "ce_loss_52": 1.4360380351543427, + "ce_loss_7": 2.8263917326927186, + "epoch": 0.732, + "grad_norm": 13.205540898906253, + "kl_loss_13": 2161.8, + "kl_loss_26": 1044.6, + "kl_loss_39": 637.45, + "kl_loss_7": 2862.4, + "learning_rate": 0.00017017800384339925, + "loss": 3258.4, + "step": 7320 + }, + { + "ce_loss_13": 2.4344683617353438, + "ce_loss_26": 1.9195960253477096, + "ce_loss_39": 1.7325531929731368, + "ce_loss_52": 1.419457183778286, + "ce_loss_7": 2.7598242580890657, + "epoch": 0.733, + "grad_norm": 14.107781745249417, + "kl_loss_13": 2087.4, + "kl_loss_26": 1001.6, + "kl_loss_39": 618.5, + "kl_loss_7": 2774.8, + "learning_rate": 0.00016898716519459073, + "loss": 3316.4, + "step": 7330 + }, + { + "ce_loss_13": 2.4717041492462157, + "ce_loss_26": 1.9320402562618255, + "ce_loss_39": 1.730445721745491, + "ce_loss_52": 1.3991459339857102, + "ce_loss_7": 2.811204159259796, + "epoch": 0.734, + "grad_norm": 14.159198716486541, + "kl_loss_13": 2200.8, + "kl_loss_26": 1069.2, + "kl_loss_39": 657.15, + "kl_loss_7": 2902.8, + "learning_rate": 0.00016779965984135375, + "loss": 3266.3, + "step": 7340 + }, + { + "ce_loss_13": 2.4648273169994352, + "ce_loss_26": 1.9446689933538437, + "ce_loss_39": 1.7665399879217147, + "ce_loss_52": 1.4552808463573457, + "ce_loss_7": 2.7818954586982727, + "epoch": 0.735, + "grad_norm": 13.974138676843918, + "kl_loss_13": 2093.0, + "kl_loss_26": 999.2, + "kl_loss_39": 622.1, + "kl_loss_7": 2762.4, + "learning_rate": 0.00016661549974185424, + "loss": 3232.6, + "step": 7350 + }, + { + "ce_loss_13": 2.497272843122482, + "ce_loss_26": 1.9733565777540207, + "ce_loss_39": 1.7902508676052094, + "ce_loss_52": 1.4602745115756988, + "ce_loss_7": 2.8227945923805238, + "epoch": 0.736, + "grad_norm": 15.105414614283358, + "kl_loss_13": 2153.6, + "kl_loss_26": 1043.2, + "kl_loss_39": 652.45, + "kl_loss_7": 2843.6, + "learning_rate": 0.00016543469682057105, + "loss": 3314.1, + "step": 7360 + }, + { + "ce_loss_13": 2.4817920327186584, + "ce_loss_26": 1.9674188673496247, + "ce_loss_39": 1.788041964173317, + "ce_loss_52": 1.4778558552265166, + "ce_loss_7": 2.801541256904602, + "epoch": 0.737, + "grad_norm": 14.089468466172162, + "kl_loss_13": 2075.4, + "kl_loss_26": 985.2, + "kl_loss_39": 606.6, + "kl_loss_7": 2750.8, + "learning_rate": 0.00016425726296817632, + "loss": 3279.5, + "step": 7370 + }, + { + "ce_loss_13": 2.4628233551979064, + "ce_loss_26": 1.944148001074791, + "ce_loss_39": 1.7584406644105912, + "ce_loss_52": 1.440820676088333, + "ce_loss_7": 2.7982202231884004, + "epoch": 0.738, + "grad_norm": 14.250790129395915, + "kl_loss_13": 2096.0, + "kl_loss_26": 994.4, + "kl_loss_39": 612.4, + "kl_loss_7": 2800.4, + "learning_rate": 0.00016308321004141607, + "loss": 3270.5, + "step": 7380 + }, + { + "ce_loss_13": 2.4311512380838396, + "ce_loss_26": 1.910204255580902, + "ce_loss_39": 1.7292486786842347, + "ce_loss_52": 1.4260056450963021, + "ce_loss_7": 2.7644225537776945, + "epoch": 0.739, + "grad_norm": 14.26013452282849, + "kl_loss_13": 2064.2, + "kl_loss_26": 971.8, + "kl_loss_39": 596.1, + "kl_loss_7": 2766.0, + "learning_rate": 0.00016191254986299043, + "loss": 3267.55, + "step": 7390 + }, + { + "ce_loss_13": 2.3748191058635713, + "ce_loss_26": 1.8720220893621444, + "ce_loss_39": 1.695397737622261, + "ce_loss_52": 1.3954048216342927, + "ce_loss_7": 2.6974743723869326, + "epoch": 0.74, + "grad_norm": 14.042172223471859, + "kl_loss_13": 2036.0, + "kl_loss_26": 961.8, + "kl_loss_39": 589.15, + "kl_loss_7": 2719.6, + "learning_rate": 0.00016074529422143398, + "loss": 3237.3, + "step": 7400 + }, + { + "ce_loss_13": 2.504778665304184, + "ce_loss_26": 1.9736295342445374, + "ce_loss_39": 1.778898686170578, + "ce_loss_52": 1.458841660618782, + "ce_loss_7": 2.830203241109848, + "epoch": 0.741, + "grad_norm": 14.817704298873846, + "kl_loss_13": 2137.6, + "kl_loss_26": 1026.7, + "kl_loss_39": 628.9, + "kl_loss_7": 2830.4, + "learning_rate": 0.0001595814548709983, + "loss": 3256.85, + "step": 7410 + }, + { + "ce_loss_13": 2.457485032081604, + "ce_loss_26": 1.955030158162117, + "ce_loss_39": 1.7744358479976654, + "ce_loss_52": 1.4638055652379989, + "ce_loss_7": 2.7708797633647917, + "epoch": 0.742, + "grad_norm": 13.847929994452544, + "kl_loss_13": 2053.2, + "kl_loss_26": 989.3, + "kl_loss_39": 610.65, + "kl_loss_7": 2726.4, + "learning_rate": 0.00015842104353153285, + "loss": 3240.25, + "step": 7420 + }, + { + "ce_loss_13": 2.5232761919498445, + "ce_loss_26": 1.9848549604415893, + "ce_loss_39": 1.7907847046852112, + "ce_loss_52": 1.473637193441391, + "ce_loss_7": 2.8558732092380525, + "epoch": 0.743, + "grad_norm": 14.575648272616709, + "kl_loss_13": 2149.6, + "kl_loss_26": 1018.4, + "kl_loss_39": 623.4, + "kl_loss_7": 2838.8, + "learning_rate": 0.0001572640718883667, + "loss": 3254.8, + "step": 7430 + }, + { + "ce_loss_13": 2.4647684305906297, + "ce_loss_26": 1.9386949807405471, + "ce_loss_39": 1.7500061064958572, + "ce_loss_52": 1.4324503019452095, + "ce_loss_7": 2.7884095788002012, + "epoch": 0.744, + "grad_norm": 14.394764150644365, + "kl_loss_13": 2108.4, + "kl_loss_26": 1000.0, + "kl_loss_39": 616.55, + "kl_loss_7": 2791.2, + "learning_rate": 0.0001561105515921915, + "loss": 3224.3, + "step": 7440 + }, + { + "ce_loss_13": 2.441458174586296, + "ce_loss_26": 1.9374703764915466, + "ce_loss_39": 1.7534300208091735, + "ce_loss_52": 1.4378804206848144, + "ce_loss_7": 2.7636309385299684, + "epoch": 0.745, + "grad_norm": 14.678295349282738, + "kl_loss_13": 2068.2, + "kl_loss_26": 997.1, + "kl_loss_39": 620.85, + "kl_loss_7": 2745.6, + "learning_rate": 0.0001549604942589441, + "loss": 3227.25, + "step": 7450 + }, + { + "ce_loss_13": 2.4308183819055555, + "ce_loss_26": 1.9100747764110566, + "ce_loss_39": 1.7246440201997757, + "ce_loss_52": 1.409556159377098, + "ce_loss_7": 2.7695399791002275, + "epoch": 0.746, + "grad_norm": 14.694656979242655, + "kl_loss_13": 2094.8, + "kl_loss_26": 993.0, + "kl_loss_39": 612.55, + "kl_loss_7": 2804.8, + "learning_rate": 0.00015381391146968864, + "loss": 3249.4, + "step": 7460 + }, + { + "ce_loss_13": 2.462578612565994, + "ce_loss_26": 1.940753996372223, + "ce_loss_39": 1.7554692894220352, + "ce_loss_52": 1.441029006242752, + "ce_loss_7": 2.785427051782608, + "epoch": 0.747, + "grad_norm": 14.412450315252437, + "kl_loss_13": 2103.6, + "kl_loss_26": 1003.3, + "kl_loss_39": 620.4, + "kl_loss_7": 2793.2, + "learning_rate": 0.00015267081477050133, + "loss": 3242.1, + "step": 7470 + }, + { + "ce_loss_13": 2.436378574371338, + "ce_loss_26": 1.9284409761428833, + "ce_loss_39": 1.7525635540485383, + "ce_loss_52": 1.443886636197567, + "ce_loss_7": 2.762845513224602, + "epoch": 0.748, + "grad_norm": 14.082696745240801, + "kl_loss_13": 2056.6, + "kl_loss_26": 983.1, + "kl_loss_39": 607.7, + "kl_loss_7": 2742.6, + "learning_rate": 0.00015153121567235335, + "loss": 3260.75, + "step": 7480 + }, + { + "ce_loss_13": 2.4219354510307314, + "ce_loss_26": 1.9070833683013917, + "ce_loss_39": 1.727812445163727, + "ce_loss_52": 1.4214952304959296, + "ce_loss_7": 2.751905006170273, + "epoch": 0.749, + "grad_norm": 14.604071674011259, + "kl_loss_13": 2073.4, + "kl_loss_26": 983.2, + "kl_loss_39": 609.55, + "kl_loss_7": 2766.8, + "learning_rate": 0.00015039512565099468, + "loss": 3240.15, + "step": 7490 + }, + { + "ce_loss_13": 2.4254602432250976, + "ce_loss_26": 1.9135964632034301, + "ce_loss_39": 1.7295757800340652, + "ce_loss_52": 1.423691214621067, + "ce_loss_7": 2.7512109965085982, + "epoch": 0.75, + "grad_norm": 13.87053452241645, + "kl_loss_13": 2059.4, + "kl_loss_26": 978.6, + "kl_loss_39": 597.45, + "kl_loss_7": 2750.4, + "learning_rate": 0.00014926255614683932, + "loss": 3260.75, + "step": 7500 + }, + { + "ce_loss_13": 2.44639810025692, + "ce_loss_26": 1.93405482172966, + "ce_loss_39": 1.7547091454267503, + "ce_loss_52": 1.4416350960731505, + "ce_loss_7": 2.768189311027527, + "epoch": 0.751, + "grad_norm": 14.071002297078877, + "kl_loss_13": 2088.8, + "kl_loss_26": 995.6, + "kl_loss_39": 616.7, + "kl_loss_7": 2768.8, + "learning_rate": 0.0001481335185648498, + "loss": 3269.45, + "step": 7510 + }, + { + "ce_loss_13": 2.496452784538269, + "ce_loss_26": 1.9704186409711837, + "ce_loss_39": 1.784249845147133, + "ce_loss_52": 1.4717927530407906, + "ce_loss_7": 2.8235138654708862, + "epoch": 0.752, + "grad_norm": 14.017187066675143, + "kl_loss_13": 2081.6, + "kl_loss_26": 989.5, + "kl_loss_39": 608.0, + "kl_loss_7": 2766.4, + "learning_rate": 0.0001470080242744218, + "loss": 3222.85, + "step": 7520 + }, + { + "ce_loss_13": 2.4962650299072267, + "ce_loss_26": 1.98213948905468, + "ce_loss_39": 1.7954594939947128, + "ce_loss_52": 1.480476987361908, + "ce_loss_7": 2.827346932888031, + "epoch": 0.753, + "grad_norm": 14.186670012527646, + "kl_loss_13": 2094.8, + "kl_loss_26": 998.5, + "kl_loss_39": 611.15, + "kl_loss_7": 2785.6, + "learning_rate": 0.0001458860846092705, + "loss": 3232.0, + "step": 7530 + }, + { + "ce_loss_13": 2.4669371783733367, + "ce_loss_26": 1.9375512719154357, + "ce_loss_39": 1.7506729423999787, + "ce_loss_52": 1.4334673672914504, + "ce_loss_7": 2.7915061593055723, + "epoch": 0.754, + "grad_norm": 14.32315966365105, + "kl_loss_13": 2109.0, + "kl_loss_26": 996.8, + "kl_loss_39": 613.7, + "kl_loss_7": 2794.0, + "learning_rate": 0.00014476771086731566, + "loss": 3264.6, + "step": 7540 + }, + { + "ce_loss_13": 2.4759989261627195, + "ce_loss_26": 1.9555140793323518, + "ce_loss_39": 1.7739178657531738, + "ce_loss_52": 1.4679069191217422, + "ce_loss_7": 2.795573103427887, + "epoch": 0.755, + "grad_norm": 14.011488201466985, + "kl_loss_13": 2070.6, + "kl_loss_26": 974.7, + "kl_loss_39": 593.9, + "kl_loss_7": 2754.8, + "learning_rate": 0.00014365291431056872, + "loss": 3256.8, + "step": 7550 + }, + { + "ce_loss_13": 2.424694412946701, + "ce_loss_26": 1.903841146826744, + "ce_loss_39": 1.7249469131231308, + "ce_loss_52": 1.4185307189822196, + "ce_loss_7": 2.7595690310001375, + "epoch": 0.756, + "grad_norm": 14.79387989637837, + "kl_loss_13": 2093.2, + "kl_loss_26": 989.9, + "kl_loss_39": 610.5, + "kl_loss_7": 2791.6, + "learning_rate": 0.00014254170616501827, + "loss": 3235.5, + "step": 7560 + }, + { + "ce_loss_13": 2.4660239934921266, + "ce_loss_26": 1.9451529324054717, + "ce_loss_39": 1.7564547389745713, + "ce_loss_52": 1.4434847444295884, + "ce_loss_7": 2.7976350009441378, + "epoch": 0.757, + "grad_norm": 14.844517268447264, + "kl_loss_13": 2085.6, + "kl_loss_26": 981.2, + "kl_loss_39": 602.45, + "kl_loss_7": 2776.4, + "learning_rate": 0.0001414340976205183, + "loss": 3204.2, + "step": 7570 + }, + { + "ce_loss_13": 2.4295350134372713, + "ce_loss_26": 1.921643227338791, + "ce_loss_39": 1.738620987534523, + "ce_loss_52": 1.4355527609586716, + "ce_loss_7": 2.7507594347000124, + "epoch": 0.758, + "grad_norm": 14.398235424743639, + "kl_loss_13": 2045.8, + "kl_loss_26": 965.6, + "kl_loss_39": 594.6, + "kl_loss_7": 2732.4, + "learning_rate": 0.00014033009983067452, + "loss": 3240.7, + "step": 7580 + }, + { + "ce_loss_13": 2.4676227152347563, + "ce_loss_26": 1.9401549130678177, + "ce_loss_39": 1.7562287330627442, + "ce_loss_52": 1.4347741633653641, + "ce_loss_7": 2.8093821585178373, + "epoch": 0.759, + "grad_norm": 13.736433284616705, + "kl_loss_13": 2138.8, + "kl_loss_26": 1028.1, + "kl_loss_39": 641.7, + "kl_loss_7": 2851.2, + "learning_rate": 0.00013922972391273224, + "loss": 3240.15, + "step": 7590 + }, + { + "ce_loss_13": 2.491154599189758, + "ce_loss_26": 1.9668046951293945, + "ce_loss_39": 1.7748177736997603, + "ce_loss_52": 1.4488209426403045, + "ce_loss_7": 2.8212892413139343, + "epoch": 0.76, + "grad_norm": 14.65229593288616, + "kl_loss_13": 2140.6, + "kl_loss_26": 1032.9, + "kl_loss_39": 637.7, + "kl_loss_7": 2829.6, + "learning_rate": 0.0001381329809474649, + "loss": 3239.9, + "step": 7600 + }, + { + "ce_loss_13": 2.3942853659391403, + "ce_loss_26": 1.892151090502739, + "ce_loss_39": 1.7125076562166215, + "ce_loss_52": 1.4143452048301697, + "ce_loss_7": 2.720611757040024, + "epoch": 0.761, + "grad_norm": 13.295903354405345, + "kl_loss_13": 2008.0, + "kl_loss_26": 952.3, + "kl_loss_39": 583.5, + "kl_loss_7": 2686.4, + "learning_rate": 0.0001370398819790621, + "loss": 3228.6, + "step": 7610 + }, + { + "ce_loss_13": 2.48261901140213, + "ce_loss_26": 1.966281446814537, + "ce_loss_39": 1.7803379833698272, + "ce_loss_52": 1.4673886984586715, + "ce_loss_7": 2.805577594041824, + "epoch": 0.762, + "grad_norm": 14.322747311567188, + "kl_loss_13": 2093.4, + "kl_loss_26": 1000.3, + "kl_loss_39": 610.3, + "kl_loss_7": 2776.0, + "learning_rate": 0.00013595043801501794, + "loss": 3201.5, + "step": 7620 + }, + { + "ce_loss_13": 2.443099784851074, + "ce_loss_26": 1.9284921824932098, + "ce_loss_39": 1.7422718316316606, + "ce_loss_52": 1.435012650489807, + "ce_loss_7": 2.7720457434654238, + "epoch": 0.763, + "grad_norm": 14.405471822802745, + "kl_loss_13": 2082.6, + "kl_loss_26": 994.1, + "kl_loss_39": 608.55, + "kl_loss_7": 2773.2, + "learning_rate": 0.00013486466002602133, + "loss": 3225.725, + "step": 7630 + }, + { + "ce_loss_13": 2.37467542886734, + "ce_loss_26": 1.8506677508354188, + "ce_loss_39": 1.6710956811904907, + "ce_loss_52": 1.3840662211179733, + "ce_loss_7": 2.7066759169101715, + "epoch": 0.764, + "grad_norm": 13.948958944433121, + "kl_loss_13": 2038.6, + "kl_loss_26": 946.7, + "kl_loss_39": 574.05, + "kl_loss_7": 2722.4, + "learning_rate": 0.00013378255894584462, + "loss": 3167.8, + "step": 7640 + }, + { + "ce_loss_13": 2.446861132979393, + "ce_loss_26": 1.934667894244194, + "ce_loss_39": 1.7494839936494828, + "ce_loss_52": 1.4413674265146255, + "ce_loss_7": 2.7717737197875976, + "epoch": 0.765, + "grad_norm": 14.489695554621445, + "kl_loss_13": 2087.6, + "kl_loss_26": 996.2, + "kl_loss_39": 612.05, + "kl_loss_7": 2779.4, + "learning_rate": 0.0001327041456712334, + "loss": 3229.05, + "step": 7650 + }, + { + "ce_loss_13": 2.514678430557251, + "ce_loss_26": 1.9890475004911423, + "ce_loss_39": 1.8049181282520295, + "ce_loss_52": 1.4897184193134307, + "ce_loss_7": 2.8373226463794707, + "epoch": 0.766, + "grad_norm": 13.809410965696319, + "kl_loss_13": 2109.8, + "kl_loss_26": 1013.7, + "kl_loss_39": 623.8, + "kl_loss_7": 2784.8, + "learning_rate": 0.00013162943106179747, + "loss": 3248.2, + "step": 7660 + }, + { + "ce_loss_13": 2.4804063200950623, + "ce_loss_26": 1.9478756994009019, + "ce_loss_39": 1.765100008249283, + "ce_loss_52": 1.446313591301441, + "ce_loss_7": 2.814295369386673, + "epoch": 0.767, + "grad_norm": 14.599429508154355, + "kl_loss_13": 2147.0, + "kl_loss_26": 1023.2, + "kl_loss_39": 636.45, + "kl_loss_7": 2854.0, + "learning_rate": 0.00013055842593990132, + "loss": 3217.4, + "step": 7670 + }, + { + "ce_loss_13": 2.4887916058301927, + "ce_loss_26": 1.9737455695867538, + "ce_loss_39": 1.7851827770471573, + "ce_loss_52": 1.4613285958766937, + "ce_loss_7": 2.8164610981941225, + "epoch": 0.768, + "grad_norm": 14.229376114254006, + "kl_loss_13": 2127.6, + "kl_loss_26": 1032.8, + "kl_loss_39": 640.5, + "kl_loss_7": 2811.2, + "learning_rate": 0.00012949114109055414, + "loss": 3223.675, + "step": 7680 + }, + { + "ce_loss_13": 2.389929732680321, + "ce_loss_26": 1.8889827966690063, + "ce_loss_39": 1.7090455144643784, + "ce_loss_52": 1.4120649307966233, + "ce_loss_7": 2.710143965482712, + "epoch": 0.769, + "grad_norm": 13.823270022584358, + "kl_loss_13": 2025.6, + "kl_loss_26": 958.6, + "kl_loss_39": 589.95, + "kl_loss_7": 2700.0, + "learning_rate": 0.00012842758726130281, + "loss": 3247.75, + "step": 7690 + }, + { + "ce_loss_13": 2.444611003994942, + "ce_loss_26": 1.9318826824426651, + "ce_loss_39": 1.7527276873588562, + "ce_loss_52": 1.4516576603055, + "ce_loss_7": 2.7657779157161713, + "epoch": 0.77, + "grad_norm": 14.431273089148151, + "kl_loss_13": 2049.2, + "kl_loss_26": 963.2, + "kl_loss_39": 589.25, + "kl_loss_7": 2726.0, + "learning_rate": 0.00012736777516212267, + "loss": 3216.75, + "step": 7700 + }, + { + "ce_loss_13": 2.441952568292618, + "ce_loss_26": 1.9228222370147705, + "ce_loss_39": 1.7338123947381974, + "ce_loss_52": 1.4193892806768418, + "ce_loss_7": 2.7718379318714144, + "epoch": 0.771, + "grad_norm": 13.661830756949985, + "kl_loss_13": 2115.2, + "kl_loss_26": 1008.4, + "kl_loss_39": 620.0, + "kl_loss_7": 2804.8, + "learning_rate": 0.00012631171546530968, + "loss": 3199.55, + "step": 7710 + }, + { + "ce_loss_13": 2.4535767167806624, + "ce_loss_26": 1.9207569301128387, + "ce_loss_39": 1.7325547844171525, + "ce_loss_52": 1.4173025369644165, + "ce_loss_7": 2.7801734030246736, + "epoch": 0.772, + "grad_norm": 14.176576196561767, + "kl_loss_13": 2111.8, + "kl_loss_26": 1005.2, + "kl_loss_39": 618.15, + "kl_loss_7": 2793.6, + "learning_rate": 0.00012525941880537307, + "loss": 3214.15, + "step": 7720 + }, + { + "ce_loss_13": 2.4484546184539795, + "ce_loss_26": 1.9310883104801178, + "ce_loss_39": 1.7474435329437257, + "ce_loss_52": 1.4392137452960014, + "ce_loss_7": 2.779611772298813, + "epoch": 0.773, + "grad_norm": 14.626780180795521, + "kl_loss_13": 2095.4, + "kl_loss_26": 994.8, + "kl_loss_39": 607.9, + "kl_loss_7": 2786.8, + "learning_rate": 0.00012421089577892869, + "loss": 3191.6, + "step": 7730 + }, + { + "ce_loss_13": 2.463806739449501, + "ce_loss_26": 1.9203673034906388, + "ce_loss_39": 1.7268804877996444, + "ce_loss_52": 1.4067875519394875, + "ce_loss_7": 2.797054660320282, + "epoch": 0.774, + "grad_norm": 14.221427151080144, + "kl_loss_13": 2151.0, + "kl_loss_26": 1032.4, + "kl_loss_39": 637.25, + "kl_loss_7": 2842.0, + "learning_rate": 0.0001231661569445919, + "loss": 3214.8, + "step": 7740 + }, + { + "ce_loss_13": 2.4840691089630127, + "ce_loss_26": 1.9805045217275619, + "ce_loss_39": 1.7966417849063874, + "ce_loss_52": 1.4883142501115798, + "ce_loss_7": 2.8029735326766967, + "epoch": 0.775, + "grad_norm": 14.614162489546528, + "kl_loss_13": 2069.8, + "kl_loss_26": 990.4, + "kl_loss_39": 609.05, + "kl_loss_7": 2743.2, + "learning_rate": 0.00012212521282287093, + "loss": 3200.5, + "step": 7750 + }, + { + "ce_loss_13": 2.4842973172664644, + "ce_loss_26": 1.9586603373289109, + "ce_loss_39": 1.767923679947853, + "ce_loss_52": 1.450934961438179, + "ce_loss_7": 2.815416473150253, + "epoch": 0.776, + "grad_norm": 14.872662321169154, + "kl_loss_13": 2137.2, + "kl_loss_26": 1028.5, + "kl_loss_39": 636.95, + "kl_loss_7": 2826.4, + "learning_rate": 0.00012108807389606158, + "loss": 3221.25, + "step": 7760 + }, + { + "ce_loss_13": 2.430084604024887, + "ce_loss_26": 1.9105432122945785, + "ce_loss_39": 1.7237805485725404, + "ce_loss_52": 1.419256439805031, + "ce_loss_7": 2.7609162449836733, + "epoch": 0.777, + "grad_norm": 14.122349060255786, + "kl_loss_13": 2075.2, + "kl_loss_26": 984.9, + "kl_loss_39": 602.95, + "kl_loss_7": 2769.2, + "learning_rate": 0.00012005475060814159, + "loss": 3219.35, + "step": 7770 + }, + { + "ce_loss_13": 2.4920803755521774, + "ce_loss_26": 1.977930763363838, + "ce_loss_39": 1.792539432644844, + "ce_loss_52": 1.484375348687172, + "ce_loss_7": 2.818430072069168, + "epoch": 0.778, + "grad_norm": 14.838228117967187, + "kl_loss_13": 2079.2, + "kl_loss_26": 984.5, + "kl_loss_39": 603.55, + "kl_loss_7": 2766.8, + "learning_rate": 0.00011902525336466464, + "loss": 3193.3, + "step": 7780 + }, + { + "ce_loss_13": 2.4326795816421507, + "ce_loss_26": 1.9094915211200714, + "ce_loss_39": 1.7265714228153228, + "ce_loss_52": 1.4181353628635407, + "ce_loss_7": 2.757854151725769, + "epoch": 0.779, + "grad_norm": 13.98078513544715, + "kl_loss_13": 2078.4, + "kl_loss_26": 991.0, + "kl_loss_39": 610.15, + "kl_loss_7": 2758.8, + "learning_rate": 0.00011799959253265668, + "loss": 3208.85, + "step": 7790 + }, + { + "ce_loss_13": 2.42523156106472, + "ce_loss_26": 1.9246951520442963, + "ce_loss_39": 1.7428549587726594, + "ce_loss_52": 1.4467457503080368, + "ce_loss_7": 2.7435911536216735, + "epoch": 0.78, + "grad_norm": 13.811296718067993, + "kl_loss_13": 2022.0, + "kl_loss_26": 950.5, + "kl_loss_39": 578.7, + "kl_loss_7": 2701.6, + "learning_rate": 0.00011697777844051105, + "loss": 3204.7, + "step": 7800 + }, + { + "ce_loss_13": 2.496878683567047, + "ce_loss_26": 1.9677572190761565, + "ce_loss_39": 1.7739976853132249, + "ce_loss_52": 1.459720864892006, + "ce_loss_7": 2.8327562749385833, + "epoch": 0.781, + "grad_norm": 14.08625224164901, + "kl_loss_13": 2120.4, + "kl_loss_26": 1008.7, + "kl_loss_39": 615.4, + "kl_loss_7": 2822.4, + "learning_rate": 0.00011595982137788402, + "loss": 3198.55, + "step": 7810 + }, + { + "ce_loss_13": 2.4920260161161423, + "ce_loss_26": 1.9670526027679442, + "ce_loss_39": 1.783473041653633, + "ce_loss_52": 1.4704290598630905, + "ce_loss_7": 2.8202459871768952, + "epoch": 0.782, + "grad_norm": 14.200442027165447, + "kl_loss_13": 2103.8, + "kl_loss_26": 1004.1, + "kl_loss_39": 624.45, + "kl_loss_7": 2800.4, + "learning_rate": 0.00011494573159559212, + "loss": 3223.6, + "step": 7820 + }, + { + "ce_loss_13": 2.4327739059925078, + "ce_loss_26": 1.9302410751581192, + "ce_loss_39": 1.7492202669382095, + "ce_loss_52": 1.4421478152275085, + "ce_loss_7": 2.7496786177158357, + "epoch": 0.783, + "grad_norm": 13.561882382659508, + "kl_loss_13": 2046.4, + "kl_loss_26": 985.3, + "kl_loss_39": 604.05, + "kl_loss_7": 2712.8, + "learning_rate": 0.00011393551930550828, + "loss": 3172.1, + "step": 7830 + }, + { + "ce_loss_13": 2.456186518073082, + "ce_loss_26": 1.9265149384737015, + "ce_loss_39": 1.7347608864307404, + "ce_loss_52": 1.4248275607824326, + "ce_loss_7": 2.7833105325698853, + "epoch": 0.784, + "grad_norm": 14.152082617728679, + "kl_loss_13": 2109.0, + "kl_loss_26": 1006.1, + "kl_loss_39": 611.45, + "kl_loss_7": 2800.8, + "learning_rate": 0.00011292919468045875, + "loss": 3208.05, + "step": 7840 + }, + { + "ce_loss_13": 2.4463070958852766, + "ce_loss_26": 1.9245142936706543, + "ce_loss_39": 1.736380136013031, + "ce_loss_52": 1.4392430812120438, + "ce_loss_7": 2.776514196395874, + "epoch": 0.785, + "grad_norm": 13.366541015007158, + "kl_loss_13": 2080.4, + "kl_loss_26": 983.3, + "kl_loss_39": 597.9, + "kl_loss_7": 2776.4, + "learning_rate": 0.00011192676785412154, + "loss": 3185.35, + "step": 7850 + }, + { + "ce_loss_13": 2.421262636780739, + "ce_loss_26": 1.9249890923500061, + "ce_loss_39": 1.7500147104263306, + "ce_loss_52": 1.4561963319778441, + "ce_loss_7": 2.741842967271805, + "epoch": 0.786, + "grad_norm": 15.122779815741898, + "kl_loss_13": 1987.4, + "kl_loss_26": 937.4, + "kl_loss_39": 570.05, + "kl_loss_7": 2649.2, + "learning_rate": 0.00011092824892092374, + "loss": 3155.1, + "step": 7860 + }, + { + "ce_loss_13": 2.528256595134735, + "ce_loss_26": 2.005132633447647, + "ce_loss_39": 1.8220074683427812, + "ce_loss_52": 1.49842167198658, + "ce_loss_7": 2.8524305701255797, + "epoch": 0.787, + "grad_norm": 14.392088598180058, + "kl_loss_13": 2135.0, + "kl_loss_26": 1024.4, + "kl_loss_39": 633.4, + "kl_loss_7": 2828.0, + "learning_rate": 0.0001099336479359398, + "loss": 3228.45, + "step": 7870 + }, + { + "ce_loss_13": 2.4351921498775484, + "ce_loss_26": 1.9118896454572678, + "ce_loss_39": 1.7326159566640853, + "ce_loss_52": 1.43462935090065, + "ce_loss_7": 2.7571564972400666, + "epoch": 0.788, + "grad_norm": 14.109231297319436, + "kl_loss_13": 2083.4, + "kl_loss_26": 971.8, + "kl_loss_39": 593.25, + "kl_loss_7": 2767.2, + "learning_rate": 0.00010894297491479043, + "loss": 3224.35, + "step": 7880 + }, + { + "ce_loss_13": 2.4037249386310577, + "ce_loss_26": 1.8971479564905167, + "ce_loss_39": 1.713542652130127, + "ce_loss_52": 1.416736051440239, + "ce_loss_7": 2.7244735300540923, + "epoch": 0.789, + "grad_norm": 14.646464194445937, + "kl_loss_13": 2027.0, + "kl_loss_26": 959.6, + "kl_loss_39": 587.05, + "kl_loss_7": 2694.8, + "learning_rate": 0.00010795623983354214, + "loss": 3163.9, + "step": 7890 + }, + { + "ce_loss_13": 2.462240958213806, + "ce_loss_26": 1.9522877007722854, + "ce_loss_39": 1.7622255086898804, + "ce_loss_52": 1.447747752070427, + "ce_loss_7": 2.781180214881897, + "epoch": 0.79, + "grad_norm": 14.486453301055112, + "kl_loss_13": 2093.8, + "kl_loss_26": 999.0, + "kl_loss_39": 616.7, + "kl_loss_7": 2768.0, + "learning_rate": 0.00010697345262860636, + "loss": 3189.25, + "step": 7900 + }, + { + "ce_loss_13": 2.443204700946808, + "ce_loss_26": 1.9252238601446152, + "ce_loss_39": 1.7538990557193757, + "ce_loss_52": 1.4549198508262635, + "ce_loss_7": 2.762816107273102, + "epoch": 0.791, + "grad_norm": 14.756973656564183, + "kl_loss_13": 2043.6, + "kl_loss_26": 953.1, + "kl_loss_39": 582.75, + "kl_loss_7": 2730.4, + "learning_rate": 0.00010599462319663906, + "loss": 3189.25, + "step": 7910 + }, + { + "ce_loss_13": 2.4734450757503508, + "ce_loss_26": 1.9476019829511642, + "ce_loss_39": 1.7569423377513886, + "ce_loss_52": 1.448093169927597, + "ce_loss_7": 2.7963216602802277, + "epoch": 0.792, + "grad_norm": 14.148651159008981, + "kl_loss_13": 2096.8, + "kl_loss_26": 997.8, + "kl_loss_39": 608.35, + "kl_loss_7": 2784.0, + "learning_rate": 0.00010501976139444191, + "loss": 3199.1, + "step": 7920 + }, + { + "ce_loss_13": 2.460918265581131, + "ce_loss_26": 1.9421131610870361, + "ce_loss_39": 1.7556595474481582, + "ce_loss_52": 1.4451121121644974, + "ce_loss_7": 2.7843388080596925, + "epoch": 0.793, + "grad_norm": 14.590509352597412, + "kl_loss_13": 2099.4, + "kl_loss_26": 995.3, + "kl_loss_39": 605.2, + "kl_loss_7": 2780.8, + "learning_rate": 0.0001040488770388625, + "loss": 3203.35, + "step": 7930 + }, + { + "ce_loss_13": 2.379360908269882, + "ce_loss_26": 1.8618569314479827, + "ce_loss_39": 1.6797795861959457, + "ce_loss_52": 1.3791978135704994, + "ce_loss_7": 2.708191817998886, + "epoch": 0.794, + "grad_norm": 14.16954347608881, + "kl_loss_13": 2053.0, + "kl_loss_26": 969.8, + "kl_loss_39": 592.9, + "kl_loss_7": 2740.8, + "learning_rate": 0.00010308197990669538, + "loss": 3181.45, + "step": 7940 + }, + { + "ce_loss_13": 2.4218691647052766, + "ce_loss_26": 1.9039832711219788, + "ce_loss_39": 1.7168581753969192, + "ce_loss_52": 1.414185357093811, + "ce_loss_7": 2.7429304718971252, + "epoch": 0.795, + "grad_norm": 13.813266382907072, + "kl_loss_13": 2088.6, + "kl_loss_26": 990.3, + "kl_loss_39": 604.5, + "kl_loss_7": 2776.4, + "learning_rate": 0.0001021190797345839, + "loss": 3178.1, + "step": 7950 + }, + { + "ce_loss_13": 2.487806275486946, + "ce_loss_26": 1.9796326756477356, + "ce_loss_39": 1.7957882821559905, + "ce_loss_52": 1.484969075024128, + "ce_loss_7": 2.8111346662044525, + "epoch": 0.796, + "grad_norm": 14.164838891881972, + "kl_loss_13": 2047.4, + "kl_loss_26": 971.5, + "kl_loss_39": 591.45, + "kl_loss_7": 2717.2, + "learning_rate": 0.00010116018621892236, + "loss": 3174.95, + "step": 7960 + }, + { + "ce_loss_13": 2.398695731163025, + "ce_loss_26": 1.8858011841773987, + "ce_loss_39": 1.7020757973194123, + "ce_loss_52": 1.4106510564684869, + "ce_loss_7": 2.721856439113617, + "epoch": 0.797, + "grad_norm": 14.161604323369735, + "kl_loss_13": 2034.0, + "kl_loss_26": 953.8, + "kl_loss_39": 575.1, + "kl_loss_7": 2712.8, + "learning_rate": 0.00010020530901575753, + "loss": 3177.95, + "step": 7970 + }, + { + "ce_loss_13": 2.427662065625191, + "ce_loss_26": 1.921607220172882, + "ce_loss_39": 1.741393145918846, + "ce_loss_52": 1.438681322336197, + "ce_loss_7": 2.746969664096832, + "epoch": 0.798, + "grad_norm": 14.770550097448835, + "kl_loss_13": 2042.0, + "kl_loss_26": 965.9, + "kl_loss_39": 592.1, + "kl_loss_7": 2705.2, + "learning_rate": 9.925445774069231e-05, + "loss": 3170.6, + "step": 7980 + }, + { + "ce_loss_13": 2.4366293847560883, + "ce_loss_26": 1.9163338214159011, + "ce_loss_39": 1.7320117831230164, + "ce_loss_52": 1.4146809190511704, + "ce_loss_7": 2.772968965768814, + "epoch": 0.799, + "grad_norm": 13.97785133684068, + "kl_loss_13": 2098.2, + "kl_loss_26": 1008.8, + "kl_loss_39": 619.4, + "kl_loss_7": 2795.6, + "learning_rate": 9.830764196878872e-05, + "loss": 3210.25, + "step": 7990 + }, + { + "ce_loss_13": 2.519176536798477, + "ce_loss_26": 2.0015997767448424, + "ce_loss_39": 1.8160304486751557, + "ce_loss_52": 1.4840710669755937, + "ce_loss_7": 2.842372918128967, + "epoch": 0.8, + "grad_norm": 13.949092199595771, + "kl_loss_13": 2133.0, + "kl_loss_26": 1036.3, + "kl_loss_39": 652.95, + "kl_loss_7": 2818.8, + "learning_rate": 9.736487123447069e-05, + "loss": 3181.95, + "step": 8000 + }, + { + "ce_loss_13": 2.4726769655942915, + "ce_loss_26": 1.9741164237260818, + "ce_loss_39": 1.790221494436264, + "ce_loss_52": 1.482371485233307, + "ce_loss_7": 2.799379500746727, + "epoch": 0.801, + "grad_norm": 13.707660286112029, + "kl_loss_13": 2038.0, + "kl_loss_26": 977.9, + "kl_loss_39": 599.8, + "kl_loss_7": 2716.8, + "learning_rate": 9.642615503142926e-05, + "loss": 3173.65, + "step": 8010 + }, + { + "ce_loss_13": 2.4063422054052355, + "ce_loss_26": 1.904910859465599, + "ce_loss_39": 1.7236665695905686, + "ce_loss_52": 1.4340474352240562, + "ce_loss_7": 2.726147544384003, + "epoch": 0.802, + "grad_norm": 14.759158260471319, + "kl_loss_13": 2004.0, + "kl_loss_26": 947.5, + "kl_loss_39": 570.8, + "kl_loss_7": 2678.8, + "learning_rate": 9.549150281252633e-05, + "loss": 3210.3, + "step": 8020 + }, + { + "ce_loss_13": 2.439296191930771, + "ce_loss_26": 1.930625182390213, + "ce_loss_39": 1.7471411645412445, + "ce_loss_52": 1.4468423128128052, + "ce_loss_7": 2.759518486261368, + "epoch": 0.803, + "grad_norm": 14.138193136913905, + "kl_loss_13": 2029.8, + "kl_loss_26": 959.0, + "kl_loss_39": 580.8, + "kl_loss_7": 2710.0, + "learning_rate": 9.4560923989699e-05, + "loss": 3188.85, + "step": 8030 + }, + { + "ce_loss_13": 2.383785030245781, + "ce_loss_26": 1.8730993419885635, + "ce_loss_39": 1.689489060640335, + "ce_loss_52": 1.3906694814562797, + "ce_loss_7": 2.715326648950577, + "epoch": 0.804, + "grad_norm": 14.525642481529378, + "kl_loss_13": 2063.0, + "kl_loss_26": 966.1, + "kl_loss_39": 591.6, + "kl_loss_7": 2756.4, + "learning_rate": 9.363442793386607e-05, + "loss": 3171.0, + "step": 8040 + }, + { + "ce_loss_13": 2.436983805894852, + "ce_loss_26": 1.9301572561264038, + "ce_loss_39": 1.7478452265262603, + "ce_loss_52": 1.4431490540504455, + "ce_loss_7": 2.7593387603759765, + "epoch": 0.805, + "grad_norm": 14.17334080276183, + "kl_loss_13": 2038.0, + "kl_loss_26": 967.0, + "kl_loss_39": 590.6, + "kl_loss_7": 2714.4, + "learning_rate": 9.271202397483213e-05, + "loss": 3157.4, + "step": 8050 + }, + { + "ce_loss_13": 2.4742675691843035, + "ce_loss_26": 1.9660158514976502, + "ce_loss_39": 1.7886695712804794, + "ce_loss_52": 1.4793777346611023, + "ce_loss_7": 2.799517345428467, + "epoch": 0.806, + "grad_norm": 14.807557523499636, + "kl_loss_13": 2047.0, + "kl_loss_26": 977.2, + "kl_loss_39": 606.0, + "kl_loss_7": 2719.6, + "learning_rate": 9.179372140119524e-05, + "loss": 3197.5, + "step": 8060 + }, + { + "ce_loss_13": 2.404207941889763, + "ce_loss_26": 1.8871434926986694, + "ce_loss_39": 1.7082382440567017, + "ce_loss_52": 1.4106020584702492, + "ce_loss_7": 2.7318927943706512, + "epoch": 0.807, + "grad_norm": 14.241838480847589, + "kl_loss_13": 2054.2, + "kl_loss_26": 966.9, + "kl_loss_39": 589.9, + "kl_loss_7": 2745.6, + "learning_rate": 9.087952946025175e-05, + "loss": 3174.15, + "step": 8070 + }, + { + "ce_loss_13": 2.405471110343933, + "ce_loss_26": 1.9090194314718247, + "ce_loss_39": 1.7282894462347032, + "ce_loss_52": 1.4298398733139037, + "ce_loss_7": 2.7295302629470823, + "epoch": 0.808, + "grad_norm": 14.246105109452872, + "kl_loss_13": 2023.6, + "kl_loss_26": 956.4, + "kl_loss_39": 585.5, + "kl_loss_7": 2704.0, + "learning_rate": 8.996945735790446e-05, + "loss": 3220.95, + "step": 8080 + }, + { + "ce_loss_13": 2.4230452179908752, + "ce_loss_26": 1.9108994454145432, + "ce_loss_39": 1.7252773225307465, + "ce_loss_52": 1.4297530561685563, + "ce_loss_7": 2.7479528963565825, + "epoch": 0.809, + "grad_norm": 14.241539859828608, + "kl_loss_13": 2056.4, + "kl_loss_26": 974.5, + "kl_loss_39": 592.4, + "kl_loss_7": 2738.0, + "learning_rate": 8.906351425856951e-05, + "loss": 3187.2, + "step": 8090 + }, + { + "ce_loss_13": 2.5193986773490904, + "ce_loss_26": 2.004713475704193, + "ce_loss_39": 1.8214319556951524, + "ce_loss_52": 1.5133182466030122, + "ce_loss_7": 2.836967188119888, + "epoch": 0.81, + "grad_norm": 13.856573501498156, + "kl_loss_13": 2059.0, + "kl_loss_26": 983.5, + "kl_loss_39": 604.75, + "kl_loss_7": 2738.8, + "learning_rate": 8.816170928508365e-05, + "loss": 3199.5, + "step": 8100 + }, + { + "ce_loss_13": 2.463338887691498, + "ce_loss_26": 1.9509768843650819, + "ce_loss_39": 1.7714523404836655, + "ce_loss_52": 1.4671026438474655, + "ce_loss_7": 2.7907890677452087, + "epoch": 0.811, + "grad_norm": 14.442792993010306, + "kl_loss_13": 2039.6, + "kl_loss_26": 963.7, + "kl_loss_39": 594.85, + "kl_loss_7": 2727.2, + "learning_rate": 8.7264051518613e-05, + "loss": 3182.6, + "step": 8110 + }, + { + "ce_loss_13": 2.35435933470726, + "ce_loss_26": 1.8515879094600678, + "ce_loss_39": 1.675445196032524, + "ce_loss_52": 1.3868303269147872, + "ce_loss_7": 2.6736503660678865, + "epoch": 0.812, + "grad_norm": 15.186200546439618, + "kl_loss_13": 2007.0, + "kl_loss_26": 943.9, + "kl_loss_39": 575.7, + "kl_loss_7": 2679.2, + "learning_rate": 8.637054999856148e-05, + "loss": 3182.4, + "step": 8120 + }, + { + "ce_loss_13": 2.4665314495563506, + "ce_loss_26": 1.9507400870323182, + "ce_loss_39": 1.7698681026697158, + "ce_loss_52": 1.463287603855133, + "ce_loss_7": 2.790166562795639, + "epoch": 0.813, + "grad_norm": 14.702441450046226, + "kl_loss_13": 2081.6, + "kl_loss_26": 994.3, + "kl_loss_39": 606.6, + "kl_loss_7": 2757.2, + "learning_rate": 8.548121372247918e-05, + "loss": 3195.8, + "step": 8130 + }, + { + "ce_loss_13": 2.4175081342458724, + "ce_loss_26": 1.894961017370224, + "ce_loss_39": 1.7168689727783204, + "ce_loss_52": 1.4184614822268486, + "ce_loss_7": 2.74496705532074, + "epoch": 0.814, + "grad_norm": 14.150480690009331, + "kl_loss_13": 2042.4, + "kl_loss_26": 956.7, + "kl_loss_39": 584.15, + "kl_loss_7": 2725.2, + "learning_rate": 8.459605164597267e-05, + "loss": 3148.95, + "step": 8140 + }, + { + "ce_loss_13": 2.4137402385473252, + "ce_loss_26": 1.907509195804596, + "ce_loss_39": 1.7280682563781737, + "ce_loss_52": 1.4408938705921173, + "ce_loss_7": 2.734111136198044, + "epoch": 0.815, + "grad_norm": 14.607407219816483, + "kl_loss_13": 2031.8, + "kl_loss_26": 952.0, + "kl_loss_39": 576.85, + "kl_loss_7": 2711.2, + "learning_rate": 8.371507268261436e-05, + "loss": 3141.15, + "step": 8150 + }, + { + "ce_loss_13": 2.45993629693985, + "ce_loss_26": 1.9489454805850983, + "ce_loss_39": 1.7687684744596481, + "ce_loss_52": 1.4672614842653275, + "ce_loss_7": 2.7731840908527374, + "epoch": 0.816, + "grad_norm": 13.941573996490533, + "kl_loss_13": 2049.4, + "kl_loss_26": 971.7, + "kl_loss_39": 590.15, + "kl_loss_7": 2723.6, + "learning_rate": 8.283828570385238e-05, + "loss": 3167.65, + "step": 8160 + }, + { + "ce_loss_13": 2.457903391122818, + "ce_loss_26": 1.9481427311897277, + "ce_loss_39": 1.7692535519599915, + "ce_loss_52": 1.473311385512352, + "ce_loss_7": 2.7900636374950407, + "epoch": 0.817, + "grad_norm": 13.820607552825622, + "kl_loss_13": 2034.8, + "kl_loss_26": 960.7, + "kl_loss_39": 581.95, + "kl_loss_7": 2732.8, + "learning_rate": 8.196569953892202e-05, + "loss": 3175.55, + "step": 8170 + }, + { + "ce_loss_13": 2.424106791615486, + "ce_loss_26": 1.9179951936006545, + "ce_loss_39": 1.7393042415380477, + "ce_loss_52": 1.4501032710075379, + "ce_loss_7": 2.7422122418880464, + "epoch": 0.818, + "grad_norm": 13.95954126780279, + "kl_loss_13": 2017.4, + "kl_loss_26": 939.3, + "kl_loss_39": 570.5, + "kl_loss_7": 2684.4, + "learning_rate": 8.109732297475635e-05, + "loss": 3172.8, + "step": 8180 + }, + { + "ce_loss_13": 2.4480927348136903, + "ce_loss_26": 1.9403320997953415, + "ce_loss_39": 1.7644436001777648, + "ce_loss_52": 1.4613411754369736, + "ce_loss_7": 2.767207592725754, + "epoch": 0.819, + "grad_norm": 15.077790655269643, + "kl_loss_13": 2034.2, + "kl_loss_26": 959.3, + "kl_loss_39": 589.55, + "kl_loss_7": 2707.6, + "learning_rate": 8.023316475589754e-05, + "loss": 3151.8, + "step": 8190 + }, + { + "ce_loss_13": 2.389411324262619, + "ce_loss_26": 1.8819621950387955, + "ce_loss_39": 1.7063862174749374, + "ce_loss_52": 1.4187449038028717, + "ce_loss_7": 2.7186341762542723, + "epoch": 0.82, + "grad_norm": 14.069790186558153, + "kl_loss_13": 2015.6, + "kl_loss_26": 934.0, + "kl_loss_39": 570.4, + "kl_loss_7": 2711.2, + "learning_rate": 7.937323358440934e-05, + "loss": 3158.45, + "step": 8200 + }, + { + "ce_loss_13": 2.463495451211929, + "ce_loss_26": 1.9460216015577316, + "ce_loss_39": 1.7633485794067383, + "ce_loss_52": 1.459425413608551, + "ce_loss_7": 2.7843497574329374, + "epoch": 0.821, + "grad_norm": 14.129619264599885, + "kl_loss_13": 2034.2, + "kl_loss_26": 962.3, + "kl_loss_39": 588.55, + "kl_loss_7": 2710.0, + "learning_rate": 7.851753811978923e-05, + "loss": 3172.55, + "step": 8210 + }, + { + "ce_loss_13": 2.3558076560497283, + "ce_loss_26": 1.8546594500541687, + "ce_loss_39": 1.6799951493740082, + "ce_loss_52": 1.392863529920578, + "ce_loss_7": 2.6725959718227386, + "epoch": 0.822, + "grad_norm": 13.287512232663138, + "kl_loss_13": 1979.6, + "kl_loss_26": 919.4, + "kl_loss_39": 562.4, + "kl_loss_7": 2652.0, + "learning_rate": 7.766608697888095e-05, + "loss": 3151.0, + "step": 8220 + }, + { + "ce_loss_13": 2.4153092801570892, + "ce_loss_26": 1.9061992377042771, + "ce_loss_39": 1.7255131870508194, + "ce_loss_52": 1.4212424442172051, + "ce_loss_7": 2.748347020149231, + "epoch": 0.823, + "grad_norm": 14.43300896159423, + "kl_loss_13": 2076.6, + "kl_loss_26": 977.9, + "kl_loss_39": 599.3, + "kl_loss_7": 2767.2, + "learning_rate": 7.681888873578785e-05, + "loss": 3171.1, + "step": 8230 + }, + { + "ce_loss_13": 2.4028283417224885, + "ce_loss_26": 1.9037913769483565, + "ce_loss_39": 1.729577499628067, + "ce_loss_52": 1.4313182592391969, + "ce_loss_7": 2.7214892983436583, + "epoch": 0.824, + "grad_norm": 14.091655498494992, + "kl_loss_13": 1997.8, + "kl_loss_26": 945.5, + "kl_loss_39": 582.95, + "kl_loss_7": 2668.0, + "learning_rate": 7.597595192178702e-05, + "loss": 3129.45, + "step": 8240 + }, + { + "ce_loss_13": 2.385217198729515, + "ce_loss_26": 1.8788419783115387, + "ce_loss_39": 1.7016818612813949, + "ce_loss_52": 1.4092606633901597, + "ce_loss_7": 2.714561605453491, + "epoch": 0.825, + "grad_norm": 14.092565500396708, + "kl_loss_13": 2015.4, + "kl_loss_26": 950.7, + "kl_loss_39": 578.35, + "kl_loss_7": 2700.8, + "learning_rate": 7.513728502524286e-05, + "loss": 3103.55, + "step": 8250 + }, + { + "ce_loss_13": 2.411863788962364, + "ce_loss_26": 1.8953818708658219, + "ce_loss_39": 1.712015947699547, + "ce_loss_52": 1.4228723630309106, + "ce_loss_7": 2.7335788309574127, + "epoch": 0.826, + "grad_norm": 14.683616091837887, + "kl_loss_13": 2027.2, + "kl_loss_26": 946.2, + "kl_loss_39": 569.45, + "kl_loss_7": 2700.8, + "learning_rate": 7.430289649152156e-05, + "loss": 3186.45, + "step": 8260 + }, + { + "ce_loss_13": 2.4488519340753556, + "ce_loss_26": 1.9516287744045258, + "ce_loss_39": 1.7749818950891494, + "ce_loss_52": 1.479728889465332, + "ce_loss_7": 2.7712768018245697, + "epoch": 0.827, + "grad_norm": 13.955547255495208, + "kl_loss_13": 1991.0, + "kl_loss_26": 940.5, + "kl_loss_39": 573.4, + "kl_loss_7": 2658.4, + "learning_rate": 7.347279472290646e-05, + "loss": 3163.475, + "step": 8270 + }, + { + "ce_loss_13": 2.3786238610744475, + "ce_loss_26": 1.8683661013841628, + "ce_loss_39": 1.690899032354355, + "ce_loss_52": 1.4009160608053208, + "ce_loss_7": 2.705441731214523, + "epoch": 0.828, + "grad_norm": 14.039626698915084, + "kl_loss_13": 2016.2, + "kl_loss_26": 938.6, + "kl_loss_39": 571.15, + "kl_loss_7": 2694.4, + "learning_rate": 7.264698807851328e-05, + "loss": 3118.5, + "step": 8280 + }, + { + "ce_loss_13": 2.466960498690605, + "ce_loss_26": 1.946975302696228, + "ce_loss_39": 1.7688733905553817, + "ce_loss_52": 1.4637437134981155, + "ce_loss_7": 2.8006490588188173, + "epoch": 0.829, + "grad_norm": 14.106474347437752, + "kl_loss_13": 2090.4, + "kl_loss_26": 984.1, + "kl_loss_39": 609.1, + "kl_loss_7": 2790.4, + "learning_rate": 7.182548487420554e-05, + "loss": 3184.4, + "step": 8290 + }, + { + "ce_loss_13": 2.5116629540920257, + "ce_loss_26": 1.9985181391239166, + "ce_loss_39": 1.8124066442251205, + "ce_loss_52": 1.497176530957222, + "ce_loss_7": 2.8335696399211883, + "epoch": 0.83, + "grad_norm": 14.139665843791814, + "kl_loss_13": 2098.8, + "kl_loss_26": 1001.4, + "kl_loss_39": 614.35, + "kl_loss_7": 2780.6, + "learning_rate": 7.100829338251146e-05, + "loss": 3198.35, + "step": 8300 + }, + { + "ce_loss_13": 2.4611269533634186, + "ce_loss_26": 1.9527796864509583, + "ce_loss_39": 1.7704098969697952, + "ce_loss_52": 1.462582492828369, + "ce_loss_7": 2.7841490387916563, + "epoch": 0.831, + "grad_norm": 13.992596562086698, + "kl_loss_13": 2062.6, + "kl_loss_26": 988.0, + "kl_loss_39": 605.35, + "kl_loss_7": 2743.2, + "learning_rate": 7.019542183254046e-05, + "loss": 3175.4, + "step": 8310 + }, + { + "ce_loss_13": 2.4340964376926424, + "ce_loss_26": 1.9217961221933364, + "ce_loss_39": 1.7397069931030273, + "ce_loss_52": 1.4361872345209121, + "ce_loss_7": 2.759904479980469, + "epoch": 0.832, + "grad_norm": 14.479445741622119, + "kl_loss_13": 2039.6, + "kl_loss_26": 971.9, + "kl_loss_39": 599.4, + "kl_loss_7": 2716.0, + "learning_rate": 6.938687840989971e-05, + "loss": 3159.45, + "step": 8320 + }, + { + "ce_loss_13": 2.4414653837680818, + "ce_loss_26": 1.928215390443802, + "ce_loss_39": 1.7369425565004348, + "ce_loss_52": 1.4351924806833267, + "ce_loss_7": 2.76216436624527, + "epoch": 0.833, + "grad_norm": 14.95621914365995, + "kl_loss_13": 2059.8, + "kl_loss_26": 983.2, + "kl_loss_39": 594.6, + "kl_loss_7": 2738.8, + "learning_rate": 6.858267125661271e-05, + "loss": 3174.0, + "step": 8330 + }, + { + "ce_loss_13": 2.400873589515686, + "ce_loss_26": 1.90281642973423, + "ce_loss_39": 1.7204748094081879, + "ce_loss_52": 1.4244474336504935, + "ce_loss_7": 2.7152935564517975, + "epoch": 0.834, + "grad_norm": 14.103644864595271, + "kl_loss_13": 2033.8, + "kl_loss_26": 973.5, + "kl_loss_39": 591.7, + "kl_loss_7": 2704.8, + "learning_rate": 6.778280847103668e-05, + "loss": 3170.05, + "step": 8340 + }, + { + "ce_loss_13": 2.374238893389702, + "ce_loss_26": 1.8607898473739624, + "ce_loss_39": 1.6831828862428666, + "ce_loss_52": 1.396483090519905, + "ce_loss_7": 2.704102611541748, + "epoch": 0.835, + "grad_norm": 14.599308778967789, + "kl_loss_13": 2031.4, + "kl_loss_26": 934.1, + "kl_loss_39": 566.7, + "kl_loss_7": 2719.2, + "learning_rate": 6.698729810778065e-05, + "loss": 3150.95, + "step": 8350 + }, + { + "ce_loss_13": 2.4648724853992463, + "ce_loss_26": 1.9494601666927338, + "ce_loss_39": 1.7671974629163743, + "ce_loss_52": 1.461349506676197, + "ce_loss_7": 2.794687694311142, + "epoch": 0.836, + "grad_norm": 14.476811719238219, + "kl_loss_13": 2060.2, + "kl_loss_26": 979.5, + "kl_loss_39": 601.55, + "kl_loss_7": 2750.8, + "learning_rate": 6.619614817762538e-05, + "loss": 3140.75, + "step": 8360 + }, + { + "ce_loss_13": 2.4076102912425994, + "ce_loss_26": 1.9041061371564865, + "ce_loss_39": 1.7215475410223007, + "ce_loss_52": 1.426313552260399, + "ce_loss_7": 2.729493075609207, + "epoch": 0.837, + "grad_norm": 14.66675091557762, + "kl_loss_13": 2014.4, + "kl_loss_26": 958.9, + "kl_loss_39": 585.55, + "kl_loss_7": 2689.6, + "learning_rate": 6.540936664744196e-05, + "loss": 3161.6, + "step": 8370 + }, + { + "ce_loss_13": 2.4151067316532133, + "ce_loss_26": 1.9024922668933868, + "ce_loss_39": 1.7205139189958571, + "ce_loss_52": 1.4315023928880692, + "ce_loss_7": 2.733920103311539, + "epoch": 0.838, + "grad_norm": 13.748708793526905, + "kl_loss_13": 2020.6, + "kl_loss_26": 957.3, + "kl_loss_39": 574.45, + "kl_loss_7": 2690.8, + "learning_rate": 6.462696144011149e-05, + "loss": 3148.0, + "step": 8380 + }, + { + "ce_loss_13": 2.4298708856105806, + "ce_loss_26": 1.9152013957500458, + "ce_loss_39": 1.737311202287674, + "ce_loss_52": 1.441886842250824, + "ce_loss_7": 2.757741445302963, + "epoch": 0.839, + "grad_norm": 14.560412200339597, + "kl_loss_13": 2019.6, + "kl_loss_26": 946.5, + "kl_loss_39": 576.85, + "kl_loss_7": 2709.2, + "learning_rate": 6.384894043444567e-05, + "loss": 3144.45, + "step": 8390 + }, + { + "ce_loss_13": 2.4371220886707308, + "ce_loss_26": 1.9178409904241562, + "ce_loss_39": 1.7318467199802399, + "ce_loss_52": 1.427680206298828, + "ce_loss_7": 2.7716069161891936, + "epoch": 0.84, + "grad_norm": 13.228002541168403, + "kl_loss_13": 2067.8, + "kl_loss_26": 990.0, + "kl_loss_39": 597.05, + "kl_loss_7": 2766.8, + "learning_rate": 6.307531146510753e-05, + "loss": 3145.15, + "step": 8400 + }, + { + "ce_loss_13": 2.4668938338756563, + "ce_loss_26": 1.9483750283718109, + "ce_loss_39": 1.7615112096071244, + "ce_loss_52": 1.4629206866025926, + "ce_loss_7": 2.7949269711971283, + "epoch": 0.841, + "grad_norm": 14.638289805261575, + "kl_loss_13": 2070.8, + "kl_loss_26": 979.1, + "kl_loss_39": 588.85, + "kl_loss_7": 2755.2, + "learning_rate": 6.230608232253226e-05, + "loss": 3135.55, + "step": 8410 + }, + { + "ce_loss_13": 2.498942193388939, + "ce_loss_26": 1.9719986289739608, + "ce_loss_39": 1.7773171186447143, + "ce_loss_52": 1.4589012682437896, + "ce_loss_7": 2.8206138908863068, + "epoch": 0.842, + "grad_norm": 14.547037792354297, + "kl_loss_13": 2143.2, + "kl_loss_26": 1027.6, + "kl_loss_39": 626.5, + "kl_loss_7": 2830.0, + "learning_rate": 6.154126075284855e-05, + "loss": 3179.05, + "step": 8420 + }, + { + "ce_loss_13": 2.339446923136711, + "ce_loss_26": 1.836980375647545, + "ce_loss_39": 1.66352079808712, + "ce_loss_52": 1.3777535080909729, + "ce_loss_7": 2.658161628246307, + "epoch": 0.843, + "grad_norm": 13.822498461755101, + "kl_loss_13": 2006.0, + "kl_loss_26": 933.6, + "kl_loss_39": 565.3, + "kl_loss_7": 2679.6, + "learning_rate": 6.078085445780129e-05, + "loss": 3158.075, + "step": 8430 + }, + { + "ce_loss_13": 2.4486551761627195, + "ce_loss_26": 1.9329589813947679, + "ce_loss_39": 1.7519038885831832, + "ce_loss_52": 1.4464032799005508, + "ce_loss_7": 2.776008838415146, + "epoch": 0.844, + "grad_norm": 13.676549255063142, + "kl_loss_13": 2050.2, + "kl_loss_26": 971.9, + "kl_loss_39": 595.05, + "kl_loss_7": 2741.2, + "learning_rate": 6.002487109467347e-05, + "loss": 3155.95, + "step": 8440 + }, + { + "ce_loss_13": 2.467064255475998, + "ce_loss_26": 1.9566147327423096, + "ce_loss_39": 1.7722377121448516, + "ce_loss_52": 1.4749930799007416, + "ce_loss_7": 2.7902898490428925, + "epoch": 0.845, + "grad_norm": 15.195098706872598, + "kl_loss_13": 2033.8, + "kl_loss_26": 959.4, + "kl_loss_39": 581.0, + "kl_loss_7": 2712.0, + "learning_rate": 5.927331827620902e-05, + "loss": 3169.7, + "step": 8450 + }, + { + "ce_loss_13": 2.389095312356949, + "ce_loss_26": 1.8802162408828735, + "ce_loss_39": 1.7042785853147506, + "ce_loss_52": 1.4126853346824646, + "ce_loss_7": 2.714692497253418, + "epoch": 0.846, + "grad_norm": 14.567703807315205, + "kl_loss_13": 2016.0, + "kl_loss_26": 940.7, + "kl_loss_39": 574.25, + "kl_loss_7": 2700.8, + "learning_rate": 5.852620357053651e-05, + "loss": 3111.0, + "step": 8460 + }, + { + "ce_loss_13": 2.4798492193222046, + "ce_loss_26": 1.9612985998392105, + "ce_loss_39": 1.7813422173261642, + "ce_loss_52": 1.4634388938546181, + "ce_loss_7": 2.8045433819293977, + "epoch": 0.847, + "grad_norm": 13.718944289269317, + "kl_loss_13": 2114.4, + "kl_loss_26": 1008.1, + "kl_loss_39": 624.7, + "kl_loss_7": 2801.2, + "learning_rate": 5.778353450109286e-05, + "loss": 3195.2, + "step": 8470 + }, + { + "ce_loss_13": 2.38386265039444, + "ce_loss_26": 1.8872032672166825, + "ce_loss_39": 1.710950767993927, + "ce_loss_52": 1.432640826702118, + "ce_loss_7": 2.694201183319092, + "epoch": 0.848, + "grad_norm": 14.259266485735452, + "kl_loss_13": 1968.6, + "kl_loss_26": 914.3, + "kl_loss_39": 549.1, + "kl_loss_7": 2623.2, + "learning_rate": 5.7045318546547206e-05, + "loss": 3137.025, + "step": 8480 + }, + { + "ce_loss_13": 2.427330991625786, + "ce_loss_26": 1.910456082224846, + "ce_loss_39": 1.7292529791593552, + "ce_loss_52": 1.4334994465112687, + "ce_loss_7": 2.7538663387298583, + "epoch": 0.849, + "grad_norm": 13.656054783413236, + "kl_loss_13": 2033.4, + "kl_loss_26": 954.9, + "kl_loss_39": 577.85, + "kl_loss_7": 2719.6, + "learning_rate": 5.631156314072605e-05, + "loss": 3150.65, + "step": 8490 + }, + { + "ce_loss_13": 2.471989703178406, + "ce_loss_26": 1.9505236119031906, + "ce_loss_39": 1.7668047726154328, + "ce_loss_52": 1.4608478724956513, + "ce_loss_7": 2.8063031315803526, + "epoch": 0.85, + "grad_norm": 13.681266151650567, + "kl_loss_13": 2080.4, + "kl_loss_26": 982.8, + "kl_loss_39": 600.35, + "kl_loss_7": 2780.8, + "learning_rate": 5.5582275672538315e-05, + "loss": 3137.95, + "step": 8500 + }, + { + "ce_loss_13": 2.4605359852313997, + "ce_loss_26": 1.939817100763321, + "ce_loss_39": 1.753285875916481, + "ce_loss_52": 1.4498123317956924, + "ce_loss_7": 2.7863478004932403, + "epoch": 0.851, + "grad_norm": 14.038129063761419, + "kl_loss_13": 2055.6, + "kl_loss_26": 978.1, + "kl_loss_39": 593.3, + "kl_loss_7": 2740.0, + "learning_rate": 5.4857463485900484e-05, + "loss": 3144.75, + "step": 8510 + }, + { + "ce_loss_13": 2.450565594434738, + "ce_loss_26": 1.9354894876480102, + "ce_loss_39": 1.7570757120847702, + "ce_loss_52": 1.458202052116394, + "ce_loss_7": 2.770776855945587, + "epoch": 0.852, + "grad_norm": 13.945411873449233, + "kl_loss_13": 2028.8, + "kl_loss_26": 956.4, + "kl_loss_39": 584.4, + "kl_loss_7": 2704.4, + "learning_rate": 5.413713387966329e-05, + "loss": 3147.35, + "step": 8520 + }, + { + "ce_loss_13": 2.384511134028435, + "ce_loss_26": 1.885845959186554, + "ce_loss_39": 1.7117790162563324, + "ce_loss_52": 1.4215580940246582, + "ce_loss_7": 2.6971611440181733, + "epoch": 0.853, + "grad_norm": 14.280539701015499, + "kl_loss_13": 1983.0, + "kl_loss_26": 937.9, + "kl_loss_39": 571.6, + "kl_loss_7": 2646.0, + "learning_rate": 5.34212941075381e-05, + "loss": 3138.7, + "step": 8530 + }, + { + "ce_loss_13": 2.4187089085578917, + "ce_loss_26": 1.9180882632732392, + "ce_loss_39": 1.7486219108104706, + "ce_loss_52": 1.4668725609779358, + "ce_loss_7": 2.7367011964321137, + "epoch": 0.854, + "grad_norm": 13.894895708827713, + "kl_loss_13": 1988.6, + "kl_loss_26": 918.2, + "kl_loss_39": 556.1, + "kl_loss_7": 2660.4, + "learning_rate": 5.270995137802315e-05, + "loss": 3116.45, + "step": 8540 + }, + { + "ce_loss_13": 2.415088692307472, + "ce_loss_26": 1.9041693419218064, + "ce_loss_39": 1.7267356216907501, + "ce_loss_52": 1.4279317557811737, + "ce_loss_7": 2.7469893753528596, + "epoch": 0.855, + "grad_norm": 14.31134935015859, + "kl_loss_13": 2040.4, + "kl_loss_26": 959.5, + "kl_loss_39": 585.9, + "kl_loss_7": 2737.2, + "learning_rate": 5.2003112854332125e-05, + "loss": 3108.3, + "step": 8550 + }, + { + "ce_loss_13": 2.461696755886078, + "ce_loss_26": 1.9468512892723084, + "ce_loss_39": 1.7631896048784257, + "ce_loss_52": 1.4654896438121796, + "ce_loss_7": 2.780461609363556, + "epoch": 0.856, + "grad_norm": 14.365329511248612, + "kl_loss_13": 2070.0, + "kl_loss_26": 974.7, + "kl_loss_39": 589.2, + "kl_loss_7": 2744.0, + "learning_rate": 5.130078565432089e-05, + "loss": 3173.9, + "step": 8560 + }, + { + "ce_loss_13": 2.4338246136903763, + "ce_loss_26": 1.9260531306266784, + "ce_loss_39": 1.746686053276062, + "ce_loss_52": 1.4506986886262894, + "ce_loss_7": 2.7584114193916323, + "epoch": 0.857, + "grad_norm": 13.58696871299719, + "kl_loss_13": 2040.8, + "kl_loss_26": 958.4, + "kl_loss_39": 587.6, + "kl_loss_7": 2720.8, + "learning_rate": 5.060297685041659e-05, + "loss": 3124.45, + "step": 8570 + }, + { + "ce_loss_13": 2.465153419971466, + "ce_loss_26": 1.9324041992425918, + "ce_loss_39": 1.747180885076523, + "ce_loss_52": 1.4378513038158416, + "ce_loss_7": 2.8079187512397765, + "epoch": 0.858, + "grad_norm": 13.69346881706034, + "kl_loss_13": 2127.2, + "kl_loss_26": 1005.7, + "kl_loss_39": 610.45, + "kl_loss_7": 2842.8, + "learning_rate": 4.99096934695461e-05, + "loss": 3135.55, + "step": 8580 + }, + { + "ce_loss_13": 2.428412067890167, + "ce_loss_26": 1.9264894247055053, + "ce_loss_39": 1.748258227109909, + "ce_loss_52": 1.4425264418125152, + "ce_loss_7": 2.7523947954177856, + "epoch": 0.859, + "grad_norm": 14.416520228353955, + "kl_loss_13": 2024.4, + "kl_loss_26": 961.5, + "kl_loss_39": 593.95, + "kl_loss_7": 2704.0, + "learning_rate": 4.922094249306558e-05, + "loss": 3141.65, + "step": 8590 + }, + { + "ce_loss_13": 2.379640507698059, + "ce_loss_26": 1.8703870117664336, + "ce_loss_39": 1.6925265491008759, + "ce_loss_52": 1.4014200061559676, + "ce_loss_7": 2.7008225679397584, + "epoch": 0.86, + "grad_norm": 14.548164181811538, + "kl_loss_13": 2019.0, + "kl_loss_26": 952.8, + "kl_loss_39": 580.7, + "kl_loss_7": 2704.4, + "learning_rate": 4.853673085668947e-05, + "loss": 3164.35, + "step": 8600 + }, + { + "ce_loss_13": 2.3882300436496733, + "ce_loss_26": 1.8816026329994202, + "ce_loss_39": 1.701096272468567, + "ce_loss_52": 1.4169353902339936, + "ce_loss_7": 2.711241126060486, + "epoch": 0.861, + "grad_norm": 13.997789256280235, + "kl_loss_13": 2014.0, + "kl_loss_26": 941.1, + "kl_loss_39": 561.45, + "kl_loss_7": 2697.6, + "learning_rate": 4.78570654504214e-05, + "loss": 3160.75, + "step": 8610 + }, + { + "ce_loss_13": 2.4367538392543793, + "ce_loss_26": 1.9072220534086228, + "ce_loss_39": 1.724021741747856, + "ce_loss_52": 1.4287808299064637, + "ce_loss_7": 2.7643966376781464, + "epoch": 0.862, + "grad_norm": 13.935763093108008, + "kl_loss_13": 2067.0, + "kl_loss_26": 962.6, + "kl_loss_39": 576.05, + "kl_loss_7": 2765.6, + "learning_rate": 4.7181953118484556e-05, + "loss": 3127.35, + "step": 8620 + }, + { + "ce_loss_13": 2.424758407473564, + "ce_loss_26": 1.9169064074754716, + "ce_loss_39": 1.7346068799495697, + "ce_loss_52": 1.4445551723241805, + "ce_loss_7": 2.7455954015254975, + "epoch": 0.863, + "grad_norm": 14.314671068665351, + "kl_loss_13": 2012.8, + "kl_loss_26": 950.4, + "kl_loss_39": 570.95, + "kl_loss_7": 2679.2, + "learning_rate": 4.651140065925269e-05, + "loss": 3115.55, + "step": 8630 + }, + { + "ce_loss_13": 2.5092544078826906, + "ce_loss_26": 1.9829395413398743, + "ce_loss_39": 1.7923680394887924, + "ce_loss_52": 1.481997686624527, + "ce_loss_7": 2.838895618915558, + "epoch": 0.864, + "grad_norm": 14.264233585666156, + "kl_loss_13": 2110.2, + "kl_loss_26": 1005.9, + "kl_loss_39": 616.9, + "kl_loss_7": 2805.2, + "learning_rate": 4.58454148251814e-05, + "loss": 3142.0, + "step": 8640 + }, + { + "ce_loss_13": 2.43146056830883, + "ce_loss_26": 1.9165301382541657, + "ce_loss_39": 1.7361394971609116, + "ce_loss_52": 1.4455705016851426, + "ce_loss_7": 2.7573634922504424, + "epoch": 0.865, + "grad_norm": 13.3838976309547, + "kl_loss_13": 2018.4, + "kl_loss_26": 941.1, + "kl_loss_39": 567.4, + "kl_loss_7": 2702.0, + "learning_rate": 4.518400232274078e-05, + "loss": 3128.55, + "step": 8650 + }, + { + "ce_loss_13": 2.4056933134794236, + "ce_loss_26": 1.8982498347759247, + "ce_loss_39": 1.7155311942100524, + "ce_loss_52": 1.4283919543027879, + "ce_loss_7": 2.7339151203632355, + "epoch": 0.866, + "grad_norm": 13.776729544594009, + "kl_loss_13": 2029.0, + "kl_loss_26": 958.5, + "kl_loss_39": 575.6, + "kl_loss_7": 2712.0, + "learning_rate": 4.452716981234745e-05, + "loss": 3168.35, + "step": 8660 + }, + { + "ce_loss_13": 2.4304875314235685, + "ce_loss_26": 1.9086535692214965, + "ce_loss_39": 1.731420534849167, + "ce_loss_52": 1.4311698615550994, + "ce_loss_7": 2.76586651802063, + "epoch": 0.867, + "grad_norm": 14.03220486850696, + "kl_loss_13": 2043.6, + "kl_loss_26": 956.8, + "kl_loss_39": 585.05, + "kl_loss_7": 2738.0, + "learning_rate": 4.3874923908297335e-05, + "loss": 3147.75, + "step": 8670 + }, + { + "ce_loss_13": 2.422931173443794, + "ce_loss_26": 1.8919979512691498, + "ce_loss_39": 1.7088686615228652, + "ce_loss_52": 1.4077896371483802, + "ce_loss_7": 2.757069969177246, + "epoch": 0.868, + "grad_norm": 14.116898056864903, + "kl_loss_13": 2080.8, + "kl_loss_26": 974.7, + "kl_loss_39": 600.2, + "kl_loss_7": 2778.4, + "learning_rate": 4.322727117869951e-05, + "loss": 3132.8, + "step": 8680 + }, + { + "ce_loss_13": 2.4079675406217573, + "ce_loss_26": 1.9030864268541337, + "ce_loss_39": 1.7309907704591752, + "ce_loss_52": 1.4429899513721467, + "ce_loss_7": 2.725788599252701, + "epoch": 0.869, + "grad_norm": 13.611192303364593, + "kl_loss_13": 1991.0, + "kl_loss_26": 927.4, + "kl_loss_39": 558.5, + "kl_loss_7": 2668.4, + "learning_rate": 4.2584218145409916e-05, + "loss": 3135.35, + "step": 8690 + }, + { + "ce_loss_13": 2.3869937509298325, + "ce_loss_26": 1.8738444805145265, + "ce_loss_39": 1.6931981056928636, + "ce_loss_52": 1.400150865316391, + "ce_loss_7": 2.709615921974182, + "epoch": 0.87, + "grad_norm": 14.271447400785455, + "kl_loss_13": 2015.8, + "kl_loss_26": 944.7, + "kl_loss_39": 573.1, + "kl_loss_7": 2692.8, + "learning_rate": 4.194577128396521e-05, + "loss": 3114.95, + "step": 8700 + }, + { + "ce_loss_13": 2.465124714374542, + "ce_loss_26": 1.961993396282196, + "ce_loss_39": 1.7869856834411622, + "ce_loss_52": 1.4856192290782928, + "ce_loss_7": 2.785760098695755, + "epoch": 0.871, + "grad_norm": 13.70337275168833, + "kl_loss_13": 2015.8, + "kl_loss_26": 955.1, + "kl_loss_39": 582.9, + "kl_loss_7": 2693.6, + "learning_rate": 4.1311937023518264e-05, + "loss": 3146.8, + "step": 8710 + }, + { + "ce_loss_13": 2.432892268896103, + "ce_loss_26": 1.9152618199586868, + "ce_loss_39": 1.7295002430677413, + "ce_loss_52": 1.4359665989875794, + "ce_loss_7": 2.7681704640388487, + "epoch": 0.872, + "grad_norm": 14.713342569921842, + "kl_loss_13": 2035.8, + "kl_loss_26": 958.4, + "kl_loss_39": 578.45, + "kl_loss_7": 2721.6, + "learning_rate": 4.0682721746773344e-05, + "loss": 3128.35, + "step": 8720 + }, + { + "ce_loss_13": 2.3963799655437468, + "ce_loss_26": 1.8889498293399811, + "ce_loss_39": 1.7117068350315094, + "ce_loss_52": 1.4276205718517303, + "ce_loss_7": 2.7207881271839143, + "epoch": 0.873, + "grad_norm": 14.269783754551202, + "kl_loss_13": 2001.0, + "kl_loss_26": 932.4, + "kl_loss_39": 561.8, + "kl_loss_7": 2688.8, + "learning_rate": 4.0058131789920904e-05, + "loss": 3131.65, + "step": 8730 + }, + { + "ce_loss_13": 2.418975955247879, + "ce_loss_26": 1.8960406243801118, + "ce_loss_39": 1.7133139997720719, + "ce_loss_52": 1.4180105909705163, + "ce_loss_7": 2.7614206850528715, + "epoch": 0.874, + "grad_norm": 14.114592900051933, + "kl_loss_13": 2079.4, + "kl_loss_26": 975.5, + "kl_loss_39": 598.35, + "kl_loss_7": 2790.4, + "learning_rate": 3.9438173442575e-05, + "loss": 3100.7, + "step": 8740 + }, + { + "ce_loss_13": 2.4537671864032746, + "ce_loss_26": 1.946785607933998, + "ce_loss_39": 1.7634260147809981, + "ce_loss_52": 1.4690344750881195, + "ce_loss_7": 2.775768506526947, + "epoch": 0.875, + "grad_norm": 14.35027266854894, + "kl_loss_13": 2027.0, + "kl_loss_26": 955.8, + "kl_loss_39": 572.25, + "kl_loss_7": 2707.2, + "learning_rate": 3.882285294770937e-05, + "loss": 3145.7, + "step": 8750 + }, + { + "ce_loss_13": 2.4146986842155456, + "ce_loss_26": 1.8880977869033813, + "ce_loss_39": 1.7063632160425186, + "ce_loss_52": 1.403985047340393, + "ce_loss_7": 2.741646242141724, + "epoch": 0.876, + "grad_norm": 14.018669111873198, + "kl_loss_13": 2049.4, + "kl_loss_26": 969.5, + "kl_loss_39": 590.2, + "kl_loss_7": 2731.6, + "learning_rate": 3.821217650159453e-05, + "loss": 3139.0, + "step": 8760 + }, + { + "ce_loss_13": 2.3532112538814545, + "ce_loss_26": 1.8646484702825545, + "ce_loss_39": 1.6983740404248238, + "ce_loss_52": 1.4196315869688987, + "ce_loss_7": 2.666028293967247, + "epoch": 0.877, + "grad_norm": 13.69993271737963, + "kl_loss_13": 1919.2, + "kl_loss_26": 890.8, + "kl_loss_39": 542.85, + "kl_loss_7": 2582.0, + "learning_rate": 3.760615025373543e-05, + "loss": 3109.35, + "step": 8770 + }, + { + "ce_loss_13": 2.4570236086845396, + "ce_loss_26": 1.938426810503006, + "ce_loss_39": 1.7564183056354523, + "ce_loss_52": 1.453689630329609, + "ce_loss_7": 2.787124717235565, + "epoch": 0.878, + "grad_norm": 14.764457343001293, + "kl_loss_13": 2048.4, + "kl_loss_26": 962.8, + "kl_loss_39": 585.3, + "kl_loss_7": 2732.8, + "learning_rate": 3.700478030680987e-05, + "loss": 3153.7, + "step": 8780 + }, + { + "ce_loss_13": 2.4401866495609283, + "ce_loss_26": 1.9257715612649917, + "ce_loss_39": 1.7440615922212601, + "ce_loss_52": 1.4473457425832748, + "ce_loss_7": 2.76454553604126, + "epoch": 0.879, + "grad_norm": 13.624797595400036, + "kl_loss_13": 2038.4, + "kl_loss_26": 958.5, + "kl_loss_39": 579.85, + "kl_loss_7": 2717.6, + "learning_rate": 3.6408072716606344e-05, + "loss": 3158.95, + "step": 8790 + }, + { + "ce_loss_13": 2.3960734605789185, + "ce_loss_26": 1.8874068677425384, + "ce_loss_39": 1.7073772728443146, + "ce_loss_52": 1.4144951313734055, + "ce_loss_7": 2.7204393565654756, + "epoch": 0.88, + "grad_norm": 13.565627756275363, + "kl_loss_13": 2008.0, + "kl_loss_26": 944.3, + "kl_loss_39": 574.6, + "kl_loss_7": 2694.0, + "learning_rate": 3.5816033491963716e-05, + "loss": 3127.45, + "step": 8800 + }, + { + "ce_loss_13": 2.426618826389313, + "ce_loss_26": 1.921605721116066, + "ce_loss_39": 1.7422660619020462, + "ce_loss_52": 1.4409180462360383, + "ce_loss_7": 2.7458843529224395, + "epoch": 0.881, + "grad_norm": 14.680252482648882, + "kl_loss_13": 2012.6, + "kl_loss_26": 955.2, + "kl_loss_39": 587.45, + "kl_loss_7": 2678.0, + "learning_rate": 3.522866859471047e-05, + "loss": 3106.075, + "step": 8810 + }, + { + "ce_loss_13": 2.4589924097061155, + "ce_loss_26": 1.9394809186458588, + "ce_loss_39": 1.7563898861408234, + "ce_loss_52": 1.459865990281105, + "ce_loss_7": 2.780824285745621, + "epoch": 0.882, + "grad_norm": 13.410206661523853, + "kl_loss_13": 2065.4, + "kl_loss_26": 975.8, + "kl_loss_39": 594.35, + "kl_loss_7": 2745.6, + "learning_rate": 3.46459839396045e-05, + "loss": 3162.5, + "step": 8820 + }, + { + "ce_loss_13": 2.444491392374039, + "ce_loss_26": 1.919321459531784, + "ce_loss_39": 1.7317459166049958, + "ce_loss_52": 1.4241959005594254, + "ce_loss_7": 2.7687501907348633, + "epoch": 0.883, + "grad_norm": 13.604154476294898, + "kl_loss_13": 2078.6, + "kl_loss_26": 980.1, + "kl_loss_39": 596.05, + "kl_loss_7": 2764.8, + "learning_rate": 3.406798539427386e-05, + "loss": 3137.15, + "step": 8830 + }, + { + "ce_loss_13": 2.4456652402877808, + "ce_loss_26": 1.9403656631708146, + "ce_loss_39": 1.7609369516372682, + "ce_loss_52": 1.4744407176971435, + "ce_loss_7": 2.762556844949722, + "epoch": 0.884, + "grad_norm": 14.0628693009416, + "kl_loss_13": 1994.0, + "kl_loss_26": 932.0, + "kl_loss_39": 557.15, + "kl_loss_7": 2667.2, + "learning_rate": 3.349467877915746e-05, + "loss": 3099.2, + "step": 8840 + }, + { + "ce_loss_13": 2.4576884746551513, + "ce_loss_26": 1.9445500463247298, + "ce_loss_39": 1.764642345905304, + "ce_loss_52": 1.4638898521661758, + "ce_loss_7": 2.7794412195682527, + "epoch": 0.885, + "grad_norm": 13.731348532638435, + "kl_loss_13": 2048.2, + "kl_loss_26": 971.4, + "kl_loss_39": 596.3, + "kl_loss_7": 2722.8, + "learning_rate": 3.292606986744667e-05, + "loss": 3152.675, + "step": 8850 + }, + { + "ce_loss_13": 2.4683742761611938, + "ce_loss_26": 1.9468118786811828, + "ce_loss_39": 1.7659433901309967, + "ce_loss_52": 1.4723648518323897, + "ce_loss_7": 2.794690328836441, + "epoch": 0.886, + "grad_norm": 14.768179589766811, + "kl_loss_13": 2053.6, + "kl_loss_26": 960.6, + "kl_loss_39": 581.1, + "kl_loss_7": 2740.0, + "learning_rate": 3.23621643850267e-05, + "loss": 3135.9, + "step": 8860 + }, + { + "ce_loss_13": 2.3716426849365235, + "ce_loss_26": 1.8652270317077637, + "ce_loss_39": 1.6864412546157836, + "ce_loss_52": 1.4016476958990096, + "ce_loss_7": 2.6954882085323333, + "epoch": 0.887, + "grad_norm": 13.861337667665934, + "kl_loss_13": 1995.8, + "kl_loss_26": 932.7, + "kl_loss_39": 558.6, + "kl_loss_7": 2668.8, + "learning_rate": 3.180296801041971e-05, + "loss": 3116.05, + "step": 8870 + }, + { + "ce_loss_13": 2.406647819280624, + "ce_loss_26": 1.8954071879386902, + "ce_loss_39": 1.7191426277160644, + "ce_loss_52": 1.4318138241767884, + "ce_loss_7": 2.7277746230363844, + "epoch": 0.888, + "grad_norm": 14.18696677740657, + "kl_loss_13": 1999.0, + "kl_loss_26": 937.5, + "kl_loss_39": 568.1, + "kl_loss_7": 2671.2, + "learning_rate": 3.124848637472688e-05, + "loss": 3120.85, + "step": 8880 + }, + { + "ce_loss_13": 2.4183617502450945, + "ce_loss_26": 1.9087469071149825, + "ce_loss_39": 1.731605476140976, + "ce_loss_52": 1.4397760301828384, + "ce_loss_7": 2.7426804542541503, + "epoch": 0.889, + "grad_norm": 14.005614784571057, + "kl_loss_13": 2023.4, + "kl_loss_26": 938.6, + "kl_loss_39": 572.0, + "kl_loss_7": 2704.0, + "learning_rate": 3.069872506157212e-05, + "loss": 3140.35, + "step": 8890 + }, + { + "ce_loss_13": 2.3538796246051787, + "ce_loss_26": 1.8491210967302323, + "ce_loss_39": 1.6793664902448655, + "ce_loss_52": 1.4003751114010812, + "ce_loss_7": 2.6818648397922518, + "epoch": 0.89, + "grad_norm": 13.507964025411896, + "kl_loss_13": 1969.8, + "kl_loss_26": 909.9, + "kl_loss_39": 551.1, + "kl_loss_7": 2655.2, + "learning_rate": 3.0153689607045842e-05, + "loss": 3115.9, + "step": 8900 + }, + { + "ce_loss_13": 2.3964832425117493, + "ce_loss_26": 1.8831844747066497, + "ce_loss_39": 1.7046507805585862, + "ce_loss_52": 1.4187346428632737, + "ce_loss_7": 2.72187722325325, + "epoch": 0.891, + "grad_norm": 14.258655617102612, + "kl_loss_13": 2013.6, + "kl_loss_26": 929.9, + "kl_loss_39": 559.3, + "kl_loss_7": 2705.2, + "learning_rate": 2.9613385499648926e-05, + "loss": 3133.85, + "step": 8910 + }, + { + "ce_loss_13": 2.3858442664146424, + "ce_loss_26": 1.8839757442474365, + "ce_loss_39": 1.709816351532936, + "ce_loss_52": 1.430666272342205, + "ce_loss_7": 2.6985366463661196, + "epoch": 0.892, + "grad_norm": 14.05296041871021, + "kl_loss_13": 1977.6, + "kl_loss_26": 918.7, + "kl_loss_39": 552.4, + "kl_loss_7": 2640.0, + "learning_rate": 2.9077818180237692e-05, + "loss": 3161.85, + "step": 8920 + }, + { + "ce_loss_13": 2.4120604634284972, + "ce_loss_26": 1.9131643176078796, + "ce_loss_39": 1.741806897521019, + "ce_loss_52": 1.4555314972996711, + "ce_loss_7": 2.7308483004570006, + "epoch": 0.893, + "grad_norm": 14.216852449574784, + "kl_loss_13": 1989.0, + "kl_loss_26": 931.9, + "kl_loss_39": 565.55, + "kl_loss_7": 2658.8, + "learning_rate": 2.8546993041969172e-05, + "loss": 3113.0, + "step": 8930 + }, + { + "ce_loss_13": 2.417782390117645, + "ce_loss_26": 1.9075176060199737, + "ce_loss_39": 1.7230383425951004, + "ce_loss_52": 1.428551298379898, + "ce_loss_7": 2.737876206636429, + "epoch": 0.894, + "grad_norm": 13.81624953017635, + "kl_loss_13": 2024.2, + "kl_loss_26": 956.0, + "kl_loss_39": 579.25, + "kl_loss_7": 2694.8, + "learning_rate": 2.802091543024671e-05, + "loss": 3118.3, + "step": 8940 + }, + { + "ce_loss_13": 2.4281566560268404, + "ce_loss_26": 1.9209083169698715, + "ce_loss_39": 1.7325717121362687, + "ce_loss_52": 1.4276321291923524, + "ce_loss_7": 2.7494910418987275, + "epoch": 0.895, + "grad_norm": 14.667622613471202, + "kl_loss_13": 2068.0, + "kl_loss_26": 993.0, + "kl_loss_39": 599.95, + "kl_loss_7": 2738.4, + "learning_rate": 2.7499590642665774e-05, + "loss": 3152.3, + "step": 8950 + }, + { + "ce_loss_13": 2.4123401612043383, + "ce_loss_26": 1.9057573080062866, + "ce_loss_39": 1.725106343626976, + "ce_loss_52": 1.43211932182312, + "ce_loss_7": 2.7356060326099394, + "epoch": 0.896, + "grad_norm": 13.84770444101258, + "kl_loss_13": 2014.8, + "kl_loss_26": 952.6, + "kl_loss_39": 575.75, + "kl_loss_7": 2696.2, + "learning_rate": 2.6983023928961405e-05, + "loss": 3131.5, + "step": 8960 + }, + { + "ce_loss_13": 2.3709884881973267, + "ce_loss_26": 1.868075394630432, + "ce_loss_39": 1.6942670613527298, + "ce_loss_52": 1.3973538905382157, + "ce_loss_7": 2.69813577234745, + "epoch": 0.897, + "grad_norm": 14.584597117704368, + "kl_loss_13": 2004.8, + "kl_loss_26": 940.7, + "kl_loss_39": 576.3, + "kl_loss_7": 2687.2, + "learning_rate": 2.6471220490954628e-05, + "loss": 3144.15, + "step": 8970 + }, + { + "ce_loss_13": 2.421136862039566, + "ce_loss_26": 1.9213113605976104, + "ce_loss_39": 1.7483066588640213, + "ce_loss_52": 1.4667307168245316, + "ce_loss_7": 2.7391393184661865, + "epoch": 0.898, + "grad_norm": 14.053042951615785, + "kl_loss_13": 1968.0, + "kl_loss_26": 914.9, + "kl_loss_39": 554.05, + "kl_loss_7": 2636.4, + "learning_rate": 2.596418548250029e-05, + "loss": 3077.8, + "step": 8980 + }, + { + "ce_loss_13": 2.383489468693733, + "ce_loss_26": 1.8838744014501572, + "ce_loss_39": 1.7122972816228867, + "ce_loss_52": 1.4258252471685409, + "ce_loss_7": 2.7002600908279417, + "epoch": 0.899, + "grad_norm": 13.786718301064743, + "kl_loss_13": 1996.2, + "kl_loss_26": 931.7, + "kl_loss_39": 571.05, + "kl_loss_7": 2666.4, + "learning_rate": 2.5461924009435368e-05, + "loss": 3075.25, + "step": 8990 + }, + { + "ce_loss_13": 2.3905730485916137, + "ce_loss_26": 1.8808085292577743, + "ce_loss_39": 1.7040841788053513, + "ce_loss_52": 1.4255578130483628, + "ce_loss_7": 2.7172752916812897, + "epoch": 0.9, + "grad_norm": 14.142930179347214, + "kl_loss_13": 1983.0, + "kl_loss_26": 912.3, + "kl_loss_39": 546.25, + "kl_loss_7": 2670.0, + "learning_rate": 2.4964441129527336e-05, + "loss": 3116.85, + "step": 9000 + }, + { + "ce_loss_13": 2.43511378467083, + "ce_loss_26": 1.926012173295021, + "ce_loss_39": 1.7415268182754517, + "ce_loss_52": 1.4408889025449754, + "ce_loss_7": 2.761294722557068, + "epoch": 0.901, + "grad_norm": 13.928144692729873, + "kl_loss_13": 2050.6, + "kl_loss_26": 980.3, + "kl_loss_39": 592.65, + "kl_loss_7": 2732.0, + "learning_rate": 2.4471741852423235e-05, + "loss": 3132.7, + "step": 9010 + }, + { + "ce_loss_13": 2.3582999795675277, + "ce_loss_26": 1.8586651980876923, + "ce_loss_39": 1.6817226380109787, + "ce_loss_52": 1.3925694867968559, + "ce_loss_7": 2.6693267047405245, + "epoch": 0.902, + "grad_norm": 14.116392246566738, + "kl_loss_13": 1978.6, + "kl_loss_26": 929.3, + "kl_loss_39": 571.5, + "kl_loss_7": 2638.8, + "learning_rate": 2.3983831139599287e-05, + "loss": 3114.75, + "step": 9020 + }, + { + "ce_loss_13": 2.418634516000748, + "ce_loss_26": 1.9078166902065277, + "ce_loss_39": 1.7249888181686401, + "ce_loss_52": 1.4335400015115738, + "ce_loss_7": 2.7412546992301943, + "epoch": 0.903, + "grad_norm": 13.637306812880459, + "kl_loss_13": 2036.0, + "kl_loss_26": 950.4, + "kl_loss_39": 575.5, + "kl_loss_7": 2710.0, + "learning_rate": 2.3500713904311022e-05, + "loss": 3133.35, + "step": 9030 + }, + { + "ce_loss_13": 2.3931309431791306, + "ce_loss_26": 1.877245968580246, + "ce_loss_39": 1.7018247723579407, + "ce_loss_52": 1.4155418664216994, + "ce_loss_7": 2.7214196979999543, + "epoch": 0.904, + "grad_norm": 14.785773850168622, + "kl_loss_13": 2014.2, + "kl_loss_26": 931.6, + "kl_loss_39": 566.7, + "kl_loss_7": 2697.2, + "learning_rate": 2.3022395011543685e-05, + "loss": 3107.4, + "step": 9040 + }, + { + "ce_loss_13": 2.4141552269458773, + "ce_loss_26": 1.9003157913684845, + "ce_loss_39": 1.7119427561759948, + "ce_loss_52": 1.4218156844377519, + "ce_loss_7": 2.738201731443405, + "epoch": 0.905, + "grad_norm": 15.004045135842484, + "kl_loss_13": 2046.4, + "kl_loss_26": 956.8, + "kl_loss_39": 574.8, + "kl_loss_7": 2728.0, + "learning_rate": 2.2548879277963063e-05, + "loss": 3131.35, + "step": 9050 + }, + { + "ce_loss_13": 2.4368073105812074, + "ce_loss_26": 1.9358865648508072, + "ce_loss_39": 1.758334356546402, + "ce_loss_52": 1.4668044418096542, + "ce_loss_7": 2.7515600681304933, + "epoch": 0.906, + "grad_norm": 14.409368784921233, + "kl_loss_13": 1999.0, + "kl_loss_26": 944.8, + "kl_loss_39": 573.2, + "kl_loss_7": 2666.4, + "learning_rate": 2.208017147186736e-05, + "loss": 3129.15, + "step": 9060 + }, + { + "ce_loss_13": 2.4492080837488173, + "ce_loss_26": 1.9512592017650605, + "ce_loss_39": 1.7678290545940398, + "ce_loss_52": 1.4713279128074646, + "ce_loss_7": 2.772859865427017, + "epoch": 0.907, + "grad_norm": 14.409868370647818, + "kl_loss_13": 2027.6, + "kl_loss_26": 962.9, + "kl_loss_39": 580.75, + "kl_loss_7": 2708.4, + "learning_rate": 2.1616276313139227e-05, + "loss": 3136.45, + "step": 9070 + }, + { + "ce_loss_13": 2.3629566222429275, + "ce_loss_26": 1.866456887125969, + "ce_loss_39": 1.692075565457344, + "ce_loss_52": 1.4036286368966102, + "ce_loss_7": 2.681915229558945, + "epoch": 0.908, + "grad_norm": 13.121489500516578, + "kl_loss_13": 1981.6, + "kl_loss_26": 934.0, + "kl_loss_39": 568.15, + "kl_loss_7": 2653.6, + "learning_rate": 2.1157198473197415e-05, + "loss": 3145.35, + "step": 9080 + }, + { + "ce_loss_13": 2.4373748511075974, + "ce_loss_26": 1.9266161501407624, + "ce_loss_39": 1.7425854057073593, + "ce_loss_52": 1.4446089684963226, + "ce_loss_7": 2.7682818710803985, + "epoch": 0.909, + "grad_norm": 14.230511024408004, + "kl_loss_13": 2056.6, + "kl_loss_26": 973.2, + "kl_loss_39": 588.0, + "kl_loss_7": 2750.4, + "learning_rate": 2.0702942574950812e-05, + "loss": 3127.5, + "step": 9090 + }, + { + "ce_loss_13": 2.4287337332963945, + "ce_loss_26": 1.9172603338956833, + "ce_loss_39": 1.7392981857061387, + "ce_loss_52": 1.4425375372171403, + "ce_loss_7": 2.7510289788246154, + "epoch": 0.91, + "grad_norm": 14.057406508919666, + "kl_loss_13": 2046.6, + "kl_loss_26": 964.4, + "kl_loss_39": 593.1, + "kl_loss_7": 2722.4, + "learning_rate": 2.025351319275137e-05, + "loss": 3121.6, + "step": 9100 + }, + { + "ce_loss_13": 2.4320779502391816, + "ce_loss_26": 1.9104785054922104, + "ce_loss_39": 1.7267427280545236, + "ce_loss_52": 1.437354525923729, + "ce_loss_7": 2.7588888108730316, + "epoch": 0.911, + "grad_norm": 14.001627192514695, + "kl_loss_13": 2041.6, + "kl_loss_26": 960.0, + "kl_loss_39": 579.65, + "kl_loss_7": 2727.2, + "learning_rate": 1.9808914852347816e-05, + "loss": 3132.0, + "step": 9110 + }, + { + "ce_loss_13": 2.4577192962169647, + "ce_loss_26": 1.9406163454055787, + "ce_loss_39": 1.7553843706846237, + "ce_loss_52": 1.4540565699338912, + "ce_loss_7": 2.7937385022640226, + "epoch": 0.912, + "grad_norm": 14.03191291932469, + "kl_loss_13": 2055.4, + "kl_loss_26": 970.4, + "kl_loss_39": 590.75, + "kl_loss_7": 2749.6, + "learning_rate": 1.9369152030840554e-05, + "loss": 3137.8, + "step": 9120 + }, + { + "ce_loss_13": 2.37432479262352, + "ce_loss_26": 1.874118760228157, + "ce_loss_39": 1.6963829159736634, + "ce_loss_52": 1.4185152500867844, + "ce_loss_7": 2.692077511548996, + "epoch": 0.913, + "grad_norm": 14.634301127161054, + "kl_loss_13": 1966.6, + "kl_loss_26": 913.3, + "kl_loss_39": 546.15, + "kl_loss_7": 2640.0, + "learning_rate": 1.893422915663645e-05, + "loss": 3130.35, + "step": 9130 + }, + { + "ce_loss_13": 2.485106924176216, + "ce_loss_26": 1.9777852237224578, + "ce_loss_39": 1.7995707392692566, + "ce_loss_52": 1.4930359899997712, + "ce_loss_7": 2.800886517763138, + "epoch": 0.914, + "grad_norm": 13.987291564294976, + "kl_loss_13": 2050.4, + "kl_loss_26": 985.4, + "kl_loss_39": 605.6, + "kl_loss_7": 2732.4, + "learning_rate": 1.850415060940386e-05, + "loss": 3103.7, + "step": 9140 + }, + { + "ce_loss_13": 2.4284686923027037, + "ce_loss_26": 1.928224155306816, + "ce_loss_39": 1.7489481002092362, + "ce_loss_52": 1.4568757116794586, + "ce_loss_7": 2.7469853341579435, + "epoch": 0.915, + "grad_norm": 14.256565267478091, + "kl_loss_13": 2014.8, + "kl_loss_26": 950.5, + "kl_loss_39": 576.35, + "kl_loss_7": 2683.2, + "learning_rate": 1.8078920720028978e-05, + "loss": 3089.85, + "step": 9150 + }, + { + "ce_loss_13": 2.397159770131111, + "ce_loss_26": 1.9007071822881698, + "ce_loss_39": 1.7246300727128983, + "ce_loss_52": 1.4446155533194542, + "ce_loss_7": 2.7181631565093993, + "epoch": 0.916, + "grad_norm": 14.893106945439357, + "kl_loss_13": 1959.0, + "kl_loss_26": 905.8, + "kl_loss_39": 547.35, + "kl_loss_7": 2633.6, + "learning_rate": 1.765854377057219e-05, + "loss": 3113.25, + "step": 9160 + }, + { + "ce_loss_13": 2.391612654924393, + "ce_loss_26": 1.8863595336675645, + "ce_loss_39": 1.7089181810617446, + "ce_loss_52": 1.4133323535323143, + "ce_loss_7": 2.7117814838886263, + "epoch": 0.917, + "grad_norm": 13.727089751993333, + "kl_loss_13": 2026.6, + "kl_loss_26": 952.4, + "kl_loss_39": 579.55, + "kl_loss_7": 2699.2, + "learning_rate": 1.724302399422456e-05, + "loss": 3114.9, + "step": 9170 + }, + { + "ce_loss_13": 2.3898652464151384, + "ce_loss_26": 1.8965242326259613, + "ce_loss_39": 1.7190593391656876, + "ce_loss_52": 1.4282272264361382, + "ce_loss_7": 2.70514101088047, + "epoch": 0.918, + "grad_norm": 14.466760453933187, + "kl_loss_13": 1964.6, + "kl_loss_26": 920.2, + "kl_loss_39": 557.4, + "kl_loss_7": 2637.2, + "learning_rate": 1.683236557526574e-05, + "loss": 3117.7, + "step": 9180 + }, + { + "ce_loss_13": 2.363905116915703, + "ce_loss_26": 1.8732207268476486, + "ce_loss_39": 1.6985140055418015, + "ce_loss_52": 1.4114336684346198, + "ce_loss_7": 2.6808120787143705, + "epoch": 0.919, + "grad_norm": 13.740003598916001, + "kl_loss_13": 1964.6, + "kl_loss_26": 921.6, + "kl_loss_39": 557.8, + "kl_loss_7": 2632.0, + "learning_rate": 1.6426572649021475e-05, + "loss": 3121.6, + "step": 9190 + }, + { + "ce_loss_13": 2.4235911548137663, + "ce_loss_26": 1.911160832643509, + "ce_loss_39": 1.7327239394187928, + "ce_loss_52": 1.4489689737558364, + "ce_loss_7": 2.740164947509766, + "epoch": 0.92, + "grad_norm": 14.26925795970607, + "kl_loss_13": 2008.2, + "kl_loss_26": 933.2, + "kl_loss_39": 559.8, + "kl_loss_7": 2678.0, + "learning_rate": 1.6025649301821876e-05, + "loss": 3113.6, + "step": 9200 + }, + { + "ce_loss_13": 2.4683689922094345, + "ce_loss_26": 1.9488540649414063, + "ce_loss_39": 1.7631026744842528, + "ce_loss_52": 1.4620952308177948, + "ce_loss_7": 2.794441765546799, + "epoch": 0.921, + "grad_norm": 14.076823047271347, + "kl_loss_13": 2051.4, + "kl_loss_26": 970.8, + "kl_loss_39": 588.7, + "kl_loss_7": 2729.2, + "learning_rate": 1.5629599570960716e-05, + "loss": 3104.8, + "step": 9210 + }, + { + "ce_loss_13": 2.3585807204246523, + "ce_loss_26": 1.8531203657388686, + "ce_loss_39": 1.6820847302675248, + "ce_loss_52": 1.4014029562473298, + "ce_loss_7": 2.6779536455869675, + "epoch": 0.922, + "grad_norm": 13.947276602522905, + "kl_loss_13": 1987.2, + "kl_loss_26": 913.9, + "kl_loss_39": 557.25, + "kl_loss_7": 2664.4, + "learning_rate": 1.5238427444654367e-05, + "loss": 3096.2, + "step": 9220 + }, + { + "ce_loss_13": 2.3690007477998734, + "ce_loss_26": 1.8584237039089202, + "ce_loss_39": 1.6857000291347504, + "ce_loss_52": 1.3957198202610015, + "ce_loss_7": 2.697771596908569, + "epoch": 0.923, + "grad_norm": 13.90974821940606, + "kl_loss_13": 2016.4, + "kl_loss_26": 937.5, + "kl_loss_39": 574.65, + "kl_loss_7": 2710.0, + "learning_rate": 1.4852136862001764e-05, + "loss": 3120.95, + "step": 9230 + }, + { + "ce_loss_13": 2.386936154961586, + "ce_loss_26": 1.8795882225036622, + "ce_loss_39": 1.7048650175333022, + "ce_loss_52": 1.4252549767494203, + "ce_loss_7": 2.7094414860010145, + "epoch": 0.924, + "grad_norm": 14.075158349443141, + "kl_loss_13": 1994.4, + "kl_loss_26": 926.6, + "kl_loss_39": 558.55, + "kl_loss_7": 2674.0, + "learning_rate": 1.4470731712944884e-05, + "loss": 3095.9, + "step": 9240 + }, + { + "ce_loss_13": 2.480703926086426, + "ce_loss_26": 1.9693680822849273, + "ce_loss_39": 1.7812021166086196, + "ce_loss_52": 1.4721150636672973, + "ce_loss_7": 2.808130669593811, + "epoch": 0.925, + "grad_norm": 13.823292097408816, + "kl_loss_13": 2057.0, + "kl_loss_26": 989.9, + "kl_loss_39": 602.9, + "kl_loss_7": 2740.4, + "learning_rate": 1.4094215838229174e-05, + "loss": 3114.1, + "step": 9250 + }, + { + "ce_loss_13": 2.415296331048012, + "ce_loss_26": 1.9087094902992248, + "ce_loss_39": 1.7277994453907013, + "ce_loss_52": 1.4381081372499467, + "ce_loss_7": 2.738765448331833, + "epoch": 0.926, + "grad_norm": 14.170973390394844, + "kl_loss_13": 2029.4, + "kl_loss_26": 949.8, + "kl_loss_39": 570.55, + "kl_loss_7": 2714.0, + "learning_rate": 1.372259302936546e-05, + "loss": 3105.95, + "step": 9260 + }, + { + "ce_loss_13": 2.348677235841751, + "ce_loss_26": 1.8576316490769387, + "ce_loss_39": 1.6828459605574608, + "ce_loss_52": 1.3921316027641297, + "ce_loss_7": 2.6635967582464217, + "epoch": 0.927, + "grad_norm": 14.119512693749174, + "kl_loss_13": 1987.8, + "kl_loss_26": 934.2, + "kl_loss_39": 566.95, + "kl_loss_7": 2660.0, + "learning_rate": 1.3355867028591206e-05, + "loss": 3097.55, + "step": 9270 + }, + { + "ce_loss_13": 2.3999607056379317, + "ce_loss_26": 1.8902852058410644, + "ce_loss_39": 1.7127097964286804, + "ce_loss_52": 1.4248219341039658, + "ce_loss_7": 2.7230198085308075, + "epoch": 0.928, + "grad_norm": 14.447160188213138, + "kl_loss_13": 1992.2, + "kl_loss_26": 927.2, + "kl_loss_39": 561.6, + "kl_loss_7": 2679.2, + "learning_rate": 1.2994041528833267e-05, + "loss": 3090.65, + "step": 9280 + }, + { + "ce_loss_13": 2.502937263250351, + "ce_loss_26": 1.9786556929349899, + "ce_loss_39": 1.7848447173833848, + "ce_loss_52": 1.4703152477741241, + "ce_loss_7": 2.837230235338211, + "epoch": 0.929, + "grad_norm": 13.618067390259025, + "kl_loss_13": 2119.0, + "kl_loss_26": 1012.1, + "kl_loss_39": 614.9, + "kl_loss_7": 2810.8, + "learning_rate": 1.2637120173670358e-05, + "loss": 3139.05, + "step": 9290 + }, + { + "ce_loss_13": 2.4561884820461275, + "ce_loss_26": 1.9343136429786683, + "ce_loss_39": 1.747347640991211, + "ce_loss_52": 1.4338387340307235, + "ce_loss_7": 2.7828237235546114, + "epoch": 0.93, + "grad_norm": 14.04284428447472, + "kl_loss_13": 2098.8, + "kl_loss_26": 1005.3, + "kl_loss_39": 621.35, + "kl_loss_7": 2786.4, + "learning_rate": 1.2285106557296478e-05, + "loss": 3143.8, + "step": 9300 + }, + { + "ce_loss_13": 2.3713702976703646, + "ce_loss_26": 1.868654829263687, + "ce_loss_39": 1.6906698912382125, + "ce_loss_52": 1.409242296218872, + "ce_loss_7": 2.7044317960739135, + "epoch": 0.931, + "grad_norm": 14.323129542469303, + "kl_loss_13": 2006.0, + "kl_loss_26": 926.0, + "kl_loss_39": 564.0, + "kl_loss_7": 2676.4, + "learning_rate": 1.1938004224484989e-05, + "loss": 3116.75, + "step": 9310 + }, + { + "ce_loss_13": 2.428625673055649, + "ce_loss_26": 1.9227415055036545, + "ce_loss_39": 1.7408353060483932, + "ce_loss_52": 1.4440293073654176, + "ce_loss_7": 2.7480487704277037, + "epoch": 0.932, + "grad_norm": 13.286547502225938, + "kl_loss_13": 2016.8, + "kl_loss_26": 948.6, + "kl_loss_39": 573.45, + "kl_loss_7": 2695.6, + "learning_rate": 1.1595816670552429e-05, + "loss": 3098.9, + "step": 9320 + }, + { + "ce_loss_13": 2.3783950984477995, + "ce_loss_26": 1.871030893921852, + "ce_loss_39": 1.6935174107551574, + "ce_loss_52": 1.4064364448189735, + "ce_loss_7": 2.709024131298065, + "epoch": 0.933, + "grad_norm": 13.906014402683894, + "kl_loss_13": 2025.8, + "kl_loss_26": 943.2, + "kl_loss_39": 573.45, + "kl_loss_7": 2710.0, + "learning_rate": 1.1258547341323699e-05, + "loss": 3112.75, + "step": 9330 + }, + { + "ce_loss_13": 2.418292981386185, + "ce_loss_26": 1.9166706264019013, + "ce_loss_39": 1.7358173072338103, + "ce_loss_52": 1.4464493066072464, + "ce_loss_7": 2.7368280410766603, + "epoch": 0.934, + "grad_norm": 13.814674823123692, + "kl_loss_13": 2007.6, + "kl_loss_26": 936.8, + "kl_loss_39": 570.05, + "kl_loss_7": 2680.4, + "learning_rate": 1.0926199633097156e-05, + "loss": 3089.9, + "step": 9340 + }, + { + "ce_loss_13": 2.421270787715912, + "ce_loss_26": 1.9069751173257827, + "ce_loss_39": 1.7246074616909026, + "ce_loss_52": 1.4261210292577744, + "ce_loss_7": 2.7438779413700103, + "epoch": 0.935, + "grad_norm": 14.043137766458962, + "kl_loss_13": 2039.4, + "kl_loss_26": 967.3, + "kl_loss_39": 585.6, + "kl_loss_7": 2722.8, + "learning_rate": 1.0598776892610684e-05, + "loss": 3103.7, + "step": 9350 + }, + { + "ce_loss_13": 2.4761788189411162, + "ce_loss_26": 1.9670240104198455, + "ce_loss_39": 1.7913133591413497, + "ce_loss_52": 1.5009067565202714, + "ce_loss_7": 2.7962326526641847, + "epoch": 0.936, + "grad_norm": 13.716488729093573, + "kl_loss_13": 2008.0, + "kl_loss_26": 945.5, + "kl_loss_39": 573.65, + "kl_loss_7": 2685.6, + "learning_rate": 1.0276282417007399e-05, + "loss": 3106.15, + "step": 9360 + }, + { + "ce_loss_13": 2.418415975570679, + "ce_loss_26": 1.916797822713852, + "ce_loss_39": 1.741264235973358, + "ce_loss_52": 1.452787458896637, + "ce_loss_7": 2.744573098421097, + "epoch": 0.937, + "grad_norm": 13.43858517345926, + "kl_loss_13": 1999.2, + "kl_loss_26": 936.2, + "kl_loss_39": 569.25, + "kl_loss_7": 2682.0, + "learning_rate": 9.958719453803277e-06, + "loss": 3109.9, + "step": 9370 + }, + { + "ce_loss_13": 2.3895679712295532, + "ce_loss_26": 1.8777837812900544, + "ce_loss_39": 1.6948266059160233, + "ce_loss_52": 1.4046757638454437, + "ce_loss_7": 2.7213546216487883, + "epoch": 0.938, + "grad_norm": 14.159970740647127, + "kl_loss_13": 2015.8, + "kl_loss_26": 945.0, + "kl_loss_39": 569.8, + "kl_loss_7": 2706.4, + "learning_rate": 9.646091200853802e-06, + "loss": 3110.975, + "step": 9380 + }, + { + "ce_loss_13": 2.389327567815781, + "ce_loss_26": 1.8952437072992325, + "ce_loss_39": 1.7221183687448502, + "ce_loss_52": 1.4358048617839814, + "ce_loss_7": 2.7049793720245363, + "epoch": 0.939, + "grad_norm": 13.344604295880838, + "kl_loss_13": 1964.8, + "kl_loss_26": 921.8, + "kl_loss_39": 560.1, + "kl_loss_7": 2638.0, + "learning_rate": 9.338400806321978e-06, + "loss": 3087.6, + "step": 9390 + }, + { + "ce_loss_13": 2.4069793194532396, + "ce_loss_26": 1.8967778533697128, + "ce_loss_39": 1.7197368562221527, + "ce_loss_52": 1.436858707666397, + "ce_loss_7": 2.7245797514915466, + "epoch": 0.94, + "grad_norm": 13.71832049440062, + "kl_loss_13": 2002.0, + "kl_loss_26": 937.3, + "kl_loss_39": 566.65, + "kl_loss_7": 2677.2, + "learning_rate": 9.035651368646646e-06, + "loss": 3124.5, + "step": 9400 + }, + { + "ce_loss_13": 2.3785787016153335, + "ce_loss_26": 1.8779745906591416, + "ce_loss_39": 1.7003175497055054, + "ce_loss_52": 1.4152926355600357, + "ce_loss_7": 2.6983864098787307, + "epoch": 0.941, + "grad_norm": 14.37750392989465, + "kl_loss_13": 1983.0, + "kl_loss_26": 932.4, + "kl_loss_39": 563.55, + "kl_loss_7": 2649.2, + "learning_rate": 8.737845936511335e-06, + "loss": 3126.15, + "step": 9410 + }, + { + "ce_loss_13": 2.4120055079460143, + "ce_loss_26": 1.898421436548233, + "ce_loss_39": 1.7264380306005478, + "ce_loss_52": 1.4368474900722503, + "ce_loss_7": 2.737997555732727, + "epoch": 0.942, + "grad_norm": 13.918132865929852, + "kl_loss_13": 2037.6, + "kl_loss_26": 952.9, + "kl_loss_39": 578.25, + "kl_loss_7": 2716.0, + "learning_rate": 8.444987508813451e-06, + "loss": 3086.75, + "step": 9420 + }, + { + "ce_loss_13": 2.4210041254758834, + "ce_loss_26": 1.9090048849582673, + "ce_loss_39": 1.732128456234932, + "ce_loss_52": 1.4372442662715912, + "ce_loss_7": 2.743331879377365, + "epoch": 0.943, + "grad_norm": 13.763467997379733, + "kl_loss_13": 2025.8, + "kl_loss_26": 953.0, + "kl_loss_39": 582.75, + "kl_loss_7": 2709.6, + "learning_rate": 8.157079034633974e-06, + "loss": 3102.0, + "step": 9430 + }, + { + "ce_loss_13": 2.390827241539955, + "ce_loss_26": 1.8993241131305694, + "ce_loss_39": 1.720950961112976, + "ce_loss_52": 1.4380876436829566, + "ce_loss_7": 2.708938491344452, + "epoch": 0.944, + "grad_norm": 13.65095927372199, + "kl_loss_13": 1961.4, + "kl_loss_26": 921.2, + "kl_loss_39": 551.1, + "kl_loss_7": 2626.8, + "learning_rate": 7.874123413208145e-06, + "loss": 3097.4, + "step": 9440 + }, + { + "ce_loss_13": 2.377914309501648, + "ce_loss_26": 1.8749269813299179, + "ce_loss_39": 1.699908110499382, + "ce_loss_52": 1.4185950323939323, + "ce_loss_7": 2.700740724802017, + "epoch": 0.945, + "grad_norm": 13.057374043926089, + "kl_loss_13": 2000.2, + "kl_loss_26": 931.2, + "kl_loss_39": 558.45, + "kl_loss_7": 2681.6, + "learning_rate": 7.59612349389599e-06, + "loss": 3113.0, + "step": 9450 + }, + { + "ce_loss_13": 2.428489762544632, + "ce_loss_26": 1.9228288322687148, + "ce_loss_39": 1.7434315174818038, + "ce_loss_52": 1.4502023369073869, + "ce_loss_7": 2.745371562242508, + "epoch": 0.946, + "grad_norm": 13.467471129956634, + "kl_loss_13": 2003.8, + "kl_loss_26": 954.8, + "kl_loss_39": 579.9, + "kl_loss_7": 2674.8, + "learning_rate": 7.323082076153509e-06, + "loss": 3110.35, + "step": 9460 + }, + { + "ce_loss_13": 2.3993911921977995, + "ce_loss_26": 1.8969668239355086, + "ce_loss_39": 1.7201234728097916, + "ce_loss_52": 1.4261724770069122, + "ce_loss_7": 2.723515260219574, + "epoch": 0.947, + "grad_norm": 14.123562965002943, + "kl_loss_13": 1995.2, + "kl_loss_26": 940.5, + "kl_loss_39": 572.85, + "kl_loss_7": 2665.2, + "learning_rate": 7.055001909504755e-06, + "loss": 3103.5, + "step": 9470 + }, + { + "ce_loss_13": 2.371204599738121, + "ce_loss_26": 1.8650381177663804, + "ce_loss_39": 1.6901476740837098, + "ce_loss_52": 1.4073528528213501, + "ce_loss_7": 2.69525728225708, + "epoch": 0.948, + "grad_norm": 13.7395501584386, + "kl_loss_13": 2001.2, + "kl_loss_26": 926.3, + "kl_loss_39": 559.35, + "kl_loss_7": 2675.2, + "learning_rate": 6.791885693514133e-06, + "loss": 3117.9, + "step": 9480 + }, + { + "ce_loss_13": 2.3874946534633636, + "ce_loss_26": 1.8710165858268737, + "ce_loss_39": 1.6918294131755829, + "ce_loss_52": 1.4066254168748855, + "ce_loss_7": 2.718043899536133, + "epoch": 0.949, + "grad_norm": 14.444149745541107, + "kl_loss_13": 2026.6, + "kl_loss_26": 936.5, + "kl_loss_39": 563.35, + "kl_loss_7": 2720.4, + "learning_rate": 6.533736077758867e-06, + "loss": 3144.35, + "step": 9490 + }, + { + "ce_loss_13": 2.3802697598934173, + "ce_loss_26": 1.8696208387613296, + "ce_loss_39": 1.6913905203342439, + "ce_loss_52": 1.4053042978048325, + "ce_loss_7": 2.706346648931503, + "epoch": 0.95, + "grad_norm": 13.683323615533283, + "kl_loss_13": 2001.6, + "kl_loss_26": 931.6, + "kl_loss_39": 565.45, + "kl_loss_7": 2685.6, + "learning_rate": 6.2805556618028556e-06, + "loss": 3132.15, + "step": 9500 + }, + { + "ce_loss_13": 2.45430488884449, + "ce_loss_26": 1.9479888796806335, + "ce_loss_39": 1.7709876328706742, + "ce_loss_52": 1.4815271288156509, + "ce_loss_7": 2.7643910527229307, + "epoch": 0.951, + "grad_norm": 14.559144572280042, + "kl_loss_13": 2000.2, + "kl_loss_26": 946.3, + "kl_loss_39": 575.65, + "kl_loss_7": 2660.0, + "learning_rate": 6.032346995169968e-06, + "loss": 3124.85, + "step": 9510 + }, + { + "ce_loss_13": 2.4814863801002502, + "ce_loss_26": 1.976859924197197, + "ce_loss_39": 1.795655995607376, + "ce_loss_52": 1.4911505609750748, + "ce_loss_7": 2.796732819080353, + "epoch": 0.952, + "grad_norm": 14.307827869099665, + "kl_loss_13": 2052.0, + "kl_loss_26": 982.8, + "kl_loss_39": 599.65, + "kl_loss_7": 2722.8, + "learning_rate": 5.789112577318789e-06, + "loss": 3131.25, + "step": 9520 + }, + { + "ce_loss_13": 2.370393967628479, + "ce_loss_26": 1.8703551948070527, + "ce_loss_39": 1.6911178916692733, + "ce_loss_52": 1.397288253903389, + "ce_loss_7": 2.6921759128570555, + "epoch": 0.953, + "grad_norm": 13.532268906242434, + "kl_loss_13": 2004.2, + "kl_loss_26": 951.2, + "kl_loss_39": 578.45, + "kl_loss_7": 2674.4, + "learning_rate": 5.550854857617194e-06, + "loss": 3093.1, + "step": 9530 + }, + { + "ce_loss_13": 2.3659786969423293, + "ce_loss_26": 1.855891814827919, + "ce_loss_39": 1.6768086194992065, + "ce_loss_52": 1.3908632963895797, + "ce_loss_7": 2.6853357315063477, + "epoch": 0.954, + "grad_norm": 14.753075788642166, + "kl_loss_13": 2010.2, + "kl_loss_26": 935.5, + "kl_loss_39": 563.55, + "kl_loss_7": 2681.6, + "learning_rate": 5.317576235317756e-06, + "loss": 3120.8, + "step": 9540 + }, + { + "ce_loss_13": 2.4337273120880125, + "ce_loss_26": 1.9248733311891555, + "ce_loss_39": 1.7499325275421143, + "ce_loss_52": 1.4674480736255646, + "ce_loss_7": 2.7530871987342835, + "epoch": 0.955, + "grad_norm": 13.311149092546296, + "kl_loss_13": 1991.6, + "kl_loss_26": 925.5, + "kl_loss_39": 562.15, + "kl_loss_7": 2663.6, + "learning_rate": 5.089279059533658e-06, + "loss": 3080.15, + "step": 9550 + }, + { + "ce_loss_13": 2.477786514163017, + "ce_loss_26": 1.9535741955041885, + "ce_loss_39": 1.7720552951097488, + "ce_loss_52": 1.468274374306202, + "ce_loss_7": 2.806012988090515, + "epoch": 0.956, + "grad_norm": 13.557904970493434, + "kl_loss_13": 2082.8, + "kl_loss_26": 980.6, + "kl_loss_39": 601.5, + "kl_loss_7": 2773.6, + "learning_rate": 4.865965629214819e-06, + "loss": 3106.7, + "step": 9560 + }, + { + "ce_loss_13": 2.4565936863422393, + "ce_loss_26": 1.9573397368192673, + "ce_loss_39": 1.773080477118492, + "ce_loss_52": 1.4730938911437987, + "ce_loss_7": 2.7732574224472044, + "epoch": 0.957, + "grad_norm": 14.224632183872556, + "kl_loss_13": 2023.4, + "kl_loss_26": 967.5, + "kl_loss_39": 591.9, + "kl_loss_7": 2693.2, + "learning_rate": 4.6476381931251366e-06, + "loss": 3121.6, + "step": 9570 + }, + { + "ce_loss_13": 2.3847708880901335, + "ce_loss_26": 1.8858718812465667, + "ce_loss_39": 1.7032645136117934, + "ce_loss_52": 1.4209994703531266, + "ce_loss_7": 2.708657431602478, + "epoch": 0.958, + "grad_norm": 13.699065805322247, + "kl_loss_13": 1983.8, + "kl_loss_26": 921.9, + "kl_loss_39": 551.45, + "kl_loss_7": 2664.4, + "learning_rate": 4.434298949819449e-06, + "loss": 3097.9, + "step": 9580 + }, + { + "ce_loss_13": 2.417462554574013, + "ce_loss_26": 1.9159747958183289, + "ce_loss_39": 1.7397230744361878, + "ce_loss_52": 1.4456302881240846, + "ce_loss_7": 2.7354709684848784, + "epoch": 0.959, + "grad_norm": 13.043144407859455, + "kl_loss_13": 1990.0, + "kl_loss_26": 941.8, + "kl_loss_39": 574.8, + "kl_loss_7": 2662.4, + "learning_rate": 4.2259500476214406e-06, + "loss": 3095.8, + "step": 9590 + }, + { + "ce_loss_13": 2.427861177921295, + "ce_loss_26": 1.9228525012731552, + "ce_loss_39": 1.7392638593912124, + "ce_loss_52": 1.445976984500885, + "ce_loss_7": 2.7459777116775514, + "epoch": 0.96, + "grad_norm": 13.619745852010974, + "kl_loss_13": 2021.4, + "kl_loss_26": 954.5, + "kl_loss_39": 577.9, + "kl_loss_7": 2698.8, + "learning_rate": 4.02259358460233e-06, + "loss": 3122.9, + "step": 9600 + }, + { + "ce_loss_13": 2.46402502655983, + "ce_loss_26": 1.9587235629558564, + "ce_loss_39": 1.7794159650802612, + "ce_loss_52": 1.4795134991407395, + "ce_loss_7": 2.7920902401208876, + "epoch": 0.961, + "grad_norm": 13.957824607337622, + "kl_loss_13": 2034.8, + "kl_loss_26": 964.3, + "kl_loss_39": 588.35, + "kl_loss_7": 2719.6, + "learning_rate": 3.8242316085594916e-06, + "loss": 3106.9, + "step": 9610 + }, + { + "ce_loss_13": 2.4026571094989775, + "ce_loss_26": 1.8842627108097076, + "ce_loss_39": 1.6962791502475738, + "ce_loss_52": 1.4012041926383971, + "ce_loss_7": 2.7310706257820128, + "epoch": 0.962, + "grad_norm": 13.847893098332637, + "kl_loss_13": 2054.8, + "kl_loss_26": 965.5, + "kl_loss_39": 579.7, + "kl_loss_7": 2743.2, + "learning_rate": 3.630866116995757e-06, + "loss": 3149.6, + "step": 9620 + }, + { + "ce_loss_13": 2.3699823945760725, + "ce_loss_26": 1.8775872141122818, + "ce_loss_39": 1.7078934848308562, + "ce_loss_52": 1.427069191634655, + "ce_loss_7": 2.68268860578537, + "epoch": 0.963, + "grad_norm": 14.006927808523123, + "kl_loss_13": 1949.4, + "kl_loss_26": 909.6, + "kl_loss_39": 550.8, + "kl_loss_7": 2613.6, + "learning_rate": 3.4424990570994797e-06, + "loss": 3088.75, + "step": 9630 + }, + { + "ce_loss_13": 2.4296331614255906, + "ce_loss_26": 1.923089200258255, + "ce_loss_39": 1.7481043189764023, + "ce_loss_52": 1.458402395248413, + "ce_loss_7": 2.7494013249874114, + "epoch": 0.964, + "grad_norm": 13.971614829348757, + "kl_loss_13": 1997.2, + "kl_loss_26": 933.6, + "kl_loss_39": 570.15, + "kl_loss_7": 2669.6, + "learning_rate": 3.2591323257248896e-06, + "loss": 3114.2, + "step": 9640 + }, + { + "ce_loss_13": 2.4253605216741563, + "ce_loss_26": 1.920077031850815, + "ce_loss_39": 1.7424649715423584, + "ce_loss_52": 1.4570231169462204, + "ce_loss_7": 2.744251537322998, + "epoch": 0.965, + "grad_norm": 13.868560987861438, + "kl_loss_13": 2000.2, + "kl_loss_26": 937.9, + "kl_loss_39": 567.9, + "kl_loss_7": 2672.0, + "learning_rate": 3.0807677693729385e-06, + "loss": 3117.15, + "step": 9650 + }, + { + "ce_loss_13": 2.445168226957321, + "ce_loss_26": 1.9308179676532746, + "ce_loss_39": 1.7556968212127686, + "ce_loss_52": 1.462259876728058, + "ce_loss_7": 2.7607338547706606, + "epoch": 0.966, + "grad_norm": 13.868630790191208, + "kl_loss_13": 2030.0, + "kl_loss_26": 955.1, + "kl_loss_39": 582.55, + "kl_loss_7": 2710.4, + "learning_rate": 2.9074071841727055e-06, + "loss": 3136.15, + "step": 9660 + }, + { + "ce_loss_13": 2.3746090680360794, + "ce_loss_26": 1.8696930974721908, + "ce_loss_39": 1.6932952284812928, + "ce_loss_52": 1.4037827536463738, + "ce_loss_7": 2.6961917489767075, + "epoch": 0.967, + "grad_norm": 13.741681034653173, + "kl_loss_13": 2003.4, + "kl_loss_26": 943.6, + "kl_loss_39": 570.8, + "kl_loss_7": 2675.6, + "learning_rate": 2.739052315863355e-06, + "loss": 3123.325, + "step": 9670 + }, + { + "ce_loss_13": 2.4609276592731475, + "ce_loss_26": 1.9465218961238862, + "ce_loss_39": 1.755302396416664, + "ce_loss_52": 1.4513660728931428, + "ce_loss_7": 2.783074140548706, + "epoch": 0.968, + "grad_norm": 13.849931878044563, + "kl_loss_13": 2075.0, + "kl_loss_26": 988.8, + "kl_loss_39": 596.8, + "kl_loss_7": 2755.6, + "learning_rate": 2.5757048597765396e-06, + "loss": 3108.55, + "step": 9680 + }, + { + "ce_loss_13": 2.3671315789222716, + "ce_loss_26": 1.8652487874031067, + "ce_loss_39": 1.6895152300596237, + "ce_loss_52": 1.4115738093852996, + "ce_loss_7": 2.6859952569007874, + "epoch": 0.969, + "grad_norm": 14.271644916152136, + "kl_loss_13": 1975.0, + "kl_loss_26": 910.5, + "kl_loss_39": 544.7, + "kl_loss_7": 2649.2, + "learning_rate": 2.417366460819359e-06, + "loss": 3094.15, + "step": 9690 + }, + { + "ce_loss_13": 2.4030214190483092, + "ce_loss_26": 1.8979378938674927, + "ce_loss_39": 1.7221683353185653, + "ce_loss_52": 1.4381350710988046, + "ce_loss_7": 2.723929351568222, + "epoch": 0.97, + "grad_norm": 13.991735266099393, + "kl_loss_13": 1985.0, + "kl_loss_26": 927.7, + "kl_loss_39": 554.55, + "kl_loss_7": 2656.0, + "learning_rate": 2.2640387134577057e-06, + "loss": 3121.05, + "step": 9700 + }, + { + "ce_loss_13": 2.388926792144775, + "ce_loss_26": 1.8821999937295915, + "ce_loss_39": 1.7065987050533296, + "ce_loss_52": 1.427994754910469, + "ce_loss_7": 2.7048760533332823, + "epoch": 0.971, + "grad_norm": 14.179862402525806, + "kl_loss_13": 1973.6, + "kl_loss_26": 911.0, + "kl_loss_39": 548.95, + "kl_loss_7": 2646.4, + "learning_rate": 2.115723161700278e-06, + "loss": 3136.0, + "step": 9710 + }, + { + "ce_loss_13": 2.448368564248085, + "ce_loss_26": 1.9373161673545838, + "ce_loss_39": 1.7496683716773986, + "ce_loss_52": 1.4501112252473831, + "ce_loss_7": 2.7681332349777223, + "epoch": 0.972, + "grad_norm": 13.312305733104958, + "kl_loss_13": 2063.8, + "kl_loss_26": 986.5, + "kl_loss_39": 593.8, + "kl_loss_7": 2737.6, + "learning_rate": 1.9724212990830937e-06, + "loss": 3096.45, + "step": 9720 + }, + { + "ce_loss_13": 2.40165196955204, + "ce_loss_26": 1.9097970753908158, + "ce_loss_39": 1.732149314880371, + "ce_loss_52": 1.4418910443782806, + "ce_loss_7": 2.7182520925998688, + "epoch": 0.973, + "grad_norm": 13.186413860645287, + "kl_loss_13": 1986.6, + "kl_loss_26": 939.2, + "kl_loss_39": 575.8, + "kl_loss_7": 2649.6, + "learning_rate": 1.8341345686543331e-06, + "loss": 3096.7, + "step": 9730 + }, + { + "ce_loss_13": 2.4751327097415925, + "ce_loss_26": 1.9718121886253357, + "ce_loss_39": 1.7938049882650375, + "ce_loss_52": 1.510325726866722, + "ce_loss_7": 2.790462166070938, + "epoch": 0.974, + "grad_norm": 13.542368115510277, + "kl_loss_13": 1991.0, + "kl_loss_26": 919.4, + "kl_loss_39": 555.25, + "kl_loss_7": 2656.0, + "learning_rate": 1.7008643629596864e-06, + "loss": 3139.35, + "step": 9740 + }, + { + "ce_loss_13": 2.448350805044174, + "ce_loss_26": 1.9393187165260315, + "ce_loss_39": 1.756307190656662, + "ce_loss_52": 1.4625631257891656, + "ce_loss_7": 2.7839869439601896, + "epoch": 0.975, + "grad_norm": 14.153531935288711, + "kl_loss_13": 2030.2, + "kl_loss_26": 953.5, + "kl_loss_39": 573.5, + "kl_loss_7": 2731.6, + "learning_rate": 1.5726120240288633e-06, + "loss": 3091.65, + "step": 9750 + }, + { + "ce_loss_13": 2.493607670068741, + "ce_loss_26": 1.9704292267560959, + "ce_loss_39": 1.7810406684875488, + "ce_loss_52": 1.4734347879886627, + "ce_loss_7": 2.8225875020027162, + "epoch": 0.976, + "grad_norm": 13.917924014106907, + "kl_loss_13": 2092.8, + "kl_loss_26": 995.9, + "kl_loss_39": 599.65, + "kl_loss_7": 2777.6, + "learning_rate": 1.4493788433612708e-06, + "loss": 3106.15, + "step": 9760 + }, + { + "ce_loss_13": 2.389453822374344, + "ce_loss_26": 1.889643257856369, + "ce_loss_39": 1.7158276617527009, + "ce_loss_52": 1.4275161743164062, + "ce_loss_7": 2.7086060285568236, + "epoch": 0.977, + "grad_norm": 13.614062458586098, + "kl_loss_13": 1971.2, + "kl_loss_26": 924.0, + "kl_loss_39": 561.0, + "kl_loss_7": 2646.0, + "learning_rate": 1.3311660619138578e-06, + "loss": 3083.9, + "step": 9770 + }, + { + "ce_loss_13": 2.387269985675812, + "ce_loss_26": 1.8734027475118638, + "ce_loss_39": 1.6912487357854844, + "ce_loss_52": 1.4031418770551682, + "ce_loss_7": 2.7191080808639527, + "epoch": 0.978, + "grad_norm": 14.36678395992836, + "kl_loss_13": 2023.2, + "kl_loss_26": 943.6, + "kl_loss_39": 565.3, + "kl_loss_7": 2722.0, + "learning_rate": 1.2179748700879012e-06, + "loss": 3100.55, + "step": 9780 + }, + { + "ce_loss_13": 2.3631068110466003, + "ce_loss_26": 1.861675202846527, + "ce_loss_39": 1.6827853351831437, + "ce_loss_52": 1.3983743026852609, + "ce_loss_7": 2.6796926259994507, + "epoch": 0.979, + "grad_norm": 14.06766234963321, + "kl_loss_13": 1993.6, + "kl_loss_26": 930.1, + "kl_loss_39": 561.8, + "kl_loss_7": 2668.4, + "learning_rate": 1.1098064077174619e-06, + "loss": 3119.95, + "step": 9790 + }, + { + "ce_loss_13": 2.4609683632850645, + "ce_loss_26": 1.948616126179695, + "ce_loss_39": 1.7647934973239898, + "ce_loss_52": 1.454009547829628, + "ce_loss_7": 2.785689663887024, + "epoch": 0.98, + "grad_norm": 13.39999724987333, + "kl_loss_13": 2070.0, + "kl_loss_26": 989.6, + "kl_loss_39": 607.8, + "kl_loss_7": 2749.6, + "learning_rate": 1.006661764057837e-06, + "loss": 3101.0, + "step": 9800 + }, + { + "ce_loss_13": 2.3849492847919462, + "ce_loss_26": 1.8622053205966949, + "ce_loss_39": 1.6866901487112045, + "ce_loss_52": 1.3904988124966622, + "ce_loss_7": 2.713945233821869, + "epoch": 0.981, + "grad_norm": 13.776704297392317, + "kl_loss_13": 2060.2, + "kl_loss_26": 959.2, + "kl_loss_39": 584.2, + "kl_loss_7": 2753.2, + "learning_rate": 9.085419777743465e-07, + "loss": 3145.375, + "step": 9810 + }, + { + "ce_loss_13": 2.423547920584679, + "ce_loss_26": 1.9212449431419372, + "ce_loss_39": 1.7484615802764893, + "ce_loss_52": 1.4525489255785942, + "ce_loss_7": 2.7415110945701597, + "epoch": 0.982, + "grad_norm": 13.800146507088886, + "kl_loss_13": 2026.2, + "kl_loss_26": 956.5, + "kl_loss_39": 585.85, + "kl_loss_7": 2700.8, + "learning_rate": 8.15448036932176e-07, + "loss": 3140.075, + "step": 9820 + }, + { + "ce_loss_13": 2.4265142381191254, + "ce_loss_26": 1.9230076640844345, + "ce_loss_39": 1.7403117150068284, + "ce_loss_52": 1.44933120906353, + "ce_loss_7": 2.743628019094467, + "epoch": 0.983, + "grad_norm": 13.731982955757704, + "kl_loss_13": 2035.6, + "kl_loss_26": 967.5, + "kl_loss_39": 589.65, + "kl_loss_7": 2710.4, + "learning_rate": 7.273808789862724e-07, + "loss": 3097.325, + "step": 9830 + }, + { + "ce_loss_13": 2.432727184891701, + "ce_loss_26": 1.9222358494997025, + "ce_loss_39": 1.7392481476068498, + "ce_loss_52": 1.4496447369456291, + "ce_loss_7": 2.7563742280006407, + "epoch": 0.984, + "grad_norm": 14.26920700265725, + "kl_loss_13": 2031.0, + "kl_loss_26": 951.8, + "kl_loss_39": 579.1, + "kl_loss_7": 2718.0, + "learning_rate": 6.443413907720186e-07, + "loss": 3091.6, + "step": 9840 + }, + { + "ce_loss_13": 2.3425186455249785, + "ce_loss_26": 1.8573100596666337, + "ce_loss_39": 1.6887821286916733, + "ce_loss_52": 1.3970566481351852, + "ce_loss_7": 2.6601881802082064, + "epoch": 0.985, + "grad_norm": 14.017763045011195, + "kl_loss_13": 1953.8, + "kl_loss_26": 920.7, + "kl_loss_39": 567.3, + "kl_loss_7": 2625.2, + "learning_rate": 5.663304084960185e-07, + "loss": 3110.05, + "step": 9850 + }, + { + "ce_loss_13": 2.3719563096761704, + "ce_loss_26": 1.8737152755260467, + "ce_loss_39": 1.698423257470131, + "ce_loss_52": 1.4208501130342484, + "ce_loss_7": 2.6962190210819243, + "epoch": 0.986, + "grad_norm": 14.355657718500265, + "kl_loss_13": 1944.8, + "kl_loss_26": 901.7, + "kl_loss_39": 537.7, + "kl_loss_7": 2617.8, + "learning_rate": 4.933487177280482e-07, + "loss": 3084.175, + "step": 9860 + }, + { + "ce_loss_13": 2.4431921422481535, + "ce_loss_26": 1.9350731909275054, + "ce_loss_39": 1.7554681122303009, + "ce_loss_52": 1.4598491072654725, + "ce_loss_7": 2.761775279045105, + "epoch": 0.987, + "grad_norm": 14.504857971707292, + "kl_loss_13": 2021.8, + "kl_loss_26": 959.6, + "kl_loss_39": 582.75, + "kl_loss_7": 2686.8, + "learning_rate": 4.2539705339295075e-07, + "loss": 3095.3, + "step": 9870 + }, + { + "ce_loss_13": 2.3912162601947786, + "ce_loss_26": 1.8867509424686433, + "ce_loss_39": 1.7147331923246383, + "ce_loss_52": 1.4274780035018921, + "ce_loss_7": 2.7152205407619476, + "epoch": 0.988, + "grad_norm": 13.898651253722077, + "kl_loss_13": 1978.6, + "kl_loss_26": 918.4, + "kl_loss_39": 560.0, + "kl_loss_7": 2653.6, + "learning_rate": 3.6247609976319816e-07, + "loss": 3111.5, + "step": 9880 + }, + { + "ce_loss_13": 2.4592268586158754, + "ce_loss_26": 1.9460572868585586, + "ce_loss_39": 1.7624834805727005, + "ce_loss_52": 1.4708713114261627, + "ce_loss_7": 2.780032974481583, + "epoch": 0.989, + "grad_norm": 13.162597089199776, + "kl_loss_13": 2017.4, + "kl_loss_26": 950.2, + "kl_loss_39": 574.95, + "kl_loss_7": 2680.0, + "learning_rate": 3.0458649045211895e-07, + "loss": 3104.75, + "step": 9890 + }, + { + "ce_loss_13": 2.318621850013733, + "ce_loss_26": 1.8226055085659028, + "ce_loss_39": 1.6485673993825913, + "ce_loss_52": 1.3747537702322006, + "ce_loss_7": 2.6328552305698394, + "epoch": 0.99, + "grad_norm": 14.177492293626315, + "kl_loss_13": 1957.6, + "kl_loss_26": 901.5, + "kl_loss_39": 543.2, + "kl_loss_7": 2623.6, + "learning_rate": 2.517288084074587e-07, + "loss": 3097.8, + "step": 9900 + }, + { + "ce_loss_13": 2.450270253419876, + "ce_loss_26": 1.9614087045192719, + "ce_loss_39": 1.820476683974266, + "ce_loss_52": 1.4863917350769043, + "ce_loss_7": 2.776776838302612, + "epoch": 0.991, + "grad_norm": 13.637096409193225, + "kl_loss_13": 2041.0, + "kl_loss_26": 993.3, + "kl_loss_39": 622.6, + "kl_loss_7": 2724.0, + "learning_rate": 2.0390358590538505e-07, + "loss": 3133.55, + "step": 9910 + }, + { + "ce_loss_13": 2.4164200723171234, + "ce_loss_26": 1.9075238525867462, + "ce_loss_39": 1.7269282668828965, + "ce_loss_52": 1.4252729326486588, + "ce_loss_7": 2.7410891175270082, + "epoch": 0.992, + "grad_norm": 14.06088505281845, + "kl_loss_13": 2041.6, + "kl_loss_26": 973.6, + "kl_loss_39": 593.7, + "kl_loss_7": 2722.8, + "learning_rate": 1.61111304545436e-07, + "loss": 3101.25, + "step": 9920 + }, + { + "ce_loss_13": 2.4260194152593613, + "ce_loss_26": 1.9236795336008072, + "ce_loss_39": 1.7412341982126236, + "ce_loss_52": 1.445477369427681, + "ce_loss_7": 2.750396305322647, + "epoch": 0.993, + "grad_norm": 13.813190658092928, + "kl_loss_13": 2043.4, + "kl_loss_26": 963.8, + "kl_loss_39": 591.1, + "kl_loss_7": 2727.2, + "learning_rate": 1.2335239524541298e-07, + "loss": 3113.2, + "step": 9930 + }, + { + "ce_loss_13": 2.4003034621477126, + "ce_loss_26": 1.8995478272438049, + "ce_loss_39": 1.7172971665859222, + "ce_loss_52": 1.4267783105373382, + "ce_loss_7": 2.730258399248123, + "epoch": 0.994, + "grad_norm": 14.088698553106907, + "kl_loss_13": 2017.2, + "kl_loss_26": 955.6, + "kl_loss_39": 580.2, + "kl_loss_7": 2698.8, + "learning_rate": 9.06272382371065e-08, + "loss": 3112.45, + "step": 9940 + }, + { + "ce_loss_13": 2.3436710268259047, + "ce_loss_26": 1.860148760676384, + "ce_loss_39": 1.6837237626314163, + "ce_loss_52": 1.405136799812317, + "ce_loss_7": 2.659210926294327, + "epoch": 0.995, + "grad_norm": 13.69734193648067, + "kl_loss_13": 1922.2, + "kl_loss_26": 893.6, + "kl_loss_39": 537.7, + "kl_loss_7": 2583.2, + "learning_rate": 6.293616306246586e-08, + "loss": 3122.45, + "step": 9950 + }, + { + "ce_loss_13": 2.4028525710105897, + "ce_loss_26": 1.9053959518671035, + "ce_loss_39": 1.734974354505539, + "ce_loss_52": 1.4462752103805543, + "ce_loss_7": 2.7197851181030273, + "epoch": 0.996, + "grad_norm": 14.052327245568536, + "kl_loss_13": 1985.8, + "kl_loss_26": 933.9, + "kl_loss_39": 572.05, + "kl_loss_7": 2659.2, + "learning_rate": 4.027944857032395e-08, + "loss": 3115.75, + "step": 9960 + }, + { + "ce_loss_13": 2.3983709454536437, + "ce_loss_26": 1.897152093052864, + "ce_loss_39": 1.7151427894830704, + "ce_loss_52": 1.4271936371922493, + "ce_loss_7": 2.7171947032213213, + "epoch": 0.997, + "grad_norm": 13.53314291583234, + "kl_loss_13": 1999.0, + "kl_loss_26": 942.7, + "kl_loss_39": 568.8, + "kl_loss_7": 2670.0, + "learning_rate": 2.265732291356626e-08, + "loss": 3094.175, + "step": 9970 + }, + { + "ce_loss_13": 2.3423147082328795, + "ce_loss_26": 1.8376368135213852, + "ce_loss_39": 1.6704195857048034, + "ce_loss_52": 1.3995309814810752, + "ce_loss_7": 2.6571378737688063, + "epoch": 0.998, + "grad_norm": 13.139494465989356, + "kl_loss_13": 1954.0, + "kl_loss_26": 894.2, + "kl_loss_39": 540.1, + "kl_loss_7": 2622.6, + "learning_rate": 1.0069963546743833e-08, + "loss": 3091.8, + "step": 9980 + }, + { + "ce_loss_13": 2.371971958875656, + "ce_loss_26": 1.8749190032482148, + "ce_loss_39": 1.7022694885730743, + "ce_loss_52": 1.4237666621804237, + "ce_loss_7": 2.694683998823166, + "epoch": 0.999, + "grad_norm": 13.948245708314273, + "kl_loss_13": 1952.8, + "kl_loss_26": 907.5, + "kl_loss_39": 547.75, + "kl_loss_7": 2620.8, + "learning_rate": 2.517497224463483e-09, + "loss": 3089.0, + "step": 9990 + }, + { + "ce_loss_13": 2.4080661326646804, + "ce_loss_26": 1.896571347117424, + "ce_loss_39": 1.7096484139561654, + "ce_loss_52": 1.4170419454574585, + "ce_loss_7": 2.7374835878610613, + "epoch": 1.0, + "grad_norm": 13.857134097975196, + "kl_loss_13": 2044.2, + "kl_loss_26": 961.3, + "kl_loss_39": 577.05, + "kl_loss_7": 2731.6, + "learning_rate": 0.0, + "loss": 3103.8, + "step": 10000 + } + ], + "logging_steps": 10, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0167830278176768e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}