| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 10000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "ce_loss_13": 11.511599779129028, |
| "ce_loss_26": 11.188396453857422, |
| "ce_loss_39": 11.169448137283325, |
| "ce_loss_52": 1.3891706466674805, |
| "ce_loss_7": 11.556999206542969, |
| "epoch": 0.0001, |
| "grad_norm": 28.059961985369828, |
| "kl_loss_13": 20896.0, |
| "kl_loss_26": 20192.0, |
| "kl_loss_39": 20192.0, |
| "kl_loss_7": 20960.0, |
| "learning_rate": 1e-05, |
| "loss": 41080.0, |
| "step": 1 |
| }, |
| { |
| "ce_loss_13": 11.506269454956055, |
| "ce_loss_26": 11.177568621105618, |
| "ce_loss_39": 11.177141269048056, |
| "ce_loss_52": 1.458960132466422, |
| "ce_loss_7": 11.548744599024454, |
| "epoch": 0.001, |
| "grad_norm": 28.667146352003318, |
| "kl_loss_13": 20782.222222222223, |
| "kl_loss_26": 20106.666666666668, |
| "kl_loss_39": 20110.222222222223, |
| "kl_loss_7": 20867.555555555555, |
| "learning_rate": 0.0001, |
| "loss": 41008.8889, |
| "step": 10 |
| }, |
| { |
| "ce_loss_13": 11.456571412086486, |
| "ce_loss_26": 11.158088731765748, |
| "ce_loss_39": 11.15653133392334, |
| "ce_loss_52": 1.435088688135147, |
| "ce_loss_7": 11.476131820678711, |
| "epoch": 0.002, |
| "grad_norm": 35.67456270110165, |
| "kl_loss_13": 20723.2, |
| "kl_loss_26": 20118.4, |
| "kl_loss_39": 20115.2, |
| "kl_loss_7": 20764.8, |
| "learning_rate": 0.0002, |
| "loss": 40904.0, |
| "step": 20 |
| }, |
| { |
| "ce_loss_13": 11.150281167030334, |
| "ce_loss_26": 11.01296763420105, |
| "ce_loss_39": 11.044581699371339, |
| "ce_loss_52": 1.4344331562519073, |
| "ce_loss_7": 11.054779505729675, |
| "epoch": 0.003, |
| "grad_norm": 54.04879245830703, |
| "kl_loss_13": 20108.8, |
| "kl_loss_26": 19840.0, |
| "kl_loss_39": 19907.2, |
| "kl_loss_7": 19920.0, |
| "learning_rate": 0.0003, |
| "loss": 39847.2, |
| "step": 30 |
| }, |
| { |
| "ce_loss_13": 10.505089902877808, |
| "ce_loss_26": 10.497783017158508, |
| "ce_loss_39": 10.527814579010009, |
| "ce_loss_52": 1.460255417227745, |
| "ce_loss_7": 10.453347158432006, |
| "epoch": 0.004, |
| "grad_norm": 29.567872258029254, |
| "kl_loss_13": 18694.4, |
| "kl_loss_26": 18688.0, |
| "kl_loss_39": 18755.2, |
| "kl_loss_7": 18588.8, |
| "learning_rate": 0.0004, |
| "loss": 37436.0, |
| "step": 40 |
| }, |
| { |
| "ce_loss_13": 10.321261882781982, |
| "ce_loss_26": 10.244042158126831, |
| "ce_loss_39": 10.236308455467224, |
| "ce_loss_52": 1.463668829202652, |
| "ce_loss_7": 10.305184721946716, |
| "epoch": 0.005, |
| "grad_norm": 37.9866452371617, |
| "kl_loss_13": 18329.6, |
| "kl_loss_26": 18163.2, |
| "kl_loss_39": 18140.8, |
| "kl_loss_7": 18288.0, |
| "learning_rate": 0.0005, |
| "loss": 36555.2, |
| "step": 50 |
| }, |
| { |
| "ce_loss_13": 10.226529097557068, |
| "ce_loss_26": 10.111042308807374, |
| "ce_loss_39": 10.11395993232727, |
| "ce_loss_52": 1.4317695140838622, |
| "ce_loss_7": 10.210216856002807, |
| "epoch": 0.006, |
| "grad_norm": 47.58894649950276, |
| "kl_loss_13": 18208.0, |
| "kl_loss_26": 17974.4, |
| "kl_loss_39": 17980.8, |
| "kl_loss_7": 18166.4, |
| "learning_rate": 0.0006, |
| "loss": 36044.0, |
| "step": 60 |
| }, |
| { |
| "ce_loss_13": 10.142269968986511, |
| "ce_loss_26": 10.005614733695984, |
| "ce_loss_39": 10.006310772895812, |
| "ce_loss_52": 1.3979130625724792, |
| "ce_loss_7": 10.13636019229889, |
| "epoch": 0.007, |
| "grad_norm": 55.16387671378209, |
| "kl_loss_13": 18057.6, |
| "kl_loss_26": 17772.8, |
| "kl_loss_39": 17792.0, |
| "kl_loss_7": 18048.0, |
| "learning_rate": 0.0007, |
| "loss": 35718.4, |
| "step": 70 |
| }, |
| { |
| "ce_loss_13": 10.032484984397888, |
| "ce_loss_26": 9.872331905364991, |
| "ce_loss_39": 9.881066274642944, |
| "ce_loss_52": 1.4247985988855363, |
| "ce_loss_7": 10.02949812412262, |
| "epoch": 0.008, |
| "grad_norm": 59.28947925840698, |
| "kl_loss_13": 17811.2, |
| "kl_loss_26": 17488.0, |
| "kl_loss_39": 17500.8, |
| "kl_loss_7": 17808.0, |
| "learning_rate": 0.0008, |
| "loss": 35334.4, |
| "step": 80 |
| }, |
| { |
| "ce_loss_13": 9.942931509017944, |
| "ce_loss_26": 9.76176996231079, |
| "ce_loss_39": 9.775496363639832, |
| "ce_loss_52": 1.4258457243442535, |
| "ce_loss_7": 9.945418453216552, |
| "epoch": 0.009, |
| "grad_norm": 55.94798234885439, |
| "kl_loss_13": 17600.0, |
| "kl_loss_26": 17222.4, |
| "kl_loss_39": 17257.6, |
| "kl_loss_7": 17600.0, |
| "learning_rate": 0.0009000000000000001, |
| "loss": 34900.0, |
| "step": 90 |
| }, |
| { |
| "ce_loss_13": 9.852771949768066, |
| "ce_loss_26": 9.661247444152831, |
| "ce_loss_39": 9.673552298545838, |
| "ce_loss_52": 1.438367447257042, |
| "ce_loss_7": 9.860591006278991, |
| "epoch": 0.01, |
| "grad_norm": 53.3090381296634, |
| "kl_loss_13": 17385.6, |
| "kl_loss_26": 16992.0, |
| "kl_loss_39": 17024.0, |
| "kl_loss_7": 17398.4, |
| "learning_rate": 0.001, |
| "loss": 34482.8, |
| "step": 100 |
| }, |
| { |
| "ce_loss_13": 9.76314606666565, |
| "ce_loss_26": 9.563278603553773, |
| "ce_loss_39": 9.578891181945801, |
| "ce_loss_52": 1.412995059788227, |
| "ce_loss_7": 9.781555676460266, |
| "epoch": 0.011, |
| "grad_norm": 53.230976887502294, |
| "kl_loss_13": 17251.2, |
| "kl_loss_26": 16836.8, |
| "kl_loss_39": 16870.4, |
| "kl_loss_7": 17305.6, |
| "learning_rate": 0.0009999974825027757, |
| "loss": 34052.4, |
| "step": 110 |
| }, |
| { |
| "ce_loss_13": 9.681499814987182, |
| "ce_loss_26": 9.470890092849732, |
| "ce_loss_39": 9.48718273639679, |
| "ce_loss_52": 1.4235966846346855, |
| "ce_loss_7": 9.706467342376708, |
| "epoch": 0.012, |
| "grad_norm": 53.526819502242695, |
| "kl_loss_13": 17049.6, |
| "kl_loss_26": 16612.8, |
| "kl_loss_39": 16648.0, |
| "kl_loss_7": 17100.8, |
| "learning_rate": 0.0009999899300364532, |
| "loss": 33698.0, |
| "step": 120 |
| }, |
| { |
| "ce_loss_13": 9.590748715400697, |
| "ce_loss_26": 9.367487025260925, |
| "ce_loss_39": 9.386657476425171, |
| "ce_loss_52": 1.4183751314878463, |
| "ce_loss_7": 9.621446299552918, |
| "epoch": 0.013, |
| "grad_norm": 52.25839955403129, |
| "kl_loss_13": 16867.2, |
| "kl_loss_26": 16417.6, |
| "kl_loss_39": 16448.0, |
| "kl_loss_7": 16940.8, |
| "learning_rate": 0.0009999773426770863, |
| "loss": 33311.6, |
| "step": 130 |
| }, |
| { |
| "ce_loss_13": 9.527826118469239, |
| "ce_loss_26": 9.299377870559692, |
| "ce_loss_39": 9.321026277542114, |
| "ce_loss_52": 1.445027893781662, |
| "ce_loss_7": 9.561844515800477, |
| "epoch": 0.014, |
| "grad_norm": 52.41222674765903, |
| "kl_loss_13": 16692.8, |
| "kl_loss_26": 16227.2, |
| "kl_loss_39": 16273.6, |
| "kl_loss_7": 16777.6, |
| "learning_rate": 0.0009999597205514296, |
| "loss": 33030.8, |
| "step": 140 |
| }, |
| { |
| "ce_loss_13": 9.486352849006654, |
| "ce_loss_26": 9.252249264717102, |
| "ce_loss_39": 9.267914438247681, |
| "ce_loss_52": 1.4420335739850998, |
| "ce_loss_7": 9.524769949913026, |
| "epoch": 0.015, |
| "grad_norm": 53.790180993856175, |
| "kl_loss_13": 16592.0, |
| "kl_loss_26": 16104.0, |
| "kl_loss_39": 16132.8, |
| "kl_loss_7": 16657.6, |
| "learning_rate": 0.0009999370638369377, |
| "loss": 32769.2, |
| "step": 150 |
| }, |
| { |
| "ce_loss_13": 9.392394828796387, |
| "ce_loss_26": 9.15080394744873, |
| "ce_loss_39": 9.170540618896485, |
| "ce_loss_52": 1.423890632390976, |
| "ce_loss_7": 9.436835885047913, |
| "epoch": 0.016, |
| "grad_norm": 52.65338822593253, |
| "kl_loss_13": 16464.0, |
| "kl_loss_26": 15963.2, |
| "kl_loss_39": 16003.2, |
| "kl_loss_7": 16563.2, |
| "learning_rate": 0.000999909372761763, |
| "loss": 32427.6, |
| "step": 160 |
| }, |
| { |
| "ce_loss_13": 9.328035354614258, |
| "ce_loss_26": 9.082435154914856, |
| "ce_loss_39": 9.1047847032547, |
| "ce_loss_52": 1.4349344044923782, |
| "ce_loss_7": 9.380868554115295, |
| "epoch": 0.017, |
| "grad_norm": 52.40583142267758, |
| "kl_loss_13": 16296.0, |
| "kl_loss_26": 15777.6, |
| "kl_loss_39": 15832.0, |
| "kl_loss_7": 16404.8, |
| "learning_rate": 0.0009998766476047546, |
| "loss": 32178.8, |
| "step": 170 |
| }, |
| { |
| "ce_loss_13": 9.262916254997254, |
| "ce_loss_26": 9.01283278465271, |
| "ce_loss_39": 9.035380673408508, |
| "ce_loss_52": 1.3936711609363557, |
| "ce_loss_7": 9.322635316848755, |
| "epoch": 0.018, |
| "grad_norm": 52.346136942448574, |
| "kl_loss_13": 16240.0, |
| "kl_loss_26": 15716.8, |
| "kl_loss_39": 15769.6, |
| "kl_loss_7": 16363.2, |
| "learning_rate": 0.0009998388886954545, |
| "loss": 31844.4, |
| "step": 180 |
| }, |
| { |
| "ce_loss_13": 9.195696568489074, |
| "ce_loss_26": 8.94514548778534, |
| "ce_loss_39": 8.967138314247132, |
| "ce_loss_52": 1.4523959368467332, |
| "ce_loss_7": 9.263990116119384, |
| "epoch": 0.019, |
| "grad_norm": 51.44547467780483, |
| "kl_loss_13": 15985.6, |
| "kl_loss_26": 15464.0, |
| "kl_loss_39": 15515.2, |
| "kl_loss_7": 16132.8, |
| "learning_rate": 0.0009997960964140947, |
| "loss": 31580.0, |
| "step": 190 |
| }, |
| { |
| "ce_loss_13": 9.109345388412475, |
| "ce_loss_26": 8.856351280212403, |
| "ce_loss_39": 8.882033634185792, |
| "ce_loss_52": 1.425847691297531, |
| "ce_loss_7": 9.182338738441468, |
| "epoch": 0.02, |
| "grad_norm": 51.67086637607359, |
| "kl_loss_13": 15867.2, |
| "kl_loss_26": 15332.8, |
| "kl_loss_39": 15393.6, |
| "kl_loss_7": 16011.2, |
| "learning_rate": 0.0009997482711915926, |
| "loss": 31312.8, |
| "step": 200 |
| }, |
| { |
| "ce_loss_13": 9.036305499076843, |
| "ce_loss_26": 8.778529453277589, |
| "ce_loss_39": 8.803540563583374, |
| "ce_loss_52": 1.4626984983682632, |
| "ce_loss_7": 9.118139266967773, |
| "epoch": 0.021, |
| "grad_norm": 50.078507295298536, |
| "kl_loss_13": 15654.4, |
| "kl_loss_26": 15108.8, |
| "kl_loss_39": 15168.0, |
| "kl_loss_7": 15828.8, |
| "learning_rate": 0.0009996954135095479, |
| "loss": 31012.0, |
| "step": 210 |
| }, |
| { |
| "ce_loss_13": 8.98859736919403, |
| "ce_loss_26": 8.726609206199646, |
| "ce_loss_39": 8.75291087627411, |
| "ce_loss_52": 1.4173608794808388, |
| "ce_loss_7": 9.076500582695008, |
| "epoch": 0.022, |
| "grad_norm": 50.694312927544594, |
| "kl_loss_13": 15625.6, |
| "kl_loss_26": 15073.6, |
| "kl_loss_39": 15126.4, |
| "kl_loss_7": 15811.2, |
| "learning_rate": 0.0009996375239002368, |
| "loss": 30754.8, |
| "step": 220 |
| }, |
| { |
| "ce_loss_13": 8.965013265609741, |
| "ce_loss_26": 8.698115158081055, |
| "ce_loss_39": 8.71800787448883, |
| "ce_loss_52": 1.4263556391000747, |
| "ce_loss_7": 9.061096882820129, |
| "epoch": 0.023, |
| "grad_norm": 50.98814656536181, |
| "kl_loss_13": 15547.2, |
| "kl_loss_26": 14985.6, |
| "kl_loss_39": 15028.8, |
| "kl_loss_7": 15752.0, |
| "learning_rate": 0.0009995746029466072, |
| "loss": 30513.6, |
| "step": 230 |
| }, |
| { |
| "ce_loss_13": 8.891163158416749, |
| "ce_loss_26": 8.611021280288696, |
| "ce_loss_39": 8.63300838470459, |
| "ce_loss_52": 1.42348592877388, |
| "ce_loss_7": 8.998197555541992, |
| "epoch": 0.024, |
| "grad_norm": 51.614487435815626, |
| "kl_loss_13": 15393.6, |
| "kl_loss_26": 14806.4, |
| "kl_loss_39": 14848.0, |
| "kl_loss_7": 15619.2, |
| "learning_rate": 0.0009995066512822719, |
| "loss": 30248.4, |
| "step": 240 |
| }, |
| { |
| "ce_loss_13": 8.831620502471925, |
| "ce_loss_26": 8.545269632339478, |
| "ce_loss_39": 8.565073847770691, |
| "ce_loss_52": 1.452064010500908, |
| "ce_loss_7": 8.941647911071778, |
| "epoch": 0.025, |
| "grad_norm": 50.247038771654694, |
| "kl_loss_13": 15233.6, |
| "kl_loss_26": 14628.8, |
| "kl_loss_39": 14667.2, |
| "kl_loss_7": 15462.4, |
| "learning_rate": 0.000999433669591504, |
| "loss": 29955.6, |
| "step": 250 |
| }, |
| { |
| "ce_loss_13": 8.755429339408874, |
| "ce_loss_26": 8.469949841499329, |
| "ce_loss_39": 8.486004614830017, |
| "ce_loss_52": 1.4328533172607423, |
| "ce_loss_7": 8.874198198318481, |
| "epoch": 0.026, |
| "grad_norm": 49.63162227981652, |
| "kl_loss_13": 15075.2, |
| "kl_loss_26": 14473.6, |
| "kl_loss_39": 14508.8, |
| "kl_loss_7": 15326.4, |
| "learning_rate": 0.000999355658609228, |
| "loss": 29717.2, |
| "step": 260 |
| }, |
| { |
| "ce_loss_13": 8.697371244430542, |
| "ce_loss_26": 8.401416063308716, |
| "ce_loss_39": 8.412619948387146, |
| "ce_loss_52": 1.4436978071928024, |
| "ce_loss_7": 8.819016146659852, |
| "epoch": 0.027, |
| "grad_norm": 51.15355718783705, |
| "kl_loss_13": 14974.4, |
| "kl_loss_26": 14347.2, |
| "kl_loss_39": 14377.6, |
| "kl_loss_7": 15235.2, |
| "learning_rate": 0.0009992726191210138, |
| "loss": 29500.4, |
| "step": 270 |
| }, |
| { |
| "ce_loss_13": 8.66446521282196, |
| "ce_loss_26": 8.36073157787323, |
| "ce_loss_39": 8.373045516014098, |
| "ce_loss_52": 1.4337088972330094, |
| "ce_loss_7": 8.793687105178833, |
| "epoch": 0.028, |
| "grad_norm": 50.66919780267985, |
| "kl_loss_13": 14888.0, |
| "kl_loss_26": 14254.4, |
| "kl_loss_39": 14280.0, |
| "kl_loss_7": 15168.0, |
| "learning_rate": 0.0009991845519630679, |
| "loss": 29316.8, |
| "step": 280 |
| }, |
| { |
| "ce_loss_13": 8.604181599617004, |
| "ce_loss_26": 8.292751049995422, |
| "ce_loss_39": 8.304360592365265, |
| "ce_loss_52": 1.4289869368076324, |
| "ce_loss_7": 8.740529561042786, |
| "epoch": 0.029, |
| "grad_norm": 49.40131878460191, |
| "kl_loss_13": 14785.6, |
| "kl_loss_26": 14123.2, |
| "kl_loss_39": 14147.2, |
| "kl_loss_7": 15065.6, |
| "learning_rate": 0.0009990914580222257, |
| "loss": 29034.0, |
| "step": 290 |
| }, |
| { |
| "ce_loss_13": 8.560984683036803, |
| "ce_loss_26": 8.244250774383545, |
| "ce_loss_39": 8.251447391510009, |
| "ce_loss_52": 1.4629653513431549, |
| "ce_loss_7": 8.694539451599121, |
| "epoch": 0.03, |
| "grad_norm": 49.35953544498034, |
| "kl_loss_13": 14673.6, |
| "kl_loss_26": 13996.8, |
| "kl_loss_39": 14014.4, |
| "kl_loss_7": 14955.2, |
| "learning_rate": 0.0009989933382359422, |
| "loss": 28794.4, |
| "step": 300 |
| }, |
| { |
| "ce_loss_13": 8.470300602912904, |
| "ce_loss_26": 8.14130541086197, |
| "ce_loss_39": 8.148625028133392, |
| "ce_loss_52": 1.4505603075027467, |
| "ce_loss_7": 8.614710068702697, |
| "epoch": 0.031, |
| "grad_norm": 49.67315029837232, |
| "kl_loss_13": 14492.8, |
| "kl_loss_26": 13798.4, |
| "kl_loss_39": 13812.8, |
| "kl_loss_7": 14800.0, |
| "learning_rate": 0.0009988901935922825, |
| "loss": 28550.8, |
| "step": 310 |
| }, |
| { |
| "ce_loss_13": 8.449520301818847, |
| "ce_loss_26": 8.124438393115998, |
| "ce_loss_39": 8.127473723888397, |
| "ce_loss_52": 1.4619350016117096, |
| "ce_loss_7": 8.594109392166137, |
| "epoch": 0.032, |
| "grad_norm": 49.62699299914443, |
| "kl_loss_13": 14432.0, |
| "kl_loss_26": 13737.6, |
| "kl_loss_39": 13745.6, |
| "kl_loss_7": 14737.6, |
| "learning_rate": 0.0009987820251299122, |
| "loss": 28340.4, |
| "step": 320 |
| }, |
| { |
| "ce_loss_13": 8.411065363883973, |
| "ce_loss_26": 8.065169262886048, |
| "ce_loss_39": 8.066172111034394, |
| "ce_loss_52": 1.4555475383996963, |
| "ce_loss_7": 8.563976049423218, |
| "epoch": 0.033, |
| "grad_norm": 48.32599858014616, |
| "kl_loss_13": 14328.0, |
| "kl_loss_26": 13596.8, |
| "kl_loss_39": 13601.6, |
| "kl_loss_7": 14651.2, |
| "learning_rate": 0.0009986688339380862, |
| "loss": 28064.0, |
| "step": 330 |
| }, |
| { |
| "ce_loss_13": 8.342015504837036, |
| "ce_loss_26": 7.984185111522675, |
| "ce_loss_39": 7.983271932601928, |
| "ce_loss_52": 1.4297153055667877, |
| "ce_loss_7": 8.503781509399413, |
| "epoch": 0.034, |
| "grad_norm": 49.50369957464881, |
| "kl_loss_13": 14252.8, |
| "kl_loss_26": 13504.0, |
| "kl_loss_39": 13500.8, |
| "kl_loss_7": 14590.4, |
| "learning_rate": 0.0009985506211566387, |
| "loss": 27837.2, |
| "step": 340 |
| }, |
| { |
| "ce_loss_13": 8.30728188753128, |
| "ce_loss_26": 7.943272864818573, |
| "ce_loss_39": 7.946760308742523, |
| "ce_loss_52": 1.4311424046754837, |
| "ce_loss_7": 8.472229743003846, |
| "epoch": 0.035, |
| "grad_norm": 49.2761544590419, |
| "kl_loss_13": 14160.0, |
| "kl_loss_26": 13390.4, |
| "kl_loss_39": 13393.6, |
| "kl_loss_7": 14508.8, |
| "learning_rate": 0.0009984273879759713, |
| "loss": 27625.6, |
| "step": 350 |
| }, |
| { |
| "ce_loss_13": 8.231205070018769, |
| "ce_loss_26": 7.874649381637573, |
| "ce_loss_39": 7.870936000347138, |
| "ce_loss_52": 1.4489013850688934, |
| "ce_loss_7": 8.397671937942505, |
| "epoch": 0.036, |
| "grad_norm": 49.68016023572489, |
| "kl_loss_13": 13990.4, |
| "kl_loss_26": 13236.8, |
| "kl_loss_39": 13232.0, |
| "kl_loss_7": 14340.8, |
| "learning_rate": 0.0009982991356370402, |
| "loss": 27384.8, |
| "step": 360 |
| }, |
| { |
| "ce_loss_13": 8.171039760112762, |
| "ce_loss_26": 7.7931832551956175, |
| "ce_loss_39": 7.793972992897034, |
| "ce_loss_52": 1.4122098296880723, |
| "ce_loss_7": 8.344593167304993, |
| "epoch": 0.037, |
| "grad_norm": 48.18494426167262, |
| "kl_loss_13": 13920.0, |
| "kl_loss_26": 13128.0, |
| "kl_loss_39": 13129.6, |
| "kl_loss_7": 14291.2, |
| "learning_rate": 0.0009981658654313456, |
| "loss": 27248.4, |
| "step": 370 |
| }, |
| { |
| "ce_loss_13": 8.160165214538575, |
| "ce_loss_26": 7.7771016478538515, |
| "ce_loss_39": 7.771613931655883, |
| "ce_loss_52": 1.4869922280311585, |
| "ce_loss_7": 8.338345170021057, |
| "epoch": 0.038, |
| "grad_norm": 48.80785692423277, |
| "kl_loss_13": 13795.2, |
| "kl_loss_26": 12974.4, |
| "kl_loss_39": 12960.0, |
| "kl_loss_7": 14164.8, |
| "learning_rate": 0.000998027578700917, |
| "loss": 26976.4, |
| "step": 380 |
| }, |
| { |
| "ce_loss_13": 8.052374148368836, |
| "ce_loss_26": 7.658347594738006, |
| "ce_loss_39": 7.6583909630775455, |
| "ce_loss_52": 1.4154588401317596, |
| "ce_loss_7": 8.239335989952087, |
| "epoch": 0.039, |
| "grad_norm": 48.38129929768143, |
| "kl_loss_13": 13680.0, |
| "kl_loss_26": 12851.2, |
| "kl_loss_39": 12844.8, |
| "kl_loss_7": 14078.4, |
| "learning_rate": 0.0009978842768382998, |
| "loss": 26719.2, |
| "step": 390 |
| }, |
| { |
| "ce_loss_13": 8.022306847572327, |
| "ce_loss_26": 7.623832786083222, |
| "ce_loss_39": 7.616167056560516, |
| "ce_loss_52": 1.4509258031845094, |
| "ce_loss_7": 8.209609961509704, |
| "epoch": 0.04, |
| "grad_norm": 48.47003917091678, |
| "kl_loss_13": 13532.8, |
| "kl_loss_26": 12684.8, |
| "kl_loss_39": 12668.8, |
| "kl_loss_7": 13918.4, |
| "learning_rate": 0.0009977359612865424, |
| "loss": 26536.4, |
| "step": 400 |
| }, |
| { |
| "ce_loss_13": 8.007824766635895, |
| "ce_loss_26": 7.614881563186645, |
| "ce_loss_39": 7.602893710136414, |
| "ce_loss_52": 1.4645162731409074, |
| "ce_loss_7": 8.19339075088501, |
| "epoch": 0.041, |
| "grad_norm": 48.05902996814503, |
| "kl_loss_13": 13486.4, |
| "kl_loss_26": 12646.4, |
| "kl_loss_39": 12627.2, |
| "kl_loss_7": 13886.4, |
| "learning_rate": 0.0009975826335391806, |
| "loss": 26319.6, |
| "step": 410 |
| }, |
| { |
| "ce_loss_13": 7.89500185251236, |
| "ce_loss_26": 7.4844276905059814, |
| "ce_loss_39": 7.470773124694825, |
| "ce_loss_52": 1.3909434020519256, |
| "ce_loss_7": 8.089212799072266, |
| "epoch": 0.042, |
| "grad_norm": 47.6532900353141, |
| "kl_loss_13": 13393.6, |
| "kl_loss_26": 12528.0, |
| "kl_loss_39": 12504.0, |
| "kl_loss_7": 13800.0, |
| "learning_rate": 0.0009974242951402235, |
| "loss": 26051.2, |
| "step": 420 |
| }, |
| { |
| "ce_loss_13": 7.848755323886872, |
| "ce_loss_26": 7.436672508716583, |
| "ce_loss_39": 7.420604693889618, |
| "ce_loss_52": 1.4549538046121597, |
| "ce_loss_7": 8.053676557540893, |
| "epoch": 0.043, |
| "grad_norm": 47.461866848016506, |
| "kl_loss_13": 13187.2, |
| "kl_loss_26": 12316.8, |
| "kl_loss_39": 12278.4, |
| "kl_loss_7": 13619.2, |
| "learning_rate": 0.0009972609476841367, |
| "loss": 25819.2, |
| "step": 430 |
| }, |
| { |
| "ce_loss_13": 7.824773061275482, |
| "ce_loss_26": 7.38158438205719, |
| "ce_loss_39": 7.365956795215607, |
| "ce_loss_52": 1.4214935347437858, |
| "ce_loss_7": 8.031265962123872, |
| "epoch": 0.044, |
| "grad_norm": 47.54154044179159, |
| "kl_loss_13": 13169.6, |
| "kl_loss_26": 12241.6, |
| "kl_loss_39": 12208.0, |
| "kl_loss_7": 13604.8, |
| "learning_rate": 0.0009970925928158272, |
| "loss": 25669.2, |
| "step": 440 |
| }, |
| { |
| "ce_loss_13": 7.781041419506073, |
| "ce_loss_26": 7.349016737937927, |
| "ce_loss_39": 7.328068232536316, |
| "ce_loss_52": 1.4457789659500122, |
| "ce_loss_7": 7.986689484119415, |
| "epoch": 0.045, |
| "grad_norm": 47.28488093165922, |
| "kl_loss_13": 13065.6, |
| "kl_loss_26": 12150.4, |
| "kl_loss_39": 12107.2, |
| "kl_loss_7": 13499.2, |
| "learning_rate": 0.000996919232230627, |
| "loss": 25413.2, |
| "step": 450 |
| }, |
| { |
| "ce_loss_13": 7.707467567920685, |
| "ce_loss_26": 7.259365463256836, |
| "ce_loss_39": 7.2402653932571415, |
| "ce_loss_52": 1.4384218811988831, |
| "ce_loss_7": 7.9230645298957825, |
| "epoch": 0.046, |
| "grad_norm": 47.47875544223923, |
| "kl_loss_13": 12928.0, |
| "kl_loss_26": 11974.4, |
| "kl_loss_39": 11940.8, |
| "kl_loss_7": 13380.8, |
| "learning_rate": 0.0009967408676742752, |
| "loss": 25149.6, |
| "step": 460 |
| }, |
| { |
| "ce_loss_13": 7.682056641578674, |
| "ce_loss_26": 7.239385926723481, |
| "ce_loss_39": 7.214358115196228, |
| "ce_loss_52": 1.4293440610170365, |
| "ce_loss_7": 7.903320550918579, |
| "epoch": 0.047, |
| "grad_norm": 47.616227706878504, |
| "kl_loss_13": 12899.2, |
| "kl_loss_26": 11950.4, |
| "kl_loss_39": 11904.0, |
| "kl_loss_7": 13360.0, |
| "learning_rate": 0.0009965575009429006, |
| "loss": 24954.0, |
| "step": 470 |
| }, |
| { |
| "ce_loss_13": 7.673471140861511, |
| "ce_loss_26": 7.22241450548172, |
| "ce_loss_39": 7.191142916679382, |
| "ce_loss_52": 1.4720373705029488, |
| "ce_loss_7": 7.897368836402893, |
| "epoch": 0.048, |
| "grad_norm": 47.35612609507448, |
| "kl_loss_13": 12763.2, |
| "kl_loss_26": 11816.0, |
| "kl_loss_39": 11748.8, |
| "kl_loss_7": 13236.8, |
| "learning_rate": 0.0009963691338830043, |
| "loss": 24784.4, |
| "step": 480 |
| }, |
| { |
| "ce_loss_13": 7.61794912815094, |
| "ce_loss_26": 7.163714337348938, |
| "ce_loss_39": 7.132104587554932, |
| "ce_loss_52": 1.4696507424116134, |
| "ce_loss_7": 7.839430296421051, |
| "epoch": 0.049, |
| "grad_norm": 46.6485917664728, |
| "kl_loss_13": 12664.0, |
| "kl_loss_26": 11691.2, |
| "kl_loss_39": 11627.2, |
| "kl_loss_7": 13139.2, |
| "learning_rate": 0.0009961757683914405, |
| "loss": 24543.2, |
| "step": 490 |
| }, |
| { |
| "ce_loss_13": 7.507795846462249, |
| "ce_loss_26": 7.035580575466156, |
| "ce_loss_39": 7.005417144298553, |
| "ce_loss_52": 1.4070568919181823, |
| "ce_loss_7": 7.743252336978912, |
| "epoch": 0.05, |
| "grad_norm": 46.67870253681081, |
| "kl_loss_13": 12545.6, |
| "kl_loss_26": 11542.4, |
| "kl_loss_39": 11480.0, |
| "kl_loss_7": 13043.2, |
| "learning_rate": 0.0009959774064153978, |
| "loss": 24344.0, |
| "step": 500 |
| }, |
| { |
| "ce_loss_13": 7.495815181732178, |
| "ce_loss_26": 7.0229366540908815, |
| "ce_loss_39": 6.990859532356263, |
| "ce_loss_52": 1.4116702109575272, |
| "ce_loss_7": 7.736506867408752, |
| "epoch": 0.051, |
| "grad_norm": 46.06552599373074, |
| "kl_loss_13": 12505.6, |
| "kl_loss_26": 11507.2, |
| "kl_loss_39": 11433.6, |
| "kl_loss_7": 13009.6, |
| "learning_rate": 0.0009957740499523787, |
| "loss": 24160.0, |
| "step": 510 |
| }, |
| { |
| "ce_loss_13": 7.443893933296204, |
| "ce_loss_26": 6.961520659923553, |
| "ce_loss_39": 6.922756457328797, |
| "ce_loss_52": 1.4424341320991516, |
| "ce_loss_7": 7.6887711644172665, |
| "epoch": 0.052, |
| "grad_norm": 46.57676837877325, |
| "kl_loss_13": 12337.6, |
| "kl_loss_26": 11308.8, |
| "kl_loss_39": 11225.6, |
| "kl_loss_7": 12851.2, |
| "learning_rate": 0.0009955657010501807, |
| "loss": 23900.8, |
| "step": 520 |
| }, |
| { |
| "ce_loss_13": 7.388968002796173, |
| "ce_loss_26": 6.916476762294769, |
| "ce_loss_39": 6.874182558059692, |
| "ce_loss_52": 1.4647160589694976, |
| "ce_loss_7": 7.6311492919921875, |
| "epoch": 0.053, |
| "grad_norm": 46.39702705707381, |
| "kl_loss_13": 12211.2, |
| "kl_loss_26": 11196.8, |
| "kl_loss_39": 11105.6, |
| "kl_loss_7": 12718.4, |
| "learning_rate": 0.000995352361806875, |
| "loss": 23724.4, |
| "step": 530 |
| }, |
| { |
| "ce_loss_13": 7.399884676933288, |
| "ce_loss_26": 6.893076729774475, |
| "ce_loss_39": 6.849111843109131, |
| "ce_loss_52": 1.4278682440519332, |
| "ce_loss_7": 7.656503355503082, |
| "epoch": 0.054, |
| "grad_norm": 45.552992496751486, |
| "kl_loss_13": 12289.6, |
| "kl_loss_26": 11217.6, |
| "kl_loss_39": 11131.2, |
| "kl_loss_7": 12824.0, |
| "learning_rate": 0.0009951340343707852, |
| "loss": 23503.2, |
| "step": 540 |
| }, |
| { |
| "ce_loss_13": 7.299449789524078, |
| "ce_loss_26": 6.799772572517395, |
| "ce_loss_39": 6.753718996047974, |
| "ce_loss_52": 1.447503750026226, |
| "ce_loss_7": 7.554943478107452, |
| "epoch": 0.055, |
| "grad_norm": 45.64724152253333, |
| "kl_loss_13": 12067.2, |
| "kl_loss_26": 10992.0, |
| "kl_loss_39": 10900.8, |
| "kl_loss_7": 12608.0, |
| "learning_rate": 0.0009949107209404665, |
| "loss": 23326.0, |
| "step": 550 |
| }, |
| { |
| "ce_loss_13": 7.293551552295685, |
| "ce_loss_26": 6.792212247848511, |
| "ce_loss_39": 6.740794622898102, |
| "ce_loss_52": 1.4703042089939118, |
| "ce_loss_7": 7.558422005176544, |
| "epoch": 0.056, |
| "grad_norm": 45.60106893131463, |
| "kl_loss_13": 11990.4, |
| "kl_loss_26": 10931.2, |
| "kl_loss_39": 10820.8, |
| "kl_loss_7": 12540.8, |
| "learning_rate": 0.0009946824237646824, |
| "loss": 23102.4, |
| "step": 560 |
| }, |
| { |
| "ce_loss_13": 7.1714133501052855, |
| "ce_loss_26": 6.658501255512237, |
| "ce_loss_39": 6.615163576602936, |
| "ce_loss_52": 1.4395473554730416, |
| "ce_loss_7": 7.438856053352356, |
| "epoch": 0.057, |
| "grad_norm": 45.02028770939811, |
| "kl_loss_13": 11819.2, |
| "kl_loss_26": 10716.8, |
| "kl_loss_39": 10633.6, |
| "kl_loss_7": 12379.2, |
| "learning_rate": 0.0009944491451423828, |
| "loss": 22860.0, |
| "step": 570 |
| }, |
| { |
| "ce_loss_13": 7.202235555648803, |
| "ce_loss_26": 6.668069064617157, |
| "ce_loss_39": 6.622809886932373, |
| "ce_loss_52": 1.4502787292003632, |
| "ce_loss_7": 7.474415194988251, |
| "epoch": 0.058, |
| "grad_norm": 45.87654136914787, |
| "kl_loss_13": 11832.0, |
| "kl_loss_26": 10697.6, |
| "kl_loss_39": 10606.4, |
| "kl_loss_7": 12409.6, |
| "learning_rate": 0.0009942108874226813, |
| "loss": 22680.4, |
| "step": 580 |
| }, |
| { |
| "ce_loss_13": 7.099943065643311, |
| "ce_loss_26": 6.574034261703491, |
| "ce_loss_39": 6.518663537502289, |
| "ce_loss_52": 1.4500610083341599, |
| "ce_loss_7": 7.373777639865875, |
| "epoch": 0.059, |
| "grad_norm": 45.75547743089401, |
| "kl_loss_13": 11633.6, |
| "kl_loss_26": 10512.0, |
| "kl_loss_39": 10403.2, |
| "kl_loss_7": 12201.6, |
| "learning_rate": 0.00099396765300483, |
| "loss": 22462.8, |
| "step": 590 |
| }, |
| { |
| "ce_loss_13": 7.097463607788086, |
| "ce_loss_26": 6.57372156381607, |
| "ce_loss_39": 6.511255967617035, |
| "ce_loss_52": 1.480376410484314, |
| "ce_loss_7": 7.372353208065033, |
| "epoch": 0.06, |
| "grad_norm": 45.20967365382369, |
| "kl_loss_13": 11558.4, |
| "kl_loss_26": 10451.2, |
| "kl_loss_39": 10324.8, |
| "kl_loss_7": 12145.6, |
| "learning_rate": 0.0009937194443381972, |
| "loss": 22282.4, |
| "step": 600 |
| }, |
| { |
| "ce_loss_13": 7.056500816345215, |
| "ce_loss_26": 6.520907533168793, |
| "ce_loss_39": 6.457597935199738, |
| "ce_loss_52": 1.4534808412194251, |
| "ce_loss_7": 7.339275515079498, |
| "epoch": 0.061, |
| "grad_norm": 44.17007512472295, |
| "kl_loss_13": 11520.0, |
| "kl_loss_26": 10382.4, |
| "kl_loss_39": 10249.6, |
| "kl_loss_7": 12110.4, |
| "learning_rate": 0.0009934662639222412, |
| "loss": 22080.0, |
| "step": 610 |
| }, |
| { |
| "ce_loss_13": 6.959627556800842, |
| "ce_loss_26": 6.4156983375549315, |
| "ce_loss_39": 6.353180265426635, |
| "ce_loss_52": 1.4930268943309783, |
| "ce_loss_7": 7.244034695625305, |
| "epoch": 0.062, |
| "grad_norm": 43.92075543466765, |
| "kl_loss_13": 11259.2, |
| "kl_loss_26": 10102.4, |
| "kl_loss_39": 9971.2, |
| "kl_loss_7": 11859.2, |
| "learning_rate": 0.000993208114306486, |
| "loss": 21799.2, |
| "step": 620 |
| }, |
| { |
| "ce_loss_13": 6.937919509410858, |
| "ce_loss_26": 6.402250957489014, |
| "ce_loss_39": 6.331100332736969, |
| "ce_loss_52": 1.4531458377838136, |
| "ce_loss_7": 7.226283407211303, |
| "epoch": 0.063, |
| "grad_norm": 44.52659706916058, |
| "kl_loss_13": 11259.2, |
| "kl_loss_26": 10128.0, |
| "kl_loss_39": 9988.8, |
| "kl_loss_7": 11881.6, |
| "learning_rate": 0.0009929449980904952, |
| "loss": 21667.2, |
| "step": 630 |
| }, |
| { |
| "ce_loss_13": 6.914422643184662, |
| "ce_loss_26": 6.355719900131225, |
| "ce_loss_39": 6.2839093685150145, |
| "ce_loss_52": 1.463460123538971, |
| "ce_loss_7": 7.208615565299988, |
| "epoch": 0.064, |
| "grad_norm": 44.241917484883416, |
| "kl_loss_13": 11203.2, |
| "kl_loss_26": 10004.8, |
| "kl_loss_39": 9865.6, |
| "kl_loss_7": 11827.2, |
| "learning_rate": 0.0009926769179238466, |
| "loss": 21450.4, |
| "step": 640 |
| }, |
| { |
| "ce_loss_13": 6.814666819572449, |
| "ce_loss_26": 6.240503942966461, |
| "ce_loss_39": 6.164217627048492, |
| "ce_loss_52": 1.4213469997048378, |
| "ce_loss_7": 7.121113920211792, |
| "epoch": 0.065, |
| "grad_norm": 45.45585410762684, |
| "kl_loss_13": 11097.6, |
| "kl_loss_26": 9875.2, |
| "kl_loss_39": 9726.4, |
| "kl_loss_7": 11742.4, |
| "learning_rate": 0.000992403876506104, |
| "loss": 21273.2, |
| "step": 650 |
| }, |
| { |
| "ce_loss_13": 6.807473576068878, |
| "ce_loss_26": 6.237039804458618, |
| "ce_loss_39": 6.164605820178986, |
| "ce_loss_52": 1.4794408291578294, |
| "ce_loss_7": 7.109469771385193, |
| "epoch": 0.066, |
| "grad_norm": 43.77904042873825, |
| "kl_loss_13": 10964.8, |
| "kl_loss_26": 9745.6, |
| "kl_loss_39": 9593.6, |
| "kl_loss_7": 11603.2, |
| "learning_rate": 0.0009921258765867918, |
| "loss": 21034.4, |
| "step": 660 |
| }, |
| { |
| "ce_loss_13": 6.720256412029267, |
| "ce_loss_26": 6.124040985107422, |
| "ce_loss_39": 6.048683619499206, |
| "ce_loss_52": 1.4370630145072938, |
| "ce_loss_7": 7.032277429103852, |
| "epoch": 0.067, |
| "grad_norm": 44.21280182860459, |
| "kl_loss_13": 10864.0, |
| "kl_loss_26": 9596.8, |
| "kl_loss_39": 9446.4, |
| "kl_loss_7": 11528.0, |
| "learning_rate": 0.0009918429209653662, |
| "loss": 20815.6, |
| "step": 670 |
| }, |
| { |
| "ce_loss_13": 6.73115086555481, |
| "ce_loss_26": 6.149888730049133, |
| "ce_loss_39": 6.072007644176483, |
| "ce_loss_52": 1.4543532699346542, |
| "ce_loss_7": 7.039518296718597, |
| "epoch": 0.068, |
| "grad_norm": 43.58133426683343, |
| "kl_loss_13": 10844.8, |
| "kl_loss_26": 9603.2, |
| "kl_loss_39": 9433.6, |
| "kl_loss_7": 11494.4, |
| "learning_rate": 0.0009915550124911866, |
| "loss": 20688.4, |
| "step": 680 |
| }, |
| { |
| "ce_loss_13": 6.683139646053315, |
| "ce_loss_26": 6.099281096458435, |
| "ce_loss_39": 6.017751622200012, |
| "ce_loss_52": 1.4289966225624084, |
| "ce_loss_7": 6.9959977746009825, |
| "epoch": 0.069, |
| "grad_norm": 43.03707399207988, |
| "kl_loss_13": 10817.6, |
| "kl_loss_26": 9577.6, |
| "kl_loss_39": 9414.4, |
| "kl_loss_7": 11472.0, |
| "learning_rate": 0.0009912621540634887, |
| "loss": 20494.0, |
| "step": 690 |
| }, |
| { |
| "ce_loss_13": 6.5575969338417055, |
| "ce_loss_26": 5.94709130525589, |
| "ce_loss_39": 5.865298080444336, |
| "ce_loss_52": 1.3811550110578537, |
| "ce_loss_7": 6.883859884738922, |
| "epoch": 0.07, |
| "grad_norm": 43.657034485471186, |
| "kl_loss_13": 10611.2, |
| "kl_loss_26": 9316.8, |
| "kl_loss_39": 9148.8, |
| "kl_loss_7": 11299.2, |
| "learning_rate": 0.0009909643486313534, |
| "loss": 20224.4, |
| "step": 700 |
| }, |
| { |
| "ce_loss_13": 6.581148624420166, |
| "ce_loss_26": 5.951541697978973, |
| "ce_loss_39": 5.867393767833709, |
| "ce_loss_52": 1.417145846784115, |
| "ce_loss_7": 6.908858215808868, |
| "epoch": 0.071, |
| "grad_norm": 42.31273006993064, |
| "kl_loss_13": 10628.8, |
| "kl_loss_26": 9294.4, |
| "kl_loss_39": 9120.0, |
| "kl_loss_7": 11320.0, |
| "learning_rate": 0.000990661599193678, |
| "loss": 20075.6, |
| "step": 710 |
| }, |
| { |
| "ce_loss_13": 6.503521502017975, |
| "ce_loss_26": 5.871239483356476, |
| "ce_loss_39": 5.7897450685501095, |
| "ce_loss_52": 1.4011695250868796, |
| "ce_loss_7": 6.844956791400909, |
| "epoch": 0.072, |
| "grad_norm": 42.36356368480549, |
| "kl_loss_13": 10488.0, |
| "kl_loss_26": 9147.2, |
| "kl_loss_39": 8979.2, |
| "kl_loss_7": 11206.4, |
| "learning_rate": 0.0009903539087991462, |
| "loss": 19811.6, |
| "step": 720 |
| }, |
| { |
| "ce_loss_13": 6.489633810520172, |
| "ce_loss_26": 5.87660802602768, |
| "ce_loss_39": 5.779063713550568, |
| "ce_loss_52": 1.439223274588585, |
| "ce_loss_7": 6.819329023361206, |
| "epoch": 0.073, |
| "grad_norm": 42.98993238073801, |
| "kl_loss_13": 10366.4, |
| "kl_loss_26": 9057.6, |
| "kl_loss_39": 8861.6, |
| "kl_loss_7": 11059.2, |
| "learning_rate": 0.0009900412805461966, |
| "loss": 19744.8, |
| "step": 730 |
| }, |
| { |
| "ce_loss_13": 6.4397171378135685, |
| "ce_loss_26": 5.814687025547028, |
| "ce_loss_39": 5.716581547260285, |
| "ce_loss_52": 1.4390251755714416, |
| "ce_loss_7": 6.779693508148194, |
| "epoch": 0.074, |
| "grad_norm": 42.877595561482536, |
| "kl_loss_13": 10267.2, |
| "kl_loss_26": 8939.2, |
| "kl_loss_39": 8734.4, |
| "kl_loss_7": 10982.4, |
| "learning_rate": 0.0009897237175829927, |
| "loss": 19478.8, |
| "step": 740 |
| }, |
| { |
| "ce_loss_13": 6.3779888391494755, |
| "ce_loss_26": 5.756749665737152, |
| "ce_loss_39": 5.652812826633453, |
| "ce_loss_52": 1.4100830882787705, |
| "ce_loss_7": 6.712257170677185, |
| "epoch": 0.075, |
| "grad_norm": 43.56161359476007, |
| "kl_loss_13": 10203.2, |
| "kl_loss_26": 8863.2, |
| "kl_loss_39": 8649.6, |
| "kl_loss_7": 10920.0, |
| "learning_rate": 0.0009894012231073895, |
| "loss": 19311.6, |
| "step": 750 |
| }, |
| { |
| "ce_loss_13": 6.351921963691711, |
| "ce_loss_26": 5.711096298694611, |
| "ce_loss_39": 5.613580751419067, |
| "ce_loss_52": 1.4703039675951004, |
| "ce_loss_7": 6.6900406837463375, |
| "epoch": 0.076, |
| "grad_norm": 41.581645996763, |
| "kl_loss_13": 10056.0, |
| "kl_loss_26": 8678.4, |
| "kl_loss_39": 8485.6, |
| "kl_loss_7": 10764.8, |
| "learning_rate": 0.0009890738003669028, |
| "loss": 19128.0, |
| "step": 760 |
| }, |
| { |
| "ce_loss_13": 6.329116785526276, |
| "ce_loss_26": 5.685759162902832, |
| "ce_loss_39": 5.58324785232544, |
| "ce_loss_52": 1.4396527051925658, |
| "ce_loss_7": 6.677184915542602, |
| "epoch": 0.077, |
| "grad_norm": 40.86594229703089, |
| "kl_loss_13": 10036.8, |
| "kl_loss_26": 8680.0, |
| "kl_loss_39": 8467.2, |
| "kl_loss_7": 10760.0, |
| "learning_rate": 0.0009887414526586764, |
| "loss": 18930.4, |
| "step": 770 |
| }, |
| { |
| "ce_loss_13": 6.279877305030823, |
| "ce_loss_26": 5.617447376251221, |
| "ce_loss_39": 5.507227098941803, |
| "ce_loss_52": 1.4374216616153717, |
| "ce_loss_7": 6.634308731555938, |
| "epoch": 0.078, |
| "grad_norm": 41.180826238519536, |
| "kl_loss_13": 9923.2, |
| "kl_loss_26": 8513.6, |
| "kl_loss_39": 8292.8, |
| "kl_loss_7": 10667.2, |
| "learning_rate": 0.0009884041833294476, |
| "loss": 18733.6, |
| "step": 780 |
| }, |
| { |
| "ce_loss_13": 6.212144470214843, |
| "ce_loss_26": 5.565514934062958, |
| "ce_loss_39": 5.445651924610138, |
| "ce_loss_52": 1.4184710115194321, |
| "ce_loss_7": 6.563052010536194, |
| "epoch": 0.079, |
| "grad_norm": 41.51169269505913, |
| "kl_loss_13": 9840.0, |
| "kl_loss_26": 8459.2, |
| "kl_loss_39": 8207.2, |
| "kl_loss_7": 10576.0, |
| "learning_rate": 0.000988061995775515, |
| "loss": 18618.8, |
| "step": 790 |
| }, |
| { |
| "ce_loss_13": 6.177972686290741, |
| "ce_loss_26": 5.5426277875900265, |
| "ce_loss_39": 5.430073320865631, |
| "ce_loss_52": 1.4582359090447425, |
| "ce_loss_7": 6.532871425151825, |
| "epoch": 0.08, |
| "grad_norm": 41.06171415513337, |
| "kl_loss_13": 9713.6, |
| "kl_loss_26": 8348.0, |
| "kl_loss_39": 8122.4, |
| "kl_loss_7": 10464.0, |
| "learning_rate": 0.0009877148934427035, |
| "loss": 18370.0, |
| "step": 800 |
| }, |
| { |
| "ce_loss_13": 6.174833989143371, |
| "ce_loss_26": 5.505520594120026, |
| "ce_loss_39": 5.391082692146301, |
| "ce_loss_52": 1.4291342854499818, |
| "ce_loss_7": 6.535899603366852, |
| "epoch": 0.081, |
| "grad_norm": 40.55915083586062, |
| "kl_loss_13": 9748.8, |
| "kl_loss_26": 8332.0, |
| "kl_loss_39": 8094.4, |
| "kl_loss_7": 10502.4, |
| "learning_rate": 0.0009873628798263297, |
| "loss": 18197.2, |
| "step": 810 |
| }, |
| { |
| "ce_loss_13": 6.106976389884949, |
| "ce_loss_26": 5.425193250179291, |
| "ce_loss_39": 5.297831201553345, |
| "ce_loss_52": 1.4520869970321655, |
| "ce_loss_7": 6.4676952958106995, |
| "epoch": 0.082, |
| "grad_norm": 39.176828574493044, |
| "kl_loss_13": 9564.8, |
| "kl_loss_26": 8108.0, |
| "kl_loss_39": 7852.0, |
| "kl_loss_7": 10324.8, |
| "learning_rate": 0.0009870059584711668, |
| "loss": 17988.4, |
| "step": 820 |
| }, |
| { |
| "ce_loss_13": 6.029178476333618, |
| "ce_loss_26": 5.369020164012909, |
| "ce_loss_39": 5.247223997116089, |
| "ce_loss_52": 1.4342376589775085, |
| "ce_loss_7": 6.38949601650238, |
| "epoch": 0.083, |
| "grad_norm": 41.3023886018674, |
| "kl_loss_13": 9422.4, |
| "kl_loss_26": 8008.8, |
| "kl_loss_39": 7756.0, |
| "kl_loss_7": 10184.0, |
| "learning_rate": 0.000986644132971409, |
| "loss": 17788.4, |
| "step": 830 |
| }, |
| { |
| "ce_loss_13": 6.009692323207855, |
| "ce_loss_26": 5.3266006231307985, |
| "ce_loss_39": 5.202202546596527, |
| "ce_loss_52": 1.4376018613576889, |
| "ce_loss_7": 6.372254419326782, |
| "epoch": 0.084, |
| "grad_norm": 39.84971146906691, |
| "kl_loss_13": 9387.2, |
| "kl_loss_26": 7916.8, |
| "kl_loss_39": 7663.2, |
| "kl_loss_7": 10155.2, |
| "learning_rate": 0.0009862774069706345, |
| "loss": 17687.8, |
| "step": 840 |
| }, |
| { |
| "ce_loss_13": 5.948546409606934, |
| "ce_loss_26": 5.290343832969666, |
| "ce_loss_39": 5.16569093465805, |
| "ce_loss_52": 1.4315639585256577, |
| "ce_loss_7": 6.303727805614471, |
| "epoch": 0.085, |
| "grad_norm": 38.79997549953815, |
| "kl_loss_13": 9260.8, |
| "kl_loss_26": 7848.0, |
| "kl_loss_39": 7593.6, |
| "kl_loss_7": 10009.6, |
| "learning_rate": 0.000985905784161771, |
| "loss": 17478.4, |
| "step": 850 |
| }, |
| { |
| "ce_loss_13": 5.976463770866394, |
| "ce_loss_26": 5.285696280002594, |
| "ce_loss_39": 5.158673858642578, |
| "ce_loss_52": 1.4285172358155251, |
| "ce_loss_7": 6.345685577392578, |
| "epoch": 0.086, |
| "grad_norm": 39.11215287158734, |
| "kl_loss_13": 9323.2, |
| "kl_loss_26": 7843.2, |
| "kl_loss_39": 7588.0, |
| "kl_loss_7": 10100.8, |
| "learning_rate": 0.000985529268287055, |
| "loss": 17353.8, |
| "step": 860 |
| }, |
| { |
| "ce_loss_13": 5.890017306804657, |
| "ce_loss_26": 5.188625490665435, |
| "ce_loss_39": 5.061676156520844, |
| "ce_loss_52": 1.427770259976387, |
| "ce_loss_7": 6.267517876625061, |
| "epoch": 0.087, |
| "grad_norm": 38.38012767193544, |
| "kl_loss_13": 9177.6, |
| "kl_loss_26": 7678.4, |
| "kl_loss_39": 7415.2, |
| "kl_loss_7": 9971.2, |
| "learning_rate": 0.0009851478631379982, |
| "loss": 17143.4, |
| "step": 870 |
| }, |
| { |
| "ce_loss_13": 5.8172935247421265, |
| "ce_loss_26": 5.092825090885162, |
| "ce_loss_39": 4.964837598800659, |
| "ce_loss_52": 1.3596146881580353, |
| "ce_loss_7": 6.200971674919129, |
| "epoch": 0.088, |
| "grad_norm": 38.67673909990335, |
| "kl_loss_13": 9150.4, |
| "kl_loss_26": 7612.8, |
| "kl_loss_39": 7350.4, |
| "kl_loss_7": 9947.2, |
| "learning_rate": 0.0009847615725553456, |
| "loss": 17046.8, |
| "step": 880 |
| }, |
| { |
| "ce_loss_13": 5.872656679153442, |
| "ce_loss_26": 5.153263211250305, |
| "ce_loss_39": 5.00926034450531, |
| "ce_loss_52": 1.4231197819113732, |
| "ce_loss_7": 6.253647100925446, |
| "epoch": 0.089, |
| "grad_norm": 38.12938597789528, |
| "kl_loss_13": 9128.0, |
| "kl_loss_26": 7595.2, |
| "kl_loss_39": 7309.6, |
| "kl_loss_7": 9923.2, |
| "learning_rate": 0.0009843704004290394, |
| "loss": 16917.8, |
| "step": 890 |
| }, |
| { |
| "ce_loss_13": 5.798008918762207, |
| "ce_loss_26": 5.091720676422119, |
| "ce_loss_39": 4.938081228733063, |
| "ce_loss_52": 1.4382835403084755, |
| "ce_loss_7": 6.171303284168244, |
| "epoch": 0.09, |
| "grad_norm": 37.16474091329711, |
| "kl_loss_13": 8936.0, |
| "kl_loss_26": 7427.2, |
| "kl_loss_39": 7121.6, |
| "kl_loss_7": 9726.4, |
| "learning_rate": 0.0009839743506981783, |
| "loss": 16656.0, |
| "step": 900 |
| }, |
| { |
| "ce_loss_13": 5.82405720949173, |
| "ce_loss_26": 5.096203732490539, |
| "ce_loss_39": 4.965065968036652, |
| "ce_loss_52": 1.4636528208851813, |
| "ce_loss_7": 6.201912236213684, |
| "epoch": 0.091, |
| "grad_norm": 36.36947394693034, |
| "kl_loss_13": 8939.2, |
| "kl_loss_26": 7354.4, |
| "kl_loss_39": 7080.0, |
| "kl_loss_7": 9747.2, |
| "learning_rate": 0.0009835734273509786, |
| "loss": 16529.0, |
| "step": 910 |
| }, |
| { |
| "ce_loss_13": 5.734641706943512, |
| "ce_loss_26": 5.011995434761047, |
| "ce_loss_39": 4.85785391330719, |
| "ce_loss_52": 1.4457294046878815, |
| "ce_loss_7": 6.11446977853775, |
| "epoch": 0.092, |
| "grad_norm": 36.64530545748263, |
| "kl_loss_13": 8840.8, |
| "kl_loss_26": 7284.8, |
| "kl_loss_39": 6967.2, |
| "kl_loss_7": 9636.8, |
| "learning_rate": 0.0009831676344247342, |
| "loss": 16343.8, |
| "step": 920 |
| }, |
| { |
| "ce_loss_13": 5.681211936473846, |
| "ce_loss_26": 4.933374917507171, |
| "ce_loss_39": 4.788007187843323, |
| "ce_loss_52": 1.3833691507577897, |
| "ce_loss_7": 6.066722130775451, |
| "epoch": 0.093, |
| "grad_norm": 37.601960547328346, |
| "kl_loss_13": 8819.2, |
| "kl_loss_26": 7232.8, |
| "kl_loss_39": 6926.4, |
| "kl_loss_7": 9620.8, |
| "learning_rate": 0.0009827569760057755, |
| "loss": 16262.8, |
| "step": 930 |
| }, |
| { |
| "ce_loss_13": 5.685943353176117, |
| "ce_loss_26": 4.954251933097839, |
| "ce_loss_39": 4.7964679479599, |
| "ce_loss_52": 1.4205988943576813, |
| "ce_loss_7": 6.062719237804413, |
| "epoch": 0.094, |
| "grad_norm": 34.90292075240395, |
| "kl_loss_13": 8707.2, |
| "kl_loss_26": 7154.4, |
| "kl_loss_39": 6841.6, |
| "kl_loss_7": 9496.0, |
| "learning_rate": 0.000982341456229428, |
| "loss": 16011.8, |
| "step": 940 |
| }, |
| { |
| "ce_loss_13": 5.640336573123932, |
| "ce_loss_26": 4.914572691917419, |
| "ce_loss_39": 4.756339108943939, |
| "ce_loss_52": 1.46774483025074, |
| "ce_loss_7": 6.0160892605781555, |
| "epoch": 0.095, |
| "grad_norm": 35.81614055285293, |
| "kl_loss_13": 8553.6, |
| "kl_loss_26": 6999.2, |
| "kl_loss_39": 6671.2, |
| "kl_loss_7": 9348.8, |
| "learning_rate": 0.000981921079279971, |
| "loss": 15864.8, |
| "step": 950 |
| }, |
| { |
| "ce_loss_13": 5.622566449642181, |
| "ce_loss_26": 4.879516458511352, |
| "ce_loss_39": 4.717178559303283, |
| "ce_loss_52": 1.4239765584468842, |
| "ce_loss_7": 5.998941457271576, |
| "epoch": 0.096, |
| "grad_norm": 35.60374709072334, |
| "kl_loss_13": 8597.6, |
| "kl_loss_26": 7007.2, |
| "kl_loss_39": 6670.4, |
| "kl_loss_7": 9384.0, |
| "learning_rate": 0.0009814958493905962, |
| "loss": 15764.6, |
| "step": 960 |
| }, |
| { |
| "ce_loss_13": 5.552615082263946, |
| "ce_loss_26": 4.812614411115646, |
| "ce_loss_39": 4.667081838846206, |
| "ce_loss_52": 1.4328487768769265, |
| "ce_loss_7": 5.935272622108459, |
| "epoch": 0.097, |
| "grad_norm": 34.10755627830643, |
| "kl_loss_13": 8454.4, |
| "kl_loss_26": 6880.8, |
| "kl_loss_39": 6574.4, |
| "kl_loss_7": 9254.4, |
| "learning_rate": 0.0009810657708433637, |
| "loss": 15541.8, |
| "step": 970 |
| }, |
| { |
| "ce_loss_13": 5.534706914424897, |
| "ce_loss_26": 4.787809383869171, |
| "ce_loss_39": 4.631017792224884, |
| "ce_loss_52": 1.4364599764347077, |
| "ce_loss_7": 5.910705745220184, |
| "epoch": 0.098, |
| "grad_norm": 33.30412559481911, |
| "kl_loss_13": 8426.4, |
| "kl_loss_26": 6818.4, |
| "kl_loss_39": 6496.0, |
| "kl_loss_7": 9220.8, |
| "learning_rate": 0.0009806308479691594, |
| "loss": 15486.2, |
| "step": 980 |
| }, |
| { |
| "ce_loss_13": 5.455054485797882, |
| "ce_loss_26": 4.712761473655701, |
| "ce_loss_39": 4.549750781059265, |
| "ce_loss_52": 1.442040067911148, |
| "ce_loss_7": 5.841645193099976, |
| "epoch": 0.099, |
| "grad_norm": 34.061312435089185, |
| "kl_loss_13": 8225.6, |
| "kl_loss_26": 6628.8, |
| "kl_loss_39": 6296.8, |
| "kl_loss_7": 9036.8, |
| "learning_rate": 0.0009801910851476522, |
| "loss": 15382.2, |
| "step": 990 |
| }, |
| { |
| "ce_loss_13": 5.4525358319282535, |
| "ce_loss_26": 4.71786208152771, |
| "ce_loss_39": 4.559128785133362, |
| "ce_loss_52": 1.4428577244281768, |
| "ce_loss_7": 5.826938045024872, |
| "epoch": 0.1, |
| "grad_norm": 33.6953047069211, |
| "kl_loss_13": 8196.0, |
| "kl_loss_26": 6628.8, |
| "kl_loss_39": 6297.6, |
| "kl_loss_7": 8995.2, |
| "learning_rate": 0.0009797464868072487, |
| "loss": 15156.4, |
| "step": 1000 |
| }, |
| { |
| "ce_loss_13": 5.445610964298249, |
| "ce_loss_26": 4.6739885926246645, |
| "ce_loss_39": 4.508283615112305, |
| "ce_loss_52": 1.4168697059154511, |
| "ce_loss_7": 5.834221661090851, |
| "epoch": 0.101, |
| "grad_norm": 32.70197394198742, |
| "kl_loss_13": 8233.6, |
| "kl_loss_26": 6591.2, |
| "kl_loss_39": 6246.4, |
| "kl_loss_7": 9051.2, |
| "learning_rate": 0.0009792970574250492, |
| "loss": 14993.6, |
| "step": 1010 |
| }, |
| { |
| "ce_loss_13": 5.368366336822509, |
| "ce_loss_26": 4.5666680335998535, |
| "ce_loss_39": 4.400501304864884, |
| "ce_loss_52": 1.3814861461520196, |
| "ce_loss_7": 5.762167453765869, |
| "epoch": 0.102, |
| "grad_norm": 32.28847151546708, |
| "kl_loss_13": 8145.6, |
| "kl_loss_26": 6444.0, |
| "kl_loss_39": 6111.2, |
| "kl_loss_7": 8969.6, |
| "learning_rate": 0.0009788428015268028, |
| "loss": 14902.6, |
| "step": 1020 |
| }, |
| { |
| "ce_loss_13": 5.416829228401184, |
| "ce_loss_26": 4.672497856616974, |
| "ce_loss_39": 4.5039793968200685, |
| "ce_loss_52": 1.4590631812810897, |
| "ce_loss_7": 5.7889638304710385, |
| "epoch": 0.103, |
| "grad_norm": 32.94054692999689, |
| "kl_loss_13": 8076.0, |
| "kl_loss_26": 6480.0, |
| "kl_loss_39": 6138.4, |
| "kl_loss_7": 8867.2, |
| "learning_rate": 0.0009783837236868609, |
| "loss": 14715.4, |
| "step": 1030 |
| }, |
| { |
| "ce_loss_13": 5.323912274837494, |
| "ce_loss_26": 4.551275789737701, |
| "ce_loss_39": 4.376495039463043, |
| "ce_loss_52": 1.4376014918088913, |
| "ce_loss_7": 5.712179851531983, |
| "epoch": 0.104, |
| "grad_norm": 32.580444775774126, |
| "kl_loss_13": 7934.4, |
| "kl_loss_26": 6285.6, |
| "kl_loss_39": 5924.0, |
| "kl_loss_7": 8752.0, |
| "learning_rate": 0.0009779198285281327, |
| "loss": 14586.6, |
| "step": 1040 |
| }, |
| { |
| "ce_loss_13": 5.36049393415451, |
| "ce_loss_26": 4.592142331600189, |
| "ce_loss_39": 4.4142293453216555, |
| "ce_loss_52": 1.4498057544231415, |
| "ce_loss_7": 5.73596283197403, |
| "epoch": 0.105, |
| "grad_norm": 31.80216030064833, |
| "kl_loss_13": 7976.8, |
| "kl_loss_26": 6341.6, |
| "kl_loss_39": 5988.8, |
| "kl_loss_7": 8771.2, |
| "learning_rate": 0.0009774511207220368, |
| "loss": 14415.8, |
| "step": 1050 |
| }, |
| { |
| "ce_loss_13": 5.3308478713035585, |
| "ce_loss_26": 4.5750040173530575, |
| "ce_loss_39": 4.39155302643776, |
| "ce_loss_52": 1.4785875469446181, |
| "ce_loss_7": 5.713631689548492, |
| "epoch": 0.106, |
| "grad_norm": 31.180344664143906, |
| "kl_loss_13": 7883.2, |
| "kl_loss_26": 6267.2, |
| "kl_loss_39": 5887.2, |
| "kl_loss_7": 8687.2, |
| "learning_rate": 0.0009769776049884564, |
| "loss": 14270.6, |
| "step": 1060 |
| }, |
| { |
| "ce_loss_13": 5.3476661205291744, |
| "ce_loss_26": 4.561805117130279, |
| "ce_loss_39": 4.391524451971054, |
| "ce_loss_52": 1.4550457745790482, |
| "ce_loss_7": 5.725461614131928, |
| "epoch": 0.107, |
| "grad_norm": 30.9396118714078, |
| "kl_loss_13": 7968.0, |
| "kl_loss_26": 6308.0, |
| "kl_loss_39": 5942.4, |
| "kl_loss_7": 8754.4, |
| "learning_rate": 0.0009764992860956889, |
| "loss": 14268.8, |
| "step": 1070 |
| }, |
| { |
| "ce_loss_13": 5.248398435115814, |
| "ce_loss_26": 4.472868782281876, |
| "ce_loss_39": 4.294939804077148, |
| "ce_loss_52": 1.4237870454788208, |
| "ce_loss_7": 5.6272268176078795, |
| "epoch": 0.108, |
| "grad_norm": 30.45094259940441, |
| "kl_loss_13": 7834.4, |
| "kl_loss_26": 6169.6, |
| "kl_loss_39": 5812.0, |
| "kl_loss_7": 8620.8, |
| "learning_rate": 0.0009760161688604008, |
| "loss": 14058.8, |
| "step": 1080 |
| }, |
| { |
| "ce_loss_13": 5.167715132236481, |
| "ce_loss_26": 4.421313828229904, |
| "ce_loss_39": 4.241793435811997, |
| "ce_loss_52": 1.4617454051971435, |
| "ce_loss_7": 5.536553728580475, |
| "epoch": 0.109, |
| "grad_norm": 30.37346177682711, |
| "kl_loss_13": 7596.8, |
| "kl_loss_26": 5987.2, |
| "kl_loss_39": 5622.4, |
| "kl_loss_7": 8367.2, |
| "learning_rate": 0.0009755282581475768, |
| "loss": 13946.4, |
| "step": 1090 |
| }, |
| { |
| "ce_loss_13": 5.21688460111618, |
| "ce_loss_26": 4.442314791679382, |
| "ce_loss_39": 4.254946118593216, |
| "ce_loss_52": 1.4511815324425696, |
| "ce_loss_7": 5.593706953525543, |
| "epoch": 0.11, |
| "grad_norm": 30.665307755441862, |
| "kl_loss_13": 7699.2, |
| "kl_loss_26": 6030.4, |
| "kl_loss_39": 5652.0, |
| "kl_loss_7": 8485.6, |
| "learning_rate": 0.0009750355588704727, |
| "loss": 13825.8, |
| "step": 1100 |
| }, |
| { |
| "ce_loss_13": 5.112356352806091, |
| "ce_loss_26": 4.310530138015747, |
| "ce_loss_39": 4.125328695774078, |
| "ce_loss_52": 1.4114144504070283, |
| "ce_loss_7": 5.487049925327301, |
| "epoch": 0.111, |
| "grad_norm": 29.407721301071028, |
| "kl_loss_13": 7569.6, |
| "kl_loss_26": 5844.8, |
| "kl_loss_39": 5470.4, |
| "kl_loss_7": 8359.2, |
| "learning_rate": 0.0009745380759905647, |
| "loss": 13627.8, |
| "step": 1110 |
| }, |
| { |
| "ce_loss_13": 5.087459588050843, |
| "ce_loss_26": 4.285146009922028, |
| "ce_loss_39": 4.101419150829315, |
| "ce_loss_52": 1.3823944509029389, |
| "ce_loss_7": 5.47238245010376, |
| "epoch": 0.112, |
| "grad_norm": 28.691648596064702, |
| "kl_loss_13": 7563.2, |
| "kl_loss_26": 5855.2, |
| "kl_loss_39": 5476.0, |
| "kl_loss_7": 8378.4, |
| "learning_rate": 0.0009740358145174998, |
| "loss": 13629.4, |
| "step": 1120 |
| }, |
| { |
| "ce_loss_13": 5.085049080848694, |
| "ce_loss_26": 4.283704102039337, |
| "ce_loss_39": 4.095863288640976, |
| "ce_loss_52": 1.4323463156819343, |
| "ce_loss_7": 5.455269980430603, |
| "epoch": 0.113, |
| "grad_norm": 28.708307757554874, |
| "kl_loss_13": 7468.0, |
| "kl_loss_26": 5755.2, |
| "kl_loss_39": 5364.0, |
| "kl_loss_7": 8244.8, |
| "learning_rate": 0.0009735287795090455, |
| "loss": 13475.0, |
| "step": 1130 |
| }, |
| { |
| "ce_loss_13": 4.9704699397087095, |
| "ce_loss_26": 4.164930063486099, |
| "ce_loss_39": 3.981805819272995, |
| "ce_loss_52": 1.3983336806297302, |
| "ce_loss_7": 5.351285874843597, |
| "epoch": 0.114, |
| "grad_norm": 30.611214200807577, |
| "kl_loss_13": 7311.2, |
| "kl_loss_26": 5585.6, |
| "kl_loss_39": 5199.2, |
| "kl_loss_7": 8099.2, |
| "learning_rate": 0.0009730169760710386, |
| "loss": 13288.2, |
| "step": 1140 |
| }, |
| { |
| "ce_loss_13": 5.094941341876984, |
| "ce_loss_26": 4.275070035457611, |
| "ce_loss_39": 4.084649866819381, |
| "ce_loss_52": 1.4357529014348984, |
| "ce_loss_7": 5.482879185676575, |
| "epoch": 0.115, |
| "grad_norm": 29.988760771554233, |
| "kl_loss_13": 7468.8, |
| "kl_loss_26": 5715.2, |
| "kl_loss_39": 5326.4, |
| "kl_loss_7": 8275.2, |
| "learning_rate": 0.0009725004093573342, |
| "loss": 13196.6, |
| "step": 1150 |
| }, |
| { |
| "ce_loss_13": 4.942017900943756, |
| "ce_loss_26": 4.151496112346649, |
| "ce_loss_39": 3.957593894004822, |
| "ce_loss_52": 1.4098813980817795, |
| "ce_loss_7": 5.323493349552154, |
| "epoch": 0.116, |
| "grad_norm": 30.06505974205765, |
| "kl_loss_13": 7237.6, |
| "kl_loss_26": 5534.4, |
| "kl_loss_39": 5135.2, |
| "kl_loss_7": 8035.2, |
| "learning_rate": 0.0009719790845697534, |
| "loss": 13084.4, |
| "step": 1160 |
| }, |
| { |
| "ce_loss_13": 4.974001240730286, |
| "ce_loss_26": 4.16838675737381, |
| "ce_loss_39": 3.968938571214676, |
| "ce_loss_52": 1.4311222642660142, |
| "ce_loss_7": 5.3611521363258365, |
| "epoch": 0.117, |
| "grad_norm": 28.226442547632384, |
| "kl_loss_13": 7241.6, |
| "kl_loss_26": 5536.8, |
| "kl_loss_39": 5121.6, |
| "kl_loss_7": 8045.6, |
| "learning_rate": 0.0009714530069580309, |
| "loss": 12959.6, |
| "step": 1170 |
| }, |
| { |
| "ce_loss_13": 4.905927586555481, |
| "ce_loss_26": 4.065518736839294, |
| "ce_loss_39": 3.8746193647384644, |
| "ce_loss_52": 1.3948067665100097, |
| "ce_loss_7": 5.290950679779053, |
| "epoch": 0.118, |
| "grad_norm": 26.7183550844667, |
| "kl_loss_13": 7163.2, |
| "kl_loss_26": 5388.0, |
| "kl_loss_39": 4992.8, |
| "kl_loss_7": 7959.2, |
| "learning_rate": 0.0009709221818197624, |
| "loss": 12883.0, |
| "step": 1180 |
| }, |
| { |
| "ce_loss_13": 4.902862447500229, |
| "ce_loss_26": 4.10335453748703, |
| "ce_loss_39": 3.920765632390976, |
| "ce_loss_52": 1.4242859303951263, |
| "ce_loss_7": 5.280882668495178, |
| "epoch": 0.119, |
| "grad_norm": 27.541041012815914, |
| "kl_loss_13": 7096.0, |
| "kl_loss_26": 5372.8, |
| "kl_loss_39": 4987.2, |
| "kl_loss_7": 7895.2, |
| "learning_rate": 0.0009703866145003512, |
| "loss": 12755.6, |
| "step": 1190 |
| }, |
| { |
| "ce_loss_13": 4.91794501543045, |
| "ce_loss_26": 4.094452971220017, |
| "ce_loss_39": 3.892131644487381, |
| "ce_loss_52": 1.4217353582382202, |
| "ce_loss_7": 5.298339033126831, |
| "epoch": 0.12, |
| "grad_norm": 27.647397498896083, |
| "kl_loss_13": 7171.2, |
| "kl_loss_26": 5395.2, |
| "kl_loss_39": 4985.6, |
| "kl_loss_7": 7958.4, |
| "learning_rate": 0.0009698463103929542, |
| "loss": 12646.8, |
| "step": 1200 |
| }, |
| { |
| "ce_loss_13": 4.933221316337585, |
| "ce_loss_26": 4.129461044073105, |
| "ce_loss_39": 3.92621054649353, |
| "ce_loss_52": 1.4751009970903397, |
| "ce_loss_7": 5.315613615512848, |
| "epoch": 0.121, |
| "grad_norm": 26.42127738230431, |
| "kl_loss_13": 7043.2, |
| "kl_loss_26": 5317.6, |
| "kl_loss_39": 4899.2, |
| "kl_loss_7": 7841.6, |
| "learning_rate": 0.0009693012749384279, |
| "loss": 12515.8, |
| "step": 1210 |
| }, |
| { |
| "ce_loss_13": 4.865577363967896, |
| "ce_loss_26": 4.064575934410096, |
| "ce_loss_39": 3.863145834207535, |
| "ce_loss_52": 1.4422448396682739, |
| "ce_loss_7": 5.235701704025269, |
| "epoch": 0.122, |
| "grad_norm": 25.567127248660423, |
| "kl_loss_13": 6988.0, |
| "kl_loss_26": 5271.2, |
| "kl_loss_39": 4855.2, |
| "kl_loss_7": 7760.8, |
| "learning_rate": 0.0009687515136252732, |
| "loss": 12484.2, |
| "step": 1220 |
| }, |
| { |
| "ce_loss_13": 4.875257205963135, |
| "ce_loss_26": 4.068110597133637, |
| "ce_loss_39": 3.866461306810379, |
| "ce_loss_52": 1.4406520485877992, |
| "ce_loss_7": 5.254658281803131, |
| "epoch": 0.123, |
| "grad_norm": 25.60032726683598, |
| "kl_loss_13": 7019.2, |
| "kl_loss_26": 5276.8, |
| "kl_loss_39": 4864.0, |
| "kl_loss_7": 7809.6, |
| "learning_rate": 0.0009681970319895803, |
| "loss": 12358.8, |
| "step": 1230 |
| }, |
| { |
| "ce_loss_13": 4.856750476360321, |
| "ce_loss_26": 4.064332664012909, |
| "ce_loss_39": 3.865360552072525, |
| "ce_loss_52": 1.4734540313482285, |
| "ce_loss_7": 5.232936894893646, |
| "epoch": 0.124, |
| "grad_norm": 27.54111241294886, |
| "kl_loss_13": 6928.8, |
| "kl_loss_26": 5224.8, |
| "kl_loss_39": 4806.4, |
| "kl_loss_7": 7701.6, |
| "learning_rate": 0.0009676378356149733, |
| "loss": 12219.2, |
| "step": 1240 |
| }, |
| { |
| "ce_loss_13": 4.714985811710358, |
| "ce_loss_26": 3.8911590695381166, |
| "ce_loss_39": 3.6897071480751036, |
| "ce_loss_52": 1.4211883068084716, |
| "ce_loss_7": 5.092134392261505, |
| "epoch": 0.125, |
| "grad_norm": 26.159823932609697, |
| "kl_loss_13": 6763.2, |
| "kl_loss_26": 5001.6, |
| "kl_loss_39": 4584.8, |
| "kl_loss_7": 7549.6, |
| "learning_rate": 0.0009670739301325534, |
| "loss": 12043.8, |
| "step": 1250 |
| }, |
| { |
| "ce_loss_13": 4.746155381202698, |
| "ce_loss_26": 3.9151339173316955, |
| "ce_loss_39": 3.717781513929367, |
| "ce_loss_52": 1.3889067679643632, |
| "ce_loss_7": 5.118369615077972, |
| "epoch": 0.126, |
| "grad_norm": 27.210751367526278, |
| "kl_loss_13": 6825.6, |
| "kl_loss_26": 5060.0, |
| "kl_loss_39": 4660.0, |
| "kl_loss_7": 7612.0, |
| "learning_rate": 0.0009665053212208426, |
| "loss": 12020.6, |
| "step": 1260 |
| }, |
| { |
| "ce_loss_13": 4.729644465446472, |
| "ce_loss_26": 3.8812398612499237, |
| "ce_loss_39": 3.686249554157257, |
| "ce_loss_52": 1.421858811378479, |
| "ce_loss_7": 5.109574091434479, |
| "epoch": 0.127, |
| "grad_norm": 24.75325655489569, |
| "kl_loss_13": 6772.8, |
| "kl_loss_26": 4955.6, |
| "kl_loss_39": 4553.2, |
| "kl_loss_7": 7572.8, |
| "learning_rate": 0.0009659320146057262, |
| "loss": 11949.0, |
| "step": 1270 |
| }, |
| { |
| "ce_loss_13": 4.706896722316742, |
| "ce_loss_26": 3.884850525856018, |
| "ce_loss_39": 3.679063153266907, |
| "ce_loss_52": 1.4093473598361015, |
| "ce_loss_7": 5.0908261895179745, |
| "epoch": 0.128, |
| "grad_norm": 25.78294847436066, |
| "kl_loss_13": 6728.0, |
| "kl_loss_26": 4984.8, |
| "kl_loss_39": 4562.4, |
| "kl_loss_7": 7525.6, |
| "learning_rate": 0.0009653540160603955, |
| "loss": 11920.8, |
| "step": 1280 |
| }, |
| { |
| "ce_loss_13": 4.68510691523552, |
| "ce_loss_26": 3.886442297697067, |
| "ce_loss_39": 3.682328295707703, |
| "ce_loss_52": 1.4646018967032433, |
| "ce_loss_7": 5.063843679428101, |
| "epoch": 0.129, |
| "grad_norm": 24.870529388647757, |
| "kl_loss_13": 6587.2, |
| "kl_loss_26": 4871.2, |
| "kl_loss_39": 4451.6, |
| "kl_loss_7": 7384.8, |
| "learning_rate": 0.0009647713314052896, |
| "loss": 11716.1, |
| "step": 1290 |
| }, |
| { |
| "ce_loss_13": 4.6914472579956055, |
| "ce_loss_26": 3.8814328253269195, |
| "ce_loss_39": 3.6700519025325775, |
| "ce_loss_52": 1.428275752067566, |
| "ce_loss_7": 5.065358865261078, |
| "epoch": 0.13, |
| "grad_norm": 24.506148720784267, |
| "kl_loss_13": 6617.6, |
| "kl_loss_26": 4886.4, |
| "kl_loss_39": 4454.8, |
| "kl_loss_7": 7397.6, |
| "learning_rate": 0.0009641839665080363, |
| "loss": 11621.0, |
| "step": 1300 |
| }, |
| { |
| "ce_loss_13": 4.666101861000061, |
| "ce_loss_26": 3.8607113540172575, |
| "ce_loss_39": 3.6500234425067903, |
| "ce_loss_52": 1.4570627421140672, |
| "ce_loss_7": 5.0361028671264645, |
| "epoch": 0.131, |
| "grad_norm": 23.043331928969636, |
| "kl_loss_13": 6562.4, |
| "kl_loss_26": 4820.0, |
| "kl_loss_39": 4389.2, |
| "kl_loss_7": 7340.0, |
| "learning_rate": 0.0009635919272833937, |
| "loss": 11547.2, |
| "step": 1310 |
| }, |
| { |
| "ce_loss_13": 4.57597508430481, |
| "ce_loss_26": 3.757897812128067, |
| "ce_loss_39": 3.541293317079544, |
| "ce_loss_52": 1.417707359790802, |
| "ce_loss_7": 4.957513523101807, |
| "epoch": 0.132, |
| "grad_norm": 23.874048124377556, |
| "kl_loss_13": 6413.6, |
| "kl_loss_26": 4680.0, |
| "kl_loss_39": 4232.4, |
| "kl_loss_7": 7214.4, |
| "learning_rate": 0.0009629952196931902, |
| "loss": 11465.6, |
| "step": 1320 |
| }, |
| { |
| "ce_loss_13": 4.599424958229065, |
| "ce_loss_26": 3.7814753651618958, |
| "ce_loss_39": 3.5756381869316103, |
| "ce_loss_52": 1.435066069662571, |
| "ce_loss_7": 4.962808167934417, |
| "epoch": 0.133, |
| "grad_norm": 27.163150075748632, |
| "kl_loss_13": 6461.6, |
| "kl_loss_26": 4719.2, |
| "kl_loss_39": 4284.8, |
| "kl_loss_7": 7222.4, |
| "learning_rate": 0.0009623938497462645, |
| "loss": 11415.0, |
| "step": 1330 |
| }, |
| { |
| "ce_loss_13": 4.593182015419006, |
| "ce_loss_26": 3.749741864204407, |
| "ce_loss_39": 3.543072348833084, |
| "ce_loss_52": 1.4210506305098534, |
| "ce_loss_7": 4.98070273399353, |
| "epoch": 0.134, |
| "grad_norm": 23.74960078438379, |
| "kl_loss_13": 6465.6, |
| "kl_loss_26": 4674.0, |
| "kl_loss_39": 4258.0, |
| "kl_loss_7": 7272.8, |
| "learning_rate": 0.0009617878234984055, |
| "loss": 11286.2, |
| "step": 1340 |
| }, |
| { |
| "ce_loss_13": 4.5947358965873715, |
| "ce_loss_26": 3.779453754425049, |
| "ce_loss_39": 3.554258805513382, |
| "ce_loss_52": 1.440669310092926, |
| "ce_loss_7": 4.96803480386734, |
| "epoch": 0.135, |
| "grad_norm": 24.32368060127727, |
| "kl_loss_13": 6402.4, |
| "kl_loss_26": 4660.8, |
| "kl_loss_39": 4202.0, |
| "kl_loss_7": 7178.4, |
| "learning_rate": 0.0009611771470522907, |
| "loss": 11138.4, |
| "step": 1350 |
| }, |
| { |
| "ce_loss_13": 4.540663009881973, |
| "ce_loss_26": 3.7292558193206786, |
| "ce_loss_39": 3.5131251573562623, |
| "ce_loss_52": 1.4146902561187744, |
| "ce_loss_7": 4.918925869464874, |
| "epoch": 0.136, |
| "grad_norm": 23.85034387543973, |
| "kl_loss_13": 6383.2, |
| "kl_loss_26": 4642.0, |
| "kl_loss_39": 4189.2, |
| "kl_loss_7": 7181.6, |
| "learning_rate": 0.0009605618265574251, |
| "loss": 11195.2, |
| "step": 1360 |
| }, |
| { |
| "ce_loss_13": 4.595166695117951, |
| "ce_loss_26": 3.7860535979270935, |
| "ce_loss_39": 3.5680083632469177, |
| "ce_loss_52": 1.4854394227266312, |
| "ce_loss_7": 4.9739551663398744, |
| "epoch": 0.137, |
| "grad_norm": 23.647990920122997, |
| "kl_loss_13": 6340.0, |
| "kl_loss_26": 4612.0, |
| "kl_loss_39": 4162.0, |
| "kl_loss_7": 7127.2, |
| "learning_rate": 0.0009599418682100792, |
| "loss": 11028.6, |
| "step": 1370 |
| }, |
| { |
| "ce_loss_13": 4.481674873828888, |
| "ce_loss_26": 3.655723828077316, |
| "ce_loss_39": 3.4441386282444, |
| "ce_loss_52": 1.402228906750679, |
| "ce_loss_7": 4.8588902950286865, |
| "epoch": 0.138, |
| "grad_norm": 23.628756362977956, |
| "kl_loss_13": 6284.8, |
| "kl_loss_26": 4506.8, |
| "kl_loss_39": 4074.4, |
| "kl_loss_7": 7073.6, |
| "learning_rate": 0.0009593172782532268, |
| "loss": 10976.4, |
| "step": 1380 |
| }, |
| { |
| "ce_loss_13": 4.446731185913086, |
| "ce_loss_26": 3.6488849222660065, |
| "ce_loss_39": 3.4283725559711455, |
| "ce_loss_52": 1.425352481007576, |
| "ce_loss_7": 4.823254930973053, |
| "epoch": 0.139, |
| "grad_norm": 23.454937110465252, |
| "kl_loss_13": 6190.4, |
| "kl_loss_26": 4472.0, |
| "kl_loss_39": 4019.6, |
| "kl_loss_7": 6972.0, |
| "learning_rate": 0.0009586880629764817, |
| "loss": 10856.2, |
| "step": 1390 |
| }, |
| { |
| "ce_loss_13": 4.481068539619446, |
| "ce_loss_26": 3.6307739317417145, |
| "ce_loss_39": 3.4139047265052795, |
| "ce_loss_52": 1.3980510637164116, |
| "ce_loss_7": 4.877132707834244, |
| "epoch": 0.14, |
| "grad_norm": 24.701369227992412, |
| "kl_loss_13": 6300.0, |
| "kl_loss_26": 4496.4, |
| "kl_loss_39": 4045.6, |
| "kl_loss_7": 7120.0, |
| "learning_rate": 0.0009580542287160348, |
| "loss": 10848.4, |
| "step": 1400 |
| }, |
| { |
| "ce_loss_13": 4.481135439872742, |
| "ce_loss_26": 3.684358072280884, |
| "ce_loss_39": 3.454758608341217, |
| "ce_loss_52": 1.459119439125061, |
| "ce_loss_7": 4.855645072460175, |
| "epoch": 0.141, |
| "grad_norm": 24.081142665128635, |
| "kl_loss_13": 6146.4, |
| "kl_loss_26": 4455.2, |
| "kl_loss_39": 3985.2, |
| "kl_loss_7": 6936.0, |
| "learning_rate": 0.0009574157818545901, |
| "loss": 10711.8, |
| "step": 1410 |
| }, |
| { |
| "ce_loss_13": 4.440946173667908, |
| "ce_loss_26": 3.6337361335754395, |
| "ce_loss_39": 3.404258185625076, |
| "ce_loss_52": 1.411030687391758, |
| "ce_loss_7": 4.816100597381592, |
| "epoch": 0.142, |
| "grad_norm": 22.547443199755673, |
| "kl_loss_13": 6183.2, |
| "kl_loss_26": 4461.2, |
| "kl_loss_39": 3991.6, |
| "kl_loss_7": 6976.0, |
| "learning_rate": 0.0009567727288213005, |
| "loss": 10724.8, |
| "step": 1420 |
| }, |
| { |
| "ce_loss_13": 4.457812869548798, |
| "ce_loss_26": 3.6864565014839172, |
| "ce_loss_39": 3.452153670787811, |
| "ce_loss_52": 1.4788728266954423, |
| "ce_loss_7": 4.832965791225433, |
| "epoch": 0.143, |
| "grad_norm": 22.827900847688525, |
| "kl_loss_13": 6096.8, |
| "kl_loss_26": 4430.4, |
| "kl_loss_39": 3959.2, |
| "kl_loss_7": 6881.6, |
| "learning_rate": 0.0009561250760917027, |
| "loss": 10616.2, |
| "step": 1430 |
| }, |
| { |
| "ce_loss_13": 4.386147284507752, |
| "ce_loss_26": 3.586784356832504, |
| "ce_loss_39": 3.356312555074692, |
| "ce_loss_52": 1.4125748693943023, |
| "ce_loss_7": 4.7688825011253355, |
| "epoch": 0.144, |
| "grad_norm": 22.45947089806503, |
| "kl_loss_13": 6064.0, |
| "kl_loss_26": 4354.0, |
| "kl_loss_39": 3886.8, |
| "kl_loss_7": 6859.2, |
| "learning_rate": 0.0009554728301876525, |
| "loss": 10473.0, |
| "step": 1440 |
| }, |
| { |
| "ce_loss_13": 4.411167800426483, |
| "ce_loss_26": 3.590187501907349, |
| "ce_loss_39": 3.3659981071949003, |
| "ce_loss_52": 1.4227360993623734, |
| "ce_loss_7": 4.80015469789505, |
| "epoch": 0.145, |
| "grad_norm": 22.134471970609756, |
| "kl_loss_13": 6080.0, |
| "kl_loss_26": 4321.2, |
| "kl_loss_39": 3868.4, |
| "kl_loss_7": 6892.0, |
| "learning_rate": 0.0009548159976772592, |
| "loss": 10449.2, |
| "step": 1450 |
| }, |
| { |
| "ce_loss_13": 4.304250085353852, |
| "ce_loss_26": 3.5215473413467406, |
| "ce_loss_39": 3.3020472466945647, |
| "ce_loss_52": 1.455188724398613, |
| "ce_loss_7": 4.682382225990295, |
| "epoch": 0.146, |
| "grad_norm": 23.296949813068704, |
| "kl_loss_13": 5828.8, |
| "kl_loss_26": 4153.6, |
| "kl_loss_39": 3701.2, |
| "kl_loss_7": 6626.4, |
| "learning_rate": 0.0009541545851748186, |
| "loss": 10336.2, |
| "step": 1460 |
| }, |
| { |
| "ce_loss_13": 4.340453952550888, |
| "ce_loss_26": 3.527716559171677, |
| "ce_loss_39": 3.297034960985184, |
| "ce_loss_52": 1.4175047695636749, |
| "ce_loss_7": 4.720600801706314, |
| "epoch": 0.147, |
| "grad_norm": 23.69989338145452, |
| "kl_loss_13": 5933.6, |
| "kl_loss_26": 4197.2, |
| "kl_loss_39": 3732.0, |
| "kl_loss_7": 6732.8, |
| "learning_rate": 0.0009534885993407473, |
| "loss": 10320.4, |
| "step": 1470 |
| }, |
| { |
| "ce_loss_13": 4.316350519657135, |
| "ce_loss_26": 3.522084206342697, |
| "ce_loss_39": 3.293704879283905, |
| "ce_loss_52": 1.4351435631513596, |
| "ce_loss_7": 4.694415915012359, |
| "epoch": 0.148, |
| "grad_norm": 23.155946924290426, |
| "kl_loss_13": 5859.2, |
| "kl_loss_26": 4177.2, |
| "kl_loss_39": 3708.0, |
| "kl_loss_7": 6649.6, |
| "learning_rate": 0.0009528180468815154, |
| "loss": 10227.4, |
| "step": 1480 |
| }, |
| { |
| "ce_loss_13": 4.34768191576004, |
| "ce_loss_26": 3.5612153470516206, |
| "ce_loss_39": 3.3256225168704985, |
| "ce_loss_52": 1.4718306064605713, |
| "ce_loss_7": 4.73325879573822, |
| "epoch": 0.149, |
| "grad_norm": 22.828288355166595, |
| "kl_loss_13": 5840.8, |
| "kl_loss_26": 4173.2, |
| "kl_loss_39": 3699.2, |
| "kl_loss_7": 6654.4, |
| "learning_rate": 0.0009521429345495787, |
| "loss": 10213.0, |
| "step": 1490 |
| }, |
| { |
| "ce_loss_13": 4.285507726669311, |
| "ce_loss_26": 3.494914507865906, |
| "ce_loss_39": 3.2638841211795806, |
| "ce_loss_52": 1.4394250243902207, |
| "ce_loss_7": 4.674430012702942, |
| "epoch": 0.15, |
| "grad_norm": 22.358102006612796, |
| "kl_loss_13": 5802.4, |
| "kl_loss_26": 4114.4, |
| "kl_loss_39": 3639.2, |
| "kl_loss_7": 6619.2, |
| "learning_rate": 0.0009514632691433108, |
| "loss": 10144.0, |
| "step": 1500 |
| }, |
| { |
| "ce_loss_13": 4.274980753660202, |
| "ce_loss_26": 3.4573559522628785, |
| "ce_loss_39": 3.2235658168792725, |
| "ce_loss_52": 1.4003299355506897, |
| "ce_loss_7": 4.676058840751648, |
| "epoch": 0.151, |
| "grad_norm": 21.54131953317247, |
| "kl_loss_13": 5876.0, |
| "kl_loss_26": 4120.4, |
| "kl_loss_39": 3643.6, |
| "kl_loss_7": 6711.2, |
| "learning_rate": 0.0009507790575069346, |
| "loss": 10084.8, |
| "step": 1510 |
| }, |
| { |
| "ce_loss_13": 4.239670622348785, |
| "ce_loss_26": 3.4528944075107573, |
| "ce_loss_39": 3.221757102012634, |
| "ce_loss_52": 1.4378845229744912, |
| "ce_loss_7": 4.625989091396332, |
| "epoch": 0.152, |
| "grad_norm": 20.883193615641517, |
| "kl_loss_13": 5700.0, |
| "kl_loss_26": 4031.2, |
| "kl_loss_39": 3560.0, |
| "kl_loss_7": 6514.4, |
| "learning_rate": 0.0009500903065304539, |
| "loss": 9981.0, |
| "step": 1520 |
| }, |
| { |
| "ce_loss_13": 4.250718909502029, |
| "ce_loss_26": 3.4539793133735657, |
| "ce_loss_39": 3.2287534534931184, |
| "ce_loss_52": 1.450673970580101, |
| "ce_loss_7": 4.637939321994781, |
| "epoch": 0.153, |
| "grad_norm": 21.937587942461658, |
| "kl_loss_13": 5717.6, |
| "kl_loss_26": 4016.0, |
| "kl_loss_39": 3551.6, |
| "kl_loss_7": 6520.0, |
| "learning_rate": 0.0009493970231495835, |
| "loss": 9886.2, |
| "step": 1530 |
| }, |
| { |
| "ce_loss_13": 4.213818311691284, |
| "ce_loss_26": 3.4225172460079194, |
| "ce_loss_39": 3.192109799385071, |
| "ce_loss_52": 1.4253905564546585, |
| "ce_loss_7": 4.591876769065857, |
| "epoch": 0.154, |
| "grad_norm": 22.11614774484019, |
| "kl_loss_13": 5669.6, |
| "kl_loss_26": 3997.6, |
| "kl_loss_39": 3517.6, |
| "kl_loss_7": 6459.2, |
| "learning_rate": 0.0009486992143456792, |
| "loss": 9848.6, |
| "step": 1540 |
| }, |
| { |
| "ce_loss_13": 4.188841539621353, |
| "ce_loss_26": 3.3936797797679903, |
| "ce_loss_39": 3.15527623295784, |
| "ce_loss_52": 1.4304118230938911, |
| "ce_loss_7": 4.582345807552338, |
| "epoch": 0.155, |
| "grad_norm": 23.528038258157196, |
| "kl_loss_13": 5588.0, |
| "kl_loss_26": 3908.0, |
| "kl_loss_39": 3429.6, |
| "kl_loss_7": 6408.8, |
| "learning_rate": 0.0009479968871456679, |
| "loss": 9804.0, |
| "step": 1550 |
| }, |
| { |
| "ce_loss_13": 4.199311399459839, |
| "ce_loss_26": 3.388694739341736, |
| "ce_loss_39": 3.161331224441528, |
| "ce_loss_52": 1.424024721980095, |
| "ce_loss_7": 4.598053079843521, |
| "epoch": 0.156, |
| "grad_norm": 20.794486798088236, |
| "kl_loss_13": 5643.2, |
| "kl_loss_26": 3922.0, |
| "kl_loss_39": 3462.8, |
| "kl_loss_7": 6473.6, |
| "learning_rate": 0.0009472900486219768, |
| "loss": 9758.3, |
| "step": 1560 |
| }, |
| { |
| "ce_loss_13": 4.16922065615654, |
| "ce_loss_26": 3.3762763381004333, |
| "ce_loss_39": 3.142381691932678, |
| "ce_loss_52": 1.4278530597686767, |
| "ce_loss_7": 4.5707217931747435, |
| "epoch": 0.157, |
| "grad_norm": 21.684568107604626, |
| "kl_loss_13": 5582.4, |
| "kl_loss_26": 3884.4, |
| "kl_loss_39": 3405.2, |
| "kl_loss_7": 6420.0, |
| "learning_rate": 0.000946578705892462, |
| "loss": 9625.8, |
| "step": 1570 |
| }, |
| { |
| "ce_loss_13": 4.178904807567596, |
| "ce_loss_26": 3.382037007808685, |
| "ce_loss_39": 3.137607681751251, |
| "ce_loss_52": 1.4311140328645706, |
| "ce_loss_7": 4.566913962364197, |
| "epoch": 0.158, |
| "grad_norm": 21.97647697577533, |
| "kl_loss_13": 5572.8, |
| "kl_loss_26": 3885.6, |
| "kl_loss_39": 3393.6, |
| "kl_loss_7": 6394.4, |
| "learning_rate": 0.0009458628661203367, |
| "loss": 9608.1, |
| "step": 1580 |
| }, |
| { |
| "ce_loss_13": 4.179209893941879, |
| "ce_loss_26": 3.378256690502167, |
| "ce_loss_39": 3.13810538649559, |
| "ce_loss_52": 1.418858152627945, |
| "ce_loss_7": 4.562736237049103, |
| "epoch": 0.159, |
| "grad_norm": 20.949460387099393, |
| "kl_loss_13": 5612.0, |
| "kl_loss_26": 3914.4, |
| "kl_loss_39": 3428.8, |
| "kl_loss_7": 6420.0, |
| "learning_rate": 0.0009451425365140996, |
| "loss": 9608.8, |
| "step": 1590 |
| }, |
| { |
| "ce_loss_13": 4.1623717725276945, |
| "ce_loss_26": 3.376670056581497, |
| "ce_loss_39": 3.1331222474575045, |
| "ce_loss_52": 1.434694454073906, |
| "ce_loss_7": 4.556069934368134, |
| "epoch": 0.16, |
| "grad_norm": 20.884587914746188, |
| "kl_loss_13": 5573.6, |
| "kl_loss_26": 3878.8, |
| "kl_loss_39": 3382.4, |
| "kl_loss_7": 6402.4, |
| "learning_rate": 0.0009444177243274617, |
| "loss": 9535.6, |
| "step": 1600 |
| }, |
| { |
| "ce_loss_13": 4.082578724622726, |
| "ce_loss_26": 3.289813929796219, |
| "ce_loss_39": 3.0515677452087404, |
| "ce_loss_52": 1.4236672833561896, |
| "ce_loss_7": 4.478320574760437, |
| "epoch": 0.161, |
| "grad_norm": 20.592533194999756, |
| "kl_loss_13": 5423.2, |
| "kl_loss_26": 3736.8, |
| "kl_loss_39": 3249.6, |
| "kl_loss_7": 6253.6, |
| "learning_rate": 0.0009436884368592739, |
| "loss": 9466.0, |
| "step": 1610 |
| }, |
| { |
| "ce_loss_13": 4.142659282684326, |
| "ce_loss_26": 3.3751652896404267, |
| "ce_loss_39": 3.137872564792633, |
| "ce_loss_52": 1.481352651119232, |
| "ce_loss_7": 4.527895116806031, |
| "epoch": 0.162, |
| "grad_norm": 21.486710542968336, |
| "kl_loss_13": 5416.8, |
| "kl_loss_26": 3772.8, |
| "kl_loss_39": 3282.8, |
| "kl_loss_7": 6228.8, |
| "learning_rate": 0.0009429546814534529, |
| "loss": 9367.9, |
| "step": 1620 |
| }, |
| { |
| "ce_loss_13": 4.141797959804535, |
| "ce_loss_26": 3.344935214519501, |
| "ce_loss_39": 3.1015843570232393, |
| "ce_loss_52": 1.4449981674551964, |
| "ce_loss_7": 4.53586882352829, |
| "epoch": 0.163, |
| "grad_norm": 22.257565389083407, |
| "kl_loss_13": 5484.8, |
| "kl_loss_26": 3776.8, |
| "kl_loss_39": 3282.8, |
| "kl_loss_7": 6309.6, |
| "learning_rate": 0.0009422164654989072, |
| "loss": 9391.3, |
| "step": 1630 |
| }, |
| { |
| "ce_loss_13": 4.131260120868683, |
| "ce_loss_26": 3.3283946096897123, |
| "ce_loss_39": 3.08265677690506, |
| "ce_loss_52": 1.446463230252266, |
| "ce_loss_7": 4.523310673236847, |
| "epoch": 0.164, |
| "grad_norm": 20.40514368262374, |
| "kl_loss_13": 5457.6, |
| "kl_loss_26": 3766.0, |
| "kl_loss_39": 3265.6, |
| "kl_loss_7": 6288.8, |
| "learning_rate": 0.0009414737964294635, |
| "loss": 9297.4, |
| "step": 1640 |
| }, |
| { |
| "ce_loss_13": 4.055157667398452, |
| "ce_loss_26": 3.2639363288879393, |
| "ce_loss_39": 3.0281569600105285, |
| "ce_loss_52": 1.451804205775261, |
| "ce_loss_7": 4.447070962190628, |
| "epoch": 0.165, |
| "grad_norm": 21.902920394223948, |
| "kl_loss_13": 5323.2, |
| "kl_loss_26": 3634.0, |
| "kl_loss_39": 3150.0, |
| "kl_loss_7": 6143.2, |
| "learning_rate": 0.000940726681723791, |
| "loss": 9207.4, |
| "step": 1650 |
| }, |
| { |
| "ce_loss_13": 3.9808266043663023, |
| "ce_loss_26": 3.1883151113986967, |
| "ce_loss_39": 2.9586188077926634, |
| "ce_loss_52": 1.4088758006691933, |
| "ce_loss_7": 4.372277349233627, |
| "epoch": 0.166, |
| "grad_norm": 21.183658337296244, |
| "kl_loss_13": 5256.0, |
| "kl_loss_26": 3567.6, |
| "kl_loss_39": 3100.8, |
| "kl_loss_7": 6079.2, |
| "learning_rate": 0.0009399751289053266, |
| "loss": 9204.0, |
| "step": 1660 |
| }, |
| { |
| "ce_loss_13": 4.0190062642097475, |
| "ce_loss_26": 3.22450470328331, |
| "ce_loss_39": 2.987401658296585, |
| "ce_loss_52": 1.3997518077492714, |
| "ce_loss_7": 4.41674884557724, |
| "epoch": 0.167, |
| "grad_norm": 21.809854839151214, |
| "kl_loss_13": 5346.4, |
| "kl_loss_26": 3648.8, |
| "kl_loss_39": 3164.8, |
| "kl_loss_7": 6175.2, |
| "learning_rate": 0.0009392191455421988, |
| "loss": 9183.3, |
| "step": 1670 |
| }, |
| { |
| "ce_loss_13": 3.9647205591201784, |
| "ce_loss_26": 3.1920640766620636, |
| "ce_loss_39": 2.951520323753357, |
| "ce_loss_52": 1.385107731819153, |
| "ce_loss_7": 4.359467995166779, |
| "epoch": 0.168, |
| "grad_norm": 20.368238654152925, |
| "kl_loss_13": 5256.8, |
| "kl_loss_26": 3602.8, |
| "kl_loss_39": 3118.0, |
| "kl_loss_7": 6080.8, |
| "learning_rate": 0.0009384587392471515, |
| "loss": 9080.3, |
| "step": 1680 |
| }, |
| { |
| "ce_loss_13": 3.9925671815872192, |
| "ce_loss_26": 3.2080724120140074, |
| "ce_loss_39": 2.9695263385772703, |
| "ce_loss_52": 1.4156971365213393, |
| "ce_loss_7": 4.380452990531921, |
| "epoch": 0.169, |
| "grad_norm": 20.999177946491915, |
| "kl_loss_13": 5268.0, |
| "kl_loss_26": 3596.4, |
| "kl_loss_39": 3100.8, |
| "kl_loss_7": 6080.8, |
| "learning_rate": 0.0009376939176774678, |
| "loss": 8989.5, |
| "step": 1690 |
| }, |
| { |
| "ce_loss_13": 4.0180779755115505, |
| "ce_loss_26": 3.258928191661835, |
| "ce_loss_39": 3.0135749876499176, |
| "ce_loss_52": 1.450287464261055, |
| "ce_loss_7": 4.408803248405457, |
| "epoch": 0.17, |
| "grad_norm": 19.79885149243747, |
| "kl_loss_13": 5224.8, |
| "kl_loss_26": 3590.8, |
| "kl_loss_39": 3092.0, |
| "kl_loss_7": 6036.8, |
| "learning_rate": 0.0009369246885348925, |
| "loss": 8994.3, |
| "step": 1700 |
| }, |
| { |
| "ce_loss_13": 4.005008333921433, |
| "ce_loss_26": 3.2108667314052584, |
| "ce_loss_39": 2.9691853642463686, |
| "ce_loss_52": 1.4231860041618347, |
| "ce_loss_7": 4.408407872915268, |
| "epoch": 0.171, |
| "grad_norm": 20.19997348661953, |
| "kl_loss_13": 5275.2, |
| "kl_loss_26": 3580.8, |
| "kl_loss_39": 3088.8, |
| "kl_loss_7": 6116.8, |
| "learning_rate": 0.0009361510595655545, |
| "loss": 9032.7, |
| "step": 1710 |
| }, |
| { |
| "ce_loss_13": 4.0295430123806, |
| "ce_loss_26": 3.269349628686905, |
| "ce_loss_39": 3.0168069303035736, |
| "ce_loss_52": 1.4558824241161346, |
| "ce_loss_7": 4.418658912181854, |
| "epoch": 0.172, |
| "grad_norm": 20.29063277895484, |
| "kl_loss_13": 5251.2, |
| "kl_loss_26": 3624.8, |
| "kl_loss_39": 3109.2, |
| "kl_loss_7": 6066.4, |
| "learning_rate": 0.0009353730385598887, |
| "loss": 8917.6, |
| "step": 1720 |
| }, |
| { |
| "ce_loss_13": 3.904751992225647, |
| "ce_loss_26": 3.117345708608627, |
| "ce_loss_39": 2.8736896753311156, |
| "ce_loss_52": 1.404754376411438, |
| "ce_loss_7": 4.301223260164261, |
| "epoch": 0.173, |
| "grad_norm": 21.33168840347063, |
| "kl_loss_13": 5095.2, |
| "kl_loss_26": 3414.0, |
| "kl_loss_39": 2926.0, |
| "kl_loss_7": 5920.8, |
| "learning_rate": 0.0009345906333525581, |
| "loss": 8827.0, |
| "step": 1730 |
| }, |
| { |
| "ce_loss_13": 3.943844336271286, |
| "ce_loss_26": 3.192184156179428, |
| "ce_loss_39": 2.938348424434662, |
| "ce_loss_52": 1.4280656158924103, |
| "ce_loss_7": 4.340347635746002, |
| "epoch": 0.174, |
| "grad_norm": 20.78517763533083, |
| "kl_loss_13": 5121.6, |
| "kl_loss_26": 3512.0, |
| "kl_loss_39": 2997.2, |
| "kl_loss_7": 5951.2, |
| "learning_rate": 0.0009338038518223745, |
| "loss": 8776.4, |
| "step": 1740 |
| }, |
| { |
| "ce_loss_13": 3.9746175587177275, |
| "ce_loss_26": 3.217360532283783, |
| "ce_loss_39": 2.9738565742969514, |
| "ce_loss_52": 1.460440719127655, |
| "ce_loss_7": 4.36298366189003, |
| "epoch": 0.175, |
| "grad_norm": 22.95282400415446, |
| "kl_loss_13": 5116.0, |
| "kl_loss_26": 3500.0, |
| "kl_loss_39": 2996.0, |
| "kl_loss_7": 5937.6, |
| "learning_rate": 0.0009330127018922195, |
| "loss": 8715.7, |
| "step": 1750 |
| }, |
| { |
| "ce_loss_13": 3.903409707546234, |
| "ce_loss_26": 3.1333308279514314, |
| "ce_loss_39": 2.8973484218120573, |
| "ce_loss_52": 1.4389021694660187, |
| "ce_loss_7": 4.297668445110321, |
| "epoch": 0.176, |
| "grad_norm": 19.71888822868113, |
| "kl_loss_13": 5043.2, |
| "kl_loss_26": 3399.6, |
| "kl_loss_39": 2906.0, |
| "kl_loss_7": 5868.8, |
| "learning_rate": 0.0009322171915289634, |
| "loss": 8660.5, |
| "step": 1760 |
| }, |
| { |
| "ce_loss_13": 3.9436737656593324, |
| "ce_loss_26": 3.180408328771591, |
| "ce_loss_39": 2.933422142267227, |
| "ce_loss_52": 1.468785560131073, |
| "ce_loss_7": 4.335923504829407, |
| "epoch": 0.177, |
| "grad_norm": 21.100233895265884, |
| "kl_loss_13": 5036.8, |
| "kl_loss_26": 3416.8, |
| "kl_loss_39": 2893.6, |
| "kl_loss_7": 5853.6, |
| "learning_rate": 0.0009314173287433873, |
| "loss": 8685.1, |
| "step": 1770 |
| }, |
| { |
| "ce_loss_13": 4.000654596090317, |
| "ce_loss_26": 3.24809735417366, |
| "ce_loss_39": 2.9819031238555906, |
| "ce_loss_52": 1.4765232503414154, |
| "ce_loss_7": 4.392659711837768, |
| "epoch": 0.178, |
| "grad_norm": 20.346113365710192, |
| "kl_loss_13": 5141.6, |
| "kl_loss_26": 3518.0, |
| "kl_loss_39": 2990.4, |
| "kl_loss_7": 5964.0, |
| "learning_rate": 0.0009306131215901003, |
| "loss": 8673.2, |
| "step": 1780 |
| }, |
| { |
| "ce_loss_13": 3.9246813535690306, |
| "ce_loss_26": 3.1745048224925996, |
| "ce_loss_39": 2.9245960414409637, |
| "ce_loss_52": 1.4693263441324234, |
| "ce_loss_7": 4.317492133378982, |
| "epoch": 0.179, |
| "grad_norm": 19.397137651046872, |
| "kl_loss_13": 5018.4, |
| "kl_loss_26": 3389.2, |
| "kl_loss_39": 2878.0, |
| "kl_loss_7": 5844.8, |
| "learning_rate": 0.0009298045781674596, |
| "loss": 8564.1, |
| "step": 1790 |
| }, |
| { |
| "ce_loss_13": 3.910848397016525, |
| "ce_loss_26": 3.146669828891754, |
| "ce_loss_39": 2.8859269857406615, |
| "ce_loss_52": 1.419717761874199, |
| "ce_loss_7": 4.3105459094047545, |
| "epoch": 0.18, |
| "grad_norm": 19.504781875531744, |
| "kl_loss_13": 5040.0, |
| "kl_loss_26": 3411.2, |
| "kl_loss_39": 2894.0, |
| "kl_loss_7": 5888.0, |
| "learning_rate": 0.0009289917066174886, |
| "loss": 8563.1, |
| "step": 1800 |
| }, |
| { |
| "ce_loss_13": 3.886237096786499, |
| "ce_loss_26": 3.0992377579212187, |
| "ce_loss_39": 2.8514576256275177, |
| "ce_loss_52": 1.4184779956936837, |
| "ce_loss_7": 4.274789291620254, |
| "epoch": 0.181, |
| "grad_norm": 19.60587995762198, |
| "kl_loss_13": 5042.4, |
| "kl_loss_26": 3362.0, |
| "kl_loss_39": 2856.4, |
| "kl_loss_7": 5855.2, |
| "learning_rate": 0.0009281745151257945, |
| "loss": 8453.1, |
| "step": 1810 |
| }, |
| { |
| "ce_loss_13": 3.9144074499607084, |
| "ce_loss_26": 3.1662435114383696, |
| "ce_loss_39": 2.9089259922504427, |
| "ce_loss_52": 1.478528293967247, |
| "ce_loss_7": 4.301838612556457, |
| "epoch": 0.182, |
| "grad_norm": 19.79968756805323, |
| "kl_loss_13": 4923.2, |
| "kl_loss_26": 3333.2, |
| "kl_loss_39": 2811.6, |
| "kl_loss_7": 5736.8, |
| "learning_rate": 0.0009273530119214868, |
| "loss": 8471.9, |
| "step": 1820 |
| }, |
| { |
| "ce_loss_13": 3.8238776862621306, |
| "ce_loss_26": 3.058783656358719, |
| "ce_loss_39": 2.807573360204697, |
| "ce_loss_52": 1.4178009316325189, |
| "ce_loss_7": 4.227353280782699, |
| "epoch": 0.183, |
| "grad_norm": 19.471751859426742, |
| "kl_loss_13": 4899.2, |
| "kl_loss_26": 3273.6, |
| "kl_loss_39": 2762.8, |
| "kl_loss_7": 5738.4, |
| "learning_rate": 0.0009265272052770935, |
| "loss": 8399.4, |
| "step": 1830 |
| }, |
| { |
| "ce_loss_13": 3.836485821008682, |
| "ce_loss_26": 3.0666845202445985, |
| "ce_loss_39": 2.818044346570969, |
| "ce_loss_52": 1.410616011917591, |
| "ce_loss_7": 4.236015152931214, |
| "epoch": 0.184, |
| "grad_norm": 19.102126667670856, |
| "kl_loss_13": 4940.0, |
| "kl_loss_26": 3286.0, |
| "kl_loss_39": 2780.4, |
| "kl_loss_7": 5788.0, |
| "learning_rate": 0.0009256971035084784, |
| "loss": 8347.7, |
| "step": 1840 |
| }, |
| { |
| "ce_loss_13": 3.8049618661403657, |
| "ce_loss_26": 3.0603095471858976, |
| "ce_loss_39": 2.8144713938236237, |
| "ce_loss_52": 1.4259676218032837, |
| "ce_loss_7": 4.199950724840164, |
| "epoch": 0.185, |
| "grad_norm": 19.382557477849222, |
| "kl_loss_13": 4835.6, |
| "kl_loss_26": 3257.6, |
| "kl_loss_39": 2762.0, |
| "kl_loss_7": 5657.6, |
| "learning_rate": 0.0009248627149747573, |
| "loss": 8313.5, |
| "step": 1850 |
| }, |
| { |
| "ce_loss_13": 3.839466482400894, |
| "ce_loss_26": 3.065267437696457, |
| "ce_loss_39": 2.81703776717186, |
| "ce_loss_52": 1.4297384396195412, |
| "ce_loss_7": 4.240725481510163, |
| "epoch": 0.186, |
| "grad_norm": 20.053275602042234, |
| "kl_loss_13": 4908.4, |
| "kl_loss_26": 3269.6, |
| "kl_loss_39": 2759.6, |
| "kl_loss_7": 5741.6, |
| "learning_rate": 0.0009240240480782129, |
| "loss": 8305.0, |
| "step": 1860 |
| }, |
| { |
| "ce_loss_13": 3.8183856308460236, |
| "ce_loss_26": 3.070716941356659, |
| "ce_loss_39": 2.807391846179962, |
| "ce_loss_52": 1.4359196320176124, |
| "ce_loss_7": 4.214242458343506, |
| "epoch": 0.187, |
| "grad_norm": 19.191079413729856, |
| "kl_loss_13": 4847.2, |
| "kl_loss_26": 3263.6, |
| "kl_loss_39": 2733.2, |
| "kl_loss_7": 5672.8, |
| "learning_rate": 0.0009231811112642122, |
| "loss": 8227.6, |
| "step": 1870 |
| }, |
| { |
| "ce_loss_13": 3.779190129041672, |
| "ce_loss_26": 3.0389082789421082, |
| "ce_loss_39": 2.783122771978378, |
| "ce_loss_52": 1.4208515673875808, |
| "ce_loss_7": 4.162911784648895, |
| "epoch": 0.188, |
| "grad_norm": 20.43241639662848, |
| "kl_loss_13": 4795.2, |
| "kl_loss_26": 3215.2, |
| "kl_loss_39": 2693.2, |
| "kl_loss_7": 5606.4, |
| "learning_rate": 0.0009223339130211192, |
| "loss": 8213.8, |
| "step": 1880 |
| }, |
| { |
| "ce_loss_13": 3.708034944534302, |
| "ce_loss_26": 2.9702564030885696, |
| "ce_loss_39": 2.735187420248985, |
| "ce_loss_52": 1.409775149822235, |
| "ce_loss_7": 4.099964368343353, |
| "epoch": 0.189, |
| "grad_norm": 19.723419677891812, |
| "kl_loss_13": 4691.2, |
| "kl_loss_26": 3118.0, |
| "kl_loss_39": 2625.8, |
| "kl_loss_7": 5516.0, |
| "learning_rate": 0.0009214824618802108, |
| "loss": 8146.0, |
| "step": 1890 |
| }, |
| { |
| "ce_loss_13": 3.848232001066208, |
| "ce_loss_26": 3.06461501121521, |
| "ce_loss_39": 2.81026514172554, |
| "ce_loss_52": 1.4419742107391358, |
| "ce_loss_7": 4.244399529695511, |
| "epoch": 0.19, |
| "grad_norm": 21.429961437314283, |
| "kl_loss_13": 4923.2, |
| "kl_loss_26": 3246.4, |
| "kl_loss_39": 2725.6, |
| "kl_loss_7": 5755.2, |
| "learning_rate": 0.0009206267664155906, |
| "loss": 8168.2, |
| "step": 1900 |
| }, |
| { |
| "ce_loss_13": 3.74611656665802, |
| "ce_loss_26": 2.995819491147995, |
| "ce_loss_39": 2.7504830598831176, |
| "ce_loss_52": 1.429488417506218, |
| "ce_loss_7": 4.137593048810959, |
| "epoch": 0.191, |
| "grad_norm": 20.703417767001277, |
| "kl_loss_13": 4708.8, |
| "kl_loss_26": 3114.8, |
| "kl_loss_39": 2616.8, |
| "kl_loss_7": 5532.0, |
| "learning_rate": 0.0009197668352441024, |
| "loss": 8113.4, |
| "step": 1910 |
| }, |
| { |
| "ce_loss_13": 3.7616052985191346, |
| "ce_loss_26": 3.0063544154167174, |
| "ce_loss_39": 2.7453058779239656, |
| "ce_loss_52": 1.4119072929024696, |
| "ce_loss_7": 4.15754896402359, |
| "epoch": 0.192, |
| "grad_norm": 19.851460642991412, |
| "kl_loss_13": 4782.4, |
| "kl_loss_26": 3173.6, |
| "kl_loss_39": 2644.0, |
| "kl_loss_7": 5609.6, |
| "learning_rate": 0.0009189026770252437, |
| "loss": 8087.1, |
| "step": 1920 |
| }, |
| { |
| "ce_loss_13": 3.7914208650588987, |
| "ce_loss_26": 3.0324925601482393, |
| "ce_loss_39": 2.7728191137313845, |
| "ce_loss_52": 1.4355733066797256, |
| "ce_loss_7": 4.178089827299118, |
| "epoch": 0.193, |
| "grad_norm": 19.2014881795966, |
| "kl_loss_13": 4800.8, |
| "kl_loss_26": 3194.0, |
| "kl_loss_39": 2659.2, |
| "kl_loss_7": 5608.8, |
| "learning_rate": 0.000918034300461078, |
| "loss": 8051.8, |
| "step": 1930 |
| }, |
| { |
| "ce_loss_13": 3.7237455368041994, |
| "ce_loss_26": 2.979181283712387, |
| "ce_loss_39": 2.7309202194213866, |
| "ce_loss_52": 1.4160432904958724, |
| "ce_loss_7": 4.128066539764404, |
| "epoch": 0.194, |
| "grad_norm": 20.3086236795729, |
| "kl_loss_13": 4720.0, |
| "kl_loss_26": 3114.8, |
| "kl_loss_39": 2602.4, |
| "kl_loss_7": 5556.0, |
| "learning_rate": 0.0009171617142961477, |
| "loss": 8041.9, |
| "step": 1940 |
| }, |
| { |
| "ce_loss_13": 3.749431645870209, |
| "ce_loss_26": 2.9937612235546114, |
| "ce_loss_39": 2.744140648841858, |
| "ce_loss_52": 1.4348472714424134, |
| "ce_loss_7": 4.156274873018265, |
| "epoch": 0.195, |
| "grad_norm": 19.294577091174897, |
| "kl_loss_13": 4722.4, |
| "kl_loss_26": 3116.0, |
| "kl_loss_39": 2605.2, |
| "kl_loss_7": 5567.2, |
| "learning_rate": 0.0009162849273173857, |
| "loss": 7982.1, |
| "step": 1950 |
| }, |
| { |
| "ce_loss_13": 3.7092731952667237, |
| "ce_loss_26": 2.9811393320560455, |
| "ce_loss_39": 2.733800619840622, |
| "ce_loss_52": 1.4478029429912567, |
| "ce_loss_7": 4.0951203346252445, |
| "epoch": 0.196, |
| "grad_norm": 18.879569020896366, |
| "kl_loss_13": 4628.0, |
| "kl_loss_26": 3074.0, |
| "kl_loss_39": 2569.2, |
| "kl_loss_7": 5440.8, |
| "learning_rate": 0.0009154039483540273, |
| "loss": 7938.2, |
| "step": 1960 |
| }, |
| { |
| "ce_loss_13": 3.816760164499283, |
| "ce_loss_26": 3.0473806083202364, |
| "ce_loss_39": 2.7949241638183593, |
| "ce_loss_52": 1.466755247116089, |
| "ce_loss_7": 4.206719404458999, |
| "epoch": 0.197, |
| "grad_norm": 18.719442415203407, |
| "kl_loss_13": 4774.4, |
| "kl_loss_26": 3153.6, |
| "kl_loss_39": 2628.8, |
| "kl_loss_7": 5592.0, |
| "learning_rate": 0.0009145187862775209, |
| "loss": 7902.9, |
| "step": 1970 |
| }, |
| { |
| "ce_loss_13": 3.6841754376888276, |
| "ce_loss_26": 2.9468030989170075, |
| "ce_loss_39": 2.694840121269226, |
| "ce_loss_52": 1.4269890293478966, |
| "ce_loss_7": 4.081214648485184, |
| "epoch": 0.198, |
| "grad_norm": 18.93967966275207, |
| "kl_loss_13": 4605.6, |
| "kl_loss_26": 3030.4, |
| "kl_loss_39": 2512.8, |
| "kl_loss_7": 5443.2, |
| "learning_rate": 0.0009136294500014386, |
| "loss": 7824.0, |
| "step": 1980 |
| }, |
| { |
| "ce_loss_13": 3.7859152793884276, |
| "ce_loss_26": 3.025045871734619, |
| "ce_loss_39": 2.761893022060394, |
| "ce_loss_52": 1.439668196439743, |
| "ce_loss_7": 4.175345808267593, |
| "epoch": 0.199, |
| "grad_norm": 18.767073263034167, |
| "kl_loss_13": 4776.0, |
| "kl_loss_26": 3164.0, |
| "kl_loss_39": 2630.4, |
| "kl_loss_7": 5584.0, |
| "learning_rate": 0.000912735948481387, |
| "loss": 7845.9, |
| "step": 1990 |
| }, |
| { |
| "ce_loss_13": 3.7032747983932497, |
| "ce_loss_26": 2.965251809358597, |
| "ce_loss_39": 2.708036279678345, |
| "ce_loss_52": 1.4405365601181983, |
| "ce_loss_7": 4.098276823759079, |
| "epoch": 0.2, |
| "grad_norm": 18.50836033098629, |
| "kl_loss_13": 4630.0, |
| "kl_loss_26": 3052.8, |
| "kl_loss_39": 2530.4, |
| "kl_loss_7": 5457.6, |
| "learning_rate": 0.0009118382907149164, |
| "loss": 7748.9, |
| "step": 2000 |
| }, |
| { |
| "ce_loss_13": 3.717062991857529, |
| "ce_loss_26": 2.979129308462143, |
| "ce_loss_39": 2.7225220024585726, |
| "ce_loss_52": 1.45234707146883, |
| "ce_loss_7": 4.110658597946167, |
| "epoch": 0.201, |
| "grad_norm": 19.844351529751968, |
| "kl_loss_13": 4629.6, |
| "kl_loss_26": 3045.6, |
| "kl_loss_39": 2525.2, |
| "kl_loss_7": 5464.0, |
| "learning_rate": 0.0009109364857414306, |
| "loss": 7809.6, |
| "step": 2010 |
| }, |
| { |
| "ce_loss_13": 3.7279494285583494, |
| "ce_loss_26": 2.9849789261817934, |
| "ce_loss_39": 2.7267052114009855, |
| "ce_loss_52": 1.4486449271440507, |
| "ce_loss_7": 4.1131413102149965, |
| "epoch": 0.202, |
| "grad_norm": 19.33420875029095, |
| "kl_loss_13": 4639.2, |
| "kl_loss_26": 3061.2, |
| "kl_loss_39": 2533.6, |
| "kl_loss_7": 5456.0, |
| "learning_rate": 0.0009100305426420956, |
| "loss": 7708.0, |
| "step": 2020 |
| }, |
| { |
| "ce_loss_13": 3.643654578924179, |
| "ce_loss_26": 2.9251492261886596, |
| "ce_loss_39": 2.667558515071869, |
| "ce_loss_52": 1.4118730872869492, |
| "ce_loss_7": 4.034631943702697, |
| "epoch": 0.203, |
| "grad_norm": 19.14819354564296, |
| "kl_loss_13": 4553.6, |
| "kl_loss_26": 3016.0, |
| "kl_loss_39": 2492.0, |
| "kl_loss_7": 5371.2, |
| "learning_rate": 0.0009091204705397484, |
| "loss": 7699.0, |
| "step": 2030 |
| }, |
| { |
| "ce_loss_13": 3.706267160177231, |
| "ce_loss_26": 2.971902164816856, |
| "ce_loss_39": 2.7119477689266205, |
| "ce_loss_52": 1.4564015328884126, |
| "ce_loss_7": 4.097871041297912, |
| "epoch": 0.204, |
| "grad_norm": 18.950983085399912, |
| "kl_loss_13": 4556.8, |
| "kl_loss_26": 2972.2, |
| "kl_loss_39": 2454.6, |
| "kl_loss_7": 5380.0, |
| "learning_rate": 0.0009082062785988049, |
| "loss": 7671.5, |
| "step": 2040 |
| }, |
| { |
| "ce_loss_13": 3.6526230454444883, |
| "ce_loss_26": 2.905240607261658, |
| "ce_loss_39": 2.654486656188965, |
| "ce_loss_52": 1.4092363178730012, |
| "ce_loss_7": 4.043174755573273, |
| "epoch": 0.205, |
| "grad_norm": 20.137932882736667, |
| "kl_loss_13": 4559.6, |
| "kl_loss_26": 2969.2, |
| "kl_loss_39": 2459.2, |
| "kl_loss_7": 5384.0, |
| "learning_rate": 0.0009072879760251679, |
| "loss": 7662.0, |
| "step": 2050 |
| }, |
| { |
| "ce_loss_13": 3.578403168916702, |
| "ce_loss_26": 2.8548426389694215, |
| "ce_loss_39": 2.615563529729843, |
| "ce_loss_52": 1.4107020199298859, |
| "ce_loss_7": 3.97204332947731, |
| "epoch": 0.206, |
| "grad_norm": 19.383234658877402, |
| "kl_loss_13": 4444.0, |
| "kl_loss_26": 2894.0, |
| "kl_loss_39": 2391.4, |
| "kl_loss_7": 5266.4, |
| "learning_rate": 0.0009063655720661341, |
| "loss": 7643.5, |
| "step": 2060 |
| }, |
| { |
| "ce_loss_13": 3.5892180263996125, |
| "ce_loss_26": 2.855096530914307, |
| "ce_loss_39": 2.6004712164402006, |
| "ce_loss_52": 1.4145165607333183, |
| "ce_loss_7": 3.983754909038544, |
| "epoch": 0.207, |
| "grad_norm": 19.811430759479535, |
| "kl_loss_13": 4467.2, |
| "kl_loss_26": 2904.0, |
| "kl_loss_39": 2385.0, |
| "kl_loss_7": 5291.2, |
| "learning_rate": 0.000905439076010301, |
| "loss": 7534.0, |
| "step": 2070 |
| }, |
| { |
| "ce_loss_13": 3.6425350308418274, |
| "ce_loss_26": 2.9249331414699555, |
| "ce_loss_39": 2.6687968969345093, |
| "ce_loss_52": 1.4602822691202164, |
| "ce_loss_7": 4.025002205371857, |
| "epoch": 0.208, |
| "grad_norm": 19.741988709477923, |
| "kl_loss_13": 4438.4, |
| "kl_loss_26": 2910.0, |
| "kl_loss_39": 2394.8, |
| "kl_loss_7": 5248.8, |
| "learning_rate": 0.0009045084971874737, |
| "loss": 7505.8, |
| "step": 2080 |
| }, |
| { |
| "ce_loss_13": 3.553661996126175, |
| "ce_loss_26": 2.8095623433589934, |
| "ce_loss_39": 2.561682888865471, |
| "ce_loss_52": 1.3855161294341087, |
| "ce_loss_7": 3.9535735607147218, |
| "epoch": 0.209, |
| "grad_norm": 18.406387003890178, |
| "kl_loss_13": 4454.8, |
| "kl_loss_26": 2860.4, |
| "kl_loss_39": 2354.2, |
| "kl_loss_7": 5295.2, |
| "learning_rate": 0.0009035738449685707, |
| "loss": 7532.3, |
| "step": 2090 |
| }, |
| { |
| "ce_loss_13": 3.659823089838028, |
| "ce_loss_26": 2.919918876886368, |
| "ce_loss_39": 2.67503222823143, |
| "ce_loss_52": 1.4627915531396867, |
| "ce_loss_7": 4.0562332451343535, |
| "epoch": 0.21, |
| "grad_norm": 19.907425841295773, |
| "kl_loss_13": 4488.8, |
| "kl_loss_26": 2909.2, |
| "kl_loss_39": 2407.0, |
| "kl_loss_7": 5320.8, |
| "learning_rate": 0.0009026351287655293, |
| "loss": 7517.4, |
| "step": 2100 |
| }, |
| { |
| "ce_loss_13": 3.630769556760788, |
| "ce_loss_26": 2.9068243861198426, |
| "ce_loss_39": 2.642093613743782, |
| "ce_loss_52": 1.4367542505264281, |
| "ce_loss_7": 4.022511690855026, |
| "epoch": 0.211, |
| "grad_norm": 18.196990652810108, |
| "kl_loss_13": 4459.2, |
| "kl_loss_26": 2926.8, |
| "kl_loss_39": 2395.2, |
| "kl_loss_7": 5276.0, |
| "learning_rate": 0.0009016923580312113, |
| "loss": 7412.6, |
| "step": 2110 |
| }, |
| { |
| "ce_loss_13": 3.682375580072403, |
| "ce_loss_26": 2.9616983354091646, |
| "ce_loss_39": 2.6908247590065004, |
| "ce_loss_52": 1.482297733426094, |
| "ce_loss_7": 4.079386693239212, |
| "epoch": 0.212, |
| "grad_norm": 20.88180475150302, |
| "kl_loss_13": 4465.6, |
| "kl_loss_26": 2920.8, |
| "kl_loss_39": 2382.8, |
| "kl_loss_7": 5297.6, |
| "learning_rate": 0.0009007455422593077, |
| "loss": 7402.8, |
| "step": 2120 |
| }, |
| { |
| "ce_loss_13": 3.5886090993881226, |
| "ce_loss_26": 2.8697936654090883, |
| "ce_loss_39": 2.6118789970874787, |
| "ce_loss_52": 1.4324451625347137, |
| "ce_loss_7": 3.989769661426544, |
| "epoch": 0.213, |
| "grad_norm": 21.410550894381295, |
| "kl_loss_13": 4406.0, |
| "kl_loss_26": 2862.8, |
| "kl_loss_39": 2337.2, |
| "kl_loss_7": 5236.0, |
| "learning_rate": 0.0008997946909842425, |
| "loss": 7376.6, |
| "step": 2130 |
| }, |
| { |
| "ce_loss_13": 3.5265793919563295, |
| "ce_loss_26": 2.8049169957637785, |
| "ce_loss_39": 2.5549218744039535, |
| "ce_loss_52": 1.4088917583227158, |
| "ce_loss_7": 3.920357757806778, |
| "epoch": 0.214, |
| "grad_norm": 18.12930136088373, |
| "kl_loss_13": 4307.2, |
| "kl_loss_26": 2766.8, |
| "kl_loss_39": 2250.8, |
| "kl_loss_7": 5132.8, |
| "learning_rate": 0.0008988398137810777, |
| "loss": 7289.0, |
| "step": 2140 |
| }, |
| { |
| "ce_loss_13": 3.493118005990982, |
| "ce_loss_26": 2.7619090020656585, |
| "ce_loss_39": 2.523107871413231, |
| "ce_loss_52": 1.3855519428849221, |
| "ce_loss_7": 3.8948814868927, |
| "epoch": 0.215, |
| "grad_norm": 19.085499107664276, |
| "kl_loss_13": 4317.6, |
| "kl_loss_26": 2765.2, |
| "kl_loss_39": 2265.6, |
| "kl_loss_7": 5148.8, |
| "learning_rate": 0.0008978809202654162, |
| "loss": 7322.3, |
| "step": 2150 |
| }, |
| { |
| "ce_loss_13": 3.5010905504226684, |
| "ce_loss_26": 2.7818336695432664, |
| "ce_loss_39": 2.5317496716976167, |
| "ce_loss_52": 1.410066269338131, |
| "ce_loss_7": 3.8946242213249205, |
| "epoch": 0.216, |
| "grad_norm": 18.12712360145358, |
| "kl_loss_13": 4244.8, |
| "kl_loss_26": 2719.8, |
| "kl_loss_39": 2212.8, |
| "kl_loss_7": 5073.6, |
| "learning_rate": 0.0008969180200933046, |
| "loss": 7287.8, |
| "step": 2160 |
| }, |
| { |
| "ce_loss_13": 3.583745849132538, |
| "ce_loss_26": 2.8594263792037964, |
| "ce_loss_39": 2.5962479442358015, |
| "ce_loss_52": 1.4458883255720139, |
| "ce_loss_7": 3.976805257797241, |
| "epoch": 0.217, |
| "grad_norm": 17.565470208207127, |
| "kl_loss_13": 4359.6, |
| "kl_loss_26": 2818.4, |
| "kl_loss_39": 2285.8, |
| "kl_loss_7": 5177.6, |
| "learning_rate": 0.0008959511229611376, |
| "loss": 7240.9, |
| "step": 2170 |
| }, |
| { |
| "ce_loss_13": 3.580790603160858, |
| "ce_loss_26": 2.859365826845169, |
| "ce_loss_39": 2.5985568940639494, |
| "ce_loss_52": 1.4648242503404618, |
| "ce_loss_7": 3.9631645143032075, |
| "epoch": 0.218, |
| "grad_norm": 18.371856212121592, |
| "kl_loss_13": 4316.0, |
| "kl_loss_26": 2762.4, |
| "kl_loss_39": 2248.2, |
| "kl_loss_7": 5127.2, |
| "learning_rate": 0.0008949802386055581, |
| "loss": 7227.3, |
| "step": 2180 |
| }, |
| { |
| "ce_loss_13": 3.549471515417099, |
| "ce_loss_26": 2.814970576763153, |
| "ce_loss_39": 2.561775863170624, |
| "ce_loss_52": 1.4231164067983628, |
| "ce_loss_7": 3.941384530067444, |
| "epoch": 0.219, |
| "grad_norm": 17.91149753664061, |
| "kl_loss_13": 4321.2, |
| "kl_loss_26": 2784.4, |
| "kl_loss_39": 2263.6, |
| "kl_loss_7": 5152.0, |
| "learning_rate": 0.0008940053768033609, |
| "loss": 7238.8, |
| "step": 2190 |
| }, |
| { |
| "ce_loss_13": 3.5581556379795076, |
| "ce_loss_26": 2.825929582118988, |
| "ce_loss_39": 2.570690780878067, |
| "ce_loss_52": 1.4427952721714974, |
| "ce_loss_7": 3.9514447808265687, |
| "epoch": 0.22, |
| "grad_norm": 19.745379707547666, |
| "kl_loss_13": 4304.8, |
| "kl_loss_26": 2754.0, |
| "kl_loss_39": 2234.6, |
| "kl_loss_7": 5132.8, |
| "learning_rate": 0.0008930265473713938, |
| "loss": 7236.5, |
| "step": 2200 |
| }, |
| { |
| "ce_loss_13": 3.522547519207001, |
| "ce_loss_26": 2.7868906617164613, |
| "ce_loss_39": 2.524843490123749, |
| "ce_loss_52": 1.3902505502104758, |
| "ce_loss_7": 3.9295816838741304, |
| "epoch": 0.221, |
| "grad_norm": 18.58799040514992, |
| "kl_loss_13": 4349.2, |
| "kl_loss_26": 2792.0, |
| "kl_loss_39": 2262.0, |
| "kl_loss_7": 5198.4, |
| "learning_rate": 0.0008920437601664579, |
| "loss": 7187.9, |
| "step": 2210 |
| }, |
| { |
| "ce_loss_13": 3.5173128962516786, |
| "ce_loss_26": 2.804593563079834, |
| "ce_loss_39": 2.557006138563156, |
| "ce_loss_52": 1.4581168740987778, |
| "ce_loss_7": 3.910993677377701, |
| "epoch": 0.222, |
| "grad_norm": 18.95814843355098, |
| "kl_loss_13": 4224.4, |
| "kl_loss_26": 2677.2, |
| "kl_loss_39": 2165.2, |
| "kl_loss_7": 5053.6, |
| "learning_rate": 0.0008910570250852097, |
| "loss": 7168.4, |
| "step": 2220 |
| }, |
| { |
| "ce_loss_13": 3.4375841438770296, |
| "ce_loss_26": 2.7280281484127045, |
| "ce_loss_39": 2.47977514564991, |
| "ce_loss_52": 1.389390866458416, |
| "ce_loss_7": 3.838480031490326, |
| "epoch": 0.223, |
| "grad_norm": 19.685999540006744, |
| "kl_loss_13": 4179.6, |
| "kl_loss_26": 2668.0, |
| "kl_loss_39": 2152.8, |
| "kl_loss_7": 5011.2, |
| "learning_rate": 0.0008900663520640604, |
| "loss": 7080.8, |
| "step": 2230 |
| }, |
| { |
| "ce_loss_13": 3.5171928703784943, |
| "ce_loss_26": 2.8187219202518463, |
| "ce_loss_39": 2.5553694486618044, |
| "ce_loss_52": 1.4541724801063538, |
| "ce_loss_7": 3.8974815249443053, |
| "epoch": 0.224, |
| "grad_norm": 26.45112973751635, |
| "kl_loss_13": 4224.4, |
| "kl_loss_26": 2710.0, |
| "kl_loss_39": 2180.4, |
| "kl_loss_7": 5080.0, |
| "learning_rate": 0.0008890717510790764, |
| "loss": 7105.5, |
| "step": 2240 |
| }, |
| { |
| "ce_loss_13": 3.5361606895923616, |
| "ce_loss_26": 2.8268598556518554, |
| "ce_loss_39": 2.5735931187868117, |
| "ce_loss_52": 1.451829667389393, |
| "ce_loss_7": 3.924015772342682, |
| "epoch": 0.225, |
| "grad_norm": 20.75511317660791, |
| "kl_loss_13": 4247.2, |
| "kl_loss_26": 2732.8, |
| "kl_loss_39": 2219.0, |
| "kl_loss_7": 5061.6, |
| "learning_rate": 0.0008880732321458784, |
| "loss": 7074.7, |
| "step": 2250 |
| }, |
| { |
| "ce_loss_13": 3.4491190731525423, |
| "ce_loss_26": 2.7567967534065247, |
| "ce_loss_39": 2.5041564613580705, |
| "ce_loss_52": 1.4379881560802459, |
| "ce_loss_7": 3.8363168060779573, |
| "epoch": 0.226, |
| "grad_norm": 18.822666188140545, |
| "kl_loss_13": 4102.8, |
| "kl_loss_26": 2621.6, |
| "kl_loss_39": 2107.6, |
| "kl_loss_7": 4917.6, |
| "learning_rate": 0.0008870708053195413, |
| "loss": 7003.4, |
| "step": 2260 |
| }, |
| { |
| "ce_loss_13": 3.4790355801582336, |
| "ce_loss_26": 2.765997165441513, |
| "ce_loss_39": 2.5106064915657043, |
| "ce_loss_52": 1.4210210233926772, |
| "ce_loss_7": 3.8746932446956635, |
| "epoch": 0.227, |
| "grad_norm": 19.45926661054129, |
| "kl_loss_13": 4188.4, |
| "kl_loss_26": 2681.2, |
| "kl_loss_39": 2162.6, |
| "kl_loss_7": 5009.6, |
| "learning_rate": 0.0008860644806944918, |
| "loss": 7002.8, |
| "step": 2270 |
| }, |
| { |
| "ce_loss_13": 3.589535415172577, |
| "ce_loss_26": 2.870089566707611, |
| "ce_loss_39": 2.5925551772117617, |
| "ce_loss_52": 1.4516636282205582, |
| "ce_loss_7": 3.9855311453342437, |
| "epoch": 0.228, |
| "grad_norm": 18.527281302332295, |
| "kl_loss_13": 4335.2, |
| "kl_loss_26": 2802.8, |
| "kl_loss_39": 2252.6, |
| "kl_loss_7": 5164.0, |
| "learning_rate": 0.0008850542684044079, |
| "loss": 7072.1, |
| "step": 2280 |
| }, |
| { |
| "ce_loss_13": 3.4438597559928894, |
| "ce_loss_26": 2.74727184176445, |
| "ce_loss_39": 2.4982927203178407, |
| "ce_loss_52": 1.435218185186386, |
| "ce_loss_7": 3.8294356882572176, |
| "epoch": 0.229, |
| "grad_norm": 18.07396826505032, |
| "kl_loss_13": 4082.8, |
| "kl_loss_26": 2585.6, |
| "kl_loss_39": 2087.0, |
| "kl_loss_7": 4892.8, |
| "learning_rate": 0.0008840401786221159, |
| "loss": 6974.9, |
| "step": 2290 |
| }, |
| { |
| "ce_loss_13": 3.476649820804596, |
| "ce_loss_26": 2.7891372203826905, |
| "ce_loss_39": 2.5390550673007963, |
| "ce_loss_52": 1.4611460983753204, |
| "ce_loss_7": 3.854980993270874, |
| "epoch": 0.23, |
| "grad_norm": 18.716359933554248, |
| "kl_loss_13": 4118.8, |
| "kl_loss_26": 2644.2, |
| "kl_loss_39": 2128.6, |
| "kl_loss_7": 4946.0, |
| "learning_rate": 0.000883022221559489, |
| "loss": 6901.3, |
| "step": 2300 |
| }, |
| { |
| "ce_loss_13": 3.467282909154892, |
| "ce_loss_26": 2.7604516625404356, |
| "ce_loss_39": 2.5003271818161013, |
| "ce_loss_52": 1.4461873590946197, |
| "ce_loss_7": 3.856851851940155, |
| "epoch": 0.231, |
| "grad_norm": 20.721250836400138, |
| "kl_loss_13": 4114.0, |
| "kl_loss_26": 2616.8, |
| "kl_loss_39": 2092.6, |
| "kl_loss_7": 4926.4, |
| "learning_rate": 0.0008820004074673434, |
| "loss": 6876.9, |
| "step": 2310 |
| }, |
| { |
| "ce_loss_13": 3.4266963064670564, |
| "ce_loss_26": 2.719339656829834, |
| "ce_loss_39": 2.4688956409692766, |
| "ce_loss_52": 1.4164521768689156, |
| "ce_loss_7": 3.8023972034454347, |
| "epoch": 0.232, |
| "grad_norm": 17.616553712409548, |
| "kl_loss_13": 4090.4, |
| "kl_loss_26": 2589.2, |
| "kl_loss_39": 2065.4, |
| "kl_loss_7": 4891.2, |
| "learning_rate": 0.0008809747466353355, |
| "loss": 6907.6, |
| "step": 2320 |
| }, |
| { |
| "ce_loss_13": 3.536805588006973, |
| "ce_loss_26": 2.8070395588874817, |
| "ce_loss_39": 2.5501783430576324, |
| "ce_loss_52": 1.4646585762500763, |
| "ce_loss_7": 3.9247563600540163, |
| "epoch": 0.233, |
| "grad_norm": 17.658525951814237, |
| "kl_loss_13": 4218.8, |
| "kl_loss_26": 2666.0, |
| "kl_loss_39": 2147.2, |
| "kl_loss_7": 5044.0, |
| "learning_rate": 0.0008799452493918585, |
| "loss": 6862.9, |
| "step": 2330 |
| }, |
| { |
| "ce_loss_13": 3.3707460284233095, |
| "ce_loss_26": 2.680304506421089, |
| "ce_loss_39": 2.4279863387346268, |
| "ce_loss_52": 1.4292149528861047, |
| "ce_loss_7": 3.7509031653404237, |
| "epoch": 0.234, |
| "grad_norm": 18.293581891673337, |
| "kl_loss_13": 3979.6, |
| "kl_loss_26": 2481.4, |
| "kl_loss_39": 1967.6, |
| "kl_loss_7": 4792.0, |
| "learning_rate": 0.0008789119261039385, |
| "loss": 6860.7, |
| "step": 2340 |
| }, |
| { |
| "ce_loss_13": 3.411047804355621, |
| "ce_loss_26": 2.7024976193904875, |
| "ce_loss_39": 2.4417600989341737, |
| "ce_loss_52": 1.404353639483452, |
| "ce_loss_7": 3.799528968334198, |
| "epoch": 0.235, |
| "grad_norm": 19.60391257341661, |
| "kl_loss_13": 4078.4, |
| "kl_loss_26": 2576.8, |
| "kl_loss_39": 2050.4, |
| "kl_loss_7": 4901.6, |
| "learning_rate": 0.0008778747871771292, |
| "loss": 6770.8, |
| "step": 2350 |
| }, |
| { |
| "ce_loss_13": 3.4045680582523348, |
| "ce_loss_26": 2.6985227525234223, |
| "ce_loss_39": 2.451327767968178, |
| "ce_loss_52": 1.4205562889575958, |
| "ce_loss_7": 3.7882447242736816, |
| "epoch": 0.236, |
| "grad_norm": 18.739033338884607, |
| "kl_loss_13": 4038.4, |
| "kl_loss_26": 2533.2, |
| "kl_loss_39": 2034.8, |
| "kl_loss_7": 4858.4, |
| "learning_rate": 0.0008768338430554083, |
| "loss": 6755.4, |
| "step": 2360 |
| }, |
| { |
| "ce_loss_13": 3.3626140534877775, |
| "ce_loss_26": 2.65916622877121, |
| "ce_loss_39": 2.4009654462337493, |
| "ce_loss_52": 1.3916789084672927, |
| "ce_loss_7": 3.749758929014206, |
| "epoch": 0.237, |
| "grad_norm": 20.138759568479667, |
| "kl_loss_13": 4022.0, |
| "kl_loss_26": 2521.4, |
| "kl_loss_39": 1993.2, |
| "kl_loss_7": 4829.6, |
| "learning_rate": 0.0008757891042210713, |
| "loss": 6791.4, |
| "step": 2370 |
| }, |
| { |
| "ce_loss_13": 3.381242650747299, |
| "ce_loss_26": 2.68309933245182, |
| "ce_loss_39": 2.433691692352295, |
| "ce_loss_52": 1.4095703065395355, |
| "ce_loss_7": 3.772293299436569, |
| "epoch": 0.238, |
| "grad_norm": 17.323586780740413, |
| "kl_loss_13": 4016.4, |
| "kl_loss_26": 2531.4, |
| "kl_loss_39": 2016.2, |
| "kl_loss_7": 4839.6, |
| "learning_rate": 0.0008747405811946271, |
| "loss": 6729.9, |
| "step": 2380 |
| }, |
| { |
| "ce_loss_13": 3.418868046998978, |
| "ce_loss_26": 2.7223224580287932, |
| "ce_loss_39": 2.4728009045124053, |
| "ce_loss_52": 1.446861308813095, |
| "ce_loss_7": 3.7994930267333986, |
| "epoch": 0.239, |
| "grad_norm": 17.668482550134954, |
| "kl_loss_13": 4034.0, |
| "kl_loss_26": 2556.0, |
| "kl_loss_39": 2037.4, |
| "kl_loss_7": 4842.4, |
| "learning_rate": 0.0008736882845346905, |
| "loss": 6764.7, |
| "step": 2390 |
| }, |
| { |
| "ce_loss_13": 3.407726752758026, |
| "ce_loss_26": 2.714660122990608, |
| "ce_loss_39": 2.4702648639678957, |
| "ce_loss_52": 1.4476186811923981, |
| "ce_loss_7": 3.7862841546535493, |
| "epoch": 0.24, |
| "grad_norm": 20.313458924733165, |
| "kl_loss_13": 3988.4, |
| "kl_loss_26": 2507.2, |
| "kl_loss_39": 2008.2, |
| "kl_loss_7": 4794.4, |
| "learning_rate": 0.0008726322248378774, |
| "loss": 6720.4, |
| "step": 2400 |
| }, |
| { |
| "ce_loss_13": 3.4095008313655852, |
| "ce_loss_26": 2.69794414639473, |
| "ce_loss_39": 2.44133580327034, |
| "ce_loss_52": 1.426843424141407, |
| "ce_loss_7": 3.8018106520175934, |
| "epoch": 0.241, |
| "grad_norm": 17.847824779807837, |
| "kl_loss_13": 4039.6, |
| "kl_loss_26": 2535.6, |
| "kl_loss_39": 2007.8, |
| "kl_loss_7": 4864.0, |
| "learning_rate": 0.0008715724127386971, |
| "loss": 6713.1, |
| "step": 2410 |
| }, |
| { |
| "ce_loss_13": 3.370608961582184, |
| "ce_loss_26": 2.688097137212753, |
| "ce_loss_39": 2.435939407348633, |
| "ce_loss_52": 1.4344248950481415, |
| "ce_loss_7": 3.745537704229355, |
| "epoch": 0.242, |
| "grad_norm": 18.03045500704746, |
| "kl_loss_13": 3943.6, |
| "kl_loss_26": 2497.6, |
| "kl_loss_39": 1983.4, |
| "kl_loss_7": 4736.8, |
| "learning_rate": 0.0008705088589094458, |
| "loss": 6611.0, |
| "step": 2420 |
| }, |
| { |
| "ce_loss_13": 3.4596225798130034, |
| "ce_loss_26": 2.7608898997306826, |
| "ce_loss_39": 2.5022788047790527, |
| "ce_loss_52": 1.4623101890087127, |
| "ce_loss_7": 3.839513373374939, |
| "epoch": 0.243, |
| "grad_norm": 18.190444229647287, |
| "kl_loss_13": 4058.0, |
| "kl_loss_26": 2575.6, |
| "kl_loss_39": 2054.6, |
| "kl_loss_7": 4862.4, |
| "learning_rate": 0.0008694415740600988, |
| "loss": 6638.0, |
| "step": 2430 |
| }, |
| { |
| "ce_loss_13": 3.370687645673752, |
| "ce_loss_26": 2.6674872994422913, |
| "ce_loss_39": 2.414305740594864, |
| "ce_loss_52": 1.4324225425720214, |
| "ce_loss_7": 3.767412984371185, |
| "epoch": 0.244, |
| "grad_norm": 18.848120466041497, |
| "kl_loss_13": 3940.4, |
| "kl_loss_26": 2449.4, |
| "kl_loss_39": 1932.2, |
| "kl_loss_7": 4769.6, |
| "learning_rate": 0.0008683705689382025, |
| "loss": 6641.0, |
| "step": 2440 |
| }, |
| { |
| "ce_loss_13": 3.347389942407608, |
| "ce_loss_26": 2.6603663861751556, |
| "ce_loss_39": 2.4186645448207855, |
| "ce_loss_52": 1.452422297000885, |
| "ce_loss_7": 3.731181102991104, |
| "epoch": 0.245, |
| "grad_norm": 17.05503964147942, |
| "kl_loss_13": 3888.8, |
| "kl_loss_26": 2418.6, |
| "kl_loss_39": 1913.8, |
| "kl_loss_7": 4694.4, |
| "learning_rate": 0.0008672958543287666, |
| "loss": 6617.1, |
| "step": 2450 |
| }, |
| { |
| "ce_loss_13": 3.341637074947357, |
| "ce_loss_26": 2.653514164686203, |
| "ce_loss_39": 2.4030585259199144, |
| "ce_loss_52": 1.4203058749437332, |
| "ce_loss_7": 3.7326664865016936, |
| "epoch": 0.246, |
| "grad_norm": 18.028238041954314, |
| "kl_loss_13": 3894.0, |
| "kl_loss_26": 2435.6, |
| "kl_loss_39": 1932.4, |
| "kl_loss_7": 4711.2, |
| "learning_rate": 0.0008662174410541554, |
| "loss": 6537.2, |
| "step": 2460 |
| }, |
| { |
| "ce_loss_13": 3.343936342000961, |
| "ce_loss_26": 2.6602561354637144, |
| "ce_loss_39": 2.4062414824962617, |
| "ce_loss_52": 1.4341940209269524, |
| "ce_loss_7": 3.7255902886390686, |
| "epoch": 0.247, |
| "grad_norm": 17.615419947143096, |
| "kl_loss_13": 3890.8, |
| "kl_loss_26": 2433.8, |
| "kl_loss_39": 1921.6, |
| "kl_loss_7": 4692.0, |
| "learning_rate": 0.0008651353399739787, |
| "loss": 6499.5, |
| "step": 2470 |
| }, |
| { |
| "ce_loss_13": 3.36987144947052, |
| "ce_loss_26": 2.6651594936847687, |
| "ce_loss_39": 2.41037335395813, |
| "ce_loss_52": 1.4215908780694009, |
| "ce_loss_7": 3.75968217253685, |
| "epoch": 0.248, |
| "grad_norm": 19.448152371934484, |
| "kl_loss_13": 3954.4, |
| "kl_loss_26": 2461.0, |
| "kl_loss_39": 1945.4, |
| "kl_loss_7": 4776.0, |
| "learning_rate": 0.0008640495619849821, |
| "loss": 6570.0, |
| "step": 2480 |
| }, |
| { |
| "ce_loss_13": 3.3932483792304993, |
| "ce_loss_26": 2.69803272485733, |
| "ce_loss_39": 2.448850151896477, |
| "ce_loss_52": 1.4729512989521027, |
| "ce_loss_7": 3.7767464458942412, |
| "epoch": 0.249, |
| "grad_norm": 17.76914722627013, |
| "kl_loss_13": 3932.8, |
| "kl_loss_26": 2449.6, |
| "kl_loss_39": 1942.6, |
| "kl_loss_7": 4743.2, |
| "learning_rate": 0.0008629601180209381, |
| "loss": 6472.8, |
| "step": 2490 |
| }, |
| { |
| "ce_loss_13": 3.3404706001281737, |
| "ce_loss_26": 2.6527740180492403, |
| "ce_loss_39": 2.403418445587158, |
| "ce_loss_52": 1.4374166071414947, |
| "ce_loss_7": 3.7279320538043974, |
| "epoch": 0.25, |
| "grad_norm": 18.043688934070087, |
| "kl_loss_13": 3897.6, |
| "kl_loss_26": 2421.6, |
| "kl_loss_39": 1902.0, |
| "kl_loss_7": 4716.0, |
| "learning_rate": 0.000861867019052535, |
| "loss": 6482.2, |
| "step": 2500 |
| }, |
| { |
| "ce_loss_13": 3.408062273263931, |
| "ce_loss_26": 2.704035770893097, |
| "ce_loss_39": 2.46220483481884, |
| "ce_loss_52": 1.4713353991508484, |
| "ce_loss_7": 3.795337921380997, |
| "epoch": 0.251, |
| "grad_norm": 19.728713333582007, |
| "kl_loss_13": 3944.4, |
| "kl_loss_26": 2444.8, |
| "kl_loss_39": 1942.4, |
| "kl_loss_7": 4760.4, |
| "learning_rate": 0.0008607702760872678, |
| "loss": 6463.8, |
| "step": 2510 |
| }, |
| { |
| "ce_loss_13": 3.388230836391449, |
| "ce_loss_26": 2.687982529401779, |
| "ce_loss_39": 2.429553496837616, |
| "ce_loss_52": 1.4563929110765457, |
| "ce_loss_7": 3.781431978940964, |
| "epoch": 0.252, |
| "grad_norm": 18.472247546339297, |
| "kl_loss_13": 3948.8, |
| "kl_loss_26": 2449.6, |
| "kl_loss_39": 1930.0, |
| "kl_loss_7": 4769.6, |
| "learning_rate": 0.0008596699001693256, |
| "loss": 6470.9, |
| "step": 2520 |
| }, |
| { |
| "ce_loss_13": 3.3468190252780916, |
| "ce_loss_26": 2.6537194311618806, |
| "ce_loss_39": 2.38609040081501, |
| "ce_loss_52": 1.4265122324228288, |
| "ce_loss_7": 3.7305682718753816, |
| "epoch": 0.253, |
| "grad_norm": 19.078325524688942, |
| "kl_loss_13": 3872.0, |
| "kl_loss_26": 2409.4, |
| "kl_loss_39": 1875.8, |
| "kl_loss_7": 4679.2, |
| "learning_rate": 0.0008585659023794818, |
| "loss": 6413.9, |
| "step": 2530 |
| }, |
| { |
| "ce_loss_13": 3.3305428862571715, |
| "ce_loss_26": 2.6303874611854554, |
| "ce_loss_39": 2.3649342864751817, |
| "ce_loss_52": 1.4202288419008255, |
| "ce_loss_7": 3.712818431854248, |
| "epoch": 0.254, |
| "grad_norm": 17.612892401319467, |
| "kl_loss_13": 3873.6, |
| "kl_loss_26": 2396.4, |
| "kl_loss_39": 1864.0, |
| "kl_loss_7": 4688.8, |
| "learning_rate": 0.0008574582938349817, |
| "loss": 6377.5, |
| "step": 2540 |
| }, |
| { |
| "ce_loss_13": 3.3927013695240023, |
| "ce_loss_26": 2.7204831540584564, |
| "ce_loss_39": 2.464686703681946, |
| "ce_loss_52": 1.487898689508438, |
| "ce_loss_7": 3.7826764941215516, |
| "epoch": 0.255, |
| "grad_norm": 19.269838722570377, |
| "kl_loss_13": 3875.2, |
| "kl_loss_26": 2432.8, |
| "kl_loss_39": 1907.8, |
| "kl_loss_7": 4701.6, |
| "learning_rate": 0.0008563470856894315, |
| "loss": 6365.3, |
| "step": 2550 |
| }, |
| { |
| "ce_loss_13": 3.366223245859146, |
| "ce_loss_26": 2.6793350696563722, |
| "ce_loss_39": 2.4282671988010405, |
| "ce_loss_52": 1.475812730193138, |
| "ce_loss_7": 3.7593341052532194, |
| "epoch": 0.256, |
| "grad_norm": 17.67137481592248, |
| "kl_loss_13": 3846.8, |
| "kl_loss_26": 2394.4, |
| "kl_loss_39": 1881.4, |
| "kl_loss_7": 4661.6, |
| "learning_rate": 0.0008552322891326845, |
| "loss": 6381.8, |
| "step": 2560 |
| }, |
| { |
| "ce_loss_13": 3.348971825838089, |
| "ce_loss_26": 2.6535234093666076, |
| "ce_loss_39": 2.396692654490471, |
| "ce_loss_52": 1.4521390795707703, |
| "ce_loss_7": 3.737420654296875, |
| "epoch": 0.257, |
| "grad_norm": 17.922791060277248, |
| "kl_loss_13": 3853.2, |
| "kl_loss_26": 2382.8, |
| "kl_loss_39": 1860.0, |
| "kl_loss_7": 4671.6, |
| "learning_rate": 0.0008541139153907296, |
| "loss": 6320.6, |
| "step": 2570 |
| }, |
| { |
| "ce_loss_13": 3.3175282776355743, |
| "ce_loss_26": 2.62116519510746, |
| "ce_loss_39": 2.373508110642433, |
| "ce_loss_52": 1.4496644467115403, |
| "ce_loss_7": 3.7005242466926576, |
| "epoch": 0.258, |
| "grad_norm": 17.919048466696356, |
| "kl_loss_13": 3806.0, |
| "kl_loss_26": 2325.8, |
| "kl_loss_39": 1815.6, |
| "kl_loss_7": 4616.8, |
| "learning_rate": 0.0008529919757255782, |
| "loss": 6324.1, |
| "step": 2580 |
| }, |
| { |
| "ce_loss_13": 3.3524767458438873, |
| "ce_loss_26": 2.672022157907486, |
| "ce_loss_39": 2.42145474255085, |
| "ce_loss_52": 1.490525448322296, |
| "ce_loss_7": 3.731249511241913, |
| "epoch": 0.259, |
| "grad_norm": 17.94332341656099, |
| "kl_loss_13": 3786.8, |
| "kl_loss_26": 2328.6, |
| "kl_loss_39": 1819.0, |
| "kl_loss_7": 4580.4, |
| "learning_rate": 0.0008518664814351503, |
| "loss": 6266.2, |
| "step": 2590 |
| }, |
| { |
| "ce_loss_13": 3.2249048352241516, |
| "ce_loss_26": 2.526710030436516, |
| "ce_loss_39": 2.288080096244812, |
| "ce_loss_52": 1.3951421514153481, |
| "ce_loss_7": 3.6117550313472746, |
| "epoch": 0.26, |
| "grad_norm": 17.453584143766424, |
| "kl_loss_13": 3747.6, |
| "kl_loss_26": 2273.8, |
| "kl_loss_39": 1787.0, |
| "kl_loss_7": 4560.8, |
| "learning_rate": 0.0008507374438531607, |
| "loss": 6263.6, |
| "step": 2600 |
| }, |
| { |
| "ce_loss_13": 3.385346031188965, |
| "ce_loss_26": 2.689622291922569, |
| "ce_loss_39": 2.437500995397568, |
| "ce_loss_52": 1.4798223778605462, |
| "ce_loss_7": 3.780226916074753, |
| "epoch": 0.261, |
| "grad_norm": 17.77067876803282, |
| "kl_loss_13": 3881.6, |
| "kl_loss_26": 2399.4, |
| "kl_loss_39": 1889.6, |
| "kl_loss_7": 4706.4, |
| "learning_rate": 0.0008496048743490053, |
| "loss": 6251.8, |
| "step": 2610 |
| }, |
| { |
| "ce_loss_13": 3.253863149881363, |
| "ce_loss_26": 2.5867208421230314, |
| "ce_loss_39": 2.3403410583734514, |
| "ce_loss_52": 1.4355336636304856, |
| "ce_loss_7": 3.628329038619995, |
| "epoch": 0.262, |
| "grad_norm": 18.13099048831459, |
| "kl_loss_13": 3714.0, |
| "kl_loss_26": 2278.6, |
| "kl_loss_39": 1781.2, |
| "kl_loss_7": 4501.2, |
| "learning_rate": 0.0008484687843276469, |
| "loss": 6230.8, |
| "step": 2620 |
| }, |
| { |
| "ce_loss_13": 3.308898413181305, |
| "ce_loss_26": 2.6307075321674347, |
| "ce_loss_39": 2.381870651245117, |
| "ce_loss_52": 1.4658461689949036, |
| "ce_loss_7": 3.686649763584137, |
| "epoch": 0.263, |
| "grad_norm": 17.246075540125826, |
| "kl_loss_13": 3767.2, |
| "kl_loss_26": 2322.6, |
| "kl_loss_39": 1819.4, |
| "kl_loss_7": 4560.0, |
| "learning_rate": 0.0008473291852294987, |
| "loss": 6261.4, |
| "step": 2630 |
| }, |
| { |
| "ce_loss_13": 3.301677256822586, |
| "ce_loss_26": 2.623979777097702, |
| "ce_loss_39": 2.3711125582456587, |
| "ce_loss_52": 1.4476893723011017, |
| "ce_loss_7": 3.686248630285263, |
| "epoch": 0.264, |
| "grad_norm": 18.725565567002363, |
| "kl_loss_13": 3772.0, |
| "kl_loss_26": 2323.8, |
| "kl_loss_39": 1813.4, |
| "kl_loss_7": 4567.6, |
| "learning_rate": 0.0008461860885303114, |
| "loss": 6186.3, |
| "step": 2640 |
| }, |
| { |
| "ce_loss_13": 3.3081180572509767, |
| "ce_loss_26": 2.5993283450603486, |
| "ce_loss_39": 2.346868970990181, |
| "ce_loss_52": 1.4209152534604073, |
| "ce_loss_7": 3.700265485048294, |
| "epoch": 0.265, |
| "grad_norm": 16.39724372868559, |
| "kl_loss_13": 3844.8, |
| "kl_loss_26": 2352.2, |
| "kl_loss_39": 1835.0, |
| "kl_loss_7": 4663.2, |
| "learning_rate": 0.000845039505741056, |
| "loss": 6224.4, |
| "step": 2650 |
| }, |
| { |
| "ce_loss_13": 3.2767783522605898, |
| "ce_loss_26": 2.610367941856384, |
| "ce_loss_39": 2.3734692215919493, |
| "ce_loss_52": 1.4838453635573388, |
| "ce_loss_7": 3.6544183135032653, |
| "epoch": 0.266, |
| "grad_norm": 18.646431501601995, |
| "kl_loss_13": 3666.4, |
| "kl_loss_26": 2232.8, |
| "kl_loss_39": 1743.2, |
| "kl_loss_7": 4455.2, |
| "learning_rate": 0.0008438894484078086, |
| "loss": 6164.1, |
| "step": 2660 |
| }, |
| { |
| "ce_loss_13": 3.187056082487106, |
| "ce_loss_26": 2.5067271828651427, |
| "ce_loss_39": 2.265036514401436, |
| "ce_loss_52": 1.3943608120083808, |
| "ce_loss_7": 3.5653835415840147, |
| "epoch": 0.267, |
| "grad_norm": 17.226899122841342, |
| "kl_loss_13": 3695.2, |
| "kl_loss_26": 2239.8, |
| "kl_loss_39": 1740.2, |
| "kl_loss_7": 4491.6, |
| "learning_rate": 0.0008427359281116334, |
| "loss": 6116.4, |
| "step": 2670 |
| }, |
| { |
| "ce_loss_13": 3.2635743618011475, |
| "ce_loss_26": 2.5878145933151244, |
| "ce_loss_39": 2.344261533021927, |
| "ce_loss_52": 1.436160460114479, |
| "ce_loss_7": 3.642155331373215, |
| "epoch": 0.268, |
| "grad_norm": 17.358976250170773, |
| "kl_loss_13": 3715.6, |
| "kl_loss_26": 2286.6, |
| "kl_loss_39": 1782.2, |
| "kl_loss_7": 4510.8, |
| "learning_rate": 0.0008415789564684673, |
| "loss": 6120.1, |
| "step": 2680 |
| }, |
| { |
| "ce_loss_13": 3.217154061794281, |
| "ce_loss_26": 2.55003065764904, |
| "ce_loss_39": 2.3082681566476824, |
| "ce_loss_52": 1.4278038635849952, |
| "ce_loss_7": 3.602716547250748, |
| "epoch": 0.269, |
| "grad_norm": 17.314540535001434, |
| "kl_loss_13": 3655.2, |
| "kl_loss_26": 2225.8, |
| "kl_loss_39": 1731.0, |
| "kl_loss_7": 4458.8, |
| "learning_rate": 0.0008404185451290017, |
| "loss": 6184.5, |
| "step": 2690 |
| }, |
| { |
| "ce_loss_13": 3.24159716963768, |
| "ce_loss_26": 2.54996337890625, |
| "ce_loss_39": 2.3031594485044478, |
| "ce_loss_52": 1.4209882378578187, |
| "ce_loss_7": 3.6337377846241, |
| "epoch": 0.27, |
| "grad_norm": 17.492742712849175, |
| "kl_loss_13": 3726.4, |
| "kl_loss_26": 2257.8, |
| "kl_loss_39": 1756.2, |
| "kl_loss_7": 4549.6, |
| "learning_rate": 0.0008392547057785661, |
| "loss": 6062.5, |
| "step": 2700 |
| }, |
| { |
| "ce_loss_13": 3.227788990736008, |
| "ce_loss_26": 2.5358716517686846, |
| "ce_loss_39": 2.285151606798172, |
| "ce_loss_52": 1.4203684866428374, |
| "ce_loss_7": 3.6037886083126067, |
| "epoch": 0.271, |
| "grad_norm": 17.53543108395465, |
| "kl_loss_13": 3669.6, |
| "kl_loss_26": 2215.6, |
| "kl_loss_39": 1712.0, |
| "kl_loss_7": 4470.8, |
| "learning_rate": 0.0008380874501370098, |
| "loss": 6120.8, |
| "step": 2710 |
| }, |
| { |
| "ce_loss_13": 3.1636491239070894, |
| "ce_loss_26": 2.5045939922332763, |
| "ce_loss_39": 2.2759897857904434, |
| "ce_loss_52": 1.4353285342454911, |
| "ce_loss_7": 3.5319547176361086, |
| "epoch": 0.272, |
| "grad_norm": 18.586839239007904, |
| "kl_loss_13": 3539.2, |
| "kl_loss_26": 2133.8, |
| "kl_loss_39": 1666.0, |
| "kl_loss_7": 4328.8, |
| "learning_rate": 0.0008369167899585841, |
| "loss": 6051.0, |
| "step": 2720 |
| }, |
| { |
| "ce_loss_13": 3.221328115463257, |
| "ce_loss_26": 2.535122260451317, |
| "ce_loss_39": 2.2940177261829375, |
| "ce_loss_52": 1.419254219532013, |
| "ce_loss_7": 3.6125965118408203, |
| "epoch": 0.273, |
| "grad_norm": 17.658632608609814, |
| "kl_loss_13": 3688.0, |
| "kl_loss_26": 2233.2, |
| "kl_loss_39": 1733.0, |
| "kl_loss_7": 4510.0, |
| "learning_rate": 0.0008357427370318238, |
| "loss": 6045.8, |
| "step": 2730 |
| }, |
| { |
| "ce_loss_13": 3.2239345014095306, |
| "ce_loss_26": 2.556156021356583, |
| "ce_loss_39": 2.3110478937625887, |
| "ce_loss_52": 1.4514835059642792, |
| "ce_loss_7": 3.6023066878318786, |
| "epoch": 0.274, |
| "grad_norm": 18.16156755157877, |
| "kl_loss_13": 3623.2, |
| "kl_loss_26": 2184.8, |
| "kl_loss_39": 1687.0, |
| "kl_loss_7": 4421.6, |
| "learning_rate": 0.0008345653031794292, |
| "loss": 6081.8, |
| "step": 2740 |
| }, |
| { |
| "ce_loss_13": 3.248935067653656, |
| "ce_loss_26": 2.571800184249878, |
| "ce_loss_39": 2.3257440716028213, |
| "ce_loss_52": 1.4555991351604463, |
| "ce_loss_7": 3.6242997109889985, |
| "epoch": 0.275, |
| "grad_norm": 17.486859359677037, |
| "kl_loss_13": 3626.4, |
| "kl_loss_26": 2208.0, |
| "kl_loss_39": 1712.0, |
| "kl_loss_7": 4417.6, |
| "learning_rate": 0.0008333845002581458, |
| "loss": 5996.5, |
| "step": 2750 |
| }, |
| { |
| "ce_loss_13": 3.2696946620941163, |
| "ce_loss_26": 2.5927825689315798, |
| "ce_loss_39": 2.3452927708625793, |
| "ce_loss_52": 1.4668353974819184, |
| "ce_loss_7": 3.635032969713211, |
| "epoch": 0.276, |
| "grad_norm": 17.043532441273413, |
| "kl_loss_13": 3660.4, |
| "kl_loss_26": 2235.2, |
| "kl_loss_39": 1729.2, |
| "kl_loss_7": 4435.2, |
| "learning_rate": 0.0008322003401586462, |
| "loss": 5989.9, |
| "step": 2760 |
| }, |
| { |
| "ce_loss_13": 3.187331736087799, |
| "ce_loss_26": 2.537285569310188, |
| "ce_loss_39": 2.296916127204895, |
| "ce_loss_52": 1.4428926169872285, |
| "ce_loss_7": 3.55471009016037, |
| "epoch": 0.277, |
| "grad_norm": 18.300410919099367, |
| "kl_loss_13": 3558.4, |
| "kl_loss_26": 2165.8, |
| "kl_loss_39": 1682.4, |
| "kl_loss_7": 4338.0, |
| "learning_rate": 0.0008310128348054094, |
| "loss": 5970.6, |
| "step": 2770 |
| }, |
| { |
| "ce_loss_13": 3.2006444096565247, |
| "ce_loss_26": 2.5205237597227095, |
| "ce_loss_39": 2.282972750067711, |
| "ce_loss_52": 1.4260737136006356, |
| "ce_loss_7": 3.5821855068206787, |
| "epoch": 0.278, |
| "grad_norm": 17.549429821325518, |
| "kl_loss_13": 3649.2, |
| "kl_loss_26": 2206.8, |
| "kl_loss_39": 1708.8, |
| "kl_loss_7": 4441.2, |
| "learning_rate": 0.0008298219961566008, |
| "loss": 5976.9, |
| "step": 2780 |
| }, |
| { |
| "ce_loss_13": 3.1831609129905702, |
| "ce_loss_26": 2.4997926205396652, |
| "ce_loss_39": 2.2601052969694138, |
| "ce_loss_52": 1.403978604078293, |
| "ce_loss_7": 3.553097301721573, |
| "epoch": 0.279, |
| "grad_norm": 17.798788155683486, |
| "kl_loss_13": 3626.8, |
| "kl_loss_26": 2180.8, |
| "kl_loss_39": 1685.6, |
| "kl_loss_7": 4416.0, |
| "learning_rate": 0.0008286278362039527, |
| "loss": 5901.3, |
| "step": 2790 |
| }, |
| { |
| "ce_loss_13": 3.2061972856521606, |
| "ce_loss_26": 2.5348829567432403, |
| "ce_loss_39": 2.29689359664917, |
| "ce_loss_52": 1.4524233996868134, |
| "ce_loss_7": 3.576086735725403, |
| "epoch": 0.28, |
| "grad_norm": 17.920526019853103, |
| "kl_loss_13": 3566.0, |
| "kl_loss_26": 2145.8, |
| "kl_loss_39": 1659.0, |
| "kl_loss_7": 4341.6, |
| "learning_rate": 0.0008274303669726426, |
| "loss": 5875.7, |
| "step": 2800 |
| }, |
| { |
| "ce_loss_13": 3.2895997047424315, |
| "ce_loss_26": 2.6019150614738464, |
| "ce_loss_39": 2.3554980546236037, |
| "ce_loss_52": 1.4626183837652207, |
| "ce_loss_7": 3.680211156606674, |
| "epoch": 0.281, |
| "grad_norm": 18.339396286679403, |
| "kl_loss_13": 3746.0, |
| "kl_loss_26": 2276.4, |
| "kl_loss_39": 1760.4, |
| "kl_loss_7": 4566.0, |
| "learning_rate": 0.0008262296005211721, |
| "loss": 5938.8, |
| "step": 2810 |
| }, |
| { |
| "ce_loss_13": 3.1840124845504763, |
| "ce_loss_26": 2.5105081349611282, |
| "ce_loss_39": 2.2711715549230576, |
| "ce_loss_52": 1.4384785890579224, |
| "ce_loss_7": 3.5735177397727966, |
| "epoch": 0.282, |
| "grad_norm": 17.54448826978921, |
| "kl_loss_13": 3568.4, |
| "kl_loss_26": 2126.0, |
| "kl_loss_39": 1635.8, |
| "kl_loss_7": 4378.4, |
| "learning_rate": 0.0008250255489412463, |
| "loss": 5922.5, |
| "step": 2820 |
| }, |
| { |
| "ce_loss_13": 3.2147216141223907, |
| "ce_loss_26": 2.5306088238954545, |
| "ce_loss_39": 2.2879262059926986, |
| "ce_loss_52": 1.4329912930727005, |
| "ce_loss_7": 3.6062001168727873, |
| "epoch": 0.283, |
| "grad_norm": 16.224521909183913, |
| "kl_loss_13": 3634.4, |
| "kl_loss_26": 2182.2, |
| "kl_loss_39": 1686.4, |
| "kl_loss_7": 4456.0, |
| "learning_rate": 0.0008238182243576511, |
| "loss": 5870.9, |
| "step": 2830 |
| }, |
| { |
| "ce_loss_13": 3.204808014631271, |
| "ce_loss_26": 2.529933416843414, |
| "ce_loss_39": 2.2772836655378343, |
| "ce_loss_52": 1.4382712185382842, |
| "ce_loss_7": 3.593477213382721, |
| "epoch": 0.284, |
| "grad_norm": 17.85408893335792, |
| "kl_loss_13": 3598.4, |
| "kl_loss_26": 2175.4, |
| "kl_loss_39": 1665.8, |
| "kl_loss_7": 4413.6, |
| "learning_rate": 0.0008226076389281315, |
| "loss": 5857.1, |
| "step": 2840 |
| }, |
| { |
| "ce_loss_13": 3.137856882810593, |
| "ce_loss_26": 2.4886497616767884, |
| "ce_loss_39": 2.2476108491420748, |
| "ce_loss_52": 1.4388306617736817, |
| "ce_loss_7": 3.5120018839836122, |
| "epoch": 0.285, |
| "grad_norm": 17.527104476469322, |
| "kl_loss_13": 3468.4, |
| "kl_loss_26": 2085.8, |
| "kl_loss_39": 1594.4, |
| "kl_loss_7": 4250.0, |
| "learning_rate": 0.0008213938048432696, |
| "loss": 5806.8, |
| "step": 2850 |
| }, |
| { |
| "ce_loss_13": 3.134007251262665, |
| "ce_loss_26": 2.4605695813894273, |
| "ce_loss_39": 2.2274533331394197, |
| "ce_loss_52": 1.4070687741041183, |
| "ce_loss_7": 3.51087743639946, |
| "epoch": 0.286, |
| "grad_norm": 16.81428229570175, |
| "kl_loss_13": 3532.8, |
| "kl_loss_26": 2101.0, |
| "kl_loss_39": 1616.2, |
| "kl_loss_7": 4327.2, |
| "learning_rate": 0.0008201767343263612, |
| "loss": 5809.7, |
| "step": 2860 |
| }, |
| { |
| "ce_loss_13": 3.1623673915863035, |
| "ce_loss_26": 2.497657111287117, |
| "ce_loss_39": 2.2517473757267, |
| "ce_loss_52": 1.4269344687461853, |
| "ce_loss_7": 3.543222689628601, |
| "epoch": 0.287, |
| "grad_norm": 16.70636713163243, |
| "kl_loss_13": 3541.2, |
| "kl_loss_26": 2128.4, |
| "kl_loss_39": 1624.8, |
| "kl_loss_7": 4340.0, |
| "learning_rate": 0.0008189564396332927, |
| "loss": 5789.5, |
| "step": 2870 |
| }, |
| { |
| "ce_loss_13": 3.1600242078304293, |
| "ce_loss_26": 2.4901143670082093, |
| "ce_loss_39": 2.262405735254288, |
| "ce_loss_52": 1.447944176197052, |
| "ce_loss_7": 3.5380080163478853, |
| "epoch": 0.288, |
| "grad_norm": 17.918386343579222, |
| "kl_loss_13": 3481.6, |
| "kl_loss_26": 2059.8, |
| "kl_loss_39": 1588.8, |
| "kl_loss_7": 4287.6, |
| "learning_rate": 0.0008177329330524181, |
| "loss": 5812.1, |
| "step": 2880 |
| }, |
| { |
| "ce_loss_13": 3.174392342567444, |
| "ce_loss_26": 2.5159962266683578, |
| "ce_loss_39": 2.2794058710336684, |
| "ce_loss_52": 1.444807243347168, |
| "ce_loss_7": 3.5484564363956452, |
| "epoch": 0.289, |
| "grad_norm": 19.196721110708168, |
| "kl_loss_13": 3507.2, |
| "kl_loss_26": 2102.6, |
| "kl_loss_39": 1624.0, |
| "kl_loss_7": 4290.8, |
| "learning_rate": 0.0008165062269044352, |
| "loss": 5808.4, |
| "step": 2890 |
| }, |
| { |
| "ce_loss_13": 3.1698272943496706, |
| "ce_loss_26": 2.506984257698059, |
| "ce_loss_39": 2.2710763216018677, |
| "ce_loss_52": 1.4491374969482422, |
| "ce_loss_7": 3.553741979598999, |
| "epoch": 0.29, |
| "grad_norm": 16.91235339946444, |
| "kl_loss_13": 3511.2, |
| "kl_loss_26": 2109.2, |
| "kl_loss_39": 1627.2, |
| "kl_loss_7": 4317.6, |
| "learning_rate": 0.0008152763335422613, |
| "loss": 5792.2, |
| "step": 2900 |
| }, |
| { |
| "ce_loss_13": 3.115260285139084, |
| "ce_loss_26": 2.4620468825101853, |
| "ce_loss_39": 2.2224705785512926, |
| "ce_loss_52": 1.4216517835855484, |
| "ce_loss_7": 3.49349564909935, |
| "epoch": 0.291, |
| "grad_norm": 19.52390820528971, |
| "kl_loss_13": 3444.4, |
| "kl_loss_26": 2058.8, |
| "kl_loss_39": 1575.2, |
| "kl_loss_7": 4239.2, |
| "learning_rate": 0.0008140432653509088, |
| "loss": 5744.6, |
| "step": 2910 |
| }, |
| { |
| "ce_loss_13": 3.094448319077492, |
| "ce_loss_26": 2.4242703199386595, |
| "ce_loss_39": 2.1858216524124146, |
| "ce_loss_52": 1.4000759646296501, |
| "ce_loss_7": 3.4697431921958923, |
| "epoch": 0.292, |
| "grad_norm": 16.436813971821554, |
| "kl_loss_13": 3432.0, |
| "kl_loss_26": 2027.8, |
| "kl_loss_39": 1551.4, |
| "kl_loss_7": 4216.4, |
| "learning_rate": 0.0008128070347473608, |
| "loss": 5696.7, |
| "step": 2920 |
| }, |
| { |
| "ce_loss_13": 3.1132335126399995, |
| "ce_loss_26": 2.4610098242759704, |
| "ce_loss_39": 2.2259542405605317, |
| "ce_loss_52": 1.4294333100318908, |
| "ce_loss_7": 3.48697971701622, |
| "epoch": 0.293, |
| "grad_norm": 16.527383629623685, |
| "kl_loss_13": 3448.4, |
| "kl_loss_26": 2043.0, |
| "kl_loss_39": 1562.4, |
| "kl_loss_7": 4237.2, |
| "learning_rate": 0.0008115676541804455, |
| "loss": 5734.0, |
| "step": 2930 |
| }, |
| { |
| "ce_loss_13": 3.0772423684597014, |
| "ce_loss_26": 2.413277891278267, |
| "ce_loss_39": 2.179169711470604, |
| "ce_loss_52": 1.3933877736330031, |
| "ce_loss_7": 3.4470324754714965, |
| "epoch": 0.294, |
| "grad_norm": 17.18835839034016, |
| "kl_loss_13": 3431.6, |
| "kl_loss_26": 2023.0, |
| "kl_loss_39": 1542.8, |
| "kl_loss_7": 4223.6, |
| "learning_rate": 0.0008103251361307119, |
| "loss": 5705.55, |
| "step": 2940 |
| }, |
| { |
| "ce_loss_13": 3.093912643194199, |
| "ce_loss_26": 2.4372138679027557, |
| "ce_loss_39": 2.2068597853183745, |
| "ce_loss_52": 1.4339108556509017, |
| "ce_loss_7": 3.4713816404342652, |
| "epoch": 0.295, |
| "grad_norm": 16.78852919489637, |
| "kl_loss_13": 3395.2, |
| "kl_loss_26": 1998.6, |
| "kl_loss_39": 1533.6, |
| "kl_loss_7": 4186.4, |
| "learning_rate": 0.0008090794931103026, |
| "loss": 5641.3, |
| "step": 2950 |
| }, |
| { |
| "ce_loss_13": 3.1308993637561797, |
| "ce_loss_26": 2.4708112478256226, |
| "ce_loss_39": 2.22915124297142, |
| "ce_loss_52": 1.4374129235744477, |
| "ce_loss_7": 3.4991187393665313, |
| "epoch": 0.296, |
| "grad_norm": 16.768783545647338, |
| "kl_loss_13": 3436.4, |
| "kl_loss_26": 2035.0, |
| "kl_loss_39": 1548.6, |
| "kl_loss_7": 4216.8, |
| "learning_rate": 0.0008078307376628291, |
| "loss": 5645.6, |
| "step": 2960 |
| }, |
| { |
| "ce_loss_13": 3.0990765929222106, |
| "ce_loss_26": 2.452856171131134, |
| "ce_loss_39": 2.21165874004364, |
| "ce_loss_52": 1.4263496309518815, |
| "ce_loss_7": 3.472361743450165, |
| "epoch": 0.297, |
| "grad_norm": 16.652386900592916, |
| "kl_loss_13": 3401.6, |
| "kl_loss_26": 2024.0, |
| "kl_loss_39": 1532.8, |
| "kl_loss_7": 4181.6, |
| "learning_rate": 0.000806578882363245, |
| "loss": 5645.6, |
| "step": 2970 |
| }, |
| { |
| "ce_loss_13": 3.092843067646027, |
| "ce_loss_26": 2.423402965068817, |
| "ce_loss_39": 2.1854313611984253, |
| "ce_loss_52": 1.4033612102270125, |
| "ce_loss_7": 3.474502944946289, |
| "epoch": 0.298, |
| "grad_norm": 17.707727518863294, |
| "kl_loss_13": 3432.4, |
| "kl_loss_26": 2016.2, |
| "kl_loss_39": 1530.6, |
| "kl_loss_7": 4229.2, |
| "learning_rate": 0.0008053239398177191, |
| "loss": 5651.6, |
| "step": 2980 |
| }, |
| { |
| "ce_loss_13": 3.0975674211978914, |
| "ce_loss_26": 2.4374621868133546, |
| "ce_loss_39": 2.209611228108406, |
| "ce_loss_52": 1.4303795397281647, |
| "ce_loss_7": 3.475492590665817, |
| "epoch": 0.299, |
| "grad_norm": 17.746850084884183, |
| "kl_loss_13": 3416.8, |
| "kl_loss_26": 2002.4, |
| "kl_loss_39": 1530.8, |
| "kl_loss_7": 4214.8, |
| "learning_rate": 0.0008040659226635089, |
| "loss": 5630.2, |
| "step": 2990 |
| }, |
| { |
| "ce_loss_13": 3.089507430791855, |
| "ce_loss_26": 2.417942848801613, |
| "ce_loss_39": 2.1790529817342756, |
| "ce_loss_52": 1.3988826781511308, |
| "ce_loss_7": 3.468764144182205, |
| "epoch": 0.3, |
| "grad_norm": 17.0619103802906, |
| "kl_loss_13": 3422.4, |
| "kl_loss_26": 2019.4, |
| "kl_loss_39": 1533.2, |
| "kl_loss_7": 4215.2, |
| "learning_rate": 0.0008028048435688333, |
| "loss": 5562.8, |
| "step": 3000 |
| }, |
| { |
| "ce_loss_13": 3.1326099216938017, |
| "ce_loss_26": 2.471994733810425, |
| "ce_loss_39": 2.2380873382091524, |
| "ce_loss_52": 1.458441223204136, |
| "ce_loss_7": 3.502814435958862, |
| "epoch": 0.301, |
| "grad_norm": 17.222204651935485, |
| "kl_loss_13": 3436.8, |
| "kl_loss_26": 2039.6, |
| "kl_loss_39": 1559.8, |
| "kl_loss_7": 4214.8, |
| "learning_rate": 0.0008015407152327448, |
| "loss": 5664.1, |
| "step": 3010 |
| }, |
| { |
| "ce_loss_13": 3.1731098294258118, |
| "ce_loss_26": 2.504976212978363, |
| "ce_loss_39": 2.260189512372017, |
| "ce_loss_52": 1.4524748474359512, |
| "ce_loss_7": 3.555993539094925, |
| "epoch": 0.302, |
| "grad_norm": 16.72113952146357, |
| "kl_loss_13": 3518.4, |
| "kl_loss_26": 2087.0, |
| "kl_loss_39": 1585.0, |
| "kl_loss_7": 4313.2, |
| "learning_rate": 0.0008002735503850016, |
| "loss": 5589.3, |
| "step": 3020 |
| }, |
| { |
| "ce_loss_13": 3.118546891212463, |
| "ce_loss_26": 2.445932698249817, |
| "ce_loss_39": 2.2121699869632723, |
| "ce_loss_52": 1.4456641212105752, |
| "ce_loss_7": 3.491698741912842, |
| "epoch": 0.303, |
| "grad_norm": 16.572401194454876, |
| "kl_loss_13": 3394.8, |
| "kl_loss_26": 1984.4, |
| "kl_loss_39": 1511.2, |
| "kl_loss_7": 4167.2, |
| "learning_rate": 0.0007990033617859396, |
| "loss": 5580.5, |
| "step": 3030 |
| }, |
| { |
| "ce_loss_13": 3.10419015288353, |
| "ce_loss_26": 2.447344717383385, |
| "ce_loss_39": 2.212172231078148, |
| "ce_loss_52": 1.435505247116089, |
| "ce_loss_7": 3.477084743976593, |
| "epoch": 0.304, |
| "grad_norm": 18.041012229096975, |
| "kl_loss_13": 3420.8, |
| "kl_loss_26": 2017.0, |
| "kl_loss_39": 1525.6, |
| "kl_loss_7": 4208.8, |
| "learning_rate": 0.000797730162226344, |
| "loss": 5556.8, |
| "step": 3040 |
| }, |
| { |
| "ce_loss_13": 3.0508037239313124, |
| "ce_loss_26": 2.3910141468048094, |
| "ce_loss_39": 2.161831411719322, |
| "ce_loss_52": 1.3925497516989709, |
| "ce_loss_7": 3.434678375720978, |
| "epoch": 0.305, |
| "grad_norm": 18.2507403656971, |
| "kl_loss_13": 3360.8, |
| "kl_loss_26": 1961.2, |
| "kl_loss_39": 1491.3, |
| "kl_loss_7": 4164.0, |
| "learning_rate": 0.0007964539645273203, |
| "loss": 5538.7, |
| "step": 3050 |
| }, |
| { |
| "ce_loss_13": 3.143118643760681, |
| "ce_loss_26": 2.503050500154495, |
| "ce_loss_39": 2.269149711728096, |
| "ce_loss_52": 1.4877858996391295, |
| "ce_loss_7": 3.510826712846756, |
| "epoch": 0.306, |
| "grad_norm": 17.20244258760552, |
| "kl_loss_13": 3390.4, |
| "kl_loss_26": 2023.0, |
| "kl_loss_39": 1542.0, |
| "kl_loss_7": 4166.8, |
| "learning_rate": 0.000795174781540165, |
| "loss": 5547.7, |
| "step": 3060 |
| }, |
| { |
| "ce_loss_13": 3.0940939664840696, |
| "ce_loss_26": 2.4377844393253327, |
| "ce_loss_39": 2.2110770642757416, |
| "ce_loss_52": 1.453443130850792, |
| "ce_loss_7": 3.461427628993988, |
| "epoch": 0.307, |
| "grad_norm": 16.19915727951882, |
| "kl_loss_13": 3353.6, |
| "kl_loss_26": 1960.2, |
| "kl_loss_39": 1487.5, |
| "kl_loss_7": 4131.2, |
| "learning_rate": 0.0007938926261462366, |
| "loss": 5534.3, |
| "step": 3070 |
| }, |
| { |
| "ce_loss_13": 3.099123537540436, |
| "ce_loss_26": 2.428412067890167, |
| "ce_loss_39": 2.1952255785465242, |
| "ce_loss_52": 1.4312659561634065, |
| "ce_loss_7": 3.477120190858841, |
| "epoch": 0.308, |
| "grad_norm": 16.90807792363964, |
| "kl_loss_13": 3392.0, |
| "kl_loss_26": 1979.6, |
| "kl_loss_39": 1498.8, |
| "kl_loss_7": 4186.8, |
| "learning_rate": 0.0007926075112568258, |
| "loss": 5523.9, |
| "step": 3080 |
| }, |
| { |
| "ce_loss_13": 3.0900339841842652, |
| "ce_loss_26": 2.429807424545288, |
| "ce_loss_39": 2.1938013613224028, |
| "ce_loss_52": 1.4448837220668793, |
| "ce_loss_7": 3.460611253976822, |
| "epoch": 0.309, |
| "grad_norm": 17.278764306039758, |
| "kl_loss_13": 3363.2, |
| "kl_loss_26": 1959.6, |
| "kl_loss_39": 1476.2, |
| "kl_loss_7": 4153.6, |
| "learning_rate": 0.0007913194498130252, |
| "loss": 5481.8, |
| "step": 3090 |
| }, |
| { |
| "ce_loss_13": 3.0705978155136107, |
| "ce_loss_26": 2.415473333001137, |
| "ce_loss_39": 2.1859600633382796, |
| "ce_loss_52": 1.4387344419956207, |
| "ce_loss_7": 3.4460779249668123, |
| "epoch": 0.31, |
| "grad_norm": 17.744380665090848, |
| "kl_loss_13": 3328.8, |
| "kl_loss_26": 1937.2, |
| "kl_loss_39": 1461.6, |
| "kl_loss_7": 4109.6, |
| "learning_rate": 0.0007900284547855992, |
| "loss": 5494.5, |
| "step": 3100 |
| }, |
| { |
| "ce_loss_13": 3.1120304703712462, |
| "ce_loss_26": 2.437400758266449, |
| "ce_loss_39": 2.1967616409063337, |
| "ce_loss_52": 1.4447504609823227, |
| "ce_loss_7": 3.4870758295059203, |
| "epoch": 0.311, |
| "grad_norm": 17.196298155702294, |
| "kl_loss_13": 3414.8, |
| "kl_loss_26": 1997.2, |
| "kl_loss_39": 1507.4, |
| "kl_loss_7": 4197.6, |
| "learning_rate": 0.0007887345391748532, |
| "loss": 5492.4, |
| "step": 3110 |
| }, |
| { |
| "ce_loss_13": 3.0714461147785186, |
| "ce_loss_26": 2.410064917802811, |
| "ce_loss_39": 2.176686418056488, |
| "ce_loss_52": 1.4264434427022934, |
| "ce_loss_7": 3.440903478860855, |
| "epoch": 0.312, |
| "grad_norm": 17.274723589437542, |
| "kl_loss_13": 3352.8, |
| "kl_loss_26": 1965.6, |
| "kl_loss_39": 1480.0, |
| "kl_loss_7": 4134.0, |
| "learning_rate": 0.0007874377160105036, |
| "loss": 5478.5, |
| "step": 3120 |
| }, |
| { |
| "ce_loss_13": 3.0896170139312744, |
| "ce_loss_26": 2.440164825320244, |
| "ce_loss_39": 2.2094400197267534, |
| "ce_loss_52": 1.4495170325040818, |
| "ce_loss_7": 3.468176656961441, |
| "epoch": 0.313, |
| "grad_norm": 17.297623478505614, |
| "kl_loss_13": 3375.2, |
| "kl_loss_26": 1976.4, |
| "kl_loss_39": 1503.8, |
| "kl_loss_7": 4173.6, |
| "learning_rate": 0.0007861379983515449, |
| "loss": 5461.8, |
| "step": 3130 |
| }, |
| { |
| "ce_loss_13": 3.074652445316315, |
| "ce_loss_26": 2.4188932478427887, |
| "ce_loss_39": 2.1893287271261217, |
| "ce_loss_52": 1.440689930319786, |
| "ce_loss_7": 3.452376401424408, |
| "epoch": 0.314, |
| "grad_norm": 18.06264725868681, |
| "kl_loss_13": 3317.2, |
| "kl_loss_26": 1935.6, |
| "kl_loss_39": 1465.8, |
| "kl_loss_7": 4117.6, |
| "learning_rate": 0.0007848353992861195, |
| "loss": 5464.6, |
| "step": 3140 |
| }, |
| { |
| "ce_loss_13": 3.078055852651596, |
| "ce_loss_26": 2.420557659864426, |
| "ce_loss_39": 2.188734245300293, |
| "ce_loss_52": 1.4398296728730202, |
| "ce_loss_7": 3.457850754261017, |
| "epoch": 0.315, |
| "grad_norm": 17.00206014366557, |
| "kl_loss_13": 3318.4, |
| "kl_loss_26": 1927.6, |
| "kl_loss_39": 1458.6, |
| "kl_loss_7": 4114.0, |
| "learning_rate": 0.0007835299319313853, |
| "loss": 5381.3, |
| "step": 3150 |
| }, |
| { |
| "ce_loss_13": 3.059806948900223, |
| "ce_loss_26": 2.3743002265691757, |
| "ce_loss_39": 2.137469917535782, |
| "ce_loss_52": 1.3903418719768523, |
| "ce_loss_7": 3.4285161972045897, |
| "epoch": 0.316, |
| "grad_norm": 17.46330734643716, |
| "kl_loss_13": 3354.4, |
| "kl_loss_26": 1935.6, |
| "kl_loss_39": 1455.2, |
| "kl_loss_7": 4136.8, |
| "learning_rate": 0.0007822216094333848, |
| "loss": 5407.6, |
| "step": 3160 |
| }, |
| { |
| "ce_loss_13": 3.0990252554416657, |
| "ce_loss_26": 2.4341968923807142, |
| "ce_loss_39": 2.1952569454908373, |
| "ce_loss_52": 1.439093704521656, |
| "ce_loss_7": 3.4743688821792604, |
| "epoch": 0.317, |
| "grad_norm": 18.51989341882003, |
| "kl_loss_13": 3380.4, |
| "kl_loss_26": 1988.0, |
| "kl_loss_39": 1492.0, |
| "kl_loss_7": 4166.4, |
| "learning_rate": 0.0007809104449669101, |
| "loss": 5410.7, |
| "step": 3170 |
| }, |
| { |
| "ce_loss_13": 3.041397601366043, |
| "ce_loss_26": 2.382528102397919, |
| "ce_loss_39": 2.1511587262153626, |
| "ce_loss_52": 1.435599946975708, |
| "ce_loss_7": 3.4155047237873077, |
| "epoch": 0.318, |
| "grad_norm": 17.0483845332083, |
| "kl_loss_13": 3256.8, |
| "kl_loss_26": 1868.0, |
| "kl_loss_39": 1392.8, |
| "kl_loss_7": 4038.0, |
| "learning_rate": 0.0007795964517353734, |
| "loss": 5354.9, |
| "step": 3180 |
| }, |
| { |
| "ce_loss_13": 3.0867488861083983, |
| "ce_loss_26": 2.425813916325569, |
| "ce_loss_39": 2.1947717368602753, |
| "ce_loss_52": 1.4569276213645934, |
| "ce_loss_7": 3.4604012250900267, |
| "epoch": 0.319, |
| "grad_norm": 16.586058341187346, |
| "kl_loss_13": 3328.8, |
| "kl_loss_26": 1930.2, |
| "kl_loss_39": 1449.8, |
| "kl_loss_7": 4120.8, |
| "learning_rate": 0.000778279642970672, |
| "loss": 5344.7, |
| "step": 3190 |
| }, |
| { |
| "ce_loss_13": 3.0399708569049837, |
| "ce_loss_26": 2.3785893470048904, |
| "ce_loss_39": 2.138497656583786, |
| "ce_loss_52": 1.4135656535625458, |
| "ce_loss_7": 3.409178429841995, |
| "epoch": 0.32, |
| "grad_norm": 17.983547533058992, |
| "kl_loss_13": 3304.0, |
| "kl_loss_26": 1914.8, |
| "kl_loss_39": 1428.2, |
| "kl_loss_7": 4088.0, |
| "learning_rate": 0.0007769600319330552, |
| "loss": 5362.0, |
| "step": 3200 |
| }, |
| { |
| "ce_loss_13": 3.1184182286262514, |
| "ce_loss_26": 2.4707882523536684, |
| "ce_loss_39": 2.233083599805832, |
| "ce_loss_52": 1.4817634999752045, |
| "ce_loss_7": 3.4818074285984038, |
| "epoch": 0.321, |
| "grad_norm": 16.87308179941231, |
| "kl_loss_13": 3315.6, |
| "kl_loss_26": 1955.6, |
| "kl_loss_39": 1470.6, |
| "kl_loss_7": 4081.2, |
| "learning_rate": 0.0007756376319109917, |
| "loss": 5372.8, |
| "step": 3210 |
| }, |
| { |
| "ce_loss_13": 3.065703272819519, |
| "ce_loss_26": 2.4234554558992385, |
| "ce_loss_39": 2.1948377937078476, |
| "ce_loss_52": 1.4451974362134934, |
| "ce_loss_7": 3.4377165257930757, |
| "epoch": 0.322, |
| "grad_norm": 17.206055177859767, |
| "kl_loss_13": 3288.4, |
| "kl_loss_26": 1929.2, |
| "kl_loss_39": 1461.2, |
| "kl_loss_7": 4058.8, |
| "learning_rate": 0.0007743124562210351, |
| "loss": 5338.3, |
| "step": 3220 |
| }, |
| { |
| "ce_loss_13": 3.0654458463191987, |
| "ce_loss_26": 2.4097089529037476, |
| "ce_loss_39": 2.1828925907611847, |
| "ce_loss_52": 1.4646209165453912, |
| "ce_loss_7": 3.438837933540344, |
| "epoch": 0.323, |
| "grad_norm": 16.43379975440051, |
| "kl_loss_13": 3246.8, |
| "kl_loss_26": 1868.4, |
| "kl_loss_39": 1403.6, |
| "kl_loss_7": 4034.4, |
| "learning_rate": 0.0007729845182076895, |
| "loss": 5337.95, |
| "step": 3230 |
| }, |
| { |
| "ce_loss_13": 3.019025903940201, |
| "ce_loss_26": 2.3812606751918795, |
| "ce_loss_39": 2.1557460606098173, |
| "ce_loss_52": 1.4495598763227462, |
| "ce_loss_7": 3.390954166650772, |
| "epoch": 0.324, |
| "grad_norm": 17.50409394447208, |
| "kl_loss_13": 3213.2, |
| "kl_loss_26": 1857.6, |
| "kl_loss_39": 1393.2, |
| "kl_loss_7": 3999.2, |
| "learning_rate": 0.0007716538312432765, |
| "loss": 5323.8, |
| "step": 3240 |
| }, |
| { |
| "ce_loss_13": 3.0274185359478, |
| "ce_loss_26": 2.3673853039741517, |
| "ce_loss_39": 2.1314821422100065, |
| "ce_loss_52": 1.4110743701457977, |
| "ce_loss_7": 3.3962999522686004, |
| "epoch": 0.325, |
| "grad_norm": 17.627956174954214, |
| "kl_loss_13": 3281.2, |
| "kl_loss_26": 1900.8, |
| "kl_loss_39": 1419.0, |
| "kl_loss_7": 4056.8, |
| "learning_rate": 0.0007703204087277988, |
| "loss": 5310.9, |
| "step": 3250 |
| }, |
| { |
| "ce_loss_13": 2.995425891876221, |
| "ce_loss_26": 2.3437940657138823, |
| "ce_loss_39": 2.1112417429685593, |
| "ce_loss_52": 1.3956570625305176, |
| "ce_loss_7": 3.3635079681873323, |
| "epoch": 0.326, |
| "grad_norm": 17.092527088154757, |
| "kl_loss_13": 3270.8, |
| "kl_loss_26": 1881.4, |
| "kl_loss_39": 1402.8, |
| "kl_loss_7": 4054.4, |
| "learning_rate": 0.0007689842640888063, |
| "loss": 5291.9, |
| "step": 3260 |
| }, |
| { |
| "ce_loss_13": 3.0584332168102266, |
| "ce_loss_26": 2.4099347323179243, |
| "ce_loss_39": 2.182457607984543, |
| "ce_loss_52": 1.4547152355313302, |
| "ce_loss_7": 3.4290026843547823, |
| "epoch": 0.327, |
| "grad_norm": 17.31361789139729, |
| "kl_loss_13": 3256.0, |
| "kl_loss_26": 1887.8, |
| "kl_loss_39": 1426.4, |
| "kl_loss_7": 4026.4, |
| "learning_rate": 0.0007676454107812607, |
| "loss": 5264.3, |
| "step": 3270 |
| }, |
| { |
| "ce_loss_13": 3.002471148967743, |
| "ce_loss_26": 2.365675774216652, |
| "ce_loss_39": 2.1432500898838045, |
| "ce_loss_52": 1.4313921973109245, |
| "ce_loss_7": 3.3743964791297913, |
| "epoch": 0.328, |
| "grad_norm": 15.793768401685108, |
| "kl_loss_13": 3248.4, |
| "kl_loss_26": 1868.8, |
| "kl_loss_39": 1414.0, |
| "kl_loss_7": 4030.8, |
| "learning_rate": 0.0007663038622873999, |
| "loss": 5285.1, |
| "step": 3280 |
| }, |
| { |
| "ce_loss_13": 3.0830911457538606, |
| "ce_loss_26": 2.4269310742616654, |
| "ce_loss_39": 2.2025650680065154, |
| "ce_loss_52": 1.4690157890319824, |
| "ce_loss_7": 3.454869121313095, |
| "epoch": 0.329, |
| "grad_norm": 17.14503971571328, |
| "kl_loss_13": 3293.2, |
| "kl_loss_26": 1903.8, |
| "kl_loss_39": 1434.0, |
| "kl_loss_7": 4075.6, |
| "learning_rate": 0.0007649596321166025, |
| "loss": 5253.65, |
| "step": 3290 |
| }, |
| { |
| "ce_loss_13": 2.9723230481147764, |
| "ce_loss_26": 2.332389995455742, |
| "ce_loss_39": 2.1094966679811478, |
| "ce_loss_52": 1.4339970767498016, |
| "ce_loss_7": 3.333187943696976, |
| "epoch": 0.33, |
| "grad_norm": 16.633040610576913, |
| "kl_loss_13": 3118.0, |
| "kl_loss_26": 1779.4, |
| "kl_loss_39": 1323.0, |
| "kl_loss_7": 3879.6, |
| "learning_rate": 0.0007636127338052513, |
| "loss": 5233.1, |
| "step": 3300 |
| }, |
| { |
| "ce_loss_13": 2.9914496004581452, |
| "ce_loss_26": 2.3316532552242277, |
| "ce_loss_39": 2.0988477796316145, |
| "ce_loss_52": 1.400558878481388, |
| "ce_loss_7": 3.363678741455078, |
| "epoch": 0.331, |
| "grad_norm": 17.26677566687906, |
| "kl_loss_13": 3257.2, |
| "kl_loss_26": 1858.4, |
| "kl_loss_39": 1375.2, |
| "kl_loss_7": 4042.8, |
| "learning_rate": 0.0007622631809165971, |
| "loss": 5196.15, |
| "step": 3310 |
| }, |
| { |
| "ce_loss_13": 3.064756464958191, |
| "ce_loss_26": 2.422107365727425, |
| "ce_loss_39": 2.192085716128349, |
| "ce_loss_52": 1.4793070062994957, |
| "ce_loss_7": 3.430086314678192, |
| "epoch": 0.332, |
| "grad_norm": 17.330178950251707, |
| "kl_loss_13": 3243.6, |
| "kl_loss_26": 1878.8, |
| "kl_loss_39": 1409.8, |
| "kl_loss_7": 4016.4, |
| "learning_rate": 0.000760910987040623, |
| "loss": 5231.55, |
| "step": 3320 |
| }, |
| { |
| "ce_loss_13": 2.9637326538562774, |
| "ce_loss_26": 2.3162154614925385, |
| "ce_loss_39": 2.097182759642601, |
| "ce_loss_52": 1.4222271725535394, |
| "ce_loss_7": 3.332917684316635, |
| "epoch": 0.333, |
| "grad_norm": 17.085141982864975, |
| "kl_loss_13": 3141.2, |
| "kl_loss_26": 1768.8, |
| "kl_loss_39": 1318.3, |
| "kl_loss_7": 3920.0, |
| "learning_rate": 0.000759556165793906, |
| "loss": 5154.65, |
| "step": 3330 |
| }, |
| { |
| "ce_loss_13": 3.029485374689102, |
| "ce_loss_26": 2.3887200921773912, |
| "ce_loss_39": 2.1607041716575623, |
| "ce_loss_52": 1.4696623742580415, |
| "ce_loss_7": 3.3922773957252503, |
| "epoch": 0.334, |
| "grad_norm": 15.502678294546826, |
| "kl_loss_13": 3185.2, |
| "kl_loss_26": 1826.8, |
| "kl_loss_39": 1360.6, |
| "kl_loss_7": 3948.8, |
| "learning_rate": 0.000758198730819481, |
| "loss": 5180.15, |
| "step": 3340 |
| }, |
| { |
| "ce_loss_13": 3.03846270442009, |
| "ce_loss_26": 2.378660023212433, |
| "ce_loss_39": 2.1477699905633925, |
| "ce_loss_52": 1.4347332805395125, |
| "ce_loss_7": 3.4101031959056853, |
| "epoch": 0.335, |
| "grad_norm": 16.024720398541927, |
| "kl_loss_13": 3270.4, |
| "kl_loss_26": 1883.0, |
| "kl_loss_39": 1406.3, |
| "kl_loss_7": 4056.4, |
| "learning_rate": 0.0007568386957867032, |
| "loss": 5194.3, |
| "step": 3350 |
| }, |
| { |
| "ce_loss_13": 3.008663833141327, |
| "ce_loss_26": 2.3700191140174867, |
| "ce_loss_39": 2.1397975504398348, |
| "ce_loss_52": 1.4545943021774292, |
| "ce_loss_7": 3.3673401892185213, |
| "epoch": 0.336, |
| "grad_norm": 16.053283356705972, |
| "kl_loss_13": 3181.6, |
| "kl_loss_26": 1820.2, |
| "kl_loss_39": 1348.6, |
| "kl_loss_7": 3942.0, |
| "learning_rate": 0.0007554760743911103, |
| "loss": 5153.55, |
| "step": 3360 |
| }, |
| { |
| "ce_loss_13": 2.9829940140247344, |
| "ce_loss_26": 2.3409414261579515, |
| "ce_loss_39": 2.1123215198516845, |
| "ce_loss_52": 1.435232725739479, |
| "ce_loss_7": 3.355481207370758, |
| "epoch": 0.337, |
| "grad_norm": 16.67714591780792, |
| "kl_loss_13": 3152.4, |
| "kl_loss_26": 1792.8, |
| "kl_loss_39": 1321.8, |
| "kl_loss_7": 3938.0, |
| "learning_rate": 0.0007541108803542846, |
| "loss": 5142.8, |
| "step": 3370 |
| }, |
| { |
| "ce_loss_13": 3.027516704797745, |
| "ce_loss_26": 2.386495107412338, |
| "ce_loss_39": 2.1608838021755217, |
| "ce_loss_52": 1.4647808492183685, |
| "ce_loss_7": 3.3936978697776796, |
| "epoch": 0.338, |
| "grad_norm": 16.879054589986723, |
| "kl_loss_13": 3186.4, |
| "kl_loss_26": 1828.6, |
| "kl_loss_39": 1351.9, |
| "kl_loss_7": 3957.6, |
| "learning_rate": 0.0007527431274237149, |
| "loss": 5169.0, |
| "step": 3380 |
| }, |
| { |
| "ce_loss_13": 2.9946301877498627, |
| "ce_loss_26": 2.3546741545200347, |
| "ce_loss_39": 2.1238624840974807, |
| "ce_loss_52": 1.4416128873825074, |
| "ce_loss_7": 3.3628919243812563, |
| "epoch": 0.339, |
| "grad_norm": 18.526031342338438, |
| "kl_loss_13": 3157.6, |
| "kl_loss_26": 1810.6, |
| "kl_loss_39": 1339.1, |
| "kl_loss_7": 3932.0, |
| "learning_rate": 0.0007513728293726579, |
| "loss": 5107.45, |
| "step": 3390 |
| }, |
| { |
| "ce_loss_13": 2.975279802083969, |
| "ce_loss_26": 2.325047069787979, |
| "ce_loss_39": 2.1066873967647552, |
| "ce_loss_52": 1.4382890224456788, |
| "ce_loss_7": 3.3415417432785035, |
| "epoch": 0.34, |
| "grad_norm": 17.516441880993753, |
| "kl_loss_13": 3144.4, |
| "kl_loss_26": 1778.8, |
| "kl_loss_39": 1326.8, |
| "kl_loss_7": 3920.0, |
| "learning_rate": 0.00075, |
| "loss": 5107.0, |
| "step": 3400 |
| }, |
| { |
| "ce_loss_13": 2.9631440460681917, |
| "ce_loss_26": 2.318173348903656, |
| "ce_loss_39": 2.0847090512514113, |
| "ce_loss_52": 1.4200364857912064, |
| "ce_loss_7": 3.3265232741832733, |
| "epoch": 0.341, |
| "grad_norm": 16.134403370320147, |
| "kl_loss_13": 3139.2, |
| "kl_loss_26": 1766.8, |
| "kl_loss_39": 1296.8, |
| "kl_loss_7": 3908.8, |
| "learning_rate": 0.0007486246531301177, |
| "loss": 5097.65, |
| "step": 3410 |
| }, |
| { |
| "ce_loss_13": 2.99331476688385, |
| "ce_loss_26": 2.3580960750579836, |
| "ce_loss_39": 2.1382109016180038, |
| "ce_loss_52": 1.4535871922969819, |
| "ce_loss_7": 3.35606609582901, |
| "epoch": 0.342, |
| "grad_norm": 16.427769884918277, |
| "kl_loss_13": 3148.4, |
| "kl_loss_26": 1796.6, |
| "kl_loss_39": 1349.0, |
| "kl_loss_7": 3921.2, |
| "learning_rate": 0.0007472468026127384, |
| "loss": 5139.75, |
| "step": 3420 |
| }, |
| { |
| "ce_loss_13": 2.921882951259613, |
| "ce_loss_26": 2.2855687588453293, |
| "ce_loss_39": 2.067648893594742, |
| "ce_loss_52": 1.4116598561406135, |
| "ce_loss_7": 3.2927843034267426, |
| "epoch": 0.343, |
| "grad_norm": 16.808808535347794, |
| "kl_loss_13": 3095.8, |
| "kl_loss_26": 1756.2, |
| "kl_loss_39": 1301.9, |
| "kl_loss_7": 3878.0, |
| "learning_rate": 0.000745866462322802, |
| "loss": 5051.15, |
| "step": 3430 |
| }, |
| { |
| "ce_loss_13": 3.04699621796608, |
| "ce_loss_26": 2.399735540151596, |
| "ce_loss_39": 2.176995486021042, |
| "ce_loss_52": 1.496598380804062, |
| "ce_loss_7": 3.418045401573181, |
| "epoch": 0.344, |
| "grad_norm": 16.406856004421037, |
| "kl_loss_13": 3162.0, |
| "kl_loss_26": 1792.4, |
| "kl_loss_39": 1329.5, |
| "kl_loss_7": 3935.6, |
| "learning_rate": 0.0007444836461603195, |
| "loss": 5107.85, |
| "step": 3440 |
| }, |
| { |
| "ce_loss_13": 2.930357199907303, |
| "ce_loss_26": 2.3015194088220596, |
| "ce_loss_39": 2.078587147593498, |
| "ce_loss_52": 1.414345271885395, |
| "ce_loss_7": 3.2914562046527864, |
| "epoch": 0.345, |
| "grad_norm": 16.994168943169583, |
| "kl_loss_13": 3103.6, |
| "kl_loss_26": 1763.0, |
| "kl_loss_39": 1304.8, |
| "kl_loss_7": 3859.2, |
| "learning_rate": 0.0007430983680502344, |
| "loss": 5063.6, |
| "step": 3450 |
| }, |
| { |
| "ce_loss_13": 2.946500468254089, |
| "ce_loss_26": 2.3075433492660524, |
| "ce_loss_39": 2.08428935110569, |
| "ce_loss_52": 1.4254867061972618, |
| "ce_loss_7": 3.309797298908234, |
| "epoch": 0.346, |
| "grad_norm": 16.30494022816115, |
| "kl_loss_13": 3098.0, |
| "kl_loss_26": 1752.4, |
| "kl_loss_39": 1293.6, |
| "kl_loss_7": 3865.2, |
| "learning_rate": 0.0007417106419422819, |
| "loss": 5025.2, |
| "step": 3460 |
| }, |
| { |
| "ce_loss_13": 2.935671639442444, |
| "ce_loss_26": 2.2941204428672792, |
| "ce_loss_39": 2.071269851922989, |
| "ce_loss_52": 1.4085116267204285, |
| "ce_loss_7": 3.3008416891098022, |
| "epoch": 0.347, |
| "grad_norm": 17.30594652330039, |
| "kl_loss_13": 3130.4, |
| "kl_loss_26": 1779.4, |
| "kl_loss_39": 1316.9, |
| "kl_loss_7": 3898.4, |
| "learning_rate": 0.0007403204818108486, |
| "loss": 5043.95, |
| "step": 3470 |
| }, |
| { |
| "ce_loss_13": 2.934108853340149, |
| "ce_loss_26": 2.292804607748985, |
| "ce_loss_39": 2.0716417878866196, |
| "ce_loss_52": 1.4156519144773483, |
| "ce_loss_7": 3.2983541190624237, |
| "epoch": 0.348, |
| "grad_norm": 16.793884082475312, |
| "kl_loss_13": 3072.0, |
| "kl_loss_26": 1727.6, |
| "kl_loss_39": 1271.6, |
| "kl_loss_7": 3841.6, |
| "learning_rate": 0.0007389279016548316, |
| "loss": 5016.3, |
| "step": 3480 |
| }, |
| { |
| "ce_loss_13": 2.8679963111877442, |
| "ce_loss_26": 2.2394334375858307, |
| "ce_loss_39": 2.0239282071590425, |
| "ce_loss_52": 1.3932767808437347, |
| "ce_loss_7": 3.234779417514801, |
| "epoch": 0.349, |
| "grad_norm": 15.946114217155069, |
| "kl_loss_13": 3023.6, |
| "kl_loss_26": 1691.8, |
| "kl_loss_39": 1246.9, |
| "kl_loss_7": 3788.8, |
| "learning_rate": 0.0007375329154974975, |
| "loss": 5018.15, |
| "step": 3490 |
| }, |
| { |
| "ce_loss_13": 2.9285870611667635, |
| "ce_loss_26": 2.287761977314949, |
| "ce_loss_39": 2.063358634710312, |
| "ce_loss_52": 1.4101893305778503, |
| "ce_loss_7": 3.298641562461853, |
| "epoch": 0.35, |
| "grad_norm": 16.997559975693584, |
| "kl_loss_13": 3086.0, |
| "kl_loss_26": 1746.0, |
| "kl_loss_39": 1280.7, |
| "kl_loss_7": 3868.8, |
| "learning_rate": 0.0007361355373863414, |
| "loss": 5018.5, |
| "step": 3500 |
| }, |
| { |
| "ce_loss_13": 2.931669169664383, |
| "ce_loss_26": 2.288780450820923, |
| "ce_loss_39": 2.0723241955041884, |
| "ce_loss_52": 1.4271863222122192, |
| "ce_loss_7": 3.2927916407585145, |
| "epoch": 0.351, |
| "grad_norm": 15.980672586392506, |
| "kl_loss_13": 3077.6, |
| "kl_loss_26": 1726.6, |
| "kl_loss_39": 1278.3, |
| "kl_loss_7": 3843.6, |
| "learning_rate": 0.0007347357813929454, |
| "loss": 4989.7, |
| "step": 3510 |
| }, |
| { |
| "ce_loss_13": 2.928689205646515, |
| "ce_loss_26": 2.2940947294235228, |
| "ce_loss_39": 2.0670880317687987, |
| "ce_loss_52": 1.4149780303239823, |
| "ce_loss_7": 3.293194830417633, |
| "epoch": 0.352, |
| "grad_norm": 16.33808700723808, |
| "kl_loss_13": 3074.8, |
| "kl_loss_26": 1738.8, |
| "kl_loss_39": 1273.7, |
| "kl_loss_7": 3840.0, |
| "learning_rate": 0.0007333336616128369, |
| "loss": 4986.35, |
| "step": 3520 |
| }, |
| { |
| "ce_loss_13": 2.940303909778595, |
| "ce_loss_26": 2.294164848327637, |
| "ce_loss_39": 2.0657031387090683, |
| "ce_loss_52": 1.4308805465698242, |
| "ce_loss_7": 3.3036282479763033, |
| "epoch": 0.353, |
| "grad_norm": 16.29503578207681, |
| "kl_loss_13": 3067.6, |
| "kl_loss_26": 1716.2, |
| "kl_loss_39": 1247.9, |
| "kl_loss_7": 3828.0, |
| "learning_rate": 0.0007319291921653463, |
| "loss": 4998.85, |
| "step": 3530 |
| }, |
| { |
| "ce_loss_13": 2.916954427957535, |
| "ce_loss_26": 2.2819162607192993, |
| "ce_loss_39": 2.0701166808605196, |
| "ce_loss_52": 1.4166931748390197, |
| "ce_loss_7": 3.2759189188480375, |
| "epoch": 0.354, |
| "grad_norm": 17.822669536905938, |
| "kl_loss_13": 3073.6, |
| "kl_loss_26": 1723.6, |
| "kl_loss_39": 1286.2, |
| "kl_loss_7": 3837.2, |
| "learning_rate": 0.0007305223871934656, |
| "loss": 4995.55, |
| "step": 3540 |
| }, |
| { |
| "ce_loss_13": 2.975371015071869, |
| "ce_loss_26": 2.3552831768989564, |
| "ce_loss_39": 2.1292835503816603, |
| "ce_loss_52": 1.4775378912687303, |
| "ce_loss_7": 3.3292273938655854, |
| "epoch": 0.355, |
| "grad_norm": 16.58742338810906, |
| "kl_loss_13": 3054.4, |
| "kl_loss_26": 1739.2, |
| "kl_loss_39": 1279.1, |
| "kl_loss_7": 3808.8, |
| "learning_rate": 0.0007291132608637052, |
| "loss": 4945.1, |
| "step": 3550 |
| }, |
| { |
| "ce_loss_13": 2.972325986623764, |
| "ce_loss_26": 2.3392420560121536, |
| "ce_loss_39": 2.1207040429115294, |
| "ce_loss_52": 1.475459137558937, |
| "ce_loss_7": 3.3383385837078094, |
| "epoch": 0.356, |
| "grad_norm": 16.383650902730043, |
| "kl_loss_13": 3066.4, |
| "kl_loss_26": 1712.2, |
| "kl_loss_39": 1260.8, |
| "kl_loss_7": 3838.8, |
| "learning_rate": 0.0007277018273659516, |
| "loss": 4963.15, |
| "step": 3560 |
| }, |
| { |
| "ce_loss_13": 3.0111460268497465, |
| "ce_loss_26": 2.3827701687812803, |
| "ce_loss_39": 2.1591351449489595, |
| "ce_loss_52": 1.4967001289129258, |
| "ce_loss_7": 3.3728690683841704, |
| "epoch": 0.357, |
| "grad_norm": 16.824381349061323, |
| "kl_loss_13": 3101.2, |
| "kl_loss_26": 1770.6, |
| "kl_loss_39": 1310.2, |
| "kl_loss_7": 3856.8, |
| "learning_rate": 0.0007262881009133242, |
| "loss": 4952.95, |
| "step": 3570 |
| }, |
| { |
| "ce_loss_13": 2.9141285896301268, |
| "ce_loss_26": 2.2805556029081346, |
| "ce_loss_39": 2.065216201543808, |
| "ce_loss_52": 1.4257875666022302, |
| "ce_loss_7": 3.282390242815018, |
| "epoch": 0.358, |
| "grad_norm": 18.058728297237614, |
| "kl_loss_13": 3046.4, |
| "kl_loss_26": 1695.0, |
| "kl_loss_39": 1247.4, |
| "kl_loss_7": 3819.6, |
| "learning_rate": 0.0007248720957420329, |
| "loss": 4964.9, |
| "step": 3580 |
| }, |
| { |
| "ce_loss_13": 2.8828001439571382, |
| "ce_loss_26": 2.253911817073822, |
| "ce_loss_39": 2.030216920375824, |
| "ce_loss_52": 1.40321164727211, |
| "ce_loss_7": 3.251304441690445, |
| "epoch": 0.359, |
| "grad_norm": 16.573678980597606, |
| "kl_loss_13": 3047.2, |
| "kl_loss_26": 1698.2, |
| "kl_loss_39": 1242.3, |
| "kl_loss_7": 3817.6, |
| "learning_rate": 0.0007234538261112341, |
| "loss": 4895.95, |
| "step": 3590 |
| }, |
| { |
| "ce_loss_13": 2.9461396992206574, |
| "ce_loss_26": 2.3021911144256593, |
| "ce_loss_39": 2.0822067111730576, |
| "ce_loss_52": 1.448357391357422, |
| "ce_loss_7": 3.3077784180641174, |
| "epoch": 0.36, |
| "grad_norm": 17.017357286641733, |
| "kl_loss_13": 3060.4, |
| "kl_loss_26": 1708.8, |
| "kl_loss_39": 1253.4, |
| "kl_loss_7": 3833.6, |
| "learning_rate": 0.0007220333063028871, |
| "loss": 4918.35, |
| "step": 3600 |
| }, |
| { |
| "ce_loss_13": 2.846253049373627, |
| "ce_loss_26": 2.224426531791687, |
| "ce_loss_39": 2.0076546490192415, |
| "ce_loss_52": 1.395447552204132, |
| "ce_loss_7": 3.2072394728660583, |
| "epoch": 0.361, |
| "grad_norm": 15.74697405086667, |
| "kl_loss_13": 2978.0, |
| "kl_loss_26": 1658.4, |
| "kl_loss_39": 1211.2, |
| "kl_loss_7": 3738.4, |
| "learning_rate": 0.0007206105506216106, |
| "loss": 4871.3, |
| "step": 3610 |
| }, |
| { |
| "ce_loss_13": 3.0099994122982023, |
| "ce_loss_26": 2.373449808359146, |
| "ce_loss_39": 2.1529267936944962, |
| "ce_loss_52": 1.4900053232908248, |
| "ce_loss_7": 3.3753599405288695, |
| "epoch": 0.362, |
| "grad_norm": 16.970809605735944, |
| "kl_loss_13": 3087.6, |
| "kl_loss_26": 1745.2, |
| "kl_loss_39": 1286.7, |
| "kl_loss_7": 3866.8, |
| "learning_rate": 0.0007191855733945387, |
| "loss": 4947.8, |
| "step": 3620 |
| }, |
| { |
| "ce_loss_13": 2.937187296152115, |
| "ce_loss_26": 2.322328266501427, |
| "ce_loss_39": 2.105859735608101, |
| "ce_loss_52": 1.474419781565666, |
| "ce_loss_7": 3.2971576511859895, |
| "epoch": 0.363, |
| "grad_norm": 17.141982755892812, |
| "kl_loss_13": 3009.6, |
| "kl_loss_26": 1696.8, |
| "kl_loss_39": 1241.1, |
| "kl_loss_7": 3762.4, |
| "learning_rate": 0.0007177583889711762, |
| "loss": 4882.15, |
| "step": 3630 |
| }, |
| { |
| "ce_loss_13": 2.902718555927277, |
| "ce_loss_26": 2.260812908411026, |
| "ce_loss_39": 2.042543429136276, |
| "ce_loss_52": 1.4226751655340195, |
| "ce_loss_7": 3.2698469936847685, |
| "epoch": 0.364, |
| "grad_norm": 17.153862070969048, |
| "kl_loss_13": 3018.8, |
| "kl_loss_26": 1673.0, |
| "kl_loss_39": 1219.0, |
| "kl_loss_7": 3784.4, |
| "learning_rate": 0.0007163290117232541, |
| "loss": 4884.0, |
| "step": 3640 |
| }, |
| { |
| "ce_loss_13": 2.9109850347042086, |
| "ce_loss_26": 2.297417125105858, |
| "ce_loss_39": 2.077428176999092, |
| "ce_loss_52": 1.4550551682710648, |
| "ce_loss_7": 3.268283462524414, |
| "epoch": 0.365, |
| "grad_norm": 16.42744245211514, |
| "kl_loss_13": 2985.2, |
| "kl_loss_26": 1679.2, |
| "kl_loss_39": 1227.9, |
| "kl_loss_7": 3734.0, |
| "learning_rate": 0.0007148974560445859, |
| "loss": 4868.65, |
| "step": 3650 |
| }, |
| { |
| "ce_loss_13": 2.9199238896369932, |
| "ce_loss_26": 2.2848848432302473, |
| "ce_loss_39": 2.060741201043129, |
| "ce_loss_52": 1.4278603106737138, |
| "ce_loss_7": 3.2834209561347962, |
| "epoch": 0.366, |
| "grad_norm": 16.404556741779928, |
| "kl_loss_13": 3024.0, |
| "kl_loss_26": 1686.2, |
| "kl_loss_39": 1230.9, |
| "kl_loss_7": 3786.0, |
| "learning_rate": 0.0007134637363509209, |
| "loss": 4839.5, |
| "step": 3660 |
| }, |
| { |
| "ce_loss_13": 2.9712482690811157, |
| "ce_loss_26": 2.3368860691785813, |
| "ce_loss_39": 2.104892411828041, |
| "ce_loss_52": 1.4633448541164398, |
| "ce_loss_7": 3.332030898332596, |
| "epoch": 0.367, |
| "grad_norm": 15.961228476827497, |
| "kl_loss_13": 3092.4, |
| "kl_loss_26": 1760.2, |
| "kl_loss_39": 1277.5, |
| "kl_loss_7": 3848.0, |
| "learning_rate": 0.0007120278670798009, |
| "loss": 4858.55, |
| "step": 3670 |
| }, |
| { |
| "ce_loss_13": 2.951517391204834, |
| "ce_loss_26": 2.3281659215688704, |
| "ce_loss_39": 2.0995417445898057, |
| "ce_loss_52": 1.4656393617391585, |
| "ce_loss_7": 3.2964209616184235, |
| "epoch": 0.368, |
| "grad_norm": 16.089022609349872, |
| "kl_loss_13": 3003.6, |
| "kl_loss_26": 1696.8, |
| "kl_loss_39": 1232.8, |
| "kl_loss_7": 3745.6, |
| "learning_rate": 0.0007105898626904133, |
| "loss": 4774.9, |
| "step": 3680 |
| }, |
| { |
| "ce_loss_13": 2.870139628648758, |
| "ce_loss_26": 2.2511734038591387, |
| "ce_loss_39": 2.0341389745473863, |
| "ce_loss_52": 1.4250996381044387, |
| "ce_loss_7": 3.2268544733524323, |
| "epoch": 0.369, |
| "grad_norm": 15.247673028968622, |
| "kl_loss_13": 2967.2, |
| "kl_loss_26": 1653.8, |
| "kl_loss_39": 1205.5, |
| "kl_loss_7": 3723.2, |
| "learning_rate": 0.0007091497376634463, |
| "loss": 4807.45, |
| "step": 3690 |
| }, |
| { |
| "ce_loss_13": 2.8762976706027983, |
| "ce_loss_26": 2.256538024544716, |
| "ce_loss_39": 2.043423393368721, |
| "ce_loss_52": 1.4497251689434052, |
| "ce_loss_7": 3.2377980053424835, |
| "epoch": 0.37, |
| "grad_norm": 16.15904103093409, |
| "kl_loss_13": 2914.4, |
| "kl_loss_26": 1609.7, |
| "kl_loss_39": 1170.3, |
| "kl_loss_7": 3672.0, |
| "learning_rate": 0.0007077075065009433, |
| "loss": 4822.75, |
| "step": 3700 |
| }, |
| { |
| "ce_loss_13": 2.865807980298996, |
| "ce_loss_26": 2.2327334135770798, |
| "ce_loss_39": 2.012790763378143, |
| "ce_loss_52": 1.4004584282636643, |
| "ce_loss_7": 3.233772474527359, |
| "epoch": 0.371, |
| "grad_norm": 15.511174434634698, |
| "kl_loss_13": 2980.0, |
| "kl_loss_26": 1666.4, |
| "kl_loss_39": 1214.3, |
| "kl_loss_7": 3742.4, |
| "learning_rate": 0.0007062631837261557, |
| "loss": 4816.1, |
| "step": 3710 |
| }, |
| { |
| "ce_loss_13": 2.903226691484451, |
| "ce_loss_26": 2.2818103432655334, |
| "ce_loss_39": 2.059009611606598, |
| "ce_loss_52": 1.456637406349182, |
| "ce_loss_7": 3.263377320766449, |
| "epoch": 0.372, |
| "grad_norm": 17.120548608123716, |
| "kl_loss_13": 2952.8, |
| "kl_loss_26": 1642.0, |
| "kl_loss_39": 1187.9, |
| "kl_loss_7": 3710.8, |
| "learning_rate": 0.0007048167838833977, |
| "loss": 4745.55, |
| "step": 3720 |
| }, |
| { |
| "ce_loss_13": 2.900358548760414, |
| "ce_loss_26": 2.2638369113206864, |
| "ce_loss_39": 2.043374678492546, |
| "ce_loss_52": 1.4358570337295533, |
| "ce_loss_7": 3.272378832101822, |
| "epoch": 0.373, |
| "grad_norm": 15.762139849070088, |
| "kl_loss_13": 2995.6, |
| "kl_loss_26": 1646.4, |
| "kl_loss_39": 1202.7, |
| "kl_loss_7": 3778.4, |
| "learning_rate": 0.0007033683215379002, |
| "loss": 4819.05, |
| "step": 3730 |
| }, |
| { |
| "ce_loss_13": 2.891742479801178, |
| "ce_loss_26": 2.2577997177839277, |
| "ce_loss_39": 2.042544272542, |
| "ce_loss_52": 1.4357560023665428, |
| "ce_loss_7": 3.2664382100105285, |
| "epoch": 0.374, |
| "grad_norm": 17.991228767593586, |
| "kl_loss_13": 3005.6, |
| "kl_loss_26": 1661.4, |
| "kl_loss_39": 1210.7, |
| "kl_loss_7": 3790.4, |
| "learning_rate": 0.0007019178112756625, |
| "loss": 4801.4, |
| "step": 3740 |
| }, |
| { |
| "ce_loss_13": 2.937167102098465, |
| "ce_loss_26": 2.3048900216817856, |
| "ce_loss_39": 2.077365005016327, |
| "ce_loss_52": 1.4518427148461341, |
| "ce_loss_7": 3.2986050605773927, |
| "epoch": 0.375, |
| "grad_norm": 17.06397612135392, |
| "kl_loss_13": 3048.4, |
| "kl_loss_26": 1714.2, |
| "kl_loss_39": 1240.0, |
| "kl_loss_7": 3808.4, |
| "learning_rate": 0.0007004652677033068, |
| "loss": 4778.45, |
| "step": 3750 |
| }, |
| { |
| "ce_loss_13": 2.953932785987854, |
| "ce_loss_26": 2.3320761770009995, |
| "ce_loss_39": 2.1045148581266404, |
| "ce_loss_52": 1.472703790664673, |
| "ce_loss_7": 3.3274633824825286, |
| "epoch": 0.376, |
| "grad_norm": 16.845736377094994, |
| "kl_loss_13": 3032.0, |
| "kl_loss_26": 1703.8, |
| "kl_loss_39": 1244.7, |
| "kl_loss_7": 3816.0, |
| "learning_rate": 0.0006990107054479312, |
| "loss": 4794.6, |
| "step": 3760 |
| }, |
| { |
| "ce_loss_13": 2.8548416674137114, |
| "ce_loss_26": 2.240122190117836, |
| "ce_loss_39": 2.0189033895730972, |
| "ce_loss_52": 1.4262803480029107, |
| "ce_loss_7": 3.208429366350174, |
| "epoch": 0.377, |
| "grad_norm": 16.84130111884451, |
| "kl_loss_13": 2924.4, |
| "kl_loss_26": 1609.6, |
| "kl_loss_39": 1161.2, |
| "kl_loss_7": 3672.0, |
| "learning_rate": 0.000697554139156961, |
| "loss": 4779.2, |
| "step": 3770 |
| }, |
| { |
| "ce_loss_13": 2.972896063327789, |
| "ce_loss_26": 2.335559439659119, |
| "ce_loss_39": 2.111876127123833, |
| "ce_loss_52": 1.4984043270349503, |
| "ce_loss_7": 3.330926328897476, |
| "epoch": 0.378, |
| "grad_norm": 17.969038221722915, |
| "kl_loss_13": 3002.8, |
| "kl_loss_26": 1674.0, |
| "kl_loss_39": 1211.2, |
| "kl_loss_7": 3762.0, |
| "learning_rate": 0.0006960955834980027, |
| "loss": 4732.4, |
| "step": 3780 |
| }, |
| { |
| "ce_loss_13": 2.863754612207413, |
| "ce_loss_26": 2.228693225979805, |
| "ce_loss_39": 2.0101536750793456, |
| "ce_loss_52": 1.4073660969734192, |
| "ce_loss_7": 3.2303711056709288, |
| "epoch": 0.379, |
| "grad_norm": 15.796823584167846, |
| "kl_loss_13": 2960.8, |
| "kl_loss_26": 1639.0, |
| "kl_loss_39": 1188.6, |
| "kl_loss_7": 3734.4, |
| "learning_rate": 0.0006946350531586958, |
| "loss": 4740.55, |
| "step": 3790 |
| }, |
| { |
| "ce_loss_13": 2.819410902261734, |
| "ce_loss_26": 2.200511318445206, |
| "ce_loss_39": 1.9842332571744918, |
| "ce_loss_52": 1.400177489221096, |
| "ce_loss_7": 3.1923243761062623, |
| "epoch": 0.38, |
| "grad_norm": 17.863959287343352, |
| "kl_loss_13": 2930.0, |
| "kl_loss_26": 1613.6, |
| "kl_loss_39": 1162.1, |
| "kl_loss_7": 3705.2, |
| "learning_rate": 0.0006931725628465643, |
| "loss": 4745.35, |
| "step": 3800 |
| }, |
| { |
| "ce_loss_13": 2.845439475774765, |
| "ce_loss_26": 2.2171025544404985, |
| "ce_loss_39": 1.9986167669296264, |
| "ce_loss_52": 1.4112813830375672, |
| "ce_loss_7": 3.2001422882080077, |
| "epoch": 0.381, |
| "grad_norm": 15.509448386002845, |
| "kl_loss_13": 2924.0, |
| "kl_loss_26": 1603.8, |
| "kl_loss_39": 1151.4, |
| "kl_loss_7": 3677.6, |
| "learning_rate": 0.0006917081272888696, |
| "loss": 4686.25, |
| "step": 3810 |
| }, |
| { |
| "ce_loss_13": 2.875427797436714, |
| "ce_loss_26": 2.2557172268629073, |
| "ce_loss_39": 2.0311311304569246, |
| "ce_loss_52": 1.4279655352234841, |
| "ce_loss_7": 3.230677658319473, |
| "epoch": 0.382, |
| "grad_norm": 17.274488302565285, |
| "kl_loss_13": 2934.0, |
| "kl_loss_26": 1621.0, |
| "kl_loss_39": 1159.3, |
| "kl_loss_7": 3683.6, |
| "learning_rate": 0.0006902417612324615, |
| "loss": 4684.7, |
| "step": 3820 |
| }, |
| { |
| "ce_loss_13": 2.9117272198200226, |
| "ce_loss_26": 2.261174875497818, |
| "ce_loss_39": 2.036722195148468, |
| "ce_loss_52": 1.4152167439460754, |
| "ce_loss_7": 3.282198351621628, |
| "epoch": 0.383, |
| "grad_norm": 17.87083708364157, |
| "kl_loss_13": 3095.2, |
| "kl_loss_26": 1720.4, |
| "kl_loss_39": 1253.4, |
| "kl_loss_7": 3865.2, |
| "learning_rate": 0.00068877347944363, |
| "loss": 4739.15, |
| "step": 3830 |
| }, |
| { |
| "ce_loss_13": 2.8889047384262083, |
| "ce_loss_26": 2.2653014570474626, |
| "ce_loss_39": 2.0420874893665313, |
| "ce_loss_52": 1.4475852727890015, |
| "ce_loss_7": 3.253549599647522, |
| "epoch": 0.384, |
| "grad_norm": 15.6987701916489, |
| "kl_loss_13": 2966.0, |
| "kl_loss_26": 1638.2, |
| "kl_loss_39": 1187.2, |
| "kl_loss_7": 3729.2, |
| "learning_rate": 0.0006873032967079561, |
| "loss": 4730.9, |
| "step": 3840 |
| }, |
| { |
| "ce_loss_13": 2.9057071805000305, |
| "ce_loss_26": 2.2790849953889847, |
| "ce_loss_39": 2.0592786610126494, |
| "ce_loss_52": 1.452454286813736, |
| "ce_loss_7": 3.266382873058319, |
| "epoch": 0.385, |
| "grad_norm": 15.755925332297407, |
| "kl_loss_13": 2962.0, |
| "kl_loss_26": 1636.4, |
| "kl_loss_39": 1179.7, |
| "kl_loss_7": 3722.8, |
| "learning_rate": 0.0006858312278301637, |
| "loss": 4713.7, |
| "step": 3850 |
| }, |
| { |
| "ce_loss_13": 2.8342252016067504, |
| "ce_loss_26": 2.2319850236177445, |
| "ce_loss_39": 2.022706937789917, |
| "ce_loss_52": 1.4418139278888702, |
| "ce_loss_7": 3.186972415447235, |
| "epoch": 0.386, |
| "grad_norm": 17.081089442059948, |
| "kl_loss_13": 2855.2, |
| "kl_loss_26": 1568.0, |
| "kl_loss_39": 1131.2, |
| "kl_loss_7": 3603.2, |
| "learning_rate": 0.0006843572876339704, |
| "loss": 4675.25, |
| "step": 3860 |
| }, |
| { |
| "ce_loss_13": 2.7886572241783143, |
| "ce_loss_26": 2.173486915230751, |
| "ce_loss_39": 1.9662895441055297, |
| "ce_loss_52": 1.3961340665817261, |
| "ce_loss_7": 3.1484048068523407, |
| "epoch": 0.387, |
| "grad_norm": 18.57744828916969, |
| "kl_loss_13": 2842.0, |
| "kl_loss_26": 1551.8, |
| "kl_loss_39": 1125.9, |
| "kl_loss_7": 3587.2, |
| "learning_rate": 0.0006828814909619373, |
| "loss": 4659.8, |
| "step": 3870 |
| }, |
| { |
| "ce_loss_13": 2.84233677983284, |
| "ce_loss_26": 2.2270043969154356, |
| "ce_loss_39": 2.011353349685669, |
| "ce_loss_52": 1.44394671022892, |
| "ce_loss_7": 3.189998263120651, |
| "epoch": 0.388, |
| "grad_norm": 17.116859396660736, |
| "kl_loss_13": 2866.4, |
| "kl_loss_26": 1581.4, |
| "kl_loss_39": 1130.5, |
| "kl_loss_7": 3602.4, |
| "learning_rate": 0.0006814038526753205, |
| "loss": 4652.3, |
| "step": 3880 |
| }, |
| { |
| "ce_loss_13": 2.8899350225925446, |
| "ce_loss_26": 2.268605652451515, |
| "ce_loss_39": 2.047902289032936, |
| "ce_loss_52": 1.462986382842064, |
| "ce_loss_7": 3.2532753586769103, |
| "epoch": 0.389, |
| "grad_norm": 16.277065053757138, |
| "kl_loss_13": 2901.6, |
| "kl_loss_26": 1603.8, |
| "kl_loss_39": 1148.8, |
| "kl_loss_7": 3655.2, |
| "learning_rate": 0.0006799243876539213, |
| "loss": 4644.45, |
| "step": 3890 |
| }, |
| { |
| "ce_loss_13": 2.852635699510574, |
| "ce_loss_26": 2.225254198908806, |
| "ce_loss_39": 2.00534345805645, |
| "ce_loss_52": 1.420480152964592, |
| "ce_loss_7": 3.217593324184418, |
| "epoch": 0.39, |
| "grad_norm": 17.575618857452827, |
| "kl_loss_13": 2895.2, |
| "kl_loss_26": 1582.6, |
| "kl_loss_39": 1134.8, |
| "kl_loss_7": 3662.8, |
| "learning_rate": 0.0006784431107959359, |
| "loss": 4640.8, |
| "step": 3900 |
| }, |
| { |
| "ce_loss_13": 2.9095449209213258, |
| "ce_loss_26": 2.288859358429909, |
| "ce_loss_39": 2.069254148006439, |
| "ce_loss_52": 1.4762457937002182, |
| "ce_loss_7": 3.2724156618118285, |
| "epoch": 0.391, |
| "grad_norm": 15.314925266098216, |
| "kl_loss_13": 2939.6, |
| "kl_loss_26": 1620.2, |
| "kl_loss_39": 1162.8, |
| "kl_loss_7": 3702.8, |
| "learning_rate": 0.0006769600370178059, |
| "loss": 4625.75, |
| "step": 3910 |
| }, |
| { |
| "ce_loss_13": 2.79736613035202, |
| "ce_loss_26": 2.1872033685445786, |
| "ce_loss_39": 1.9660126984119415, |
| "ce_loss_52": 1.3993165016174316, |
| "ce_loss_7": 3.152447110414505, |
| "epoch": 0.392, |
| "grad_norm": 15.234701615575748, |
| "kl_loss_13": 2856.0, |
| "kl_loss_26": 1574.6, |
| "kl_loss_39": 1119.8, |
| "kl_loss_7": 3607.6, |
| "learning_rate": 0.0006754751812540679, |
| "loss": 4587.85, |
| "step": 3920 |
| }, |
| { |
| "ce_loss_13": 2.8410171031951905, |
| "ce_loss_26": 2.2249913841485975, |
| "ce_loss_39": 2.0135372936725617, |
| "ce_loss_52": 1.4371111243963242, |
| "ce_loss_7": 3.2084967494010925, |
| "epoch": 0.393, |
| "grad_norm": 16.62173105303993, |
| "kl_loss_13": 2885.6, |
| "kl_loss_26": 1588.2, |
| "kl_loss_39": 1146.8, |
| "kl_loss_7": 3644.4, |
| "learning_rate": 0.0006739885584572025, |
| "loss": 4635.2, |
| "step": 3930 |
| }, |
| { |
| "ce_loss_13": 2.7806951224803926, |
| "ce_loss_26": 2.1756977647542954, |
| "ce_loss_39": 1.96949442923069, |
| "ce_loss_52": 1.415724617242813, |
| "ce_loss_7": 3.1287400901317595, |
| "epoch": 0.394, |
| "grad_norm": 15.878619218635833, |
| "kl_loss_13": 2836.2, |
| "kl_loss_26": 1541.8, |
| "kl_loss_39": 1104.9, |
| "kl_loss_7": 3581.6, |
| "learning_rate": 0.0006725001835974853, |
| "loss": 4637.75, |
| "step": 3940 |
| }, |
| { |
| "ce_loss_13": 2.85609056353569, |
| "ce_loss_26": 2.228466436266899, |
| "ce_loss_39": 2.011217701435089, |
| "ce_loss_52": 1.4336451053619386, |
| "ce_loss_7": 3.212037581205368, |
| "epoch": 0.395, |
| "grad_norm": 15.588059225669095, |
| "kl_loss_13": 2892.8, |
| "kl_loss_26": 1574.8, |
| "kl_loss_39": 1125.7, |
| "kl_loss_7": 3657.6, |
| "learning_rate": 0.0006710100716628344, |
| "loss": 4584.95, |
| "step": 3950 |
| }, |
| { |
| "ce_loss_13": 2.820618736743927, |
| "ce_loss_26": 2.1797895193099976, |
| "ce_loss_39": 1.9612275928258895, |
| "ce_loss_52": 1.3932116001844406, |
| "ce_loss_7": 3.1924599528312685, |
| "epoch": 0.396, |
| "grad_norm": 14.878251588185849, |
| "kl_loss_13": 2911.2, |
| "kl_loss_26": 1556.2, |
| "kl_loss_39": 1114.5, |
| "kl_loss_7": 3694.0, |
| "learning_rate": 0.0006695182376586602, |
| "loss": 4607.1, |
| "step": 3960 |
| }, |
| { |
| "ce_loss_13": 2.7754017412662506, |
| "ce_loss_26": 2.1572470903396606, |
| "ce_loss_39": 1.9344938546419144, |
| "ce_loss_52": 1.3711352616548538, |
| "ce_loss_7": 3.1346897959709166, |
| "epoch": 0.397, |
| "grad_norm": 15.39943522658609, |
| "kl_loss_13": 2875.2, |
| "kl_loss_26": 1575.1, |
| "kl_loss_39": 1124.5, |
| "kl_loss_7": 3635.6, |
| "learning_rate": 0.000668024696607715, |
| "loss": 4546.3, |
| "step": 3970 |
| }, |
| { |
| "ce_loss_13": 2.7410697996616364, |
| "ce_loss_26": 2.1528750866651536, |
| "ce_loss_39": 1.944345197081566, |
| "ce_loss_52": 1.4029324680566788, |
| "ce_loss_7": 3.0945769369602205, |
| "epoch": 0.398, |
| "grad_norm": 16.69493947597699, |
| "kl_loss_13": 2742.0, |
| "kl_loss_26": 1499.6, |
| "kl_loss_39": 1066.1, |
| "kl_loss_7": 3478.0, |
| "learning_rate": 0.0006665294635499404, |
| "loss": 4509.25, |
| "step": 3980 |
| }, |
| { |
| "ce_loss_13": 2.7935349524021147, |
| "ce_loss_26": 2.191756248474121, |
| "ce_loss_39": 1.9830526530742645, |
| "ce_loss_52": 1.4325652569532394, |
| "ce_loss_7": 3.150054842233658, |
| "epoch": 0.399, |
| "grad_norm": 15.984763021073704, |
| "kl_loss_13": 2764.0, |
| "kl_loss_26": 1503.8, |
| "kl_loss_39": 1075.4, |
| "kl_loss_7": 3508.8, |
| "learning_rate": 0.0006650325535423167, |
| "loss": 4542.85, |
| "step": 3990 |
| }, |
| { |
| "ce_loss_13": 2.7841295659542085, |
| "ce_loss_26": 2.175816202163696, |
| "ce_loss_39": 1.9610484838485718, |
| "ce_loss_52": 1.3994766443967819, |
| "ce_loss_7": 3.1450257122516634, |
| "epoch": 0.4, |
| "grad_norm": 16.383690879711693, |
| "kl_loss_13": 2832.8, |
| "kl_loss_26": 1534.6, |
| "kl_loss_39": 1101.1, |
| "kl_loss_7": 3587.2, |
| "learning_rate": 0.0006635339816587109, |
| "loss": 4584.95, |
| "step": 4000 |
| }, |
| { |
| "ce_loss_13": 2.937473142147064, |
| "ce_loss_26": 2.298046553134918, |
| "ce_loss_39": 2.071186339855194, |
| "ce_loss_52": 1.4680579513311387, |
| "ce_loss_7": 3.2991883754730225, |
| "epoch": 0.401, |
| "grad_norm": 16.69896458470603, |
| "kl_loss_13": 2974.0, |
| "kl_loss_26": 1650.8, |
| "kl_loss_39": 1187.0, |
| "kl_loss_7": 3734.8, |
| "learning_rate": 0.0006620337629897252, |
| "loss": 4574.8, |
| "step": 4010 |
| }, |
| { |
| "ce_loss_13": 2.803048574924469, |
| "ce_loss_26": 2.1910858035087584, |
| "ce_loss_39": 1.977920189499855, |
| "ce_loss_52": 1.4274337738752365, |
| "ce_loss_7": 3.1627039849758147, |
| "epoch": 0.402, |
| "grad_norm": 15.21058574655926, |
| "kl_loss_13": 2803.0, |
| "kl_loss_26": 1508.9, |
| "kl_loss_39": 1074.1, |
| "kl_loss_7": 3558.8, |
| "learning_rate": 0.0006605319126425454, |
| "loss": 4546.4, |
| "step": 4020 |
| }, |
| { |
| "ce_loss_13": 2.8307320177555084, |
| "ce_loss_26": 2.208324944972992, |
| "ce_loss_39": 1.9950761079788208, |
| "ce_loss_52": 1.435056920349598, |
| "ce_loss_7": 3.19031218290329, |
| "epoch": 0.403, |
| "grad_norm": 14.837343102998657, |
| "kl_loss_13": 2876.0, |
| "kl_loss_26": 1550.3, |
| "kl_loss_39": 1112.9, |
| "kl_loss_7": 3638.4, |
| "learning_rate": 0.0006590284457407876, |
| "loss": 4535.35, |
| "step": 4030 |
| }, |
| { |
| "ce_loss_13": 2.8277206301689146, |
| "ce_loss_26": 2.2229607343673705, |
| "ce_loss_39": 2.0126491367816923, |
| "ce_loss_52": 1.465662133693695, |
| "ce_loss_7": 3.178615337610245, |
| "epoch": 0.404, |
| "grad_norm": 15.868817769840305, |
| "kl_loss_13": 2801.6, |
| "kl_loss_26": 1514.6, |
| "kl_loss_39": 1078.5, |
| "kl_loss_7": 3548.0, |
| "learning_rate": 0.0006575233774243465, |
| "loss": 4524.1, |
| "step": 4040 |
| }, |
| { |
| "ce_loss_13": 2.741392558813095, |
| "ce_loss_26": 2.1182916700839995, |
| "ce_loss_39": 1.9061576217412948, |
| "ce_loss_52": 1.3709532082080842, |
| "ce_loss_7": 3.1065491139888763, |
| "epoch": 0.405, |
| "grad_norm": 16.502947013390255, |
| "kl_loss_13": 2798.4, |
| "kl_loss_26": 1495.2, |
| "kl_loss_39": 1058.1, |
| "kl_loss_7": 3565.6, |
| "learning_rate": 0.0006560167228492435, |
| "loss": 4528.6, |
| "step": 4050 |
| }, |
| { |
| "ce_loss_13": 2.8996002614498138, |
| "ce_loss_26": 2.271700030565262, |
| "ce_loss_39": 2.045673191547394, |
| "ce_loss_52": 1.4674718797206878, |
| "ce_loss_7": 3.2622067093849183, |
| "epoch": 0.406, |
| "grad_norm": 15.215707475527795, |
| "kl_loss_13": 2900.0, |
| "kl_loss_26": 1589.6, |
| "kl_loss_39": 1131.4, |
| "kl_loss_7": 3660.8, |
| "learning_rate": 0.0006545084971874737, |
| "loss": 4547.15, |
| "step": 4060 |
| }, |
| { |
| "ce_loss_13": 2.8251163959503174, |
| "ce_loss_26": 2.1874846637248995, |
| "ce_loss_39": 1.9672167718410491, |
| "ce_loss_52": 1.4135777831077576, |
| "ce_loss_7": 3.1873776078224183, |
| "epoch": 0.407, |
| "grad_norm": 15.755939255613459, |
| "kl_loss_13": 2866.0, |
| "kl_loss_26": 1547.6, |
| "kl_loss_39": 1092.9, |
| "kl_loss_7": 3627.6, |
| "learning_rate": 0.0006529987156268526, |
| "loss": 4503.1, |
| "step": 4070 |
| }, |
| { |
| "ce_loss_13": 2.7349390149116517, |
| "ce_loss_26": 2.1141091108322145, |
| "ce_loss_39": 1.909931591153145, |
| "ce_loss_52": 1.3686757802963256, |
| "ce_loss_7": 3.0966077923774717, |
| "epoch": 0.408, |
| "grad_norm": 15.787212276524022, |
| "kl_loss_13": 2801.6, |
| "kl_loss_26": 1509.4, |
| "kl_loss_39": 1071.0, |
| "kl_loss_7": 3562.8, |
| "learning_rate": 0.0006514873933708637, |
| "loss": 4534.05, |
| "step": 4080 |
| }, |
| { |
| "ce_loss_13": 2.742733418941498, |
| "ce_loss_26": 2.1391125679016114, |
| "ce_loss_39": 1.9272442519664765, |
| "ce_loss_52": 1.387654460966587, |
| "ce_loss_7": 3.0977914452552797, |
| "epoch": 0.409, |
| "grad_norm": 15.727797591546214, |
| "kl_loss_13": 2755.6, |
| "kl_loss_26": 1488.4, |
| "kl_loss_39": 1050.3, |
| "kl_loss_7": 3508.0, |
| "learning_rate": 0.0006499745456385053, |
| "loss": 4444.65, |
| "step": 4090 |
| }, |
| { |
| "ce_loss_13": 2.7960755199193956, |
| "ce_loss_26": 2.184322661161423, |
| "ce_loss_39": 1.9677571415901185, |
| "ce_loss_52": 1.4271342948079109, |
| "ce_loss_7": 3.1514409124851226, |
| "epoch": 0.41, |
| "grad_norm": 15.52426613691677, |
| "kl_loss_13": 2809.8, |
| "kl_loss_26": 1518.3, |
| "kl_loss_39": 1075.2, |
| "kl_loss_7": 3551.6, |
| "learning_rate": 0.0006484601876641375, |
| "loss": 4500.65, |
| "step": 4100 |
| }, |
| { |
| "ce_loss_13": 2.8776713728904726, |
| "ce_loss_26": 2.257500499486923, |
| "ce_loss_39": 2.0303492128849028, |
| "ce_loss_52": 1.4582158356904984, |
| "ce_loss_7": 3.2387999415397646, |
| "epoch": 0.411, |
| "grad_norm": 15.93298743678484, |
| "kl_loss_13": 2878.8, |
| "kl_loss_26": 1576.0, |
| "kl_loss_39": 1115.6, |
| "kl_loss_7": 3640.8, |
| "learning_rate": 0.000646944334697328, |
| "loss": 4470.55, |
| "step": 4110 |
| }, |
| { |
| "ce_loss_13": 2.802631789445877, |
| "ce_loss_26": 2.2029493927955626, |
| "ce_loss_39": 2.001139259338379, |
| "ce_loss_52": 1.4623139530420304, |
| "ce_loss_7": 3.155901437997818, |
| "epoch": 0.412, |
| "grad_norm": 14.691054390726734, |
| "kl_loss_13": 2720.8, |
| "kl_loss_26": 1465.2, |
| "kl_loss_39": 1041.4, |
| "kl_loss_7": 3462.4, |
| "learning_rate": 0.0006454270020026995, |
| "loss": 4502.65, |
| "step": 4120 |
| }, |
| { |
| "ce_loss_13": 2.8162184596061706, |
| "ce_loss_26": 2.1934009909629824, |
| "ce_loss_39": 1.979950374364853, |
| "ce_loss_52": 1.4344559267163277, |
| "ce_loss_7": 3.1758966505527497, |
| "epoch": 0.413, |
| "grad_norm": 16.25780643806628, |
| "kl_loss_13": 2816.0, |
| "kl_loss_26": 1518.6, |
| "kl_loss_39": 1077.6, |
| "kl_loss_7": 3573.6, |
| "learning_rate": 0.0006439082048597755, |
| "loss": 4487.45, |
| "step": 4130 |
| }, |
| { |
| "ce_loss_13": 2.787912631034851, |
| "ce_loss_26": 2.1966257959604265, |
| "ce_loss_39": 1.9914580851793289, |
| "ce_loss_52": 1.4511510521173476, |
| "ce_loss_7": 3.1392914772033693, |
| "epoch": 0.414, |
| "grad_norm": 17.37704963704925, |
| "kl_loss_13": 2734.8, |
| "kl_loss_26": 1487.2, |
| "kl_loss_39": 1057.4, |
| "kl_loss_7": 3474.0, |
| "learning_rate": 0.0006423879585628261, |
| "loss": 4448.15, |
| "step": 4140 |
| }, |
| { |
| "ce_loss_13": 2.817258411645889, |
| "ce_loss_26": 2.1947576314210893, |
| "ce_loss_39": 1.9762789696455, |
| "ce_loss_52": 1.433014589548111, |
| "ce_loss_7": 3.182687884569168, |
| "epoch": 0.415, |
| "grad_norm": 15.35502556975723, |
| "kl_loss_13": 2826.8, |
| "kl_loss_26": 1522.0, |
| "kl_loss_39": 1072.5, |
| "kl_loss_7": 3595.2, |
| "learning_rate": 0.0006408662784207149, |
| "loss": 4433.75, |
| "step": 4150 |
| }, |
| { |
| "ce_loss_13": 2.817685341835022, |
| "ce_loss_26": 2.2071537256240843, |
| "ce_loss_39": 1.9907894372940063, |
| "ce_loss_52": 1.4230278193950654, |
| "ce_loss_7": 3.1795800507068632, |
| "epoch": 0.416, |
| "grad_norm": 15.573867614749913, |
| "kl_loss_13": 2866.0, |
| "kl_loss_26": 1558.6, |
| "kl_loss_39": 1107.2, |
| "kl_loss_7": 3632.0, |
| "learning_rate": 0.0006393431797567439, |
| "loss": 4436.3, |
| "step": 4160 |
| }, |
| { |
| "ce_loss_13": 2.819452613592148, |
| "ce_loss_26": 2.213544499874115, |
| "ce_loss_39": 1.9934939831495284, |
| "ce_loss_52": 1.4420817136764525, |
| "ce_loss_7": 3.1729123532772063, |
| "epoch": 0.417, |
| "grad_norm": 15.840337845359416, |
| "kl_loss_13": 2809.4, |
| "kl_loss_26": 1533.4, |
| "kl_loss_39": 1076.0, |
| "kl_loss_7": 3544.0, |
| "learning_rate": 0.0006378186779084996, |
| "loss": 4429.6, |
| "step": 4170 |
| }, |
| { |
| "ce_loss_13": 2.797993552684784, |
| "ce_loss_26": 2.2015393495559694, |
| "ce_loss_39": 1.986987265944481, |
| "ce_loss_52": 1.446770191192627, |
| "ce_loss_7": 3.145763796567917, |
| "epoch": 0.418, |
| "grad_norm": 16.258575254109445, |
| "kl_loss_13": 2768.4, |
| "kl_loss_26": 1520.4, |
| "kl_loss_39": 1076.5, |
| "kl_loss_7": 3502.0, |
| "learning_rate": 0.0006362927882276989, |
| "loss": 4452.8, |
| "step": 4180 |
| }, |
| { |
| "ce_loss_13": 2.809996685385704, |
| "ce_loss_26": 2.1883741706609725, |
| "ce_loss_39": 1.972084966301918, |
| "ce_loss_52": 1.4272316336631774, |
| "ce_loss_7": 3.1641923069953917, |
| "epoch": 0.419, |
| "grad_norm": 17.021132117568744, |
| "kl_loss_13": 2806.8, |
| "kl_loss_26": 1522.7, |
| "kl_loss_39": 1076.0, |
| "kl_loss_7": 3556.4, |
| "learning_rate": 0.000634765526080034, |
| "loss": 4434.25, |
| "step": 4190 |
| }, |
| { |
| "ce_loss_13": 2.7747348487377166, |
| "ce_loss_26": 2.1618224531412125, |
| "ce_loss_39": 1.9505164802074433, |
| "ce_loss_52": 1.4064817115664483, |
| "ce_loss_7": 3.1292604207992554, |
| "epoch": 0.42, |
| "grad_norm": 15.556302486325128, |
| "kl_loss_13": 2777.6, |
| "kl_loss_26": 1495.4, |
| "kl_loss_39": 1055.2, |
| "kl_loss_7": 3523.2, |
| "learning_rate": 0.0006332369068450174, |
| "loss": 4413.55, |
| "step": 4200 |
| }, |
| { |
| "ce_loss_13": 2.748269832134247, |
| "ce_loss_26": 2.145698443055153, |
| "ce_loss_39": 1.935601145029068, |
| "ce_loss_52": 1.4105115324258803, |
| "ce_loss_7": 3.1001435458660125, |
| "epoch": 0.421, |
| "grad_norm": 15.348610438295403, |
| "kl_loss_13": 2742.8, |
| "kl_loss_26": 1480.0, |
| "kl_loss_39": 1039.8, |
| "kl_loss_7": 3490.0, |
| "learning_rate": 0.0006317069459158283, |
| "loss": 4363.8, |
| "step": 4210 |
| }, |
| { |
| "ce_loss_13": 2.7747100263834, |
| "ce_loss_26": 2.16818388402462, |
| "ce_loss_39": 1.9505507349967957, |
| "ce_loss_52": 1.4193186193704606, |
| "ce_loss_7": 3.136371600627899, |
| "epoch": 0.422, |
| "grad_norm": 16.358740351868324, |
| "kl_loss_13": 2764.6, |
| "kl_loss_26": 1481.3, |
| "kl_loss_39": 1040.1, |
| "kl_loss_7": 3516.8, |
| "learning_rate": 0.0006301756586991561, |
| "loss": 4421.65, |
| "step": 4220 |
| }, |
| { |
| "ce_loss_13": 2.8185549050569536, |
| "ce_loss_26": 2.226038011908531, |
| "ce_loss_39": 2.013939729332924, |
| "ce_loss_52": 1.4788149103522301, |
| "ce_loss_7": 3.1706756830215452, |
| "epoch": 0.423, |
| "grad_norm": 14.82164626530813, |
| "kl_loss_13": 2758.0, |
| "kl_loss_26": 1495.6, |
| "kl_loss_39": 1059.3, |
| "kl_loss_7": 3503.2, |
| "learning_rate": 0.0006286430606150459, |
| "loss": 4398.35, |
| "step": 4230 |
| }, |
| { |
| "ce_loss_13": 2.7891676902770994, |
| "ce_loss_26": 2.1986444026231764, |
| "ce_loss_39": 1.9819349884986877, |
| "ce_loss_52": 1.4562569051980971, |
| "ce_loss_7": 3.1411671698093415, |
| "epoch": 0.424, |
| "grad_norm": 15.535941880253773, |
| "kl_loss_13": 2717.2, |
| "kl_loss_26": 1468.6, |
| "kl_loss_39": 1020.2, |
| "kl_loss_7": 3457.2, |
| "learning_rate": 0.0006271091670967436, |
| "loss": 4370.45, |
| "step": 4240 |
| }, |
| { |
| "ce_loss_13": 2.8151471495628355, |
| "ce_loss_26": 2.204052150249481, |
| "ce_loss_39": 1.9960095703601837, |
| "ce_loss_52": 1.45780867934227, |
| "ce_loss_7": 3.1626071453094484, |
| "epoch": 0.425, |
| "grad_norm": 16.39177349451075, |
| "kl_loss_13": 2749.2, |
| "kl_loss_26": 1471.2, |
| "kl_loss_39": 1041.9, |
| "kl_loss_7": 3492.8, |
| "learning_rate": 0.0006255739935905395, |
| "loss": 4354.95, |
| "step": 4250 |
| }, |
| { |
| "ce_loss_13": 2.7719932794570923, |
| "ce_loss_26": 2.1723096281290055, |
| "ce_loss_39": 1.9554951965808869, |
| "ce_loss_52": 1.4198345810174942, |
| "ce_loss_7": 3.134742945432663, |
| "epoch": 0.426, |
| "grad_norm": 17.215386749382045, |
| "kl_loss_13": 2775.6, |
| "kl_loss_26": 1506.6, |
| "kl_loss_39": 1055.7, |
| "kl_loss_7": 3532.8, |
| "learning_rate": 0.0006240375555556145, |
| "loss": 4360.8, |
| "step": 4260 |
| }, |
| { |
| "ce_loss_13": 2.7217872977256774, |
| "ce_loss_26": 2.1173421651124955, |
| "ce_loss_39": 1.9085008651018143, |
| "ce_loss_52": 1.400168927013874, |
| "ce_loss_7": 3.0799288749694824, |
| "epoch": 0.427, |
| "grad_norm": 15.867423276307166, |
| "kl_loss_13": 2701.0, |
| "kl_loss_26": 1432.6, |
| "kl_loss_39": 996.7, |
| "kl_loss_7": 3452.8, |
| "learning_rate": 0.000622499868463882, |
| "loss": 4320.5, |
| "step": 4270 |
| }, |
| { |
| "ce_loss_13": 2.7815617978572846, |
| "ce_loss_26": 2.1786680042743685, |
| "ce_loss_39": 1.9648784220218658, |
| "ce_loss_52": 1.4438522070646287, |
| "ce_loss_7": 3.1414669275283815, |
| "epoch": 0.428, |
| "grad_norm": 16.86028992899928, |
| "kl_loss_13": 2733.2, |
| "kl_loss_26": 1463.0, |
| "kl_loss_39": 1028.0, |
| "kl_loss_7": 3484.8, |
| "learning_rate": 0.0006209609477998338, |
| "loss": 4348.9, |
| "step": 4280 |
| }, |
| { |
| "ce_loss_13": 2.8184913277626036, |
| "ce_loss_26": 2.213253751397133, |
| "ce_loss_39": 1.986603057384491, |
| "ce_loss_52": 1.4555893182754516, |
| "ce_loss_7": 3.1685641705989838, |
| "epoch": 0.429, |
| "grad_norm": 15.40477364702056, |
| "kl_loss_13": 2779.6, |
| "kl_loss_26": 1503.0, |
| "kl_loss_39": 1049.7, |
| "kl_loss_7": 3514.4, |
| "learning_rate": 0.0006194208090603844, |
| "loss": 4374.7, |
| "step": 4290 |
| }, |
| { |
| "ce_loss_13": 2.726405268907547, |
| "ce_loss_26": 2.1394855052232744, |
| "ce_loss_39": 1.9364097625017167, |
| "ce_loss_52": 1.4365313708782197, |
| "ce_loss_7": 3.0755336761474608, |
| "epoch": 0.43, |
| "grad_norm": 14.784393649721942, |
| "kl_loss_13": 2680.0, |
| "kl_loss_26": 1434.2, |
| "kl_loss_39": 1002.6, |
| "kl_loss_7": 3415.2, |
| "learning_rate": 0.0006178794677547138, |
| "loss": 4325.15, |
| "step": 4300 |
| }, |
| { |
| "ce_loss_13": 2.78907487988472, |
| "ce_loss_26": 2.1874548703432084, |
| "ce_loss_39": 1.9674001038074493, |
| "ce_loss_52": 1.4388054758310318, |
| "ce_loss_7": 3.1580194234848022, |
| "epoch": 0.431, |
| "grad_norm": 15.540150658114959, |
| "kl_loss_13": 2772.4, |
| "kl_loss_26": 1489.8, |
| "kl_loss_39": 1036.1, |
| "kl_loss_7": 3534.8, |
| "learning_rate": 0.0006163369394041111, |
| "loss": 4337.1, |
| "step": 4310 |
| }, |
| { |
| "ce_loss_13": 2.7502326130867005, |
| "ce_loss_26": 2.1552721470594407, |
| "ce_loss_39": 1.9502787470817566, |
| "ce_loss_52": 1.4348126232624054, |
| "ce_loss_7": 3.1085386633872987, |
| "epoch": 0.432, |
| "grad_norm": 15.900486211327715, |
| "kl_loss_13": 2709.8, |
| "kl_loss_26": 1438.4, |
| "kl_loss_39": 1010.4, |
| "kl_loss_7": 3455.6, |
| "learning_rate": 0.0006147932395418205, |
| "loss": 4308.0, |
| "step": 4320 |
| }, |
| { |
| "ce_loss_13": 2.7637496650218965, |
| "ce_loss_26": 2.1625583559274673, |
| "ce_loss_39": 1.947121372818947, |
| "ce_loss_52": 1.4198297888040543, |
| "ce_loss_7": 3.1241161942481996, |
| "epoch": 0.433, |
| "grad_norm": 16.260371827994177, |
| "kl_loss_13": 2733.2, |
| "kl_loss_26": 1467.8, |
| "kl_loss_39": 1033.8, |
| "kl_loss_7": 3485.2, |
| "learning_rate": 0.0006132483837128823, |
| "loss": 4327.3, |
| "step": 4330 |
| }, |
| { |
| "ce_loss_13": 2.780191105604172, |
| "ce_loss_26": 2.1823483228683473, |
| "ce_loss_39": 1.9749175161123276, |
| "ce_loss_52": 1.4566338241100312, |
| "ce_loss_7": 3.142491656541824, |
| "epoch": 0.434, |
| "grad_norm": 16.173065879753995, |
| "kl_loss_13": 2713.6, |
| "kl_loss_26": 1446.4, |
| "kl_loss_39": 1012.3, |
| "kl_loss_7": 3465.6, |
| "learning_rate": 0.0006117023874739772, |
| "loss": 4346.0, |
| "step": 4340 |
| }, |
| { |
| "ce_loss_13": 2.756999599933624, |
| "ce_loss_26": 2.151958614587784, |
| "ce_loss_39": 1.9352585464715957, |
| "ce_loss_52": 1.4167816311120986, |
| "ce_loss_7": 3.1229954183101656, |
| "epoch": 0.435, |
| "grad_norm": 16.656646084830363, |
| "kl_loss_13": 2759.6, |
| "kl_loss_26": 1478.0, |
| "kl_loss_39": 1029.0, |
| "kl_loss_7": 3524.0, |
| "learning_rate": 0.0006101552663932703, |
| "loss": 4336.25, |
| "step": 4350 |
| }, |
| { |
| "ce_loss_13": 2.774202525615692, |
| "ce_loss_26": 2.172477602958679, |
| "ce_loss_39": 1.9620429188013078, |
| "ce_loss_52": 1.43767509162426, |
| "ce_loss_7": 3.1362563192844393, |
| "epoch": 0.436, |
| "grad_norm": 16.067338284310296, |
| "kl_loss_13": 2744.4, |
| "kl_loss_26": 1472.4, |
| "kl_loss_39": 1033.1, |
| "kl_loss_7": 3493.6, |
| "learning_rate": 0.0006086070360502539, |
| "loss": 4296.35, |
| "step": 4360 |
| }, |
| { |
| "ce_loss_13": 2.787889677286148, |
| "ce_loss_26": 2.208648791909218, |
| "ce_loss_39": 1.999781733751297, |
| "ce_loss_52": 1.4855108827352523, |
| "ce_loss_7": 3.1249564945697785, |
| "epoch": 0.437, |
| "grad_norm": 15.78991831926034, |
| "kl_loss_13": 2690.0, |
| "kl_loss_26": 1446.0, |
| "kl_loss_39": 1014.8, |
| "kl_loss_7": 3419.2, |
| "learning_rate": 0.0006070577120355903, |
| "loss": 4280.75, |
| "step": 4370 |
| }, |
| { |
| "ce_loss_13": 2.8026595056056975, |
| "ce_loss_26": 2.207309713959694, |
| "ce_loss_39": 2.0008264780044556, |
| "ce_loss_52": 1.4935471057891845, |
| "ce_loss_7": 3.1493531346321104, |
| "epoch": 0.438, |
| "grad_norm": 15.837154081953376, |
| "kl_loss_13": 2679.6, |
| "kl_loss_26": 1429.2, |
| "kl_loss_39": 1001.1, |
| "kl_loss_7": 3413.2, |
| "learning_rate": 0.0006055073099509549, |
| "loss": 4296.35, |
| "step": 4380 |
| }, |
| { |
| "ce_loss_13": 2.755897510051727, |
| "ce_loss_26": 2.1693040400743486, |
| "ce_loss_39": 1.9623985677957534, |
| "ce_loss_52": 1.4462745368480683, |
| "ce_loss_7": 3.1059607326984406, |
| "epoch": 0.439, |
| "grad_norm": 15.629443906703631, |
| "kl_loss_13": 2694.4, |
| "kl_loss_26": 1446.6, |
| "kl_loss_39": 1012.9, |
| "kl_loss_7": 3427.6, |
| "learning_rate": 0.0006039558454088796, |
| "loss": 4277.25, |
| "step": 4390 |
| }, |
| { |
| "ce_loss_13": 2.7673678040504455, |
| "ce_loss_26": 2.159538361430168, |
| "ce_loss_39": 1.9505891352891922, |
| "ce_loss_52": 1.4304020568728446, |
| "ce_loss_7": 3.1244628012180327, |
| "epoch": 0.44, |
| "grad_norm": 15.403089942991496, |
| "kl_loss_13": 2740.4, |
| "kl_loss_26": 1465.4, |
| "kl_loss_39": 1024.9, |
| "kl_loss_7": 3482.4, |
| "learning_rate": 0.0006024033340325954, |
| "loss": 4300.2, |
| "step": 4400 |
| }, |
| { |
| "ce_loss_13": 2.7479640781879424, |
| "ce_loss_26": 2.1436998754739762, |
| "ce_loss_39": 1.9348597198724746, |
| "ce_loss_52": 1.4162754774093629, |
| "ce_loss_7": 3.1057413816452026, |
| "epoch": 0.441, |
| "grad_norm": 16.11204916554698, |
| "kl_loss_13": 2726.0, |
| "kl_loss_26": 1457.7, |
| "kl_loss_39": 1024.6, |
| "kl_loss_7": 3474.8, |
| "learning_rate": 0.0006008497914558743, |
| "loss": 4264.9, |
| "step": 4410 |
| }, |
| { |
| "ce_loss_13": 2.781216788291931, |
| "ce_loss_26": 2.1821627736091616, |
| "ce_loss_39": 1.9793646305799484, |
| "ce_loss_52": 1.456499743461609, |
| "ce_loss_7": 3.1415066480636598, |
| "epoch": 0.442, |
| "grad_norm": 15.839943843413481, |
| "kl_loss_13": 2703.2, |
| "kl_loss_26": 1453.2, |
| "kl_loss_39": 1022.0, |
| "kl_loss_7": 3462.8, |
| "learning_rate": 0.0005992952333228728, |
| "loss": 4320.7, |
| "step": 4420 |
| }, |
| { |
| "ce_loss_13": 2.6314639270305635, |
| "ce_loss_26": 2.0378955364227296, |
| "ce_loss_39": 1.841288161277771, |
| "ce_loss_52": 1.367735171318054, |
| "ce_loss_7": 2.983852916955948, |
| "epoch": 0.443, |
| "grad_norm": 15.715297314159198, |
| "kl_loss_13": 2586.8, |
| "kl_loss_26": 1338.4, |
| "kl_loss_39": 929.4, |
| "kl_loss_7": 3328.4, |
| "learning_rate": 0.0005977396752879741, |
| "loss": 4224.0, |
| "step": 4430 |
| }, |
| { |
| "ce_loss_13": 2.747091996669769, |
| "ce_loss_26": 2.1425569266080857, |
| "ce_loss_39": 1.9297908574342728, |
| "ce_loss_52": 1.4299270451068877, |
| "ce_loss_7": 3.0992358028888702, |
| "epoch": 0.444, |
| "grad_norm": 15.44431585804275, |
| "kl_loss_13": 2688.8, |
| "kl_loss_26": 1428.0, |
| "kl_loss_39": 986.6, |
| "kl_loss_7": 3434.0, |
| "learning_rate": 0.0005961831330156305, |
| "loss": 4224.4, |
| "step": 4440 |
| }, |
| { |
| "ce_loss_13": 2.7738942086696623, |
| "ce_loss_26": 2.1665944904088974, |
| "ce_loss_39": 1.9503348082304002, |
| "ce_loss_52": 1.4405199617147446, |
| "ce_loss_7": 3.131233388185501, |
| "epoch": 0.445, |
| "grad_norm": 15.683988530213393, |
| "kl_loss_13": 2701.2, |
| "kl_loss_26": 1441.4, |
| "kl_loss_39": 999.9, |
| "kl_loss_7": 3451.6, |
| "learning_rate": 0.0005946256221802051, |
| "loss": 4233.55, |
| "step": 4450 |
| }, |
| { |
| "ce_loss_13": 2.6961427688598634, |
| "ce_loss_26": 2.1098335653543474, |
| "ce_loss_39": 1.8978259444236756, |
| "ce_loss_52": 1.415482410788536, |
| "ce_loss_7": 3.044248181581497, |
| "epoch": 0.446, |
| "grad_norm": 15.256632143150593, |
| "kl_loss_13": 2612.8, |
| "kl_loss_26": 1380.2, |
| "kl_loss_39": 951.7, |
| "kl_loss_7": 3350.8, |
| "learning_rate": 0.0005930671584658151, |
| "loss": 4214.65, |
| "step": 4460 |
| }, |
| { |
| "ce_loss_13": 2.732464927434921, |
| "ce_loss_26": 2.141967472434044, |
| "ce_loss_39": 1.929695299267769, |
| "ce_loss_52": 1.4190144926309585, |
| "ce_loss_7": 3.091973352432251, |
| "epoch": 0.447, |
| "grad_norm": 16.38296166656899, |
| "kl_loss_13": 2676.8, |
| "kl_loss_26": 1435.6, |
| "kl_loss_39": 1000.5, |
| "kl_loss_7": 3428.4, |
| "learning_rate": 0.0005915077575661722, |
| "loss": 4280.4, |
| "step": 4470 |
| }, |
| { |
| "ce_loss_13": 2.683997756242752, |
| "ce_loss_26": 2.091261792182922, |
| "ce_loss_39": 1.8827648997306823, |
| "ce_loss_52": 1.3911016047000886, |
| "ce_loss_7": 3.0366858661174776, |
| "epoch": 0.448, |
| "grad_norm": 15.184401397346244, |
| "kl_loss_13": 2642.4, |
| "kl_loss_26": 1399.3, |
| "kl_loss_39": 968.5, |
| "kl_loss_7": 3392.0, |
| "learning_rate": 0.000589947435184427, |
| "loss": 4194.3, |
| "step": 4480 |
| }, |
| { |
| "ce_loss_13": 2.7240218341350557, |
| "ce_loss_26": 2.128333044052124, |
| "ce_loss_39": 1.924179795384407, |
| "ce_loss_52": 1.4498969972133637, |
| "ce_loss_7": 3.0761671125888825, |
| "epoch": 0.449, |
| "grad_norm": 17.07642900561258, |
| "kl_loss_13": 2604.8, |
| "kl_loss_26": 1344.6, |
| "kl_loss_39": 924.8, |
| "kl_loss_7": 3354.0, |
| "learning_rate": 0.0005883862070330078, |
| "loss": 4206.7, |
| "step": 4490 |
| }, |
| { |
| "ce_loss_13": 2.7219722032547, |
| "ce_loss_26": 2.1390156149864197, |
| "ce_loss_39": 1.9286952793598175, |
| "ce_loss_52": 1.4294085174798965, |
| "ce_loss_7": 3.0803197801113127, |
| "epoch": 0.45, |
| "grad_norm": 15.274632326953679, |
| "kl_loss_13": 2621.6, |
| "kl_loss_26": 1403.4, |
| "kl_loss_39": 973.4, |
| "kl_loss_7": 3369.2, |
| "learning_rate": 0.0005868240888334653, |
| "loss": 4211.9, |
| "step": 4500 |
| }, |
| { |
| "ce_loss_13": 2.6810890555381777, |
| "ce_loss_26": 2.105925416946411, |
| "ce_loss_39": 1.9109346747398377, |
| "ce_loss_52": 1.4293138086795807, |
| "ce_loss_7": 3.033176803588867, |
| "epoch": 0.451, |
| "grad_norm": 17.03243058089965, |
| "kl_loss_13": 2608.2, |
| "kl_loss_26": 1375.3, |
| "kl_loss_39": 955.3, |
| "kl_loss_7": 3345.2, |
| "learning_rate": 0.0005852610963163119, |
| "loss": 4209.7, |
| "step": 4510 |
| }, |
| { |
| "ce_loss_13": 2.689431291818619, |
| "ce_loss_26": 2.1146853864192963, |
| "ce_loss_39": 1.9112016946077346, |
| "ce_loss_52": 1.4318486779928208, |
| "ce_loss_7": 3.0395568013191223, |
| "epoch": 0.452, |
| "grad_norm": 15.510330374157597, |
| "kl_loss_13": 2583.2, |
| "kl_loss_26": 1365.8, |
| "kl_loss_39": 947.8, |
| "kl_loss_7": 3313.2, |
| "learning_rate": 0.0005836972452208654, |
| "loss": 4185.25, |
| "step": 4520 |
| }, |
| { |
| "ce_loss_13": 2.7475471079349516, |
| "ce_loss_26": 2.157953730225563, |
| "ce_loss_39": 1.9457335144281387, |
| "ce_loss_52": 1.4404390811920167, |
| "ce_loss_7": 3.104820030927658, |
| "epoch": 0.453, |
| "grad_norm": 15.220972226004102, |
| "kl_loss_13": 2688.4, |
| "kl_loss_26": 1444.2, |
| "kl_loss_39": 1006.8, |
| "kl_loss_7": 3431.6, |
| "learning_rate": 0.0005821325512950885, |
| "loss": 4222.6, |
| "step": 4530 |
| }, |
| { |
| "ce_loss_13": 2.7701157510280607, |
| "ce_loss_26": 2.1823565661907196, |
| "ce_loss_39": 1.9763484060764314, |
| "ce_loss_52": 1.4842363893985748, |
| "ce_loss_7": 3.1242611587047575, |
| "epoch": 0.454, |
| "grad_norm": 16.181871779695452, |
| "kl_loss_13": 2641.6, |
| "kl_loss_26": 1398.0, |
| "kl_loss_39": 968.2, |
| "kl_loss_7": 3387.6, |
| "learning_rate": 0.0005805670302954321, |
| "loss": 4206.95, |
| "step": 4540 |
| }, |
| { |
| "ce_loss_13": 2.69073800444603, |
| "ce_loss_26": 2.1018889248371124, |
| "ce_loss_39": 1.8928476065397262, |
| "ce_loss_52": 1.4156933531165123, |
| "ce_loss_7": 3.051577550172806, |
| "epoch": 0.455, |
| "grad_norm": 15.802548274169151, |
| "kl_loss_13": 2629.2, |
| "kl_loss_26": 1374.2, |
| "kl_loss_39": 934.4, |
| "kl_loss_7": 3382.8, |
| "learning_rate": 0.000579000697986675, |
| "loss": 4173.65, |
| "step": 4550 |
| }, |
| { |
| "ce_loss_13": 2.734744447469711, |
| "ce_loss_26": 2.139357805252075, |
| "ce_loss_39": 1.9327150255441665, |
| "ce_loss_52": 1.44863750487566, |
| "ce_loss_7": 3.083251416683197, |
| "epoch": 0.456, |
| "grad_norm": 15.332335326805197, |
| "kl_loss_13": 2646.0, |
| "kl_loss_26": 1390.4, |
| "kl_loss_39": 959.2, |
| "kl_loss_7": 3391.2, |
| "learning_rate": 0.0005774335701417662, |
| "loss": 4177.45, |
| "step": 4560 |
| }, |
| { |
| "ce_loss_13": 2.696203714609146, |
| "ce_loss_26": 2.102033945918083, |
| "ce_loss_39": 1.8971556156873703, |
| "ce_loss_52": 1.4197801396250724, |
| "ce_loss_7": 3.047564595937729, |
| "epoch": 0.457, |
| "grad_norm": 16.076096882060348, |
| "kl_loss_13": 2600.0, |
| "kl_loss_26": 1362.8, |
| "kl_loss_39": 939.0, |
| "kl_loss_7": 3341.2, |
| "learning_rate": 0.0005758656625416658, |
| "loss": 4183.3, |
| "step": 4570 |
| }, |
| { |
| "ce_loss_13": 2.7472688376903536, |
| "ce_loss_26": 2.1439451813697814, |
| "ce_loss_39": 1.9378813654184341, |
| "ce_loss_52": 1.4498099207878112, |
| "ce_loss_7": 3.1056803286075594, |
| "epoch": 0.458, |
| "grad_norm": 15.434602661166036, |
| "kl_loss_13": 2667.6, |
| "kl_loss_26": 1394.6, |
| "kl_loss_39": 964.2, |
| "kl_loss_7": 3421.6, |
| "learning_rate": 0.0005742969909751859, |
| "loss": 4202.65, |
| "step": 4580 |
| }, |
| { |
| "ce_loss_13": 2.819844591617584, |
| "ce_loss_26": 2.217494735121727, |
| "ce_loss_39": 1.995268750190735, |
| "ce_loss_52": 1.4828792631626129, |
| "ce_loss_7": 3.183567076921463, |
| "epoch": 0.459, |
| "grad_norm": 15.16665840440692, |
| "kl_loss_13": 2715.2, |
| "kl_loss_26": 1447.8, |
| "kl_loss_39": 1000.9, |
| "kl_loss_7": 3469.2, |
| "learning_rate": 0.0005727275712388318, |
| "loss": 4159.15, |
| "step": 4590 |
| }, |
| { |
| "ce_loss_13": 2.757504242658615, |
| "ce_loss_26": 2.1517707139253615, |
| "ce_loss_39": 1.9407849818468095, |
| "ce_loss_52": 1.4452633827924728, |
| "ce_loss_7": 3.1194639682769774, |
| "epoch": 0.46, |
| "grad_norm": 16.189362396808324, |
| "kl_loss_13": 2690.8, |
| "kl_loss_26": 1421.0, |
| "kl_loss_39": 976.6, |
| "kl_loss_7": 3439.6, |
| "learning_rate": 0.0005711574191366427, |
| "loss": 4126.7, |
| "step": 4600 |
| }, |
| { |
| "ce_loss_13": 2.7185553312301636, |
| "ce_loss_26": 2.140464088320732, |
| "ce_loss_39": 1.9312822461128234, |
| "ce_loss_52": 1.4499182224273681, |
| "ce_loss_7": 3.074033808708191, |
| "epoch": 0.461, |
| "grad_norm": 15.796779340482095, |
| "kl_loss_13": 2590.8, |
| "kl_loss_26": 1373.5, |
| "kl_loss_39": 946.6, |
| "kl_loss_7": 3326.0, |
| "learning_rate": 0.0005695865504800327, |
| "loss": 4117.15, |
| "step": 4610 |
| }, |
| { |
| "ce_loss_13": 2.689697802066803, |
| "ce_loss_26": 2.1229261219501496, |
| "ce_loss_39": 1.9236579477787017, |
| "ce_loss_52": 1.4474295616149901, |
| "ce_loss_7": 3.0403923749923707, |
| "epoch": 0.462, |
| "grad_norm": 15.469933015809259, |
| "kl_loss_13": 2559.6, |
| "kl_loss_26": 1353.2, |
| "kl_loss_39": 936.3, |
| "kl_loss_7": 3287.2, |
| "learning_rate": 0.0005680149810876322, |
| "loss": 4141.45, |
| "step": 4620 |
| }, |
| { |
| "ce_loss_13": 2.709194713830948, |
| "ce_loss_26": 2.1067664295434954, |
| "ce_loss_39": 1.8838362753391267, |
| "ce_loss_52": 1.4009573340415955, |
| "ce_loss_7": 3.070843666791916, |
| "epoch": 0.463, |
| "grad_norm": 15.475096434174118, |
| "kl_loss_13": 2674.8, |
| "kl_loss_26": 1399.4, |
| "kl_loss_39": 943.3, |
| "kl_loss_7": 3422.8, |
| "learning_rate": 0.0005664427267851271, |
| "loss": 4160.8, |
| "step": 4630 |
| }, |
| { |
| "ce_loss_13": 2.7120527923107147, |
| "ce_loss_26": 2.120649069547653, |
| "ce_loss_39": 1.9163700252771378, |
| "ce_loss_52": 1.4380100429058076, |
| "ce_loss_7": 3.061056911945343, |
| "epoch": 0.464, |
| "grad_norm": 15.555501351653449, |
| "kl_loss_13": 2608.0, |
| "kl_loss_26": 1354.6, |
| "kl_loss_39": 932.5, |
| "kl_loss_7": 3340.4, |
| "learning_rate": 0.0005648698034051009, |
| "loss": 4170.2, |
| "step": 4640 |
| }, |
| { |
| "ce_loss_13": 2.7389404594898226, |
| "ce_loss_26": 2.1454448729753492, |
| "ce_loss_39": 1.9379454165697099, |
| "ce_loss_52": 1.45494404733181, |
| "ce_loss_7": 3.090787374973297, |
| "epoch": 0.465, |
| "grad_norm": 17.172011566290227, |
| "kl_loss_13": 2617.0, |
| "kl_loss_26": 1379.3, |
| "kl_loss_39": 945.6, |
| "kl_loss_7": 3354.0, |
| "learning_rate": 0.0005632962267868747, |
| "loss": 4137.2, |
| "step": 4650 |
| }, |
| { |
| "ce_loss_13": 2.618588683009148, |
| "ce_loss_26": 2.0511282205581667, |
| "ce_loss_39": 1.857603308558464, |
| "ce_loss_52": 1.3924044981598853, |
| "ce_loss_7": 2.9628873229026795, |
| "epoch": 0.466, |
| "grad_norm": 15.078077143393564, |
| "kl_loss_13": 2528.4, |
| "kl_loss_26": 1330.4, |
| "kl_loss_39": 924.6, |
| "kl_loss_7": 3258.4, |
| "learning_rate": 0.0005617220127763474, |
| "loss": 4108.7, |
| "step": 4660 |
| }, |
| { |
| "ce_loss_13": 2.706065672636032, |
| "ce_loss_26": 2.128128296136856, |
| "ce_loss_39": 1.921997308731079, |
| "ce_loss_52": 1.4441686987876892, |
| "ce_loss_7": 3.059876149892807, |
| "epoch": 0.467, |
| "grad_norm": 16.456195061500843, |
| "kl_loss_13": 2576.0, |
| "kl_loss_26": 1356.8, |
| "kl_loss_39": 938.3, |
| "kl_loss_7": 3319.6, |
| "learning_rate": 0.0005601471772258368, |
| "loss": 4092.5, |
| "step": 4670 |
| }, |
| { |
| "ce_loss_13": 2.691108763217926, |
| "ce_loss_26": 2.1213403046131134, |
| "ce_loss_39": 1.9077781707048416, |
| "ce_loss_52": 1.4377064436674118, |
| "ce_loss_7": 3.0406983733177184, |
| "epoch": 0.468, |
| "grad_norm": 15.284887521485958, |
| "kl_loss_13": 2584.8, |
| "kl_loss_26": 1368.0, |
| "kl_loss_39": 933.0, |
| "kl_loss_7": 3316.4, |
| "learning_rate": 0.0005585717359939192, |
| "loss": 4090.9, |
| "step": 4680 |
| }, |
| { |
| "ce_loss_13": 2.715133213996887, |
| "ce_loss_26": 2.1481328904628754, |
| "ce_loss_39": 1.938890340924263, |
| "ce_loss_52": 1.456964261829853, |
| "ce_loss_7": 3.063201904296875, |
| "epoch": 0.469, |
| "grad_norm": 14.95117298176539, |
| "kl_loss_13": 2573.6, |
| "kl_loss_26": 1372.5, |
| "kl_loss_39": 948.0, |
| "kl_loss_7": 3294.4, |
| "learning_rate": 0.0005569957049452703, |
| "loss": 4067.75, |
| "step": 4690 |
| }, |
| { |
| "ce_loss_13": 2.7355633437633515, |
| "ce_loss_26": 2.130109578371048, |
| "ce_loss_39": 1.9153006434440614, |
| "ce_loss_52": 1.4125859558582305, |
| "ce_loss_7": 3.0963422894477843, |
| "epoch": 0.47, |
| "grad_norm": 15.35914349074289, |
| "kl_loss_13": 2726.0, |
| "kl_loss_26": 1448.2, |
| "kl_loss_39": 1002.1, |
| "kl_loss_7": 3478.0, |
| "learning_rate": 0.0005554190999505056, |
| "loss": 4157.55, |
| "step": 4700 |
| }, |
| { |
| "ce_loss_13": 2.6879218101501463, |
| "ce_loss_26": 2.098815104365349, |
| "ce_loss_39": 1.8952988266944886, |
| "ce_loss_52": 1.4328953355550766, |
| "ce_loss_7": 3.040910530090332, |
| "epoch": 0.471, |
| "grad_norm": 15.95882741217506, |
| "kl_loss_13": 2568.0, |
| "kl_loss_26": 1323.6, |
| "kl_loss_39": 901.3, |
| "kl_loss_7": 3319.6, |
| "learning_rate": 0.0005538419368860196, |
| "loss": 4062.85, |
| "step": 4710 |
| }, |
| { |
| "ce_loss_13": 2.6794900715351107, |
| "ce_loss_26": 2.0905598402023315, |
| "ce_loss_39": 1.8867737114429475, |
| "ce_loss_52": 1.4186928808689117, |
| "ce_loss_7": 3.0324371635913847, |
| "epoch": 0.472, |
| "grad_norm": 15.674607259246525, |
| "kl_loss_13": 2569.4, |
| "kl_loss_26": 1340.6, |
| "kl_loss_39": 914.1, |
| "kl_loss_7": 3303.6, |
| "learning_rate": 0.0005522642316338268, |
| "loss": 4084.1, |
| "step": 4720 |
| }, |
| { |
| "ce_loss_13": 2.7094511866569517, |
| "ce_loss_26": 2.1240269035100936, |
| "ce_loss_39": 1.9192228257656097, |
| "ce_loss_52": 1.4589810997247696, |
| "ce_loss_7": 3.0527816653251647, |
| "epoch": 0.473, |
| "grad_norm": 15.84409146313606, |
| "kl_loss_13": 2555.6, |
| "kl_loss_26": 1330.2, |
| "kl_loss_39": 901.4, |
| "kl_loss_7": 3270.0, |
| "learning_rate": 0.0005506860000814017, |
| "loss": 4024.65, |
| "step": 4730 |
| }, |
| { |
| "ce_loss_13": 2.686307519674301, |
| "ce_loss_26": 2.1099446028470994, |
| "ce_loss_39": 1.9098408967256546, |
| "ce_loss_52": 1.4574729681015015, |
| "ce_loss_7": 3.03258957862854, |
| "epoch": 0.474, |
| "grad_norm": 16.290287645699628, |
| "kl_loss_13": 2538.4, |
| "kl_loss_26": 1308.0, |
| "kl_loss_39": 889.7, |
| "kl_loss_7": 3278.4, |
| "learning_rate": 0.0005491072581215186, |
| "loss": 4058.25, |
| "step": 4740 |
| }, |
| { |
| "ce_loss_13": 2.685838830471039, |
| "ce_loss_26": 2.097555673122406, |
| "ce_loss_39": 1.8953343421220779, |
| "ce_loss_52": 1.4276258319616317, |
| "ce_loss_7": 3.042680394649506, |
| "epoch": 0.475, |
| "grad_norm": 15.834434583556519, |
| "kl_loss_13": 2590.0, |
| "kl_loss_26": 1354.4, |
| "kl_loss_39": 931.0, |
| "kl_loss_7": 3334.4, |
| "learning_rate": 0.0005475280216520913, |
| "loss": 4057.7, |
| "step": 4750 |
| }, |
| { |
| "ce_loss_13": 2.645804923772812, |
| "ce_loss_26": 2.073691374063492, |
| "ce_loss_39": 1.872296717762947, |
| "ce_loss_52": 1.4167594254016875, |
| "ce_loss_7": 2.992863970994949, |
| "epoch": 0.476, |
| "grad_norm": 15.93243827944326, |
| "kl_loss_13": 2532.4, |
| "kl_loss_26": 1326.8, |
| "kl_loss_39": 900.3, |
| "kl_loss_7": 3268.8, |
| "learning_rate": 0.0005459483065760138, |
| "loss": 4104.4, |
| "step": 4760 |
| }, |
| { |
| "ce_loss_13": 2.7042051672935488, |
| "ce_loss_26": 2.1117112547159196, |
| "ce_loss_39": 1.9040750682353973, |
| "ce_loss_52": 1.4314228266477584, |
| "ce_loss_7": 3.0566958367824553, |
| "epoch": 0.477, |
| "grad_norm": 15.353551244219883, |
| "kl_loss_13": 2608.4, |
| "kl_loss_26": 1361.8, |
| "kl_loss_39": 937.4, |
| "kl_loss_7": 3352.4, |
| "learning_rate": 0.0005443681288009991, |
| "loss": 4078.7, |
| "step": 4770 |
| }, |
| { |
| "ce_loss_13": 2.6856160342693327, |
| "ce_loss_26": 2.0829177469015123, |
| "ce_loss_39": 1.8763625353574753, |
| "ce_loss_52": 1.4030324995517731, |
| "ce_loss_7": 3.0448363423347473, |
| "epoch": 0.478, |
| "grad_norm": 16.14315974332923, |
| "kl_loss_13": 2646.4, |
| "kl_loss_26": 1377.9, |
| "kl_loss_39": 941.4, |
| "kl_loss_7": 3398.4, |
| "learning_rate": 0.0005427875042394199, |
| "loss": 4031.6, |
| "step": 4780 |
| }, |
| { |
| "ce_loss_13": 2.6821600914001467, |
| "ce_loss_26": 2.1105621844530105, |
| "ce_loss_39": 1.9080501794815063, |
| "ce_loss_52": 1.4579481482505798, |
| "ce_loss_7": 3.0244390606880187, |
| "epoch": 0.479, |
| "grad_norm": 16.41928903258253, |
| "kl_loss_13": 2508.6, |
| "kl_loss_26": 1304.7, |
| "kl_loss_39": 890.8, |
| "kl_loss_7": 3233.6, |
| "learning_rate": 0.0005412064488081482, |
| "loss": 4041.85, |
| "step": 4790 |
| }, |
| { |
| "ce_loss_13": 2.644778722524643, |
| "ce_loss_26": 2.0624290674924852, |
| "ce_loss_39": 1.8686836928129196, |
| "ce_loss_52": 1.4204004764556886, |
| "ce_loss_7": 2.987608629465103, |
| "epoch": 0.48, |
| "grad_norm": 15.406655412683705, |
| "kl_loss_13": 2521.2, |
| "kl_loss_26": 1293.1, |
| "kl_loss_39": 883.5, |
| "kl_loss_7": 3258.4, |
| "learning_rate": 0.0005396249784283942, |
| "loss": 4018.65, |
| "step": 4800 |
| }, |
| { |
| "ce_loss_13": 2.6721664726734162, |
| "ce_loss_26": 2.0903854191303255, |
| "ce_loss_39": 1.8859550595283507, |
| "ce_loss_52": 1.4339767321944237, |
| "ce_loss_7": 3.022903573513031, |
| "epoch": 0.481, |
| "grad_norm": 15.533152676819311, |
| "kl_loss_13": 2549.2, |
| "kl_loss_26": 1311.5, |
| "kl_loss_39": 887.7, |
| "kl_loss_7": 3276.8, |
| "learning_rate": 0.0005380431090255476, |
| "loss": 4094.5, |
| "step": 4810 |
| }, |
| { |
| "ce_loss_13": 2.7146060168743134, |
| "ce_loss_26": 2.138300836086273, |
| "ce_loss_39": 1.9245162457227707, |
| "ce_loss_52": 1.4378316938877105, |
| "ce_loss_7": 3.0600010454654694, |
| "epoch": 0.482, |
| "grad_norm": 15.889585422309347, |
| "kl_loss_13": 2616.4, |
| "kl_loss_26": 1396.0, |
| "kl_loss_39": 954.3, |
| "kl_loss_7": 3337.2, |
| "learning_rate": 0.0005364608565290155, |
| "loss": 4019.85, |
| "step": 4820 |
| }, |
| { |
| "ce_loss_13": 2.7267942845821382, |
| "ce_loss_26": 2.1315987795591353, |
| "ce_loss_39": 1.92288878262043, |
| "ce_loss_52": 1.4603519141674042, |
| "ce_loss_7": 3.089752674102783, |
| "epoch": 0.483, |
| "grad_norm": 14.942603293515566, |
| "kl_loss_13": 2598.4, |
| "kl_loss_26": 1350.0, |
| "kl_loss_39": 913.7, |
| "kl_loss_7": 3351.2, |
| "learning_rate": 0.0005348782368720626, |
| "loss": 4054.35, |
| "step": 4830 |
| }, |
| { |
| "ce_loss_13": 2.702422133088112, |
| "ce_loss_26": 2.1349568367004395, |
| "ce_loss_39": 1.923803049325943, |
| "ce_loss_52": 1.4461625874042512, |
| "ce_loss_7": 3.0548571348190308, |
| "epoch": 0.484, |
| "grad_norm": 14.924067089524062, |
| "kl_loss_13": 2579.8, |
| "kl_loss_26": 1357.8, |
| "kl_loss_39": 926.9, |
| "kl_loss_7": 3324.0, |
| "learning_rate": 0.000533295265991652, |
| "loss": 4024.65, |
| "step": 4840 |
| }, |
| { |
| "ce_loss_13": 2.6280339270830155, |
| "ce_loss_26": 2.0501014798879624, |
| "ce_loss_39": 1.8476706713438034, |
| "ce_loss_52": 1.3977838337421418, |
| "ce_loss_7": 2.9806483924388885, |
| "epoch": 0.485, |
| "grad_norm": 16.07575861381767, |
| "kl_loss_13": 2515.0, |
| "kl_loss_26": 1293.9, |
| "kl_loss_39": 881.9, |
| "kl_loss_7": 3258.8, |
| "learning_rate": 0.0005317119598282822, |
| "loss": 4003.05, |
| "step": 4850 |
| }, |
| { |
| "ce_loss_13": 2.7017304062843324, |
| "ce_loss_26": 2.123658448457718, |
| "ce_loss_39": 1.9226309835910798, |
| "ce_loss_52": 1.4685954213142396, |
| "ce_loss_7": 3.052913784980774, |
| "epoch": 0.486, |
| "grad_norm": 14.261884924612158, |
| "kl_loss_13": 2552.0, |
| "kl_loss_26": 1330.6, |
| "kl_loss_39": 901.6, |
| "kl_loss_7": 3291.2, |
| "learning_rate": 0.0005301283343258293, |
| "loss": 4032.7, |
| "step": 4860 |
| }, |
| { |
| "ce_loss_13": 2.663911575078964, |
| "ce_loss_26": 2.0811035096645356, |
| "ce_loss_39": 1.8746151685714723, |
| "ce_loss_52": 1.4212765499949456, |
| "ce_loss_7": 3.0172334790229796, |
| "epoch": 0.487, |
| "grad_norm": 15.979242608660392, |
| "kl_loss_13": 2526.0, |
| "kl_loss_26": 1301.0, |
| "kl_loss_39": 883.9, |
| "kl_loss_7": 3276.4, |
| "learning_rate": 0.000528544405431384, |
| "loss": 4020.25, |
| "step": 4870 |
| }, |
| { |
| "ce_loss_13": 2.660174161195755, |
| "ce_loss_26": 2.0942499101161958, |
| "ce_loss_39": 1.8920150458812715, |
| "ce_loss_52": 1.4465530335903167, |
| "ce_loss_7": 3.0074438989162444, |
| "epoch": 0.488, |
| "grad_norm": 14.78308909898239, |
| "kl_loss_13": 2500.8, |
| "kl_loss_26": 1294.5, |
| "kl_loss_39": 873.6, |
| "kl_loss_7": 3228.0, |
| "learning_rate": 0.000526960189095093, |
| "loss": 4016.5, |
| "step": 4880 |
| }, |
| { |
| "ce_loss_13": 2.635312020778656, |
| "ce_loss_26": 2.0795453995466233, |
| "ce_loss_39": 1.8807054102420806, |
| "ce_loss_52": 1.4317258328199387, |
| "ce_loss_7": 2.9746360957622526, |
| "epoch": 0.489, |
| "grad_norm": 15.033016045239172, |
| "kl_loss_13": 2473.6, |
| "kl_loss_26": 1289.8, |
| "kl_loss_39": 880.7, |
| "kl_loss_7": 3185.2, |
| "learning_rate": 0.0005253757012699972, |
| "loss": 3996.8, |
| "step": 4890 |
| }, |
| { |
| "ce_loss_13": 2.68422954082489, |
| "ce_loss_26": 2.1027563750743865, |
| "ce_loss_39": 1.8974193513393403, |
| "ce_loss_52": 1.4426774829626083, |
| "ce_loss_7": 3.037678909301758, |
| "epoch": 0.49, |
| "grad_norm": 15.631967370179172, |
| "kl_loss_13": 2553.2, |
| "kl_loss_26": 1323.6, |
| "kl_loss_39": 894.5, |
| "kl_loss_7": 3295.2, |
| "learning_rate": 0.0005237909579118712, |
| "loss": 3967.65, |
| "step": 4900 |
| }, |
| { |
| "ce_loss_13": 2.680465018749237, |
| "ce_loss_26": 2.1046013057231905, |
| "ce_loss_39": 1.9009331673383714, |
| "ce_loss_52": 1.44480240046978, |
| "ce_loss_7": 3.036197912693024, |
| "epoch": 0.491, |
| "grad_norm": 15.759372326211485, |
| "kl_loss_13": 2524.0, |
| "kl_loss_26": 1301.7, |
| "kl_loss_39": 885.8, |
| "kl_loss_7": 3274.8, |
| "learning_rate": 0.0005222059749790631, |
| "loss": 3979.5, |
| "step": 4910 |
| }, |
| { |
| "ce_loss_13": 2.6961126804351805, |
| "ce_loss_26": 2.1182867020368574, |
| "ce_loss_39": 1.9074458956718445, |
| "ce_loss_52": 1.456271693110466, |
| "ce_loss_7": 3.044221115112305, |
| "epoch": 0.492, |
| "grad_norm": 16.34015666907617, |
| "kl_loss_13": 2536.4, |
| "kl_loss_26": 1322.4, |
| "kl_loss_39": 892.3, |
| "kl_loss_7": 3262.8, |
| "learning_rate": 0.0005206207684323337, |
| "loss": 3964.95, |
| "step": 4920 |
| }, |
| { |
| "ce_loss_13": 2.6250401854515077, |
| "ce_loss_26": 2.041203039884567, |
| "ce_loss_39": 1.8369982630014419, |
| "ce_loss_52": 1.4043618232011794, |
| "ce_loss_7": 2.976498603820801, |
| "epoch": 0.493, |
| "grad_norm": 15.643386607908784, |
| "kl_loss_13": 2504.8, |
| "kl_loss_26": 1279.6, |
| "kl_loss_39": 857.1, |
| "kl_loss_7": 3243.6, |
| "learning_rate": 0.000519035354234695, |
| "loss": 3956.4, |
| "step": 4930 |
| }, |
| { |
| "ce_loss_13": 2.731994906067848, |
| "ce_loss_26": 2.151404523849487, |
| "ce_loss_39": 1.9346221089363098, |
| "ce_loss_52": 1.4679641619324684, |
| "ce_loss_7": 3.0894507080316544, |
| "epoch": 0.494, |
| "grad_norm": 15.972342532535162, |
| "kl_loss_13": 2584.2, |
| "kl_loss_26": 1354.3, |
| "kl_loss_39": 912.1, |
| "kl_loss_7": 3338.8, |
| "learning_rate": 0.0005174497483512506, |
| "loss": 3986.95, |
| "step": 4940 |
| }, |
| { |
| "ce_loss_13": 2.692367374897003, |
| "ce_loss_26": 2.122538897395134, |
| "ce_loss_39": 1.9124073147773744, |
| "ce_loss_52": 1.452483794093132, |
| "ce_loss_7": 3.042392200231552, |
| "epoch": 0.495, |
| "grad_norm": 15.983001798116502, |
| "kl_loss_13": 2533.4, |
| "kl_loss_26": 1327.0, |
| "kl_loss_39": 895.7, |
| "kl_loss_7": 3252.0, |
| "learning_rate": 0.0005158639667490339, |
| "loss": 3967.55, |
| "step": 4950 |
| }, |
| { |
| "ce_loss_13": 2.6044699877500532, |
| "ce_loss_26": 2.0326590865850447, |
| "ce_loss_39": 1.8286783695220947, |
| "ce_loss_52": 1.3870023548603059, |
| "ce_loss_7": 2.947771966457367, |
| "epoch": 0.496, |
| "grad_norm": 15.42571652932802, |
| "kl_loss_13": 2500.2, |
| "kl_loss_26": 1296.2, |
| "kl_loss_39": 875.7, |
| "kl_loss_7": 3230.8, |
| "learning_rate": 0.0005142780253968481, |
| "loss": 3955.8, |
| "step": 4960 |
| }, |
| { |
| "ce_loss_13": 2.642567253112793, |
| "ce_loss_26": 2.0776680946350097, |
| "ce_loss_39": 1.875117465853691, |
| "ce_loss_52": 1.4392555862665177, |
| "ce_loss_7": 2.991294425725937, |
| "epoch": 0.497, |
| "grad_norm": 15.451081778073426, |
| "kl_loss_13": 2501.6, |
| "kl_loss_26": 1291.4, |
| "kl_loss_39": 867.8, |
| "kl_loss_7": 3244.8, |
| "learning_rate": 0.0005126919402651053, |
| "loss": 3945.75, |
| "step": 4970 |
| }, |
| { |
| "ce_loss_13": 2.6330737471580505, |
| "ce_loss_26": 2.0630074977874755, |
| "ce_loss_39": 1.856469190120697, |
| "ce_loss_52": 1.4164980471134185, |
| "ce_loss_7": 2.9769130408763886, |
| "epoch": 0.498, |
| "grad_norm": 15.34176013733142, |
| "kl_loss_13": 2503.2, |
| "kl_loss_26": 1294.2, |
| "kl_loss_39": 874.8, |
| "kl_loss_7": 3228.4, |
| "learning_rate": 0.0005111057273256647, |
| "loss": 3917.2, |
| "step": 4980 |
| }, |
| { |
| "ce_loss_13": 2.6576701521873476, |
| "ce_loss_26": 2.089341068267822, |
| "ce_loss_39": 1.8844720661640166, |
| "ce_loss_52": 1.4477695405483246, |
| "ce_loss_7": 3.0070637345314024, |
| "epoch": 0.499, |
| "grad_norm": 15.00660182184455, |
| "kl_loss_13": 2473.8, |
| "kl_loss_26": 1283.9, |
| "kl_loss_39": 853.9, |
| "kl_loss_7": 3207.6, |
| "learning_rate": 0.0005095194025516733, |
| "loss": 3923.7, |
| "step": 4990 |
| }, |
| { |
| "ce_loss_13": 2.697542816400528, |
| "ce_loss_26": 2.125366801023483, |
| "ce_loss_39": 1.9231987714767456, |
| "ce_loss_52": 1.4726029485464096, |
| "ce_loss_7": 3.0440734326839447, |
| "epoch": 0.5, |
| "grad_norm": 15.160859466153447, |
| "kl_loss_13": 2511.6, |
| "kl_loss_26": 1303.5, |
| "kl_loss_39": 880.0, |
| "kl_loss_7": 3249.2, |
| "learning_rate": 0.000507932981917404, |
| "loss": 3938.75, |
| "step": 5000 |
| }, |
| { |
| "ce_loss_13": 2.554905018210411, |
| "ce_loss_26": 1.9959456473588943, |
| "ce_loss_39": 1.8001023352146148, |
| "ce_loss_52": 1.370650653541088, |
| "ce_loss_7": 2.8987653195858, |
| "epoch": 0.501, |
| "grad_norm": 17.14356490933402, |
| "kl_loss_13": 2439.4, |
| "kl_loss_26": 1259.5, |
| "kl_loss_39": 849.2, |
| "kl_loss_7": 3158.0, |
| "learning_rate": 0.0005063464813980949, |
| "loss": 3915.65, |
| "step": 5010 |
| }, |
| { |
| "ce_loss_13": 2.5977672755718233, |
| "ce_loss_26": 2.0211563646793365, |
| "ce_loss_39": 1.8262152045965194, |
| "ce_loss_52": 1.3998217657208443, |
| "ce_loss_7": 2.950869935750961, |
| "epoch": 0.502, |
| "grad_norm": 14.97441402973133, |
| "kl_loss_13": 2466.8, |
| "kl_loss_26": 1257.6, |
| "kl_loss_39": 848.9, |
| "kl_loss_7": 3202.0, |
| "learning_rate": 0.0005047599169697884, |
| "loss": 3931.7, |
| "step": 5020 |
| }, |
| { |
| "ce_loss_13": 2.6368628799915315, |
| "ce_loss_26": 2.0560896009206773, |
| "ce_loss_39": 1.8569422334432601, |
| "ce_loss_52": 1.429998092353344, |
| "ce_loss_7": 2.981409990787506, |
| "epoch": 0.503, |
| "grad_norm": 15.444575545537123, |
| "kl_loss_13": 2486.4, |
| "kl_loss_26": 1264.6, |
| "kl_loss_39": 845.9, |
| "kl_loss_7": 3216.4, |
| "learning_rate": 0.000503173304609171, |
| "loss": 3936.6, |
| "step": 5030 |
| }, |
| { |
| "ce_loss_13": 2.694787061214447, |
| "ce_loss_26": 2.10608988404274, |
| "ce_loss_39": 1.8957834452390672, |
| "ce_loss_52": 1.4505297511816024, |
| "ce_loss_7": 3.0503607213497164, |
| "epoch": 0.504, |
| "grad_norm": 15.80764020207202, |
| "kl_loss_13": 2558.4, |
| "kl_loss_26": 1318.1, |
| "kl_loss_39": 884.7, |
| "kl_loss_7": 3306.4, |
| "learning_rate": 0.0005015866602934111, |
| "loss": 3939.75, |
| "step": 5040 |
| }, |
| { |
| "ce_loss_13": 2.621226805448532, |
| "ce_loss_26": 2.0584420263767242, |
| "ce_loss_39": 1.861902078986168, |
| "ce_loss_52": 1.4422300636768342, |
| "ce_loss_7": 2.951041603088379, |
| "epoch": 0.505, |
| "grad_norm": 14.881966197673897, |
| "kl_loss_13": 2426.4, |
| "kl_loss_26": 1238.3, |
| "kl_loss_39": 828.1, |
| "kl_loss_7": 3127.6, |
| "learning_rate": 0.0005, |
| "loss": 3934.2, |
| "step": 5050 |
| }, |
| { |
| "ce_loss_13": 2.660506749153137, |
| "ce_loss_26": 2.098774325847626, |
| "ce_loss_39": 1.886313620209694, |
| "ce_loss_52": 1.4474970057606698, |
| "ce_loss_7": 3.011888575553894, |
| "epoch": 0.506, |
| "grad_norm": 14.941506448796416, |
| "kl_loss_13": 2498.2, |
| "kl_loss_26": 1299.9, |
| "kl_loss_39": 867.9, |
| "kl_loss_7": 3221.2, |
| "learning_rate": 0.0004984133397065889, |
| "loss": 3903.9, |
| "step": 5060 |
| }, |
| { |
| "ce_loss_13": 2.626861798763275, |
| "ce_loss_26": 2.0604827493429183, |
| "ce_loss_39": 1.8599964112043381, |
| "ce_loss_52": 1.4386487394571303, |
| "ce_loss_7": 2.975832390785217, |
| "epoch": 0.507, |
| "grad_norm": 15.44952616226393, |
| "kl_loss_13": 2452.8, |
| "kl_loss_26": 1250.8, |
| "kl_loss_39": 833.3, |
| "kl_loss_7": 3188.4, |
| "learning_rate": 0.0004968266953908291, |
| "loss": 3885.3, |
| "step": 5070 |
| }, |
| { |
| "ce_loss_13": 2.562318778038025, |
| "ce_loss_26": 1.9999902278184891, |
| "ce_loss_39": 1.8035208880901337, |
| "ce_loss_52": 1.3915461212396623, |
| "ce_loss_7": 2.904841202497482, |
| "epoch": 0.508, |
| "grad_norm": 14.83407474509182, |
| "kl_loss_13": 2407.2, |
| "kl_loss_26": 1226.2, |
| "kl_loss_39": 815.3, |
| "kl_loss_7": 3134.0, |
| "learning_rate": 0.0004952400830302117, |
| "loss": 3881.25, |
| "step": 5080 |
| }, |
| { |
| "ce_loss_13": 2.5728674054145815, |
| "ce_loss_26": 2.013489532470703, |
| "ce_loss_39": 1.8151986598968506, |
| "ce_loss_52": 1.3948280960321426, |
| "ce_loss_7": 2.9155173718929293, |
| "epoch": 0.509, |
| "grad_norm": 14.64837783343134, |
| "kl_loss_13": 2420.0, |
| "kl_loss_26": 1236.4, |
| "kl_loss_39": 829.6, |
| "kl_loss_7": 3145.6, |
| "learning_rate": 0.0004936535186019053, |
| "loss": 3875.9, |
| "step": 5090 |
| }, |
| { |
| "ce_loss_13": 2.657623714208603, |
| "ce_loss_26": 2.0687634259462357, |
| "ce_loss_39": 1.8608120799064636, |
| "ce_loss_52": 1.418680590391159, |
| "ce_loss_7": 3.0090177237987517, |
| "epoch": 0.51, |
| "grad_norm": 15.50653814601008, |
| "kl_loss_13": 2529.6, |
| "kl_loss_26": 1302.4, |
| "kl_loss_39": 874.3, |
| "kl_loss_7": 3273.2, |
| "learning_rate": 0.000492067018082596, |
| "loss": 3924.45, |
| "step": 5100 |
| }, |
| { |
| "ce_loss_13": 2.651608294248581, |
| "ce_loss_26": 2.0792359739542006, |
| "ce_loss_39": 1.8647643625736237, |
| "ce_loss_52": 1.4323984265327454, |
| "ce_loss_7": 3.0009018778800964, |
| "epoch": 0.511, |
| "grad_norm": 14.358667358457254, |
| "kl_loss_13": 2496.8, |
| "kl_loss_26": 1292.0, |
| "kl_loss_39": 854.6, |
| "kl_loss_7": 3224.0, |
| "learning_rate": 0.0004904805974483267, |
| "loss": 3865.45, |
| "step": 5110 |
| }, |
| { |
| "ce_loss_13": 2.641629362106323, |
| "ce_loss_26": 2.072116160392761, |
| "ce_loss_39": 1.8716523438692092, |
| "ce_loss_52": 1.4481347769498825, |
| "ce_loss_7": 2.989072245359421, |
| "epoch": 0.512, |
| "grad_norm": 15.321352631891271, |
| "kl_loss_13": 2423.6, |
| "kl_loss_26": 1237.5, |
| "kl_loss_39": 826.9, |
| "kl_loss_7": 3150.8, |
| "learning_rate": 0.0004888942726743353, |
| "loss": 3863.15, |
| "step": 5120 |
| }, |
| { |
| "ce_loss_13": 2.6105270087718964, |
| "ce_loss_26": 2.041230320930481, |
| "ce_loss_39": 1.8405257403850555, |
| "ce_loss_52": 1.4164400100708008, |
| "ce_loss_7": 2.9590699791908266, |
| "epoch": 0.513, |
| "grad_norm": 15.778047819453661, |
| "kl_loss_13": 2444.0, |
| "kl_loss_26": 1244.1, |
| "kl_loss_39": 835.9, |
| "kl_loss_7": 3179.2, |
| "learning_rate": 0.0004873080597348947, |
| "loss": 3860.6, |
| "step": 5130 |
| }, |
| { |
| "ce_loss_13": 2.7020871877670287, |
| "ce_loss_26": 2.1217857897281647, |
| "ce_loss_39": 1.910724088549614, |
| "ce_loss_52": 1.4596379309892655, |
| "ce_loss_7": 3.0534912407398225, |
| "epoch": 0.514, |
| "grad_norm": 15.798738758695562, |
| "kl_loss_13": 2550.4, |
| "kl_loss_26": 1321.6, |
| "kl_loss_39": 886.4, |
| "kl_loss_7": 3290.0, |
| "learning_rate": 0.0004857219746031519, |
| "loss": 3877.4, |
| "step": 5140 |
| }, |
| { |
| "ce_loss_13": 2.629297113418579, |
| "ce_loss_26": 2.065913289785385, |
| "ce_loss_39": 1.8652951270341873, |
| "ce_loss_52": 1.4406857430934905, |
| "ce_loss_7": 2.975607615709305, |
| "epoch": 0.515, |
| "grad_norm": 15.843426047808201, |
| "kl_loss_13": 2425.2, |
| "kl_loss_26": 1235.3, |
| "kl_loss_39": 819.8, |
| "kl_loss_7": 3155.2, |
| "learning_rate": 0.0004841360332509663, |
| "loss": 3881.85, |
| "step": 5150 |
| }, |
| { |
| "ce_loss_13": 2.6566128492355348, |
| "ce_loss_26": 2.0851503133773805, |
| "ce_loss_39": 1.879001685976982, |
| "ce_loss_52": 1.4529996007680892, |
| "ce_loss_7": 3.001719295978546, |
| "epoch": 0.516, |
| "grad_norm": 15.230952623778938, |
| "kl_loss_13": 2453.2, |
| "kl_loss_26": 1247.2, |
| "kl_loss_39": 825.4, |
| "kl_loss_7": 3185.6, |
| "learning_rate": 0.0004825502516487497, |
| "loss": 3877.3, |
| "step": 5160 |
| }, |
| { |
| "ce_loss_13": 2.6446830928325653, |
| "ce_loss_26": 2.087994411587715, |
| "ce_loss_39": 1.8902768969535828, |
| "ce_loss_52": 1.4724875479936599, |
| "ce_loss_7": 2.994678741693497, |
| "epoch": 0.517, |
| "grad_norm": 16.39635866458352, |
| "kl_loss_13": 2415.8, |
| "kl_loss_26": 1232.4, |
| "kl_loss_39": 823.7, |
| "kl_loss_7": 3135.2, |
| "learning_rate": 0.00048096464576530507, |
| "loss": 3828.0, |
| "step": 5170 |
| }, |
| { |
| "ce_loss_13": 2.5944429993629456, |
| "ce_loss_26": 2.036814641952515, |
| "ce_loss_39": 1.841314834356308, |
| "ce_loss_52": 1.4209656611084938, |
| "ce_loss_7": 2.9290528297424316, |
| "epoch": 0.518, |
| "grad_norm": 14.479752914773284, |
| "kl_loss_13": 2424.2, |
| "kl_loss_26": 1242.2, |
| "kl_loss_39": 829.4, |
| "kl_loss_7": 3150.4, |
| "learning_rate": 0.00047937923156766646, |
| "loss": 3845.8, |
| "step": 5180 |
| }, |
| { |
| "ce_loss_13": 2.660686820745468, |
| "ce_loss_26": 2.094280996918678, |
| "ce_loss_39": 1.8889294683933258, |
| "ce_loss_52": 1.4619006276130677, |
| "ce_loss_7": 3.008899760246277, |
| "epoch": 0.519, |
| "grad_norm": 16.01644389337825, |
| "kl_loss_13": 2475.6, |
| "kl_loss_26": 1273.8, |
| "kl_loss_39": 850.7, |
| "kl_loss_7": 3204.4, |
| "learning_rate": 0.00047779402502093696, |
| "loss": 3846.45, |
| "step": 5190 |
| }, |
| { |
| "ce_loss_13": 2.5972321003675463, |
| "ce_loss_26": 2.0274816364049912, |
| "ce_loss_39": 1.8247474491596223, |
| "ce_loss_52": 1.4065502554178237, |
| "ce_loss_7": 2.9485019743442535, |
| "epoch": 0.52, |
| "grad_norm": 14.98957972258473, |
| "kl_loss_13": 2450.8, |
| "kl_loss_26": 1255.3, |
| "kl_loss_39": 832.3, |
| "kl_loss_7": 3178.4, |
| "learning_rate": 0.0004762090420881289, |
| "loss": 3895.0, |
| "step": 5200 |
| }, |
| { |
| "ce_loss_13": 2.665889722108841, |
| "ce_loss_26": 2.110061451792717, |
| "ce_loss_39": 1.9084287703037262, |
| "ce_loss_52": 1.4813150823116303, |
| "ce_loss_7": 3.0068661749362944, |
| "epoch": 0.521, |
| "grad_norm": 15.21430509367723, |
| "kl_loss_13": 2448.0, |
| "kl_loss_26": 1268.2, |
| "kl_loss_39": 849.3, |
| "kl_loss_7": 3164.4, |
| "learning_rate": 0.00047462429873000296, |
| "loss": 3816.95, |
| "step": 5210 |
| }, |
| { |
| "ce_loss_13": 2.655850923061371, |
| "ce_loss_26": 2.085476315021515, |
| "ce_loss_39": 1.874689555168152, |
| "ce_loss_52": 1.446556892991066, |
| "ce_loss_7": 3.0054982542991637, |
| "epoch": 0.522, |
| "grad_norm": 16.427127136749267, |
| "kl_loss_13": 2452.4, |
| "kl_loss_26": 1264.0, |
| "kl_loss_39": 836.0, |
| "kl_loss_7": 3188.0, |
| "learning_rate": 0.0004730398109049071, |
| "loss": 3838.95, |
| "step": 5220 |
| }, |
| { |
| "ce_loss_13": 2.677028661966324, |
| "ce_loss_26": 2.119970577955246, |
| "ce_loss_39": 1.92548668384552, |
| "ce_loss_52": 1.4896603375673294, |
| "ce_loss_7": 3.008550250530243, |
| "epoch": 0.523, |
| "grad_norm": 16.0659576233147, |
| "kl_loss_13": 2444.4, |
| "kl_loss_26": 1258.1, |
| "kl_loss_39": 844.1, |
| "kl_loss_7": 3161.6, |
| "learning_rate": 0.000471455594568616, |
| "loss": 3864.55, |
| "step": 5230 |
| }, |
| { |
| "ce_loss_13": 2.6650006234645844, |
| "ce_loss_26": 2.085580566525459, |
| "ce_loss_39": 1.884287601709366, |
| "ce_loss_52": 1.453607079386711, |
| "ce_loss_7": 3.0236206233501433, |
| "epoch": 0.524, |
| "grad_norm": 14.653166262246215, |
| "kl_loss_13": 2489.6, |
| "kl_loss_26": 1272.8, |
| "kl_loss_39": 845.7, |
| "kl_loss_7": 3232.4, |
| "learning_rate": 0.00046987166567417086, |
| "loss": 3878.7, |
| "step": 5240 |
| }, |
| { |
| "ce_loss_13": 2.584680277109146, |
| "ce_loss_26": 2.0209302097558974, |
| "ce_loss_39": 1.8149988621473312, |
| "ce_loss_52": 1.390892931818962, |
| "ce_loss_7": 2.9321685075759887, |
| "epoch": 0.525, |
| "grad_norm": 15.168241253700586, |
| "kl_loss_13": 2428.4, |
| "kl_loss_26": 1252.2, |
| "kl_loss_39": 829.9, |
| "kl_loss_7": 3154.8, |
| "learning_rate": 0.00046828804017171776, |
| "loss": 3851.45, |
| "step": 5250 |
| }, |
| { |
| "ce_loss_13": 2.6199812322854994, |
| "ce_loss_26": 2.072703015804291, |
| "ce_loss_39": 1.8745949417352676, |
| "ce_loss_52": 1.4573716089129447, |
| "ce_loss_7": 2.9569752633571627, |
| "epoch": 0.526, |
| "grad_norm": 15.019656924741254, |
| "kl_loss_13": 2412.4, |
| "kl_loss_26": 1238.7, |
| "kl_loss_39": 825.7, |
| "kl_loss_7": 3130.0, |
| "learning_rate": 0.00046670473400834805, |
| "loss": 3822.4, |
| "step": 5260 |
| }, |
| { |
| "ce_loss_13": 2.614406701922417, |
| "ce_loss_26": 2.038132056593895, |
| "ce_loss_39": 1.835758489370346, |
| "ce_loss_52": 1.4097227707505227, |
| "ce_loss_7": 2.953414297103882, |
| "epoch": 0.527, |
| "grad_norm": 15.237091081862786, |
| "kl_loss_13": 2460.0, |
| "kl_loss_26": 1259.0, |
| "kl_loss_39": 841.0, |
| "kl_loss_7": 3178.4, |
| "learning_rate": 0.00046512176312793734, |
| "loss": 3820.5, |
| "step": 5270 |
| }, |
| { |
| "ce_loss_13": 2.5675832986831666, |
| "ce_loss_26": 2.0170306622982026, |
| "ce_loss_39": 1.8148203998804093, |
| "ce_loss_52": 1.4015038818120957, |
| "ce_loss_7": 2.9166265070438384, |
| "epoch": 0.528, |
| "grad_norm": 15.622184104337453, |
| "kl_loss_13": 2384.4, |
| "kl_loss_26": 1219.7, |
| "kl_loss_39": 809.7, |
| "kl_loss_7": 3111.2, |
| "learning_rate": 0.00046353914347098467, |
| "loss": 3805.5, |
| "step": 5280 |
| }, |
| { |
| "ce_loss_13": 2.600316107273102, |
| "ce_loss_26": 2.038938209414482, |
| "ce_loss_39": 1.8384071439504623, |
| "ce_loss_52": 1.4146052584052087, |
| "ce_loss_7": 2.9424943923950195, |
| "epoch": 0.529, |
| "grad_norm": 16.18662441031771, |
| "kl_loss_13": 2437.8, |
| "kl_loss_26": 1252.1, |
| "kl_loss_39": 829.2, |
| "kl_loss_7": 3150.0, |
| "learning_rate": 0.0004619568909744524, |
| "loss": 3784.9, |
| "step": 5290 |
| }, |
| { |
| "ce_loss_13": 2.6181087523698805, |
| "ce_loss_26": 2.0588752537965775, |
| "ce_loss_39": 1.85763358771801, |
| "ce_loss_52": 1.4420970767736434, |
| "ce_loss_7": 2.962410408258438, |
| "epoch": 0.53, |
| "grad_norm": 15.067130011925716, |
| "kl_loss_13": 2406.6, |
| "kl_loss_26": 1231.8, |
| "kl_loss_39": 819.2, |
| "kl_loss_7": 3132.4, |
| "learning_rate": 0.00046037502157160573, |
| "loss": 3815.9, |
| "step": 5300 |
| }, |
| { |
| "ce_loss_13": 2.5704480171203614, |
| "ce_loss_26": 2.007723420858383, |
| "ce_loss_39": 1.8157259494066238, |
| "ce_loss_52": 1.4100188240408897, |
| "ce_loss_7": 2.9128104388713836, |
| "epoch": 0.531, |
| "grad_norm": 14.940801357367253, |
| "kl_loss_13": 2375.2, |
| "kl_loss_26": 1192.5, |
| "kl_loss_39": 790.2, |
| "kl_loss_7": 3097.2, |
| "learning_rate": 0.00045879355119185207, |
| "loss": 3774.75, |
| "step": 5310 |
| }, |
| { |
| "ce_loss_13": 2.6117980778217316, |
| "ce_loss_26": 2.049151787161827, |
| "ce_loss_39": 1.8556257247924806, |
| "ce_loss_52": 1.4484239488840103, |
| "ce_loss_7": 2.9571595907211305, |
| "epoch": 0.532, |
| "grad_norm": 15.084958990702525, |
| "kl_loss_13": 2412.8, |
| "kl_loss_26": 1214.0, |
| "kl_loss_39": 801.8, |
| "kl_loss_7": 3144.4, |
| "learning_rate": 0.0004572124957605803, |
| "loss": 3796.3, |
| "step": 5320 |
| }, |
| { |
| "ce_loss_13": 2.6047778964042663, |
| "ce_loss_26": 2.0361655563116074, |
| "ce_loss_39": 1.829242792725563, |
| "ce_loss_52": 1.4240625560283662, |
| "ce_loss_7": 2.9437944889068604, |
| "epoch": 0.533, |
| "grad_norm": 14.875930516711946, |
| "kl_loss_13": 2409.0, |
| "kl_loss_26": 1224.1, |
| "kl_loss_39": 807.1, |
| "kl_loss_7": 3128.8, |
| "learning_rate": 0.00045563187119900103, |
| "loss": 3772.25, |
| "step": 5330 |
| }, |
| { |
| "ce_loss_13": 2.5524469763040543, |
| "ce_loss_26": 1.9957795530557632, |
| "ce_loss_39": 1.7972583919763565, |
| "ce_loss_52": 1.3942344322800637, |
| "ce_loss_7": 2.899323457479477, |
| "epoch": 0.534, |
| "grad_norm": 14.888961320169138, |
| "kl_loss_13": 2374.0, |
| "kl_loss_26": 1190.2, |
| "kl_loss_39": 785.5, |
| "kl_loss_7": 3107.6, |
| "learning_rate": 0.00045405169342398633, |
| "loss": 3809.2, |
| "step": 5340 |
| }, |
| { |
| "ce_loss_13": 2.6405540108680725, |
| "ce_loss_26": 2.080012783408165, |
| "ce_loss_39": 1.876559317111969, |
| "ce_loss_52": 1.462306337058544, |
| "ce_loss_7": 2.981167531013489, |
| "epoch": 0.535, |
| "grad_norm": 14.563184685424444, |
| "kl_loss_13": 2406.4, |
| "kl_loss_26": 1229.1, |
| "kl_loss_39": 816.6, |
| "kl_loss_7": 3130.4, |
| "learning_rate": 0.0004524719783479088, |
| "loss": 3797.75, |
| "step": 5350 |
| }, |
| { |
| "ce_loss_13": 2.645623618364334, |
| "ce_loss_26": 2.0881788969039916, |
| "ce_loss_39": 1.888492900133133, |
| "ce_loss_52": 1.4696861803531647, |
| "ce_loss_7": 2.994585871696472, |
| "epoch": 0.536, |
| "grad_norm": 15.457746807868878, |
| "kl_loss_13": 2427.2, |
| "kl_loss_26": 1243.7, |
| "kl_loss_39": 825.7, |
| "kl_loss_7": 3155.2, |
| "learning_rate": 0.00045089274187848144, |
| "loss": 3837.25, |
| "step": 5360 |
| }, |
| { |
| "ce_loss_13": 2.607480788230896, |
| "ce_loss_26": 2.0540437757968903, |
| "ce_loss_39": 1.854740971326828, |
| "ce_loss_52": 1.4391999766230583, |
| "ce_loss_7": 2.955876570940018, |
| "epoch": 0.537, |
| "grad_norm": 15.654126474702101, |
| "kl_loss_13": 2406.0, |
| "kl_loss_26": 1230.1, |
| "kl_loss_39": 814.4, |
| "kl_loss_7": 3128.8, |
| "learning_rate": 0.00044931399991859835, |
| "loss": 3791.2, |
| "step": 5370 |
| }, |
| { |
| "ce_loss_13": 2.5806866496801377, |
| "ce_loss_26": 2.0246922999620436, |
| "ce_loss_39": 1.8295573323965073, |
| "ce_loss_52": 1.433498626947403, |
| "ce_loss_7": 2.932585430145264, |
| "epoch": 0.538, |
| "grad_norm": 15.050745413645357, |
| "kl_loss_13": 2377.8, |
| "kl_loss_26": 1193.3, |
| "kl_loss_39": 790.4, |
| "kl_loss_7": 3116.4, |
| "learning_rate": 0.00044773576836617336, |
| "loss": 3771.2, |
| "step": 5380 |
| }, |
| { |
| "ce_loss_13": 2.5772230982780457, |
| "ce_loss_26": 2.0075444668531417, |
| "ce_loss_39": 1.8076122283935547, |
| "ce_loss_52": 1.3957727670669555, |
| "ce_loss_7": 2.9269763708114622, |
| "epoch": 0.539, |
| "grad_norm": 14.739351688834923, |
| "kl_loss_13": 2418.2, |
| "kl_loss_26": 1217.6, |
| "kl_loss_39": 806.9, |
| "kl_loss_7": 3154.0, |
| "learning_rate": 0.00044615806311398056, |
| "loss": 3764.85, |
| "step": 5390 |
| }, |
| { |
| "ce_loss_13": 2.6164508730173113, |
| "ce_loss_26": 2.0489900171756745, |
| "ce_loss_39": 1.8477211087942123, |
| "ce_loss_52": 1.4378462180495262, |
| "ce_loss_7": 2.9610529631376266, |
| "epoch": 0.54, |
| "grad_norm": 15.257899541784896, |
| "kl_loss_13": 2422.8, |
| "kl_loss_26": 1222.4, |
| "kl_loss_39": 806.9, |
| "kl_loss_7": 3149.2, |
| "learning_rate": 0.00044458090004949454, |
| "loss": 3789.8, |
| "step": 5400 |
| }, |
| { |
| "ce_loss_13": 2.639938807487488, |
| "ce_loss_26": 2.078041157126427, |
| "ce_loss_39": 1.8733548551797867, |
| "ce_loss_52": 1.4663413792848587, |
| "ce_loss_7": 2.978548914194107, |
| "epoch": 0.541, |
| "grad_norm": 15.643709958040844, |
| "kl_loss_13": 2399.4, |
| "kl_loss_26": 1217.8, |
| "kl_loss_39": 804.5, |
| "kl_loss_7": 3111.2, |
| "learning_rate": 0.0004430042950547297, |
| "loss": 3775.4, |
| "step": 5410 |
| }, |
| { |
| "ce_loss_13": 2.6770710349082947, |
| "ce_loss_26": 2.110589724779129, |
| "ce_loss_39": 1.9134115755558014, |
| "ce_loss_52": 1.5031546354293823, |
| "ce_loss_7": 3.0216497242450715, |
| "epoch": 0.542, |
| "grad_norm": 14.919746196312753, |
| "kl_loss_13": 2411.6, |
| "kl_loss_26": 1216.6, |
| "kl_loss_39": 809.1, |
| "kl_loss_7": 3147.6, |
| "learning_rate": 0.0004414282640060809, |
| "loss": 3768.95, |
| "step": 5420 |
| }, |
| { |
| "ce_loss_13": 2.6296884536743166, |
| "ce_loss_26": 2.0768262147903442, |
| "ce_loss_39": 1.8766031116247177, |
| "ce_loss_52": 1.4701504305005073, |
| "ce_loss_7": 2.9714462876319887, |
| "epoch": 0.543, |
| "grad_norm": 14.109291634946612, |
| "kl_loss_13": 2361.6, |
| "kl_loss_26": 1207.6, |
| "kl_loss_39": 794.5, |
| "kl_loss_7": 3073.6, |
| "learning_rate": 0.0004398528227741633, |
| "loss": 3734.5, |
| "step": 5430 |
| }, |
| { |
| "ce_loss_13": 2.5891071379184725, |
| "ce_loss_26": 2.024920642375946, |
| "ce_loss_39": 1.8224879026412963, |
| "ce_loss_52": 1.4225929498672485, |
| "ce_loss_7": 2.939086824655533, |
| "epoch": 0.544, |
| "grad_norm": 15.182283952769808, |
| "kl_loss_13": 2384.4, |
| "kl_loss_26": 1201.0, |
| "kl_loss_39": 785.9, |
| "kl_loss_7": 3118.0, |
| "learning_rate": 0.00043827798722365264, |
| "loss": 3724.5, |
| "step": 5440 |
| }, |
| { |
| "ce_loss_13": 2.521664083003998, |
| "ce_loss_26": 1.9635723412036896, |
| "ce_loss_39": 1.776253479719162, |
| "ce_loss_52": 1.3777382284402848, |
| "ce_loss_7": 2.8684565305709837, |
| "epoch": 0.545, |
| "grad_norm": 14.678349850224247, |
| "kl_loss_13": 2357.4, |
| "kl_loss_26": 1171.5, |
| "kl_loss_39": 776.8, |
| "kl_loss_7": 3081.2, |
| "learning_rate": 0.00043670377321312535, |
| "loss": 3748.6, |
| "step": 5450 |
| }, |
| { |
| "ce_loss_13": 2.6282208263874054, |
| "ce_loss_26": 2.063512918353081, |
| "ce_loss_39": 1.8621816724538802, |
| "ce_loss_52": 1.4465220913290977, |
| "ce_loss_7": 2.9661788761615755, |
| "epoch": 0.546, |
| "grad_norm": 14.95327030090302, |
| "kl_loss_13": 2410.4, |
| "kl_loss_26": 1226.2, |
| "kl_loss_39": 814.8, |
| "kl_loss_7": 3130.0, |
| "learning_rate": 0.0004351301965948991, |
| "loss": 3750.5, |
| "step": 5460 |
| }, |
| { |
| "ce_loss_13": 2.6448668360710146, |
| "ce_loss_26": 2.092715525627136, |
| "ce_loss_39": 1.8841570675373078, |
| "ce_loss_52": 1.465877577662468, |
| "ce_loss_7": 2.983266705274582, |
| "epoch": 0.547, |
| "grad_norm": 14.464098395004918, |
| "kl_loss_13": 2427.2, |
| "kl_loss_26": 1254.5, |
| "kl_loss_39": 827.1, |
| "kl_loss_7": 3138.8, |
| "learning_rate": 0.000433557273214873, |
| "loss": 3760.7, |
| "step": 5470 |
| }, |
| { |
| "ce_loss_13": 2.5597914129495623, |
| "ce_loss_26": 2.0002847105264663, |
| "ce_loss_39": 1.8126664906740189, |
| "ce_loss_52": 1.4252239495515824, |
| "ce_loss_7": 2.89780033826828, |
| "epoch": 0.548, |
| "grad_norm": 14.511201984799017, |
| "kl_loss_13": 2336.4, |
| "kl_loss_26": 1154.4, |
| "kl_loss_39": 760.7, |
| "kl_loss_7": 3046.4, |
| "learning_rate": 0.000431985018912368, |
| "loss": 3744.2, |
| "step": 5480 |
| }, |
| { |
| "ce_loss_13": 2.556107610464096, |
| "ce_loss_26": 2.0089694380760195, |
| "ce_loss_39": 1.8094982028007507, |
| "ce_loss_52": 1.4210678458213806, |
| "ce_loss_7": 2.8992061018943787, |
| "epoch": 0.549, |
| "grad_norm": 14.88322732328834, |
| "kl_loss_13": 2332.4, |
| "kl_loss_26": 1182.7, |
| "kl_loss_39": 771.7, |
| "kl_loss_7": 3056.4, |
| "learning_rate": 0.0004304134495199674, |
| "loss": 3725.95, |
| "step": 5490 |
| }, |
| { |
| "ce_loss_13": 2.5557271778583526, |
| "ce_loss_26": 2.015131750702858, |
| "ce_loss_39": 1.8132081747055053, |
| "ce_loss_52": 1.4253456503152848, |
| "ce_loss_7": 2.8894282221794128, |
| "epoch": 0.55, |
| "grad_norm": 14.509766958879414, |
| "kl_loss_13": 2342.4, |
| "kl_loss_26": 1192.8, |
| "kl_loss_39": 776.15, |
| "kl_loss_7": 3041.2, |
| "learning_rate": 0.0004288425808633575, |
| "loss": 3690.5, |
| "step": 5500 |
| }, |
| { |
| "ce_loss_13": 2.6498306572437285, |
| "ce_loss_26": 2.0864265114068985, |
| "ce_loss_39": 1.884456393122673, |
| "ce_loss_52": 1.4724615901708602, |
| "ce_loss_7": 2.994233113527298, |
| "epoch": 0.551, |
| "grad_norm": 15.223257504626806, |
| "kl_loss_13": 2419.8, |
| "kl_loss_26": 1227.5, |
| "kl_loss_39": 805.6, |
| "kl_loss_7": 3137.2, |
| "learning_rate": 0.0004272724287611684, |
| "loss": 3719.95, |
| "step": 5510 |
| }, |
| { |
| "ce_loss_13": 2.6063999772071837, |
| "ce_loss_26": 2.0453016996383666, |
| "ce_loss_39": 1.8452126443386079, |
| "ce_loss_52": 1.4395337477326393, |
| "ce_loss_7": 2.9420916736125946, |
| "epoch": 0.552, |
| "grad_norm": 14.707511636553823, |
| "kl_loss_13": 2389.2, |
| "kl_loss_26": 1204.1, |
| "kl_loss_39": 791.2, |
| "kl_loss_7": 3102.4, |
| "learning_rate": 0.00042570300902481425, |
| "loss": 3704.35, |
| "step": 5520 |
| }, |
| { |
| "ce_loss_13": 2.563195550441742, |
| "ce_loss_26": 1.9973567068576812, |
| "ce_loss_39": 1.7899492472410201, |
| "ce_loss_52": 1.3841874808073045, |
| "ce_loss_7": 2.9130555033683776, |
| "epoch": 0.553, |
| "grad_norm": 14.788339735855649, |
| "kl_loss_13": 2417.4, |
| "kl_loss_26": 1232.3, |
| "kl_loss_39": 809.2, |
| "kl_loss_7": 3150.4, |
| "learning_rate": 0.00042413433745833423, |
| "loss": 3716.65, |
| "step": 5530 |
| }, |
| { |
| "ce_loss_13": 2.5711400628089907, |
| "ce_loss_26": 2.0185573011636735, |
| "ce_loss_39": 1.8240427374839783, |
| "ce_loss_52": 1.4254193544387816, |
| "ce_loss_7": 2.9173736214637755, |
| "epoch": 0.554, |
| "grad_norm": 14.905176252765152, |
| "kl_loss_13": 2345.2, |
| "kl_loss_26": 1182.6, |
| "kl_loss_39": 781.0, |
| "kl_loss_7": 3069.2, |
| "learning_rate": 0.0004225664298582339, |
| "loss": 3692.3, |
| "step": 5540 |
| }, |
| { |
| "ce_loss_13": 2.5974901139736177, |
| "ce_loss_26": 2.046999195218086, |
| "ce_loss_39": 1.8451066941022873, |
| "ce_loss_52": 1.436999562382698, |
| "ce_loss_7": 2.9430564284324645, |
| "epoch": 0.555, |
| "grad_norm": 16.142335203190324, |
| "kl_loss_13": 2402.8, |
| "kl_loss_26": 1219.2, |
| "kl_loss_39": 797.9, |
| "kl_loss_7": 3125.6, |
| "learning_rate": 0.000420999302013325, |
| "loss": 3720.0, |
| "step": 5550 |
| }, |
| { |
| "ce_loss_13": 2.5756935298442842, |
| "ce_loss_26": 2.00499467253685, |
| "ce_loss_39": 1.8017524302005767, |
| "ce_loss_52": 1.4018217638134955, |
| "ce_loss_7": 2.9280818104743958, |
| "epoch": 0.556, |
| "grad_norm": 15.838882130746192, |
| "kl_loss_13": 2404.2, |
| "kl_loss_26": 1195.7, |
| "kl_loss_39": 787.5, |
| "kl_loss_7": 3148.8, |
| "learning_rate": 0.000419432969704568, |
| "loss": 3744.35, |
| "step": 5560 |
| }, |
| { |
| "ce_loss_13": 2.6485643923282622, |
| "ce_loss_26": 2.0993672519922257, |
| "ce_loss_39": 1.8959094911813736, |
| "ce_loss_52": 1.4813728883862496, |
| "ce_loss_7": 2.994894337654114, |
| "epoch": 0.557, |
| "grad_norm": 14.30299821768554, |
| "kl_loss_13": 2404.0, |
| "kl_loss_26": 1234.2, |
| "kl_loss_39": 810.35, |
| "kl_loss_7": 3130.8, |
| "learning_rate": 0.00041786744870491154, |
| "loss": 3698.4, |
| "step": 5570 |
| }, |
| { |
| "ce_loss_13": 2.6594822227954866, |
| "ce_loss_26": 2.0962711691856386, |
| "ce_loss_39": 1.8888987362384797, |
| "ce_loss_52": 1.477835801243782, |
| "ce_loss_7": 3.003460741043091, |
| "epoch": 0.558, |
| "grad_norm": 14.927142462067188, |
| "kl_loss_13": 2421.4, |
| "kl_loss_26": 1232.3, |
| "kl_loss_39": 804.5, |
| "kl_loss_7": 3142.8, |
| "learning_rate": 0.0004163027547791347, |
| "loss": 3696.4, |
| "step": 5580 |
| }, |
| { |
| "ce_loss_13": 2.6033630073070526, |
| "ce_loss_26": 2.052795875072479, |
| "ce_loss_39": 1.8524764776229858, |
| "ce_loss_52": 1.4589103490114212, |
| "ce_loss_7": 2.9362357556819916, |
| "epoch": 0.559, |
| "grad_norm": 15.035071624412899, |
| "kl_loss_13": 2332.2, |
| "kl_loss_26": 1173.9, |
| "kl_loss_39": 769.3, |
| "kl_loss_7": 3041.6, |
| "learning_rate": 0.0004147389036836881, |
| "loss": 3676.85, |
| "step": 5590 |
| }, |
| { |
| "ce_loss_13": 2.5536737203598023, |
| "ce_loss_26": 2.0111388891935347, |
| "ce_loss_39": 1.8144014358520508, |
| "ce_loss_52": 1.4328835308551788, |
| "ce_loss_7": 2.88922523856163, |
| "epoch": 0.56, |
| "grad_norm": 14.534788987143044, |
| "kl_loss_13": 2316.6, |
| "kl_loss_26": 1164.9, |
| "kl_loss_39": 758.2, |
| "kl_loss_7": 3024.4, |
| "learning_rate": 0.00041317591116653486, |
| "loss": 3694.55, |
| "step": 5600 |
| }, |
| { |
| "ce_loss_13": 2.606983852386475, |
| "ce_loss_26": 2.032891970872879, |
| "ce_loss_39": 1.8340051174163818, |
| "ce_loss_52": 1.4310238301753997, |
| "ce_loss_7": 2.9573661506175997, |
| "epoch": 0.561, |
| "grad_norm": 16.216124841765133, |
| "kl_loss_13": 2427.2, |
| "kl_loss_26": 1205.0, |
| "kl_loss_39": 794.3, |
| "kl_loss_7": 3156.0, |
| "learning_rate": 0.0004116137929669921, |
| "loss": 3679.35, |
| "step": 5610 |
| }, |
| { |
| "ce_loss_13": 2.5370231360197066, |
| "ce_loss_26": 1.9963056713342666, |
| "ce_loss_39": 1.803014099597931, |
| "ce_loss_52": 1.4229481190443038, |
| "ce_loss_7": 2.8732429146766663, |
| "epoch": 0.562, |
| "grad_norm": 15.330886130898222, |
| "kl_loss_13": 2277.4, |
| "kl_loss_26": 1124.3, |
| "kl_loss_39": 729.3, |
| "kl_loss_7": 2988.8, |
| "learning_rate": 0.00041005256481557305, |
| "loss": 3673.5, |
| "step": 5620 |
| }, |
| { |
| "ce_loss_13": 2.6017677545547486, |
| "ce_loss_26": 2.0595314621925356, |
| "ce_loss_39": 1.8631951808929443, |
| "ce_loss_52": 1.4627325683832169, |
| "ce_loss_7": 2.9314393043518066, |
| "epoch": 0.563, |
| "grad_norm": 14.465862339922571, |
| "kl_loss_13": 2332.2, |
| "kl_loss_26": 1190.7, |
| "kl_loss_39": 781.3, |
| "kl_loss_7": 3032.0, |
| "learning_rate": 0.00040849224243382767, |
| "loss": 3672.9, |
| "step": 5630 |
| }, |
| { |
| "ce_loss_13": 2.5594456523656843, |
| "ce_loss_26": 2.00023832321167, |
| "ce_loss_39": 1.8007198423147202, |
| "ce_loss_52": 1.4131150737404823, |
| "ce_loss_7": 2.904243141412735, |
| "epoch": 0.564, |
| "grad_norm": 15.017201171045896, |
| "kl_loss_13": 2360.2, |
| "kl_loss_26": 1180.4, |
| "kl_loss_39": 768.1, |
| "kl_loss_7": 3075.6, |
| "learning_rate": 0.000406932841534185, |
| "loss": 3693.75, |
| "step": 5640 |
| }, |
| { |
| "ce_loss_13": 2.595109748840332, |
| "ce_loss_26": 2.045265626907349, |
| "ce_loss_39": 1.85684574842453, |
| "ce_loss_52": 1.4735014230012893, |
| "ce_loss_7": 2.943417179584503, |
| "epoch": 0.565, |
| "grad_norm": 15.364099861784982, |
| "kl_loss_13": 2322.0, |
| "kl_loss_26": 1155.7, |
| "kl_loss_39": 755.4, |
| "kl_loss_7": 3048.4, |
| "learning_rate": 0.0004053743778197951, |
| "loss": 3668.9, |
| "step": 5650 |
| }, |
| { |
| "ce_loss_13": 2.582511156797409, |
| "ce_loss_26": 2.0224455118179323, |
| "ce_loss_39": 1.8295785069465638, |
| "ce_loss_52": 1.4373956888914108, |
| "ce_loss_7": 2.9102243304252626, |
| "epoch": 0.566, |
| "grad_norm": 14.693957764502153, |
| "kl_loss_13": 2342.4, |
| "kl_loss_26": 1176.8, |
| "kl_loss_39": 776.5, |
| "kl_loss_7": 3046.4, |
| "learning_rate": 0.0004038168669843697, |
| "loss": 3650.65, |
| "step": 5660 |
| }, |
| { |
| "ce_loss_13": 2.603584831953049, |
| "ce_loss_26": 2.04022336602211, |
| "ce_loss_39": 1.8417048037052155, |
| "ce_loss_52": 1.447445745766163, |
| "ce_loss_7": 2.9460166096687317, |
| "epoch": 0.567, |
| "grad_norm": 15.203721551974317, |
| "kl_loss_13": 2379.4, |
| "kl_loss_26": 1187.3, |
| "kl_loss_39": 777.4, |
| "kl_loss_7": 3104.8, |
| "learning_rate": 0.000402260324712026, |
| "loss": 3688.75, |
| "step": 5670 |
| }, |
| { |
| "ce_loss_13": 2.526816266775131, |
| "ce_loss_26": 1.9893273174762727, |
| "ce_loss_39": 1.793796670436859, |
| "ce_loss_52": 1.4289553046226502, |
| "ce_loss_7": 2.8634801030159, |
| "epoch": 0.568, |
| "grad_norm": 14.842310660649245, |
| "kl_loss_13": 2254.2, |
| "kl_loss_26": 1116.4, |
| "kl_loss_39": 714.95, |
| "kl_loss_7": 2954.8, |
| "learning_rate": 0.00040070476667712743, |
| "loss": 3637.75, |
| "step": 5680 |
| }, |
| { |
| "ce_loss_13": 2.615302687883377, |
| "ce_loss_26": 2.059708908200264, |
| "ce_loss_39": 1.8555728137493133, |
| "ce_loss_52": 1.4557225406169891, |
| "ce_loss_7": 2.9577668845653533, |
| "epoch": 0.569, |
| "grad_norm": 14.742701130409761, |
| "kl_loss_13": 2387.6, |
| "kl_loss_26": 1214.9, |
| "kl_loss_39": 792.5, |
| "kl_loss_7": 3105.2, |
| "learning_rate": 0.0003991502085441259, |
| "loss": 3676.05, |
| "step": 5690 |
| }, |
| { |
| "ce_loss_13": 2.5645705699920653, |
| "ce_loss_26": 2.007223817706108, |
| "ce_loss_39": 1.8173159271478654, |
| "ce_loss_52": 1.4376816004514694, |
| "ce_loss_7": 2.895137590169907, |
| "epoch": 0.57, |
| "grad_norm": 15.460594692772787, |
| "kl_loss_13": 2314.8, |
| "kl_loss_26": 1152.0, |
| "kl_loss_39": 753.2, |
| "kl_loss_7": 3024.8, |
| "learning_rate": 0.0003975966659674047, |
| "loss": 3621.95, |
| "step": 5700 |
| }, |
| { |
| "ce_loss_13": 2.559953585267067, |
| "ce_loss_26": 2.032286322116852, |
| "ce_loss_39": 1.845168125629425, |
| "ce_loss_52": 1.461988940834999, |
| "ce_loss_7": 2.9015457332134247, |
| "epoch": 0.571, |
| "grad_norm": 15.171579029800053, |
| "kl_loss_13": 2278.8, |
| "kl_loss_26": 1159.2, |
| "kl_loss_39": 758.6, |
| "kl_loss_7": 2982.0, |
| "learning_rate": 0.0003960441545911204, |
| "loss": 3675.95, |
| "step": 5710 |
| }, |
| { |
| "ce_loss_13": 2.6008632302284242, |
| "ce_loss_26": 2.043924775719643, |
| "ce_loss_39": 1.8485127180814742, |
| "ce_loss_52": 1.4697089165449142, |
| "ce_loss_7": 2.9355547785758973, |
| "epoch": 0.572, |
| "grad_norm": 14.834558653485171, |
| "kl_loss_13": 2319.4, |
| "kl_loss_26": 1152.8, |
| "kl_loss_39": 750.0, |
| "kl_loss_7": 3027.6, |
| "learning_rate": 0.0003944926900490452, |
| "loss": 3638.65, |
| "step": 5720 |
| }, |
| { |
| "ce_loss_13": 2.532833296060562, |
| "ce_loss_26": 1.9711334377527236, |
| "ce_loss_39": 1.77955681681633, |
| "ce_loss_52": 1.4007374957203864, |
| "ce_loss_7": 2.8781362950801848, |
| "epoch": 0.573, |
| "grad_norm": 16.10932164493431, |
| "kl_loss_13": 2337.8, |
| "kl_loss_26": 1151.9, |
| "kl_loss_39": 757.3, |
| "kl_loss_7": 3064.8, |
| "learning_rate": 0.0003929422879644099, |
| "loss": 3650.2, |
| "step": 5730 |
| }, |
| { |
| "ce_loss_13": 2.5908755481243135, |
| "ce_loss_26": 2.0414842426776887, |
| "ce_loss_39": 1.8504431873559952, |
| "ce_loss_52": 1.4606543123722076, |
| "ce_loss_7": 2.926515054702759, |
| "epoch": 0.574, |
| "grad_norm": 14.72871950232802, |
| "kl_loss_13": 2333.4, |
| "kl_loss_26": 1164.1, |
| "kl_loss_39": 763.5, |
| "kl_loss_7": 3044.4, |
| "learning_rate": 0.0003913929639497462, |
| "loss": 3615.45, |
| "step": 5740 |
| }, |
| { |
| "ce_loss_13": 2.591219651699066, |
| "ce_loss_26": 2.044692638516426, |
| "ce_loss_39": 1.8468156188726426, |
| "ce_loss_52": 1.452781331539154, |
| "ce_loss_7": 2.932692265510559, |
| "epoch": 0.575, |
| "grad_norm": 14.536189899304834, |
| "kl_loss_13": 2345.4, |
| "kl_loss_26": 1189.2, |
| "kl_loss_39": 775.0, |
| "kl_loss_7": 3054.8, |
| "learning_rate": 0.00038984473360672965, |
| "loss": 3631.3, |
| "step": 5750 |
| }, |
| { |
| "ce_loss_13": 2.555169379711151, |
| "ce_loss_26": 2.0133784860372543, |
| "ce_loss_39": 1.8234185576438904, |
| "ce_loss_52": 1.4436144948005676, |
| "ce_loss_7": 2.896002060174942, |
| "epoch": 0.576, |
| "grad_norm": 15.34263044515968, |
| "kl_loss_13": 2285.0, |
| "kl_loss_26": 1136.7, |
| "kl_loss_39": 742.35, |
| "kl_loss_7": 2993.2, |
| "learning_rate": 0.0003882976125260229, |
| "loss": 3658.4, |
| "step": 5760 |
| }, |
| { |
| "ce_loss_13": 2.502971774339676, |
| "ce_loss_26": 1.9493688374757767, |
| "ce_loss_39": 1.7570990473031998, |
| "ce_loss_52": 1.3885455280542374, |
| "ce_loss_7": 2.8447738111019136, |
| "epoch": 0.577, |
| "grad_norm": 14.660193254554198, |
| "kl_loss_13": 2295.0, |
| "kl_loss_26": 1140.5, |
| "kl_loss_39": 734.0, |
| "kl_loss_7": 3013.6, |
| "learning_rate": 0.00038675161628711776, |
| "loss": 3632.8, |
| "step": 5770 |
| }, |
| { |
| "ce_loss_13": 2.5624574303627012, |
| "ce_loss_26": 2.0051666617393495, |
| "ce_loss_39": 1.8087779253721237, |
| "ce_loss_52": 1.41810100376606, |
| "ce_loss_7": 2.895737165212631, |
| "epoch": 0.578, |
| "grad_norm": 14.482088330386649, |
| "kl_loss_13": 2343.4, |
| "kl_loss_26": 1176.1, |
| "kl_loss_39": 766.9, |
| "kl_loss_7": 3047.6, |
| "learning_rate": 0.0003852067604581794, |
| "loss": 3602.85, |
| "step": 5780 |
| }, |
| { |
| "ce_loss_13": 2.5270320236682893, |
| "ce_loss_26": 1.9888224333524704, |
| "ce_loss_39": 1.803659090399742, |
| "ce_loss_52": 1.428774857521057, |
| "ce_loss_7": 2.8705179512500765, |
| "epoch": 0.579, |
| "grad_norm": 14.979884111866252, |
| "kl_loss_13": 2273.2, |
| "kl_loss_26": 1136.8, |
| "kl_loss_39": 739.9, |
| "kl_loss_7": 2982.8, |
| "learning_rate": 0.0003836630605958888, |
| "loss": 3603.35, |
| "step": 5790 |
| }, |
| { |
| "ce_loss_13": 2.5794149696826936, |
| "ce_loss_26": 2.0252732813358305, |
| "ce_loss_39": 1.8308797210454941, |
| "ce_loss_52": 1.4384155124425888, |
| "ce_loss_7": 2.9223524034023285, |
| "epoch": 0.58, |
| "grad_norm": 14.8686548037009, |
| "kl_loss_13": 2322.0, |
| "kl_loss_26": 1158.7, |
| "kl_loss_39": 758.3, |
| "kl_loss_7": 3044.0, |
| "learning_rate": 0.0003821205322452863, |
| "loss": 3636.15, |
| "step": 5800 |
| }, |
| { |
| "ce_loss_13": 2.6002571165561674, |
| "ce_loss_26": 2.0497290968894957, |
| "ce_loss_39": 1.8579988300800323, |
| "ce_loss_52": 1.4732319116592407, |
| "ce_loss_7": 2.9367463052272798, |
| "epoch": 0.581, |
| "grad_norm": 15.52768956484863, |
| "kl_loss_13": 2303.2, |
| "kl_loss_26": 1143.4, |
| "kl_loss_39": 743.0, |
| "kl_loss_7": 3018.4, |
| "learning_rate": 0.0003805791909396155, |
| "loss": 3651.1, |
| "step": 5810 |
| }, |
| { |
| "ce_loss_13": 2.5309071093797684, |
| "ce_loss_26": 1.9803194522857666, |
| "ce_loss_39": 1.7861102789640426, |
| "ce_loss_52": 1.410745631158352, |
| "ce_loss_7": 2.8677128195762633, |
| "epoch": 0.582, |
| "grad_norm": 14.66111038706468, |
| "kl_loss_13": 2302.0, |
| "kl_loss_26": 1148.3, |
| "kl_loss_39": 748.9, |
| "kl_loss_7": 3008.0, |
| "learning_rate": 0.0003790390522001662, |
| "loss": 3564.65, |
| "step": 5820 |
| }, |
| { |
| "ce_loss_13": 2.5017096638679504, |
| "ce_loss_26": 1.94625324010849, |
| "ce_loss_39": 1.749154046177864, |
| "ce_loss_52": 1.3770527362823486, |
| "ce_loss_7": 2.8349112212657928, |
| "epoch": 0.583, |
| "grad_norm": 14.629798672555188, |
| "kl_loss_13": 2294.6, |
| "kl_loss_26": 1141.8, |
| "kl_loss_39": 731.9, |
| "kl_loss_7": 3003.2, |
| "learning_rate": 0.0003775001315361183, |
| "loss": 3613.35, |
| "step": 5830 |
| }, |
| { |
| "ce_loss_13": 2.5965979039669036, |
| "ce_loss_26": 2.0513822197914124, |
| "ce_loss_39": 1.8568279683589934, |
| "ce_loss_52": 1.486306893825531, |
| "ce_loss_7": 2.932318705320358, |
| "epoch": 0.584, |
| "grad_norm": 15.483089802029363, |
| "kl_loss_13": 2285.6, |
| "kl_loss_26": 1133.6, |
| "kl_loss_39": 729.5, |
| "kl_loss_7": 2989.2, |
| "learning_rate": 0.0003759624444443858, |
| "loss": 3579.55, |
| "step": 5840 |
| }, |
| { |
| "ce_loss_13": 2.584410917758942, |
| "ce_loss_26": 2.022917777299881, |
| "ce_loss_39": 1.8257231026887895, |
| "ce_loss_52": 1.437566375732422, |
| "ce_loss_7": 2.9283429443836213, |
| "epoch": 0.585, |
| "grad_norm": 15.042900713620126, |
| "kl_loss_13": 2353.8, |
| "kl_loss_26": 1173.8, |
| "kl_loss_39": 763.0, |
| "kl_loss_7": 3075.6, |
| "learning_rate": 0.00037442600640946044, |
| "loss": 3619.6, |
| "step": 5850 |
| }, |
| { |
| "ce_loss_13": 2.5062096178531648, |
| "ce_loss_26": 1.9529441505670548, |
| "ce_loss_39": 1.756454050540924, |
| "ce_loss_52": 1.3877734661102294, |
| "ce_loss_7": 2.852471035718918, |
| "epoch": 0.586, |
| "grad_norm": 15.637281763013858, |
| "kl_loss_13": 2307.4, |
| "kl_loss_26": 1145.7, |
| "kl_loss_39": 736.7, |
| "kl_loss_7": 3028.0, |
| "learning_rate": 0.00037289083290325663, |
| "loss": 3605.0, |
| "step": 5860 |
| }, |
| { |
| "ce_loss_13": 2.550817745923996, |
| "ce_loss_26": 2.0112617135047914, |
| "ce_loss_39": 1.8207211345434189, |
| "ce_loss_52": 1.4542410910129546, |
| "ce_loss_7": 2.8887066781520843, |
| "epoch": 0.587, |
| "grad_norm": 14.48910345968502, |
| "kl_loss_13": 2241.0, |
| "kl_loss_26": 1095.1, |
| "kl_loss_39": 709.9, |
| "kl_loss_7": 2950.4, |
| "learning_rate": 0.0003713569393849543, |
| "loss": 3628.55, |
| "step": 5870 |
| }, |
| { |
| "ce_loss_13": 2.5198557287454606, |
| "ce_loss_26": 1.9734882295131684, |
| "ce_loss_39": 1.7831378549337387, |
| "ce_loss_52": 1.4195168539881706, |
| "ce_loss_7": 2.8633585631847382, |
| "epoch": 0.588, |
| "grad_norm": 14.606482629979089, |
| "kl_loss_13": 2270.0, |
| "kl_loss_26": 1121.9, |
| "kl_loss_39": 724.8, |
| "kl_loss_7": 2990.8, |
| "learning_rate": 0.00036982434130084397, |
| "loss": 3605.15, |
| "step": 5880 |
| }, |
| { |
| "ce_loss_13": 2.5002260982990263, |
| "ce_loss_26": 1.9510222643613815, |
| "ce_loss_39": 1.760866141319275, |
| "ce_loss_52": 1.3993624940514564, |
| "ce_loss_7": 2.8445383846759795, |
| "epoch": 0.589, |
| "grad_norm": 15.157934768830916, |
| "kl_loss_13": 2246.4, |
| "kl_loss_26": 1094.8, |
| "kl_loss_39": 699.8, |
| "kl_loss_7": 2967.6, |
| "learning_rate": 0.00036829305408417166, |
| "loss": 3580.45, |
| "step": 5890 |
| }, |
| { |
| "ce_loss_13": 2.4792177438735963, |
| "ce_loss_26": 1.9329254776239395, |
| "ce_loss_39": 1.7395812034606934, |
| "ce_loss_52": 1.3731168687343598, |
| "ce_loss_7": 2.8229923218488695, |
| "epoch": 0.59, |
| "grad_norm": 14.656512940113272, |
| "kl_loss_13": 2271.0, |
| "kl_loss_26": 1112.9, |
| "kl_loss_39": 720.45, |
| "kl_loss_7": 2995.2, |
| "learning_rate": 0.0003667630931549826, |
| "loss": 3601.65, |
| "step": 5900 |
| }, |
| { |
| "ce_loss_13": 2.6107949793338774, |
| "ce_loss_26": 2.0506039649248122, |
| "ce_loss_39": 1.8547100484371186, |
| "ce_loss_52": 1.4670722007751464, |
| "ce_loss_7": 2.947498029470444, |
| "epoch": 0.591, |
| "grad_norm": 15.535800254061792, |
| "kl_loss_13": 2341.4, |
| "kl_loss_26": 1168.3, |
| "kl_loss_39": 753.2, |
| "kl_loss_7": 3065.2, |
| "learning_rate": 0.00036523447391996613, |
| "loss": 3580.55, |
| "step": 5910 |
| }, |
| { |
| "ce_loss_13": 2.5217359244823454, |
| "ce_loss_26": 1.988610166311264, |
| "ce_loss_39": 1.8043963432312011, |
| "ce_loss_52": 1.4320271372795106, |
| "ce_loss_7": 2.8523899018764496, |
| "epoch": 0.592, |
| "grad_norm": 15.144566757103092, |
| "kl_loss_13": 2242.6, |
| "kl_loss_26": 1108.7, |
| "kl_loss_39": 722.4, |
| "kl_loss_7": 2935.6, |
| "learning_rate": 0.00036370721177230114, |
| "loss": 3609.65, |
| "step": 5920 |
| }, |
| { |
| "ce_loss_13": 2.5657771229743958, |
| "ce_loss_26": 2.024441570043564, |
| "ce_loss_39": 1.8324565082788467, |
| "ce_loss_52": 1.4527549773454667, |
| "ce_loss_7": 2.9029002487659454, |
| "epoch": 0.593, |
| "grad_norm": 14.305280391890491, |
| "kl_loss_13": 2257.4, |
| "kl_loss_26": 1128.5, |
| "kl_loss_39": 727.25, |
| "kl_loss_7": 2971.6, |
| "learning_rate": 0.00036218132209150044, |
| "loss": 3561.25, |
| "step": 5930 |
| }, |
| { |
| "ce_loss_13": 2.5320515751838686, |
| "ce_loss_26": 1.993978601694107, |
| "ce_loss_39": 1.7951736986637115, |
| "ce_loss_52": 1.4332455009222032, |
| "ce_loss_7": 2.8734976410865785, |
| "epoch": 0.594, |
| "grad_norm": 14.731628028248535, |
| "kl_loss_13": 2276.0, |
| "kl_loss_26": 1134.7, |
| "kl_loss_39": 725.65, |
| "kl_loss_7": 2984.8, |
| "learning_rate": 0.0003606568202432562, |
| "loss": 3568.15, |
| "step": 5940 |
| }, |
| { |
| "ce_loss_13": 2.467684972286224, |
| "ce_loss_26": 1.9258863091468812, |
| "ce_loss_39": 1.7374547556042672, |
| "ce_loss_52": 1.3822436913847924, |
| "ce_loss_7": 2.8027748644351957, |
| "epoch": 0.595, |
| "grad_norm": 13.952374257617315, |
| "kl_loss_13": 2221.6, |
| "kl_loss_26": 1099.3, |
| "kl_loss_39": 702.85, |
| "kl_loss_7": 2916.8, |
| "learning_rate": 0.0003591337215792851, |
| "loss": 3573.15, |
| "step": 5950 |
| }, |
| { |
| "ce_loss_13": 2.5612458407878878, |
| "ce_loss_26": 2.005467265844345, |
| "ce_loss_39": 1.8046582967042923, |
| "ce_loss_52": 1.4227981299161911, |
| "ce_loss_7": 2.9120794773101806, |
| "epoch": 0.596, |
| "grad_norm": 14.684621517642583, |
| "kl_loss_13": 2333.2, |
| "kl_loss_26": 1167.6, |
| "kl_loss_39": 756.5, |
| "kl_loss_7": 3067.6, |
| "learning_rate": 0.00035761204143717383, |
| "loss": 3598.3, |
| "step": 5960 |
| }, |
| { |
| "ce_loss_13": 2.539260357618332, |
| "ce_loss_26": 1.997820395231247, |
| "ce_loss_39": 1.7995415717363357, |
| "ce_loss_52": 1.4228885769844055, |
| "ce_loss_7": 2.8742454588413238, |
| "epoch": 0.597, |
| "grad_norm": 14.834521968922566, |
| "kl_loss_13": 2285.6, |
| "kl_loss_26": 1143.0, |
| "kl_loss_39": 733.0, |
| "kl_loss_7": 2992.4, |
| "learning_rate": 0.0003560917951402245, |
| "loss": 3549.75, |
| "step": 5970 |
| }, |
| { |
| "ce_loss_13": 2.514337483048439, |
| "ce_loss_26": 1.9719054281711579, |
| "ce_loss_39": 1.7803026676177978, |
| "ce_loss_52": 1.4242254197597504, |
| "ce_loss_7": 2.8526304841041563, |
| "epoch": 0.598, |
| "grad_norm": 15.184376306356116, |
| "kl_loss_13": 2262.4, |
| "kl_loss_26": 1100.6, |
| "kl_loss_39": 700.55, |
| "kl_loss_7": 2977.6, |
| "learning_rate": 0.00035457299799730046, |
| "loss": 3595.65, |
| "step": 5980 |
| }, |
| { |
| "ce_loss_13": 2.5518115133047106, |
| "ce_loss_26": 2.011714455485344, |
| "ce_loss_39": 1.8179199546575546, |
| "ce_loss_52": 1.4457479059696197, |
| "ce_loss_7": 2.903665816783905, |
| "epoch": 0.599, |
| "grad_norm": 17.70688578121031, |
| "kl_loss_13": 2293.4, |
| "kl_loss_26": 1146.1, |
| "kl_loss_39": 736.9, |
| "kl_loss_7": 3022.8, |
| "learning_rate": 0.0003530556653026721, |
| "loss": 3553.45, |
| "step": 5990 |
| }, |
| { |
| "ce_loss_13": 2.5623465538024903, |
| "ce_loss_26": 2.0235190600156785, |
| "ce_loss_39": 1.8284282714128495, |
| "ce_loss_52": 1.4427544534206391, |
| "ce_loss_7": 2.8994126319885254, |
| "epoch": 0.6, |
| "grad_norm": 14.620466600171055, |
| "kl_loss_13": 2310.4, |
| "kl_loss_26": 1152.8, |
| "kl_loss_39": 754.8, |
| "kl_loss_7": 3022.0, |
| "learning_rate": 0.00035153981233586274, |
| "loss": 3592.9, |
| "step": 6000 |
| }, |
| { |
| "ce_loss_13": 2.592492914199829, |
| "ce_loss_26": 2.0241310060024262, |
| "ce_loss_39": 1.8282486945390701, |
| "ce_loss_52": 1.4501359939575196, |
| "ce_loss_7": 2.934645599126816, |
| "epoch": 0.601, |
| "grad_norm": 15.179282109125554, |
| "kl_loss_13": 2355.8, |
| "kl_loss_26": 1159.4, |
| "kl_loss_39": 745.9, |
| "kl_loss_7": 3082.4, |
| "learning_rate": 0.00035002545436149473, |
| "loss": 3551.8, |
| "step": 6010 |
| }, |
| { |
| "ce_loss_13": 2.4968257695436478, |
| "ce_loss_26": 1.950178360939026, |
| "ce_loss_39": 1.7565111339092254, |
| "ce_loss_52": 1.3940304026007653, |
| "ce_loss_7": 2.8332657337188722, |
| "epoch": 0.602, |
| "grad_norm": 15.101566569350041, |
| "kl_loss_13": 2271.8, |
| "kl_loss_26": 1114.8, |
| "kl_loss_39": 717.3, |
| "kl_loss_7": 2986.8, |
| "learning_rate": 0.0003485126066291364, |
| "loss": 3553.3, |
| "step": 6020 |
| }, |
| { |
| "ce_loss_13": 2.5295323967933654, |
| "ce_loss_26": 1.9963963776826859, |
| "ce_loss_39": 1.8044916093349457, |
| "ce_loss_52": 1.446289749443531, |
| "ce_loss_7": 2.8674661338329317, |
| "epoch": 0.603, |
| "grad_norm": 14.29880611806114, |
| "kl_loss_13": 2227.0, |
| "kl_loss_26": 1097.1, |
| "kl_loss_39": 702.25, |
| "kl_loss_7": 2938.0, |
| "learning_rate": 0.0003470012843731476, |
| "loss": 3534.85, |
| "step": 6030 |
| }, |
| { |
| "ce_loss_13": 2.498071011900902, |
| "ce_loss_26": 1.9493587136268615, |
| "ce_loss_39": 1.76174655854702, |
| "ce_loss_52": 1.407905325293541, |
| "ce_loss_7": 2.847205549478531, |
| "epoch": 0.604, |
| "grad_norm": 14.483859928846364, |
| "kl_loss_13": 2244.8, |
| "kl_loss_26": 1096.9, |
| "kl_loss_39": 701.8, |
| "kl_loss_7": 2975.2, |
| "learning_rate": 0.00034549150281252633, |
| "loss": 3514.35, |
| "step": 6040 |
| }, |
| { |
| "ce_loss_13": 2.4966412246227265, |
| "ce_loss_26": 1.9553426146507262, |
| "ce_loss_39": 1.7613533914089203, |
| "ce_loss_52": 1.4072722673416138, |
| "ce_loss_7": 2.8383829057216645, |
| "epoch": 0.605, |
| "grad_norm": 14.78851376475336, |
| "kl_loss_13": 2235.6, |
| "kl_loss_26": 1090.3, |
| "kl_loss_39": 695.0, |
| "kl_loss_7": 2954.0, |
| "learning_rate": 0.0003439832771507565, |
| "loss": 3563.65, |
| "step": 6050 |
| }, |
| { |
| "ce_loss_13": 2.502397668361664, |
| "ce_loss_26": 1.9571218103170396, |
| "ce_loss_39": 1.7665416598320007, |
| "ce_loss_52": 1.412816160917282, |
| "ce_loss_7": 2.8454441905021666, |
| "epoch": 0.606, |
| "grad_norm": 15.4871318885597, |
| "kl_loss_13": 2242.0, |
| "kl_loss_26": 1096.6, |
| "kl_loss_39": 696.5, |
| "kl_loss_7": 2962.4, |
| "learning_rate": 0.0003424766225756537, |
| "loss": 3510.25, |
| "step": 6060 |
| }, |
| { |
| "ce_loss_13": 2.5223917841911314, |
| "ce_loss_26": 1.9723280429840089, |
| "ce_loss_39": 1.7729612857103347, |
| "ce_loss_52": 1.4048843801021575, |
| "ce_loss_7": 2.8645106673240663, |
| "epoch": 0.607, |
| "grad_norm": 15.573595841239559, |
| "kl_loss_13": 2300.6, |
| "kl_loss_26": 1138.2, |
| "kl_loss_39": 727.85, |
| "kl_loss_7": 3019.6, |
| "learning_rate": 0.00034097155425921255, |
| "loss": 3527.0, |
| "step": 6070 |
| }, |
| { |
| "ce_loss_13": 2.491760790348053, |
| "ce_loss_26": 1.9636650770902633, |
| "ce_loss_39": 1.7728582590818405, |
| "ce_loss_52": 1.4259491577744483, |
| "ce_loss_7": 2.821820729970932, |
| "epoch": 0.608, |
| "grad_norm": 14.653123046442355, |
| "kl_loss_13": 2168.6, |
| "kl_loss_26": 1063.1, |
| "kl_loss_39": 679.3, |
| "kl_loss_7": 2864.4, |
| "learning_rate": 0.0003394680873574546, |
| "loss": 3528.9, |
| "step": 6080 |
| }, |
| { |
| "ce_loss_13": 2.5085246324539185, |
| "ce_loss_26": 1.9797437161207199, |
| "ce_loss_39": 1.7878784984350204, |
| "ce_loss_52": 1.4316603004932404, |
| "ce_loss_7": 2.847382205724716, |
| "epoch": 0.609, |
| "grad_norm": 14.913942335206821, |
| "kl_loss_13": 2211.2, |
| "kl_loss_26": 1091.8, |
| "kl_loss_39": 696.95, |
| "kl_loss_7": 2922.0, |
| "learning_rate": 0.0003379662370102747, |
| "loss": 3549.95, |
| "step": 6090 |
| }, |
| { |
| "ce_loss_13": 2.4759395986795427, |
| "ce_loss_26": 1.949927881360054, |
| "ce_loss_39": 1.7604204803705215, |
| "ce_loss_52": 1.41001408547163, |
| "ce_loss_7": 2.8064420104026793, |
| "epoch": 0.61, |
| "grad_norm": 14.676610718900648, |
| "kl_loss_13": 2185.6, |
| "kl_loss_26": 1077.2, |
| "kl_loss_39": 688.0, |
| "kl_loss_7": 2875.6, |
| "learning_rate": 0.0003364660183412892, |
| "loss": 3507.8, |
| "step": 6100 |
| }, |
| { |
| "ce_loss_13": 2.531206899881363, |
| "ce_loss_26": 1.9836675137281419, |
| "ce_loss_39": 1.7869503110647202, |
| "ce_loss_52": 1.432834729552269, |
| "ce_loss_7": 2.873502719402313, |
| "epoch": 0.611, |
| "grad_norm": 14.829577774305747, |
| "kl_loss_13": 2255.0, |
| "kl_loss_26": 1106.2, |
| "kl_loss_39": 703.75, |
| "kl_loss_7": 2966.4, |
| "learning_rate": 0.0003349674464576834, |
| "loss": 3495.25, |
| "step": 6110 |
| }, |
| { |
| "ce_loss_13": 2.514421796798706, |
| "ce_loss_26": 1.9697007417678833, |
| "ce_loss_39": 1.780946272611618, |
| "ce_loss_52": 1.4202388614416122, |
| "ce_loss_7": 2.8420049071311952, |
| "epoch": 0.612, |
| "grad_norm": 14.8111788086498, |
| "kl_loss_13": 2230.2, |
| "kl_loss_26": 1101.1, |
| "kl_loss_39": 705.2, |
| "kl_loss_7": 2920.0, |
| "learning_rate": 0.00033347053645005966, |
| "loss": 3492.65, |
| "step": 6120 |
| }, |
| { |
| "ce_loss_13": 2.537186449766159, |
| "ce_loss_26": 2.004375171661377, |
| "ce_loss_39": 1.8157330989837646, |
| "ce_loss_52": 1.451986312866211, |
| "ce_loss_7": 2.878436690568924, |
| "epoch": 0.613, |
| "grad_norm": 15.401404522918961, |
| "kl_loss_13": 2242.0, |
| "kl_loss_26": 1113.3, |
| "kl_loss_39": 718.55, |
| "kl_loss_7": 2945.6, |
| "learning_rate": 0.00033197530339228485, |
| "loss": 3459.1, |
| "step": 6130 |
| }, |
| { |
| "ce_loss_13": 2.506669583916664, |
| "ce_loss_26": 1.9629664570093155, |
| "ce_loss_39": 1.78002208173275, |
| "ce_loss_52": 1.4117600202560425, |
| "ce_loss_7": 2.847608286142349, |
| "epoch": 0.614, |
| "grad_norm": 15.390199704579972, |
| "kl_loss_13": 2259.8, |
| "kl_loss_26": 1112.7, |
| "kl_loss_39": 727.8, |
| "kl_loss_7": 2986.8, |
| "learning_rate": 0.00033048176234133967, |
| "loss": 3537.45, |
| "step": 6140 |
| }, |
| { |
| "ce_loss_13": 2.5588534235954286, |
| "ce_loss_26": 2.024568209052086, |
| "ce_loss_39": 1.8284731358289719, |
| "ce_loss_52": 1.4587910890579223, |
| "ce_loss_7": 2.8921659886837006, |
| "epoch": 0.615, |
| "grad_norm": 14.629003698363993, |
| "kl_loss_13": 2259.8, |
| "kl_loss_26": 1126.9, |
| "kl_loss_39": 724.0, |
| "kl_loss_7": 2958.8, |
| "learning_rate": 0.0003289899283371657, |
| "loss": 3536.8, |
| "step": 6150 |
| }, |
| { |
| "ce_loss_13": 2.4743064284324645, |
| "ce_loss_26": 1.9476153373718261, |
| "ce_loss_39": 1.7604322880506516, |
| "ce_loss_52": 1.4059417188167571, |
| "ce_loss_7": 2.7994522780179976, |
| "epoch": 0.616, |
| "grad_norm": 15.287517673195092, |
| "kl_loss_13": 2207.6, |
| "kl_loss_26": 1094.2, |
| "kl_loss_39": 708.05, |
| "kl_loss_7": 2892.4, |
| "learning_rate": 0.0003274998164025148, |
| "loss": 3522.0, |
| "step": 6160 |
| }, |
| { |
| "ce_loss_13": 2.61488196849823, |
| "ce_loss_26": 2.067342773079872, |
| "ce_loss_39": 1.8675355583429336, |
| "ce_loss_52": 1.4927641093730926, |
| "ce_loss_7": 2.948505789041519, |
| "epoch": 0.617, |
| "grad_norm": 14.482727532648099, |
| "kl_loss_13": 2288.0, |
| "kl_loss_26": 1139.7, |
| "kl_loss_39": 729.35, |
| "kl_loss_7": 2993.6, |
| "learning_rate": 0.0003260114415427975, |
| "loss": 3494.95, |
| "step": 6170 |
| }, |
| { |
| "ce_loss_13": 2.5525584638118746, |
| "ce_loss_26": 1.9941669285297394, |
| "ce_loss_39": 1.7988057792186738, |
| "ce_loss_52": 1.4276691943407058, |
| "ce_loss_7": 2.888332962989807, |
| "epoch": 0.618, |
| "grad_norm": 15.194650585699007, |
| "kl_loss_13": 2319.2, |
| "kl_loss_26": 1148.8, |
| "kl_loss_39": 742.3, |
| "kl_loss_7": 3022.8, |
| "learning_rate": 0.0003245248187459323, |
| "loss": 3535.8, |
| "step": 6180 |
| }, |
| { |
| "ce_loss_13": 2.5258300840854644, |
| "ce_loss_26": 1.9903143167495727, |
| "ce_loss_39": 1.8010086834430694, |
| "ce_loss_52": 1.4478329718112946, |
| "ce_loss_7": 2.86256263256073, |
| "epoch": 0.619, |
| "grad_norm": 14.652051426511987, |
| "kl_loss_13": 2228.6, |
| "kl_loss_26": 1088.0, |
| "kl_loss_39": 694.0, |
| "kl_loss_7": 2940.4, |
| "learning_rate": 0.00032303996298219416, |
| "loss": 3513.7, |
| "step": 6190 |
| }, |
| { |
| "ce_loss_13": 2.5733933985233306, |
| "ce_loss_26": 2.0336378514766693, |
| "ce_loss_39": 1.8403378069400786, |
| "ce_loss_52": 1.4681322902441025, |
| "ce_loss_7": 2.920198345184326, |
| "epoch": 0.62, |
| "grad_norm": 15.056132330426859, |
| "kl_loss_13": 2280.4, |
| "kl_loss_26": 1131.3, |
| "kl_loss_39": 733.75, |
| "kl_loss_7": 3002.8, |
| "learning_rate": 0.00032155688920406414, |
| "loss": 3507.7, |
| "step": 6200 |
| }, |
| { |
| "ce_loss_13": 2.5029452949762345, |
| "ce_loss_26": 1.9643093675374985, |
| "ce_loss_39": 1.7774474427103997, |
| "ce_loss_52": 1.4176854699850083, |
| "ce_loss_7": 2.823590323328972, |
| "epoch": 0.621, |
| "grad_norm": 14.49052860339056, |
| "kl_loss_13": 2220.0, |
| "kl_loss_26": 1086.1, |
| "kl_loss_39": 699.45, |
| "kl_loss_7": 2902.8, |
| "learning_rate": 0.0003200756123460788, |
| "loss": 3535.45, |
| "step": 6210 |
| }, |
| { |
| "ce_loss_13": 2.489814931154251, |
| "ce_loss_26": 1.9501473933458329, |
| "ce_loss_39": 1.7617685228586197, |
| "ce_loss_52": 1.41129230260849, |
| "ce_loss_7": 2.816985684633255, |
| "epoch": 0.622, |
| "grad_norm": 14.505478323074753, |
| "kl_loss_13": 2218.6, |
| "kl_loss_26": 1080.9, |
| "kl_loss_39": 689.55, |
| "kl_loss_7": 2918.0, |
| "learning_rate": 0.00031859614732467957, |
| "loss": 3488.95, |
| "step": 6220 |
| }, |
| { |
| "ce_loss_13": 2.5316161155700683, |
| "ce_loss_26": 2.0058958530426025, |
| "ce_loss_39": 1.8137048929929733, |
| "ce_loss_52": 1.461652959883213, |
| "ce_loss_7": 2.8647646605968475, |
| "epoch": 0.623, |
| "grad_norm": 13.917606472624449, |
| "kl_loss_13": 2204.8, |
| "kl_loss_26": 1085.2, |
| "kl_loss_39": 685.1, |
| "kl_loss_7": 2902.4, |
| "learning_rate": 0.00031711850903806275, |
| "loss": 3465.2, |
| "step": 6230 |
| }, |
| { |
| "ce_loss_13": 2.500165891647339, |
| "ce_loss_26": 1.9614384204149247, |
| "ce_loss_39": 1.771432462334633, |
| "ce_loss_52": 1.4076189696788788, |
| "ce_loss_7": 2.832933169603348, |
| "epoch": 0.624, |
| "grad_norm": 14.223738912104434, |
| "kl_loss_13": 2258.6, |
| "kl_loss_26": 1125.7, |
| "kl_loss_39": 725.1, |
| "kl_loss_7": 2956.8, |
| "learning_rate": 0.0003156427123660297, |
| "loss": 3486.3, |
| "step": 6240 |
| }, |
| { |
| "ce_loss_13": 2.5448601841926575, |
| "ce_loss_26": 2.000015211105347, |
| "ce_loss_39": 1.8015096932649612, |
| "ce_loss_52": 1.440242400765419, |
| "ce_loss_7": 2.878038114309311, |
| "epoch": 0.625, |
| "grad_norm": 14.471917182701668, |
| "kl_loss_13": 2262.4, |
| "kl_loss_26": 1124.4, |
| "kl_loss_39": 714.0, |
| "kl_loss_7": 2962.4, |
| "learning_rate": 0.0003141687721698363, |
| "loss": 3490.15, |
| "step": 6250 |
| }, |
| { |
| "ce_loss_13": 2.5199947118759156, |
| "ce_loss_26": 1.986537829041481, |
| "ce_loss_39": 1.796593463420868, |
| "ce_loss_52": 1.4425756543874741, |
| "ce_loss_7": 2.854223221540451, |
| "epoch": 0.626, |
| "grad_norm": 14.607284979944513, |
| "kl_loss_13": 2212.8, |
| "kl_loss_26": 1085.5, |
| "kl_loss_39": 686.9, |
| "kl_loss_7": 2911.6, |
| "learning_rate": 0.00031269670329204396, |
| "loss": 3493.3, |
| "step": 6260 |
| }, |
| { |
| "ce_loss_13": 2.5326361417770387, |
| "ce_loss_26": 2.002932494878769, |
| "ce_loss_39": 1.8195136040449142, |
| "ce_loss_52": 1.4736543655395509, |
| "ce_loss_7": 2.8653872847557067, |
| "epoch": 0.627, |
| "grad_norm": 13.814637143502924, |
| "kl_loss_13": 2172.2, |
| "kl_loss_26": 1054.9, |
| "kl_loss_39": 669.6, |
| "kl_loss_7": 2869.6, |
| "learning_rate": 0.00031122652055637015, |
| "loss": 3492.5, |
| "step": 6270 |
| }, |
| { |
| "ce_loss_13": 2.482167053222656, |
| "ce_loss_26": 1.9512667179107666, |
| "ce_loss_39": 1.7665533930063249, |
| "ce_loss_52": 1.42098408639431, |
| "ce_loss_7": 2.8168558061122893, |
| "epoch": 0.628, |
| "grad_norm": 16.02072851141737, |
| "kl_loss_13": 2193.0, |
| "kl_loss_26": 1069.4, |
| "kl_loss_39": 682.35, |
| "kl_loss_7": 2894.0, |
| "learning_rate": 0.0003097582387675385, |
| "loss": 3459.75, |
| "step": 6280 |
| }, |
| { |
| "ce_loss_13": 2.47237606048584, |
| "ce_loss_26": 1.95036241710186, |
| "ce_loss_39": 1.764494326710701, |
| "ce_loss_52": 1.4227848395705223, |
| "ce_loss_7": 2.8021757781505583, |
| "epoch": 0.629, |
| "grad_norm": 15.255934367745192, |
| "kl_loss_13": 2167.4, |
| "kl_loss_26": 1053.3, |
| "kl_loss_39": 664.9, |
| "kl_loss_7": 2858.8, |
| "learning_rate": 0.00030829187271113034, |
| "loss": 3446.7, |
| "step": 6290 |
| }, |
| { |
| "ce_loss_13": 2.501003822684288, |
| "ce_loss_26": 1.958929392695427, |
| "ce_loss_39": 1.76513631939888, |
| "ce_loss_52": 1.4046356767416, |
| "ce_loss_7": 2.8350019991397857, |
| "epoch": 0.63, |
| "grad_norm": 14.88714427947574, |
| "kl_loss_13": 2270.2, |
| "kl_loss_26": 1123.2, |
| "kl_loss_39": 716.8, |
| "kl_loss_7": 2970.0, |
| "learning_rate": 0.00030682743715343565, |
| "loss": 3508.45, |
| "step": 6300 |
| }, |
| { |
| "ce_loss_13": 2.5800569117069245, |
| "ce_loss_26": 2.0348214149475097, |
| "ce_loss_39": 1.8403811991214751, |
| "ce_loss_52": 1.476933541893959, |
| "ce_loss_7": 2.9134635806083677, |
| "epoch": 0.631, |
| "grad_norm": 14.94722074912583, |
| "kl_loss_13": 2245.0, |
| "kl_loss_26": 1107.2, |
| "kl_loss_39": 706.25, |
| "kl_loss_7": 2952.8, |
| "learning_rate": 0.0003053649468413043, |
| "loss": 3499.45, |
| "step": 6310 |
| }, |
| { |
| "ce_loss_13": 2.5208792209625246, |
| "ce_loss_26": 1.9822687000036239, |
| "ce_loss_39": 1.7960426419973374, |
| "ce_loss_52": 1.4455793976783753, |
| "ce_loss_7": 2.8597992181777956, |
| "epoch": 0.632, |
| "grad_norm": 15.453003389066977, |
| "kl_loss_13": 2216.2, |
| "kl_loss_26": 1070.1, |
| "kl_loss_39": 678.0, |
| "kl_loss_7": 2927.6, |
| "learning_rate": 0.00030390441650199725, |
| "loss": 3483.5, |
| "step": 6320 |
| }, |
| { |
| "ce_loss_13": 2.441950124502182, |
| "ce_loss_26": 1.91085105240345, |
| "ce_loss_39": 1.7248677492141724, |
| "ce_loss_52": 1.3885301351547241, |
| "ce_loss_7": 2.777034705877304, |
| "epoch": 0.633, |
| "grad_norm": 14.901626155092913, |
| "kl_loss_13": 2155.8, |
| "kl_loss_26": 1040.1, |
| "kl_loss_39": 653.2, |
| "kl_loss_7": 2859.2, |
| "learning_rate": 0.00030244586084303903, |
| "loss": 3433.35, |
| "step": 6330 |
| }, |
| { |
| "ce_loss_13": 2.4565594136714934, |
| "ce_loss_26": 1.9154207110404968, |
| "ce_loss_39": 1.7311667442321776, |
| "ce_loss_52": 1.3880270063877105, |
| "ce_loss_7": 2.79474156498909, |
| "epoch": 0.634, |
| "grad_norm": 15.212752240316364, |
| "kl_loss_13": 2198.2, |
| "kl_loss_26": 1053.8, |
| "kl_loss_39": 669.5, |
| "kl_loss_7": 2908.0, |
| "learning_rate": 0.00030098929455206903, |
| "loss": 3450.2, |
| "step": 6340 |
| }, |
| { |
| "ce_loss_13": 2.4875996589660643, |
| "ce_loss_26": 1.9456024587154388, |
| "ce_loss_39": 1.7510357975959778, |
| "ce_loss_52": 1.4120293408632278, |
| "ce_loss_7": 2.8184066474437715, |
| "epoch": 0.635, |
| "grad_norm": 14.444701123760842, |
| "kl_loss_13": 2190.4, |
| "kl_loss_26": 1070.0, |
| "kl_loss_39": 670.4, |
| "kl_loss_7": 2884.8, |
| "learning_rate": 0.00029953473229669324, |
| "loss": 3500.6, |
| "step": 6350 |
| }, |
| { |
| "ce_loss_13": 2.505690813064575, |
| "ce_loss_26": 1.968365904688835, |
| "ce_loss_39": 1.7826423317193985, |
| "ce_loss_52": 1.4394667357206345, |
| "ce_loss_7": 2.8453324735164642, |
| "epoch": 0.636, |
| "grad_norm": 14.480279054372499, |
| "kl_loss_13": 2207.2, |
| "kl_loss_26": 1070.3, |
| "kl_loss_39": 680.95, |
| "kl_loss_7": 2914.4, |
| "learning_rate": 0.00029808218872433767, |
| "loss": 3473.05, |
| "step": 6360 |
| }, |
| { |
| "ce_loss_13": 2.462022843956947, |
| "ce_loss_26": 1.9284056276082993, |
| "ce_loss_39": 1.7475204050540925, |
| "ce_loss_52": 1.3994116008281707, |
| "ce_loss_7": 2.799399584531784, |
| "epoch": 0.637, |
| "grad_norm": 14.854420396233579, |
| "kl_loss_13": 2187.4, |
| "kl_loss_26": 1057.9, |
| "kl_loss_39": 676.9, |
| "kl_loss_7": 2908.4, |
| "learning_rate": 0.0002966316784621, |
| "loss": 3431.55, |
| "step": 6370 |
| }, |
| { |
| "ce_loss_13": 2.46474946141243, |
| "ce_loss_26": 1.9243933081626892, |
| "ce_loss_39": 1.7351078271865845, |
| "ce_loss_52": 1.3951061010360717, |
| "ce_loss_7": 2.809561550617218, |
| "epoch": 0.638, |
| "grad_norm": 14.398656186824772, |
| "kl_loss_13": 2201.4, |
| "kl_loss_26": 1064.6, |
| "kl_loss_39": 673.7, |
| "kl_loss_7": 2913.6, |
| "learning_rate": 0.0002951832161166024, |
| "loss": 3433.1, |
| "step": 6380 |
| }, |
| { |
| "ce_loss_13": 2.524071788787842, |
| "ce_loss_26": 1.9882585108280182, |
| "ce_loss_39": 1.8004582822322845, |
| "ce_loss_52": 1.457192499935627, |
| "ce_loss_7": 2.859678488969803, |
| "epoch": 0.639, |
| "grad_norm": 14.823787609750735, |
| "kl_loss_13": 2198.8, |
| "kl_loss_26": 1062.9, |
| "kl_loss_39": 671.2, |
| "kl_loss_7": 2895.2, |
| "learning_rate": 0.0002937368162738445, |
| "loss": 3448.55, |
| "step": 6390 |
| }, |
| { |
| "ce_loss_13": 2.476853275299072, |
| "ce_loss_26": 1.940243661403656, |
| "ce_loss_39": 1.7542554527521133, |
| "ce_loss_52": 1.4153559118509293, |
| "ce_loss_7": 2.8181783974170687, |
| "epoch": 0.64, |
| "grad_norm": 14.674283953178037, |
| "kl_loss_13": 2177.8, |
| "kl_loss_26": 1053.1, |
| "kl_loss_39": 664.6, |
| "kl_loss_7": 2899.6, |
| "learning_rate": 0.0002922924934990568, |
| "loss": 3441.6, |
| "step": 6400 |
| }, |
| { |
| "ce_loss_13": 2.4689641296863556, |
| "ce_loss_26": 1.933935484290123, |
| "ce_loss_39": 1.7450189381837844, |
| "ce_loss_52": 1.3959352299571037, |
| "ce_loss_7": 2.8043021619319917, |
| "epoch": 0.641, |
| "grad_norm": 16.188741684715673, |
| "kl_loss_13": 2210.6, |
| "kl_loss_26": 1080.2, |
| "kl_loss_39": 681.3, |
| "kl_loss_7": 2920.8, |
| "learning_rate": 0.0002908502623365536, |
| "loss": 3439.95, |
| "step": 6410 |
| }, |
| { |
| "ce_loss_13": 2.51960112452507, |
| "ce_loss_26": 1.9860825181007384, |
| "ce_loss_39": 1.792076262831688, |
| "ce_loss_52": 1.4388326108455658, |
| "ce_loss_7": 2.8559226214885713, |
| "epoch": 0.642, |
| "grad_norm": 15.164079412983817, |
| "kl_loss_13": 2205.0, |
| "kl_loss_26": 1076.5, |
| "kl_loss_39": 686.15, |
| "kl_loss_7": 2899.6, |
| "learning_rate": 0.0002894101373095867, |
| "loss": 3403.75, |
| "step": 6420 |
| }, |
| { |
| "ce_loss_13": 2.57558217048645, |
| "ce_loss_26": 2.037161833047867, |
| "ce_loss_39": 1.8481904029846192, |
| "ce_loss_52": 1.4971486061811448, |
| "ce_loss_7": 2.9037492871284485, |
| "epoch": 0.643, |
| "grad_norm": 14.617370011749491, |
| "kl_loss_13": 2241.0, |
| "kl_loss_26": 1099.6, |
| "kl_loss_39": 706.2, |
| "kl_loss_7": 2940.0, |
| "learning_rate": 0.00028797213292019926, |
| "loss": 3465.25, |
| "step": 6430 |
| }, |
| { |
| "ce_loss_13": 2.4703142285346984, |
| "ce_loss_26": 1.9478118807077407, |
| "ce_loss_39": 1.7643914371728897, |
| "ce_loss_52": 1.4303042769432068, |
| "ce_loss_7": 2.8028221487998963, |
| "epoch": 0.644, |
| "grad_norm": 14.268057235198288, |
| "kl_loss_13": 2150.2, |
| "kl_loss_26": 1039.5, |
| "kl_loss_39": 657.25, |
| "kl_loss_7": 2845.2, |
| "learning_rate": 0.0002865362636490791, |
| "loss": 3397.05, |
| "step": 6440 |
| }, |
| { |
| "ce_loss_13": 2.5057600528001784, |
| "ce_loss_26": 1.963181382417679, |
| "ce_loss_39": 1.766686275601387, |
| "ce_loss_52": 1.4219027027487754, |
| "ce_loss_7": 2.842684781551361, |
| "epoch": 0.645, |
| "grad_norm": 15.007302910220881, |
| "kl_loss_13": 2227.2, |
| "kl_loss_26": 1085.9, |
| "kl_loss_39": 688.6, |
| "kl_loss_7": 2943.2, |
| "learning_rate": 0.0002851025439554142, |
| "loss": 3420.9, |
| "step": 6450 |
| }, |
| { |
| "ce_loss_13": 2.5524505376815796, |
| "ce_loss_26": 2.00695119202137, |
| "ce_loss_39": 1.812732595205307, |
| "ce_loss_52": 1.4619058847427369, |
| "ce_loss_7": 2.89662281870842, |
| "epoch": 0.646, |
| "grad_norm": 14.944231437877365, |
| "kl_loss_13": 2231.2, |
| "kl_loss_26": 1087.1, |
| "kl_loss_39": 688.9, |
| "kl_loss_7": 2952.8, |
| "learning_rate": 0.00028367098827674573, |
| "loss": 3473.25, |
| "step": 6460 |
| }, |
| { |
| "ce_loss_13": 2.5118141055107115, |
| "ce_loss_26": 1.9752016961574554, |
| "ce_loss_39": 1.790860089659691, |
| "ce_loss_52": 1.4514876693487166, |
| "ce_loss_7": 2.8457858681678774, |
| "epoch": 0.647, |
| "grad_norm": 14.47327693725919, |
| "kl_loss_13": 2178.4, |
| "kl_loss_26": 1057.8, |
| "kl_loss_39": 666.1, |
| "kl_loss_7": 2874.4, |
| "learning_rate": 0.00028224161102882397, |
| "loss": 3430.95, |
| "step": 6470 |
| }, |
| { |
| "ce_loss_13": 2.4975059896707537, |
| "ce_loss_26": 1.9533880710601808, |
| "ce_loss_39": 1.7617440074682236, |
| "ce_loss_52": 1.414775413274765, |
| "ce_loss_7": 2.8368520498275758, |
| "epoch": 0.648, |
| "grad_norm": 14.67215053951943, |
| "kl_loss_13": 2218.6, |
| "kl_loss_26": 1078.5, |
| "kl_loss_39": 690.45, |
| "kl_loss_7": 2930.0, |
| "learning_rate": 0.00028081442660546124, |
| "loss": 3435.85, |
| "step": 6480 |
| }, |
| { |
| "ce_loss_13": 2.4587242364883424, |
| "ce_loss_26": 1.9304294764995575, |
| "ce_loss_39": 1.7412324339151382, |
| "ce_loss_52": 1.4029302895069122, |
| "ce_loss_7": 2.7982348799705505, |
| "epoch": 0.649, |
| "grad_norm": 14.728330622094298, |
| "kl_loss_13": 2170.4, |
| "kl_loss_26": 1055.8, |
| "kl_loss_39": 664.85, |
| "kl_loss_7": 2878.8, |
| "learning_rate": 0.0002793894493783892, |
| "loss": 3431.05, |
| "step": 6490 |
| }, |
| { |
| "ce_loss_13": 2.5393730461597444, |
| "ce_loss_26": 1.9960223108530044, |
| "ce_loss_39": 1.808261874318123, |
| "ce_loss_52": 1.45538187623024, |
| "ce_loss_7": 2.8760022819042206, |
| "epoch": 0.65, |
| "grad_norm": 15.357970116880674, |
| "kl_loss_13": 2229.4, |
| "kl_loss_26": 1089.7, |
| "kl_loss_39": 695.95, |
| "kl_loss_7": 2930.8, |
| "learning_rate": 0.0002779666936971129, |
| "loss": 3429.5, |
| "step": 6500 |
| }, |
| { |
| "ce_loss_13": 2.496321311593056, |
| "ce_loss_26": 1.9730540215969086, |
| "ce_loss_39": 1.7846842855215073, |
| "ce_loss_52": 1.444144432246685, |
| "ce_loss_7": 2.8269869565963743, |
| "epoch": 0.651, |
| "grad_norm": 14.142211217794582, |
| "kl_loss_13": 2159.4, |
| "kl_loss_26": 1043.4, |
| "kl_loss_39": 660.95, |
| "kl_loss_7": 2856.0, |
| "learning_rate": 0.00027654617388876614, |
| "loss": 3409.65, |
| "step": 6510 |
| }, |
| { |
| "ce_loss_13": 2.4942274272441862, |
| "ce_loss_26": 1.9708451181650162, |
| "ce_loss_39": 1.781627294421196, |
| "ce_loss_52": 1.436317929625511, |
| "ce_loss_7": 2.824290210008621, |
| "epoch": 0.652, |
| "grad_norm": 14.257728738894219, |
| "kl_loss_13": 2158.0, |
| "kl_loss_26": 1061.3, |
| "kl_loss_39": 676.9, |
| "kl_loss_7": 2841.6, |
| "learning_rate": 0.0002751279042579672, |
| "loss": 3420.3, |
| "step": 6520 |
| }, |
| { |
| "ce_loss_13": 2.474119412899017, |
| "ce_loss_26": 1.9305396527051926, |
| "ce_loss_39": 1.745158138871193, |
| "ce_loss_52": 1.4086509764194488, |
| "ce_loss_7": 2.8088379979133604, |
| "epoch": 0.653, |
| "grad_norm": 14.091894330635501, |
| "kl_loss_13": 2189.8, |
| "kl_loss_26": 1060.2, |
| "kl_loss_39": 671.45, |
| "kl_loss_7": 2896.4, |
| "learning_rate": 0.00027371189908667604, |
| "loss": 3430.2, |
| "step": 6530 |
| }, |
| { |
| "ce_loss_13": 2.5130710184574125, |
| "ce_loss_26": 1.9766233384609222, |
| "ce_loss_39": 1.7931175470352172, |
| "ce_loss_52": 1.4404253482818603, |
| "ce_loss_7": 2.8409298956394196, |
| "epoch": 0.654, |
| "grad_norm": 14.81675411336707, |
| "kl_loss_13": 2196.2, |
| "kl_loss_26": 1079.4, |
| "kl_loss_39": 687.65, |
| "kl_loss_7": 2891.2, |
| "learning_rate": 0.00027229817263404863, |
| "loss": 3395.3, |
| "step": 6540 |
| }, |
| { |
| "ce_loss_13": 2.489423853158951, |
| "ce_loss_26": 1.9515893071889878, |
| "ce_loss_39": 1.7590265810489654, |
| "ce_loss_52": 1.4197510361671448, |
| "ce_loss_7": 2.832774597406387, |
| "epoch": 0.655, |
| "grad_norm": 14.759093026127282, |
| "kl_loss_13": 2178.4, |
| "kl_loss_26": 1050.2, |
| "kl_loss_39": 655.0, |
| "kl_loss_7": 2893.2, |
| "learning_rate": 0.0002708867391362948, |
| "loss": 3416.7, |
| "step": 6550 |
| }, |
| { |
| "ce_loss_13": 2.5174727141857147, |
| "ce_loss_26": 1.9717289686203003, |
| "ce_loss_39": 1.7858693569898605, |
| "ce_loss_52": 1.4473539382219314, |
| "ce_loss_7": 2.8465367794036864, |
| "epoch": 0.656, |
| "grad_norm": 14.064047100581472, |
| "kl_loss_13": 2182.8, |
| "kl_loss_26": 1041.7, |
| "kl_loss_39": 656.75, |
| "kl_loss_7": 2877.6, |
| "learning_rate": 0.0002694776128065345, |
| "loss": 3397.05, |
| "step": 6560 |
| }, |
| { |
| "ce_loss_13": 2.528963714838028, |
| "ce_loss_26": 1.981925156712532, |
| "ce_loss_39": 1.7907918602228166, |
| "ce_loss_52": 1.4499590158462525, |
| "ce_loss_7": 2.871799385547638, |
| "epoch": 0.657, |
| "grad_norm": 14.569880458498675, |
| "kl_loss_13": 2200.6, |
| "kl_loss_26": 1059.1, |
| "kl_loss_39": 660.35, |
| "kl_loss_7": 2915.2, |
| "learning_rate": 0.00026807080783465374, |
| "loss": 3393.8, |
| "step": 6570 |
| }, |
| { |
| "ce_loss_13": 2.5217245757579803, |
| "ce_loss_26": 1.9828781098127366, |
| "ce_loss_39": 1.7871235221624375, |
| "ce_loss_52": 1.4383462622761727, |
| "ce_loss_7": 2.864704269170761, |
| "epoch": 0.658, |
| "grad_norm": 14.405026774063144, |
| "kl_loss_13": 2246.6, |
| "kl_loss_26": 1094.4, |
| "kl_loss_39": 694.55, |
| "kl_loss_7": 2961.2, |
| "learning_rate": 0.00026666633838716316, |
| "loss": 3410.55, |
| "step": 6580 |
| }, |
| { |
| "ce_loss_13": 2.525897091627121, |
| "ce_loss_26": 1.9987964391708375, |
| "ce_loss_39": 1.815827977657318, |
| "ce_loss_52": 1.4701957792043685, |
| "ce_loss_7": 2.8572112381458283, |
| "epoch": 0.659, |
| "grad_norm": 14.76642356275535, |
| "kl_loss_13": 2192.0, |
| "kl_loss_26": 1080.0, |
| "kl_loss_39": 689.6, |
| "kl_loss_7": 2880.4, |
| "learning_rate": 0.00026526421860705474, |
| "loss": 3403.15, |
| "step": 6590 |
| }, |
| { |
| "ce_loss_13": 2.516835355758667, |
| "ce_loss_26": 1.9933927595615386, |
| "ce_loss_39": 1.8040386736392975, |
| "ce_loss_52": 1.4699463561177253, |
| "ce_loss_7": 2.839303117990494, |
| "epoch": 0.66, |
| "grad_norm": 15.026727729458214, |
| "kl_loss_13": 2153.2, |
| "kl_loss_26": 1047.1, |
| "kl_loss_39": 659.7, |
| "kl_loss_7": 2838.8, |
| "learning_rate": 0.0002638644626136587, |
| "loss": 3420.9, |
| "step": 6600 |
| }, |
| { |
| "ce_loss_13": 2.5133367598056795, |
| "ce_loss_26": 1.9757141143083572, |
| "ce_loss_39": 1.7787566870450973, |
| "ce_loss_52": 1.4421678900718689, |
| "ce_loss_7": 2.84059277176857, |
| "epoch": 0.661, |
| "grad_norm": 14.33502800394329, |
| "kl_loss_13": 2167.2, |
| "kl_loss_26": 1050.2, |
| "kl_loss_39": 655.95, |
| "kl_loss_7": 2853.6, |
| "learning_rate": 0.00026246708450250255, |
| "loss": 3363.15, |
| "step": 6610 |
| }, |
| { |
| "ce_loss_13": 2.530620867013931, |
| "ce_loss_26": 2.015302965044975, |
| "ce_loss_39": 1.8304377377033234, |
| "ce_loss_52": 1.4897648423910141, |
| "ce_loss_7": 2.84687961935997, |
| "epoch": 0.662, |
| "grad_norm": 14.322777446508148, |
| "kl_loss_13": 2141.4, |
| "kl_loss_26": 1047.1, |
| "kl_loss_39": 661.0, |
| "kl_loss_7": 2818.4, |
| "learning_rate": 0.00026107209834516854, |
| "loss": 3368.65, |
| "step": 6620 |
| }, |
| { |
| "ce_loss_13": 2.5134909957647324, |
| "ce_loss_26": 1.964612963795662, |
| "ce_loss_39": 1.7641142904758453, |
| "ce_loss_52": 1.4144739270210267, |
| "ce_loss_7": 2.851569724082947, |
| "epoch": 0.663, |
| "grad_norm": 14.388583697005986, |
| "kl_loss_13": 2236.4, |
| "kl_loss_26": 1088.4, |
| "kl_loss_39": 685.8, |
| "kl_loss_7": 2945.2, |
| "learning_rate": 0.0002596795181891514, |
| "loss": 3390.15, |
| "step": 6630 |
| }, |
| { |
| "ce_loss_13": 2.4643958449363708, |
| "ce_loss_26": 1.9360562086105346, |
| "ce_loss_39": 1.7430761098861693, |
| "ce_loss_52": 1.4107858330011367, |
| "ce_loss_7": 2.7886133015155794, |
| "epoch": 0.664, |
| "grad_norm": 14.667067788196036, |
| "kl_loss_13": 2160.8, |
| "kl_loss_26": 1055.2, |
| "kl_loss_39": 663.05, |
| "kl_loss_7": 2836.8, |
| "learning_rate": 0.000258289358057718, |
| "loss": 3433.7, |
| "step": 6640 |
| }, |
| { |
| "ce_loss_13": 2.470621481537819, |
| "ce_loss_26": 1.932977157831192, |
| "ce_loss_39": 1.7458938509225845, |
| "ce_loss_52": 1.40322026014328, |
| "ce_loss_7": 2.8033271014690397, |
| "epoch": 0.665, |
| "grad_norm": 14.559695450600435, |
| "kl_loss_13": 2196.2, |
| "kl_loss_26": 1066.5, |
| "kl_loss_39": 673.3, |
| "kl_loss_7": 2893.6, |
| "learning_rate": 0.0002569016319497657, |
| "loss": 3385.35, |
| "step": 6650 |
| }, |
| { |
| "ce_loss_13": 2.523782452940941, |
| "ce_loss_26": 1.9726400285959245, |
| "ce_loss_39": 1.783732882142067, |
| "ce_loss_52": 1.441526584327221, |
| "ce_loss_7": 2.8546798706054686, |
| "epoch": 0.666, |
| "grad_norm": 14.338494844564005, |
| "kl_loss_13": 2201.0, |
| "kl_loss_26": 1066.9, |
| "kl_loss_39": 675.1, |
| "kl_loss_7": 2913.6, |
| "learning_rate": 0.00025551635383968066, |
| "loss": 3431.65, |
| "step": 6660 |
| }, |
| { |
| "ce_loss_13": 2.496166667342186, |
| "ce_loss_26": 1.9616001814603805, |
| "ce_loss_39": 1.7780775994062423, |
| "ce_loss_52": 1.4437817305326461, |
| "ce_loss_7": 2.8353283524513246, |
| "epoch": 0.667, |
| "grad_norm": 14.333332930511547, |
| "kl_loss_13": 2162.8, |
| "kl_loss_26": 1041.8, |
| "kl_loss_39": 658.3, |
| "kl_loss_7": 2867.2, |
| "learning_rate": 0.00025413353767719804, |
| "loss": 3373.9, |
| "step": 6670 |
| }, |
| { |
| "ce_loss_13": 2.4899742186069487, |
| "ce_loss_26": 1.9639368683099747, |
| "ce_loss_39": 1.7742518305778503, |
| "ce_loss_52": 1.4562912076711654, |
| "ce_loss_7": 2.813701218366623, |
| "epoch": 0.668, |
| "grad_norm": 15.020866565536496, |
| "kl_loss_13": 2118.8, |
| "kl_loss_26": 1008.9, |
| "kl_loss_39": 626.15, |
| "kl_loss_7": 2799.6, |
| "learning_rate": 0.0002527531973872617, |
| "loss": 3354.0, |
| "step": 6680 |
| }, |
| { |
| "ce_loss_13": 2.4495032489299775, |
| "ce_loss_26": 1.9232689619064331, |
| "ce_loss_39": 1.7322240889072418, |
| "ce_loss_52": 1.4083685100078582, |
| "ce_loss_7": 2.7809501469135283, |
| "epoch": 0.669, |
| "grad_norm": 15.157984637483661, |
| "kl_loss_13": 2152.4, |
| "kl_loss_26": 1034.8, |
| "kl_loss_39": 641.05, |
| "kl_loss_7": 2845.2, |
| "learning_rate": 0.0002513753468698826, |
| "loss": 3397.05, |
| "step": 6690 |
| }, |
| { |
| "ce_loss_13": 2.5416204214096068, |
| "ce_loss_26": 1.9909723430871964, |
| "ce_loss_39": 1.797818985581398, |
| "ce_loss_52": 1.4566215574741364, |
| "ce_loss_7": 2.8839422285556795, |
| "epoch": 0.67, |
| "grad_norm": 14.731917297604895, |
| "kl_loss_13": 2207.8, |
| "kl_loss_26": 1067.0, |
| "kl_loss_39": 664.6, |
| "kl_loss_7": 2918.4, |
| "learning_rate": 0.0002500000000000001, |
| "loss": 3410.05, |
| "step": 6700 |
| }, |
| { |
| "ce_loss_13": 2.46220725774765, |
| "ce_loss_26": 1.9426458358764649, |
| "ce_loss_39": 1.7631336867809295, |
| "ce_loss_52": 1.4324473321437836, |
| "ce_loss_7": 2.7896072566509247, |
| "epoch": 0.671, |
| "grad_norm": 14.394157937336324, |
| "kl_loss_13": 2145.4, |
| "kl_loss_26": 1030.2, |
| "kl_loss_39": 651.6, |
| "kl_loss_7": 2836.8, |
| "learning_rate": 0.0002486271706273421, |
| "loss": 3349.6, |
| "step": 6710 |
| }, |
| { |
| "ce_loss_13": 2.4807921826839445, |
| "ce_loss_26": 1.9570556044578553, |
| "ce_loss_39": 1.776718083024025, |
| "ce_loss_52": 1.4477659314870834, |
| "ce_loss_7": 2.8088565468788147, |
| "epoch": 0.672, |
| "grad_norm": 14.57538335602299, |
| "kl_loss_13": 2109.6, |
| "kl_loss_26": 1009.8, |
| "kl_loss_39": 634.25, |
| "kl_loss_7": 2806.0, |
| "learning_rate": 0.0002472568725762853, |
| "loss": 3376.2, |
| "step": 6720 |
| }, |
| { |
| "ce_loss_13": 2.4802849024534224, |
| "ce_loss_26": 1.9443901777267456, |
| "ce_loss_39": 1.7523796886205674, |
| "ce_loss_52": 1.4139477282762527, |
| "ce_loss_7": 2.8129481852054594, |
| "epoch": 0.673, |
| "grad_norm": 14.144296062088605, |
| "kl_loss_13": 2194.4, |
| "kl_loss_26": 1069.6, |
| "kl_loss_39": 674.95, |
| "kl_loss_7": 2888.8, |
| "learning_rate": 0.00024588911964571554, |
| "loss": 3364.55, |
| "step": 6730 |
| }, |
| { |
| "ce_loss_13": 2.5132571697235107, |
| "ce_loss_26": 1.9828839927911759, |
| "ce_loss_39": 1.7922901511192322, |
| "ce_loss_52": 1.4625934183597564, |
| "ce_loss_7": 2.841163671016693, |
| "epoch": 0.674, |
| "grad_norm": 14.199249331732203, |
| "kl_loss_13": 2159.4, |
| "kl_loss_26": 1039.5, |
| "kl_loss_39": 646.65, |
| "kl_loss_7": 2846.0, |
| "learning_rate": 0.00024452392560888974, |
| "loss": 3361.05, |
| "step": 6740 |
| }, |
| { |
| "ce_loss_13": 2.4865836411714555, |
| "ce_loss_26": 1.954461258649826, |
| "ce_loss_39": 1.7649456202983855, |
| "ce_loss_52": 1.4221005111932754, |
| "ce_loss_7": 2.821400898694992, |
| "epoch": 0.675, |
| "grad_norm": 14.682119193582665, |
| "kl_loss_13": 2206.2, |
| "kl_loss_26": 1077.5, |
| "kl_loss_39": 678.9, |
| "kl_loss_7": 2902.4, |
| "learning_rate": 0.00024316130421329695, |
| "loss": 3347.95, |
| "step": 6750 |
| }, |
| { |
| "ce_loss_13": 2.474187096953392, |
| "ce_loss_26": 1.9446223825216293, |
| "ce_loss_39": 1.7641993075609208, |
| "ce_loss_52": 1.436264917254448, |
| "ce_loss_7": 2.8015049755573274, |
| "epoch": 0.676, |
| "grad_norm": 14.72420694586694, |
| "kl_loss_13": 2155.0, |
| "kl_loss_26": 1030.2, |
| "kl_loss_39": 646.8, |
| "kl_loss_7": 2843.6, |
| "learning_rate": 0.00024180126918051909, |
| "loss": 3348.9, |
| "step": 6760 |
| }, |
| { |
| "ce_loss_13": 2.480317395925522, |
| "ce_loss_26": 1.956321433186531, |
| "ce_loss_39": 1.7687535285949707, |
| "ce_loss_52": 1.4266707986593246, |
| "ce_loss_7": 2.808481311798096, |
| "epoch": 0.677, |
| "grad_norm": 15.416504395614744, |
| "kl_loss_13": 2171.4, |
| "kl_loss_26": 1058.8, |
| "kl_loss_39": 666.6, |
| "kl_loss_7": 2869.2, |
| "learning_rate": 0.00024044383420609406, |
| "loss": 3413.1, |
| "step": 6770 |
| }, |
| { |
| "ce_loss_13": 2.5013114362955093, |
| "ce_loss_26": 1.9759581625461577, |
| "ce_loss_39": 1.7926720827817917, |
| "ce_loss_52": 1.4599666327238083, |
| "ce_loss_7": 2.8250863194465636, |
| "epoch": 0.678, |
| "grad_norm": 13.962008513939274, |
| "kl_loss_13": 2133.8, |
| "kl_loss_26": 1039.8, |
| "kl_loss_39": 651.85, |
| "kl_loss_7": 2824.8, |
| "learning_rate": 0.00023908901295937712, |
| "loss": 3372.05, |
| "step": 6780 |
| }, |
| { |
| "ce_loss_13": 2.489407476782799, |
| "ce_loss_26": 1.9609563022851944, |
| "ce_loss_39": 1.778808832168579, |
| "ce_loss_52": 1.454407089948654, |
| "ce_loss_7": 2.8136882543563844, |
| "epoch": 0.679, |
| "grad_norm": 14.19915367627698, |
| "kl_loss_13": 2110.2, |
| "kl_loss_26": 1016.4, |
| "kl_loss_39": 637.05, |
| "kl_loss_7": 2792.0, |
| "learning_rate": 0.00023773681908340283, |
| "loss": 3384.7, |
| "step": 6790 |
| }, |
| { |
| "ce_loss_13": 2.4634098410606384, |
| "ce_loss_26": 1.9384621411561966, |
| "ce_loss_39": 1.7507896840572357, |
| "ce_loss_52": 1.4146029382944107, |
| "ce_loss_7": 2.7943048059940336, |
| "epoch": 0.68, |
| "grad_norm": 14.861512399701681, |
| "kl_loss_13": 2166.4, |
| "kl_loss_26": 1051.5, |
| "kl_loss_39": 658.6, |
| "kl_loss_7": 2860.8, |
| "learning_rate": 0.00023638726619474876, |
| "loss": 3356.85, |
| "step": 6800 |
| }, |
| { |
| "ce_loss_13": 2.5783134520053865, |
| "ce_loss_26": 2.045197767019272, |
| "ce_loss_39": 1.8584345400333404, |
| "ce_loss_52": 1.5217636466026305, |
| "ce_loss_7": 2.9070691764354706, |
| "epoch": 0.681, |
| "grad_norm": 14.552198364638281, |
| "kl_loss_13": 2176.2, |
| "kl_loss_26": 1054.5, |
| "kl_loss_39": 659.7, |
| "kl_loss_7": 2877.6, |
| "learning_rate": 0.0002350403678833976, |
| "loss": 3347.55, |
| "step": 6810 |
| }, |
| { |
| "ce_loss_13": 2.4692390322685243, |
| "ce_loss_26": 1.943667185306549, |
| "ce_loss_39": 1.7523112028837204, |
| "ce_loss_52": 1.426128900051117, |
| "ce_loss_7": 2.7978816986083985, |
| "epoch": 0.682, |
| "grad_norm": 14.998678580001265, |
| "kl_loss_13": 2167.2, |
| "kl_loss_26": 1050.9, |
| "kl_loss_39": 652.95, |
| "kl_loss_7": 2852.4, |
| "learning_rate": 0.00023369613771260007, |
| "loss": 3369.6, |
| "step": 6820 |
| }, |
| { |
| "ce_loss_13": 2.4953604638576508, |
| "ce_loss_26": 1.981321769952774, |
| "ce_loss_39": 1.7999033033847809, |
| "ce_loss_52": 1.4693025022745132, |
| "ce_loss_7": 2.826812982559204, |
| "epoch": 0.683, |
| "grad_norm": 14.167664016838927, |
| "kl_loss_13": 2131.6, |
| "kl_loss_26": 1041.6, |
| "kl_loss_39": 654.4, |
| "kl_loss_7": 2822.8, |
| "learning_rate": 0.00023235458921873925, |
| "loss": 3334.7, |
| "step": 6830 |
| }, |
| { |
| "ce_loss_13": 2.5015017211437227, |
| "ce_loss_26": 1.9650517791509627, |
| "ce_loss_39": 1.7717696577310562, |
| "ce_loss_52": 1.4357108920812607, |
| "ce_loss_7": 2.827932006120682, |
| "epoch": 0.684, |
| "grad_norm": 14.585579391353733, |
| "kl_loss_13": 2162.0, |
| "kl_loss_26": 1051.0, |
| "kl_loss_39": 656.3, |
| "kl_loss_7": 2848.0, |
| "learning_rate": 0.0002310157359111938, |
| "loss": 3348.15, |
| "step": 6840 |
| }, |
| { |
| "ce_loss_13": 2.426406466960907, |
| "ce_loss_26": 1.9015664726495742, |
| "ce_loss_39": 1.718049594759941, |
| "ce_loss_52": 1.3935224622488023, |
| "ce_loss_7": 2.756322818994522, |
| "epoch": 0.685, |
| "grad_norm": 15.055484269746147, |
| "kl_loss_13": 2134.6, |
| "kl_loss_26": 1022.1, |
| "kl_loss_39": 635.6, |
| "kl_loss_7": 2830.0, |
| "learning_rate": 0.0002296795912722014, |
| "loss": 3335.95, |
| "step": 6850 |
| }, |
| { |
| "ce_loss_13": 2.414138987660408, |
| "ce_loss_26": 1.8910569071769714, |
| "ce_loss_39": 1.708191841840744, |
| "ce_loss_52": 1.383391012251377, |
| "ce_loss_7": 2.747501391172409, |
| "epoch": 0.686, |
| "grad_norm": 14.125926922706771, |
| "kl_loss_13": 2113.8, |
| "kl_loss_26": 1014.7, |
| "kl_loss_39": 637.05, |
| "kl_loss_7": 2808.8, |
| "learning_rate": 0.0002283461687567236, |
| "loss": 3303.65, |
| "step": 6860 |
| }, |
| { |
| "ce_loss_13": 2.464204970002174, |
| "ce_loss_26": 1.9375032573938369, |
| "ce_loss_39": 1.7498446986079217, |
| "ce_loss_52": 1.4226357489824295, |
| "ce_loss_7": 2.7877202153205873, |
| "epoch": 0.687, |
| "grad_norm": 14.738238275957917, |
| "kl_loss_13": 2139.2, |
| "kl_loss_26": 1028.9, |
| "kl_loss_39": 644.15, |
| "kl_loss_7": 2826.0, |
| "learning_rate": 0.00022701548179231045, |
| "loss": 3307.2, |
| "step": 6870 |
| }, |
| { |
| "ce_loss_13": 2.494007241725922, |
| "ce_loss_26": 1.9751898407936097, |
| "ce_loss_39": 1.7829045623540878, |
| "ce_loss_52": 1.4496880739927291, |
| "ce_loss_7": 2.8286533296108245, |
| "epoch": 0.688, |
| "grad_norm": 13.852391362837148, |
| "kl_loss_13": 2141.2, |
| "kl_loss_26": 1042.5, |
| "kl_loss_39": 651.3, |
| "kl_loss_7": 2830.8, |
| "learning_rate": 0.00022568754377896516, |
| "loss": 3367.25, |
| "step": 6880 |
| }, |
| { |
| "ce_loss_13": 2.4991670876741408, |
| "ce_loss_26": 1.9593406468629837, |
| "ce_loss_39": 1.7670493572950363, |
| "ce_loss_52": 1.426390787959099, |
| "ce_loss_7": 2.8318077862262725, |
| "epoch": 0.689, |
| "grad_norm": 14.446284686187164, |
| "kl_loss_13": 2202.8, |
| "kl_loss_26": 1070.1, |
| "kl_loss_39": 671.2, |
| "kl_loss_7": 2903.6, |
| "learning_rate": 0.00022436236808900844, |
| "loss": 3351.3, |
| "step": 6890 |
| }, |
| { |
| "ce_loss_13": 2.5084181427955627, |
| "ce_loss_26": 1.9811445116996764, |
| "ce_loss_39": 1.7889297604560852, |
| "ce_loss_52": 1.4664145559072495, |
| "ce_loss_7": 2.8357039868831633, |
| "epoch": 0.69, |
| "grad_norm": 14.484209525578123, |
| "kl_loss_13": 2151.6, |
| "kl_loss_26": 1045.5, |
| "kl_loss_39": 652.25, |
| "kl_loss_7": 2853.6, |
| "learning_rate": 0.00022303996806694487, |
| "loss": 3356.65, |
| "step": 6900 |
| }, |
| { |
| "ce_loss_13": 2.514096361398697, |
| "ce_loss_26": 1.9786822557449342, |
| "ce_loss_39": 1.7935864567756652, |
| "ce_loss_52": 1.4548332244157791, |
| "ce_loss_7": 2.839564120769501, |
| "epoch": 0.691, |
| "grad_norm": 13.76338668087359, |
| "kl_loss_13": 2184.2, |
| "kl_loss_26": 1055.1, |
| "kl_loss_39": 667.4, |
| "kl_loss_7": 2879.6, |
| "learning_rate": 0.00022172035702932823, |
| "loss": 3337.1, |
| "step": 6910 |
| }, |
| { |
| "ce_loss_13": 2.4764732241630556, |
| "ce_loss_26": 1.95515196621418, |
| "ce_loss_39": 1.772997224330902, |
| "ce_loss_52": 1.4487987339496613, |
| "ce_loss_7": 2.8033831179142, |
| "epoch": 0.692, |
| "grad_norm": 14.586346705034499, |
| "kl_loss_13": 2110.2, |
| "kl_loss_26": 1014.4, |
| "kl_loss_39": 638.2, |
| "kl_loss_7": 2793.2, |
| "learning_rate": 0.00022040354826462666, |
| "loss": 3310.75, |
| "step": 6920 |
| }, |
| { |
| "ce_loss_13": 2.468746620416641, |
| "ce_loss_26": 1.9468185782432557, |
| "ce_loss_39": 1.7679857224225999, |
| "ce_loss_52": 1.4481467604637146, |
| "ce_loss_7": 2.789846181869507, |
| "epoch": 0.693, |
| "grad_norm": 15.005402062047136, |
| "kl_loss_13": 2109.6, |
| "kl_loss_26": 1011.9, |
| "kl_loss_39": 635.85, |
| "kl_loss_7": 2789.6, |
| "learning_rate": 0.0002190895550330899, |
| "loss": 3354.5, |
| "step": 6930 |
| }, |
| { |
| "ce_loss_13": 2.4501267641782762, |
| "ce_loss_26": 1.9237078607082367, |
| "ce_loss_39": 1.735903450846672, |
| "ce_loss_52": 1.413103035092354, |
| "ce_loss_7": 2.7897567749023438, |
| "epoch": 0.694, |
| "grad_norm": 14.894293084006021, |
| "kl_loss_13": 2115.4, |
| "kl_loss_26": 1009.8, |
| "kl_loss_39": 623.75, |
| "kl_loss_7": 2825.6, |
| "learning_rate": 0.00021777839056661552, |
| "loss": 3328.85, |
| "step": 6940 |
| }, |
| { |
| "ce_loss_13": 2.5025814145803453, |
| "ce_loss_26": 1.9697064816951753, |
| "ce_loss_39": 1.7858674556016922, |
| "ce_loss_52": 1.459896171092987, |
| "ce_loss_7": 2.8366158485412596, |
| "epoch": 0.695, |
| "grad_norm": 14.62978428769736, |
| "kl_loss_13": 2143.8, |
| "kl_loss_26": 1026.4, |
| "kl_loss_39": 644.0, |
| "kl_loss_7": 2842.8, |
| "learning_rate": 0.0002164700680686147, |
| "loss": 3339.6, |
| "step": 6950 |
| }, |
| { |
| "ce_loss_13": 2.480437287688255, |
| "ce_loss_26": 1.9569555580615998, |
| "ce_loss_39": 1.7707047134637832, |
| "ce_loss_52": 1.4560858264565468, |
| "ce_loss_7": 2.8063796043395994, |
| "epoch": 0.696, |
| "grad_norm": 14.305917479002375, |
| "kl_loss_13": 2104.0, |
| "kl_loss_26": 1003.9, |
| "kl_loss_39": 621.6, |
| "kl_loss_7": 2791.2, |
| "learning_rate": 0.0002151646007138806, |
| "loss": 3346.15, |
| "step": 6960 |
| }, |
| { |
| "ce_loss_13": 2.4904795557260515, |
| "ce_loss_26": 1.9593748480081559, |
| "ce_loss_39": 1.7725317537784577, |
| "ce_loss_52": 1.4421725705266, |
| "ce_loss_7": 2.82717769742012, |
| "epoch": 0.697, |
| "grad_norm": 14.55516871467818, |
| "kl_loss_13": 2171.2, |
| "kl_loss_26": 1046.7, |
| "kl_loss_39": 655.3, |
| "kl_loss_7": 2878.4, |
| "learning_rate": 0.00021386200164845526, |
| "loss": 3321.35, |
| "step": 6970 |
| }, |
| { |
| "ce_loss_13": 2.4793561339378356, |
| "ce_loss_26": 1.9439466089010238, |
| "ce_loss_39": 1.7569547444581985, |
| "ce_loss_52": 1.4289204239845277, |
| "ce_loss_7": 2.807218599319458, |
| "epoch": 0.698, |
| "grad_norm": 13.778785355416604, |
| "kl_loss_13": 2152.0, |
| "kl_loss_26": 1037.7, |
| "kl_loss_39": 642.8, |
| "kl_loss_7": 2848.0, |
| "learning_rate": 0.0002125622839894964, |
| "loss": 3315.5, |
| "step": 6980 |
| }, |
| { |
| "ce_loss_13": 2.585531139373779, |
| "ce_loss_26": 2.031143417954445, |
| "ce_loss_39": 1.8274976074695588, |
| "ce_loss_52": 1.4753503799438477, |
| "ce_loss_7": 2.921118849515915, |
| "epoch": 0.699, |
| "grad_norm": 14.664216613597723, |
| "kl_loss_13": 2260.4, |
| "kl_loss_26": 1103.8, |
| "kl_loss_39": 685.9, |
| "kl_loss_7": 2968.0, |
| "learning_rate": 0.00021126546082514663, |
| "loss": 3365.55, |
| "step": 6990 |
| }, |
| { |
| "ce_loss_13": 2.478432095050812, |
| "ce_loss_26": 1.950699546933174, |
| "ce_loss_39": 1.7660550504922867, |
| "ce_loss_52": 1.4418641477823257, |
| "ce_loss_7": 2.802116149663925, |
| "epoch": 0.7, |
| "grad_norm": 14.414270184643762, |
| "kl_loss_13": 2128.4, |
| "kl_loss_26": 1022.4, |
| "kl_loss_39": 639.9, |
| "kl_loss_7": 2808.8, |
| "learning_rate": 0.00020997154521440098, |
| "loss": 3312.0, |
| "step": 7000 |
| }, |
| { |
| "ce_loss_13": 2.4484674006700518, |
| "ce_loss_26": 1.9334596753120423, |
| "ce_loss_39": 1.754175427556038, |
| "ce_loss_52": 1.4356836065649987, |
| "ce_loss_7": 2.7715867519378663, |
| "epoch": 0.701, |
| "grad_norm": 15.009606673101777, |
| "kl_loss_13": 2096.6, |
| "kl_loss_26": 1005.6, |
| "kl_loss_39": 630.9, |
| "kl_loss_7": 2776.0, |
| "learning_rate": 0.0002086805501869749, |
| "loss": 3296.9, |
| "step": 7010 |
| }, |
| { |
| "ce_loss_13": 2.500411355495453, |
| "ce_loss_26": 1.9781237423419953, |
| "ce_loss_39": 1.7995324105024337, |
| "ce_loss_52": 1.473289003968239, |
| "ce_loss_7": 2.8238317251205443, |
| "epoch": 0.702, |
| "grad_norm": 14.811490845229788, |
| "kl_loss_13": 2108.4, |
| "kl_loss_26": 1018.1, |
| "kl_loss_39": 642.65, |
| "kl_loss_7": 2786.4, |
| "learning_rate": 0.0002073924887431744, |
| "loss": 3301.85, |
| "step": 7020 |
| }, |
| { |
| "ce_loss_13": 2.4331007301807404, |
| "ce_loss_26": 1.925421604514122, |
| "ce_loss_39": 1.740053552389145, |
| "ce_loss_52": 1.4260162442922593, |
| "ce_loss_7": 2.750682008266449, |
| "epoch": 0.703, |
| "grad_norm": 14.093530596736626, |
| "kl_loss_13": 2071.0, |
| "kl_loss_26": 996.4, |
| "kl_loss_39": 613.7, |
| "kl_loss_7": 2741.2, |
| "learning_rate": 0.00020610737385376348, |
| "loss": 3303.45, |
| "step": 7030 |
| }, |
| { |
| "ce_loss_13": 2.438492274284363, |
| "ce_loss_26": 1.917963182926178, |
| "ce_loss_39": 1.731092056632042, |
| "ce_loss_52": 1.4118753850460053, |
| "ce_loss_7": 2.7640158772468566, |
| "epoch": 0.704, |
| "grad_norm": 14.575552477174394, |
| "kl_loss_13": 2113.4, |
| "kl_loss_26": 1019.3, |
| "kl_loss_39": 632.85, |
| "kl_loss_7": 2804.0, |
| "learning_rate": 0.00020482521845983521, |
| "loss": 3301.55, |
| "step": 7040 |
| }, |
| { |
| "ce_loss_13": 2.4910335719585417, |
| "ce_loss_26": 1.9715039610862732, |
| "ce_loss_39": 1.7800425946712495, |
| "ce_loss_52": 1.4519392430782319, |
| "ce_loss_7": 2.819239354133606, |
| "epoch": 0.705, |
| "grad_norm": 14.820051388586238, |
| "kl_loss_13": 2136.2, |
| "kl_loss_26": 1040.9, |
| "kl_loss_39": 651.75, |
| "kl_loss_7": 2824.0, |
| "learning_rate": 0.00020354603547267987, |
| "loss": 3316.6, |
| "step": 7050 |
| }, |
| { |
| "ce_loss_13": 2.4318030804395674, |
| "ce_loss_26": 1.910761234164238, |
| "ce_loss_39": 1.7284663796424866, |
| "ce_loss_52": 1.4129984229803085, |
| "ce_loss_7": 2.7625936210155486, |
| "epoch": 0.706, |
| "grad_norm": 14.59547057670379, |
| "kl_loss_13": 2113.8, |
| "kl_loss_26": 1004.1, |
| "kl_loss_39": 622.2, |
| "kl_loss_7": 2816.4, |
| "learning_rate": 0.00020226983777365604, |
| "loss": 3284.95, |
| "step": 7060 |
| }, |
| { |
| "ce_loss_13": 2.4749036192893983, |
| "ce_loss_26": 1.9450800210237502, |
| "ce_loss_39": 1.761537629365921, |
| "ce_loss_52": 1.4373356252908707, |
| "ce_loss_7": 2.809742730855942, |
| "epoch": 0.707, |
| "grad_norm": 14.651682120589488, |
| "kl_loss_13": 2148.8, |
| "kl_loss_26": 1038.1, |
| "kl_loss_39": 647.05, |
| "kl_loss_7": 2860.4, |
| "learning_rate": 0.00020099663821406056, |
| "loss": 3330.65, |
| "step": 7070 |
| }, |
| { |
| "ce_loss_13": 2.500520494580269, |
| "ce_loss_26": 1.9711394160985947, |
| "ce_loss_39": 1.7850598603487016, |
| "ce_loss_52": 1.4572303384542464, |
| "ce_loss_7": 2.822962099313736, |
| "epoch": 0.708, |
| "grad_norm": 14.695952402949361, |
| "kl_loss_13": 2140.6, |
| "kl_loss_26": 1032.0, |
| "kl_loss_39": 642.6, |
| "kl_loss_7": 2828.8, |
| "learning_rate": 0.00019972644961499853, |
| "loss": 3310.1, |
| "step": 7080 |
| }, |
| { |
| "ce_loss_13": 2.4471381455659866, |
| "ce_loss_26": 1.9142519533634186, |
| "ce_loss_39": 1.7273518294095993, |
| "ce_loss_52": 1.4090154066681861, |
| "ce_loss_7": 2.775345432758331, |
| "epoch": 0.709, |
| "grad_norm": 14.907893630817398, |
| "kl_loss_13": 2145.0, |
| "kl_loss_26": 1028.7, |
| "kl_loss_39": 636.3, |
| "kl_loss_7": 2839.2, |
| "learning_rate": 0.00019845928476725522, |
| "loss": 3284.4, |
| "step": 7090 |
| }, |
| { |
| "ce_loss_13": 2.484838107228279, |
| "ce_loss_26": 1.9752304345369338, |
| "ce_loss_39": 1.7935984045267106, |
| "ce_loss_52": 1.4694376409053802, |
| "ce_loss_7": 2.8051605463027953, |
| "epoch": 0.71, |
| "grad_norm": 14.813324344167642, |
| "kl_loss_13": 2100.4, |
| "kl_loss_26": 1009.5, |
| "kl_loss_39": 629.85, |
| "kl_loss_7": 2784.0, |
| "learning_rate": 0.00019719515643116677, |
| "loss": 3271.1, |
| "step": 7100 |
| }, |
| { |
| "ce_loss_13": 2.449986720085144, |
| "ce_loss_26": 1.915557289123535, |
| "ce_loss_39": 1.7244810461997986, |
| "ce_loss_52": 1.4009377419948579, |
| "ce_loss_7": 2.7815246999263765, |
| "epoch": 0.711, |
| "grad_norm": 14.72405028446814, |
| "kl_loss_13": 2128.8, |
| "kl_loss_26": 1018.3, |
| "kl_loss_39": 632.1, |
| "kl_loss_7": 2824.4, |
| "learning_rate": 0.0001959340773364911, |
| "loss": 3301.5, |
| "step": 7110 |
| }, |
| { |
| "ce_loss_13": 2.4507795870304108, |
| "ce_loss_26": 1.9264422208070755, |
| "ce_loss_39": 1.7369968056678773, |
| "ce_loss_52": 1.4171953916549682, |
| "ce_loss_7": 2.778002160787582, |
| "epoch": 0.712, |
| "grad_norm": 15.123641060610607, |
| "kl_loss_13": 2145.2, |
| "kl_loss_26": 1034.6, |
| "kl_loss_39": 640.6, |
| "kl_loss_7": 2832.0, |
| "learning_rate": 0.0001946760601822809, |
| "loss": 3307.65, |
| "step": 7120 |
| }, |
| { |
| "ce_loss_13": 2.4649185329675674, |
| "ce_loss_26": 1.9448591649532319, |
| "ce_loss_39": 1.7617656499147416, |
| "ce_loss_52": 1.4421708196401597, |
| "ce_loss_7": 2.7942136943340303, |
| "epoch": 0.713, |
| "grad_norm": 13.86141587264665, |
| "kl_loss_13": 2099.6, |
| "kl_loss_26": 996.3, |
| "kl_loss_39": 613.9, |
| "kl_loss_7": 2784.0, |
| "learning_rate": 0.00019342111763675512, |
| "loss": 3264.15, |
| "step": 7130 |
| }, |
| { |
| "ce_loss_13": 2.431650939583778, |
| "ce_loss_26": 1.8971330910921096, |
| "ce_loss_39": 1.7134798288345336, |
| "ce_loss_52": 1.3959884241223335, |
| "ce_loss_7": 2.7688992261886596, |
| "epoch": 0.714, |
| "grad_norm": 14.868179084191116, |
| "kl_loss_13": 2103.6, |
| "kl_loss_26": 997.3, |
| "kl_loss_39": 614.55, |
| "kl_loss_7": 2798.8, |
| "learning_rate": 0.00019216926233717085, |
| "loss": 3302.05, |
| "step": 7140 |
| }, |
| { |
| "ce_loss_13": 2.4574439406394957, |
| "ce_loss_26": 1.9289735972881317, |
| "ce_loss_39": 1.738691231608391, |
| "ce_loss_52": 1.4203062415122987, |
| "ce_loss_7": 2.7880250751972198, |
| "epoch": 0.715, |
| "grad_norm": 14.757879306344181, |
| "kl_loss_13": 2133.4, |
| "kl_loss_26": 1021.4, |
| "kl_loss_39": 631.75, |
| "kl_loss_7": 2823.2, |
| "learning_rate": 0.00019092050688969737, |
| "loss": 3296.5, |
| "step": 7150 |
| }, |
| { |
| "ce_loss_13": 2.4601316511631013, |
| "ce_loss_26": 1.9434845715761184, |
| "ce_loss_39": 1.7585901826620103, |
| "ce_loss_52": 1.4390017569065094, |
| "ce_loss_7": 2.7778116285800936, |
| "epoch": 0.716, |
| "grad_norm": 13.991843131427743, |
| "kl_loss_13": 2085.4, |
| "kl_loss_26": 1007.0, |
| "kl_loss_39": 627.45, |
| "kl_loss_7": 2755.6, |
| "learning_rate": 0.00018967486386928817, |
| "loss": 3286.15, |
| "step": 7160 |
| }, |
| { |
| "ce_loss_13": 2.451919847726822, |
| "ce_loss_26": 1.9279222816228867, |
| "ce_loss_39": 1.7440420866012574, |
| "ce_loss_52": 1.4374898225069046, |
| "ce_loss_7": 2.784500467777252, |
| "epoch": 0.717, |
| "grad_norm": 14.5708304909804, |
| "kl_loss_13": 2095.4, |
| "kl_loss_26": 992.5, |
| "kl_loss_39": 611.75, |
| "kl_loss_7": 2794.0, |
| "learning_rate": 0.00018843234581955443, |
| "loss": 3292.25, |
| "step": 7170 |
| }, |
| { |
| "ce_loss_13": 2.4709593683481215, |
| "ce_loss_26": 1.9460813373327255, |
| "ce_loss_39": 1.7575767368078232, |
| "ce_loss_52": 1.4326383203268052, |
| "ce_loss_7": 2.7951516568660737, |
| "epoch": 0.718, |
| "grad_norm": 14.981137787748375, |
| "kl_loss_13": 2117.8, |
| "kl_loss_26": 1019.7, |
| "kl_loss_39": 633.35, |
| "kl_loss_7": 2803.2, |
| "learning_rate": 0.00018719296525263924, |
| "loss": 3299.6, |
| "step": 7180 |
| }, |
| { |
| "ce_loss_13": 2.4041130542755127, |
| "ce_loss_26": 1.8861528187990189, |
| "ce_loss_39": 1.7035977393388748, |
| "ce_loss_52": 1.4066498517990111, |
| "ce_loss_7": 2.735060691833496, |
| "epoch": 0.719, |
| "grad_norm": 14.986994612654895, |
| "kl_loss_13": 2054.2, |
| "kl_loss_26": 961.6, |
| "kl_loss_39": 585.6, |
| "kl_loss_7": 2750.4, |
| "learning_rate": 0.0001859567346490913, |
| "loss": 3264.25, |
| "step": 7190 |
| }, |
| { |
| "ce_loss_13": 2.521838116645813, |
| "ce_loss_26": 2.004297485947609, |
| "ce_loss_39": 1.810739102959633, |
| "ce_loss_52": 1.4783420652151107, |
| "ce_loss_7": 2.849927377700806, |
| "epoch": 0.72, |
| "grad_norm": 14.181310648182276, |
| "kl_loss_13": 2154.6, |
| "kl_loss_26": 1052.9, |
| "kl_loss_39": 657.35, |
| "kl_loss_7": 2848.4, |
| "learning_rate": 0.0001847236664577389, |
| "loss": 3278.0, |
| "step": 7200 |
| }, |
| { |
| "ce_loss_13": 2.40320103764534, |
| "ce_loss_26": 1.8900187402963637, |
| "ce_loss_39": 1.7100308045744896, |
| "ce_loss_52": 1.3999869018793105, |
| "ce_loss_7": 2.736201885342598, |
| "epoch": 0.721, |
| "grad_norm": 14.793709205482683, |
| "kl_loss_13": 2080.8, |
| "kl_loss_26": 990.7, |
| "kl_loss_39": 610.95, |
| "kl_loss_7": 2780.0, |
| "learning_rate": 0.00018349377309556487, |
| "loss": 3283.25, |
| "step": 7210 |
| }, |
| { |
| "ce_loss_13": 2.441250967979431, |
| "ce_loss_26": 1.9231963992118835, |
| "ce_loss_39": 1.7388009175658226, |
| "ce_loss_52": 1.4282974660396577, |
| "ce_loss_7": 2.7717535465955736, |
| "epoch": 0.722, |
| "grad_norm": 15.59238941344996, |
| "kl_loss_13": 2104.0, |
| "kl_loss_26": 997.6, |
| "kl_loss_39": 616.85, |
| "kl_loss_7": 2801.2, |
| "learning_rate": 0.00018226706694758193, |
| "loss": 3263.75, |
| "step": 7220 |
| }, |
| { |
| "ce_loss_13": 2.495049071311951, |
| "ce_loss_26": 1.973162716627121, |
| "ce_loss_39": 1.7896722644567489, |
| "ce_loss_52": 1.4697123229503632, |
| "ce_loss_7": 2.8224462032318116, |
| "epoch": 0.723, |
| "grad_norm": 13.997878236797012, |
| "kl_loss_13": 2123.2, |
| "kl_loss_26": 1009.4, |
| "kl_loss_39": 628.15, |
| "kl_loss_7": 2823.2, |
| "learning_rate": 0.0001810435603667075, |
| "loss": 3267.75, |
| "step": 7230 |
| }, |
| { |
| "ce_loss_13": 2.4492597192525865, |
| "ce_loss_26": 1.9282636791467667, |
| "ce_loss_39": 1.7473824605345727, |
| "ce_loss_52": 1.4372958570718766, |
| "ce_loss_7": 2.782766741514206, |
| "epoch": 0.724, |
| "grad_norm": 14.73683414882718, |
| "kl_loss_13": 2088.6, |
| "kl_loss_26": 990.1, |
| "kl_loss_39": 613.75, |
| "kl_loss_7": 2799.6, |
| "learning_rate": 0.0001798232656736389, |
| "loss": 3246.35, |
| "step": 7240 |
| }, |
| { |
| "ce_loss_13": 2.514770272374153, |
| "ce_loss_26": 1.9697564780712127, |
| "ce_loss_39": 1.7806446701288223, |
| "ce_loss_52": 1.456220605969429, |
| "ce_loss_7": 2.8567338407039644, |
| "epoch": 0.725, |
| "grad_norm": 14.87142240672514, |
| "kl_loss_13": 2179.8, |
| "kl_loss_26": 1037.2, |
| "kl_loss_39": 644.4, |
| "kl_loss_7": 2889.2, |
| "learning_rate": 0.0001786061951567303, |
| "loss": 3273.6, |
| "step": 7250 |
| }, |
| { |
| "ce_loss_13": 2.4067626029253004, |
| "ce_loss_26": 1.8963259696960448, |
| "ce_loss_39": 1.7143886119127274, |
| "ce_loss_52": 1.4022117048501967, |
| "ce_loss_7": 2.7348236978054046, |
| "epoch": 0.726, |
| "grad_norm": 14.076850795507209, |
| "kl_loss_13": 2076.6, |
| "kl_loss_26": 994.5, |
| "kl_loss_39": 623.5, |
| "kl_loss_7": 2766.8, |
| "learning_rate": 0.00017739236107186857, |
| "loss": 3281.2, |
| "step": 7260 |
| }, |
| { |
| "ce_loss_13": 2.4501163721084596, |
| "ce_loss_26": 1.926158633828163, |
| "ce_loss_39": 1.7434884279966354, |
| "ce_loss_52": 1.4286866545677186, |
| "ce_loss_7": 2.777343970537186, |
| "epoch": 0.727, |
| "grad_norm": 13.813498461115062, |
| "kl_loss_13": 2114.6, |
| "kl_loss_26": 1012.1, |
| "kl_loss_39": 626.6, |
| "kl_loss_7": 2806.0, |
| "learning_rate": 0.00017618177564234904, |
| "loss": 3264.1, |
| "step": 7270 |
| }, |
| { |
| "ce_loss_13": 2.4412575274705888, |
| "ce_loss_26": 1.9107532769441604, |
| "ce_loss_39": 1.7259235098958015, |
| "ce_loss_52": 1.4076966106891633, |
| "ce_loss_7": 2.774235662817955, |
| "epoch": 0.728, |
| "grad_norm": 14.801740311988109, |
| "kl_loss_13": 2113.4, |
| "kl_loss_26": 1003.6, |
| "kl_loss_39": 619.55, |
| "kl_loss_7": 2807.4, |
| "learning_rate": 0.00017497445105875377, |
| "loss": 3298.7, |
| "step": 7280 |
| }, |
| { |
| "ce_loss_13": 2.445681685209274, |
| "ce_loss_26": 1.9345449537038804, |
| "ce_loss_39": 1.7532619833946228, |
| "ce_loss_52": 1.442472691833973, |
| "ce_loss_7": 2.764800661802292, |
| "epoch": 0.729, |
| "grad_norm": 14.618830047757731, |
| "kl_loss_13": 2063.8, |
| "kl_loss_26": 980.7, |
| "kl_loss_39": 603.55, |
| "kl_loss_7": 2742.0, |
| "learning_rate": 0.000173770399478828, |
| "loss": 3226.7, |
| "step": 7290 |
| }, |
| { |
| "ce_loss_13": 2.4301975846290587, |
| "ce_loss_26": 1.9138565450906753, |
| "ce_loss_39": 1.736141037940979, |
| "ce_loss_52": 1.427780945599079, |
| "ce_loss_7": 2.760072636604309, |
| "epoch": 0.73, |
| "grad_norm": 14.242974774472335, |
| "kl_loss_13": 2095.4, |
| "kl_loss_26": 990.5, |
| "kl_loss_39": 610.6, |
| "kl_loss_7": 2791.2, |
| "learning_rate": 0.0001725696330273575, |
| "loss": 3260.65, |
| "step": 7300 |
| }, |
| { |
| "ce_loss_13": 2.4727762907743456, |
| "ce_loss_26": 1.9557204306125642, |
| "ce_loss_39": 1.766261911392212, |
| "ce_loss_52": 1.4398792043328286, |
| "ce_loss_7": 2.79817710518837, |
| "epoch": 0.731, |
| "grad_norm": 14.566153451338561, |
| "kl_loss_13": 2113.6, |
| "kl_loss_26": 1018.5, |
| "kl_loss_39": 628.8, |
| "kl_loss_7": 2796.8, |
| "learning_rate": 0.00017137216379604724, |
| "loss": 3240.75, |
| "step": 7310 |
| }, |
| { |
| "ce_loss_13": 2.490224635601044, |
| "ce_loss_26": 1.954663872718811, |
| "ce_loss_39": 1.7594738394021987, |
| "ce_loss_52": 1.4360380351543427, |
| "ce_loss_7": 2.8263917326927186, |
| "epoch": 0.732, |
| "grad_norm": 13.205540898906253, |
| "kl_loss_13": 2161.8, |
| "kl_loss_26": 1044.6, |
| "kl_loss_39": 637.45, |
| "kl_loss_7": 2862.4, |
| "learning_rate": 0.00017017800384339925, |
| "loss": 3258.4, |
| "step": 7320 |
| }, |
| { |
| "ce_loss_13": 2.4344683617353438, |
| "ce_loss_26": 1.9195960253477096, |
| "ce_loss_39": 1.7325531929731368, |
| "ce_loss_52": 1.419457183778286, |
| "ce_loss_7": 2.7598242580890657, |
| "epoch": 0.733, |
| "grad_norm": 14.107781745249417, |
| "kl_loss_13": 2087.4, |
| "kl_loss_26": 1001.6, |
| "kl_loss_39": 618.5, |
| "kl_loss_7": 2774.8, |
| "learning_rate": 0.00016898716519459073, |
| "loss": 3316.4, |
| "step": 7330 |
| }, |
| { |
| "ce_loss_13": 2.4717041492462157, |
| "ce_loss_26": 1.9320402562618255, |
| "ce_loss_39": 1.730445721745491, |
| "ce_loss_52": 1.3991459339857102, |
| "ce_loss_7": 2.811204159259796, |
| "epoch": 0.734, |
| "grad_norm": 14.159198716486541, |
| "kl_loss_13": 2200.8, |
| "kl_loss_26": 1069.2, |
| "kl_loss_39": 657.15, |
| "kl_loss_7": 2902.8, |
| "learning_rate": 0.00016779965984135375, |
| "loss": 3266.3, |
| "step": 7340 |
| }, |
| { |
| "ce_loss_13": 2.4648273169994352, |
| "ce_loss_26": 1.9446689933538437, |
| "ce_loss_39": 1.7665399879217147, |
| "ce_loss_52": 1.4552808463573457, |
| "ce_loss_7": 2.7818954586982727, |
| "epoch": 0.735, |
| "grad_norm": 13.974138676843918, |
| "kl_loss_13": 2093.0, |
| "kl_loss_26": 999.2, |
| "kl_loss_39": 622.1, |
| "kl_loss_7": 2762.4, |
| "learning_rate": 0.00016661549974185424, |
| "loss": 3232.6, |
| "step": 7350 |
| }, |
| { |
| "ce_loss_13": 2.497272843122482, |
| "ce_loss_26": 1.9733565777540207, |
| "ce_loss_39": 1.7902508676052094, |
| "ce_loss_52": 1.4602745115756988, |
| "ce_loss_7": 2.8227945923805238, |
| "epoch": 0.736, |
| "grad_norm": 15.105414614283358, |
| "kl_loss_13": 2153.6, |
| "kl_loss_26": 1043.2, |
| "kl_loss_39": 652.45, |
| "kl_loss_7": 2843.6, |
| "learning_rate": 0.00016543469682057105, |
| "loss": 3314.1, |
| "step": 7360 |
| }, |
| { |
| "ce_loss_13": 2.4817920327186584, |
| "ce_loss_26": 1.9674188673496247, |
| "ce_loss_39": 1.788041964173317, |
| "ce_loss_52": 1.4778558552265166, |
| "ce_loss_7": 2.801541256904602, |
| "epoch": 0.737, |
| "grad_norm": 14.089468466172162, |
| "kl_loss_13": 2075.4, |
| "kl_loss_26": 985.2, |
| "kl_loss_39": 606.6, |
| "kl_loss_7": 2750.8, |
| "learning_rate": 0.00016425726296817632, |
| "loss": 3279.5, |
| "step": 7370 |
| }, |
| { |
| "ce_loss_13": 2.4628233551979064, |
| "ce_loss_26": 1.944148001074791, |
| "ce_loss_39": 1.7584406644105912, |
| "ce_loss_52": 1.440820676088333, |
| "ce_loss_7": 2.7982202231884004, |
| "epoch": 0.738, |
| "grad_norm": 14.250790129395915, |
| "kl_loss_13": 2096.0, |
| "kl_loss_26": 994.4, |
| "kl_loss_39": 612.4, |
| "kl_loss_7": 2800.4, |
| "learning_rate": 0.00016308321004141607, |
| "loss": 3270.5, |
| "step": 7380 |
| }, |
| { |
| "ce_loss_13": 2.4311512380838396, |
| "ce_loss_26": 1.910204255580902, |
| "ce_loss_39": 1.7292486786842347, |
| "ce_loss_52": 1.4260056450963021, |
| "ce_loss_7": 2.7644225537776945, |
| "epoch": 0.739, |
| "grad_norm": 14.26013452282849, |
| "kl_loss_13": 2064.2, |
| "kl_loss_26": 971.8, |
| "kl_loss_39": 596.1, |
| "kl_loss_7": 2766.0, |
| "learning_rate": 0.00016191254986299043, |
| "loss": 3267.55, |
| "step": 7390 |
| }, |
| { |
| "ce_loss_13": 2.3748191058635713, |
| "ce_loss_26": 1.8720220893621444, |
| "ce_loss_39": 1.695397737622261, |
| "ce_loss_52": 1.3954048216342927, |
| "ce_loss_7": 2.6974743723869326, |
| "epoch": 0.74, |
| "grad_norm": 14.042172223471859, |
| "kl_loss_13": 2036.0, |
| "kl_loss_26": 961.8, |
| "kl_loss_39": 589.15, |
| "kl_loss_7": 2719.6, |
| "learning_rate": 0.00016074529422143398, |
| "loss": 3237.3, |
| "step": 7400 |
| }, |
| { |
| "ce_loss_13": 2.504778665304184, |
| "ce_loss_26": 1.9736295342445374, |
| "ce_loss_39": 1.778898686170578, |
| "ce_loss_52": 1.458841660618782, |
| "ce_loss_7": 2.830203241109848, |
| "epoch": 0.741, |
| "grad_norm": 14.817704298873846, |
| "kl_loss_13": 2137.6, |
| "kl_loss_26": 1026.7, |
| "kl_loss_39": 628.9, |
| "kl_loss_7": 2830.4, |
| "learning_rate": 0.0001595814548709983, |
| "loss": 3256.85, |
| "step": 7410 |
| }, |
| { |
| "ce_loss_13": 2.457485032081604, |
| "ce_loss_26": 1.955030158162117, |
| "ce_loss_39": 1.7744358479976654, |
| "ce_loss_52": 1.4638055652379989, |
| "ce_loss_7": 2.7708797633647917, |
| "epoch": 0.742, |
| "grad_norm": 13.847929994452544, |
| "kl_loss_13": 2053.2, |
| "kl_loss_26": 989.3, |
| "kl_loss_39": 610.65, |
| "kl_loss_7": 2726.4, |
| "learning_rate": 0.00015842104353153285, |
| "loss": 3240.25, |
| "step": 7420 |
| }, |
| { |
| "ce_loss_13": 2.5232761919498445, |
| "ce_loss_26": 1.9848549604415893, |
| "ce_loss_39": 1.7907847046852112, |
| "ce_loss_52": 1.473637193441391, |
| "ce_loss_7": 2.8558732092380525, |
| "epoch": 0.743, |
| "grad_norm": 14.575648272616709, |
| "kl_loss_13": 2149.6, |
| "kl_loss_26": 1018.4, |
| "kl_loss_39": 623.4, |
| "kl_loss_7": 2838.8, |
| "learning_rate": 0.0001572640718883667, |
| "loss": 3254.8, |
| "step": 7430 |
| }, |
| { |
| "ce_loss_13": 2.4647684305906297, |
| "ce_loss_26": 1.9386949807405471, |
| "ce_loss_39": 1.7500061064958572, |
| "ce_loss_52": 1.4324503019452095, |
| "ce_loss_7": 2.7884095788002012, |
| "epoch": 0.744, |
| "grad_norm": 14.394764150644365, |
| "kl_loss_13": 2108.4, |
| "kl_loss_26": 1000.0, |
| "kl_loss_39": 616.55, |
| "kl_loss_7": 2791.2, |
| "learning_rate": 0.0001561105515921915, |
| "loss": 3224.3, |
| "step": 7440 |
| }, |
| { |
| "ce_loss_13": 2.441458174586296, |
| "ce_loss_26": 1.9374703764915466, |
| "ce_loss_39": 1.7534300208091735, |
| "ce_loss_52": 1.4378804206848144, |
| "ce_loss_7": 2.7636309385299684, |
| "epoch": 0.745, |
| "grad_norm": 14.678295349282738, |
| "kl_loss_13": 2068.2, |
| "kl_loss_26": 997.1, |
| "kl_loss_39": 620.85, |
| "kl_loss_7": 2745.6, |
| "learning_rate": 0.0001549604942589441, |
| "loss": 3227.25, |
| "step": 7450 |
| }, |
| { |
| "ce_loss_13": 2.4308183819055555, |
| "ce_loss_26": 1.9100747764110566, |
| "ce_loss_39": 1.7246440201997757, |
| "ce_loss_52": 1.409556159377098, |
| "ce_loss_7": 2.7695399791002275, |
| "epoch": 0.746, |
| "grad_norm": 14.694656979242655, |
| "kl_loss_13": 2094.8, |
| "kl_loss_26": 993.0, |
| "kl_loss_39": 612.55, |
| "kl_loss_7": 2804.8, |
| "learning_rate": 0.00015381391146968864, |
| "loss": 3249.4, |
| "step": 7460 |
| }, |
| { |
| "ce_loss_13": 2.462578612565994, |
| "ce_loss_26": 1.940753996372223, |
| "ce_loss_39": 1.7554692894220352, |
| "ce_loss_52": 1.441029006242752, |
| "ce_loss_7": 2.785427051782608, |
| "epoch": 0.747, |
| "grad_norm": 14.412450315252437, |
| "kl_loss_13": 2103.6, |
| "kl_loss_26": 1003.3, |
| "kl_loss_39": 620.4, |
| "kl_loss_7": 2793.2, |
| "learning_rate": 0.00015267081477050133, |
| "loss": 3242.1, |
| "step": 7470 |
| }, |
| { |
| "ce_loss_13": 2.436378574371338, |
| "ce_loss_26": 1.9284409761428833, |
| "ce_loss_39": 1.7525635540485383, |
| "ce_loss_52": 1.443886636197567, |
| "ce_loss_7": 2.762845513224602, |
| "epoch": 0.748, |
| "grad_norm": 14.082696745240801, |
| "kl_loss_13": 2056.6, |
| "kl_loss_26": 983.1, |
| "kl_loss_39": 607.7, |
| "kl_loss_7": 2742.6, |
| "learning_rate": 0.00015153121567235335, |
| "loss": 3260.75, |
| "step": 7480 |
| }, |
| { |
| "ce_loss_13": 2.4219354510307314, |
| "ce_loss_26": 1.9070833683013917, |
| "ce_loss_39": 1.727812445163727, |
| "ce_loss_52": 1.4214952304959296, |
| "ce_loss_7": 2.751905006170273, |
| "epoch": 0.749, |
| "grad_norm": 14.604071674011259, |
| "kl_loss_13": 2073.4, |
| "kl_loss_26": 983.2, |
| "kl_loss_39": 609.55, |
| "kl_loss_7": 2766.8, |
| "learning_rate": 0.00015039512565099468, |
| "loss": 3240.15, |
| "step": 7490 |
| }, |
| { |
| "ce_loss_13": 2.4254602432250976, |
| "ce_loss_26": 1.9135964632034301, |
| "ce_loss_39": 1.7295757800340652, |
| "ce_loss_52": 1.423691214621067, |
| "ce_loss_7": 2.7512109965085982, |
| "epoch": 0.75, |
| "grad_norm": 13.87053452241645, |
| "kl_loss_13": 2059.4, |
| "kl_loss_26": 978.6, |
| "kl_loss_39": 597.45, |
| "kl_loss_7": 2750.4, |
| "learning_rate": 0.00014926255614683932, |
| "loss": 3260.75, |
| "step": 7500 |
| }, |
| { |
| "ce_loss_13": 2.44639810025692, |
| "ce_loss_26": 1.93405482172966, |
| "ce_loss_39": 1.7547091454267503, |
| "ce_loss_52": 1.4416350960731505, |
| "ce_loss_7": 2.768189311027527, |
| "epoch": 0.751, |
| "grad_norm": 14.071002297078877, |
| "kl_loss_13": 2088.8, |
| "kl_loss_26": 995.6, |
| "kl_loss_39": 616.7, |
| "kl_loss_7": 2768.8, |
| "learning_rate": 0.0001481335185648498, |
| "loss": 3269.45, |
| "step": 7510 |
| }, |
| { |
| "ce_loss_13": 2.496452784538269, |
| "ce_loss_26": 1.9704186409711837, |
| "ce_loss_39": 1.784249845147133, |
| "ce_loss_52": 1.4717927530407906, |
| "ce_loss_7": 2.8235138654708862, |
| "epoch": 0.752, |
| "grad_norm": 14.017187066675143, |
| "kl_loss_13": 2081.6, |
| "kl_loss_26": 989.5, |
| "kl_loss_39": 608.0, |
| "kl_loss_7": 2766.4, |
| "learning_rate": 0.0001470080242744218, |
| "loss": 3222.85, |
| "step": 7520 |
| }, |
| { |
| "ce_loss_13": 2.4962650299072267, |
| "ce_loss_26": 1.98213948905468, |
| "ce_loss_39": 1.7954594939947128, |
| "ce_loss_52": 1.480476987361908, |
| "ce_loss_7": 2.827346932888031, |
| "epoch": 0.753, |
| "grad_norm": 14.186670012527646, |
| "kl_loss_13": 2094.8, |
| "kl_loss_26": 998.5, |
| "kl_loss_39": 611.15, |
| "kl_loss_7": 2785.6, |
| "learning_rate": 0.0001458860846092705, |
| "loss": 3232.0, |
| "step": 7530 |
| }, |
| { |
| "ce_loss_13": 2.4669371783733367, |
| "ce_loss_26": 1.9375512719154357, |
| "ce_loss_39": 1.7506729423999787, |
| "ce_loss_52": 1.4334673672914504, |
| "ce_loss_7": 2.7915061593055723, |
| "epoch": 0.754, |
| "grad_norm": 14.32315966365105, |
| "kl_loss_13": 2109.0, |
| "kl_loss_26": 996.8, |
| "kl_loss_39": 613.7, |
| "kl_loss_7": 2794.0, |
| "learning_rate": 0.00014476771086731566, |
| "loss": 3264.6, |
| "step": 7540 |
| }, |
| { |
| "ce_loss_13": 2.4759989261627195, |
| "ce_loss_26": 1.9555140793323518, |
| "ce_loss_39": 1.7739178657531738, |
| "ce_loss_52": 1.4679069191217422, |
| "ce_loss_7": 2.795573103427887, |
| "epoch": 0.755, |
| "grad_norm": 14.011488201466985, |
| "kl_loss_13": 2070.6, |
| "kl_loss_26": 974.7, |
| "kl_loss_39": 593.9, |
| "kl_loss_7": 2754.8, |
| "learning_rate": 0.00014365291431056872, |
| "loss": 3256.8, |
| "step": 7550 |
| }, |
| { |
| "ce_loss_13": 2.424694412946701, |
| "ce_loss_26": 1.903841146826744, |
| "ce_loss_39": 1.7249469131231308, |
| "ce_loss_52": 1.4185307189822196, |
| "ce_loss_7": 2.7595690310001375, |
| "epoch": 0.756, |
| "grad_norm": 14.79387989637837, |
| "kl_loss_13": 2093.2, |
| "kl_loss_26": 989.9, |
| "kl_loss_39": 610.5, |
| "kl_loss_7": 2791.6, |
| "learning_rate": 0.00014254170616501827, |
| "loss": 3235.5, |
| "step": 7560 |
| }, |
| { |
| "ce_loss_13": 2.4660239934921266, |
| "ce_loss_26": 1.9451529324054717, |
| "ce_loss_39": 1.7564547389745713, |
| "ce_loss_52": 1.4434847444295884, |
| "ce_loss_7": 2.7976350009441378, |
| "epoch": 0.757, |
| "grad_norm": 14.844517268447264, |
| "kl_loss_13": 2085.6, |
| "kl_loss_26": 981.2, |
| "kl_loss_39": 602.45, |
| "kl_loss_7": 2776.4, |
| "learning_rate": 0.0001414340976205183, |
| "loss": 3204.2, |
| "step": 7570 |
| }, |
| { |
| "ce_loss_13": 2.4295350134372713, |
| "ce_loss_26": 1.921643227338791, |
| "ce_loss_39": 1.738620987534523, |
| "ce_loss_52": 1.4355527609586716, |
| "ce_loss_7": 2.7507594347000124, |
| "epoch": 0.758, |
| "grad_norm": 14.398235424743639, |
| "kl_loss_13": 2045.8, |
| "kl_loss_26": 965.6, |
| "kl_loss_39": 594.6, |
| "kl_loss_7": 2732.4, |
| "learning_rate": 0.00014033009983067452, |
| "loss": 3240.7, |
| "step": 7580 |
| }, |
| { |
| "ce_loss_13": 2.4676227152347563, |
| "ce_loss_26": 1.9401549130678177, |
| "ce_loss_39": 1.7562287330627442, |
| "ce_loss_52": 1.4347741633653641, |
| "ce_loss_7": 2.8093821585178373, |
| "epoch": 0.759, |
| "grad_norm": 13.736433284616705, |
| "kl_loss_13": 2138.8, |
| "kl_loss_26": 1028.1, |
| "kl_loss_39": 641.7, |
| "kl_loss_7": 2851.2, |
| "learning_rate": 0.00013922972391273224, |
| "loss": 3240.15, |
| "step": 7590 |
| }, |
| { |
| "ce_loss_13": 2.491154599189758, |
| "ce_loss_26": 1.9668046951293945, |
| "ce_loss_39": 1.7748177736997603, |
| "ce_loss_52": 1.4488209426403045, |
| "ce_loss_7": 2.8212892413139343, |
| "epoch": 0.76, |
| "grad_norm": 14.65229593288616, |
| "kl_loss_13": 2140.6, |
| "kl_loss_26": 1032.9, |
| "kl_loss_39": 637.7, |
| "kl_loss_7": 2829.6, |
| "learning_rate": 0.0001381329809474649, |
| "loss": 3239.9, |
| "step": 7600 |
| }, |
| { |
| "ce_loss_13": 2.3942853659391403, |
| "ce_loss_26": 1.892151090502739, |
| "ce_loss_39": 1.7125076562166215, |
| "ce_loss_52": 1.4143452048301697, |
| "ce_loss_7": 2.720611757040024, |
| "epoch": 0.761, |
| "grad_norm": 13.295903354405345, |
| "kl_loss_13": 2008.0, |
| "kl_loss_26": 952.3, |
| "kl_loss_39": 583.5, |
| "kl_loss_7": 2686.4, |
| "learning_rate": 0.0001370398819790621, |
| "loss": 3228.6, |
| "step": 7610 |
| }, |
| { |
| "ce_loss_13": 2.48261901140213, |
| "ce_loss_26": 1.966281446814537, |
| "ce_loss_39": 1.7803379833698272, |
| "ce_loss_52": 1.4673886984586715, |
| "ce_loss_7": 2.805577594041824, |
| "epoch": 0.762, |
| "grad_norm": 14.322747311567188, |
| "kl_loss_13": 2093.4, |
| "kl_loss_26": 1000.3, |
| "kl_loss_39": 610.3, |
| "kl_loss_7": 2776.0, |
| "learning_rate": 0.00013595043801501794, |
| "loss": 3201.5, |
| "step": 7620 |
| }, |
| { |
| "ce_loss_13": 2.443099784851074, |
| "ce_loss_26": 1.9284921824932098, |
| "ce_loss_39": 1.7422718316316606, |
| "ce_loss_52": 1.435012650489807, |
| "ce_loss_7": 2.7720457434654238, |
| "epoch": 0.763, |
| "grad_norm": 14.405471822802745, |
| "kl_loss_13": 2082.6, |
| "kl_loss_26": 994.1, |
| "kl_loss_39": 608.55, |
| "kl_loss_7": 2773.2, |
| "learning_rate": 0.00013486466002602133, |
| "loss": 3225.725, |
| "step": 7630 |
| }, |
| { |
| "ce_loss_13": 2.37467542886734, |
| "ce_loss_26": 1.8506677508354188, |
| "ce_loss_39": 1.6710956811904907, |
| "ce_loss_52": 1.3840662211179733, |
| "ce_loss_7": 2.7066759169101715, |
| "epoch": 0.764, |
| "grad_norm": 13.948958944433121, |
| "kl_loss_13": 2038.6, |
| "kl_loss_26": 946.7, |
| "kl_loss_39": 574.05, |
| "kl_loss_7": 2722.4, |
| "learning_rate": 0.00013378255894584462, |
| "loss": 3167.8, |
| "step": 7640 |
| }, |
| { |
| "ce_loss_13": 2.446861132979393, |
| "ce_loss_26": 1.934667894244194, |
| "ce_loss_39": 1.7494839936494828, |
| "ce_loss_52": 1.4413674265146255, |
| "ce_loss_7": 2.7717737197875976, |
| "epoch": 0.765, |
| "grad_norm": 14.489695554621445, |
| "kl_loss_13": 2087.6, |
| "kl_loss_26": 996.2, |
| "kl_loss_39": 612.05, |
| "kl_loss_7": 2779.4, |
| "learning_rate": 0.0001327041456712334, |
| "loss": 3229.05, |
| "step": 7650 |
| }, |
| { |
| "ce_loss_13": 2.514678430557251, |
| "ce_loss_26": 1.9890475004911423, |
| "ce_loss_39": 1.8049181282520295, |
| "ce_loss_52": 1.4897184193134307, |
| "ce_loss_7": 2.8373226463794707, |
| "epoch": 0.766, |
| "grad_norm": 13.809410965696319, |
| "kl_loss_13": 2109.8, |
| "kl_loss_26": 1013.7, |
| "kl_loss_39": 623.8, |
| "kl_loss_7": 2784.8, |
| "learning_rate": 0.00013162943106179747, |
| "loss": 3248.2, |
| "step": 7660 |
| }, |
| { |
| "ce_loss_13": 2.4804063200950623, |
| "ce_loss_26": 1.9478756994009019, |
| "ce_loss_39": 1.765100008249283, |
| "ce_loss_52": 1.446313591301441, |
| "ce_loss_7": 2.814295369386673, |
| "epoch": 0.767, |
| "grad_norm": 14.599429508154355, |
| "kl_loss_13": 2147.0, |
| "kl_loss_26": 1023.2, |
| "kl_loss_39": 636.45, |
| "kl_loss_7": 2854.0, |
| "learning_rate": 0.00013055842593990132, |
| "loss": 3217.4, |
| "step": 7670 |
| }, |
| { |
| "ce_loss_13": 2.4887916058301927, |
| "ce_loss_26": 1.9737455695867538, |
| "ce_loss_39": 1.7851827770471573, |
| "ce_loss_52": 1.4613285958766937, |
| "ce_loss_7": 2.8164610981941225, |
| "epoch": 0.768, |
| "grad_norm": 14.229376114254006, |
| "kl_loss_13": 2127.6, |
| "kl_loss_26": 1032.8, |
| "kl_loss_39": 640.5, |
| "kl_loss_7": 2811.2, |
| "learning_rate": 0.00012949114109055414, |
| "loss": 3223.675, |
| "step": 7680 |
| }, |
| { |
| "ce_loss_13": 2.389929732680321, |
| "ce_loss_26": 1.8889827966690063, |
| "ce_loss_39": 1.7090455144643784, |
| "ce_loss_52": 1.4120649307966233, |
| "ce_loss_7": 2.710143965482712, |
| "epoch": 0.769, |
| "grad_norm": 13.823270022584358, |
| "kl_loss_13": 2025.6, |
| "kl_loss_26": 958.6, |
| "kl_loss_39": 589.95, |
| "kl_loss_7": 2700.0, |
| "learning_rate": 0.00012842758726130281, |
| "loss": 3247.75, |
| "step": 7690 |
| }, |
| { |
| "ce_loss_13": 2.444611003994942, |
| "ce_loss_26": 1.9318826824426651, |
| "ce_loss_39": 1.7527276873588562, |
| "ce_loss_52": 1.4516576603055, |
| "ce_loss_7": 2.7657779157161713, |
| "epoch": 0.77, |
| "grad_norm": 14.431273089148151, |
| "kl_loss_13": 2049.2, |
| "kl_loss_26": 963.2, |
| "kl_loss_39": 589.25, |
| "kl_loss_7": 2726.0, |
| "learning_rate": 0.00012736777516212267, |
| "loss": 3216.75, |
| "step": 7700 |
| }, |
| { |
| "ce_loss_13": 2.441952568292618, |
| "ce_loss_26": 1.9228222370147705, |
| "ce_loss_39": 1.7338123947381974, |
| "ce_loss_52": 1.4193892806768418, |
| "ce_loss_7": 2.7718379318714144, |
| "epoch": 0.771, |
| "grad_norm": 13.661830756949985, |
| "kl_loss_13": 2115.2, |
| "kl_loss_26": 1008.4, |
| "kl_loss_39": 620.0, |
| "kl_loss_7": 2804.8, |
| "learning_rate": 0.00012631171546530968, |
| "loss": 3199.55, |
| "step": 7710 |
| }, |
| { |
| "ce_loss_13": 2.4535767167806624, |
| "ce_loss_26": 1.9207569301128387, |
| "ce_loss_39": 1.7325547844171525, |
| "ce_loss_52": 1.4173025369644165, |
| "ce_loss_7": 2.7801734030246736, |
| "epoch": 0.772, |
| "grad_norm": 14.176576196561767, |
| "kl_loss_13": 2111.8, |
| "kl_loss_26": 1005.2, |
| "kl_loss_39": 618.15, |
| "kl_loss_7": 2793.6, |
| "learning_rate": 0.00012525941880537307, |
| "loss": 3214.15, |
| "step": 7720 |
| }, |
| { |
| "ce_loss_13": 2.4484546184539795, |
| "ce_loss_26": 1.9310883104801178, |
| "ce_loss_39": 1.7474435329437257, |
| "ce_loss_52": 1.4392137452960014, |
| "ce_loss_7": 2.779611772298813, |
| "epoch": 0.773, |
| "grad_norm": 14.626780180795521, |
| "kl_loss_13": 2095.4, |
| "kl_loss_26": 994.8, |
| "kl_loss_39": 607.9, |
| "kl_loss_7": 2786.8, |
| "learning_rate": 0.00012421089577892869, |
| "loss": 3191.6, |
| "step": 7730 |
| }, |
| { |
| "ce_loss_13": 2.463806739449501, |
| "ce_loss_26": 1.9203673034906388, |
| "ce_loss_39": 1.7268804877996444, |
| "ce_loss_52": 1.4067875519394875, |
| "ce_loss_7": 2.797054660320282, |
| "epoch": 0.774, |
| "grad_norm": 14.221427151080144, |
| "kl_loss_13": 2151.0, |
| "kl_loss_26": 1032.4, |
| "kl_loss_39": 637.25, |
| "kl_loss_7": 2842.0, |
| "learning_rate": 0.0001231661569445919, |
| "loss": 3214.8, |
| "step": 7740 |
| }, |
| { |
| "ce_loss_13": 2.4840691089630127, |
| "ce_loss_26": 1.9805045217275619, |
| "ce_loss_39": 1.7966417849063874, |
| "ce_loss_52": 1.4883142501115798, |
| "ce_loss_7": 2.8029735326766967, |
| "epoch": 0.775, |
| "grad_norm": 14.614162489546528, |
| "kl_loss_13": 2069.8, |
| "kl_loss_26": 990.4, |
| "kl_loss_39": 609.05, |
| "kl_loss_7": 2743.2, |
| "learning_rate": 0.00012212521282287093, |
| "loss": 3200.5, |
| "step": 7750 |
| }, |
| { |
| "ce_loss_13": 2.4842973172664644, |
| "ce_loss_26": 1.9586603373289109, |
| "ce_loss_39": 1.767923679947853, |
| "ce_loss_52": 1.450934961438179, |
| "ce_loss_7": 2.815416473150253, |
| "epoch": 0.776, |
| "grad_norm": 14.872662321169154, |
| "kl_loss_13": 2137.2, |
| "kl_loss_26": 1028.5, |
| "kl_loss_39": 636.95, |
| "kl_loss_7": 2826.4, |
| "learning_rate": 0.00012108807389606158, |
| "loss": 3221.25, |
| "step": 7760 |
| }, |
| { |
| "ce_loss_13": 2.430084604024887, |
| "ce_loss_26": 1.9105432122945785, |
| "ce_loss_39": 1.7237805485725404, |
| "ce_loss_52": 1.419256439805031, |
| "ce_loss_7": 2.7609162449836733, |
| "epoch": 0.777, |
| "grad_norm": 14.122349060255786, |
| "kl_loss_13": 2075.2, |
| "kl_loss_26": 984.9, |
| "kl_loss_39": 602.95, |
| "kl_loss_7": 2769.2, |
| "learning_rate": 0.00012005475060814159, |
| "loss": 3219.35, |
| "step": 7770 |
| }, |
| { |
| "ce_loss_13": 2.4920803755521774, |
| "ce_loss_26": 1.977930763363838, |
| "ce_loss_39": 1.792539432644844, |
| "ce_loss_52": 1.484375348687172, |
| "ce_loss_7": 2.818430072069168, |
| "epoch": 0.778, |
| "grad_norm": 14.838228117967187, |
| "kl_loss_13": 2079.2, |
| "kl_loss_26": 984.5, |
| "kl_loss_39": 603.55, |
| "kl_loss_7": 2766.8, |
| "learning_rate": 0.00011902525336466464, |
| "loss": 3193.3, |
| "step": 7780 |
| }, |
| { |
| "ce_loss_13": 2.4326795816421507, |
| "ce_loss_26": 1.9094915211200714, |
| "ce_loss_39": 1.7265714228153228, |
| "ce_loss_52": 1.4181353628635407, |
| "ce_loss_7": 2.757854151725769, |
| "epoch": 0.779, |
| "grad_norm": 13.98078513544715, |
| "kl_loss_13": 2078.4, |
| "kl_loss_26": 991.0, |
| "kl_loss_39": 610.15, |
| "kl_loss_7": 2758.8, |
| "learning_rate": 0.00011799959253265668, |
| "loss": 3208.85, |
| "step": 7790 |
| }, |
| { |
| "ce_loss_13": 2.42523156106472, |
| "ce_loss_26": 1.9246951520442963, |
| "ce_loss_39": 1.7428549587726594, |
| "ce_loss_52": 1.4467457503080368, |
| "ce_loss_7": 2.7435911536216735, |
| "epoch": 0.78, |
| "grad_norm": 13.811296718067993, |
| "kl_loss_13": 2022.0, |
| "kl_loss_26": 950.5, |
| "kl_loss_39": 578.7, |
| "kl_loss_7": 2701.6, |
| "learning_rate": 0.00011697777844051105, |
| "loss": 3204.7, |
| "step": 7800 |
| }, |
| { |
| "ce_loss_13": 2.496878683567047, |
| "ce_loss_26": 1.9677572190761565, |
| "ce_loss_39": 1.7739976853132249, |
| "ce_loss_52": 1.459720864892006, |
| "ce_loss_7": 2.8327562749385833, |
| "epoch": 0.781, |
| "grad_norm": 14.08625224164901, |
| "kl_loss_13": 2120.4, |
| "kl_loss_26": 1008.7, |
| "kl_loss_39": 615.4, |
| "kl_loss_7": 2822.4, |
| "learning_rate": 0.00011595982137788402, |
| "loss": 3198.55, |
| "step": 7810 |
| }, |
| { |
| "ce_loss_13": 2.4920260161161423, |
| "ce_loss_26": 1.9670526027679442, |
| "ce_loss_39": 1.783473041653633, |
| "ce_loss_52": 1.4704290598630905, |
| "ce_loss_7": 2.8202459871768952, |
| "epoch": 0.782, |
| "grad_norm": 14.200442027165447, |
| "kl_loss_13": 2103.8, |
| "kl_loss_26": 1004.1, |
| "kl_loss_39": 624.45, |
| "kl_loss_7": 2800.4, |
| "learning_rate": 0.00011494573159559212, |
| "loss": 3223.6, |
| "step": 7820 |
| }, |
| { |
| "ce_loss_13": 2.4327739059925078, |
| "ce_loss_26": 1.9302410751581192, |
| "ce_loss_39": 1.7492202669382095, |
| "ce_loss_52": 1.4421478152275085, |
| "ce_loss_7": 2.7496786177158357, |
| "epoch": 0.783, |
| "grad_norm": 13.561882382659508, |
| "kl_loss_13": 2046.4, |
| "kl_loss_26": 985.3, |
| "kl_loss_39": 604.05, |
| "kl_loss_7": 2712.8, |
| "learning_rate": 0.00011393551930550828, |
| "loss": 3172.1, |
| "step": 7830 |
| }, |
| { |
| "ce_loss_13": 2.456186518073082, |
| "ce_loss_26": 1.9265149384737015, |
| "ce_loss_39": 1.7347608864307404, |
| "ce_loss_52": 1.4248275607824326, |
| "ce_loss_7": 2.7833105325698853, |
| "epoch": 0.784, |
| "grad_norm": 14.152082617728679, |
| "kl_loss_13": 2109.0, |
| "kl_loss_26": 1006.1, |
| "kl_loss_39": 611.45, |
| "kl_loss_7": 2800.8, |
| "learning_rate": 0.00011292919468045875, |
| "loss": 3208.05, |
| "step": 7840 |
| }, |
| { |
| "ce_loss_13": 2.4463070958852766, |
| "ce_loss_26": 1.9245142936706543, |
| "ce_loss_39": 1.736380136013031, |
| "ce_loss_52": 1.4392430812120438, |
| "ce_loss_7": 2.776514196395874, |
| "epoch": 0.785, |
| "grad_norm": 13.366541015007158, |
| "kl_loss_13": 2080.4, |
| "kl_loss_26": 983.3, |
| "kl_loss_39": 597.9, |
| "kl_loss_7": 2776.4, |
| "learning_rate": 0.00011192676785412154, |
| "loss": 3185.35, |
| "step": 7850 |
| }, |
| { |
| "ce_loss_13": 2.421262636780739, |
| "ce_loss_26": 1.9249890923500061, |
| "ce_loss_39": 1.7500147104263306, |
| "ce_loss_52": 1.4561963319778441, |
| "ce_loss_7": 2.741842967271805, |
| "epoch": 0.786, |
| "grad_norm": 15.122779815741898, |
| "kl_loss_13": 1987.4, |
| "kl_loss_26": 937.4, |
| "kl_loss_39": 570.05, |
| "kl_loss_7": 2649.2, |
| "learning_rate": 0.00011092824892092374, |
| "loss": 3155.1, |
| "step": 7860 |
| }, |
| { |
| "ce_loss_13": 2.528256595134735, |
| "ce_loss_26": 2.005132633447647, |
| "ce_loss_39": 1.8220074683427812, |
| "ce_loss_52": 1.49842167198658, |
| "ce_loss_7": 2.8524305701255797, |
| "epoch": 0.787, |
| "grad_norm": 14.392088598180058, |
| "kl_loss_13": 2135.0, |
| "kl_loss_26": 1024.4, |
| "kl_loss_39": 633.4, |
| "kl_loss_7": 2828.0, |
| "learning_rate": 0.0001099336479359398, |
| "loss": 3228.45, |
| "step": 7870 |
| }, |
| { |
| "ce_loss_13": 2.4351921498775484, |
| "ce_loss_26": 1.9118896454572678, |
| "ce_loss_39": 1.7326159566640853, |
| "ce_loss_52": 1.43462935090065, |
| "ce_loss_7": 2.7571564972400666, |
| "epoch": 0.788, |
| "grad_norm": 14.109231297319436, |
| "kl_loss_13": 2083.4, |
| "kl_loss_26": 971.8, |
| "kl_loss_39": 593.25, |
| "kl_loss_7": 2767.2, |
| "learning_rate": 0.00010894297491479043, |
| "loss": 3224.35, |
| "step": 7880 |
| }, |
| { |
| "ce_loss_13": 2.4037249386310577, |
| "ce_loss_26": 1.8971479564905167, |
| "ce_loss_39": 1.713542652130127, |
| "ce_loss_52": 1.416736051440239, |
| "ce_loss_7": 2.7244735300540923, |
| "epoch": 0.789, |
| "grad_norm": 14.646464194445937, |
| "kl_loss_13": 2027.0, |
| "kl_loss_26": 959.6, |
| "kl_loss_39": 587.05, |
| "kl_loss_7": 2694.8, |
| "learning_rate": 0.00010795623983354214, |
| "loss": 3163.9, |
| "step": 7890 |
| }, |
| { |
| "ce_loss_13": 2.462240958213806, |
| "ce_loss_26": 1.9522877007722854, |
| "ce_loss_39": 1.7622255086898804, |
| "ce_loss_52": 1.447747752070427, |
| "ce_loss_7": 2.781180214881897, |
| "epoch": 0.79, |
| "grad_norm": 14.486453301055112, |
| "kl_loss_13": 2093.8, |
| "kl_loss_26": 999.0, |
| "kl_loss_39": 616.7, |
| "kl_loss_7": 2768.0, |
| "learning_rate": 0.00010697345262860636, |
| "loss": 3189.25, |
| "step": 7900 |
| }, |
| { |
| "ce_loss_13": 2.443204700946808, |
| "ce_loss_26": 1.9252238601446152, |
| "ce_loss_39": 1.7538990557193757, |
| "ce_loss_52": 1.4549198508262635, |
| "ce_loss_7": 2.762816107273102, |
| "epoch": 0.791, |
| "grad_norm": 14.756973656564183, |
| "kl_loss_13": 2043.6, |
| "kl_loss_26": 953.1, |
| "kl_loss_39": 582.75, |
| "kl_loss_7": 2730.4, |
| "learning_rate": 0.00010599462319663906, |
| "loss": 3189.25, |
| "step": 7910 |
| }, |
| { |
| "ce_loss_13": 2.4734450757503508, |
| "ce_loss_26": 1.9476019829511642, |
| "ce_loss_39": 1.7569423377513886, |
| "ce_loss_52": 1.448093169927597, |
| "ce_loss_7": 2.7963216602802277, |
| "epoch": 0.792, |
| "grad_norm": 14.148651159008981, |
| "kl_loss_13": 2096.8, |
| "kl_loss_26": 997.8, |
| "kl_loss_39": 608.35, |
| "kl_loss_7": 2784.0, |
| "learning_rate": 0.00010501976139444191, |
| "loss": 3199.1, |
| "step": 7920 |
| }, |
| { |
| "ce_loss_13": 2.460918265581131, |
| "ce_loss_26": 1.9421131610870361, |
| "ce_loss_39": 1.7556595474481582, |
| "ce_loss_52": 1.4451121121644974, |
| "ce_loss_7": 2.7843388080596925, |
| "epoch": 0.793, |
| "grad_norm": 14.590509352597412, |
| "kl_loss_13": 2099.4, |
| "kl_loss_26": 995.3, |
| "kl_loss_39": 605.2, |
| "kl_loss_7": 2780.8, |
| "learning_rate": 0.0001040488770388625, |
| "loss": 3203.35, |
| "step": 7930 |
| }, |
| { |
| "ce_loss_13": 2.379360908269882, |
| "ce_loss_26": 1.8618569314479827, |
| "ce_loss_39": 1.6797795861959457, |
| "ce_loss_52": 1.3791978135704994, |
| "ce_loss_7": 2.708191817998886, |
| "epoch": 0.794, |
| "grad_norm": 14.16954347608881, |
| "kl_loss_13": 2053.0, |
| "kl_loss_26": 969.8, |
| "kl_loss_39": 592.9, |
| "kl_loss_7": 2740.8, |
| "learning_rate": 0.00010308197990669538, |
| "loss": 3181.45, |
| "step": 7940 |
| }, |
| { |
| "ce_loss_13": 2.4218691647052766, |
| "ce_loss_26": 1.9039832711219788, |
| "ce_loss_39": 1.7168581753969192, |
| "ce_loss_52": 1.414185357093811, |
| "ce_loss_7": 2.7429304718971252, |
| "epoch": 0.795, |
| "grad_norm": 13.813266382907072, |
| "kl_loss_13": 2088.6, |
| "kl_loss_26": 990.3, |
| "kl_loss_39": 604.5, |
| "kl_loss_7": 2776.4, |
| "learning_rate": 0.0001021190797345839, |
| "loss": 3178.1, |
| "step": 7950 |
| }, |
| { |
| "ce_loss_13": 2.487806275486946, |
| "ce_loss_26": 1.9796326756477356, |
| "ce_loss_39": 1.7957882821559905, |
| "ce_loss_52": 1.484969075024128, |
| "ce_loss_7": 2.8111346662044525, |
| "epoch": 0.796, |
| "grad_norm": 14.164838891881972, |
| "kl_loss_13": 2047.4, |
| "kl_loss_26": 971.5, |
| "kl_loss_39": 591.45, |
| "kl_loss_7": 2717.2, |
| "learning_rate": 0.00010116018621892236, |
| "loss": 3174.95, |
| "step": 7960 |
| }, |
| { |
| "ce_loss_13": 2.398695731163025, |
| "ce_loss_26": 1.8858011841773987, |
| "ce_loss_39": 1.7020757973194123, |
| "ce_loss_52": 1.4106510564684869, |
| "ce_loss_7": 2.721856439113617, |
| "epoch": 0.797, |
| "grad_norm": 14.161604323369735, |
| "kl_loss_13": 2034.0, |
| "kl_loss_26": 953.8, |
| "kl_loss_39": 575.1, |
| "kl_loss_7": 2712.8, |
| "learning_rate": 0.00010020530901575753, |
| "loss": 3177.95, |
| "step": 7970 |
| }, |
| { |
| "ce_loss_13": 2.427662065625191, |
| "ce_loss_26": 1.921607220172882, |
| "ce_loss_39": 1.741393145918846, |
| "ce_loss_52": 1.438681322336197, |
| "ce_loss_7": 2.746969664096832, |
| "epoch": 0.798, |
| "grad_norm": 14.770550097448835, |
| "kl_loss_13": 2042.0, |
| "kl_loss_26": 965.9, |
| "kl_loss_39": 592.1, |
| "kl_loss_7": 2705.2, |
| "learning_rate": 9.925445774069231e-05, |
| "loss": 3170.6, |
| "step": 7980 |
| }, |
| { |
| "ce_loss_13": 2.4366293847560883, |
| "ce_loss_26": 1.9163338214159011, |
| "ce_loss_39": 1.7320117831230164, |
| "ce_loss_52": 1.4146809190511704, |
| "ce_loss_7": 2.772968965768814, |
| "epoch": 0.799, |
| "grad_norm": 13.97785133684068, |
| "kl_loss_13": 2098.2, |
| "kl_loss_26": 1008.8, |
| "kl_loss_39": 619.4, |
| "kl_loss_7": 2795.6, |
| "learning_rate": 9.830764196878872e-05, |
| "loss": 3210.25, |
| "step": 7990 |
| }, |
| { |
| "ce_loss_13": 2.519176536798477, |
| "ce_loss_26": 2.0015997767448424, |
| "ce_loss_39": 1.8160304486751557, |
| "ce_loss_52": 1.4840710669755937, |
| "ce_loss_7": 2.842372918128967, |
| "epoch": 0.8, |
| "grad_norm": 13.949092199595771, |
| "kl_loss_13": 2133.0, |
| "kl_loss_26": 1036.3, |
| "kl_loss_39": 652.95, |
| "kl_loss_7": 2818.8, |
| "learning_rate": 9.736487123447069e-05, |
| "loss": 3181.95, |
| "step": 8000 |
| }, |
| { |
| "ce_loss_13": 2.4726769655942915, |
| "ce_loss_26": 1.9741164237260818, |
| "ce_loss_39": 1.790221494436264, |
| "ce_loss_52": 1.482371485233307, |
| "ce_loss_7": 2.799379500746727, |
| "epoch": 0.801, |
| "grad_norm": 13.707660286112029, |
| "kl_loss_13": 2038.0, |
| "kl_loss_26": 977.9, |
| "kl_loss_39": 599.8, |
| "kl_loss_7": 2716.8, |
| "learning_rate": 9.642615503142926e-05, |
| "loss": 3173.65, |
| "step": 8010 |
| }, |
| { |
| "ce_loss_13": 2.4063422054052355, |
| "ce_loss_26": 1.904910859465599, |
| "ce_loss_39": 1.7236665695905686, |
| "ce_loss_52": 1.4340474352240562, |
| "ce_loss_7": 2.726147544384003, |
| "epoch": 0.802, |
| "grad_norm": 14.759158260471319, |
| "kl_loss_13": 2004.0, |
| "kl_loss_26": 947.5, |
| "kl_loss_39": 570.8, |
| "kl_loss_7": 2678.8, |
| "learning_rate": 9.549150281252633e-05, |
| "loss": 3210.3, |
| "step": 8020 |
| }, |
| { |
| "ce_loss_13": 2.439296191930771, |
| "ce_loss_26": 1.930625182390213, |
| "ce_loss_39": 1.7471411645412445, |
| "ce_loss_52": 1.4468423128128052, |
| "ce_loss_7": 2.759518486261368, |
| "epoch": 0.803, |
| "grad_norm": 14.138193136913905, |
| "kl_loss_13": 2029.8, |
| "kl_loss_26": 959.0, |
| "kl_loss_39": 580.8, |
| "kl_loss_7": 2710.0, |
| "learning_rate": 9.4560923989699e-05, |
| "loss": 3188.85, |
| "step": 8030 |
| }, |
| { |
| "ce_loss_13": 2.383785030245781, |
| "ce_loss_26": 1.8730993419885635, |
| "ce_loss_39": 1.689489060640335, |
| "ce_loss_52": 1.3906694814562797, |
| "ce_loss_7": 2.715326648950577, |
| "epoch": 0.804, |
| "grad_norm": 14.525642481529378, |
| "kl_loss_13": 2063.0, |
| "kl_loss_26": 966.1, |
| "kl_loss_39": 591.6, |
| "kl_loss_7": 2756.4, |
| "learning_rate": 9.363442793386607e-05, |
| "loss": 3171.0, |
| "step": 8040 |
| }, |
| { |
| "ce_loss_13": 2.436983805894852, |
| "ce_loss_26": 1.9301572561264038, |
| "ce_loss_39": 1.7478452265262603, |
| "ce_loss_52": 1.4431490540504455, |
| "ce_loss_7": 2.7593387603759765, |
| "epoch": 0.805, |
| "grad_norm": 14.17334080276183, |
| "kl_loss_13": 2038.0, |
| "kl_loss_26": 967.0, |
| "kl_loss_39": 590.6, |
| "kl_loss_7": 2714.4, |
| "learning_rate": 9.271202397483213e-05, |
| "loss": 3157.4, |
| "step": 8050 |
| }, |
| { |
| "ce_loss_13": 2.4742675691843035, |
| "ce_loss_26": 1.9660158514976502, |
| "ce_loss_39": 1.7886695712804794, |
| "ce_loss_52": 1.4793777346611023, |
| "ce_loss_7": 2.799517345428467, |
| "epoch": 0.806, |
| "grad_norm": 14.807557523499636, |
| "kl_loss_13": 2047.0, |
| "kl_loss_26": 977.2, |
| "kl_loss_39": 606.0, |
| "kl_loss_7": 2719.6, |
| "learning_rate": 9.179372140119524e-05, |
| "loss": 3197.5, |
| "step": 8060 |
| }, |
| { |
| "ce_loss_13": 2.404207941889763, |
| "ce_loss_26": 1.8871434926986694, |
| "ce_loss_39": 1.7082382440567017, |
| "ce_loss_52": 1.4106020584702492, |
| "ce_loss_7": 2.7318927943706512, |
| "epoch": 0.807, |
| "grad_norm": 14.241838480847589, |
| "kl_loss_13": 2054.2, |
| "kl_loss_26": 966.9, |
| "kl_loss_39": 589.9, |
| "kl_loss_7": 2745.6, |
| "learning_rate": 9.087952946025175e-05, |
| "loss": 3174.15, |
| "step": 8070 |
| }, |
| { |
| "ce_loss_13": 2.405471110343933, |
| "ce_loss_26": 1.9090194314718247, |
| "ce_loss_39": 1.7282894462347032, |
| "ce_loss_52": 1.4298398733139037, |
| "ce_loss_7": 2.7295302629470823, |
| "epoch": 0.808, |
| "grad_norm": 14.246105109452872, |
| "kl_loss_13": 2023.6, |
| "kl_loss_26": 956.4, |
| "kl_loss_39": 585.5, |
| "kl_loss_7": 2704.0, |
| "learning_rate": 8.996945735790446e-05, |
| "loss": 3220.95, |
| "step": 8080 |
| }, |
| { |
| "ce_loss_13": 2.4230452179908752, |
| "ce_loss_26": 1.9108994454145432, |
| "ce_loss_39": 1.7252773225307465, |
| "ce_loss_52": 1.4297530561685563, |
| "ce_loss_7": 2.7479528963565825, |
| "epoch": 0.809, |
| "grad_norm": 14.241539859828608, |
| "kl_loss_13": 2056.4, |
| "kl_loss_26": 974.5, |
| "kl_loss_39": 592.4, |
| "kl_loss_7": 2738.0, |
| "learning_rate": 8.906351425856951e-05, |
| "loss": 3187.2, |
| "step": 8090 |
| }, |
| { |
| "ce_loss_13": 2.5193986773490904, |
| "ce_loss_26": 2.004713475704193, |
| "ce_loss_39": 1.8214319556951524, |
| "ce_loss_52": 1.5133182466030122, |
| "ce_loss_7": 2.836967188119888, |
| "epoch": 0.81, |
| "grad_norm": 13.856573501498156, |
| "kl_loss_13": 2059.0, |
| "kl_loss_26": 983.5, |
| "kl_loss_39": 604.75, |
| "kl_loss_7": 2738.8, |
| "learning_rate": 8.816170928508365e-05, |
| "loss": 3199.5, |
| "step": 8100 |
| }, |
| { |
| "ce_loss_13": 2.463338887691498, |
| "ce_loss_26": 1.9509768843650819, |
| "ce_loss_39": 1.7714523404836655, |
| "ce_loss_52": 1.4671026438474655, |
| "ce_loss_7": 2.7907890677452087, |
| "epoch": 0.811, |
| "grad_norm": 14.442792993010306, |
| "kl_loss_13": 2039.6, |
| "kl_loss_26": 963.7, |
| "kl_loss_39": 594.85, |
| "kl_loss_7": 2727.2, |
| "learning_rate": 8.7264051518613e-05, |
| "loss": 3182.6, |
| "step": 8110 |
| }, |
| { |
| "ce_loss_13": 2.35435933470726, |
| "ce_loss_26": 1.8515879094600678, |
| "ce_loss_39": 1.675445196032524, |
| "ce_loss_52": 1.3868303269147872, |
| "ce_loss_7": 2.6736503660678865, |
| "epoch": 0.812, |
| "grad_norm": 15.186200546439618, |
| "kl_loss_13": 2007.0, |
| "kl_loss_26": 943.9, |
| "kl_loss_39": 575.7, |
| "kl_loss_7": 2679.2, |
| "learning_rate": 8.637054999856148e-05, |
| "loss": 3182.4, |
| "step": 8120 |
| }, |
| { |
| "ce_loss_13": 2.4665314495563506, |
| "ce_loss_26": 1.9507400870323182, |
| "ce_loss_39": 1.7698681026697158, |
| "ce_loss_52": 1.463287603855133, |
| "ce_loss_7": 2.790166562795639, |
| "epoch": 0.813, |
| "grad_norm": 14.702441450046226, |
| "kl_loss_13": 2081.6, |
| "kl_loss_26": 994.3, |
| "kl_loss_39": 606.6, |
| "kl_loss_7": 2757.2, |
| "learning_rate": 8.548121372247918e-05, |
| "loss": 3195.8, |
| "step": 8130 |
| }, |
| { |
| "ce_loss_13": 2.4175081342458724, |
| "ce_loss_26": 1.894961017370224, |
| "ce_loss_39": 1.7168689727783204, |
| "ce_loss_52": 1.4184614822268486, |
| "ce_loss_7": 2.74496705532074, |
| "epoch": 0.814, |
| "grad_norm": 14.150480690009331, |
| "kl_loss_13": 2042.4, |
| "kl_loss_26": 956.7, |
| "kl_loss_39": 584.15, |
| "kl_loss_7": 2725.2, |
| "learning_rate": 8.459605164597267e-05, |
| "loss": 3148.95, |
| "step": 8140 |
| }, |
| { |
| "ce_loss_13": 2.4137402385473252, |
| "ce_loss_26": 1.907509195804596, |
| "ce_loss_39": 1.7280682563781737, |
| "ce_loss_52": 1.4408938705921173, |
| "ce_loss_7": 2.734111136198044, |
| "epoch": 0.815, |
| "grad_norm": 14.607407219816483, |
| "kl_loss_13": 2031.8, |
| "kl_loss_26": 952.0, |
| "kl_loss_39": 576.85, |
| "kl_loss_7": 2711.2, |
| "learning_rate": 8.371507268261436e-05, |
| "loss": 3141.15, |
| "step": 8150 |
| }, |
| { |
| "ce_loss_13": 2.45993629693985, |
| "ce_loss_26": 1.9489454805850983, |
| "ce_loss_39": 1.7687684744596481, |
| "ce_loss_52": 1.4672614842653275, |
| "ce_loss_7": 2.7731840908527374, |
| "epoch": 0.816, |
| "grad_norm": 13.941573996490533, |
| "kl_loss_13": 2049.4, |
| "kl_loss_26": 971.7, |
| "kl_loss_39": 590.15, |
| "kl_loss_7": 2723.6, |
| "learning_rate": 8.283828570385238e-05, |
| "loss": 3167.65, |
| "step": 8160 |
| }, |
| { |
| "ce_loss_13": 2.457903391122818, |
| "ce_loss_26": 1.9481427311897277, |
| "ce_loss_39": 1.7692535519599915, |
| "ce_loss_52": 1.473311385512352, |
| "ce_loss_7": 2.7900636374950407, |
| "epoch": 0.817, |
| "grad_norm": 13.820607552825622, |
| "kl_loss_13": 2034.8, |
| "kl_loss_26": 960.7, |
| "kl_loss_39": 581.95, |
| "kl_loss_7": 2732.8, |
| "learning_rate": 8.196569953892202e-05, |
| "loss": 3175.55, |
| "step": 8170 |
| }, |
| { |
| "ce_loss_13": 2.424106791615486, |
| "ce_loss_26": 1.9179951936006545, |
| "ce_loss_39": 1.7393042415380477, |
| "ce_loss_52": 1.4501032710075379, |
| "ce_loss_7": 2.7422122418880464, |
| "epoch": 0.818, |
| "grad_norm": 13.95954126780279, |
| "kl_loss_13": 2017.4, |
| "kl_loss_26": 939.3, |
| "kl_loss_39": 570.5, |
| "kl_loss_7": 2684.4, |
| "learning_rate": 8.109732297475635e-05, |
| "loss": 3172.8, |
| "step": 8180 |
| }, |
| { |
| "ce_loss_13": 2.4480927348136903, |
| "ce_loss_26": 1.9403320997953415, |
| "ce_loss_39": 1.7644436001777648, |
| "ce_loss_52": 1.4613411754369736, |
| "ce_loss_7": 2.767207592725754, |
| "epoch": 0.819, |
| "grad_norm": 15.077790655269643, |
| "kl_loss_13": 2034.2, |
| "kl_loss_26": 959.3, |
| "kl_loss_39": 589.55, |
| "kl_loss_7": 2707.6, |
| "learning_rate": 8.023316475589754e-05, |
| "loss": 3151.8, |
| "step": 8190 |
| }, |
| { |
| "ce_loss_13": 2.389411324262619, |
| "ce_loss_26": 1.8819621950387955, |
| "ce_loss_39": 1.7063862174749374, |
| "ce_loss_52": 1.4187449038028717, |
| "ce_loss_7": 2.7186341762542723, |
| "epoch": 0.82, |
| "grad_norm": 14.069790186558153, |
| "kl_loss_13": 2015.6, |
| "kl_loss_26": 934.0, |
| "kl_loss_39": 570.4, |
| "kl_loss_7": 2711.2, |
| "learning_rate": 7.937323358440934e-05, |
| "loss": 3158.45, |
| "step": 8200 |
| }, |
| { |
| "ce_loss_13": 2.463495451211929, |
| "ce_loss_26": 1.9460216015577316, |
| "ce_loss_39": 1.7633485794067383, |
| "ce_loss_52": 1.459425413608551, |
| "ce_loss_7": 2.7843497574329374, |
| "epoch": 0.821, |
| "grad_norm": 14.129619264599885, |
| "kl_loss_13": 2034.2, |
| "kl_loss_26": 962.3, |
| "kl_loss_39": 588.55, |
| "kl_loss_7": 2710.0, |
| "learning_rate": 7.851753811978923e-05, |
| "loss": 3172.55, |
| "step": 8210 |
| }, |
| { |
| "ce_loss_13": 2.3558076560497283, |
| "ce_loss_26": 1.8546594500541687, |
| "ce_loss_39": 1.6799951493740082, |
| "ce_loss_52": 1.392863529920578, |
| "ce_loss_7": 2.6725959718227386, |
| "epoch": 0.822, |
| "grad_norm": 13.287512232663138, |
| "kl_loss_13": 1979.6, |
| "kl_loss_26": 919.4, |
| "kl_loss_39": 562.4, |
| "kl_loss_7": 2652.0, |
| "learning_rate": 7.766608697888095e-05, |
| "loss": 3151.0, |
| "step": 8220 |
| }, |
| { |
| "ce_loss_13": 2.4153092801570892, |
| "ce_loss_26": 1.9061992377042771, |
| "ce_loss_39": 1.7255131870508194, |
| "ce_loss_52": 1.4212424442172051, |
| "ce_loss_7": 2.748347020149231, |
| "epoch": 0.823, |
| "grad_norm": 14.43300896159423, |
| "kl_loss_13": 2076.6, |
| "kl_loss_26": 977.9, |
| "kl_loss_39": 599.3, |
| "kl_loss_7": 2767.2, |
| "learning_rate": 7.681888873578785e-05, |
| "loss": 3171.1, |
| "step": 8230 |
| }, |
| { |
| "ce_loss_13": 2.4028283417224885, |
| "ce_loss_26": 1.9037913769483565, |
| "ce_loss_39": 1.729577499628067, |
| "ce_loss_52": 1.4313182592391969, |
| "ce_loss_7": 2.7214892983436583, |
| "epoch": 0.824, |
| "grad_norm": 14.091655498494992, |
| "kl_loss_13": 1997.8, |
| "kl_loss_26": 945.5, |
| "kl_loss_39": 582.95, |
| "kl_loss_7": 2668.0, |
| "learning_rate": 7.597595192178702e-05, |
| "loss": 3129.45, |
| "step": 8240 |
| }, |
| { |
| "ce_loss_13": 2.385217198729515, |
| "ce_loss_26": 1.8788419783115387, |
| "ce_loss_39": 1.7016818612813949, |
| "ce_loss_52": 1.4092606633901597, |
| "ce_loss_7": 2.714561605453491, |
| "epoch": 0.825, |
| "grad_norm": 14.092565500396708, |
| "kl_loss_13": 2015.4, |
| "kl_loss_26": 950.7, |
| "kl_loss_39": 578.35, |
| "kl_loss_7": 2700.8, |
| "learning_rate": 7.513728502524286e-05, |
| "loss": 3103.55, |
| "step": 8250 |
| }, |
| { |
| "ce_loss_13": 2.411863788962364, |
| "ce_loss_26": 1.8953818708658219, |
| "ce_loss_39": 1.712015947699547, |
| "ce_loss_52": 1.4228723630309106, |
| "ce_loss_7": 2.7335788309574127, |
| "epoch": 0.826, |
| "grad_norm": 14.683616091837887, |
| "kl_loss_13": 2027.2, |
| "kl_loss_26": 946.2, |
| "kl_loss_39": 569.45, |
| "kl_loss_7": 2700.8, |
| "learning_rate": 7.430289649152156e-05, |
| "loss": 3186.45, |
| "step": 8260 |
| }, |
| { |
| "ce_loss_13": 2.4488519340753556, |
| "ce_loss_26": 1.9516287744045258, |
| "ce_loss_39": 1.7749818950891494, |
| "ce_loss_52": 1.479728889465332, |
| "ce_loss_7": 2.7712768018245697, |
| "epoch": 0.827, |
| "grad_norm": 13.955547255495208, |
| "kl_loss_13": 1991.0, |
| "kl_loss_26": 940.5, |
| "kl_loss_39": 573.4, |
| "kl_loss_7": 2658.4, |
| "learning_rate": 7.347279472290646e-05, |
| "loss": 3163.475, |
| "step": 8270 |
| }, |
| { |
| "ce_loss_13": 2.3786238610744475, |
| "ce_loss_26": 1.8683661013841628, |
| "ce_loss_39": 1.690899032354355, |
| "ce_loss_52": 1.4009160608053208, |
| "ce_loss_7": 2.705441731214523, |
| "epoch": 0.828, |
| "grad_norm": 14.039626698915084, |
| "kl_loss_13": 2016.2, |
| "kl_loss_26": 938.6, |
| "kl_loss_39": 571.15, |
| "kl_loss_7": 2694.4, |
| "learning_rate": 7.264698807851328e-05, |
| "loss": 3118.5, |
| "step": 8280 |
| }, |
| { |
| "ce_loss_13": 2.466960498690605, |
| "ce_loss_26": 1.946975302696228, |
| "ce_loss_39": 1.7688733905553817, |
| "ce_loss_52": 1.4637437134981155, |
| "ce_loss_7": 2.8006490588188173, |
| "epoch": 0.829, |
| "grad_norm": 14.106474347437752, |
| "kl_loss_13": 2090.4, |
| "kl_loss_26": 984.1, |
| "kl_loss_39": 609.1, |
| "kl_loss_7": 2790.4, |
| "learning_rate": 7.182548487420554e-05, |
| "loss": 3184.4, |
| "step": 8290 |
| }, |
| { |
| "ce_loss_13": 2.5116629540920257, |
| "ce_loss_26": 1.9985181391239166, |
| "ce_loss_39": 1.8124066442251205, |
| "ce_loss_52": 1.497176530957222, |
| "ce_loss_7": 2.8335696399211883, |
| "epoch": 0.83, |
| "grad_norm": 14.139665843791814, |
| "kl_loss_13": 2098.8, |
| "kl_loss_26": 1001.4, |
| "kl_loss_39": 614.35, |
| "kl_loss_7": 2780.6, |
| "learning_rate": 7.100829338251146e-05, |
| "loss": 3198.35, |
| "step": 8300 |
| }, |
| { |
| "ce_loss_13": 2.4611269533634186, |
| "ce_loss_26": 1.9527796864509583, |
| "ce_loss_39": 1.7704098969697952, |
| "ce_loss_52": 1.462582492828369, |
| "ce_loss_7": 2.7841490387916563, |
| "epoch": 0.831, |
| "grad_norm": 13.992596562086698, |
| "kl_loss_13": 2062.6, |
| "kl_loss_26": 988.0, |
| "kl_loss_39": 605.35, |
| "kl_loss_7": 2743.2, |
| "learning_rate": 7.019542183254046e-05, |
| "loss": 3175.4, |
| "step": 8310 |
| }, |
| { |
| "ce_loss_13": 2.4340964376926424, |
| "ce_loss_26": 1.9217961221933364, |
| "ce_loss_39": 1.7397069931030273, |
| "ce_loss_52": 1.4361872345209121, |
| "ce_loss_7": 2.759904479980469, |
| "epoch": 0.832, |
| "grad_norm": 14.479445741622119, |
| "kl_loss_13": 2039.6, |
| "kl_loss_26": 971.9, |
| "kl_loss_39": 599.4, |
| "kl_loss_7": 2716.0, |
| "learning_rate": 6.938687840989971e-05, |
| "loss": 3159.45, |
| "step": 8320 |
| }, |
| { |
| "ce_loss_13": 2.4414653837680818, |
| "ce_loss_26": 1.928215390443802, |
| "ce_loss_39": 1.7369425565004348, |
| "ce_loss_52": 1.4351924806833267, |
| "ce_loss_7": 2.76216436624527, |
| "epoch": 0.833, |
| "grad_norm": 14.95621914365995, |
| "kl_loss_13": 2059.8, |
| "kl_loss_26": 983.2, |
| "kl_loss_39": 594.6, |
| "kl_loss_7": 2738.8, |
| "learning_rate": 6.858267125661271e-05, |
| "loss": 3174.0, |
| "step": 8330 |
| }, |
| { |
| "ce_loss_13": 2.400873589515686, |
| "ce_loss_26": 1.90281642973423, |
| "ce_loss_39": 1.7204748094081879, |
| "ce_loss_52": 1.4244474336504935, |
| "ce_loss_7": 2.7152935564517975, |
| "epoch": 0.834, |
| "grad_norm": 14.103644864595271, |
| "kl_loss_13": 2033.8, |
| "kl_loss_26": 973.5, |
| "kl_loss_39": 591.7, |
| "kl_loss_7": 2704.8, |
| "learning_rate": 6.778280847103668e-05, |
| "loss": 3170.05, |
| "step": 8340 |
| }, |
| { |
| "ce_loss_13": 2.374238893389702, |
| "ce_loss_26": 1.8607898473739624, |
| "ce_loss_39": 1.6831828862428666, |
| "ce_loss_52": 1.396483090519905, |
| "ce_loss_7": 2.704102611541748, |
| "epoch": 0.835, |
| "grad_norm": 14.599308778967789, |
| "kl_loss_13": 2031.4, |
| "kl_loss_26": 934.1, |
| "kl_loss_39": 566.7, |
| "kl_loss_7": 2719.2, |
| "learning_rate": 6.698729810778065e-05, |
| "loss": 3150.95, |
| "step": 8350 |
| }, |
| { |
| "ce_loss_13": 2.4648724853992463, |
| "ce_loss_26": 1.9494601666927338, |
| "ce_loss_39": 1.7671974629163743, |
| "ce_loss_52": 1.461349506676197, |
| "ce_loss_7": 2.794687694311142, |
| "epoch": 0.836, |
| "grad_norm": 14.476811719238219, |
| "kl_loss_13": 2060.2, |
| "kl_loss_26": 979.5, |
| "kl_loss_39": 601.55, |
| "kl_loss_7": 2750.8, |
| "learning_rate": 6.619614817762538e-05, |
| "loss": 3140.75, |
| "step": 8360 |
| }, |
| { |
| "ce_loss_13": 2.4076102912425994, |
| "ce_loss_26": 1.9041061371564865, |
| "ce_loss_39": 1.7215475410223007, |
| "ce_loss_52": 1.426313552260399, |
| "ce_loss_7": 2.729493075609207, |
| "epoch": 0.837, |
| "grad_norm": 14.66675091557762, |
| "kl_loss_13": 2014.4, |
| "kl_loss_26": 958.9, |
| "kl_loss_39": 585.55, |
| "kl_loss_7": 2689.6, |
| "learning_rate": 6.540936664744196e-05, |
| "loss": 3161.6, |
| "step": 8370 |
| }, |
| { |
| "ce_loss_13": 2.4151067316532133, |
| "ce_loss_26": 1.9024922668933868, |
| "ce_loss_39": 1.7205139189958571, |
| "ce_loss_52": 1.4315023928880692, |
| "ce_loss_7": 2.733920103311539, |
| "epoch": 0.838, |
| "grad_norm": 13.748708793526905, |
| "kl_loss_13": 2020.6, |
| "kl_loss_26": 957.3, |
| "kl_loss_39": 574.45, |
| "kl_loss_7": 2690.8, |
| "learning_rate": 6.462696144011149e-05, |
| "loss": 3148.0, |
| "step": 8380 |
| }, |
| { |
| "ce_loss_13": 2.4298708856105806, |
| "ce_loss_26": 1.9152013957500458, |
| "ce_loss_39": 1.737311202287674, |
| "ce_loss_52": 1.441886842250824, |
| "ce_loss_7": 2.757741445302963, |
| "epoch": 0.839, |
| "grad_norm": 14.560412200339597, |
| "kl_loss_13": 2019.6, |
| "kl_loss_26": 946.5, |
| "kl_loss_39": 576.85, |
| "kl_loss_7": 2709.2, |
| "learning_rate": 6.384894043444567e-05, |
| "loss": 3144.45, |
| "step": 8390 |
| }, |
| { |
| "ce_loss_13": 2.4371220886707308, |
| "ce_loss_26": 1.9178409904241562, |
| "ce_loss_39": 1.7318467199802399, |
| "ce_loss_52": 1.427680206298828, |
| "ce_loss_7": 2.7716069161891936, |
| "epoch": 0.84, |
| "grad_norm": 13.228002541168403, |
| "kl_loss_13": 2067.8, |
| "kl_loss_26": 990.0, |
| "kl_loss_39": 597.05, |
| "kl_loss_7": 2766.8, |
| "learning_rate": 6.307531146510753e-05, |
| "loss": 3145.15, |
| "step": 8400 |
| }, |
| { |
| "ce_loss_13": 2.4668938338756563, |
| "ce_loss_26": 1.9483750283718109, |
| "ce_loss_39": 1.7615112096071244, |
| "ce_loss_52": 1.4629206866025926, |
| "ce_loss_7": 2.7949269711971283, |
| "epoch": 0.841, |
| "grad_norm": 14.638289805261575, |
| "kl_loss_13": 2070.8, |
| "kl_loss_26": 979.1, |
| "kl_loss_39": 588.85, |
| "kl_loss_7": 2755.2, |
| "learning_rate": 6.230608232253226e-05, |
| "loss": 3135.55, |
| "step": 8410 |
| }, |
| { |
| "ce_loss_13": 2.498942193388939, |
| "ce_loss_26": 1.9719986289739608, |
| "ce_loss_39": 1.7773171186447143, |
| "ce_loss_52": 1.4589012682437896, |
| "ce_loss_7": 2.8206138908863068, |
| "epoch": 0.842, |
| "grad_norm": 14.547037792354297, |
| "kl_loss_13": 2143.2, |
| "kl_loss_26": 1027.6, |
| "kl_loss_39": 626.5, |
| "kl_loss_7": 2830.0, |
| "learning_rate": 6.154126075284855e-05, |
| "loss": 3179.05, |
| "step": 8420 |
| }, |
| { |
| "ce_loss_13": 2.339446923136711, |
| "ce_loss_26": 1.836980375647545, |
| "ce_loss_39": 1.66352079808712, |
| "ce_loss_52": 1.3777535080909729, |
| "ce_loss_7": 2.658161628246307, |
| "epoch": 0.843, |
| "grad_norm": 13.822498461755101, |
| "kl_loss_13": 2006.0, |
| "kl_loss_26": 933.6, |
| "kl_loss_39": 565.3, |
| "kl_loss_7": 2679.6, |
| "learning_rate": 6.078085445780129e-05, |
| "loss": 3158.075, |
| "step": 8430 |
| }, |
| { |
| "ce_loss_13": 2.4486551761627195, |
| "ce_loss_26": 1.9329589813947679, |
| "ce_loss_39": 1.7519038885831832, |
| "ce_loss_52": 1.4464032799005508, |
| "ce_loss_7": 2.776008838415146, |
| "epoch": 0.844, |
| "grad_norm": 13.676549255063142, |
| "kl_loss_13": 2050.2, |
| "kl_loss_26": 971.9, |
| "kl_loss_39": 595.05, |
| "kl_loss_7": 2741.2, |
| "learning_rate": 6.002487109467347e-05, |
| "loss": 3155.95, |
| "step": 8440 |
| }, |
| { |
| "ce_loss_13": 2.467064255475998, |
| "ce_loss_26": 1.9566147327423096, |
| "ce_loss_39": 1.7722377121448516, |
| "ce_loss_52": 1.4749930799007416, |
| "ce_loss_7": 2.7902898490428925, |
| "epoch": 0.845, |
| "grad_norm": 15.195098706872598, |
| "kl_loss_13": 2033.8, |
| "kl_loss_26": 959.4, |
| "kl_loss_39": 581.0, |
| "kl_loss_7": 2712.0, |
| "learning_rate": 5.927331827620902e-05, |
| "loss": 3169.7, |
| "step": 8450 |
| }, |
| { |
| "ce_loss_13": 2.389095312356949, |
| "ce_loss_26": 1.8802162408828735, |
| "ce_loss_39": 1.7042785853147506, |
| "ce_loss_52": 1.4126853346824646, |
| "ce_loss_7": 2.714692497253418, |
| "epoch": 0.846, |
| "grad_norm": 14.567703807315205, |
| "kl_loss_13": 2016.0, |
| "kl_loss_26": 940.7, |
| "kl_loss_39": 574.25, |
| "kl_loss_7": 2700.8, |
| "learning_rate": 5.852620357053651e-05, |
| "loss": 3111.0, |
| "step": 8460 |
| }, |
| { |
| "ce_loss_13": 2.4798492193222046, |
| "ce_loss_26": 1.9612985998392105, |
| "ce_loss_39": 1.7813422173261642, |
| "ce_loss_52": 1.4634388938546181, |
| "ce_loss_7": 2.8045433819293977, |
| "epoch": 0.847, |
| "grad_norm": 13.718944289269317, |
| "kl_loss_13": 2114.4, |
| "kl_loss_26": 1008.1, |
| "kl_loss_39": 624.7, |
| "kl_loss_7": 2801.2, |
| "learning_rate": 5.778353450109286e-05, |
| "loss": 3195.2, |
| "step": 8470 |
| }, |
| { |
| "ce_loss_13": 2.38386265039444, |
| "ce_loss_26": 1.8872032672166825, |
| "ce_loss_39": 1.710950767993927, |
| "ce_loss_52": 1.432640826702118, |
| "ce_loss_7": 2.694201183319092, |
| "epoch": 0.848, |
| "grad_norm": 14.259266485735452, |
| "kl_loss_13": 1968.6, |
| "kl_loss_26": 914.3, |
| "kl_loss_39": 549.1, |
| "kl_loss_7": 2623.2, |
| "learning_rate": 5.7045318546547206e-05, |
| "loss": 3137.025, |
| "step": 8480 |
| }, |
| { |
| "ce_loss_13": 2.427330991625786, |
| "ce_loss_26": 1.910456082224846, |
| "ce_loss_39": 1.7292529791593552, |
| "ce_loss_52": 1.4334994465112687, |
| "ce_loss_7": 2.7538663387298583, |
| "epoch": 0.849, |
| "grad_norm": 13.656054783413236, |
| "kl_loss_13": 2033.4, |
| "kl_loss_26": 954.9, |
| "kl_loss_39": 577.85, |
| "kl_loss_7": 2719.6, |
| "learning_rate": 5.631156314072605e-05, |
| "loss": 3150.65, |
| "step": 8490 |
| }, |
| { |
| "ce_loss_13": 2.471989703178406, |
| "ce_loss_26": 1.9505236119031906, |
| "ce_loss_39": 1.7668047726154328, |
| "ce_loss_52": 1.4608478724956513, |
| "ce_loss_7": 2.8063031315803526, |
| "epoch": 0.85, |
| "grad_norm": 13.681266151650567, |
| "kl_loss_13": 2080.4, |
| "kl_loss_26": 982.8, |
| "kl_loss_39": 600.35, |
| "kl_loss_7": 2780.8, |
| "learning_rate": 5.5582275672538315e-05, |
| "loss": 3137.95, |
| "step": 8500 |
| }, |
| { |
| "ce_loss_13": 2.4605359852313997, |
| "ce_loss_26": 1.939817100763321, |
| "ce_loss_39": 1.753285875916481, |
| "ce_loss_52": 1.4498123317956924, |
| "ce_loss_7": 2.7863478004932403, |
| "epoch": 0.851, |
| "grad_norm": 14.038129063761419, |
| "kl_loss_13": 2055.6, |
| "kl_loss_26": 978.1, |
| "kl_loss_39": 593.3, |
| "kl_loss_7": 2740.0, |
| "learning_rate": 5.4857463485900484e-05, |
| "loss": 3144.75, |
| "step": 8510 |
| }, |
| { |
| "ce_loss_13": 2.450565594434738, |
| "ce_loss_26": 1.9354894876480102, |
| "ce_loss_39": 1.7570757120847702, |
| "ce_loss_52": 1.458202052116394, |
| "ce_loss_7": 2.770776855945587, |
| "epoch": 0.852, |
| "grad_norm": 13.945411873449233, |
| "kl_loss_13": 2028.8, |
| "kl_loss_26": 956.4, |
| "kl_loss_39": 584.4, |
| "kl_loss_7": 2704.4, |
| "learning_rate": 5.413713387966329e-05, |
| "loss": 3147.35, |
| "step": 8520 |
| }, |
| { |
| "ce_loss_13": 2.384511134028435, |
| "ce_loss_26": 1.885845959186554, |
| "ce_loss_39": 1.7117790162563324, |
| "ce_loss_52": 1.4215580940246582, |
| "ce_loss_7": 2.6971611440181733, |
| "epoch": 0.853, |
| "grad_norm": 14.280539701015499, |
| "kl_loss_13": 1983.0, |
| "kl_loss_26": 937.9, |
| "kl_loss_39": 571.6, |
| "kl_loss_7": 2646.0, |
| "learning_rate": 5.34212941075381e-05, |
| "loss": 3138.7, |
| "step": 8530 |
| }, |
| { |
| "ce_loss_13": 2.4187089085578917, |
| "ce_loss_26": 1.9180882632732392, |
| "ce_loss_39": 1.7486219108104706, |
| "ce_loss_52": 1.4668725609779358, |
| "ce_loss_7": 2.7367011964321137, |
| "epoch": 0.854, |
| "grad_norm": 13.894895708827713, |
| "kl_loss_13": 1988.6, |
| "kl_loss_26": 918.2, |
| "kl_loss_39": 556.1, |
| "kl_loss_7": 2660.4, |
| "learning_rate": 5.270995137802315e-05, |
| "loss": 3116.45, |
| "step": 8540 |
| }, |
| { |
| "ce_loss_13": 2.415088692307472, |
| "ce_loss_26": 1.9041693419218064, |
| "ce_loss_39": 1.7267356216907501, |
| "ce_loss_52": 1.4279317557811737, |
| "ce_loss_7": 2.7469893753528596, |
| "epoch": 0.855, |
| "grad_norm": 14.31134935015859, |
| "kl_loss_13": 2040.4, |
| "kl_loss_26": 959.5, |
| "kl_loss_39": 585.9, |
| "kl_loss_7": 2737.2, |
| "learning_rate": 5.2003112854332125e-05, |
| "loss": 3108.3, |
| "step": 8550 |
| }, |
| { |
| "ce_loss_13": 2.461696755886078, |
| "ce_loss_26": 1.9468512892723084, |
| "ce_loss_39": 1.7631896048784257, |
| "ce_loss_52": 1.4654896438121796, |
| "ce_loss_7": 2.780461609363556, |
| "epoch": 0.856, |
| "grad_norm": 14.365329511248612, |
| "kl_loss_13": 2070.0, |
| "kl_loss_26": 974.7, |
| "kl_loss_39": 589.2, |
| "kl_loss_7": 2744.0, |
| "learning_rate": 5.130078565432089e-05, |
| "loss": 3173.9, |
| "step": 8560 |
| }, |
| { |
| "ce_loss_13": 2.4338246136903763, |
| "ce_loss_26": 1.9260531306266784, |
| "ce_loss_39": 1.746686053276062, |
| "ce_loss_52": 1.4506986886262894, |
| "ce_loss_7": 2.7584114193916323, |
| "epoch": 0.857, |
| "grad_norm": 13.58696871299719, |
| "kl_loss_13": 2040.8, |
| "kl_loss_26": 958.4, |
| "kl_loss_39": 587.6, |
| "kl_loss_7": 2720.8, |
| "learning_rate": 5.060297685041659e-05, |
| "loss": 3124.45, |
| "step": 8570 |
| }, |
| { |
| "ce_loss_13": 2.465153419971466, |
| "ce_loss_26": 1.9324041992425918, |
| "ce_loss_39": 1.747180885076523, |
| "ce_loss_52": 1.4378513038158416, |
| "ce_loss_7": 2.8079187512397765, |
| "epoch": 0.858, |
| "grad_norm": 13.69346881706034, |
| "kl_loss_13": 2127.2, |
| "kl_loss_26": 1005.7, |
| "kl_loss_39": 610.45, |
| "kl_loss_7": 2842.8, |
| "learning_rate": 4.99096934695461e-05, |
| "loss": 3135.55, |
| "step": 8580 |
| }, |
| { |
| "ce_loss_13": 2.428412067890167, |
| "ce_loss_26": 1.9264894247055053, |
| "ce_loss_39": 1.748258227109909, |
| "ce_loss_52": 1.4425264418125152, |
| "ce_loss_7": 2.7523947954177856, |
| "epoch": 0.859, |
| "grad_norm": 14.416520228353955, |
| "kl_loss_13": 2024.4, |
| "kl_loss_26": 961.5, |
| "kl_loss_39": 593.95, |
| "kl_loss_7": 2704.0, |
| "learning_rate": 4.922094249306558e-05, |
| "loss": 3141.65, |
| "step": 8590 |
| }, |
| { |
| "ce_loss_13": 2.379640507698059, |
| "ce_loss_26": 1.8703870117664336, |
| "ce_loss_39": 1.6925265491008759, |
| "ce_loss_52": 1.4014200061559676, |
| "ce_loss_7": 2.7008225679397584, |
| "epoch": 0.86, |
| "grad_norm": 14.548164181811538, |
| "kl_loss_13": 2019.0, |
| "kl_loss_26": 952.8, |
| "kl_loss_39": 580.7, |
| "kl_loss_7": 2704.4, |
| "learning_rate": 4.853673085668947e-05, |
| "loss": 3164.35, |
| "step": 8600 |
| }, |
| { |
| "ce_loss_13": 2.3882300436496733, |
| "ce_loss_26": 1.8816026329994202, |
| "ce_loss_39": 1.701096272468567, |
| "ce_loss_52": 1.4169353902339936, |
| "ce_loss_7": 2.711241126060486, |
| "epoch": 0.861, |
| "grad_norm": 13.997789256280235, |
| "kl_loss_13": 2014.0, |
| "kl_loss_26": 941.1, |
| "kl_loss_39": 561.45, |
| "kl_loss_7": 2697.6, |
| "learning_rate": 4.78570654504214e-05, |
| "loss": 3160.75, |
| "step": 8610 |
| }, |
| { |
| "ce_loss_13": 2.4367538392543793, |
| "ce_loss_26": 1.9072220534086228, |
| "ce_loss_39": 1.724021741747856, |
| "ce_loss_52": 1.4287808299064637, |
| "ce_loss_7": 2.7643966376781464, |
| "epoch": 0.862, |
| "grad_norm": 13.935763093108008, |
| "kl_loss_13": 2067.0, |
| "kl_loss_26": 962.6, |
| "kl_loss_39": 576.05, |
| "kl_loss_7": 2765.6, |
| "learning_rate": 4.7181953118484556e-05, |
| "loss": 3127.35, |
| "step": 8620 |
| }, |
| { |
| "ce_loss_13": 2.424758407473564, |
| "ce_loss_26": 1.9169064074754716, |
| "ce_loss_39": 1.7346068799495697, |
| "ce_loss_52": 1.4445551723241805, |
| "ce_loss_7": 2.7455954015254975, |
| "epoch": 0.863, |
| "grad_norm": 14.314671068665351, |
| "kl_loss_13": 2012.8, |
| "kl_loss_26": 950.4, |
| "kl_loss_39": 570.95, |
| "kl_loss_7": 2679.2, |
| "learning_rate": 4.651140065925269e-05, |
| "loss": 3115.55, |
| "step": 8630 |
| }, |
| { |
| "ce_loss_13": 2.5092544078826906, |
| "ce_loss_26": 1.9829395413398743, |
| "ce_loss_39": 1.7923680394887924, |
| "ce_loss_52": 1.481997686624527, |
| "ce_loss_7": 2.838895618915558, |
| "epoch": 0.864, |
| "grad_norm": 14.264233585666156, |
| "kl_loss_13": 2110.2, |
| "kl_loss_26": 1005.9, |
| "kl_loss_39": 616.9, |
| "kl_loss_7": 2805.2, |
| "learning_rate": 4.58454148251814e-05, |
| "loss": 3142.0, |
| "step": 8640 |
| }, |
| { |
| "ce_loss_13": 2.43146056830883, |
| "ce_loss_26": 1.9165301382541657, |
| "ce_loss_39": 1.7361394971609116, |
| "ce_loss_52": 1.4455705016851426, |
| "ce_loss_7": 2.7573634922504424, |
| "epoch": 0.865, |
| "grad_norm": 13.3838976309547, |
| "kl_loss_13": 2018.4, |
| "kl_loss_26": 941.1, |
| "kl_loss_39": 567.4, |
| "kl_loss_7": 2702.0, |
| "learning_rate": 4.518400232274078e-05, |
| "loss": 3128.55, |
| "step": 8650 |
| }, |
| { |
| "ce_loss_13": 2.4056933134794236, |
| "ce_loss_26": 1.8982498347759247, |
| "ce_loss_39": 1.7155311942100524, |
| "ce_loss_52": 1.4283919543027879, |
| "ce_loss_7": 2.7339151203632355, |
| "epoch": 0.866, |
| "grad_norm": 13.776729544594009, |
| "kl_loss_13": 2029.0, |
| "kl_loss_26": 958.5, |
| "kl_loss_39": 575.6, |
| "kl_loss_7": 2712.0, |
| "learning_rate": 4.452716981234745e-05, |
| "loss": 3168.35, |
| "step": 8660 |
| }, |
| { |
| "ce_loss_13": 2.4304875314235685, |
| "ce_loss_26": 1.9086535692214965, |
| "ce_loss_39": 1.731420534849167, |
| "ce_loss_52": 1.4311698615550994, |
| "ce_loss_7": 2.76586651802063, |
| "epoch": 0.867, |
| "grad_norm": 14.03220486850696, |
| "kl_loss_13": 2043.6, |
| "kl_loss_26": 956.8, |
| "kl_loss_39": 585.05, |
| "kl_loss_7": 2738.0, |
| "learning_rate": 4.3874923908297335e-05, |
| "loss": 3147.75, |
| "step": 8670 |
| }, |
| { |
| "ce_loss_13": 2.422931173443794, |
| "ce_loss_26": 1.8919979512691498, |
| "ce_loss_39": 1.7088686615228652, |
| "ce_loss_52": 1.4077896371483802, |
| "ce_loss_7": 2.757069969177246, |
| "epoch": 0.868, |
| "grad_norm": 14.116898056864903, |
| "kl_loss_13": 2080.8, |
| "kl_loss_26": 974.7, |
| "kl_loss_39": 600.2, |
| "kl_loss_7": 2778.4, |
| "learning_rate": 4.322727117869951e-05, |
| "loss": 3132.8, |
| "step": 8680 |
| }, |
| { |
| "ce_loss_13": 2.4079675406217573, |
| "ce_loss_26": 1.9030864268541337, |
| "ce_loss_39": 1.7309907704591752, |
| "ce_loss_52": 1.4429899513721467, |
| "ce_loss_7": 2.725788599252701, |
| "epoch": 0.869, |
| "grad_norm": 13.611192303364593, |
| "kl_loss_13": 1991.0, |
| "kl_loss_26": 927.4, |
| "kl_loss_39": 558.5, |
| "kl_loss_7": 2668.4, |
| "learning_rate": 4.2584218145409916e-05, |
| "loss": 3135.35, |
| "step": 8690 |
| }, |
| { |
| "ce_loss_13": 2.3869937509298325, |
| "ce_loss_26": 1.8738444805145265, |
| "ce_loss_39": 1.6931981056928636, |
| "ce_loss_52": 1.400150865316391, |
| "ce_loss_7": 2.709615921974182, |
| "epoch": 0.87, |
| "grad_norm": 14.271447400785455, |
| "kl_loss_13": 2015.8, |
| "kl_loss_26": 944.7, |
| "kl_loss_39": 573.1, |
| "kl_loss_7": 2692.8, |
| "learning_rate": 4.194577128396521e-05, |
| "loss": 3114.95, |
| "step": 8700 |
| }, |
| { |
| "ce_loss_13": 2.465124714374542, |
| "ce_loss_26": 1.961993396282196, |
| "ce_loss_39": 1.7869856834411622, |
| "ce_loss_52": 1.4856192290782928, |
| "ce_loss_7": 2.785760098695755, |
| "epoch": 0.871, |
| "grad_norm": 13.70337275168833, |
| "kl_loss_13": 2015.8, |
| "kl_loss_26": 955.1, |
| "kl_loss_39": 582.9, |
| "kl_loss_7": 2693.6, |
| "learning_rate": 4.1311937023518264e-05, |
| "loss": 3146.8, |
| "step": 8710 |
| }, |
| { |
| "ce_loss_13": 2.432892268896103, |
| "ce_loss_26": 1.9152618199586868, |
| "ce_loss_39": 1.7295002430677413, |
| "ce_loss_52": 1.4359665989875794, |
| "ce_loss_7": 2.7681704640388487, |
| "epoch": 0.872, |
| "grad_norm": 14.713342569921842, |
| "kl_loss_13": 2035.8, |
| "kl_loss_26": 958.4, |
| "kl_loss_39": 578.45, |
| "kl_loss_7": 2721.6, |
| "learning_rate": 4.0682721746773344e-05, |
| "loss": 3128.35, |
| "step": 8720 |
| }, |
| { |
| "ce_loss_13": 2.3963799655437468, |
| "ce_loss_26": 1.8889498293399811, |
| "ce_loss_39": 1.7117068350315094, |
| "ce_loss_52": 1.4276205718517303, |
| "ce_loss_7": 2.7207881271839143, |
| "epoch": 0.873, |
| "grad_norm": 14.269783754551202, |
| "kl_loss_13": 2001.0, |
| "kl_loss_26": 932.4, |
| "kl_loss_39": 561.8, |
| "kl_loss_7": 2688.8, |
| "learning_rate": 4.0058131789920904e-05, |
| "loss": 3131.65, |
| "step": 8730 |
| }, |
| { |
| "ce_loss_13": 2.418975955247879, |
| "ce_loss_26": 1.8960406243801118, |
| "ce_loss_39": 1.7133139997720719, |
| "ce_loss_52": 1.4180105909705163, |
| "ce_loss_7": 2.7614206850528715, |
| "epoch": 0.874, |
| "grad_norm": 14.114592900051933, |
| "kl_loss_13": 2079.4, |
| "kl_loss_26": 975.5, |
| "kl_loss_39": 598.35, |
| "kl_loss_7": 2790.4, |
| "learning_rate": 3.9438173442575e-05, |
| "loss": 3100.7, |
| "step": 8740 |
| }, |
| { |
| "ce_loss_13": 2.4537671864032746, |
| "ce_loss_26": 1.946785607933998, |
| "ce_loss_39": 1.7634260147809981, |
| "ce_loss_52": 1.4690344750881195, |
| "ce_loss_7": 2.775768506526947, |
| "epoch": 0.875, |
| "grad_norm": 14.35027266854894, |
| "kl_loss_13": 2027.0, |
| "kl_loss_26": 955.8, |
| "kl_loss_39": 572.25, |
| "kl_loss_7": 2707.2, |
| "learning_rate": 3.882285294770937e-05, |
| "loss": 3145.7, |
| "step": 8750 |
| }, |
| { |
| "ce_loss_13": 2.4146986842155456, |
| "ce_loss_26": 1.8880977869033813, |
| "ce_loss_39": 1.7063632160425186, |
| "ce_loss_52": 1.403985047340393, |
| "ce_loss_7": 2.741646242141724, |
| "epoch": 0.876, |
| "grad_norm": 14.018669111873198, |
| "kl_loss_13": 2049.4, |
| "kl_loss_26": 969.5, |
| "kl_loss_39": 590.2, |
| "kl_loss_7": 2731.6, |
| "learning_rate": 3.821217650159453e-05, |
| "loss": 3139.0, |
| "step": 8760 |
| }, |
| { |
| "ce_loss_13": 2.3532112538814545, |
| "ce_loss_26": 1.8646484702825545, |
| "ce_loss_39": 1.6983740404248238, |
| "ce_loss_52": 1.4196315869688987, |
| "ce_loss_7": 2.666028293967247, |
| "epoch": 0.877, |
| "grad_norm": 13.69993271737963, |
| "kl_loss_13": 1919.2, |
| "kl_loss_26": 890.8, |
| "kl_loss_39": 542.85, |
| "kl_loss_7": 2582.0, |
| "learning_rate": 3.760615025373543e-05, |
| "loss": 3109.35, |
| "step": 8770 |
| }, |
| { |
| "ce_loss_13": 2.4570236086845396, |
| "ce_loss_26": 1.938426810503006, |
| "ce_loss_39": 1.7564183056354523, |
| "ce_loss_52": 1.453689630329609, |
| "ce_loss_7": 2.787124717235565, |
| "epoch": 0.878, |
| "grad_norm": 14.764457343001293, |
| "kl_loss_13": 2048.4, |
| "kl_loss_26": 962.8, |
| "kl_loss_39": 585.3, |
| "kl_loss_7": 2732.8, |
| "learning_rate": 3.700478030680987e-05, |
| "loss": 3153.7, |
| "step": 8780 |
| }, |
| { |
| "ce_loss_13": 2.4401866495609283, |
| "ce_loss_26": 1.9257715612649917, |
| "ce_loss_39": 1.7440615922212601, |
| "ce_loss_52": 1.4473457425832748, |
| "ce_loss_7": 2.76454553604126, |
| "epoch": 0.879, |
| "grad_norm": 13.624797595400036, |
| "kl_loss_13": 2038.4, |
| "kl_loss_26": 958.5, |
| "kl_loss_39": 579.85, |
| "kl_loss_7": 2717.6, |
| "learning_rate": 3.6408072716606344e-05, |
| "loss": 3158.95, |
| "step": 8790 |
| }, |
| { |
| "ce_loss_13": 2.3960734605789185, |
| "ce_loss_26": 1.8874068677425384, |
| "ce_loss_39": 1.7073772728443146, |
| "ce_loss_52": 1.4144951313734055, |
| "ce_loss_7": 2.7204393565654756, |
| "epoch": 0.88, |
| "grad_norm": 13.565627756275363, |
| "kl_loss_13": 2008.0, |
| "kl_loss_26": 944.3, |
| "kl_loss_39": 574.6, |
| "kl_loss_7": 2694.0, |
| "learning_rate": 3.5816033491963716e-05, |
| "loss": 3127.45, |
| "step": 8800 |
| }, |
| { |
| "ce_loss_13": 2.426618826389313, |
| "ce_loss_26": 1.921605721116066, |
| "ce_loss_39": 1.7422660619020462, |
| "ce_loss_52": 1.4409180462360383, |
| "ce_loss_7": 2.7458843529224395, |
| "epoch": 0.881, |
| "grad_norm": 14.680252482648882, |
| "kl_loss_13": 2012.6, |
| "kl_loss_26": 955.2, |
| "kl_loss_39": 587.45, |
| "kl_loss_7": 2678.0, |
| "learning_rate": 3.522866859471047e-05, |
| "loss": 3106.075, |
| "step": 8810 |
| }, |
| { |
| "ce_loss_13": 2.4589924097061155, |
| "ce_loss_26": 1.9394809186458588, |
| "ce_loss_39": 1.7563898861408234, |
| "ce_loss_52": 1.459865990281105, |
| "ce_loss_7": 2.780824285745621, |
| "epoch": 0.882, |
| "grad_norm": 13.410206661523853, |
| "kl_loss_13": 2065.4, |
| "kl_loss_26": 975.8, |
| "kl_loss_39": 594.35, |
| "kl_loss_7": 2745.6, |
| "learning_rate": 3.46459839396045e-05, |
| "loss": 3162.5, |
| "step": 8820 |
| }, |
| { |
| "ce_loss_13": 2.444491392374039, |
| "ce_loss_26": 1.919321459531784, |
| "ce_loss_39": 1.7317459166049958, |
| "ce_loss_52": 1.4241959005594254, |
| "ce_loss_7": 2.7687501907348633, |
| "epoch": 0.883, |
| "grad_norm": 13.604154476294898, |
| "kl_loss_13": 2078.6, |
| "kl_loss_26": 980.1, |
| "kl_loss_39": 596.05, |
| "kl_loss_7": 2764.8, |
| "learning_rate": 3.406798539427386e-05, |
| "loss": 3137.15, |
| "step": 8830 |
| }, |
| { |
| "ce_loss_13": 2.4456652402877808, |
| "ce_loss_26": 1.9403656631708146, |
| "ce_loss_39": 1.7609369516372682, |
| "ce_loss_52": 1.4744407176971435, |
| "ce_loss_7": 2.762556844949722, |
| "epoch": 0.884, |
| "grad_norm": 14.0628693009416, |
| "kl_loss_13": 1994.0, |
| "kl_loss_26": 932.0, |
| "kl_loss_39": 557.15, |
| "kl_loss_7": 2667.2, |
| "learning_rate": 3.349467877915746e-05, |
| "loss": 3099.2, |
| "step": 8840 |
| }, |
| { |
| "ce_loss_13": 2.4576884746551513, |
| "ce_loss_26": 1.9445500463247298, |
| "ce_loss_39": 1.764642345905304, |
| "ce_loss_52": 1.4638898521661758, |
| "ce_loss_7": 2.7794412195682527, |
| "epoch": 0.885, |
| "grad_norm": 13.731348532638435, |
| "kl_loss_13": 2048.2, |
| "kl_loss_26": 971.4, |
| "kl_loss_39": 596.3, |
| "kl_loss_7": 2722.8, |
| "learning_rate": 3.292606986744667e-05, |
| "loss": 3152.675, |
| "step": 8850 |
| }, |
| { |
| "ce_loss_13": 2.4683742761611938, |
| "ce_loss_26": 1.9468118786811828, |
| "ce_loss_39": 1.7659433901309967, |
| "ce_loss_52": 1.4723648518323897, |
| "ce_loss_7": 2.794690328836441, |
| "epoch": 0.886, |
| "grad_norm": 14.768179589766811, |
| "kl_loss_13": 2053.6, |
| "kl_loss_26": 960.6, |
| "kl_loss_39": 581.1, |
| "kl_loss_7": 2740.0, |
| "learning_rate": 3.23621643850267e-05, |
| "loss": 3135.9, |
| "step": 8860 |
| }, |
| { |
| "ce_loss_13": 2.3716426849365235, |
| "ce_loss_26": 1.8652270317077637, |
| "ce_loss_39": 1.6864412546157836, |
| "ce_loss_52": 1.4016476958990096, |
| "ce_loss_7": 2.6954882085323333, |
| "epoch": 0.887, |
| "grad_norm": 13.861337667665934, |
| "kl_loss_13": 1995.8, |
| "kl_loss_26": 932.7, |
| "kl_loss_39": 558.6, |
| "kl_loss_7": 2668.8, |
| "learning_rate": 3.180296801041971e-05, |
| "loss": 3116.05, |
| "step": 8870 |
| }, |
| { |
| "ce_loss_13": 2.406647819280624, |
| "ce_loss_26": 1.8954071879386902, |
| "ce_loss_39": 1.7191426277160644, |
| "ce_loss_52": 1.4318138241767884, |
| "ce_loss_7": 2.7277746230363844, |
| "epoch": 0.888, |
| "grad_norm": 14.18696677740657, |
| "kl_loss_13": 1999.0, |
| "kl_loss_26": 937.5, |
| "kl_loss_39": 568.1, |
| "kl_loss_7": 2671.2, |
| "learning_rate": 3.124848637472688e-05, |
| "loss": 3120.85, |
| "step": 8880 |
| }, |
| { |
| "ce_loss_13": 2.4183617502450945, |
| "ce_loss_26": 1.9087469071149825, |
| "ce_loss_39": 1.731605476140976, |
| "ce_loss_52": 1.4397760301828384, |
| "ce_loss_7": 2.7426804542541503, |
| "epoch": 0.889, |
| "grad_norm": 14.005614784571057, |
| "kl_loss_13": 2023.4, |
| "kl_loss_26": 938.6, |
| "kl_loss_39": 572.0, |
| "kl_loss_7": 2704.0, |
| "learning_rate": 3.069872506157212e-05, |
| "loss": 3140.35, |
| "step": 8890 |
| }, |
| { |
| "ce_loss_13": 2.3538796246051787, |
| "ce_loss_26": 1.8491210967302323, |
| "ce_loss_39": 1.6793664902448655, |
| "ce_loss_52": 1.4003751114010812, |
| "ce_loss_7": 2.6818648397922518, |
| "epoch": 0.89, |
| "grad_norm": 13.507964025411896, |
| "kl_loss_13": 1969.8, |
| "kl_loss_26": 909.9, |
| "kl_loss_39": 551.1, |
| "kl_loss_7": 2655.2, |
| "learning_rate": 3.0153689607045842e-05, |
| "loss": 3115.9, |
| "step": 8900 |
| }, |
| { |
| "ce_loss_13": 2.3964832425117493, |
| "ce_loss_26": 1.8831844747066497, |
| "ce_loss_39": 1.7046507805585862, |
| "ce_loss_52": 1.4187346428632737, |
| "ce_loss_7": 2.72187722325325, |
| "epoch": 0.891, |
| "grad_norm": 14.258655617102612, |
| "kl_loss_13": 2013.6, |
| "kl_loss_26": 929.9, |
| "kl_loss_39": 559.3, |
| "kl_loss_7": 2705.2, |
| "learning_rate": 2.9613385499648926e-05, |
| "loss": 3133.85, |
| "step": 8910 |
| }, |
| { |
| "ce_loss_13": 2.3858442664146424, |
| "ce_loss_26": 1.8839757442474365, |
| "ce_loss_39": 1.709816351532936, |
| "ce_loss_52": 1.430666272342205, |
| "ce_loss_7": 2.6985366463661196, |
| "epoch": 0.892, |
| "grad_norm": 14.05296041871021, |
| "kl_loss_13": 1977.6, |
| "kl_loss_26": 918.7, |
| "kl_loss_39": 552.4, |
| "kl_loss_7": 2640.0, |
| "learning_rate": 2.9077818180237692e-05, |
| "loss": 3161.85, |
| "step": 8920 |
| }, |
| { |
| "ce_loss_13": 2.4120604634284972, |
| "ce_loss_26": 1.9131643176078796, |
| "ce_loss_39": 1.741806897521019, |
| "ce_loss_52": 1.4555314972996711, |
| "ce_loss_7": 2.7308483004570006, |
| "epoch": 0.893, |
| "grad_norm": 14.216852449574784, |
| "kl_loss_13": 1989.0, |
| "kl_loss_26": 931.9, |
| "kl_loss_39": 565.55, |
| "kl_loss_7": 2658.8, |
| "learning_rate": 2.8546993041969172e-05, |
| "loss": 3113.0, |
| "step": 8930 |
| }, |
| { |
| "ce_loss_13": 2.417782390117645, |
| "ce_loss_26": 1.9075176060199737, |
| "ce_loss_39": 1.7230383425951004, |
| "ce_loss_52": 1.428551298379898, |
| "ce_loss_7": 2.737876206636429, |
| "epoch": 0.894, |
| "grad_norm": 13.81624953017635, |
| "kl_loss_13": 2024.2, |
| "kl_loss_26": 956.0, |
| "kl_loss_39": 579.25, |
| "kl_loss_7": 2694.8, |
| "learning_rate": 2.802091543024671e-05, |
| "loss": 3118.3, |
| "step": 8940 |
| }, |
| { |
| "ce_loss_13": 2.4281566560268404, |
| "ce_loss_26": 1.9209083169698715, |
| "ce_loss_39": 1.7325717121362687, |
| "ce_loss_52": 1.4276321291923524, |
| "ce_loss_7": 2.7494910418987275, |
| "epoch": 0.895, |
| "grad_norm": 14.667622613471202, |
| "kl_loss_13": 2068.0, |
| "kl_loss_26": 993.0, |
| "kl_loss_39": 599.95, |
| "kl_loss_7": 2738.4, |
| "learning_rate": 2.7499590642665774e-05, |
| "loss": 3152.3, |
| "step": 8950 |
| }, |
| { |
| "ce_loss_13": 2.4123401612043383, |
| "ce_loss_26": 1.9057573080062866, |
| "ce_loss_39": 1.725106343626976, |
| "ce_loss_52": 1.43211932182312, |
| "ce_loss_7": 2.7356060326099394, |
| "epoch": 0.896, |
| "grad_norm": 13.84770444101258, |
| "kl_loss_13": 2014.8, |
| "kl_loss_26": 952.6, |
| "kl_loss_39": 575.75, |
| "kl_loss_7": 2696.2, |
| "learning_rate": 2.6983023928961405e-05, |
| "loss": 3131.5, |
| "step": 8960 |
| }, |
| { |
| "ce_loss_13": 2.3709884881973267, |
| "ce_loss_26": 1.868075394630432, |
| "ce_loss_39": 1.6942670613527298, |
| "ce_loss_52": 1.3973538905382157, |
| "ce_loss_7": 2.69813577234745, |
| "epoch": 0.897, |
| "grad_norm": 14.584597117704368, |
| "kl_loss_13": 2004.8, |
| "kl_loss_26": 940.7, |
| "kl_loss_39": 576.3, |
| "kl_loss_7": 2687.2, |
| "learning_rate": 2.6471220490954628e-05, |
| "loss": 3144.15, |
| "step": 8970 |
| }, |
| { |
| "ce_loss_13": 2.421136862039566, |
| "ce_loss_26": 1.9213113605976104, |
| "ce_loss_39": 1.7483066588640213, |
| "ce_loss_52": 1.4667307168245316, |
| "ce_loss_7": 2.7391393184661865, |
| "epoch": 0.898, |
| "grad_norm": 14.053042951615785, |
| "kl_loss_13": 1968.0, |
| "kl_loss_26": 914.9, |
| "kl_loss_39": 554.05, |
| "kl_loss_7": 2636.4, |
| "learning_rate": 2.596418548250029e-05, |
| "loss": 3077.8, |
| "step": 8980 |
| }, |
| { |
| "ce_loss_13": 2.383489468693733, |
| "ce_loss_26": 1.8838744014501572, |
| "ce_loss_39": 1.7122972816228867, |
| "ce_loss_52": 1.4258252471685409, |
| "ce_loss_7": 2.7002600908279417, |
| "epoch": 0.899, |
| "grad_norm": 13.786718301064743, |
| "kl_loss_13": 1996.2, |
| "kl_loss_26": 931.7, |
| "kl_loss_39": 571.05, |
| "kl_loss_7": 2666.4, |
| "learning_rate": 2.5461924009435368e-05, |
| "loss": 3075.25, |
| "step": 8990 |
| }, |
| { |
| "ce_loss_13": 2.3905730485916137, |
| "ce_loss_26": 1.8808085292577743, |
| "ce_loss_39": 1.7040841788053513, |
| "ce_loss_52": 1.4255578130483628, |
| "ce_loss_7": 2.7172752916812897, |
| "epoch": 0.9, |
| "grad_norm": 14.142930179347214, |
| "kl_loss_13": 1983.0, |
| "kl_loss_26": 912.3, |
| "kl_loss_39": 546.25, |
| "kl_loss_7": 2670.0, |
| "learning_rate": 2.4964441129527336e-05, |
| "loss": 3116.85, |
| "step": 9000 |
| }, |
| { |
| "ce_loss_13": 2.43511378467083, |
| "ce_loss_26": 1.926012173295021, |
| "ce_loss_39": 1.7415268182754517, |
| "ce_loss_52": 1.4408889025449754, |
| "ce_loss_7": 2.761294722557068, |
| "epoch": 0.901, |
| "grad_norm": 13.928144692729873, |
| "kl_loss_13": 2050.6, |
| "kl_loss_26": 980.3, |
| "kl_loss_39": 592.65, |
| "kl_loss_7": 2732.0, |
| "learning_rate": 2.4471741852423235e-05, |
| "loss": 3132.7, |
| "step": 9010 |
| }, |
| { |
| "ce_loss_13": 2.3582999795675277, |
| "ce_loss_26": 1.8586651980876923, |
| "ce_loss_39": 1.6817226380109787, |
| "ce_loss_52": 1.3925694867968559, |
| "ce_loss_7": 2.6693267047405245, |
| "epoch": 0.902, |
| "grad_norm": 14.116392246566738, |
| "kl_loss_13": 1978.6, |
| "kl_loss_26": 929.3, |
| "kl_loss_39": 571.5, |
| "kl_loss_7": 2638.8, |
| "learning_rate": 2.3983831139599287e-05, |
| "loss": 3114.75, |
| "step": 9020 |
| }, |
| { |
| "ce_loss_13": 2.418634516000748, |
| "ce_loss_26": 1.9078166902065277, |
| "ce_loss_39": 1.7249888181686401, |
| "ce_loss_52": 1.4335400015115738, |
| "ce_loss_7": 2.7412546992301943, |
| "epoch": 0.903, |
| "grad_norm": 13.637306812880459, |
| "kl_loss_13": 2036.0, |
| "kl_loss_26": 950.4, |
| "kl_loss_39": 575.5, |
| "kl_loss_7": 2710.0, |
| "learning_rate": 2.3500713904311022e-05, |
| "loss": 3133.35, |
| "step": 9030 |
| }, |
| { |
| "ce_loss_13": 2.3931309431791306, |
| "ce_loss_26": 1.877245968580246, |
| "ce_loss_39": 1.7018247723579407, |
| "ce_loss_52": 1.4155418664216994, |
| "ce_loss_7": 2.7214196979999543, |
| "epoch": 0.904, |
| "grad_norm": 14.785773850168622, |
| "kl_loss_13": 2014.2, |
| "kl_loss_26": 931.6, |
| "kl_loss_39": 566.7, |
| "kl_loss_7": 2697.2, |
| "learning_rate": 2.3022395011543685e-05, |
| "loss": 3107.4, |
| "step": 9040 |
| }, |
| { |
| "ce_loss_13": 2.4141552269458773, |
| "ce_loss_26": 1.9003157913684845, |
| "ce_loss_39": 1.7119427561759948, |
| "ce_loss_52": 1.4218156844377519, |
| "ce_loss_7": 2.738201731443405, |
| "epoch": 0.905, |
| "grad_norm": 15.004045135842484, |
| "kl_loss_13": 2046.4, |
| "kl_loss_26": 956.8, |
| "kl_loss_39": 574.8, |
| "kl_loss_7": 2728.0, |
| "learning_rate": 2.2548879277963063e-05, |
| "loss": 3131.35, |
| "step": 9050 |
| }, |
| { |
| "ce_loss_13": 2.4368073105812074, |
| "ce_loss_26": 1.9358865648508072, |
| "ce_loss_39": 1.758334356546402, |
| "ce_loss_52": 1.4668044418096542, |
| "ce_loss_7": 2.7515600681304933, |
| "epoch": 0.906, |
| "grad_norm": 14.409368784921233, |
| "kl_loss_13": 1999.0, |
| "kl_loss_26": 944.8, |
| "kl_loss_39": 573.2, |
| "kl_loss_7": 2666.4, |
| "learning_rate": 2.208017147186736e-05, |
| "loss": 3129.15, |
| "step": 9060 |
| }, |
| { |
| "ce_loss_13": 2.4492080837488173, |
| "ce_loss_26": 1.9512592017650605, |
| "ce_loss_39": 1.7678290545940398, |
| "ce_loss_52": 1.4713279128074646, |
| "ce_loss_7": 2.772859865427017, |
| "epoch": 0.907, |
| "grad_norm": 14.409868370647818, |
| "kl_loss_13": 2027.6, |
| "kl_loss_26": 962.9, |
| "kl_loss_39": 580.75, |
| "kl_loss_7": 2708.4, |
| "learning_rate": 2.1616276313139227e-05, |
| "loss": 3136.45, |
| "step": 9070 |
| }, |
| { |
| "ce_loss_13": 2.3629566222429275, |
| "ce_loss_26": 1.866456887125969, |
| "ce_loss_39": 1.692075565457344, |
| "ce_loss_52": 1.4036286368966102, |
| "ce_loss_7": 2.681915229558945, |
| "epoch": 0.908, |
| "grad_norm": 13.121489500516578, |
| "kl_loss_13": 1981.6, |
| "kl_loss_26": 934.0, |
| "kl_loss_39": 568.15, |
| "kl_loss_7": 2653.6, |
| "learning_rate": 2.1157198473197415e-05, |
| "loss": 3145.35, |
| "step": 9080 |
| }, |
| { |
| "ce_loss_13": 2.4373748511075974, |
| "ce_loss_26": 1.9266161501407624, |
| "ce_loss_39": 1.7425854057073593, |
| "ce_loss_52": 1.4446089684963226, |
| "ce_loss_7": 2.7682818710803985, |
| "epoch": 0.909, |
| "grad_norm": 14.230511024408004, |
| "kl_loss_13": 2056.6, |
| "kl_loss_26": 973.2, |
| "kl_loss_39": 588.0, |
| "kl_loss_7": 2750.4, |
| "learning_rate": 2.0702942574950812e-05, |
| "loss": 3127.5, |
| "step": 9090 |
| }, |
| { |
| "ce_loss_13": 2.4287337332963945, |
| "ce_loss_26": 1.9172603338956833, |
| "ce_loss_39": 1.7392981857061387, |
| "ce_loss_52": 1.4425375372171403, |
| "ce_loss_7": 2.7510289788246154, |
| "epoch": 0.91, |
| "grad_norm": 14.057406508919666, |
| "kl_loss_13": 2046.6, |
| "kl_loss_26": 964.4, |
| "kl_loss_39": 593.1, |
| "kl_loss_7": 2722.4, |
| "learning_rate": 2.025351319275137e-05, |
| "loss": 3121.6, |
| "step": 9100 |
| }, |
| { |
| "ce_loss_13": 2.4320779502391816, |
| "ce_loss_26": 1.9104785054922104, |
| "ce_loss_39": 1.7267427280545236, |
| "ce_loss_52": 1.437354525923729, |
| "ce_loss_7": 2.7588888108730316, |
| "epoch": 0.911, |
| "grad_norm": 14.001627192514695, |
| "kl_loss_13": 2041.6, |
| "kl_loss_26": 960.0, |
| "kl_loss_39": 579.65, |
| "kl_loss_7": 2727.2, |
| "learning_rate": 1.9808914852347816e-05, |
| "loss": 3132.0, |
| "step": 9110 |
| }, |
| { |
| "ce_loss_13": 2.4577192962169647, |
| "ce_loss_26": 1.9406163454055787, |
| "ce_loss_39": 1.7553843706846237, |
| "ce_loss_52": 1.4540565699338912, |
| "ce_loss_7": 2.7937385022640226, |
| "epoch": 0.912, |
| "grad_norm": 14.03191291932469, |
| "kl_loss_13": 2055.4, |
| "kl_loss_26": 970.4, |
| "kl_loss_39": 590.75, |
| "kl_loss_7": 2749.6, |
| "learning_rate": 1.9369152030840554e-05, |
| "loss": 3137.8, |
| "step": 9120 |
| }, |
| { |
| "ce_loss_13": 2.37432479262352, |
| "ce_loss_26": 1.874118760228157, |
| "ce_loss_39": 1.6963829159736634, |
| "ce_loss_52": 1.4185152500867844, |
| "ce_loss_7": 2.692077511548996, |
| "epoch": 0.913, |
| "grad_norm": 14.634301127161054, |
| "kl_loss_13": 1966.6, |
| "kl_loss_26": 913.3, |
| "kl_loss_39": 546.15, |
| "kl_loss_7": 2640.0, |
| "learning_rate": 1.893422915663645e-05, |
| "loss": 3130.35, |
| "step": 9130 |
| }, |
| { |
| "ce_loss_13": 2.485106924176216, |
| "ce_loss_26": 1.9777852237224578, |
| "ce_loss_39": 1.7995707392692566, |
| "ce_loss_52": 1.4930359899997712, |
| "ce_loss_7": 2.800886517763138, |
| "epoch": 0.914, |
| "grad_norm": 13.987291564294976, |
| "kl_loss_13": 2050.4, |
| "kl_loss_26": 985.4, |
| "kl_loss_39": 605.6, |
| "kl_loss_7": 2732.4, |
| "learning_rate": 1.850415060940386e-05, |
| "loss": 3103.7, |
| "step": 9140 |
| }, |
| { |
| "ce_loss_13": 2.4284686923027037, |
| "ce_loss_26": 1.928224155306816, |
| "ce_loss_39": 1.7489481002092362, |
| "ce_loss_52": 1.4568757116794586, |
| "ce_loss_7": 2.7469853341579435, |
| "epoch": 0.915, |
| "grad_norm": 14.256565267478091, |
| "kl_loss_13": 2014.8, |
| "kl_loss_26": 950.5, |
| "kl_loss_39": 576.35, |
| "kl_loss_7": 2683.2, |
| "learning_rate": 1.8078920720028978e-05, |
| "loss": 3089.85, |
| "step": 9150 |
| }, |
| { |
| "ce_loss_13": 2.397159770131111, |
| "ce_loss_26": 1.9007071822881698, |
| "ce_loss_39": 1.7246300727128983, |
| "ce_loss_52": 1.4446155533194542, |
| "ce_loss_7": 2.7181631565093993, |
| "epoch": 0.916, |
| "grad_norm": 14.893106945439357, |
| "kl_loss_13": 1959.0, |
| "kl_loss_26": 905.8, |
| "kl_loss_39": 547.35, |
| "kl_loss_7": 2633.6, |
| "learning_rate": 1.765854377057219e-05, |
| "loss": 3113.25, |
| "step": 9160 |
| }, |
| { |
| "ce_loss_13": 2.391612654924393, |
| "ce_loss_26": 1.8863595336675645, |
| "ce_loss_39": 1.7089181810617446, |
| "ce_loss_52": 1.4133323535323143, |
| "ce_loss_7": 2.7117814838886263, |
| "epoch": 0.917, |
| "grad_norm": 13.727089751993333, |
| "kl_loss_13": 2026.6, |
| "kl_loss_26": 952.4, |
| "kl_loss_39": 579.55, |
| "kl_loss_7": 2699.2, |
| "learning_rate": 1.724302399422456e-05, |
| "loss": 3114.9, |
| "step": 9170 |
| }, |
| { |
| "ce_loss_13": 2.3898652464151384, |
| "ce_loss_26": 1.8965242326259613, |
| "ce_loss_39": 1.7190593391656876, |
| "ce_loss_52": 1.4282272264361382, |
| "ce_loss_7": 2.70514101088047, |
| "epoch": 0.918, |
| "grad_norm": 14.466760453933187, |
| "kl_loss_13": 1964.6, |
| "kl_loss_26": 920.2, |
| "kl_loss_39": 557.4, |
| "kl_loss_7": 2637.2, |
| "learning_rate": 1.683236557526574e-05, |
| "loss": 3117.7, |
| "step": 9180 |
| }, |
| { |
| "ce_loss_13": 2.363905116915703, |
| "ce_loss_26": 1.8732207268476486, |
| "ce_loss_39": 1.6985140055418015, |
| "ce_loss_52": 1.4114336684346198, |
| "ce_loss_7": 2.6808120787143705, |
| "epoch": 0.919, |
| "grad_norm": 13.740003598916001, |
| "kl_loss_13": 1964.6, |
| "kl_loss_26": 921.6, |
| "kl_loss_39": 557.8, |
| "kl_loss_7": 2632.0, |
| "learning_rate": 1.6426572649021475e-05, |
| "loss": 3121.6, |
| "step": 9190 |
| }, |
| { |
| "ce_loss_13": 2.4235911548137663, |
| "ce_loss_26": 1.911160832643509, |
| "ce_loss_39": 1.7327239394187928, |
| "ce_loss_52": 1.4489689737558364, |
| "ce_loss_7": 2.740164947509766, |
| "epoch": 0.92, |
| "grad_norm": 14.26925795970607, |
| "kl_loss_13": 2008.2, |
| "kl_loss_26": 933.2, |
| "kl_loss_39": 559.8, |
| "kl_loss_7": 2678.0, |
| "learning_rate": 1.6025649301821876e-05, |
| "loss": 3113.6, |
| "step": 9200 |
| }, |
| { |
| "ce_loss_13": 2.4683689922094345, |
| "ce_loss_26": 1.9488540649414063, |
| "ce_loss_39": 1.7631026744842528, |
| "ce_loss_52": 1.4620952308177948, |
| "ce_loss_7": 2.794441765546799, |
| "epoch": 0.921, |
| "grad_norm": 14.076823047271347, |
| "kl_loss_13": 2051.4, |
| "kl_loss_26": 970.8, |
| "kl_loss_39": 588.7, |
| "kl_loss_7": 2729.2, |
| "learning_rate": 1.5629599570960716e-05, |
| "loss": 3104.8, |
| "step": 9210 |
| }, |
| { |
| "ce_loss_13": 2.3585807204246523, |
| "ce_loss_26": 1.8531203657388686, |
| "ce_loss_39": 1.6820847302675248, |
| "ce_loss_52": 1.4014029562473298, |
| "ce_loss_7": 2.6779536455869675, |
| "epoch": 0.922, |
| "grad_norm": 13.947276602522905, |
| "kl_loss_13": 1987.2, |
| "kl_loss_26": 913.9, |
| "kl_loss_39": 557.25, |
| "kl_loss_7": 2664.4, |
| "learning_rate": 1.5238427444654367e-05, |
| "loss": 3096.2, |
| "step": 9220 |
| }, |
| { |
| "ce_loss_13": 2.3690007477998734, |
| "ce_loss_26": 1.8584237039089202, |
| "ce_loss_39": 1.6857000291347504, |
| "ce_loss_52": 1.3957198202610015, |
| "ce_loss_7": 2.697771596908569, |
| "epoch": 0.923, |
| "grad_norm": 13.90974821940606, |
| "kl_loss_13": 2016.4, |
| "kl_loss_26": 937.5, |
| "kl_loss_39": 574.65, |
| "kl_loss_7": 2710.0, |
| "learning_rate": 1.4852136862001764e-05, |
| "loss": 3120.95, |
| "step": 9230 |
| }, |
| { |
| "ce_loss_13": 2.386936154961586, |
| "ce_loss_26": 1.8795882225036622, |
| "ce_loss_39": 1.7048650175333022, |
| "ce_loss_52": 1.4252549767494203, |
| "ce_loss_7": 2.7094414860010145, |
| "epoch": 0.924, |
| "grad_norm": 14.075158349443141, |
| "kl_loss_13": 1994.4, |
| "kl_loss_26": 926.6, |
| "kl_loss_39": 558.55, |
| "kl_loss_7": 2674.0, |
| "learning_rate": 1.4470731712944884e-05, |
| "loss": 3095.9, |
| "step": 9240 |
| }, |
| { |
| "ce_loss_13": 2.480703926086426, |
| "ce_loss_26": 1.9693680822849273, |
| "ce_loss_39": 1.7812021166086196, |
| "ce_loss_52": 1.4721150636672973, |
| "ce_loss_7": 2.808130669593811, |
| "epoch": 0.925, |
| "grad_norm": 13.823292097408816, |
| "kl_loss_13": 2057.0, |
| "kl_loss_26": 989.9, |
| "kl_loss_39": 602.9, |
| "kl_loss_7": 2740.4, |
| "learning_rate": 1.4094215838229174e-05, |
| "loss": 3114.1, |
| "step": 9250 |
| }, |
| { |
| "ce_loss_13": 2.415296331048012, |
| "ce_loss_26": 1.9087094902992248, |
| "ce_loss_39": 1.7277994453907013, |
| "ce_loss_52": 1.4381081372499467, |
| "ce_loss_7": 2.738765448331833, |
| "epoch": 0.926, |
| "grad_norm": 14.170973390394844, |
| "kl_loss_13": 2029.4, |
| "kl_loss_26": 949.8, |
| "kl_loss_39": 570.55, |
| "kl_loss_7": 2714.0, |
| "learning_rate": 1.372259302936546e-05, |
| "loss": 3105.95, |
| "step": 9260 |
| }, |
| { |
| "ce_loss_13": 2.348677235841751, |
| "ce_loss_26": 1.8576316490769387, |
| "ce_loss_39": 1.6828459605574608, |
| "ce_loss_52": 1.3921316027641297, |
| "ce_loss_7": 2.6635967582464217, |
| "epoch": 0.927, |
| "grad_norm": 14.119512693749174, |
| "kl_loss_13": 1987.8, |
| "kl_loss_26": 934.2, |
| "kl_loss_39": 566.95, |
| "kl_loss_7": 2660.0, |
| "learning_rate": 1.3355867028591206e-05, |
| "loss": 3097.55, |
| "step": 9270 |
| }, |
| { |
| "ce_loss_13": 2.3999607056379317, |
| "ce_loss_26": 1.8902852058410644, |
| "ce_loss_39": 1.7127097964286804, |
| "ce_loss_52": 1.4248219341039658, |
| "ce_loss_7": 2.7230198085308075, |
| "epoch": 0.928, |
| "grad_norm": 14.447160188213138, |
| "kl_loss_13": 1992.2, |
| "kl_loss_26": 927.2, |
| "kl_loss_39": 561.6, |
| "kl_loss_7": 2679.2, |
| "learning_rate": 1.2994041528833267e-05, |
| "loss": 3090.65, |
| "step": 9280 |
| }, |
| { |
| "ce_loss_13": 2.502937263250351, |
| "ce_loss_26": 1.9786556929349899, |
| "ce_loss_39": 1.7848447173833848, |
| "ce_loss_52": 1.4703152477741241, |
| "ce_loss_7": 2.837230235338211, |
| "epoch": 0.929, |
| "grad_norm": 13.618067390259025, |
| "kl_loss_13": 2119.0, |
| "kl_loss_26": 1012.1, |
| "kl_loss_39": 614.9, |
| "kl_loss_7": 2810.8, |
| "learning_rate": 1.2637120173670358e-05, |
| "loss": 3139.05, |
| "step": 9290 |
| }, |
| { |
| "ce_loss_13": 2.4561884820461275, |
| "ce_loss_26": 1.9343136429786683, |
| "ce_loss_39": 1.747347640991211, |
| "ce_loss_52": 1.4338387340307235, |
| "ce_loss_7": 2.7828237235546114, |
| "epoch": 0.93, |
| "grad_norm": 14.04284428447472, |
| "kl_loss_13": 2098.8, |
| "kl_loss_26": 1005.3, |
| "kl_loss_39": 621.35, |
| "kl_loss_7": 2786.4, |
| "learning_rate": 1.2285106557296478e-05, |
| "loss": 3143.8, |
| "step": 9300 |
| }, |
| { |
| "ce_loss_13": 2.3713702976703646, |
| "ce_loss_26": 1.868654829263687, |
| "ce_loss_39": 1.6906698912382125, |
| "ce_loss_52": 1.409242296218872, |
| "ce_loss_7": 2.7044317960739135, |
| "epoch": 0.931, |
| "grad_norm": 14.323129542469303, |
| "kl_loss_13": 2006.0, |
| "kl_loss_26": 926.0, |
| "kl_loss_39": 564.0, |
| "kl_loss_7": 2676.4, |
| "learning_rate": 1.1938004224484989e-05, |
| "loss": 3116.75, |
| "step": 9310 |
| }, |
| { |
| "ce_loss_13": 2.428625673055649, |
| "ce_loss_26": 1.9227415055036545, |
| "ce_loss_39": 1.7408353060483932, |
| "ce_loss_52": 1.4440293073654176, |
| "ce_loss_7": 2.7480487704277037, |
| "epoch": 0.932, |
| "grad_norm": 13.286547502225938, |
| "kl_loss_13": 2016.8, |
| "kl_loss_26": 948.6, |
| "kl_loss_39": 573.45, |
| "kl_loss_7": 2695.6, |
| "learning_rate": 1.1595816670552429e-05, |
| "loss": 3098.9, |
| "step": 9320 |
| }, |
| { |
| "ce_loss_13": 2.3783950984477995, |
| "ce_loss_26": 1.871030893921852, |
| "ce_loss_39": 1.6935174107551574, |
| "ce_loss_52": 1.4064364448189735, |
| "ce_loss_7": 2.709024131298065, |
| "epoch": 0.933, |
| "grad_norm": 13.906014402683894, |
| "kl_loss_13": 2025.8, |
| "kl_loss_26": 943.2, |
| "kl_loss_39": 573.45, |
| "kl_loss_7": 2710.0, |
| "learning_rate": 1.1258547341323699e-05, |
| "loss": 3112.75, |
| "step": 9330 |
| }, |
| { |
| "ce_loss_13": 2.418292981386185, |
| "ce_loss_26": 1.9166706264019013, |
| "ce_loss_39": 1.7358173072338103, |
| "ce_loss_52": 1.4464493066072464, |
| "ce_loss_7": 2.7368280410766603, |
| "epoch": 0.934, |
| "grad_norm": 13.814674823123692, |
| "kl_loss_13": 2007.6, |
| "kl_loss_26": 936.8, |
| "kl_loss_39": 570.05, |
| "kl_loss_7": 2680.4, |
| "learning_rate": 1.0926199633097156e-05, |
| "loss": 3089.9, |
| "step": 9340 |
| }, |
| { |
| "ce_loss_13": 2.421270787715912, |
| "ce_loss_26": 1.9069751173257827, |
| "ce_loss_39": 1.7246074616909026, |
| "ce_loss_52": 1.4261210292577744, |
| "ce_loss_7": 2.7438779413700103, |
| "epoch": 0.935, |
| "grad_norm": 14.043137766458962, |
| "kl_loss_13": 2039.4, |
| "kl_loss_26": 967.3, |
| "kl_loss_39": 585.6, |
| "kl_loss_7": 2722.8, |
| "learning_rate": 1.0598776892610684e-05, |
| "loss": 3103.7, |
| "step": 9350 |
| }, |
| { |
| "ce_loss_13": 2.4761788189411162, |
| "ce_loss_26": 1.9670240104198455, |
| "ce_loss_39": 1.7913133591413497, |
| "ce_loss_52": 1.5009067565202714, |
| "ce_loss_7": 2.7962326526641847, |
| "epoch": 0.936, |
| "grad_norm": 13.716488729093573, |
| "kl_loss_13": 2008.0, |
| "kl_loss_26": 945.5, |
| "kl_loss_39": 573.65, |
| "kl_loss_7": 2685.6, |
| "learning_rate": 1.0276282417007399e-05, |
| "loss": 3106.15, |
| "step": 9360 |
| }, |
| { |
| "ce_loss_13": 2.418415975570679, |
| "ce_loss_26": 1.916797822713852, |
| "ce_loss_39": 1.741264235973358, |
| "ce_loss_52": 1.452787458896637, |
| "ce_loss_7": 2.744573098421097, |
| "epoch": 0.937, |
| "grad_norm": 13.43858517345926, |
| "kl_loss_13": 1999.2, |
| "kl_loss_26": 936.2, |
| "kl_loss_39": 569.25, |
| "kl_loss_7": 2682.0, |
| "learning_rate": 9.958719453803277e-06, |
| "loss": 3109.9, |
| "step": 9370 |
| }, |
| { |
| "ce_loss_13": 2.3895679712295532, |
| "ce_loss_26": 1.8777837812900544, |
| "ce_loss_39": 1.6948266059160233, |
| "ce_loss_52": 1.4046757638454437, |
| "ce_loss_7": 2.7213546216487883, |
| "epoch": 0.938, |
| "grad_norm": 14.159970740647127, |
| "kl_loss_13": 2015.8, |
| "kl_loss_26": 945.0, |
| "kl_loss_39": 569.8, |
| "kl_loss_7": 2706.4, |
| "learning_rate": 9.646091200853802e-06, |
| "loss": 3110.975, |
| "step": 9380 |
| }, |
| { |
| "ce_loss_13": 2.389327567815781, |
| "ce_loss_26": 1.8952437072992325, |
| "ce_loss_39": 1.7221183687448502, |
| "ce_loss_52": 1.4358048617839814, |
| "ce_loss_7": 2.7049793720245363, |
| "epoch": 0.939, |
| "grad_norm": 13.344604295880838, |
| "kl_loss_13": 1964.8, |
| "kl_loss_26": 921.8, |
| "kl_loss_39": 560.1, |
| "kl_loss_7": 2638.0, |
| "learning_rate": 9.338400806321978e-06, |
| "loss": 3087.6, |
| "step": 9390 |
| }, |
| { |
| "ce_loss_13": 2.4069793194532396, |
| "ce_loss_26": 1.8967778533697128, |
| "ce_loss_39": 1.7197368562221527, |
| "ce_loss_52": 1.436858707666397, |
| "ce_loss_7": 2.7245797514915466, |
| "epoch": 0.94, |
| "grad_norm": 13.71832049440062, |
| "kl_loss_13": 2002.0, |
| "kl_loss_26": 937.3, |
| "kl_loss_39": 566.65, |
| "kl_loss_7": 2677.2, |
| "learning_rate": 9.035651368646646e-06, |
| "loss": 3124.5, |
| "step": 9400 |
| }, |
| { |
| "ce_loss_13": 2.3785787016153335, |
| "ce_loss_26": 1.8779745906591416, |
| "ce_loss_39": 1.7003175497055054, |
| "ce_loss_52": 1.4152926355600357, |
| "ce_loss_7": 2.6983864098787307, |
| "epoch": 0.941, |
| "grad_norm": 14.37750392989465, |
| "kl_loss_13": 1983.0, |
| "kl_loss_26": 932.4, |
| "kl_loss_39": 563.55, |
| "kl_loss_7": 2649.2, |
| "learning_rate": 8.737845936511335e-06, |
| "loss": 3126.15, |
| "step": 9410 |
| }, |
| { |
| "ce_loss_13": 2.4120055079460143, |
| "ce_loss_26": 1.898421436548233, |
| "ce_loss_39": 1.7264380306005478, |
| "ce_loss_52": 1.4368474900722503, |
| "ce_loss_7": 2.737997555732727, |
| "epoch": 0.942, |
| "grad_norm": 13.918132865929852, |
| "kl_loss_13": 2037.6, |
| "kl_loss_26": 952.9, |
| "kl_loss_39": 578.25, |
| "kl_loss_7": 2716.0, |
| "learning_rate": 8.444987508813451e-06, |
| "loss": 3086.75, |
| "step": 9420 |
| }, |
| { |
| "ce_loss_13": 2.4210041254758834, |
| "ce_loss_26": 1.9090048849582673, |
| "ce_loss_39": 1.732128456234932, |
| "ce_loss_52": 1.4372442662715912, |
| "ce_loss_7": 2.743331879377365, |
| "epoch": 0.943, |
| "grad_norm": 13.763467997379733, |
| "kl_loss_13": 2025.8, |
| "kl_loss_26": 953.0, |
| "kl_loss_39": 582.75, |
| "kl_loss_7": 2709.6, |
| "learning_rate": 8.157079034633974e-06, |
| "loss": 3102.0, |
| "step": 9430 |
| }, |
| { |
| "ce_loss_13": 2.390827241539955, |
| "ce_loss_26": 1.8993241131305694, |
| "ce_loss_39": 1.720950961112976, |
| "ce_loss_52": 1.4380876436829566, |
| "ce_loss_7": 2.708938491344452, |
| "epoch": 0.944, |
| "grad_norm": 13.65095927372199, |
| "kl_loss_13": 1961.4, |
| "kl_loss_26": 921.2, |
| "kl_loss_39": 551.1, |
| "kl_loss_7": 2626.8, |
| "learning_rate": 7.874123413208145e-06, |
| "loss": 3097.4, |
| "step": 9440 |
| }, |
| { |
| "ce_loss_13": 2.377914309501648, |
| "ce_loss_26": 1.8749269813299179, |
| "ce_loss_39": 1.699908110499382, |
| "ce_loss_52": 1.4185950323939323, |
| "ce_loss_7": 2.700740724802017, |
| "epoch": 0.945, |
| "grad_norm": 13.057374043926089, |
| "kl_loss_13": 2000.2, |
| "kl_loss_26": 931.2, |
| "kl_loss_39": 558.45, |
| "kl_loss_7": 2681.6, |
| "learning_rate": 7.59612349389599e-06, |
| "loss": 3113.0, |
| "step": 9450 |
| }, |
| { |
| "ce_loss_13": 2.428489762544632, |
| "ce_loss_26": 1.9228288322687148, |
| "ce_loss_39": 1.7434315174818038, |
| "ce_loss_52": 1.4502023369073869, |
| "ce_loss_7": 2.745371562242508, |
| "epoch": 0.946, |
| "grad_norm": 13.467471129956634, |
| "kl_loss_13": 2003.8, |
| "kl_loss_26": 954.8, |
| "kl_loss_39": 579.9, |
| "kl_loss_7": 2674.8, |
| "learning_rate": 7.323082076153509e-06, |
| "loss": 3110.35, |
| "step": 9460 |
| }, |
| { |
| "ce_loss_13": 2.3993911921977995, |
| "ce_loss_26": 1.8969668239355086, |
| "ce_loss_39": 1.7201234728097916, |
| "ce_loss_52": 1.4261724770069122, |
| "ce_loss_7": 2.723515260219574, |
| "epoch": 0.947, |
| "grad_norm": 14.123562965002943, |
| "kl_loss_13": 1995.2, |
| "kl_loss_26": 940.5, |
| "kl_loss_39": 572.85, |
| "kl_loss_7": 2665.2, |
| "learning_rate": 7.055001909504755e-06, |
| "loss": 3103.5, |
| "step": 9470 |
| }, |
| { |
| "ce_loss_13": 2.371204599738121, |
| "ce_loss_26": 1.8650381177663804, |
| "ce_loss_39": 1.6901476740837098, |
| "ce_loss_52": 1.4073528528213501, |
| "ce_loss_7": 2.69525728225708, |
| "epoch": 0.948, |
| "grad_norm": 13.7395501584386, |
| "kl_loss_13": 2001.2, |
| "kl_loss_26": 926.3, |
| "kl_loss_39": 559.35, |
| "kl_loss_7": 2675.2, |
| "learning_rate": 6.791885693514133e-06, |
| "loss": 3117.9, |
| "step": 9480 |
| }, |
| { |
| "ce_loss_13": 2.3874946534633636, |
| "ce_loss_26": 1.8710165858268737, |
| "ce_loss_39": 1.6918294131755829, |
| "ce_loss_52": 1.4066254168748855, |
| "ce_loss_7": 2.718043899536133, |
| "epoch": 0.949, |
| "grad_norm": 14.444149745541107, |
| "kl_loss_13": 2026.6, |
| "kl_loss_26": 936.5, |
| "kl_loss_39": 563.35, |
| "kl_loss_7": 2720.4, |
| "learning_rate": 6.533736077758867e-06, |
| "loss": 3144.35, |
| "step": 9490 |
| }, |
| { |
| "ce_loss_13": 2.3802697598934173, |
| "ce_loss_26": 1.8696208387613296, |
| "ce_loss_39": 1.6913905203342439, |
| "ce_loss_52": 1.4053042978048325, |
| "ce_loss_7": 2.706346648931503, |
| "epoch": 0.95, |
| "grad_norm": 13.683323615533283, |
| "kl_loss_13": 2001.6, |
| "kl_loss_26": 931.6, |
| "kl_loss_39": 565.45, |
| "kl_loss_7": 2685.6, |
| "learning_rate": 6.2805556618028556e-06, |
| "loss": 3132.15, |
| "step": 9500 |
| }, |
| { |
| "ce_loss_13": 2.45430488884449, |
| "ce_loss_26": 1.9479888796806335, |
| "ce_loss_39": 1.7709876328706742, |
| "ce_loss_52": 1.4815271288156509, |
| "ce_loss_7": 2.7643910527229307, |
| "epoch": 0.951, |
| "grad_norm": 14.559144572280042, |
| "kl_loss_13": 2000.2, |
| "kl_loss_26": 946.3, |
| "kl_loss_39": 575.65, |
| "kl_loss_7": 2660.0, |
| "learning_rate": 6.032346995169968e-06, |
| "loss": 3124.85, |
| "step": 9510 |
| }, |
| { |
| "ce_loss_13": 2.4814863801002502, |
| "ce_loss_26": 1.976859924197197, |
| "ce_loss_39": 1.795655995607376, |
| "ce_loss_52": 1.4911505609750748, |
| "ce_loss_7": 2.796732819080353, |
| "epoch": 0.952, |
| "grad_norm": 14.307827869099665, |
| "kl_loss_13": 2052.0, |
| "kl_loss_26": 982.8, |
| "kl_loss_39": 599.65, |
| "kl_loss_7": 2722.8, |
| "learning_rate": 5.789112577318789e-06, |
| "loss": 3131.25, |
| "step": 9520 |
| }, |
| { |
| "ce_loss_13": 2.370393967628479, |
| "ce_loss_26": 1.8703551948070527, |
| "ce_loss_39": 1.6911178916692733, |
| "ce_loss_52": 1.397288253903389, |
| "ce_loss_7": 2.6921759128570555, |
| "epoch": 0.953, |
| "grad_norm": 13.532268906242434, |
| "kl_loss_13": 2004.2, |
| "kl_loss_26": 951.2, |
| "kl_loss_39": 578.45, |
| "kl_loss_7": 2674.4, |
| "learning_rate": 5.550854857617194e-06, |
| "loss": 3093.1, |
| "step": 9530 |
| }, |
| { |
| "ce_loss_13": 2.3659786969423293, |
| "ce_loss_26": 1.855891814827919, |
| "ce_loss_39": 1.6768086194992065, |
| "ce_loss_52": 1.3908632963895797, |
| "ce_loss_7": 2.6853357315063477, |
| "epoch": 0.954, |
| "grad_norm": 14.753075788642166, |
| "kl_loss_13": 2010.2, |
| "kl_loss_26": 935.5, |
| "kl_loss_39": 563.55, |
| "kl_loss_7": 2681.6, |
| "learning_rate": 5.317576235317756e-06, |
| "loss": 3120.8, |
| "step": 9540 |
| }, |
| { |
| "ce_loss_13": 2.4337273120880125, |
| "ce_loss_26": 1.9248733311891555, |
| "ce_loss_39": 1.7499325275421143, |
| "ce_loss_52": 1.4674480736255646, |
| "ce_loss_7": 2.7530871987342835, |
| "epoch": 0.955, |
| "grad_norm": 13.311149092546296, |
| "kl_loss_13": 1991.6, |
| "kl_loss_26": 925.5, |
| "kl_loss_39": 562.15, |
| "kl_loss_7": 2663.6, |
| "learning_rate": 5.089279059533658e-06, |
| "loss": 3080.15, |
| "step": 9550 |
| }, |
| { |
| "ce_loss_13": 2.477786514163017, |
| "ce_loss_26": 1.9535741955041885, |
| "ce_loss_39": 1.7720552951097488, |
| "ce_loss_52": 1.468274374306202, |
| "ce_loss_7": 2.806012988090515, |
| "epoch": 0.956, |
| "grad_norm": 13.557904970493434, |
| "kl_loss_13": 2082.8, |
| "kl_loss_26": 980.6, |
| "kl_loss_39": 601.5, |
| "kl_loss_7": 2773.6, |
| "learning_rate": 4.865965629214819e-06, |
| "loss": 3106.7, |
| "step": 9560 |
| }, |
| { |
| "ce_loss_13": 2.4565936863422393, |
| "ce_loss_26": 1.9573397368192673, |
| "ce_loss_39": 1.773080477118492, |
| "ce_loss_52": 1.4730938911437987, |
| "ce_loss_7": 2.7732574224472044, |
| "epoch": 0.957, |
| "grad_norm": 14.224632183872556, |
| "kl_loss_13": 2023.4, |
| "kl_loss_26": 967.5, |
| "kl_loss_39": 591.9, |
| "kl_loss_7": 2693.2, |
| "learning_rate": 4.6476381931251366e-06, |
| "loss": 3121.6, |
| "step": 9570 |
| }, |
| { |
| "ce_loss_13": 2.3847708880901335, |
| "ce_loss_26": 1.8858718812465667, |
| "ce_loss_39": 1.7032645136117934, |
| "ce_loss_52": 1.4209994703531266, |
| "ce_loss_7": 2.708657431602478, |
| "epoch": 0.958, |
| "grad_norm": 13.699065805322247, |
| "kl_loss_13": 1983.8, |
| "kl_loss_26": 921.9, |
| "kl_loss_39": 551.45, |
| "kl_loss_7": 2664.4, |
| "learning_rate": 4.434298949819449e-06, |
| "loss": 3097.9, |
| "step": 9580 |
| }, |
| { |
| "ce_loss_13": 2.417462554574013, |
| "ce_loss_26": 1.9159747958183289, |
| "ce_loss_39": 1.7397230744361878, |
| "ce_loss_52": 1.4456302881240846, |
| "ce_loss_7": 2.7354709684848784, |
| "epoch": 0.959, |
| "grad_norm": 13.043144407859455, |
| "kl_loss_13": 1990.0, |
| "kl_loss_26": 941.8, |
| "kl_loss_39": 574.8, |
| "kl_loss_7": 2662.4, |
| "learning_rate": 4.2259500476214406e-06, |
| "loss": 3095.8, |
| "step": 9590 |
| }, |
| { |
| "ce_loss_13": 2.427861177921295, |
| "ce_loss_26": 1.9228525012731552, |
| "ce_loss_39": 1.7392638593912124, |
| "ce_loss_52": 1.445976984500885, |
| "ce_loss_7": 2.7459777116775514, |
| "epoch": 0.96, |
| "grad_norm": 13.619745852010974, |
| "kl_loss_13": 2021.4, |
| "kl_loss_26": 954.5, |
| "kl_loss_39": 577.9, |
| "kl_loss_7": 2698.8, |
| "learning_rate": 4.02259358460233e-06, |
| "loss": 3122.9, |
| "step": 9600 |
| }, |
| { |
| "ce_loss_13": 2.46402502655983, |
| "ce_loss_26": 1.9587235629558564, |
| "ce_loss_39": 1.7794159650802612, |
| "ce_loss_52": 1.4795134991407395, |
| "ce_loss_7": 2.7920902401208876, |
| "epoch": 0.961, |
| "grad_norm": 13.957824607337622, |
| "kl_loss_13": 2034.8, |
| "kl_loss_26": 964.3, |
| "kl_loss_39": 588.35, |
| "kl_loss_7": 2719.6, |
| "learning_rate": 3.8242316085594916e-06, |
| "loss": 3106.9, |
| "step": 9610 |
| }, |
| { |
| "ce_loss_13": 2.4026571094989775, |
| "ce_loss_26": 1.8842627108097076, |
| "ce_loss_39": 1.6962791502475738, |
| "ce_loss_52": 1.4012041926383971, |
| "ce_loss_7": 2.7310706257820128, |
| "epoch": 0.962, |
| "grad_norm": 13.847893098332637, |
| "kl_loss_13": 2054.8, |
| "kl_loss_26": 965.5, |
| "kl_loss_39": 579.7, |
| "kl_loss_7": 2743.2, |
| "learning_rate": 3.630866116995757e-06, |
| "loss": 3149.6, |
| "step": 9620 |
| }, |
| { |
| "ce_loss_13": 2.3699823945760725, |
| "ce_loss_26": 1.8775872141122818, |
| "ce_loss_39": 1.7078934848308562, |
| "ce_loss_52": 1.427069191634655, |
| "ce_loss_7": 2.68268860578537, |
| "epoch": 0.963, |
| "grad_norm": 14.006927808523123, |
| "kl_loss_13": 1949.4, |
| "kl_loss_26": 909.6, |
| "kl_loss_39": 550.8, |
| "kl_loss_7": 2613.6, |
| "learning_rate": 3.4424990570994797e-06, |
| "loss": 3088.75, |
| "step": 9630 |
| }, |
| { |
| "ce_loss_13": 2.4296331614255906, |
| "ce_loss_26": 1.923089200258255, |
| "ce_loss_39": 1.7481043189764023, |
| "ce_loss_52": 1.458402395248413, |
| "ce_loss_7": 2.7494013249874114, |
| "epoch": 0.964, |
| "grad_norm": 13.971614829348757, |
| "kl_loss_13": 1997.2, |
| "kl_loss_26": 933.6, |
| "kl_loss_39": 570.15, |
| "kl_loss_7": 2669.6, |
| "learning_rate": 3.2591323257248896e-06, |
| "loss": 3114.2, |
| "step": 9640 |
| }, |
| { |
| "ce_loss_13": 2.4253605216741563, |
| "ce_loss_26": 1.920077031850815, |
| "ce_loss_39": 1.7424649715423584, |
| "ce_loss_52": 1.4570231169462204, |
| "ce_loss_7": 2.744251537322998, |
| "epoch": 0.965, |
| "grad_norm": 13.868560987861438, |
| "kl_loss_13": 2000.2, |
| "kl_loss_26": 937.9, |
| "kl_loss_39": 567.9, |
| "kl_loss_7": 2672.0, |
| "learning_rate": 3.0807677693729385e-06, |
| "loss": 3117.15, |
| "step": 9650 |
| }, |
| { |
| "ce_loss_13": 2.445168226957321, |
| "ce_loss_26": 1.9308179676532746, |
| "ce_loss_39": 1.7556968212127686, |
| "ce_loss_52": 1.462259876728058, |
| "ce_loss_7": 2.7607338547706606, |
| "epoch": 0.966, |
| "grad_norm": 13.868630790191208, |
| "kl_loss_13": 2030.0, |
| "kl_loss_26": 955.1, |
| "kl_loss_39": 582.55, |
| "kl_loss_7": 2710.4, |
| "learning_rate": 2.9074071841727055e-06, |
| "loss": 3136.15, |
| "step": 9660 |
| }, |
| { |
| "ce_loss_13": 2.3746090680360794, |
| "ce_loss_26": 1.8696930974721908, |
| "ce_loss_39": 1.6932952284812928, |
| "ce_loss_52": 1.4037827536463738, |
| "ce_loss_7": 2.6961917489767075, |
| "epoch": 0.967, |
| "grad_norm": 13.741681034653173, |
| "kl_loss_13": 2003.4, |
| "kl_loss_26": 943.6, |
| "kl_loss_39": 570.8, |
| "kl_loss_7": 2675.6, |
| "learning_rate": 2.739052315863355e-06, |
| "loss": 3123.325, |
| "step": 9670 |
| }, |
| { |
| "ce_loss_13": 2.4609276592731475, |
| "ce_loss_26": 1.9465218961238862, |
| "ce_loss_39": 1.755302396416664, |
| "ce_loss_52": 1.4513660728931428, |
| "ce_loss_7": 2.783074140548706, |
| "epoch": 0.968, |
| "grad_norm": 13.849931878044563, |
| "kl_loss_13": 2075.0, |
| "kl_loss_26": 988.8, |
| "kl_loss_39": 596.8, |
| "kl_loss_7": 2755.6, |
| "learning_rate": 2.5757048597765396e-06, |
| "loss": 3108.55, |
| "step": 9680 |
| }, |
| { |
| "ce_loss_13": 2.3671315789222716, |
| "ce_loss_26": 1.8652487874031067, |
| "ce_loss_39": 1.6895152300596237, |
| "ce_loss_52": 1.4115738093852996, |
| "ce_loss_7": 2.6859952569007874, |
| "epoch": 0.969, |
| "grad_norm": 14.271644916152136, |
| "kl_loss_13": 1975.0, |
| "kl_loss_26": 910.5, |
| "kl_loss_39": 544.7, |
| "kl_loss_7": 2649.2, |
| "learning_rate": 2.417366460819359e-06, |
| "loss": 3094.15, |
| "step": 9690 |
| }, |
| { |
| "ce_loss_13": 2.4030214190483092, |
| "ce_loss_26": 1.8979378938674927, |
| "ce_loss_39": 1.7221683353185653, |
| "ce_loss_52": 1.4381350710988046, |
| "ce_loss_7": 2.723929351568222, |
| "epoch": 0.97, |
| "grad_norm": 13.991735266099393, |
| "kl_loss_13": 1985.0, |
| "kl_loss_26": 927.7, |
| "kl_loss_39": 554.55, |
| "kl_loss_7": 2656.0, |
| "learning_rate": 2.2640387134577057e-06, |
| "loss": 3121.05, |
| "step": 9700 |
| }, |
| { |
| "ce_loss_13": 2.388926792144775, |
| "ce_loss_26": 1.8821999937295915, |
| "ce_loss_39": 1.7065987050533296, |
| "ce_loss_52": 1.427994754910469, |
| "ce_loss_7": 2.7048760533332823, |
| "epoch": 0.971, |
| "grad_norm": 14.179862402525806, |
| "kl_loss_13": 1973.6, |
| "kl_loss_26": 911.0, |
| "kl_loss_39": 548.95, |
| "kl_loss_7": 2646.4, |
| "learning_rate": 2.115723161700278e-06, |
| "loss": 3136.0, |
| "step": 9710 |
| }, |
| { |
| "ce_loss_13": 2.448368564248085, |
| "ce_loss_26": 1.9373161673545838, |
| "ce_loss_39": 1.7496683716773986, |
| "ce_loss_52": 1.4501112252473831, |
| "ce_loss_7": 2.7681332349777223, |
| "epoch": 0.972, |
| "grad_norm": 13.312305733104958, |
| "kl_loss_13": 2063.8, |
| "kl_loss_26": 986.5, |
| "kl_loss_39": 593.8, |
| "kl_loss_7": 2737.6, |
| "learning_rate": 1.9724212990830937e-06, |
| "loss": 3096.45, |
| "step": 9720 |
| }, |
| { |
| "ce_loss_13": 2.40165196955204, |
| "ce_loss_26": 1.9097970753908158, |
| "ce_loss_39": 1.732149314880371, |
| "ce_loss_52": 1.4418910443782806, |
| "ce_loss_7": 2.7182520925998688, |
| "epoch": 0.973, |
| "grad_norm": 13.186413860645287, |
| "kl_loss_13": 1986.6, |
| "kl_loss_26": 939.2, |
| "kl_loss_39": 575.8, |
| "kl_loss_7": 2649.6, |
| "learning_rate": 1.8341345686543331e-06, |
| "loss": 3096.7, |
| "step": 9730 |
| }, |
| { |
| "ce_loss_13": 2.4751327097415925, |
| "ce_loss_26": 1.9718121886253357, |
| "ce_loss_39": 1.7938049882650375, |
| "ce_loss_52": 1.510325726866722, |
| "ce_loss_7": 2.790462166070938, |
| "epoch": 0.974, |
| "grad_norm": 13.542368115510277, |
| "kl_loss_13": 1991.0, |
| "kl_loss_26": 919.4, |
| "kl_loss_39": 555.25, |
| "kl_loss_7": 2656.0, |
| "learning_rate": 1.7008643629596864e-06, |
| "loss": 3139.35, |
| "step": 9740 |
| }, |
| { |
| "ce_loss_13": 2.448350805044174, |
| "ce_loss_26": 1.9393187165260315, |
| "ce_loss_39": 1.756307190656662, |
| "ce_loss_52": 1.4625631257891656, |
| "ce_loss_7": 2.7839869439601896, |
| "epoch": 0.975, |
| "grad_norm": 14.153531935288711, |
| "kl_loss_13": 2030.2, |
| "kl_loss_26": 953.5, |
| "kl_loss_39": 573.5, |
| "kl_loss_7": 2731.6, |
| "learning_rate": 1.5726120240288633e-06, |
| "loss": 3091.65, |
| "step": 9750 |
| }, |
| { |
| "ce_loss_13": 2.493607670068741, |
| "ce_loss_26": 1.9704292267560959, |
| "ce_loss_39": 1.7810406684875488, |
| "ce_loss_52": 1.4734347879886627, |
| "ce_loss_7": 2.8225875020027162, |
| "epoch": 0.976, |
| "grad_norm": 13.917924014106907, |
| "kl_loss_13": 2092.8, |
| "kl_loss_26": 995.9, |
| "kl_loss_39": 599.65, |
| "kl_loss_7": 2777.6, |
| "learning_rate": 1.4493788433612708e-06, |
| "loss": 3106.15, |
| "step": 9760 |
| }, |
| { |
| "ce_loss_13": 2.389453822374344, |
| "ce_loss_26": 1.889643257856369, |
| "ce_loss_39": 1.7158276617527009, |
| "ce_loss_52": 1.4275161743164062, |
| "ce_loss_7": 2.7086060285568236, |
| "epoch": 0.977, |
| "grad_norm": 13.614062458586098, |
| "kl_loss_13": 1971.2, |
| "kl_loss_26": 924.0, |
| "kl_loss_39": 561.0, |
| "kl_loss_7": 2646.0, |
| "learning_rate": 1.3311660619138578e-06, |
| "loss": 3083.9, |
| "step": 9770 |
| }, |
| { |
| "ce_loss_13": 2.387269985675812, |
| "ce_loss_26": 1.8734027475118638, |
| "ce_loss_39": 1.6912487357854844, |
| "ce_loss_52": 1.4031418770551682, |
| "ce_loss_7": 2.7191080808639527, |
| "epoch": 0.978, |
| "grad_norm": 14.36678395992836, |
| "kl_loss_13": 2023.2, |
| "kl_loss_26": 943.6, |
| "kl_loss_39": 565.3, |
| "kl_loss_7": 2722.0, |
| "learning_rate": 1.2179748700879012e-06, |
| "loss": 3100.55, |
| "step": 9780 |
| }, |
| { |
| "ce_loss_13": 2.3631068110466003, |
| "ce_loss_26": 1.861675202846527, |
| "ce_loss_39": 1.6827853351831437, |
| "ce_loss_52": 1.3983743026852609, |
| "ce_loss_7": 2.6796926259994507, |
| "epoch": 0.979, |
| "grad_norm": 14.06766234963321, |
| "kl_loss_13": 1993.6, |
| "kl_loss_26": 930.1, |
| "kl_loss_39": 561.8, |
| "kl_loss_7": 2668.4, |
| "learning_rate": 1.1098064077174619e-06, |
| "loss": 3119.95, |
| "step": 9790 |
| }, |
| { |
| "ce_loss_13": 2.4609683632850645, |
| "ce_loss_26": 1.948616126179695, |
| "ce_loss_39": 1.7647934973239898, |
| "ce_loss_52": 1.454009547829628, |
| "ce_loss_7": 2.785689663887024, |
| "epoch": 0.98, |
| "grad_norm": 13.39999724987333, |
| "kl_loss_13": 2070.0, |
| "kl_loss_26": 989.6, |
| "kl_loss_39": 607.8, |
| "kl_loss_7": 2749.6, |
| "learning_rate": 1.006661764057837e-06, |
| "loss": 3101.0, |
| "step": 9800 |
| }, |
| { |
| "ce_loss_13": 2.3849492847919462, |
| "ce_loss_26": 1.8622053205966949, |
| "ce_loss_39": 1.6866901487112045, |
| "ce_loss_52": 1.3904988124966622, |
| "ce_loss_7": 2.713945233821869, |
| "epoch": 0.981, |
| "grad_norm": 13.776704297392317, |
| "kl_loss_13": 2060.2, |
| "kl_loss_26": 959.2, |
| "kl_loss_39": 584.2, |
| "kl_loss_7": 2753.2, |
| "learning_rate": 9.085419777743465e-07, |
| "loss": 3145.375, |
| "step": 9810 |
| }, |
| { |
| "ce_loss_13": 2.423547920584679, |
| "ce_loss_26": 1.9212449431419372, |
| "ce_loss_39": 1.7484615802764893, |
| "ce_loss_52": 1.4525489255785942, |
| "ce_loss_7": 2.7415110945701597, |
| "epoch": 0.982, |
| "grad_norm": 13.800146507088886, |
| "kl_loss_13": 2026.2, |
| "kl_loss_26": 956.5, |
| "kl_loss_39": 585.85, |
| "kl_loss_7": 2700.8, |
| "learning_rate": 8.15448036932176e-07, |
| "loss": 3140.075, |
| "step": 9820 |
| }, |
| { |
| "ce_loss_13": 2.4265142381191254, |
| "ce_loss_26": 1.9230076640844345, |
| "ce_loss_39": 1.7403117150068284, |
| "ce_loss_52": 1.44933120906353, |
| "ce_loss_7": 2.743628019094467, |
| "epoch": 0.983, |
| "grad_norm": 13.731982955757704, |
| "kl_loss_13": 2035.6, |
| "kl_loss_26": 967.5, |
| "kl_loss_39": 589.65, |
| "kl_loss_7": 2710.4, |
| "learning_rate": 7.273808789862724e-07, |
| "loss": 3097.325, |
| "step": 9830 |
| }, |
| { |
| "ce_loss_13": 2.432727184891701, |
| "ce_loss_26": 1.9222358494997025, |
| "ce_loss_39": 1.7392481476068498, |
| "ce_loss_52": 1.4496447369456291, |
| "ce_loss_7": 2.7563742280006407, |
| "epoch": 0.984, |
| "grad_norm": 14.26920700265725, |
| "kl_loss_13": 2031.0, |
| "kl_loss_26": 951.8, |
| "kl_loss_39": 579.1, |
| "kl_loss_7": 2718.0, |
| "learning_rate": 6.443413907720186e-07, |
| "loss": 3091.6, |
| "step": 9840 |
| }, |
| { |
| "ce_loss_13": 2.3425186455249785, |
| "ce_loss_26": 1.8573100596666337, |
| "ce_loss_39": 1.6887821286916733, |
| "ce_loss_52": 1.3970566481351852, |
| "ce_loss_7": 2.6601881802082064, |
| "epoch": 0.985, |
| "grad_norm": 14.017763045011195, |
| "kl_loss_13": 1953.8, |
| "kl_loss_26": 920.7, |
| "kl_loss_39": 567.3, |
| "kl_loss_7": 2625.2, |
| "learning_rate": 5.663304084960185e-07, |
| "loss": 3110.05, |
| "step": 9850 |
| }, |
| { |
| "ce_loss_13": 2.3719563096761704, |
| "ce_loss_26": 1.8737152755260467, |
| "ce_loss_39": 1.698423257470131, |
| "ce_loss_52": 1.4208501130342484, |
| "ce_loss_7": 2.6962190210819243, |
| "epoch": 0.986, |
| "grad_norm": 14.355657718500265, |
| "kl_loss_13": 1944.8, |
| "kl_loss_26": 901.7, |
| "kl_loss_39": 537.7, |
| "kl_loss_7": 2617.8, |
| "learning_rate": 4.933487177280482e-07, |
| "loss": 3084.175, |
| "step": 9860 |
| }, |
| { |
| "ce_loss_13": 2.4431921422481535, |
| "ce_loss_26": 1.9350731909275054, |
| "ce_loss_39": 1.7554681122303009, |
| "ce_loss_52": 1.4598491072654725, |
| "ce_loss_7": 2.761775279045105, |
| "epoch": 0.987, |
| "grad_norm": 14.504857971707292, |
| "kl_loss_13": 2021.8, |
| "kl_loss_26": 959.6, |
| "kl_loss_39": 582.75, |
| "kl_loss_7": 2686.8, |
| "learning_rate": 4.2539705339295075e-07, |
| "loss": 3095.3, |
| "step": 9870 |
| }, |
| { |
| "ce_loss_13": 2.3912162601947786, |
| "ce_loss_26": 1.8867509424686433, |
| "ce_loss_39": 1.7147331923246383, |
| "ce_loss_52": 1.4274780035018921, |
| "ce_loss_7": 2.7152205407619476, |
| "epoch": 0.988, |
| "grad_norm": 13.898651253722077, |
| "kl_loss_13": 1978.6, |
| "kl_loss_26": 918.4, |
| "kl_loss_39": 560.0, |
| "kl_loss_7": 2653.6, |
| "learning_rate": 3.6247609976319816e-07, |
| "loss": 3111.5, |
| "step": 9880 |
| }, |
| { |
| "ce_loss_13": 2.4592268586158754, |
| "ce_loss_26": 1.9460572868585586, |
| "ce_loss_39": 1.7624834805727005, |
| "ce_loss_52": 1.4708713114261627, |
| "ce_loss_7": 2.780032974481583, |
| "epoch": 0.989, |
| "grad_norm": 13.162597089199776, |
| "kl_loss_13": 2017.4, |
| "kl_loss_26": 950.2, |
| "kl_loss_39": 574.95, |
| "kl_loss_7": 2680.0, |
| "learning_rate": 3.0458649045211895e-07, |
| "loss": 3104.75, |
| "step": 9890 |
| }, |
| { |
| "ce_loss_13": 2.318621850013733, |
| "ce_loss_26": 1.8226055085659028, |
| "ce_loss_39": 1.6485673993825913, |
| "ce_loss_52": 1.3747537702322006, |
| "ce_loss_7": 2.6328552305698394, |
| "epoch": 0.99, |
| "grad_norm": 14.177492293626315, |
| "kl_loss_13": 1957.6, |
| "kl_loss_26": 901.5, |
| "kl_loss_39": 543.2, |
| "kl_loss_7": 2623.6, |
| "learning_rate": 2.517288084074587e-07, |
| "loss": 3097.8, |
| "step": 9900 |
| }, |
| { |
| "ce_loss_13": 2.450270253419876, |
| "ce_loss_26": 1.9614087045192719, |
| "ce_loss_39": 1.820476683974266, |
| "ce_loss_52": 1.4863917350769043, |
| "ce_loss_7": 2.776776838302612, |
| "epoch": 0.991, |
| "grad_norm": 13.637096409193225, |
| "kl_loss_13": 2041.0, |
| "kl_loss_26": 993.3, |
| "kl_loss_39": 622.6, |
| "kl_loss_7": 2724.0, |
| "learning_rate": 2.0390358590538505e-07, |
| "loss": 3133.55, |
| "step": 9910 |
| }, |
| { |
| "ce_loss_13": 2.4164200723171234, |
| "ce_loss_26": 1.9075238525867462, |
| "ce_loss_39": 1.7269282668828965, |
| "ce_loss_52": 1.4252729326486588, |
| "ce_loss_7": 2.7410891175270082, |
| "epoch": 0.992, |
| "grad_norm": 14.06088505281845, |
| "kl_loss_13": 2041.6, |
| "kl_loss_26": 973.6, |
| "kl_loss_39": 593.7, |
| "kl_loss_7": 2722.8, |
| "learning_rate": 1.61111304545436e-07, |
| "loss": 3101.25, |
| "step": 9920 |
| }, |
| { |
| "ce_loss_13": 2.4260194152593613, |
| "ce_loss_26": 1.9236795336008072, |
| "ce_loss_39": 1.7412341982126236, |
| "ce_loss_52": 1.445477369427681, |
| "ce_loss_7": 2.750396305322647, |
| "epoch": 0.993, |
| "grad_norm": 13.813190658092928, |
| "kl_loss_13": 2043.4, |
| "kl_loss_26": 963.8, |
| "kl_loss_39": 591.1, |
| "kl_loss_7": 2727.2, |
| "learning_rate": 1.2335239524541298e-07, |
| "loss": 3113.2, |
| "step": 9930 |
| }, |
| { |
| "ce_loss_13": 2.4003034621477126, |
| "ce_loss_26": 1.8995478272438049, |
| "ce_loss_39": 1.7172971665859222, |
| "ce_loss_52": 1.4267783105373382, |
| "ce_loss_7": 2.730258399248123, |
| "epoch": 0.994, |
| "grad_norm": 14.088698553106907, |
| "kl_loss_13": 2017.2, |
| "kl_loss_26": 955.6, |
| "kl_loss_39": 580.2, |
| "kl_loss_7": 2698.8, |
| "learning_rate": 9.06272382371065e-08, |
| "loss": 3112.45, |
| "step": 9940 |
| }, |
| { |
| "ce_loss_13": 2.3436710268259047, |
| "ce_loss_26": 1.860148760676384, |
| "ce_loss_39": 1.6837237626314163, |
| "ce_loss_52": 1.405136799812317, |
| "ce_loss_7": 2.659210926294327, |
| "epoch": 0.995, |
| "grad_norm": 13.69734193648067, |
| "kl_loss_13": 1922.2, |
| "kl_loss_26": 893.6, |
| "kl_loss_39": 537.7, |
| "kl_loss_7": 2583.2, |
| "learning_rate": 6.293616306246586e-08, |
| "loss": 3122.45, |
| "step": 9950 |
| }, |
| { |
| "ce_loss_13": 2.4028525710105897, |
| "ce_loss_26": 1.9053959518671035, |
| "ce_loss_39": 1.734974354505539, |
| "ce_loss_52": 1.4462752103805543, |
| "ce_loss_7": 2.7197851181030273, |
| "epoch": 0.996, |
| "grad_norm": 14.052327245568536, |
| "kl_loss_13": 1985.8, |
| "kl_loss_26": 933.9, |
| "kl_loss_39": 572.05, |
| "kl_loss_7": 2659.2, |
| "learning_rate": 4.027944857032395e-08, |
| "loss": 3115.75, |
| "step": 9960 |
| }, |
| { |
| "ce_loss_13": 2.3983709454536437, |
| "ce_loss_26": 1.897152093052864, |
| "ce_loss_39": 1.7151427894830704, |
| "ce_loss_52": 1.4271936371922493, |
| "ce_loss_7": 2.7171947032213213, |
| "epoch": 0.997, |
| "grad_norm": 13.53314291583234, |
| "kl_loss_13": 1999.0, |
| "kl_loss_26": 942.7, |
| "kl_loss_39": 568.8, |
| "kl_loss_7": 2670.0, |
| "learning_rate": 2.265732291356626e-08, |
| "loss": 3094.175, |
| "step": 9970 |
| }, |
| { |
| "ce_loss_13": 2.3423147082328795, |
| "ce_loss_26": 1.8376368135213852, |
| "ce_loss_39": 1.6704195857048034, |
| "ce_loss_52": 1.3995309814810752, |
| "ce_loss_7": 2.6571378737688063, |
| "epoch": 0.998, |
| "grad_norm": 13.139494465989356, |
| "kl_loss_13": 1954.0, |
| "kl_loss_26": 894.2, |
| "kl_loss_39": 540.1, |
| "kl_loss_7": 2622.6, |
| "learning_rate": 1.0069963546743833e-08, |
| "loss": 3091.8, |
| "step": 9980 |
| }, |
| { |
| "ce_loss_13": 2.371971958875656, |
| "ce_loss_26": 1.8749190032482148, |
| "ce_loss_39": 1.7022694885730743, |
| "ce_loss_52": 1.4237666621804237, |
| "ce_loss_7": 2.694683998823166, |
| "epoch": 0.999, |
| "grad_norm": 13.948245708314273, |
| "kl_loss_13": 1952.8, |
| "kl_loss_26": 907.5, |
| "kl_loss_39": 547.75, |
| "kl_loss_7": 2620.8, |
| "learning_rate": 2.517497224463483e-09, |
| "loss": 3089.0, |
| "step": 9990 |
| }, |
| { |
| "ce_loss_13": 2.4080661326646804, |
| "ce_loss_26": 1.896571347117424, |
| "ce_loss_39": 1.7096484139561654, |
| "ce_loss_52": 1.4170419454574585, |
| "ce_loss_7": 2.7374835878610613, |
| "epoch": 1.0, |
| "grad_norm": 13.857134097975196, |
| "kl_loss_13": 2044.2, |
| "kl_loss_26": 961.3, |
| "kl_loss_39": 577.05, |
| "kl_loss_7": 2731.6, |
| "learning_rate": 0.0, |
| "loss": 3103.8, |
| "step": 10000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 250, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.0167830278176768e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|