Balcony-Model21 / trainer_state.json
adpretko's picture
Upload folder using huggingface_hub
ccb5639 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"ce_loss_13": 11.511599779129028,
"ce_loss_26": 11.188396453857422,
"ce_loss_39": 11.169448137283325,
"ce_loss_52": 1.3891706466674805,
"ce_loss_7": 11.556999206542969,
"epoch": 0.0001,
"grad_norm": 28.059961985369828,
"kl_loss_13": 20896.0,
"kl_loss_26": 20192.0,
"kl_loss_39": 20192.0,
"kl_loss_7": 20960.0,
"learning_rate": 1e-05,
"loss": 41080.0,
"step": 1
},
{
"ce_loss_13": 11.506269454956055,
"ce_loss_26": 11.177568621105618,
"ce_loss_39": 11.177141269048056,
"ce_loss_52": 1.458960132466422,
"ce_loss_7": 11.548744599024454,
"epoch": 0.001,
"grad_norm": 28.667146352003318,
"kl_loss_13": 20782.222222222223,
"kl_loss_26": 20106.666666666668,
"kl_loss_39": 20110.222222222223,
"kl_loss_7": 20867.555555555555,
"learning_rate": 0.0001,
"loss": 41008.8889,
"step": 10
},
{
"ce_loss_13": 11.456571412086486,
"ce_loss_26": 11.158088731765748,
"ce_loss_39": 11.15653133392334,
"ce_loss_52": 1.435088688135147,
"ce_loss_7": 11.476131820678711,
"epoch": 0.002,
"grad_norm": 35.67456270110165,
"kl_loss_13": 20723.2,
"kl_loss_26": 20118.4,
"kl_loss_39": 20115.2,
"kl_loss_7": 20764.8,
"learning_rate": 0.0002,
"loss": 40904.0,
"step": 20
},
{
"ce_loss_13": 11.150281167030334,
"ce_loss_26": 11.01296763420105,
"ce_loss_39": 11.044581699371339,
"ce_loss_52": 1.4344331562519073,
"ce_loss_7": 11.054779505729675,
"epoch": 0.003,
"grad_norm": 54.04879245830703,
"kl_loss_13": 20108.8,
"kl_loss_26": 19840.0,
"kl_loss_39": 19907.2,
"kl_loss_7": 19920.0,
"learning_rate": 0.0003,
"loss": 39847.2,
"step": 30
},
{
"ce_loss_13": 10.505089902877808,
"ce_loss_26": 10.497783017158508,
"ce_loss_39": 10.527814579010009,
"ce_loss_52": 1.460255417227745,
"ce_loss_7": 10.453347158432006,
"epoch": 0.004,
"grad_norm": 29.567872258029254,
"kl_loss_13": 18694.4,
"kl_loss_26": 18688.0,
"kl_loss_39": 18755.2,
"kl_loss_7": 18588.8,
"learning_rate": 0.0004,
"loss": 37436.0,
"step": 40
},
{
"ce_loss_13": 10.321261882781982,
"ce_loss_26": 10.244042158126831,
"ce_loss_39": 10.236308455467224,
"ce_loss_52": 1.463668829202652,
"ce_loss_7": 10.305184721946716,
"epoch": 0.005,
"grad_norm": 37.9866452371617,
"kl_loss_13": 18329.6,
"kl_loss_26": 18163.2,
"kl_loss_39": 18140.8,
"kl_loss_7": 18288.0,
"learning_rate": 0.0005,
"loss": 36555.2,
"step": 50
},
{
"ce_loss_13": 10.226529097557068,
"ce_loss_26": 10.111042308807374,
"ce_loss_39": 10.11395993232727,
"ce_loss_52": 1.4317695140838622,
"ce_loss_7": 10.210216856002807,
"epoch": 0.006,
"grad_norm": 47.58894649950276,
"kl_loss_13": 18208.0,
"kl_loss_26": 17974.4,
"kl_loss_39": 17980.8,
"kl_loss_7": 18166.4,
"learning_rate": 0.0006,
"loss": 36044.0,
"step": 60
},
{
"ce_loss_13": 10.142269968986511,
"ce_loss_26": 10.005614733695984,
"ce_loss_39": 10.006310772895812,
"ce_loss_52": 1.3979130625724792,
"ce_loss_7": 10.13636019229889,
"epoch": 0.007,
"grad_norm": 55.16387671378209,
"kl_loss_13": 18057.6,
"kl_loss_26": 17772.8,
"kl_loss_39": 17792.0,
"kl_loss_7": 18048.0,
"learning_rate": 0.0007,
"loss": 35718.4,
"step": 70
},
{
"ce_loss_13": 10.032484984397888,
"ce_loss_26": 9.872331905364991,
"ce_loss_39": 9.881066274642944,
"ce_loss_52": 1.4247985988855363,
"ce_loss_7": 10.02949812412262,
"epoch": 0.008,
"grad_norm": 59.28947925840698,
"kl_loss_13": 17811.2,
"kl_loss_26": 17488.0,
"kl_loss_39": 17500.8,
"kl_loss_7": 17808.0,
"learning_rate": 0.0008,
"loss": 35334.4,
"step": 80
},
{
"ce_loss_13": 9.942931509017944,
"ce_loss_26": 9.76176996231079,
"ce_loss_39": 9.775496363639832,
"ce_loss_52": 1.4258457243442535,
"ce_loss_7": 9.945418453216552,
"epoch": 0.009,
"grad_norm": 55.94798234885439,
"kl_loss_13": 17600.0,
"kl_loss_26": 17222.4,
"kl_loss_39": 17257.6,
"kl_loss_7": 17600.0,
"learning_rate": 0.0009000000000000001,
"loss": 34900.0,
"step": 90
},
{
"ce_loss_13": 9.852771949768066,
"ce_loss_26": 9.661247444152831,
"ce_loss_39": 9.673552298545838,
"ce_loss_52": 1.438367447257042,
"ce_loss_7": 9.860591006278991,
"epoch": 0.01,
"grad_norm": 53.3090381296634,
"kl_loss_13": 17385.6,
"kl_loss_26": 16992.0,
"kl_loss_39": 17024.0,
"kl_loss_7": 17398.4,
"learning_rate": 0.001,
"loss": 34482.8,
"step": 100
},
{
"ce_loss_13": 9.76314606666565,
"ce_loss_26": 9.563278603553773,
"ce_loss_39": 9.578891181945801,
"ce_loss_52": 1.412995059788227,
"ce_loss_7": 9.781555676460266,
"epoch": 0.011,
"grad_norm": 53.230976887502294,
"kl_loss_13": 17251.2,
"kl_loss_26": 16836.8,
"kl_loss_39": 16870.4,
"kl_loss_7": 17305.6,
"learning_rate": 0.0009999974825027757,
"loss": 34052.4,
"step": 110
},
{
"ce_loss_13": 9.681499814987182,
"ce_loss_26": 9.470890092849732,
"ce_loss_39": 9.48718273639679,
"ce_loss_52": 1.4235966846346855,
"ce_loss_7": 9.706467342376708,
"epoch": 0.012,
"grad_norm": 53.526819502242695,
"kl_loss_13": 17049.6,
"kl_loss_26": 16612.8,
"kl_loss_39": 16648.0,
"kl_loss_7": 17100.8,
"learning_rate": 0.0009999899300364532,
"loss": 33698.0,
"step": 120
},
{
"ce_loss_13": 9.590748715400697,
"ce_loss_26": 9.367487025260925,
"ce_loss_39": 9.386657476425171,
"ce_loss_52": 1.4183751314878463,
"ce_loss_7": 9.621446299552918,
"epoch": 0.013,
"grad_norm": 52.25839955403129,
"kl_loss_13": 16867.2,
"kl_loss_26": 16417.6,
"kl_loss_39": 16448.0,
"kl_loss_7": 16940.8,
"learning_rate": 0.0009999773426770863,
"loss": 33311.6,
"step": 130
},
{
"ce_loss_13": 9.527826118469239,
"ce_loss_26": 9.299377870559692,
"ce_loss_39": 9.321026277542114,
"ce_loss_52": 1.445027893781662,
"ce_loss_7": 9.561844515800477,
"epoch": 0.014,
"grad_norm": 52.41222674765903,
"kl_loss_13": 16692.8,
"kl_loss_26": 16227.2,
"kl_loss_39": 16273.6,
"kl_loss_7": 16777.6,
"learning_rate": 0.0009999597205514296,
"loss": 33030.8,
"step": 140
},
{
"ce_loss_13": 9.486352849006654,
"ce_loss_26": 9.252249264717102,
"ce_loss_39": 9.267914438247681,
"ce_loss_52": 1.4420335739850998,
"ce_loss_7": 9.524769949913026,
"epoch": 0.015,
"grad_norm": 53.790180993856175,
"kl_loss_13": 16592.0,
"kl_loss_26": 16104.0,
"kl_loss_39": 16132.8,
"kl_loss_7": 16657.6,
"learning_rate": 0.0009999370638369377,
"loss": 32769.2,
"step": 150
},
{
"ce_loss_13": 9.392394828796387,
"ce_loss_26": 9.15080394744873,
"ce_loss_39": 9.170540618896485,
"ce_loss_52": 1.423890632390976,
"ce_loss_7": 9.436835885047913,
"epoch": 0.016,
"grad_norm": 52.65338822593253,
"kl_loss_13": 16464.0,
"kl_loss_26": 15963.2,
"kl_loss_39": 16003.2,
"kl_loss_7": 16563.2,
"learning_rate": 0.000999909372761763,
"loss": 32427.6,
"step": 160
},
{
"ce_loss_13": 9.328035354614258,
"ce_loss_26": 9.082435154914856,
"ce_loss_39": 9.1047847032547,
"ce_loss_52": 1.4349344044923782,
"ce_loss_7": 9.380868554115295,
"epoch": 0.017,
"grad_norm": 52.40583142267758,
"kl_loss_13": 16296.0,
"kl_loss_26": 15777.6,
"kl_loss_39": 15832.0,
"kl_loss_7": 16404.8,
"learning_rate": 0.0009998766476047546,
"loss": 32178.8,
"step": 170
},
{
"ce_loss_13": 9.262916254997254,
"ce_loss_26": 9.01283278465271,
"ce_loss_39": 9.035380673408508,
"ce_loss_52": 1.3936711609363557,
"ce_loss_7": 9.322635316848755,
"epoch": 0.018,
"grad_norm": 52.346136942448574,
"kl_loss_13": 16240.0,
"kl_loss_26": 15716.8,
"kl_loss_39": 15769.6,
"kl_loss_7": 16363.2,
"learning_rate": 0.0009998388886954545,
"loss": 31844.4,
"step": 180
},
{
"ce_loss_13": 9.195696568489074,
"ce_loss_26": 8.94514548778534,
"ce_loss_39": 8.967138314247132,
"ce_loss_52": 1.4523959368467332,
"ce_loss_7": 9.263990116119384,
"epoch": 0.019,
"grad_norm": 51.44547467780483,
"kl_loss_13": 15985.6,
"kl_loss_26": 15464.0,
"kl_loss_39": 15515.2,
"kl_loss_7": 16132.8,
"learning_rate": 0.0009997960964140947,
"loss": 31580.0,
"step": 190
},
{
"ce_loss_13": 9.109345388412475,
"ce_loss_26": 8.856351280212403,
"ce_loss_39": 8.882033634185792,
"ce_loss_52": 1.425847691297531,
"ce_loss_7": 9.182338738441468,
"epoch": 0.02,
"grad_norm": 51.67086637607359,
"kl_loss_13": 15867.2,
"kl_loss_26": 15332.8,
"kl_loss_39": 15393.6,
"kl_loss_7": 16011.2,
"learning_rate": 0.0009997482711915926,
"loss": 31312.8,
"step": 200
},
{
"ce_loss_13": 9.036305499076843,
"ce_loss_26": 8.778529453277589,
"ce_loss_39": 8.803540563583374,
"ce_loss_52": 1.4626984983682632,
"ce_loss_7": 9.118139266967773,
"epoch": 0.021,
"grad_norm": 50.078507295298536,
"kl_loss_13": 15654.4,
"kl_loss_26": 15108.8,
"kl_loss_39": 15168.0,
"kl_loss_7": 15828.8,
"learning_rate": 0.0009996954135095479,
"loss": 31012.0,
"step": 210
},
{
"ce_loss_13": 8.98859736919403,
"ce_loss_26": 8.726609206199646,
"ce_loss_39": 8.75291087627411,
"ce_loss_52": 1.4173608794808388,
"ce_loss_7": 9.076500582695008,
"epoch": 0.022,
"grad_norm": 50.694312927544594,
"kl_loss_13": 15625.6,
"kl_loss_26": 15073.6,
"kl_loss_39": 15126.4,
"kl_loss_7": 15811.2,
"learning_rate": 0.0009996375239002368,
"loss": 30754.8,
"step": 220
},
{
"ce_loss_13": 8.965013265609741,
"ce_loss_26": 8.698115158081055,
"ce_loss_39": 8.71800787448883,
"ce_loss_52": 1.4263556391000747,
"ce_loss_7": 9.061096882820129,
"epoch": 0.023,
"grad_norm": 50.98814656536181,
"kl_loss_13": 15547.2,
"kl_loss_26": 14985.6,
"kl_loss_39": 15028.8,
"kl_loss_7": 15752.0,
"learning_rate": 0.0009995746029466072,
"loss": 30513.6,
"step": 230
},
{
"ce_loss_13": 8.891163158416749,
"ce_loss_26": 8.611021280288696,
"ce_loss_39": 8.63300838470459,
"ce_loss_52": 1.42348592877388,
"ce_loss_7": 8.998197555541992,
"epoch": 0.024,
"grad_norm": 51.614487435815626,
"kl_loss_13": 15393.6,
"kl_loss_26": 14806.4,
"kl_loss_39": 14848.0,
"kl_loss_7": 15619.2,
"learning_rate": 0.0009995066512822719,
"loss": 30248.4,
"step": 240
},
{
"ce_loss_13": 8.831620502471925,
"ce_loss_26": 8.545269632339478,
"ce_loss_39": 8.565073847770691,
"ce_loss_52": 1.452064010500908,
"ce_loss_7": 8.941647911071778,
"epoch": 0.025,
"grad_norm": 50.247038771654694,
"kl_loss_13": 15233.6,
"kl_loss_26": 14628.8,
"kl_loss_39": 14667.2,
"kl_loss_7": 15462.4,
"learning_rate": 0.000999433669591504,
"loss": 29955.6,
"step": 250
},
{
"ce_loss_13": 8.755429339408874,
"ce_loss_26": 8.469949841499329,
"ce_loss_39": 8.486004614830017,
"ce_loss_52": 1.4328533172607423,
"ce_loss_7": 8.874198198318481,
"epoch": 0.026,
"grad_norm": 49.63162227981652,
"kl_loss_13": 15075.2,
"kl_loss_26": 14473.6,
"kl_loss_39": 14508.8,
"kl_loss_7": 15326.4,
"learning_rate": 0.000999355658609228,
"loss": 29717.2,
"step": 260
},
{
"ce_loss_13": 8.697371244430542,
"ce_loss_26": 8.401416063308716,
"ce_loss_39": 8.412619948387146,
"ce_loss_52": 1.4436978071928024,
"ce_loss_7": 8.819016146659852,
"epoch": 0.027,
"grad_norm": 51.15355718783705,
"kl_loss_13": 14974.4,
"kl_loss_26": 14347.2,
"kl_loss_39": 14377.6,
"kl_loss_7": 15235.2,
"learning_rate": 0.0009992726191210138,
"loss": 29500.4,
"step": 270
},
{
"ce_loss_13": 8.66446521282196,
"ce_loss_26": 8.36073157787323,
"ce_loss_39": 8.373045516014098,
"ce_loss_52": 1.4337088972330094,
"ce_loss_7": 8.793687105178833,
"epoch": 0.028,
"grad_norm": 50.66919780267985,
"kl_loss_13": 14888.0,
"kl_loss_26": 14254.4,
"kl_loss_39": 14280.0,
"kl_loss_7": 15168.0,
"learning_rate": 0.0009991845519630679,
"loss": 29316.8,
"step": 280
},
{
"ce_loss_13": 8.604181599617004,
"ce_loss_26": 8.292751049995422,
"ce_loss_39": 8.304360592365265,
"ce_loss_52": 1.4289869368076324,
"ce_loss_7": 8.740529561042786,
"epoch": 0.029,
"grad_norm": 49.40131878460191,
"kl_loss_13": 14785.6,
"kl_loss_26": 14123.2,
"kl_loss_39": 14147.2,
"kl_loss_7": 15065.6,
"learning_rate": 0.0009990914580222257,
"loss": 29034.0,
"step": 290
},
{
"ce_loss_13": 8.560984683036803,
"ce_loss_26": 8.244250774383545,
"ce_loss_39": 8.251447391510009,
"ce_loss_52": 1.4629653513431549,
"ce_loss_7": 8.694539451599121,
"epoch": 0.03,
"grad_norm": 49.35953544498034,
"kl_loss_13": 14673.6,
"kl_loss_26": 13996.8,
"kl_loss_39": 14014.4,
"kl_loss_7": 14955.2,
"learning_rate": 0.0009989933382359422,
"loss": 28794.4,
"step": 300
},
{
"ce_loss_13": 8.470300602912904,
"ce_loss_26": 8.14130541086197,
"ce_loss_39": 8.148625028133392,
"ce_loss_52": 1.4505603075027467,
"ce_loss_7": 8.614710068702697,
"epoch": 0.031,
"grad_norm": 49.67315029837232,
"kl_loss_13": 14492.8,
"kl_loss_26": 13798.4,
"kl_loss_39": 13812.8,
"kl_loss_7": 14800.0,
"learning_rate": 0.0009988901935922825,
"loss": 28550.8,
"step": 310
},
{
"ce_loss_13": 8.449520301818847,
"ce_loss_26": 8.124438393115998,
"ce_loss_39": 8.127473723888397,
"ce_loss_52": 1.4619350016117096,
"ce_loss_7": 8.594109392166137,
"epoch": 0.032,
"grad_norm": 49.62699299914443,
"kl_loss_13": 14432.0,
"kl_loss_26": 13737.6,
"kl_loss_39": 13745.6,
"kl_loss_7": 14737.6,
"learning_rate": 0.0009987820251299122,
"loss": 28340.4,
"step": 320
},
{
"ce_loss_13": 8.411065363883973,
"ce_loss_26": 8.065169262886048,
"ce_loss_39": 8.066172111034394,
"ce_loss_52": 1.4555475383996963,
"ce_loss_7": 8.563976049423218,
"epoch": 0.033,
"grad_norm": 48.32599858014616,
"kl_loss_13": 14328.0,
"kl_loss_26": 13596.8,
"kl_loss_39": 13601.6,
"kl_loss_7": 14651.2,
"learning_rate": 0.0009986688339380862,
"loss": 28064.0,
"step": 330
},
{
"ce_loss_13": 8.342015504837036,
"ce_loss_26": 7.984185111522675,
"ce_loss_39": 7.983271932601928,
"ce_loss_52": 1.4297153055667877,
"ce_loss_7": 8.503781509399413,
"epoch": 0.034,
"grad_norm": 49.50369957464881,
"kl_loss_13": 14252.8,
"kl_loss_26": 13504.0,
"kl_loss_39": 13500.8,
"kl_loss_7": 14590.4,
"learning_rate": 0.0009985506211566387,
"loss": 27837.2,
"step": 340
},
{
"ce_loss_13": 8.30728188753128,
"ce_loss_26": 7.943272864818573,
"ce_loss_39": 7.946760308742523,
"ce_loss_52": 1.4311424046754837,
"ce_loss_7": 8.472229743003846,
"epoch": 0.035,
"grad_norm": 49.2761544590419,
"kl_loss_13": 14160.0,
"kl_loss_26": 13390.4,
"kl_loss_39": 13393.6,
"kl_loss_7": 14508.8,
"learning_rate": 0.0009984273879759713,
"loss": 27625.6,
"step": 350
},
{
"ce_loss_13": 8.231205070018769,
"ce_loss_26": 7.874649381637573,
"ce_loss_39": 7.870936000347138,
"ce_loss_52": 1.4489013850688934,
"ce_loss_7": 8.397671937942505,
"epoch": 0.036,
"grad_norm": 49.68016023572489,
"kl_loss_13": 13990.4,
"kl_loss_26": 13236.8,
"kl_loss_39": 13232.0,
"kl_loss_7": 14340.8,
"learning_rate": 0.0009982991356370402,
"loss": 27384.8,
"step": 360
},
{
"ce_loss_13": 8.171039760112762,
"ce_loss_26": 7.7931832551956175,
"ce_loss_39": 7.793972992897034,
"ce_loss_52": 1.4122098296880723,
"ce_loss_7": 8.344593167304993,
"epoch": 0.037,
"grad_norm": 48.18494426167262,
"kl_loss_13": 13920.0,
"kl_loss_26": 13128.0,
"kl_loss_39": 13129.6,
"kl_loss_7": 14291.2,
"learning_rate": 0.0009981658654313456,
"loss": 27248.4,
"step": 370
},
{
"ce_loss_13": 8.160165214538575,
"ce_loss_26": 7.7771016478538515,
"ce_loss_39": 7.771613931655883,
"ce_loss_52": 1.4869922280311585,
"ce_loss_7": 8.338345170021057,
"epoch": 0.038,
"grad_norm": 48.80785692423277,
"kl_loss_13": 13795.2,
"kl_loss_26": 12974.4,
"kl_loss_39": 12960.0,
"kl_loss_7": 14164.8,
"learning_rate": 0.000998027578700917,
"loss": 26976.4,
"step": 380
},
{
"ce_loss_13": 8.052374148368836,
"ce_loss_26": 7.658347594738006,
"ce_loss_39": 7.6583909630775455,
"ce_loss_52": 1.4154588401317596,
"ce_loss_7": 8.239335989952087,
"epoch": 0.039,
"grad_norm": 48.38129929768143,
"kl_loss_13": 13680.0,
"kl_loss_26": 12851.2,
"kl_loss_39": 12844.8,
"kl_loss_7": 14078.4,
"learning_rate": 0.0009978842768382998,
"loss": 26719.2,
"step": 390
},
{
"ce_loss_13": 8.022306847572327,
"ce_loss_26": 7.623832786083222,
"ce_loss_39": 7.616167056560516,
"ce_loss_52": 1.4509258031845094,
"ce_loss_7": 8.209609961509704,
"epoch": 0.04,
"grad_norm": 48.47003917091678,
"kl_loss_13": 13532.8,
"kl_loss_26": 12684.8,
"kl_loss_39": 12668.8,
"kl_loss_7": 13918.4,
"learning_rate": 0.0009977359612865424,
"loss": 26536.4,
"step": 400
},
{
"ce_loss_13": 8.007824766635895,
"ce_loss_26": 7.614881563186645,
"ce_loss_39": 7.602893710136414,
"ce_loss_52": 1.4645162731409074,
"ce_loss_7": 8.19339075088501,
"epoch": 0.041,
"grad_norm": 48.05902996814503,
"kl_loss_13": 13486.4,
"kl_loss_26": 12646.4,
"kl_loss_39": 12627.2,
"kl_loss_7": 13886.4,
"learning_rate": 0.0009975826335391806,
"loss": 26319.6,
"step": 410
},
{
"ce_loss_13": 7.89500185251236,
"ce_loss_26": 7.4844276905059814,
"ce_loss_39": 7.470773124694825,
"ce_loss_52": 1.3909434020519256,
"ce_loss_7": 8.089212799072266,
"epoch": 0.042,
"grad_norm": 47.6532900353141,
"kl_loss_13": 13393.6,
"kl_loss_26": 12528.0,
"kl_loss_39": 12504.0,
"kl_loss_7": 13800.0,
"learning_rate": 0.0009974242951402235,
"loss": 26051.2,
"step": 420
},
{
"ce_loss_13": 7.848755323886872,
"ce_loss_26": 7.436672508716583,
"ce_loss_39": 7.420604693889618,
"ce_loss_52": 1.4549538046121597,
"ce_loss_7": 8.053676557540893,
"epoch": 0.043,
"grad_norm": 47.461866848016506,
"kl_loss_13": 13187.2,
"kl_loss_26": 12316.8,
"kl_loss_39": 12278.4,
"kl_loss_7": 13619.2,
"learning_rate": 0.0009972609476841367,
"loss": 25819.2,
"step": 430
},
{
"ce_loss_13": 7.824773061275482,
"ce_loss_26": 7.38158438205719,
"ce_loss_39": 7.365956795215607,
"ce_loss_52": 1.4214935347437858,
"ce_loss_7": 8.031265962123872,
"epoch": 0.044,
"grad_norm": 47.54154044179159,
"kl_loss_13": 13169.6,
"kl_loss_26": 12241.6,
"kl_loss_39": 12208.0,
"kl_loss_7": 13604.8,
"learning_rate": 0.0009970925928158272,
"loss": 25669.2,
"step": 440
},
{
"ce_loss_13": 7.781041419506073,
"ce_loss_26": 7.349016737937927,
"ce_loss_39": 7.328068232536316,
"ce_loss_52": 1.4457789659500122,
"ce_loss_7": 7.986689484119415,
"epoch": 0.045,
"grad_norm": 47.28488093165922,
"kl_loss_13": 13065.6,
"kl_loss_26": 12150.4,
"kl_loss_39": 12107.2,
"kl_loss_7": 13499.2,
"learning_rate": 0.000996919232230627,
"loss": 25413.2,
"step": 450
},
{
"ce_loss_13": 7.707467567920685,
"ce_loss_26": 7.259365463256836,
"ce_loss_39": 7.2402653932571415,
"ce_loss_52": 1.4384218811988831,
"ce_loss_7": 7.9230645298957825,
"epoch": 0.046,
"grad_norm": 47.47875544223923,
"kl_loss_13": 12928.0,
"kl_loss_26": 11974.4,
"kl_loss_39": 11940.8,
"kl_loss_7": 13380.8,
"learning_rate": 0.0009967408676742752,
"loss": 25149.6,
"step": 460
},
{
"ce_loss_13": 7.682056641578674,
"ce_loss_26": 7.239385926723481,
"ce_loss_39": 7.214358115196228,
"ce_loss_52": 1.4293440610170365,
"ce_loss_7": 7.903320550918579,
"epoch": 0.047,
"grad_norm": 47.616227706878504,
"kl_loss_13": 12899.2,
"kl_loss_26": 11950.4,
"kl_loss_39": 11904.0,
"kl_loss_7": 13360.0,
"learning_rate": 0.0009965575009429006,
"loss": 24954.0,
"step": 470
},
{
"ce_loss_13": 7.673471140861511,
"ce_loss_26": 7.22241450548172,
"ce_loss_39": 7.191142916679382,
"ce_loss_52": 1.4720373705029488,
"ce_loss_7": 7.897368836402893,
"epoch": 0.048,
"grad_norm": 47.35612609507448,
"kl_loss_13": 12763.2,
"kl_loss_26": 11816.0,
"kl_loss_39": 11748.8,
"kl_loss_7": 13236.8,
"learning_rate": 0.0009963691338830043,
"loss": 24784.4,
"step": 480
},
{
"ce_loss_13": 7.61794912815094,
"ce_loss_26": 7.163714337348938,
"ce_loss_39": 7.132104587554932,
"ce_loss_52": 1.4696507424116134,
"ce_loss_7": 7.839430296421051,
"epoch": 0.049,
"grad_norm": 46.6485917664728,
"kl_loss_13": 12664.0,
"kl_loss_26": 11691.2,
"kl_loss_39": 11627.2,
"kl_loss_7": 13139.2,
"learning_rate": 0.0009961757683914405,
"loss": 24543.2,
"step": 490
},
{
"ce_loss_13": 7.507795846462249,
"ce_loss_26": 7.035580575466156,
"ce_loss_39": 7.005417144298553,
"ce_loss_52": 1.4070568919181823,
"ce_loss_7": 7.743252336978912,
"epoch": 0.05,
"grad_norm": 46.67870253681081,
"kl_loss_13": 12545.6,
"kl_loss_26": 11542.4,
"kl_loss_39": 11480.0,
"kl_loss_7": 13043.2,
"learning_rate": 0.0009959774064153978,
"loss": 24344.0,
"step": 500
},
{
"ce_loss_13": 7.495815181732178,
"ce_loss_26": 7.0229366540908815,
"ce_loss_39": 6.990859532356263,
"ce_loss_52": 1.4116702109575272,
"ce_loss_7": 7.736506867408752,
"epoch": 0.051,
"grad_norm": 46.06552599373074,
"kl_loss_13": 12505.6,
"kl_loss_26": 11507.2,
"kl_loss_39": 11433.6,
"kl_loss_7": 13009.6,
"learning_rate": 0.0009957740499523787,
"loss": 24160.0,
"step": 510
},
{
"ce_loss_13": 7.443893933296204,
"ce_loss_26": 6.961520659923553,
"ce_loss_39": 6.922756457328797,
"ce_loss_52": 1.4424341320991516,
"ce_loss_7": 7.6887711644172665,
"epoch": 0.052,
"grad_norm": 46.57676837877325,
"kl_loss_13": 12337.6,
"kl_loss_26": 11308.8,
"kl_loss_39": 11225.6,
"kl_loss_7": 12851.2,
"learning_rate": 0.0009955657010501807,
"loss": 23900.8,
"step": 520
},
{
"ce_loss_13": 7.388968002796173,
"ce_loss_26": 6.916476762294769,
"ce_loss_39": 6.874182558059692,
"ce_loss_52": 1.4647160589694976,
"ce_loss_7": 7.6311492919921875,
"epoch": 0.053,
"grad_norm": 46.39702705707381,
"kl_loss_13": 12211.2,
"kl_loss_26": 11196.8,
"kl_loss_39": 11105.6,
"kl_loss_7": 12718.4,
"learning_rate": 0.000995352361806875,
"loss": 23724.4,
"step": 530
},
{
"ce_loss_13": 7.399884676933288,
"ce_loss_26": 6.893076729774475,
"ce_loss_39": 6.849111843109131,
"ce_loss_52": 1.4278682440519332,
"ce_loss_7": 7.656503355503082,
"epoch": 0.054,
"grad_norm": 45.552992496751486,
"kl_loss_13": 12289.6,
"kl_loss_26": 11217.6,
"kl_loss_39": 11131.2,
"kl_loss_7": 12824.0,
"learning_rate": 0.0009951340343707852,
"loss": 23503.2,
"step": 540
},
{
"ce_loss_13": 7.299449789524078,
"ce_loss_26": 6.799772572517395,
"ce_loss_39": 6.753718996047974,
"ce_loss_52": 1.447503750026226,
"ce_loss_7": 7.554943478107452,
"epoch": 0.055,
"grad_norm": 45.64724152253333,
"kl_loss_13": 12067.2,
"kl_loss_26": 10992.0,
"kl_loss_39": 10900.8,
"kl_loss_7": 12608.0,
"learning_rate": 0.0009949107209404665,
"loss": 23326.0,
"step": 550
},
{
"ce_loss_13": 7.293551552295685,
"ce_loss_26": 6.792212247848511,
"ce_loss_39": 6.740794622898102,
"ce_loss_52": 1.4703042089939118,
"ce_loss_7": 7.558422005176544,
"epoch": 0.056,
"grad_norm": 45.60106893131463,
"kl_loss_13": 11990.4,
"kl_loss_26": 10931.2,
"kl_loss_39": 10820.8,
"kl_loss_7": 12540.8,
"learning_rate": 0.0009946824237646824,
"loss": 23102.4,
"step": 560
},
{
"ce_loss_13": 7.1714133501052855,
"ce_loss_26": 6.658501255512237,
"ce_loss_39": 6.615163576602936,
"ce_loss_52": 1.4395473554730416,
"ce_loss_7": 7.438856053352356,
"epoch": 0.057,
"grad_norm": 45.02028770939811,
"kl_loss_13": 11819.2,
"kl_loss_26": 10716.8,
"kl_loss_39": 10633.6,
"kl_loss_7": 12379.2,
"learning_rate": 0.0009944491451423828,
"loss": 22860.0,
"step": 570
},
{
"ce_loss_13": 7.202235555648803,
"ce_loss_26": 6.668069064617157,
"ce_loss_39": 6.622809886932373,
"ce_loss_52": 1.4502787292003632,
"ce_loss_7": 7.474415194988251,
"epoch": 0.058,
"grad_norm": 45.87654136914787,
"kl_loss_13": 11832.0,
"kl_loss_26": 10697.6,
"kl_loss_39": 10606.4,
"kl_loss_7": 12409.6,
"learning_rate": 0.0009942108874226813,
"loss": 22680.4,
"step": 580
},
{
"ce_loss_13": 7.099943065643311,
"ce_loss_26": 6.574034261703491,
"ce_loss_39": 6.518663537502289,
"ce_loss_52": 1.4500610083341599,
"ce_loss_7": 7.373777639865875,
"epoch": 0.059,
"grad_norm": 45.75547743089401,
"kl_loss_13": 11633.6,
"kl_loss_26": 10512.0,
"kl_loss_39": 10403.2,
"kl_loss_7": 12201.6,
"learning_rate": 0.00099396765300483,
"loss": 22462.8,
"step": 590
},
{
"ce_loss_13": 7.097463607788086,
"ce_loss_26": 6.57372156381607,
"ce_loss_39": 6.511255967617035,
"ce_loss_52": 1.480376410484314,
"ce_loss_7": 7.372353208065033,
"epoch": 0.06,
"grad_norm": 45.20967365382369,
"kl_loss_13": 11558.4,
"kl_loss_26": 10451.2,
"kl_loss_39": 10324.8,
"kl_loss_7": 12145.6,
"learning_rate": 0.0009937194443381972,
"loss": 22282.4,
"step": 600
},
{
"ce_loss_13": 7.056500816345215,
"ce_loss_26": 6.520907533168793,
"ce_loss_39": 6.457597935199738,
"ce_loss_52": 1.4534808412194251,
"ce_loss_7": 7.339275515079498,
"epoch": 0.061,
"grad_norm": 44.17007512472295,
"kl_loss_13": 11520.0,
"kl_loss_26": 10382.4,
"kl_loss_39": 10249.6,
"kl_loss_7": 12110.4,
"learning_rate": 0.0009934662639222412,
"loss": 22080.0,
"step": 610
},
{
"ce_loss_13": 6.959627556800842,
"ce_loss_26": 6.4156983375549315,
"ce_loss_39": 6.353180265426635,
"ce_loss_52": 1.4930268943309783,
"ce_loss_7": 7.244034695625305,
"epoch": 0.062,
"grad_norm": 43.92075543466765,
"kl_loss_13": 11259.2,
"kl_loss_26": 10102.4,
"kl_loss_39": 9971.2,
"kl_loss_7": 11859.2,
"learning_rate": 0.000993208114306486,
"loss": 21799.2,
"step": 620
},
{
"ce_loss_13": 6.937919509410858,
"ce_loss_26": 6.402250957489014,
"ce_loss_39": 6.331100332736969,
"ce_loss_52": 1.4531458377838136,
"ce_loss_7": 7.226283407211303,
"epoch": 0.063,
"grad_norm": 44.52659706916058,
"kl_loss_13": 11259.2,
"kl_loss_26": 10128.0,
"kl_loss_39": 9988.8,
"kl_loss_7": 11881.6,
"learning_rate": 0.0009929449980904952,
"loss": 21667.2,
"step": 630
},
{
"ce_loss_13": 6.914422643184662,
"ce_loss_26": 6.355719900131225,
"ce_loss_39": 6.2839093685150145,
"ce_loss_52": 1.463460123538971,
"ce_loss_7": 7.208615565299988,
"epoch": 0.064,
"grad_norm": 44.241917484883416,
"kl_loss_13": 11203.2,
"kl_loss_26": 10004.8,
"kl_loss_39": 9865.6,
"kl_loss_7": 11827.2,
"learning_rate": 0.0009926769179238466,
"loss": 21450.4,
"step": 640
},
{
"ce_loss_13": 6.814666819572449,
"ce_loss_26": 6.240503942966461,
"ce_loss_39": 6.164217627048492,
"ce_loss_52": 1.4213469997048378,
"ce_loss_7": 7.121113920211792,
"epoch": 0.065,
"grad_norm": 45.45585410762684,
"kl_loss_13": 11097.6,
"kl_loss_26": 9875.2,
"kl_loss_39": 9726.4,
"kl_loss_7": 11742.4,
"learning_rate": 0.000992403876506104,
"loss": 21273.2,
"step": 650
},
{
"ce_loss_13": 6.807473576068878,
"ce_loss_26": 6.237039804458618,
"ce_loss_39": 6.164605820178986,
"ce_loss_52": 1.4794408291578294,
"ce_loss_7": 7.109469771385193,
"epoch": 0.066,
"grad_norm": 43.77904042873825,
"kl_loss_13": 10964.8,
"kl_loss_26": 9745.6,
"kl_loss_39": 9593.6,
"kl_loss_7": 11603.2,
"learning_rate": 0.0009921258765867918,
"loss": 21034.4,
"step": 660
},
{
"ce_loss_13": 6.720256412029267,
"ce_loss_26": 6.124040985107422,
"ce_loss_39": 6.048683619499206,
"ce_loss_52": 1.4370630145072938,
"ce_loss_7": 7.032277429103852,
"epoch": 0.067,
"grad_norm": 44.21280182860459,
"kl_loss_13": 10864.0,
"kl_loss_26": 9596.8,
"kl_loss_39": 9446.4,
"kl_loss_7": 11528.0,
"learning_rate": 0.0009918429209653662,
"loss": 20815.6,
"step": 670
},
{
"ce_loss_13": 6.73115086555481,
"ce_loss_26": 6.149888730049133,
"ce_loss_39": 6.072007644176483,
"ce_loss_52": 1.4543532699346542,
"ce_loss_7": 7.039518296718597,
"epoch": 0.068,
"grad_norm": 43.58133426683343,
"kl_loss_13": 10844.8,
"kl_loss_26": 9603.2,
"kl_loss_39": 9433.6,
"kl_loss_7": 11494.4,
"learning_rate": 0.0009915550124911866,
"loss": 20688.4,
"step": 680
},
{
"ce_loss_13": 6.683139646053315,
"ce_loss_26": 6.099281096458435,
"ce_loss_39": 6.017751622200012,
"ce_loss_52": 1.4289966225624084,
"ce_loss_7": 6.9959977746009825,
"epoch": 0.069,
"grad_norm": 43.03707399207988,
"kl_loss_13": 10817.6,
"kl_loss_26": 9577.6,
"kl_loss_39": 9414.4,
"kl_loss_7": 11472.0,
"learning_rate": 0.0009912621540634887,
"loss": 20494.0,
"step": 690
},
{
"ce_loss_13": 6.5575969338417055,
"ce_loss_26": 5.94709130525589,
"ce_loss_39": 5.865298080444336,
"ce_loss_52": 1.3811550110578537,
"ce_loss_7": 6.883859884738922,
"epoch": 0.07,
"grad_norm": 43.657034485471186,
"kl_loss_13": 10611.2,
"kl_loss_26": 9316.8,
"kl_loss_39": 9148.8,
"kl_loss_7": 11299.2,
"learning_rate": 0.0009909643486313534,
"loss": 20224.4,
"step": 700
},
{
"ce_loss_13": 6.581148624420166,
"ce_loss_26": 5.951541697978973,
"ce_loss_39": 5.867393767833709,
"ce_loss_52": 1.417145846784115,
"ce_loss_7": 6.908858215808868,
"epoch": 0.071,
"grad_norm": 42.31273006993064,
"kl_loss_13": 10628.8,
"kl_loss_26": 9294.4,
"kl_loss_39": 9120.0,
"kl_loss_7": 11320.0,
"learning_rate": 0.000990661599193678,
"loss": 20075.6,
"step": 710
},
{
"ce_loss_13": 6.503521502017975,
"ce_loss_26": 5.871239483356476,
"ce_loss_39": 5.7897450685501095,
"ce_loss_52": 1.4011695250868796,
"ce_loss_7": 6.844956791400909,
"epoch": 0.072,
"grad_norm": 42.36356368480549,
"kl_loss_13": 10488.0,
"kl_loss_26": 9147.2,
"kl_loss_39": 8979.2,
"kl_loss_7": 11206.4,
"learning_rate": 0.0009903539087991462,
"loss": 19811.6,
"step": 720
},
{
"ce_loss_13": 6.489633810520172,
"ce_loss_26": 5.87660802602768,
"ce_loss_39": 5.779063713550568,
"ce_loss_52": 1.439223274588585,
"ce_loss_7": 6.819329023361206,
"epoch": 0.073,
"grad_norm": 42.98993238073801,
"kl_loss_13": 10366.4,
"kl_loss_26": 9057.6,
"kl_loss_39": 8861.6,
"kl_loss_7": 11059.2,
"learning_rate": 0.0009900412805461966,
"loss": 19744.8,
"step": 730
},
{
"ce_loss_13": 6.4397171378135685,
"ce_loss_26": 5.814687025547028,
"ce_loss_39": 5.716581547260285,
"ce_loss_52": 1.4390251755714416,
"ce_loss_7": 6.779693508148194,
"epoch": 0.074,
"grad_norm": 42.877595561482536,
"kl_loss_13": 10267.2,
"kl_loss_26": 8939.2,
"kl_loss_39": 8734.4,
"kl_loss_7": 10982.4,
"learning_rate": 0.0009897237175829927,
"loss": 19478.8,
"step": 740
},
{
"ce_loss_13": 6.3779888391494755,
"ce_loss_26": 5.756749665737152,
"ce_loss_39": 5.652812826633453,
"ce_loss_52": 1.4100830882787705,
"ce_loss_7": 6.712257170677185,
"epoch": 0.075,
"grad_norm": 43.56161359476007,
"kl_loss_13": 10203.2,
"kl_loss_26": 8863.2,
"kl_loss_39": 8649.6,
"kl_loss_7": 10920.0,
"learning_rate": 0.0009894012231073895,
"loss": 19311.6,
"step": 750
},
{
"ce_loss_13": 6.351921963691711,
"ce_loss_26": 5.711096298694611,
"ce_loss_39": 5.613580751419067,
"ce_loss_52": 1.4703039675951004,
"ce_loss_7": 6.6900406837463375,
"epoch": 0.076,
"grad_norm": 41.581645996763,
"kl_loss_13": 10056.0,
"kl_loss_26": 8678.4,
"kl_loss_39": 8485.6,
"kl_loss_7": 10764.8,
"learning_rate": 0.0009890738003669028,
"loss": 19128.0,
"step": 760
},
{
"ce_loss_13": 6.329116785526276,
"ce_loss_26": 5.685759162902832,
"ce_loss_39": 5.58324785232544,
"ce_loss_52": 1.4396527051925658,
"ce_loss_7": 6.677184915542602,
"epoch": 0.077,
"grad_norm": 40.86594229703089,
"kl_loss_13": 10036.8,
"kl_loss_26": 8680.0,
"kl_loss_39": 8467.2,
"kl_loss_7": 10760.0,
"learning_rate": 0.0009887414526586764,
"loss": 18930.4,
"step": 770
},
{
"ce_loss_13": 6.279877305030823,
"ce_loss_26": 5.617447376251221,
"ce_loss_39": 5.507227098941803,
"ce_loss_52": 1.4374216616153717,
"ce_loss_7": 6.634308731555938,
"epoch": 0.078,
"grad_norm": 41.180826238519536,
"kl_loss_13": 9923.2,
"kl_loss_26": 8513.6,
"kl_loss_39": 8292.8,
"kl_loss_7": 10667.2,
"learning_rate": 0.0009884041833294476,
"loss": 18733.6,
"step": 780
},
{
"ce_loss_13": 6.212144470214843,
"ce_loss_26": 5.565514934062958,
"ce_loss_39": 5.445651924610138,
"ce_loss_52": 1.4184710115194321,
"ce_loss_7": 6.563052010536194,
"epoch": 0.079,
"grad_norm": 41.51169269505913,
"kl_loss_13": 9840.0,
"kl_loss_26": 8459.2,
"kl_loss_39": 8207.2,
"kl_loss_7": 10576.0,
"learning_rate": 0.000988061995775515,
"loss": 18618.8,
"step": 790
},
{
"ce_loss_13": 6.177972686290741,
"ce_loss_26": 5.5426277875900265,
"ce_loss_39": 5.430073320865631,
"ce_loss_52": 1.4582359090447425,
"ce_loss_7": 6.532871425151825,
"epoch": 0.08,
"grad_norm": 41.06171415513337,
"kl_loss_13": 9713.6,
"kl_loss_26": 8348.0,
"kl_loss_39": 8122.4,
"kl_loss_7": 10464.0,
"learning_rate": 0.0009877148934427035,
"loss": 18370.0,
"step": 800
},
{
"ce_loss_13": 6.174833989143371,
"ce_loss_26": 5.505520594120026,
"ce_loss_39": 5.391082692146301,
"ce_loss_52": 1.4291342854499818,
"ce_loss_7": 6.535899603366852,
"epoch": 0.081,
"grad_norm": 40.55915083586062,
"kl_loss_13": 9748.8,
"kl_loss_26": 8332.0,
"kl_loss_39": 8094.4,
"kl_loss_7": 10502.4,
"learning_rate": 0.0009873628798263297,
"loss": 18197.2,
"step": 810
},
{
"ce_loss_13": 6.106976389884949,
"ce_loss_26": 5.425193250179291,
"ce_loss_39": 5.297831201553345,
"ce_loss_52": 1.4520869970321655,
"ce_loss_7": 6.4676952958106995,
"epoch": 0.082,
"grad_norm": 39.176828574493044,
"kl_loss_13": 9564.8,
"kl_loss_26": 8108.0,
"kl_loss_39": 7852.0,
"kl_loss_7": 10324.8,
"learning_rate": 0.0009870059584711668,
"loss": 17988.4,
"step": 820
},
{
"ce_loss_13": 6.029178476333618,
"ce_loss_26": 5.369020164012909,
"ce_loss_39": 5.247223997116089,
"ce_loss_52": 1.4342376589775085,
"ce_loss_7": 6.38949601650238,
"epoch": 0.083,
"grad_norm": 41.3023886018674,
"kl_loss_13": 9422.4,
"kl_loss_26": 8008.8,
"kl_loss_39": 7756.0,
"kl_loss_7": 10184.0,
"learning_rate": 0.000986644132971409,
"loss": 17788.4,
"step": 830
},
{
"ce_loss_13": 6.009692323207855,
"ce_loss_26": 5.3266006231307985,
"ce_loss_39": 5.202202546596527,
"ce_loss_52": 1.4376018613576889,
"ce_loss_7": 6.372254419326782,
"epoch": 0.084,
"grad_norm": 39.84971146906691,
"kl_loss_13": 9387.2,
"kl_loss_26": 7916.8,
"kl_loss_39": 7663.2,
"kl_loss_7": 10155.2,
"learning_rate": 0.0009862774069706345,
"loss": 17687.8,
"step": 840
},
{
"ce_loss_13": 5.948546409606934,
"ce_loss_26": 5.290343832969666,
"ce_loss_39": 5.16569093465805,
"ce_loss_52": 1.4315639585256577,
"ce_loss_7": 6.303727805614471,
"epoch": 0.085,
"grad_norm": 38.79997549953815,
"kl_loss_13": 9260.8,
"kl_loss_26": 7848.0,
"kl_loss_39": 7593.6,
"kl_loss_7": 10009.6,
"learning_rate": 0.000985905784161771,
"loss": 17478.4,
"step": 850
},
{
"ce_loss_13": 5.976463770866394,
"ce_loss_26": 5.285696280002594,
"ce_loss_39": 5.158673858642578,
"ce_loss_52": 1.4285172358155251,
"ce_loss_7": 6.345685577392578,
"epoch": 0.086,
"grad_norm": 39.11215287158734,
"kl_loss_13": 9323.2,
"kl_loss_26": 7843.2,
"kl_loss_39": 7588.0,
"kl_loss_7": 10100.8,
"learning_rate": 0.000985529268287055,
"loss": 17353.8,
"step": 860
},
{
"ce_loss_13": 5.890017306804657,
"ce_loss_26": 5.188625490665435,
"ce_loss_39": 5.061676156520844,
"ce_loss_52": 1.427770259976387,
"ce_loss_7": 6.267517876625061,
"epoch": 0.087,
"grad_norm": 38.38012767193544,
"kl_loss_13": 9177.6,
"kl_loss_26": 7678.4,
"kl_loss_39": 7415.2,
"kl_loss_7": 9971.2,
"learning_rate": 0.0009851478631379982,
"loss": 17143.4,
"step": 870
},
{
"ce_loss_13": 5.8172935247421265,
"ce_loss_26": 5.092825090885162,
"ce_loss_39": 4.964837598800659,
"ce_loss_52": 1.3596146881580353,
"ce_loss_7": 6.200971674919129,
"epoch": 0.088,
"grad_norm": 38.67673909990335,
"kl_loss_13": 9150.4,
"kl_loss_26": 7612.8,
"kl_loss_39": 7350.4,
"kl_loss_7": 9947.2,
"learning_rate": 0.0009847615725553456,
"loss": 17046.8,
"step": 880
},
{
"ce_loss_13": 5.872656679153442,
"ce_loss_26": 5.153263211250305,
"ce_loss_39": 5.00926034450531,
"ce_loss_52": 1.4231197819113732,
"ce_loss_7": 6.253647100925446,
"epoch": 0.089,
"grad_norm": 38.12938597789528,
"kl_loss_13": 9128.0,
"kl_loss_26": 7595.2,
"kl_loss_39": 7309.6,
"kl_loss_7": 9923.2,
"learning_rate": 0.0009843704004290394,
"loss": 16917.8,
"step": 890
},
{
"ce_loss_13": 5.798008918762207,
"ce_loss_26": 5.091720676422119,
"ce_loss_39": 4.938081228733063,
"ce_loss_52": 1.4382835403084755,
"ce_loss_7": 6.171303284168244,
"epoch": 0.09,
"grad_norm": 37.16474091329711,
"kl_loss_13": 8936.0,
"kl_loss_26": 7427.2,
"kl_loss_39": 7121.6,
"kl_loss_7": 9726.4,
"learning_rate": 0.0009839743506981783,
"loss": 16656.0,
"step": 900
},
{
"ce_loss_13": 5.82405720949173,
"ce_loss_26": 5.096203732490539,
"ce_loss_39": 4.965065968036652,
"ce_loss_52": 1.4636528208851813,
"ce_loss_7": 6.201912236213684,
"epoch": 0.091,
"grad_norm": 36.36947394693034,
"kl_loss_13": 8939.2,
"kl_loss_26": 7354.4,
"kl_loss_39": 7080.0,
"kl_loss_7": 9747.2,
"learning_rate": 0.0009835734273509786,
"loss": 16529.0,
"step": 910
},
{
"ce_loss_13": 5.734641706943512,
"ce_loss_26": 5.011995434761047,
"ce_loss_39": 4.85785391330719,
"ce_loss_52": 1.4457294046878815,
"ce_loss_7": 6.11446977853775,
"epoch": 0.092,
"grad_norm": 36.64530545748263,
"kl_loss_13": 8840.8,
"kl_loss_26": 7284.8,
"kl_loss_39": 6967.2,
"kl_loss_7": 9636.8,
"learning_rate": 0.0009831676344247342,
"loss": 16343.8,
"step": 920
},
{
"ce_loss_13": 5.681211936473846,
"ce_loss_26": 4.933374917507171,
"ce_loss_39": 4.788007187843323,
"ce_loss_52": 1.3833691507577897,
"ce_loss_7": 6.066722130775451,
"epoch": 0.093,
"grad_norm": 37.601960547328346,
"kl_loss_13": 8819.2,
"kl_loss_26": 7232.8,
"kl_loss_39": 6926.4,
"kl_loss_7": 9620.8,
"learning_rate": 0.0009827569760057755,
"loss": 16262.8,
"step": 930
},
{
"ce_loss_13": 5.685943353176117,
"ce_loss_26": 4.954251933097839,
"ce_loss_39": 4.7964679479599,
"ce_loss_52": 1.4205988943576813,
"ce_loss_7": 6.062719237804413,
"epoch": 0.094,
"grad_norm": 34.90292075240395,
"kl_loss_13": 8707.2,
"kl_loss_26": 7154.4,
"kl_loss_39": 6841.6,
"kl_loss_7": 9496.0,
"learning_rate": 0.000982341456229428,
"loss": 16011.8,
"step": 940
},
{
"ce_loss_13": 5.640336573123932,
"ce_loss_26": 4.914572691917419,
"ce_loss_39": 4.756339108943939,
"ce_loss_52": 1.46774483025074,
"ce_loss_7": 6.0160892605781555,
"epoch": 0.095,
"grad_norm": 35.81614055285293,
"kl_loss_13": 8553.6,
"kl_loss_26": 6999.2,
"kl_loss_39": 6671.2,
"kl_loss_7": 9348.8,
"learning_rate": 0.000981921079279971,
"loss": 15864.8,
"step": 950
},
{
"ce_loss_13": 5.622566449642181,
"ce_loss_26": 4.879516458511352,
"ce_loss_39": 4.717178559303283,
"ce_loss_52": 1.4239765584468842,
"ce_loss_7": 5.998941457271576,
"epoch": 0.096,
"grad_norm": 35.60374709072334,
"kl_loss_13": 8597.6,
"kl_loss_26": 7007.2,
"kl_loss_39": 6670.4,
"kl_loss_7": 9384.0,
"learning_rate": 0.0009814958493905962,
"loss": 15764.6,
"step": 960
},
{
"ce_loss_13": 5.552615082263946,
"ce_loss_26": 4.812614411115646,
"ce_loss_39": 4.667081838846206,
"ce_loss_52": 1.4328487768769265,
"ce_loss_7": 5.935272622108459,
"epoch": 0.097,
"grad_norm": 34.10755627830643,
"kl_loss_13": 8454.4,
"kl_loss_26": 6880.8,
"kl_loss_39": 6574.4,
"kl_loss_7": 9254.4,
"learning_rate": 0.0009810657708433637,
"loss": 15541.8,
"step": 970
},
{
"ce_loss_13": 5.534706914424897,
"ce_loss_26": 4.787809383869171,
"ce_loss_39": 4.631017792224884,
"ce_loss_52": 1.4364599764347077,
"ce_loss_7": 5.910705745220184,
"epoch": 0.098,
"grad_norm": 33.30412559481911,
"kl_loss_13": 8426.4,
"kl_loss_26": 6818.4,
"kl_loss_39": 6496.0,
"kl_loss_7": 9220.8,
"learning_rate": 0.0009806308479691594,
"loss": 15486.2,
"step": 980
},
{
"ce_loss_13": 5.455054485797882,
"ce_loss_26": 4.712761473655701,
"ce_loss_39": 4.549750781059265,
"ce_loss_52": 1.442040067911148,
"ce_loss_7": 5.841645193099976,
"epoch": 0.099,
"grad_norm": 34.061312435089185,
"kl_loss_13": 8225.6,
"kl_loss_26": 6628.8,
"kl_loss_39": 6296.8,
"kl_loss_7": 9036.8,
"learning_rate": 0.0009801910851476522,
"loss": 15382.2,
"step": 990
},
{
"ce_loss_13": 5.4525358319282535,
"ce_loss_26": 4.71786208152771,
"ce_loss_39": 4.559128785133362,
"ce_loss_52": 1.4428577244281768,
"ce_loss_7": 5.826938045024872,
"epoch": 0.1,
"grad_norm": 33.6953047069211,
"kl_loss_13": 8196.0,
"kl_loss_26": 6628.8,
"kl_loss_39": 6297.6,
"kl_loss_7": 8995.2,
"learning_rate": 0.0009797464868072487,
"loss": 15156.4,
"step": 1000
},
{
"ce_loss_13": 5.445610964298249,
"ce_loss_26": 4.6739885926246645,
"ce_loss_39": 4.508283615112305,
"ce_loss_52": 1.4168697059154511,
"ce_loss_7": 5.834221661090851,
"epoch": 0.101,
"grad_norm": 32.70197394198742,
"kl_loss_13": 8233.6,
"kl_loss_26": 6591.2,
"kl_loss_39": 6246.4,
"kl_loss_7": 9051.2,
"learning_rate": 0.0009792970574250492,
"loss": 14993.6,
"step": 1010
},
{
"ce_loss_13": 5.368366336822509,
"ce_loss_26": 4.5666680335998535,
"ce_loss_39": 4.400501304864884,
"ce_loss_52": 1.3814861461520196,
"ce_loss_7": 5.762167453765869,
"epoch": 0.102,
"grad_norm": 32.28847151546708,
"kl_loss_13": 8145.6,
"kl_loss_26": 6444.0,
"kl_loss_39": 6111.2,
"kl_loss_7": 8969.6,
"learning_rate": 0.0009788428015268028,
"loss": 14902.6,
"step": 1020
},
{
"ce_loss_13": 5.416829228401184,
"ce_loss_26": 4.672497856616974,
"ce_loss_39": 4.5039793968200685,
"ce_loss_52": 1.4590631812810897,
"ce_loss_7": 5.7889638304710385,
"epoch": 0.103,
"grad_norm": 32.94054692999689,
"kl_loss_13": 8076.0,
"kl_loss_26": 6480.0,
"kl_loss_39": 6138.4,
"kl_loss_7": 8867.2,
"learning_rate": 0.0009783837236868609,
"loss": 14715.4,
"step": 1030
},
{
"ce_loss_13": 5.323912274837494,
"ce_loss_26": 4.551275789737701,
"ce_loss_39": 4.376495039463043,
"ce_loss_52": 1.4376014918088913,
"ce_loss_7": 5.712179851531983,
"epoch": 0.104,
"grad_norm": 32.580444775774126,
"kl_loss_13": 7934.4,
"kl_loss_26": 6285.6,
"kl_loss_39": 5924.0,
"kl_loss_7": 8752.0,
"learning_rate": 0.0009779198285281327,
"loss": 14586.6,
"step": 1040
},
{
"ce_loss_13": 5.36049393415451,
"ce_loss_26": 4.592142331600189,
"ce_loss_39": 4.4142293453216555,
"ce_loss_52": 1.4498057544231415,
"ce_loss_7": 5.73596283197403,
"epoch": 0.105,
"grad_norm": 31.80216030064833,
"kl_loss_13": 7976.8,
"kl_loss_26": 6341.6,
"kl_loss_39": 5988.8,
"kl_loss_7": 8771.2,
"learning_rate": 0.0009774511207220368,
"loss": 14415.8,
"step": 1050
},
{
"ce_loss_13": 5.3308478713035585,
"ce_loss_26": 4.5750040173530575,
"ce_loss_39": 4.39155302643776,
"ce_loss_52": 1.4785875469446181,
"ce_loss_7": 5.713631689548492,
"epoch": 0.106,
"grad_norm": 31.180344664143906,
"kl_loss_13": 7883.2,
"kl_loss_26": 6267.2,
"kl_loss_39": 5887.2,
"kl_loss_7": 8687.2,
"learning_rate": 0.0009769776049884564,
"loss": 14270.6,
"step": 1060
},
{
"ce_loss_13": 5.3476661205291744,
"ce_loss_26": 4.561805117130279,
"ce_loss_39": 4.391524451971054,
"ce_loss_52": 1.4550457745790482,
"ce_loss_7": 5.725461614131928,
"epoch": 0.107,
"grad_norm": 30.9396118714078,
"kl_loss_13": 7968.0,
"kl_loss_26": 6308.0,
"kl_loss_39": 5942.4,
"kl_loss_7": 8754.4,
"learning_rate": 0.0009764992860956889,
"loss": 14268.8,
"step": 1070
},
{
"ce_loss_13": 5.248398435115814,
"ce_loss_26": 4.472868782281876,
"ce_loss_39": 4.294939804077148,
"ce_loss_52": 1.4237870454788208,
"ce_loss_7": 5.6272268176078795,
"epoch": 0.108,
"grad_norm": 30.45094259940441,
"kl_loss_13": 7834.4,
"kl_loss_26": 6169.6,
"kl_loss_39": 5812.0,
"kl_loss_7": 8620.8,
"learning_rate": 0.0009760161688604008,
"loss": 14058.8,
"step": 1080
},
{
"ce_loss_13": 5.167715132236481,
"ce_loss_26": 4.421313828229904,
"ce_loss_39": 4.241793435811997,
"ce_loss_52": 1.4617454051971435,
"ce_loss_7": 5.536553728580475,
"epoch": 0.109,
"grad_norm": 30.37346177682711,
"kl_loss_13": 7596.8,
"kl_loss_26": 5987.2,
"kl_loss_39": 5622.4,
"kl_loss_7": 8367.2,
"learning_rate": 0.0009755282581475768,
"loss": 13946.4,
"step": 1090
},
{
"ce_loss_13": 5.21688460111618,
"ce_loss_26": 4.442314791679382,
"ce_loss_39": 4.254946118593216,
"ce_loss_52": 1.4511815324425696,
"ce_loss_7": 5.593706953525543,
"epoch": 0.11,
"grad_norm": 30.665307755441862,
"kl_loss_13": 7699.2,
"kl_loss_26": 6030.4,
"kl_loss_39": 5652.0,
"kl_loss_7": 8485.6,
"learning_rate": 0.0009750355588704727,
"loss": 13825.8,
"step": 1100
},
{
"ce_loss_13": 5.112356352806091,
"ce_loss_26": 4.310530138015747,
"ce_loss_39": 4.125328695774078,
"ce_loss_52": 1.4114144504070283,
"ce_loss_7": 5.487049925327301,
"epoch": 0.111,
"grad_norm": 29.407721301071028,
"kl_loss_13": 7569.6,
"kl_loss_26": 5844.8,
"kl_loss_39": 5470.4,
"kl_loss_7": 8359.2,
"learning_rate": 0.0009745380759905647,
"loss": 13627.8,
"step": 1110
},
{
"ce_loss_13": 5.087459588050843,
"ce_loss_26": 4.285146009922028,
"ce_loss_39": 4.101419150829315,
"ce_loss_52": 1.3823944509029389,
"ce_loss_7": 5.47238245010376,
"epoch": 0.112,
"grad_norm": 28.691648596064702,
"kl_loss_13": 7563.2,
"kl_loss_26": 5855.2,
"kl_loss_39": 5476.0,
"kl_loss_7": 8378.4,
"learning_rate": 0.0009740358145174998,
"loss": 13629.4,
"step": 1120
},
{
"ce_loss_13": 5.085049080848694,
"ce_loss_26": 4.283704102039337,
"ce_loss_39": 4.095863288640976,
"ce_loss_52": 1.4323463156819343,
"ce_loss_7": 5.455269980430603,
"epoch": 0.113,
"grad_norm": 28.708307757554874,
"kl_loss_13": 7468.0,
"kl_loss_26": 5755.2,
"kl_loss_39": 5364.0,
"kl_loss_7": 8244.8,
"learning_rate": 0.0009735287795090455,
"loss": 13475.0,
"step": 1130
},
{
"ce_loss_13": 4.9704699397087095,
"ce_loss_26": 4.164930063486099,
"ce_loss_39": 3.981805819272995,
"ce_loss_52": 1.3983336806297302,
"ce_loss_7": 5.351285874843597,
"epoch": 0.114,
"grad_norm": 30.611214200807577,
"kl_loss_13": 7311.2,
"kl_loss_26": 5585.6,
"kl_loss_39": 5199.2,
"kl_loss_7": 8099.2,
"learning_rate": 0.0009730169760710386,
"loss": 13288.2,
"step": 1140
},
{
"ce_loss_13": 5.094941341876984,
"ce_loss_26": 4.275070035457611,
"ce_loss_39": 4.084649866819381,
"ce_loss_52": 1.4357529014348984,
"ce_loss_7": 5.482879185676575,
"epoch": 0.115,
"grad_norm": 29.988760771554233,
"kl_loss_13": 7468.8,
"kl_loss_26": 5715.2,
"kl_loss_39": 5326.4,
"kl_loss_7": 8275.2,
"learning_rate": 0.0009725004093573342,
"loss": 13196.6,
"step": 1150
},
{
"ce_loss_13": 4.942017900943756,
"ce_loss_26": 4.151496112346649,
"ce_loss_39": 3.957593894004822,
"ce_loss_52": 1.4098813980817795,
"ce_loss_7": 5.323493349552154,
"epoch": 0.116,
"grad_norm": 30.06505974205765,
"kl_loss_13": 7237.6,
"kl_loss_26": 5534.4,
"kl_loss_39": 5135.2,
"kl_loss_7": 8035.2,
"learning_rate": 0.0009719790845697534,
"loss": 13084.4,
"step": 1160
},
{
"ce_loss_13": 4.974001240730286,
"ce_loss_26": 4.16838675737381,
"ce_loss_39": 3.968938571214676,
"ce_loss_52": 1.4311222642660142,
"ce_loss_7": 5.3611521363258365,
"epoch": 0.117,
"grad_norm": 28.226442547632384,
"kl_loss_13": 7241.6,
"kl_loss_26": 5536.8,
"kl_loss_39": 5121.6,
"kl_loss_7": 8045.6,
"learning_rate": 0.0009714530069580309,
"loss": 12959.6,
"step": 1170
},
{
"ce_loss_13": 4.905927586555481,
"ce_loss_26": 4.065518736839294,
"ce_loss_39": 3.8746193647384644,
"ce_loss_52": 1.3948067665100097,
"ce_loss_7": 5.290950679779053,
"epoch": 0.118,
"grad_norm": 26.7183550844667,
"kl_loss_13": 7163.2,
"kl_loss_26": 5388.0,
"kl_loss_39": 4992.8,
"kl_loss_7": 7959.2,
"learning_rate": 0.0009709221818197624,
"loss": 12883.0,
"step": 1180
},
{
"ce_loss_13": 4.902862447500229,
"ce_loss_26": 4.10335453748703,
"ce_loss_39": 3.920765632390976,
"ce_loss_52": 1.4242859303951263,
"ce_loss_7": 5.280882668495178,
"epoch": 0.119,
"grad_norm": 27.541041012815914,
"kl_loss_13": 7096.0,
"kl_loss_26": 5372.8,
"kl_loss_39": 4987.2,
"kl_loss_7": 7895.2,
"learning_rate": 0.0009703866145003512,
"loss": 12755.6,
"step": 1190
},
{
"ce_loss_13": 4.91794501543045,
"ce_loss_26": 4.094452971220017,
"ce_loss_39": 3.892131644487381,
"ce_loss_52": 1.4217353582382202,
"ce_loss_7": 5.298339033126831,
"epoch": 0.12,
"grad_norm": 27.647397498896083,
"kl_loss_13": 7171.2,
"kl_loss_26": 5395.2,
"kl_loss_39": 4985.6,
"kl_loss_7": 7958.4,
"learning_rate": 0.0009698463103929542,
"loss": 12646.8,
"step": 1200
},
{
"ce_loss_13": 4.933221316337585,
"ce_loss_26": 4.129461044073105,
"ce_loss_39": 3.92621054649353,
"ce_loss_52": 1.4751009970903397,
"ce_loss_7": 5.315613615512848,
"epoch": 0.121,
"grad_norm": 26.42127738230431,
"kl_loss_13": 7043.2,
"kl_loss_26": 5317.6,
"kl_loss_39": 4899.2,
"kl_loss_7": 7841.6,
"learning_rate": 0.0009693012749384279,
"loss": 12515.8,
"step": 1210
},
{
"ce_loss_13": 4.865577363967896,
"ce_loss_26": 4.064575934410096,
"ce_loss_39": 3.863145834207535,
"ce_loss_52": 1.4422448396682739,
"ce_loss_7": 5.235701704025269,
"epoch": 0.122,
"grad_norm": 25.567127248660423,
"kl_loss_13": 6988.0,
"kl_loss_26": 5271.2,
"kl_loss_39": 4855.2,
"kl_loss_7": 7760.8,
"learning_rate": 0.0009687515136252732,
"loss": 12484.2,
"step": 1220
},
{
"ce_loss_13": 4.875257205963135,
"ce_loss_26": 4.068110597133637,
"ce_loss_39": 3.866461306810379,
"ce_loss_52": 1.4406520485877992,
"ce_loss_7": 5.254658281803131,
"epoch": 0.123,
"grad_norm": 25.60032726683598,
"kl_loss_13": 7019.2,
"kl_loss_26": 5276.8,
"kl_loss_39": 4864.0,
"kl_loss_7": 7809.6,
"learning_rate": 0.0009681970319895803,
"loss": 12358.8,
"step": 1230
},
{
"ce_loss_13": 4.856750476360321,
"ce_loss_26": 4.064332664012909,
"ce_loss_39": 3.865360552072525,
"ce_loss_52": 1.4734540313482285,
"ce_loss_7": 5.232936894893646,
"epoch": 0.124,
"grad_norm": 27.54111241294886,
"kl_loss_13": 6928.8,
"kl_loss_26": 5224.8,
"kl_loss_39": 4806.4,
"kl_loss_7": 7701.6,
"learning_rate": 0.0009676378356149733,
"loss": 12219.2,
"step": 1240
},
{
"ce_loss_13": 4.714985811710358,
"ce_loss_26": 3.8911590695381166,
"ce_loss_39": 3.6897071480751036,
"ce_loss_52": 1.4211883068084716,
"ce_loss_7": 5.092134392261505,
"epoch": 0.125,
"grad_norm": 26.159823932609697,
"kl_loss_13": 6763.2,
"kl_loss_26": 5001.6,
"kl_loss_39": 4584.8,
"kl_loss_7": 7549.6,
"learning_rate": 0.0009670739301325534,
"loss": 12043.8,
"step": 1250
},
{
"ce_loss_13": 4.746155381202698,
"ce_loss_26": 3.9151339173316955,
"ce_loss_39": 3.717781513929367,
"ce_loss_52": 1.3889067679643632,
"ce_loss_7": 5.118369615077972,
"epoch": 0.126,
"grad_norm": 27.210751367526278,
"kl_loss_13": 6825.6,
"kl_loss_26": 5060.0,
"kl_loss_39": 4660.0,
"kl_loss_7": 7612.0,
"learning_rate": 0.0009665053212208426,
"loss": 12020.6,
"step": 1260
},
{
"ce_loss_13": 4.729644465446472,
"ce_loss_26": 3.8812398612499237,
"ce_loss_39": 3.686249554157257,
"ce_loss_52": 1.421858811378479,
"ce_loss_7": 5.109574091434479,
"epoch": 0.127,
"grad_norm": 24.75325655489569,
"kl_loss_13": 6772.8,
"kl_loss_26": 4955.6,
"kl_loss_39": 4553.2,
"kl_loss_7": 7572.8,
"learning_rate": 0.0009659320146057262,
"loss": 11949.0,
"step": 1270
},
{
"ce_loss_13": 4.706896722316742,
"ce_loss_26": 3.884850525856018,
"ce_loss_39": 3.679063153266907,
"ce_loss_52": 1.4093473598361015,
"ce_loss_7": 5.0908261895179745,
"epoch": 0.128,
"grad_norm": 25.78294847436066,
"kl_loss_13": 6728.0,
"kl_loss_26": 4984.8,
"kl_loss_39": 4562.4,
"kl_loss_7": 7525.6,
"learning_rate": 0.0009653540160603955,
"loss": 11920.8,
"step": 1280
},
{
"ce_loss_13": 4.68510691523552,
"ce_loss_26": 3.886442297697067,
"ce_loss_39": 3.682328295707703,
"ce_loss_52": 1.4646018967032433,
"ce_loss_7": 5.063843679428101,
"epoch": 0.129,
"grad_norm": 24.870529388647757,
"kl_loss_13": 6587.2,
"kl_loss_26": 4871.2,
"kl_loss_39": 4451.6,
"kl_loss_7": 7384.8,
"learning_rate": 0.0009647713314052896,
"loss": 11716.1,
"step": 1290
},
{
"ce_loss_13": 4.6914472579956055,
"ce_loss_26": 3.8814328253269195,
"ce_loss_39": 3.6700519025325775,
"ce_loss_52": 1.428275752067566,
"ce_loss_7": 5.065358865261078,
"epoch": 0.13,
"grad_norm": 24.506148720784267,
"kl_loss_13": 6617.6,
"kl_loss_26": 4886.4,
"kl_loss_39": 4454.8,
"kl_loss_7": 7397.6,
"learning_rate": 0.0009641839665080363,
"loss": 11621.0,
"step": 1300
},
{
"ce_loss_13": 4.666101861000061,
"ce_loss_26": 3.8607113540172575,
"ce_loss_39": 3.6500234425067903,
"ce_loss_52": 1.4570627421140672,
"ce_loss_7": 5.0361028671264645,
"epoch": 0.131,
"grad_norm": 23.043331928969636,
"kl_loss_13": 6562.4,
"kl_loss_26": 4820.0,
"kl_loss_39": 4389.2,
"kl_loss_7": 7340.0,
"learning_rate": 0.0009635919272833937,
"loss": 11547.2,
"step": 1310
},
{
"ce_loss_13": 4.57597508430481,
"ce_loss_26": 3.757897812128067,
"ce_loss_39": 3.541293317079544,
"ce_loss_52": 1.417707359790802,
"ce_loss_7": 4.957513523101807,
"epoch": 0.132,
"grad_norm": 23.874048124377556,
"kl_loss_13": 6413.6,
"kl_loss_26": 4680.0,
"kl_loss_39": 4232.4,
"kl_loss_7": 7214.4,
"learning_rate": 0.0009629952196931902,
"loss": 11465.6,
"step": 1320
},
{
"ce_loss_13": 4.599424958229065,
"ce_loss_26": 3.7814753651618958,
"ce_loss_39": 3.5756381869316103,
"ce_loss_52": 1.435066069662571,
"ce_loss_7": 4.962808167934417,
"epoch": 0.133,
"grad_norm": 27.163150075748632,
"kl_loss_13": 6461.6,
"kl_loss_26": 4719.2,
"kl_loss_39": 4284.8,
"kl_loss_7": 7222.4,
"learning_rate": 0.0009623938497462645,
"loss": 11415.0,
"step": 1330
},
{
"ce_loss_13": 4.593182015419006,
"ce_loss_26": 3.749741864204407,
"ce_loss_39": 3.543072348833084,
"ce_loss_52": 1.4210506305098534,
"ce_loss_7": 4.98070273399353,
"epoch": 0.134,
"grad_norm": 23.74960078438379,
"kl_loss_13": 6465.6,
"kl_loss_26": 4674.0,
"kl_loss_39": 4258.0,
"kl_loss_7": 7272.8,
"learning_rate": 0.0009617878234984055,
"loss": 11286.2,
"step": 1340
},
{
"ce_loss_13": 4.5947358965873715,
"ce_loss_26": 3.779453754425049,
"ce_loss_39": 3.554258805513382,
"ce_loss_52": 1.440669310092926,
"ce_loss_7": 4.96803480386734,
"epoch": 0.135,
"grad_norm": 24.32368060127727,
"kl_loss_13": 6402.4,
"kl_loss_26": 4660.8,
"kl_loss_39": 4202.0,
"kl_loss_7": 7178.4,
"learning_rate": 0.0009611771470522907,
"loss": 11138.4,
"step": 1350
},
{
"ce_loss_13": 4.540663009881973,
"ce_loss_26": 3.7292558193206786,
"ce_loss_39": 3.5131251573562623,
"ce_loss_52": 1.4146902561187744,
"ce_loss_7": 4.918925869464874,
"epoch": 0.136,
"grad_norm": 23.85034387543973,
"kl_loss_13": 6383.2,
"kl_loss_26": 4642.0,
"kl_loss_39": 4189.2,
"kl_loss_7": 7181.6,
"learning_rate": 0.0009605618265574251,
"loss": 11195.2,
"step": 1360
},
{
"ce_loss_13": 4.595166695117951,
"ce_loss_26": 3.7860535979270935,
"ce_loss_39": 3.5680083632469177,
"ce_loss_52": 1.4854394227266312,
"ce_loss_7": 4.9739551663398744,
"epoch": 0.137,
"grad_norm": 23.647990920122997,
"kl_loss_13": 6340.0,
"kl_loss_26": 4612.0,
"kl_loss_39": 4162.0,
"kl_loss_7": 7127.2,
"learning_rate": 0.0009599418682100792,
"loss": 11028.6,
"step": 1370
},
{
"ce_loss_13": 4.481674873828888,
"ce_loss_26": 3.655723828077316,
"ce_loss_39": 3.4441386282444,
"ce_loss_52": 1.402228906750679,
"ce_loss_7": 4.8588902950286865,
"epoch": 0.138,
"grad_norm": 23.628756362977956,
"kl_loss_13": 6284.8,
"kl_loss_26": 4506.8,
"kl_loss_39": 4074.4,
"kl_loss_7": 7073.6,
"learning_rate": 0.0009593172782532268,
"loss": 10976.4,
"step": 1380
},
{
"ce_loss_13": 4.446731185913086,
"ce_loss_26": 3.6488849222660065,
"ce_loss_39": 3.4283725559711455,
"ce_loss_52": 1.425352481007576,
"ce_loss_7": 4.823254930973053,
"epoch": 0.139,
"grad_norm": 23.454937110465252,
"kl_loss_13": 6190.4,
"kl_loss_26": 4472.0,
"kl_loss_39": 4019.6,
"kl_loss_7": 6972.0,
"learning_rate": 0.0009586880629764817,
"loss": 10856.2,
"step": 1390
},
{
"ce_loss_13": 4.481068539619446,
"ce_loss_26": 3.6307739317417145,
"ce_loss_39": 3.4139047265052795,
"ce_loss_52": 1.3980510637164116,
"ce_loss_7": 4.877132707834244,
"epoch": 0.14,
"grad_norm": 24.701369227992412,
"kl_loss_13": 6300.0,
"kl_loss_26": 4496.4,
"kl_loss_39": 4045.6,
"kl_loss_7": 7120.0,
"learning_rate": 0.0009580542287160348,
"loss": 10848.4,
"step": 1400
},
{
"ce_loss_13": 4.481135439872742,
"ce_loss_26": 3.684358072280884,
"ce_loss_39": 3.454758608341217,
"ce_loss_52": 1.459119439125061,
"ce_loss_7": 4.855645072460175,
"epoch": 0.141,
"grad_norm": 24.081142665128635,
"kl_loss_13": 6146.4,
"kl_loss_26": 4455.2,
"kl_loss_39": 3985.2,
"kl_loss_7": 6936.0,
"learning_rate": 0.0009574157818545901,
"loss": 10711.8,
"step": 1410
},
{
"ce_loss_13": 4.440946173667908,
"ce_loss_26": 3.6337361335754395,
"ce_loss_39": 3.404258185625076,
"ce_loss_52": 1.411030687391758,
"ce_loss_7": 4.816100597381592,
"epoch": 0.142,
"grad_norm": 22.547443199755673,
"kl_loss_13": 6183.2,
"kl_loss_26": 4461.2,
"kl_loss_39": 3991.6,
"kl_loss_7": 6976.0,
"learning_rate": 0.0009567727288213005,
"loss": 10724.8,
"step": 1420
},
{
"ce_loss_13": 4.457812869548798,
"ce_loss_26": 3.6864565014839172,
"ce_loss_39": 3.452153670787811,
"ce_loss_52": 1.4788728266954423,
"ce_loss_7": 4.832965791225433,
"epoch": 0.143,
"grad_norm": 22.827900847688525,
"kl_loss_13": 6096.8,
"kl_loss_26": 4430.4,
"kl_loss_39": 3959.2,
"kl_loss_7": 6881.6,
"learning_rate": 0.0009561250760917027,
"loss": 10616.2,
"step": 1430
},
{
"ce_loss_13": 4.386147284507752,
"ce_loss_26": 3.586784356832504,
"ce_loss_39": 3.356312555074692,
"ce_loss_52": 1.4125748693943023,
"ce_loss_7": 4.7688825011253355,
"epoch": 0.144,
"grad_norm": 22.45947089806503,
"kl_loss_13": 6064.0,
"kl_loss_26": 4354.0,
"kl_loss_39": 3886.8,
"kl_loss_7": 6859.2,
"learning_rate": 0.0009554728301876525,
"loss": 10473.0,
"step": 1440
},
{
"ce_loss_13": 4.411167800426483,
"ce_loss_26": 3.590187501907349,
"ce_loss_39": 3.3659981071949003,
"ce_loss_52": 1.4227360993623734,
"ce_loss_7": 4.80015469789505,
"epoch": 0.145,
"grad_norm": 22.134471970609756,
"kl_loss_13": 6080.0,
"kl_loss_26": 4321.2,
"kl_loss_39": 3868.4,
"kl_loss_7": 6892.0,
"learning_rate": 0.0009548159976772592,
"loss": 10449.2,
"step": 1450
},
{
"ce_loss_13": 4.304250085353852,
"ce_loss_26": 3.5215473413467406,
"ce_loss_39": 3.3020472466945647,
"ce_loss_52": 1.455188724398613,
"ce_loss_7": 4.682382225990295,
"epoch": 0.146,
"grad_norm": 23.296949813068704,
"kl_loss_13": 5828.8,
"kl_loss_26": 4153.6,
"kl_loss_39": 3701.2,
"kl_loss_7": 6626.4,
"learning_rate": 0.0009541545851748186,
"loss": 10336.2,
"step": 1460
},
{
"ce_loss_13": 4.340453952550888,
"ce_loss_26": 3.527716559171677,
"ce_loss_39": 3.297034960985184,
"ce_loss_52": 1.4175047695636749,
"ce_loss_7": 4.720600801706314,
"epoch": 0.147,
"grad_norm": 23.69989338145452,
"kl_loss_13": 5933.6,
"kl_loss_26": 4197.2,
"kl_loss_39": 3732.0,
"kl_loss_7": 6732.8,
"learning_rate": 0.0009534885993407473,
"loss": 10320.4,
"step": 1470
},
{
"ce_loss_13": 4.316350519657135,
"ce_loss_26": 3.522084206342697,
"ce_loss_39": 3.293704879283905,
"ce_loss_52": 1.4351435631513596,
"ce_loss_7": 4.694415915012359,
"epoch": 0.148,
"grad_norm": 23.155946924290426,
"kl_loss_13": 5859.2,
"kl_loss_26": 4177.2,
"kl_loss_39": 3708.0,
"kl_loss_7": 6649.6,
"learning_rate": 0.0009528180468815154,
"loss": 10227.4,
"step": 1480
},
{
"ce_loss_13": 4.34768191576004,
"ce_loss_26": 3.5612153470516206,
"ce_loss_39": 3.3256225168704985,
"ce_loss_52": 1.4718306064605713,
"ce_loss_7": 4.73325879573822,
"epoch": 0.149,
"grad_norm": 22.828288355166595,
"kl_loss_13": 5840.8,
"kl_loss_26": 4173.2,
"kl_loss_39": 3699.2,
"kl_loss_7": 6654.4,
"learning_rate": 0.0009521429345495787,
"loss": 10213.0,
"step": 1490
},
{
"ce_loss_13": 4.285507726669311,
"ce_loss_26": 3.494914507865906,
"ce_loss_39": 3.2638841211795806,
"ce_loss_52": 1.4394250243902207,
"ce_loss_7": 4.674430012702942,
"epoch": 0.15,
"grad_norm": 22.358102006612796,
"kl_loss_13": 5802.4,
"kl_loss_26": 4114.4,
"kl_loss_39": 3639.2,
"kl_loss_7": 6619.2,
"learning_rate": 0.0009514632691433108,
"loss": 10144.0,
"step": 1500
},
{
"ce_loss_13": 4.274980753660202,
"ce_loss_26": 3.4573559522628785,
"ce_loss_39": 3.2235658168792725,
"ce_loss_52": 1.4003299355506897,
"ce_loss_7": 4.676058840751648,
"epoch": 0.151,
"grad_norm": 21.54131953317247,
"kl_loss_13": 5876.0,
"kl_loss_26": 4120.4,
"kl_loss_39": 3643.6,
"kl_loss_7": 6711.2,
"learning_rate": 0.0009507790575069346,
"loss": 10084.8,
"step": 1510
},
{
"ce_loss_13": 4.239670622348785,
"ce_loss_26": 3.4528944075107573,
"ce_loss_39": 3.221757102012634,
"ce_loss_52": 1.4378845229744912,
"ce_loss_7": 4.625989091396332,
"epoch": 0.152,
"grad_norm": 20.883193615641517,
"kl_loss_13": 5700.0,
"kl_loss_26": 4031.2,
"kl_loss_39": 3560.0,
"kl_loss_7": 6514.4,
"learning_rate": 0.0009500903065304539,
"loss": 9981.0,
"step": 1520
},
{
"ce_loss_13": 4.250718909502029,
"ce_loss_26": 3.4539793133735657,
"ce_loss_39": 3.2287534534931184,
"ce_loss_52": 1.450673970580101,
"ce_loss_7": 4.637939321994781,
"epoch": 0.153,
"grad_norm": 21.937587942461658,
"kl_loss_13": 5717.6,
"kl_loss_26": 4016.0,
"kl_loss_39": 3551.6,
"kl_loss_7": 6520.0,
"learning_rate": 0.0009493970231495835,
"loss": 9886.2,
"step": 1530
},
{
"ce_loss_13": 4.213818311691284,
"ce_loss_26": 3.4225172460079194,
"ce_loss_39": 3.192109799385071,
"ce_loss_52": 1.4253905564546585,
"ce_loss_7": 4.591876769065857,
"epoch": 0.154,
"grad_norm": 22.11614774484019,
"kl_loss_13": 5669.6,
"kl_loss_26": 3997.6,
"kl_loss_39": 3517.6,
"kl_loss_7": 6459.2,
"learning_rate": 0.0009486992143456792,
"loss": 9848.6,
"step": 1540
},
{
"ce_loss_13": 4.188841539621353,
"ce_loss_26": 3.3936797797679903,
"ce_loss_39": 3.15527623295784,
"ce_loss_52": 1.4304118230938911,
"ce_loss_7": 4.582345807552338,
"epoch": 0.155,
"grad_norm": 23.528038258157196,
"kl_loss_13": 5588.0,
"kl_loss_26": 3908.0,
"kl_loss_39": 3429.6,
"kl_loss_7": 6408.8,
"learning_rate": 0.0009479968871456679,
"loss": 9804.0,
"step": 1550
},
{
"ce_loss_13": 4.199311399459839,
"ce_loss_26": 3.388694739341736,
"ce_loss_39": 3.161331224441528,
"ce_loss_52": 1.424024721980095,
"ce_loss_7": 4.598053079843521,
"epoch": 0.156,
"grad_norm": 20.794486798088236,
"kl_loss_13": 5643.2,
"kl_loss_26": 3922.0,
"kl_loss_39": 3462.8,
"kl_loss_7": 6473.6,
"learning_rate": 0.0009472900486219768,
"loss": 9758.3,
"step": 1560
},
{
"ce_loss_13": 4.16922065615654,
"ce_loss_26": 3.3762763381004333,
"ce_loss_39": 3.142381691932678,
"ce_loss_52": 1.4278530597686767,
"ce_loss_7": 4.5707217931747435,
"epoch": 0.157,
"grad_norm": 21.684568107604626,
"kl_loss_13": 5582.4,
"kl_loss_26": 3884.4,
"kl_loss_39": 3405.2,
"kl_loss_7": 6420.0,
"learning_rate": 0.000946578705892462,
"loss": 9625.8,
"step": 1570
},
{
"ce_loss_13": 4.178904807567596,
"ce_loss_26": 3.382037007808685,
"ce_loss_39": 3.137607681751251,
"ce_loss_52": 1.4311140328645706,
"ce_loss_7": 4.566913962364197,
"epoch": 0.158,
"grad_norm": 21.97647697577533,
"kl_loss_13": 5572.8,
"kl_loss_26": 3885.6,
"kl_loss_39": 3393.6,
"kl_loss_7": 6394.4,
"learning_rate": 0.0009458628661203367,
"loss": 9608.1,
"step": 1580
},
{
"ce_loss_13": 4.179209893941879,
"ce_loss_26": 3.378256690502167,
"ce_loss_39": 3.13810538649559,
"ce_loss_52": 1.418858152627945,
"ce_loss_7": 4.562736237049103,
"epoch": 0.159,
"grad_norm": 20.949460387099393,
"kl_loss_13": 5612.0,
"kl_loss_26": 3914.4,
"kl_loss_39": 3428.8,
"kl_loss_7": 6420.0,
"learning_rate": 0.0009451425365140996,
"loss": 9608.8,
"step": 1590
},
{
"ce_loss_13": 4.1623717725276945,
"ce_loss_26": 3.376670056581497,
"ce_loss_39": 3.1331222474575045,
"ce_loss_52": 1.434694454073906,
"ce_loss_7": 4.556069934368134,
"epoch": 0.16,
"grad_norm": 20.884587914746188,
"kl_loss_13": 5573.6,
"kl_loss_26": 3878.8,
"kl_loss_39": 3382.4,
"kl_loss_7": 6402.4,
"learning_rate": 0.0009444177243274617,
"loss": 9535.6,
"step": 1600
},
{
"ce_loss_13": 4.082578724622726,
"ce_loss_26": 3.289813929796219,
"ce_loss_39": 3.0515677452087404,
"ce_loss_52": 1.4236672833561896,
"ce_loss_7": 4.478320574760437,
"epoch": 0.161,
"grad_norm": 20.592533194999756,
"kl_loss_13": 5423.2,
"kl_loss_26": 3736.8,
"kl_loss_39": 3249.6,
"kl_loss_7": 6253.6,
"learning_rate": 0.0009436884368592739,
"loss": 9466.0,
"step": 1610
},
{
"ce_loss_13": 4.142659282684326,
"ce_loss_26": 3.3751652896404267,
"ce_loss_39": 3.137872564792633,
"ce_loss_52": 1.481352651119232,
"ce_loss_7": 4.527895116806031,
"epoch": 0.162,
"grad_norm": 21.486710542968336,
"kl_loss_13": 5416.8,
"kl_loss_26": 3772.8,
"kl_loss_39": 3282.8,
"kl_loss_7": 6228.8,
"learning_rate": 0.0009429546814534529,
"loss": 9367.9,
"step": 1620
},
{
"ce_loss_13": 4.141797959804535,
"ce_loss_26": 3.344935214519501,
"ce_loss_39": 3.1015843570232393,
"ce_loss_52": 1.4449981674551964,
"ce_loss_7": 4.53586882352829,
"epoch": 0.163,
"grad_norm": 22.257565389083407,
"kl_loss_13": 5484.8,
"kl_loss_26": 3776.8,
"kl_loss_39": 3282.8,
"kl_loss_7": 6309.6,
"learning_rate": 0.0009422164654989072,
"loss": 9391.3,
"step": 1630
},
{
"ce_loss_13": 4.131260120868683,
"ce_loss_26": 3.3283946096897123,
"ce_loss_39": 3.08265677690506,
"ce_loss_52": 1.446463230252266,
"ce_loss_7": 4.523310673236847,
"epoch": 0.164,
"grad_norm": 20.40514368262374,
"kl_loss_13": 5457.6,
"kl_loss_26": 3766.0,
"kl_loss_39": 3265.6,
"kl_loss_7": 6288.8,
"learning_rate": 0.0009414737964294635,
"loss": 9297.4,
"step": 1640
},
{
"ce_loss_13": 4.055157667398452,
"ce_loss_26": 3.2639363288879393,
"ce_loss_39": 3.0281569600105285,
"ce_loss_52": 1.451804205775261,
"ce_loss_7": 4.447070962190628,
"epoch": 0.165,
"grad_norm": 21.902920394223948,
"kl_loss_13": 5323.2,
"kl_loss_26": 3634.0,
"kl_loss_39": 3150.0,
"kl_loss_7": 6143.2,
"learning_rate": 0.000940726681723791,
"loss": 9207.4,
"step": 1650
},
{
"ce_loss_13": 3.9808266043663023,
"ce_loss_26": 3.1883151113986967,
"ce_loss_39": 2.9586188077926634,
"ce_loss_52": 1.4088758006691933,
"ce_loss_7": 4.372277349233627,
"epoch": 0.166,
"grad_norm": 21.183658337296244,
"kl_loss_13": 5256.0,
"kl_loss_26": 3567.6,
"kl_loss_39": 3100.8,
"kl_loss_7": 6079.2,
"learning_rate": 0.0009399751289053266,
"loss": 9204.0,
"step": 1660
},
{
"ce_loss_13": 4.0190062642097475,
"ce_loss_26": 3.22450470328331,
"ce_loss_39": 2.987401658296585,
"ce_loss_52": 1.3997518077492714,
"ce_loss_7": 4.41674884557724,
"epoch": 0.167,
"grad_norm": 21.809854839151214,
"kl_loss_13": 5346.4,
"kl_loss_26": 3648.8,
"kl_loss_39": 3164.8,
"kl_loss_7": 6175.2,
"learning_rate": 0.0009392191455421988,
"loss": 9183.3,
"step": 1670
},
{
"ce_loss_13": 3.9647205591201784,
"ce_loss_26": 3.1920640766620636,
"ce_loss_39": 2.951520323753357,
"ce_loss_52": 1.385107731819153,
"ce_loss_7": 4.359467995166779,
"epoch": 0.168,
"grad_norm": 20.368238654152925,
"kl_loss_13": 5256.8,
"kl_loss_26": 3602.8,
"kl_loss_39": 3118.0,
"kl_loss_7": 6080.8,
"learning_rate": 0.0009384587392471515,
"loss": 9080.3,
"step": 1680
},
{
"ce_loss_13": 3.9925671815872192,
"ce_loss_26": 3.2080724120140074,
"ce_loss_39": 2.9695263385772703,
"ce_loss_52": 1.4156971365213393,
"ce_loss_7": 4.380452990531921,
"epoch": 0.169,
"grad_norm": 20.999177946491915,
"kl_loss_13": 5268.0,
"kl_loss_26": 3596.4,
"kl_loss_39": 3100.8,
"kl_loss_7": 6080.8,
"learning_rate": 0.0009376939176774678,
"loss": 8989.5,
"step": 1690
},
{
"ce_loss_13": 4.0180779755115505,
"ce_loss_26": 3.258928191661835,
"ce_loss_39": 3.0135749876499176,
"ce_loss_52": 1.450287464261055,
"ce_loss_7": 4.408803248405457,
"epoch": 0.17,
"grad_norm": 19.79885149243747,
"kl_loss_13": 5224.8,
"kl_loss_26": 3590.8,
"kl_loss_39": 3092.0,
"kl_loss_7": 6036.8,
"learning_rate": 0.0009369246885348925,
"loss": 8994.3,
"step": 1700
},
{
"ce_loss_13": 4.005008333921433,
"ce_loss_26": 3.2108667314052584,
"ce_loss_39": 2.9691853642463686,
"ce_loss_52": 1.4231860041618347,
"ce_loss_7": 4.408407872915268,
"epoch": 0.171,
"grad_norm": 20.19997348661953,
"kl_loss_13": 5275.2,
"kl_loss_26": 3580.8,
"kl_loss_39": 3088.8,
"kl_loss_7": 6116.8,
"learning_rate": 0.0009361510595655545,
"loss": 9032.7,
"step": 1710
},
{
"ce_loss_13": 4.0295430123806,
"ce_loss_26": 3.269349628686905,
"ce_loss_39": 3.0168069303035736,
"ce_loss_52": 1.4558824241161346,
"ce_loss_7": 4.418658912181854,
"epoch": 0.172,
"grad_norm": 20.29063277895484,
"kl_loss_13": 5251.2,
"kl_loss_26": 3624.8,
"kl_loss_39": 3109.2,
"kl_loss_7": 6066.4,
"learning_rate": 0.0009353730385598887,
"loss": 8917.6,
"step": 1720
},
{
"ce_loss_13": 3.904751992225647,
"ce_loss_26": 3.117345708608627,
"ce_loss_39": 2.8736896753311156,
"ce_loss_52": 1.404754376411438,
"ce_loss_7": 4.301223260164261,
"epoch": 0.173,
"grad_norm": 21.33168840347063,
"kl_loss_13": 5095.2,
"kl_loss_26": 3414.0,
"kl_loss_39": 2926.0,
"kl_loss_7": 5920.8,
"learning_rate": 0.0009345906333525581,
"loss": 8827.0,
"step": 1730
},
{
"ce_loss_13": 3.943844336271286,
"ce_loss_26": 3.192184156179428,
"ce_loss_39": 2.938348424434662,
"ce_loss_52": 1.4280656158924103,
"ce_loss_7": 4.340347635746002,
"epoch": 0.174,
"grad_norm": 20.78517763533083,
"kl_loss_13": 5121.6,
"kl_loss_26": 3512.0,
"kl_loss_39": 2997.2,
"kl_loss_7": 5951.2,
"learning_rate": 0.0009338038518223745,
"loss": 8776.4,
"step": 1740
},
{
"ce_loss_13": 3.9746175587177275,
"ce_loss_26": 3.217360532283783,
"ce_loss_39": 2.9738565742969514,
"ce_loss_52": 1.460440719127655,
"ce_loss_7": 4.36298366189003,
"epoch": 0.175,
"grad_norm": 22.95282400415446,
"kl_loss_13": 5116.0,
"kl_loss_26": 3500.0,
"kl_loss_39": 2996.0,
"kl_loss_7": 5937.6,
"learning_rate": 0.0009330127018922195,
"loss": 8715.7,
"step": 1750
},
{
"ce_loss_13": 3.903409707546234,
"ce_loss_26": 3.1333308279514314,
"ce_loss_39": 2.8973484218120573,
"ce_loss_52": 1.4389021694660187,
"ce_loss_7": 4.297668445110321,
"epoch": 0.176,
"grad_norm": 19.71888822868113,
"kl_loss_13": 5043.2,
"kl_loss_26": 3399.6,
"kl_loss_39": 2906.0,
"kl_loss_7": 5868.8,
"learning_rate": 0.0009322171915289634,
"loss": 8660.5,
"step": 1760
},
{
"ce_loss_13": 3.9436737656593324,
"ce_loss_26": 3.180408328771591,
"ce_loss_39": 2.933422142267227,
"ce_loss_52": 1.468785560131073,
"ce_loss_7": 4.335923504829407,
"epoch": 0.177,
"grad_norm": 21.100233895265884,
"kl_loss_13": 5036.8,
"kl_loss_26": 3416.8,
"kl_loss_39": 2893.6,
"kl_loss_7": 5853.6,
"learning_rate": 0.0009314173287433873,
"loss": 8685.1,
"step": 1770
},
{
"ce_loss_13": 4.000654596090317,
"ce_loss_26": 3.24809735417366,
"ce_loss_39": 2.9819031238555906,
"ce_loss_52": 1.4765232503414154,
"ce_loss_7": 4.392659711837768,
"epoch": 0.178,
"grad_norm": 20.346113365710192,
"kl_loss_13": 5141.6,
"kl_loss_26": 3518.0,
"kl_loss_39": 2990.4,
"kl_loss_7": 5964.0,
"learning_rate": 0.0009306131215901003,
"loss": 8673.2,
"step": 1780
},
{
"ce_loss_13": 3.9246813535690306,
"ce_loss_26": 3.1745048224925996,
"ce_loss_39": 2.9245960414409637,
"ce_loss_52": 1.4693263441324234,
"ce_loss_7": 4.317492133378982,
"epoch": 0.179,
"grad_norm": 19.397137651046872,
"kl_loss_13": 5018.4,
"kl_loss_26": 3389.2,
"kl_loss_39": 2878.0,
"kl_loss_7": 5844.8,
"learning_rate": 0.0009298045781674596,
"loss": 8564.1,
"step": 1790
},
{
"ce_loss_13": 3.910848397016525,
"ce_loss_26": 3.146669828891754,
"ce_loss_39": 2.8859269857406615,
"ce_loss_52": 1.419717761874199,
"ce_loss_7": 4.3105459094047545,
"epoch": 0.18,
"grad_norm": 19.504781875531744,
"kl_loss_13": 5040.0,
"kl_loss_26": 3411.2,
"kl_loss_39": 2894.0,
"kl_loss_7": 5888.0,
"learning_rate": 0.0009289917066174886,
"loss": 8563.1,
"step": 1800
},
{
"ce_loss_13": 3.886237096786499,
"ce_loss_26": 3.0992377579212187,
"ce_loss_39": 2.8514576256275177,
"ce_loss_52": 1.4184779956936837,
"ce_loss_7": 4.274789291620254,
"epoch": 0.181,
"grad_norm": 19.60587995762198,
"kl_loss_13": 5042.4,
"kl_loss_26": 3362.0,
"kl_loss_39": 2856.4,
"kl_loss_7": 5855.2,
"learning_rate": 0.0009281745151257945,
"loss": 8453.1,
"step": 1810
},
{
"ce_loss_13": 3.9144074499607084,
"ce_loss_26": 3.1662435114383696,
"ce_loss_39": 2.9089259922504427,
"ce_loss_52": 1.478528293967247,
"ce_loss_7": 4.301838612556457,
"epoch": 0.182,
"grad_norm": 19.79968756805323,
"kl_loss_13": 4923.2,
"kl_loss_26": 3333.2,
"kl_loss_39": 2811.6,
"kl_loss_7": 5736.8,
"learning_rate": 0.0009273530119214868,
"loss": 8471.9,
"step": 1820
},
{
"ce_loss_13": 3.8238776862621306,
"ce_loss_26": 3.058783656358719,
"ce_loss_39": 2.807573360204697,
"ce_loss_52": 1.4178009316325189,
"ce_loss_7": 4.227353280782699,
"epoch": 0.183,
"grad_norm": 19.471751859426742,
"kl_loss_13": 4899.2,
"kl_loss_26": 3273.6,
"kl_loss_39": 2762.8,
"kl_loss_7": 5738.4,
"learning_rate": 0.0009265272052770935,
"loss": 8399.4,
"step": 1830
},
{
"ce_loss_13": 3.836485821008682,
"ce_loss_26": 3.0666845202445985,
"ce_loss_39": 2.818044346570969,
"ce_loss_52": 1.410616011917591,
"ce_loss_7": 4.236015152931214,
"epoch": 0.184,
"grad_norm": 19.102126667670856,
"kl_loss_13": 4940.0,
"kl_loss_26": 3286.0,
"kl_loss_39": 2780.4,
"kl_loss_7": 5788.0,
"learning_rate": 0.0009256971035084784,
"loss": 8347.7,
"step": 1840
},
{
"ce_loss_13": 3.8049618661403657,
"ce_loss_26": 3.0603095471858976,
"ce_loss_39": 2.8144713938236237,
"ce_loss_52": 1.4259676218032837,
"ce_loss_7": 4.199950724840164,
"epoch": 0.185,
"grad_norm": 19.382557477849222,
"kl_loss_13": 4835.6,
"kl_loss_26": 3257.6,
"kl_loss_39": 2762.0,
"kl_loss_7": 5657.6,
"learning_rate": 0.0009248627149747573,
"loss": 8313.5,
"step": 1850
},
{
"ce_loss_13": 3.839466482400894,
"ce_loss_26": 3.065267437696457,
"ce_loss_39": 2.81703776717186,
"ce_loss_52": 1.4297384396195412,
"ce_loss_7": 4.240725481510163,
"epoch": 0.186,
"grad_norm": 20.053275602042234,
"kl_loss_13": 4908.4,
"kl_loss_26": 3269.6,
"kl_loss_39": 2759.6,
"kl_loss_7": 5741.6,
"learning_rate": 0.0009240240480782129,
"loss": 8305.0,
"step": 1860
},
{
"ce_loss_13": 3.8183856308460236,
"ce_loss_26": 3.070716941356659,
"ce_loss_39": 2.807391846179962,
"ce_loss_52": 1.4359196320176124,
"ce_loss_7": 4.214242458343506,
"epoch": 0.187,
"grad_norm": 19.191079413729856,
"kl_loss_13": 4847.2,
"kl_loss_26": 3263.6,
"kl_loss_39": 2733.2,
"kl_loss_7": 5672.8,
"learning_rate": 0.0009231811112642122,
"loss": 8227.6,
"step": 1870
},
{
"ce_loss_13": 3.779190129041672,
"ce_loss_26": 3.0389082789421082,
"ce_loss_39": 2.783122771978378,
"ce_loss_52": 1.4208515673875808,
"ce_loss_7": 4.162911784648895,
"epoch": 0.188,
"grad_norm": 20.43241639662848,
"kl_loss_13": 4795.2,
"kl_loss_26": 3215.2,
"kl_loss_39": 2693.2,
"kl_loss_7": 5606.4,
"learning_rate": 0.0009223339130211192,
"loss": 8213.8,
"step": 1880
},
{
"ce_loss_13": 3.708034944534302,
"ce_loss_26": 2.9702564030885696,
"ce_loss_39": 2.735187420248985,
"ce_loss_52": 1.409775149822235,
"ce_loss_7": 4.099964368343353,
"epoch": 0.189,
"grad_norm": 19.723419677891812,
"kl_loss_13": 4691.2,
"kl_loss_26": 3118.0,
"kl_loss_39": 2625.8,
"kl_loss_7": 5516.0,
"learning_rate": 0.0009214824618802108,
"loss": 8146.0,
"step": 1890
},
{
"ce_loss_13": 3.848232001066208,
"ce_loss_26": 3.06461501121521,
"ce_loss_39": 2.81026514172554,
"ce_loss_52": 1.4419742107391358,
"ce_loss_7": 4.244399529695511,
"epoch": 0.19,
"grad_norm": 21.429961437314283,
"kl_loss_13": 4923.2,
"kl_loss_26": 3246.4,
"kl_loss_39": 2725.6,
"kl_loss_7": 5755.2,
"learning_rate": 0.0009206267664155906,
"loss": 8168.2,
"step": 1900
},
{
"ce_loss_13": 3.74611656665802,
"ce_loss_26": 2.995819491147995,
"ce_loss_39": 2.7504830598831176,
"ce_loss_52": 1.429488417506218,
"ce_loss_7": 4.137593048810959,
"epoch": 0.191,
"grad_norm": 20.703417767001277,
"kl_loss_13": 4708.8,
"kl_loss_26": 3114.8,
"kl_loss_39": 2616.8,
"kl_loss_7": 5532.0,
"learning_rate": 0.0009197668352441024,
"loss": 8113.4,
"step": 1910
},
{
"ce_loss_13": 3.7616052985191346,
"ce_loss_26": 3.0063544154167174,
"ce_loss_39": 2.7453058779239656,
"ce_loss_52": 1.4119072929024696,
"ce_loss_7": 4.15754896402359,
"epoch": 0.192,
"grad_norm": 19.851460642991412,
"kl_loss_13": 4782.4,
"kl_loss_26": 3173.6,
"kl_loss_39": 2644.0,
"kl_loss_7": 5609.6,
"learning_rate": 0.0009189026770252437,
"loss": 8087.1,
"step": 1920
},
{
"ce_loss_13": 3.7914208650588987,
"ce_loss_26": 3.0324925601482393,
"ce_loss_39": 2.7728191137313845,
"ce_loss_52": 1.4355733066797256,
"ce_loss_7": 4.178089827299118,
"epoch": 0.193,
"grad_norm": 19.2014881795966,
"kl_loss_13": 4800.8,
"kl_loss_26": 3194.0,
"kl_loss_39": 2659.2,
"kl_loss_7": 5608.8,
"learning_rate": 0.000918034300461078,
"loss": 8051.8,
"step": 1930
},
{
"ce_loss_13": 3.7237455368041994,
"ce_loss_26": 2.979181283712387,
"ce_loss_39": 2.7309202194213866,
"ce_loss_52": 1.4160432904958724,
"ce_loss_7": 4.128066539764404,
"epoch": 0.194,
"grad_norm": 20.3086236795729,
"kl_loss_13": 4720.0,
"kl_loss_26": 3114.8,
"kl_loss_39": 2602.4,
"kl_loss_7": 5556.0,
"learning_rate": 0.0009171617142961477,
"loss": 8041.9,
"step": 1940
},
{
"ce_loss_13": 3.749431645870209,
"ce_loss_26": 2.9937612235546114,
"ce_loss_39": 2.744140648841858,
"ce_loss_52": 1.4348472714424134,
"ce_loss_7": 4.156274873018265,
"epoch": 0.195,
"grad_norm": 19.294577091174897,
"kl_loss_13": 4722.4,
"kl_loss_26": 3116.0,
"kl_loss_39": 2605.2,
"kl_loss_7": 5567.2,
"learning_rate": 0.0009162849273173857,
"loss": 7982.1,
"step": 1950
},
{
"ce_loss_13": 3.7092731952667237,
"ce_loss_26": 2.9811393320560455,
"ce_loss_39": 2.733800619840622,
"ce_loss_52": 1.4478029429912567,
"ce_loss_7": 4.0951203346252445,
"epoch": 0.196,
"grad_norm": 18.879569020896366,
"kl_loss_13": 4628.0,
"kl_loss_26": 3074.0,
"kl_loss_39": 2569.2,
"kl_loss_7": 5440.8,
"learning_rate": 0.0009154039483540273,
"loss": 7938.2,
"step": 1960
},
{
"ce_loss_13": 3.816760164499283,
"ce_loss_26": 3.0473806083202364,
"ce_loss_39": 2.7949241638183593,
"ce_loss_52": 1.466755247116089,
"ce_loss_7": 4.206719404458999,
"epoch": 0.197,
"grad_norm": 18.719442415203407,
"kl_loss_13": 4774.4,
"kl_loss_26": 3153.6,
"kl_loss_39": 2628.8,
"kl_loss_7": 5592.0,
"learning_rate": 0.0009145187862775209,
"loss": 7902.9,
"step": 1970
},
{
"ce_loss_13": 3.6841754376888276,
"ce_loss_26": 2.9468030989170075,
"ce_loss_39": 2.694840121269226,
"ce_loss_52": 1.4269890293478966,
"ce_loss_7": 4.081214648485184,
"epoch": 0.198,
"grad_norm": 18.93967966275207,
"kl_loss_13": 4605.6,
"kl_loss_26": 3030.4,
"kl_loss_39": 2512.8,
"kl_loss_7": 5443.2,
"learning_rate": 0.0009136294500014386,
"loss": 7824.0,
"step": 1980
},
{
"ce_loss_13": 3.7859152793884276,
"ce_loss_26": 3.025045871734619,
"ce_loss_39": 2.761893022060394,
"ce_loss_52": 1.439668196439743,
"ce_loss_7": 4.175345808267593,
"epoch": 0.199,
"grad_norm": 18.767073263034167,
"kl_loss_13": 4776.0,
"kl_loss_26": 3164.0,
"kl_loss_39": 2630.4,
"kl_loss_7": 5584.0,
"learning_rate": 0.000912735948481387,
"loss": 7845.9,
"step": 1990
},
{
"ce_loss_13": 3.7032747983932497,
"ce_loss_26": 2.965251809358597,
"ce_loss_39": 2.708036279678345,
"ce_loss_52": 1.4405365601181983,
"ce_loss_7": 4.098276823759079,
"epoch": 0.2,
"grad_norm": 18.50836033098629,
"kl_loss_13": 4630.0,
"kl_loss_26": 3052.8,
"kl_loss_39": 2530.4,
"kl_loss_7": 5457.6,
"learning_rate": 0.0009118382907149164,
"loss": 7748.9,
"step": 2000
},
{
"ce_loss_13": 3.717062991857529,
"ce_loss_26": 2.979129308462143,
"ce_loss_39": 2.7225220024585726,
"ce_loss_52": 1.45234707146883,
"ce_loss_7": 4.110658597946167,
"epoch": 0.201,
"grad_norm": 19.844351529751968,
"kl_loss_13": 4629.6,
"kl_loss_26": 3045.6,
"kl_loss_39": 2525.2,
"kl_loss_7": 5464.0,
"learning_rate": 0.0009109364857414306,
"loss": 7809.6,
"step": 2010
},
{
"ce_loss_13": 3.7279494285583494,
"ce_loss_26": 2.9849789261817934,
"ce_loss_39": 2.7267052114009855,
"ce_loss_52": 1.4486449271440507,
"ce_loss_7": 4.1131413102149965,
"epoch": 0.202,
"grad_norm": 19.33420875029095,
"kl_loss_13": 4639.2,
"kl_loss_26": 3061.2,
"kl_loss_39": 2533.6,
"kl_loss_7": 5456.0,
"learning_rate": 0.0009100305426420956,
"loss": 7708.0,
"step": 2020
},
{
"ce_loss_13": 3.643654578924179,
"ce_loss_26": 2.9251492261886596,
"ce_loss_39": 2.667558515071869,
"ce_loss_52": 1.4118730872869492,
"ce_loss_7": 4.034631943702697,
"epoch": 0.203,
"grad_norm": 19.14819354564296,
"kl_loss_13": 4553.6,
"kl_loss_26": 3016.0,
"kl_loss_39": 2492.0,
"kl_loss_7": 5371.2,
"learning_rate": 0.0009091204705397484,
"loss": 7699.0,
"step": 2030
},
{
"ce_loss_13": 3.706267160177231,
"ce_loss_26": 2.971902164816856,
"ce_loss_39": 2.7119477689266205,
"ce_loss_52": 1.4564015328884126,
"ce_loss_7": 4.097871041297912,
"epoch": 0.204,
"grad_norm": 18.950983085399912,
"kl_loss_13": 4556.8,
"kl_loss_26": 2972.2,
"kl_loss_39": 2454.6,
"kl_loss_7": 5380.0,
"learning_rate": 0.0009082062785988049,
"loss": 7671.5,
"step": 2040
},
{
"ce_loss_13": 3.6526230454444883,
"ce_loss_26": 2.905240607261658,
"ce_loss_39": 2.654486656188965,
"ce_loss_52": 1.4092363178730012,
"ce_loss_7": 4.043174755573273,
"epoch": 0.205,
"grad_norm": 20.137932882736667,
"kl_loss_13": 4559.6,
"kl_loss_26": 2969.2,
"kl_loss_39": 2459.2,
"kl_loss_7": 5384.0,
"learning_rate": 0.0009072879760251679,
"loss": 7662.0,
"step": 2050
},
{
"ce_loss_13": 3.578403168916702,
"ce_loss_26": 2.8548426389694215,
"ce_loss_39": 2.615563529729843,
"ce_loss_52": 1.4107020199298859,
"ce_loss_7": 3.97204332947731,
"epoch": 0.206,
"grad_norm": 19.383234658877402,
"kl_loss_13": 4444.0,
"kl_loss_26": 2894.0,
"kl_loss_39": 2391.4,
"kl_loss_7": 5266.4,
"learning_rate": 0.0009063655720661341,
"loss": 7643.5,
"step": 2060
},
{
"ce_loss_13": 3.5892180263996125,
"ce_loss_26": 2.855096530914307,
"ce_loss_39": 2.6004712164402006,
"ce_loss_52": 1.4145165607333183,
"ce_loss_7": 3.983754909038544,
"epoch": 0.207,
"grad_norm": 19.811430759479535,
"kl_loss_13": 4467.2,
"kl_loss_26": 2904.0,
"kl_loss_39": 2385.0,
"kl_loss_7": 5291.2,
"learning_rate": 0.000905439076010301,
"loss": 7534.0,
"step": 2070
},
{
"ce_loss_13": 3.6425350308418274,
"ce_loss_26": 2.9249331414699555,
"ce_loss_39": 2.6687968969345093,
"ce_loss_52": 1.4602822691202164,
"ce_loss_7": 4.025002205371857,
"epoch": 0.208,
"grad_norm": 19.741988709477923,
"kl_loss_13": 4438.4,
"kl_loss_26": 2910.0,
"kl_loss_39": 2394.8,
"kl_loss_7": 5248.8,
"learning_rate": 0.0009045084971874737,
"loss": 7505.8,
"step": 2080
},
{
"ce_loss_13": 3.553661996126175,
"ce_loss_26": 2.8095623433589934,
"ce_loss_39": 2.561682888865471,
"ce_loss_52": 1.3855161294341087,
"ce_loss_7": 3.9535735607147218,
"epoch": 0.209,
"grad_norm": 18.406387003890178,
"kl_loss_13": 4454.8,
"kl_loss_26": 2860.4,
"kl_loss_39": 2354.2,
"kl_loss_7": 5295.2,
"learning_rate": 0.0009035738449685707,
"loss": 7532.3,
"step": 2090
},
{
"ce_loss_13": 3.659823089838028,
"ce_loss_26": 2.919918876886368,
"ce_loss_39": 2.67503222823143,
"ce_loss_52": 1.4627915531396867,
"ce_loss_7": 4.0562332451343535,
"epoch": 0.21,
"grad_norm": 19.907425841295773,
"kl_loss_13": 4488.8,
"kl_loss_26": 2909.2,
"kl_loss_39": 2407.0,
"kl_loss_7": 5320.8,
"learning_rate": 0.0009026351287655293,
"loss": 7517.4,
"step": 2100
},
{
"ce_loss_13": 3.630769556760788,
"ce_loss_26": 2.9068243861198426,
"ce_loss_39": 2.642093613743782,
"ce_loss_52": 1.4367542505264281,
"ce_loss_7": 4.022511690855026,
"epoch": 0.211,
"grad_norm": 18.196990652810108,
"kl_loss_13": 4459.2,
"kl_loss_26": 2926.8,
"kl_loss_39": 2395.2,
"kl_loss_7": 5276.0,
"learning_rate": 0.0009016923580312113,
"loss": 7412.6,
"step": 2110
},
{
"ce_loss_13": 3.682375580072403,
"ce_loss_26": 2.9616983354091646,
"ce_loss_39": 2.6908247590065004,
"ce_loss_52": 1.482297733426094,
"ce_loss_7": 4.079386693239212,
"epoch": 0.212,
"grad_norm": 20.88180475150302,
"kl_loss_13": 4465.6,
"kl_loss_26": 2920.8,
"kl_loss_39": 2382.8,
"kl_loss_7": 5297.6,
"learning_rate": 0.0009007455422593077,
"loss": 7402.8,
"step": 2120
},
{
"ce_loss_13": 3.5886090993881226,
"ce_loss_26": 2.8697936654090883,
"ce_loss_39": 2.6118789970874787,
"ce_loss_52": 1.4324451625347137,
"ce_loss_7": 3.989769661426544,
"epoch": 0.213,
"grad_norm": 21.410550894381295,
"kl_loss_13": 4406.0,
"kl_loss_26": 2862.8,
"kl_loss_39": 2337.2,
"kl_loss_7": 5236.0,
"learning_rate": 0.0008997946909842425,
"loss": 7376.6,
"step": 2130
},
{
"ce_loss_13": 3.5265793919563295,
"ce_loss_26": 2.8049169957637785,
"ce_loss_39": 2.5549218744039535,
"ce_loss_52": 1.4088917583227158,
"ce_loss_7": 3.920357757806778,
"epoch": 0.214,
"grad_norm": 18.12930136088373,
"kl_loss_13": 4307.2,
"kl_loss_26": 2766.8,
"kl_loss_39": 2250.8,
"kl_loss_7": 5132.8,
"learning_rate": 0.0008988398137810777,
"loss": 7289.0,
"step": 2140
},
{
"ce_loss_13": 3.493118005990982,
"ce_loss_26": 2.7619090020656585,
"ce_loss_39": 2.523107871413231,
"ce_loss_52": 1.3855519428849221,
"ce_loss_7": 3.8948814868927,
"epoch": 0.215,
"grad_norm": 19.085499107664276,
"kl_loss_13": 4317.6,
"kl_loss_26": 2765.2,
"kl_loss_39": 2265.6,
"kl_loss_7": 5148.8,
"learning_rate": 0.0008978809202654162,
"loss": 7322.3,
"step": 2150
},
{
"ce_loss_13": 3.5010905504226684,
"ce_loss_26": 2.7818336695432664,
"ce_loss_39": 2.5317496716976167,
"ce_loss_52": 1.410066269338131,
"ce_loss_7": 3.8946242213249205,
"epoch": 0.216,
"grad_norm": 18.12712360145358,
"kl_loss_13": 4244.8,
"kl_loss_26": 2719.8,
"kl_loss_39": 2212.8,
"kl_loss_7": 5073.6,
"learning_rate": 0.0008969180200933046,
"loss": 7287.8,
"step": 2160
},
{
"ce_loss_13": 3.583745849132538,
"ce_loss_26": 2.8594263792037964,
"ce_loss_39": 2.5962479442358015,
"ce_loss_52": 1.4458883255720139,
"ce_loss_7": 3.976805257797241,
"epoch": 0.217,
"grad_norm": 17.565470208207127,
"kl_loss_13": 4359.6,
"kl_loss_26": 2818.4,
"kl_loss_39": 2285.8,
"kl_loss_7": 5177.6,
"learning_rate": 0.0008959511229611376,
"loss": 7240.9,
"step": 2170
},
{
"ce_loss_13": 3.580790603160858,
"ce_loss_26": 2.859365826845169,
"ce_loss_39": 2.5985568940639494,
"ce_loss_52": 1.4648242503404618,
"ce_loss_7": 3.9631645143032075,
"epoch": 0.218,
"grad_norm": 18.371856212121592,
"kl_loss_13": 4316.0,
"kl_loss_26": 2762.4,
"kl_loss_39": 2248.2,
"kl_loss_7": 5127.2,
"learning_rate": 0.0008949802386055581,
"loss": 7227.3,
"step": 2180
},
{
"ce_loss_13": 3.549471515417099,
"ce_loss_26": 2.814970576763153,
"ce_loss_39": 2.561775863170624,
"ce_loss_52": 1.4231164067983628,
"ce_loss_7": 3.941384530067444,
"epoch": 0.219,
"grad_norm": 17.91149753664061,
"kl_loss_13": 4321.2,
"kl_loss_26": 2784.4,
"kl_loss_39": 2263.6,
"kl_loss_7": 5152.0,
"learning_rate": 0.0008940053768033609,
"loss": 7238.8,
"step": 2190
},
{
"ce_loss_13": 3.5581556379795076,
"ce_loss_26": 2.825929582118988,
"ce_loss_39": 2.570690780878067,
"ce_loss_52": 1.4427952721714974,
"ce_loss_7": 3.9514447808265687,
"epoch": 0.22,
"grad_norm": 19.745379707547666,
"kl_loss_13": 4304.8,
"kl_loss_26": 2754.0,
"kl_loss_39": 2234.6,
"kl_loss_7": 5132.8,
"learning_rate": 0.0008930265473713938,
"loss": 7236.5,
"step": 2200
},
{
"ce_loss_13": 3.522547519207001,
"ce_loss_26": 2.7868906617164613,
"ce_loss_39": 2.524843490123749,
"ce_loss_52": 1.3902505502104758,
"ce_loss_7": 3.9295816838741304,
"epoch": 0.221,
"grad_norm": 18.58799040514992,
"kl_loss_13": 4349.2,
"kl_loss_26": 2792.0,
"kl_loss_39": 2262.0,
"kl_loss_7": 5198.4,
"learning_rate": 0.0008920437601664579,
"loss": 7187.9,
"step": 2210
},
{
"ce_loss_13": 3.5173128962516786,
"ce_loss_26": 2.804593563079834,
"ce_loss_39": 2.557006138563156,
"ce_loss_52": 1.4581168740987778,
"ce_loss_7": 3.910993677377701,
"epoch": 0.222,
"grad_norm": 18.95814843355098,
"kl_loss_13": 4224.4,
"kl_loss_26": 2677.2,
"kl_loss_39": 2165.2,
"kl_loss_7": 5053.6,
"learning_rate": 0.0008910570250852097,
"loss": 7168.4,
"step": 2220
},
{
"ce_loss_13": 3.4375841438770296,
"ce_loss_26": 2.7280281484127045,
"ce_loss_39": 2.47977514564991,
"ce_loss_52": 1.389390866458416,
"ce_loss_7": 3.838480031490326,
"epoch": 0.223,
"grad_norm": 19.685999540006744,
"kl_loss_13": 4179.6,
"kl_loss_26": 2668.0,
"kl_loss_39": 2152.8,
"kl_loss_7": 5011.2,
"learning_rate": 0.0008900663520640604,
"loss": 7080.8,
"step": 2230
},
{
"ce_loss_13": 3.5171928703784943,
"ce_loss_26": 2.8187219202518463,
"ce_loss_39": 2.5553694486618044,
"ce_loss_52": 1.4541724801063538,
"ce_loss_7": 3.8974815249443053,
"epoch": 0.224,
"grad_norm": 26.45112973751635,
"kl_loss_13": 4224.4,
"kl_loss_26": 2710.0,
"kl_loss_39": 2180.4,
"kl_loss_7": 5080.0,
"learning_rate": 0.0008890717510790764,
"loss": 7105.5,
"step": 2240
},
{
"ce_loss_13": 3.5361606895923616,
"ce_loss_26": 2.8268598556518554,
"ce_loss_39": 2.5735931187868117,
"ce_loss_52": 1.451829667389393,
"ce_loss_7": 3.924015772342682,
"epoch": 0.225,
"grad_norm": 20.75511317660791,
"kl_loss_13": 4247.2,
"kl_loss_26": 2732.8,
"kl_loss_39": 2219.0,
"kl_loss_7": 5061.6,
"learning_rate": 0.0008880732321458784,
"loss": 7074.7,
"step": 2250
},
{
"ce_loss_13": 3.4491190731525423,
"ce_loss_26": 2.7567967534065247,
"ce_loss_39": 2.5041564613580705,
"ce_loss_52": 1.4379881560802459,
"ce_loss_7": 3.8363168060779573,
"epoch": 0.226,
"grad_norm": 18.822666188140545,
"kl_loss_13": 4102.8,
"kl_loss_26": 2621.6,
"kl_loss_39": 2107.6,
"kl_loss_7": 4917.6,
"learning_rate": 0.0008870708053195413,
"loss": 7003.4,
"step": 2260
},
{
"ce_loss_13": 3.4790355801582336,
"ce_loss_26": 2.765997165441513,
"ce_loss_39": 2.5106064915657043,
"ce_loss_52": 1.4210210233926772,
"ce_loss_7": 3.8746932446956635,
"epoch": 0.227,
"grad_norm": 19.45926661054129,
"kl_loss_13": 4188.4,
"kl_loss_26": 2681.2,
"kl_loss_39": 2162.6,
"kl_loss_7": 5009.6,
"learning_rate": 0.0008860644806944918,
"loss": 7002.8,
"step": 2270
},
{
"ce_loss_13": 3.589535415172577,
"ce_loss_26": 2.870089566707611,
"ce_loss_39": 2.5925551772117617,
"ce_loss_52": 1.4516636282205582,
"ce_loss_7": 3.9855311453342437,
"epoch": 0.228,
"grad_norm": 18.527281302332295,
"kl_loss_13": 4335.2,
"kl_loss_26": 2802.8,
"kl_loss_39": 2252.6,
"kl_loss_7": 5164.0,
"learning_rate": 0.0008850542684044079,
"loss": 7072.1,
"step": 2280
},
{
"ce_loss_13": 3.4438597559928894,
"ce_loss_26": 2.74727184176445,
"ce_loss_39": 2.4982927203178407,
"ce_loss_52": 1.435218185186386,
"ce_loss_7": 3.8294356882572176,
"epoch": 0.229,
"grad_norm": 18.07396826505032,
"kl_loss_13": 4082.8,
"kl_loss_26": 2585.6,
"kl_loss_39": 2087.0,
"kl_loss_7": 4892.8,
"learning_rate": 0.0008840401786221159,
"loss": 6974.9,
"step": 2290
},
{
"ce_loss_13": 3.476649820804596,
"ce_loss_26": 2.7891372203826905,
"ce_loss_39": 2.5390550673007963,
"ce_loss_52": 1.4611460983753204,
"ce_loss_7": 3.854980993270874,
"epoch": 0.23,
"grad_norm": 18.716359933554248,
"kl_loss_13": 4118.8,
"kl_loss_26": 2644.2,
"kl_loss_39": 2128.6,
"kl_loss_7": 4946.0,
"learning_rate": 0.000883022221559489,
"loss": 6901.3,
"step": 2300
},
{
"ce_loss_13": 3.467282909154892,
"ce_loss_26": 2.7604516625404356,
"ce_loss_39": 2.5003271818161013,
"ce_loss_52": 1.4461873590946197,
"ce_loss_7": 3.856851851940155,
"epoch": 0.231,
"grad_norm": 20.721250836400138,
"kl_loss_13": 4114.0,
"kl_loss_26": 2616.8,
"kl_loss_39": 2092.6,
"kl_loss_7": 4926.4,
"learning_rate": 0.0008820004074673434,
"loss": 6876.9,
"step": 2310
},
{
"ce_loss_13": 3.4266963064670564,
"ce_loss_26": 2.719339656829834,
"ce_loss_39": 2.4688956409692766,
"ce_loss_52": 1.4164521768689156,
"ce_loss_7": 3.8023972034454347,
"epoch": 0.232,
"grad_norm": 17.616553712409548,
"kl_loss_13": 4090.4,
"kl_loss_26": 2589.2,
"kl_loss_39": 2065.4,
"kl_loss_7": 4891.2,
"learning_rate": 0.0008809747466353355,
"loss": 6907.6,
"step": 2320
},
{
"ce_loss_13": 3.536805588006973,
"ce_loss_26": 2.8070395588874817,
"ce_loss_39": 2.5501783430576324,
"ce_loss_52": 1.4646585762500763,
"ce_loss_7": 3.9247563600540163,
"epoch": 0.233,
"grad_norm": 17.658525951814237,
"kl_loss_13": 4218.8,
"kl_loss_26": 2666.0,
"kl_loss_39": 2147.2,
"kl_loss_7": 5044.0,
"learning_rate": 0.0008799452493918585,
"loss": 6862.9,
"step": 2330
},
{
"ce_loss_13": 3.3707460284233095,
"ce_loss_26": 2.680304506421089,
"ce_loss_39": 2.4279863387346268,
"ce_loss_52": 1.4292149528861047,
"ce_loss_7": 3.7509031653404237,
"epoch": 0.234,
"grad_norm": 18.293581891673337,
"kl_loss_13": 3979.6,
"kl_loss_26": 2481.4,
"kl_loss_39": 1967.6,
"kl_loss_7": 4792.0,
"learning_rate": 0.0008789119261039385,
"loss": 6860.7,
"step": 2340
},
{
"ce_loss_13": 3.411047804355621,
"ce_loss_26": 2.7024976193904875,
"ce_loss_39": 2.4417600989341737,
"ce_loss_52": 1.404353639483452,
"ce_loss_7": 3.799528968334198,
"epoch": 0.235,
"grad_norm": 19.60391257341661,
"kl_loss_13": 4078.4,
"kl_loss_26": 2576.8,
"kl_loss_39": 2050.4,
"kl_loss_7": 4901.6,
"learning_rate": 0.0008778747871771292,
"loss": 6770.8,
"step": 2350
},
{
"ce_loss_13": 3.4045680582523348,
"ce_loss_26": 2.6985227525234223,
"ce_loss_39": 2.451327767968178,
"ce_loss_52": 1.4205562889575958,
"ce_loss_7": 3.7882447242736816,
"epoch": 0.236,
"grad_norm": 18.739033338884607,
"kl_loss_13": 4038.4,
"kl_loss_26": 2533.2,
"kl_loss_39": 2034.8,
"kl_loss_7": 4858.4,
"learning_rate": 0.0008768338430554083,
"loss": 6755.4,
"step": 2360
},
{
"ce_loss_13": 3.3626140534877775,
"ce_loss_26": 2.65916622877121,
"ce_loss_39": 2.4009654462337493,
"ce_loss_52": 1.3916789084672927,
"ce_loss_7": 3.749758929014206,
"epoch": 0.237,
"grad_norm": 20.138759568479667,
"kl_loss_13": 4022.0,
"kl_loss_26": 2521.4,
"kl_loss_39": 1993.2,
"kl_loss_7": 4829.6,
"learning_rate": 0.0008757891042210713,
"loss": 6791.4,
"step": 2370
},
{
"ce_loss_13": 3.381242650747299,
"ce_loss_26": 2.68309933245182,
"ce_loss_39": 2.433691692352295,
"ce_loss_52": 1.4095703065395355,
"ce_loss_7": 3.772293299436569,
"epoch": 0.238,
"grad_norm": 17.323586780740413,
"kl_loss_13": 4016.4,
"kl_loss_26": 2531.4,
"kl_loss_39": 2016.2,
"kl_loss_7": 4839.6,
"learning_rate": 0.0008747405811946271,
"loss": 6729.9,
"step": 2380
},
{
"ce_loss_13": 3.418868046998978,
"ce_loss_26": 2.7223224580287932,
"ce_loss_39": 2.4728009045124053,
"ce_loss_52": 1.446861308813095,
"ce_loss_7": 3.7994930267333986,
"epoch": 0.239,
"grad_norm": 17.668482550134954,
"kl_loss_13": 4034.0,
"kl_loss_26": 2556.0,
"kl_loss_39": 2037.4,
"kl_loss_7": 4842.4,
"learning_rate": 0.0008736882845346905,
"loss": 6764.7,
"step": 2390
},
{
"ce_loss_13": 3.407726752758026,
"ce_loss_26": 2.714660122990608,
"ce_loss_39": 2.4702648639678957,
"ce_loss_52": 1.4476186811923981,
"ce_loss_7": 3.7862841546535493,
"epoch": 0.24,
"grad_norm": 20.313458924733165,
"kl_loss_13": 3988.4,
"kl_loss_26": 2507.2,
"kl_loss_39": 2008.2,
"kl_loss_7": 4794.4,
"learning_rate": 0.0008726322248378774,
"loss": 6720.4,
"step": 2400
},
{
"ce_loss_13": 3.4095008313655852,
"ce_loss_26": 2.69794414639473,
"ce_loss_39": 2.44133580327034,
"ce_loss_52": 1.426843424141407,
"ce_loss_7": 3.8018106520175934,
"epoch": 0.241,
"grad_norm": 17.847824779807837,
"kl_loss_13": 4039.6,
"kl_loss_26": 2535.6,
"kl_loss_39": 2007.8,
"kl_loss_7": 4864.0,
"learning_rate": 0.0008715724127386971,
"loss": 6713.1,
"step": 2410
},
{
"ce_loss_13": 3.370608961582184,
"ce_loss_26": 2.688097137212753,
"ce_loss_39": 2.435939407348633,
"ce_loss_52": 1.4344248950481415,
"ce_loss_7": 3.745537704229355,
"epoch": 0.242,
"grad_norm": 18.03045500704746,
"kl_loss_13": 3943.6,
"kl_loss_26": 2497.6,
"kl_loss_39": 1983.4,
"kl_loss_7": 4736.8,
"learning_rate": 0.0008705088589094458,
"loss": 6611.0,
"step": 2420
},
{
"ce_loss_13": 3.4596225798130034,
"ce_loss_26": 2.7608898997306826,
"ce_loss_39": 2.5022788047790527,
"ce_loss_52": 1.4623101890087127,
"ce_loss_7": 3.839513373374939,
"epoch": 0.243,
"grad_norm": 18.190444229647287,
"kl_loss_13": 4058.0,
"kl_loss_26": 2575.6,
"kl_loss_39": 2054.6,
"kl_loss_7": 4862.4,
"learning_rate": 0.0008694415740600988,
"loss": 6638.0,
"step": 2430
},
{
"ce_loss_13": 3.370687645673752,
"ce_loss_26": 2.6674872994422913,
"ce_loss_39": 2.414305740594864,
"ce_loss_52": 1.4324225425720214,
"ce_loss_7": 3.767412984371185,
"epoch": 0.244,
"grad_norm": 18.848120466041497,
"kl_loss_13": 3940.4,
"kl_loss_26": 2449.4,
"kl_loss_39": 1932.2,
"kl_loss_7": 4769.6,
"learning_rate": 0.0008683705689382025,
"loss": 6641.0,
"step": 2440
},
{
"ce_loss_13": 3.347389942407608,
"ce_loss_26": 2.6603663861751556,
"ce_loss_39": 2.4186645448207855,
"ce_loss_52": 1.452422297000885,
"ce_loss_7": 3.731181102991104,
"epoch": 0.245,
"grad_norm": 17.05503964147942,
"kl_loss_13": 3888.8,
"kl_loss_26": 2418.6,
"kl_loss_39": 1913.8,
"kl_loss_7": 4694.4,
"learning_rate": 0.0008672958543287666,
"loss": 6617.1,
"step": 2450
},
{
"ce_loss_13": 3.341637074947357,
"ce_loss_26": 2.653514164686203,
"ce_loss_39": 2.4030585259199144,
"ce_loss_52": 1.4203058749437332,
"ce_loss_7": 3.7326664865016936,
"epoch": 0.246,
"grad_norm": 18.028238041954314,
"kl_loss_13": 3894.0,
"kl_loss_26": 2435.6,
"kl_loss_39": 1932.4,
"kl_loss_7": 4711.2,
"learning_rate": 0.0008662174410541554,
"loss": 6537.2,
"step": 2460
},
{
"ce_loss_13": 3.343936342000961,
"ce_loss_26": 2.6602561354637144,
"ce_loss_39": 2.4062414824962617,
"ce_loss_52": 1.4341940209269524,
"ce_loss_7": 3.7255902886390686,
"epoch": 0.247,
"grad_norm": 17.615419947143096,
"kl_loss_13": 3890.8,
"kl_loss_26": 2433.8,
"kl_loss_39": 1921.6,
"kl_loss_7": 4692.0,
"learning_rate": 0.0008651353399739787,
"loss": 6499.5,
"step": 2470
},
{
"ce_loss_13": 3.36987144947052,
"ce_loss_26": 2.6651594936847687,
"ce_loss_39": 2.41037335395813,
"ce_loss_52": 1.4215908780694009,
"ce_loss_7": 3.75968217253685,
"epoch": 0.248,
"grad_norm": 19.448152371934484,
"kl_loss_13": 3954.4,
"kl_loss_26": 2461.0,
"kl_loss_39": 1945.4,
"kl_loss_7": 4776.0,
"learning_rate": 0.0008640495619849821,
"loss": 6570.0,
"step": 2480
},
{
"ce_loss_13": 3.3932483792304993,
"ce_loss_26": 2.69803272485733,
"ce_loss_39": 2.448850151896477,
"ce_loss_52": 1.4729512989521027,
"ce_loss_7": 3.7767464458942412,
"epoch": 0.249,
"grad_norm": 17.76914722627013,
"kl_loss_13": 3932.8,
"kl_loss_26": 2449.6,
"kl_loss_39": 1942.6,
"kl_loss_7": 4743.2,
"learning_rate": 0.0008629601180209381,
"loss": 6472.8,
"step": 2490
},
{
"ce_loss_13": 3.3404706001281737,
"ce_loss_26": 2.6527740180492403,
"ce_loss_39": 2.403418445587158,
"ce_loss_52": 1.4374166071414947,
"ce_loss_7": 3.7279320538043974,
"epoch": 0.25,
"grad_norm": 18.043688934070087,
"kl_loss_13": 3897.6,
"kl_loss_26": 2421.6,
"kl_loss_39": 1902.0,
"kl_loss_7": 4716.0,
"learning_rate": 0.000861867019052535,
"loss": 6482.2,
"step": 2500
},
{
"ce_loss_13": 3.408062273263931,
"ce_loss_26": 2.704035770893097,
"ce_loss_39": 2.46220483481884,
"ce_loss_52": 1.4713353991508484,
"ce_loss_7": 3.795337921380997,
"epoch": 0.251,
"grad_norm": 19.728713333582007,
"kl_loss_13": 3944.4,
"kl_loss_26": 2444.8,
"kl_loss_39": 1942.4,
"kl_loss_7": 4760.4,
"learning_rate": 0.0008607702760872678,
"loss": 6463.8,
"step": 2510
},
{
"ce_loss_13": 3.388230836391449,
"ce_loss_26": 2.687982529401779,
"ce_loss_39": 2.429553496837616,
"ce_loss_52": 1.4563929110765457,
"ce_loss_7": 3.781431978940964,
"epoch": 0.252,
"grad_norm": 18.472247546339297,
"kl_loss_13": 3948.8,
"kl_loss_26": 2449.6,
"kl_loss_39": 1930.0,
"kl_loss_7": 4769.6,
"learning_rate": 0.0008596699001693256,
"loss": 6470.9,
"step": 2520
},
{
"ce_loss_13": 3.3468190252780916,
"ce_loss_26": 2.6537194311618806,
"ce_loss_39": 2.38609040081501,
"ce_loss_52": 1.4265122324228288,
"ce_loss_7": 3.7305682718753816,
"epoch": 0.253,
"grad_norm": 19.078325524688942,
"kl_loss_13": 3872.0,
"kl_loss_26": 2409.4,
"kl_loss_39": 1875.8,
"kl_loss_7": 4679.2,
"learning_rate": 0.0008585659023794818,
"loss": 6413.9,
"step": 2530
},
{
"ce_loss_13": 3.3305428862571715,
"ce_loss_26": 2.6303874611854554,
"ce_loss_39": 2.3649342864751817,
"ce_loss_52": 1.4202288419008255,
"ce_loss_7": 3.712818431854248,
"epoch": 0.254,
"grad_norm": 17.612892401319467,
"kl_loss_13": 3873.6,
"kl_loss_26": 2396.4,
"kl_loss_39": 1864.0,
"kl_loss_7": 4688.8,
"learning_rate": 0.0008574582938349817,
"loss": 6377.5,
"step": 2540
},
{
"ce_loss_13": 3.3927013695240023,
"ce_loss_26": 2.7204831540584564,
"ce_loss_39": 2.464686703681946,
"ce_loss_52": 1.487898689508438,
"ce_loss_7": 3.7826764941215516,
"epoch": 0.255,
"grad_norm": 19.269838722570377,
"kl_loss_13": 3875.2,
"kl_loss_26": 2432.8,
"kl_loss_39": 1907.8,
"kl_loss_7": 4701.6,
"learning_rate": 0.0008563470856894315,
"loss": 6365.3,
"step": 2550
},
{
"ce_loss_13": 3.366223245859146,
"ce_loss_26": 2.6793350696563722,
"ce_loss_39": 2.4282671988010405,
"ce_loss_52": 1.475812730193138,
"ce_loss_7": 3.7593341052532194,
"epoch": 0.256,
"grad_norm": 17.67137481592248,
"kl_loss_13": 3846.8,
"kl_loss_26": 2394.4,
"kl_loss_39": 1881.4,
"kl_loss_7": 4661.6,
"learning_rate": 0.0008552322891326845,
"loss": 6381.8,
"step": 2560
},
{
"ce_loss_13": 3.348971825838089,
"ce_loss_26": 2.6535234093666076,
"ce_loss_39": 2.396692654490471,
"ce_loss_52": 1.4521390795707703,
"ce_loss_7": 3.737420654296875,
"epoch": 0.257,
"grad_norm": 17.922791060277248,
"kl_loss_13": 3853.2,
"kl_loss_26": 2382.8,
"kl_loss_39": 1860.0,
"kl_loss_7": 4671.6,
"learning_rate": 0.0008541139153907296,
"loss": 6320.6,
"step": 2570
},
{
"ce_loss_13": 3.3175282776355743,
"ce_loss_26": 2.62116519510746,
"ce_loss_39": 2.373508110642433,
"ce_loss_52": 1.4496644467115403,
"ce_loss_7": 3.7005242466926576,
"epoch": 0.258,
"grad_norm": 17.919048466696356,
"kl_loss_13": 3806.0,
"kl_loss_26": 2325.8,
"kl_loss_39": 1815.6,
"kl_loss_7": 4616.8,
"learning_rate": 0.0008529919757255782,
"loss": 6324.1,
"step": 2580
},
{
"ce_loss_13": 3.3524767458438873,
"ce_loss_26": 2.672022157907486,
"ce_loss_39": 2.42145474255085,
"ce_loss_52": 1.490525448322296,
"ce_loss_7": 3.731249511241913,
"epoch": 0.259,
"grad_norm": 17.94332341656099,
"kl_loss_13": 3786.8,
"kl_loss_26": 2328.6,
"kl_loss_39": 1819.0,
"kl_loss_7": 4580.4,
"learning_rate": 0.0008518664814351503,
"loss": 6266.2,
"step": 2590
},
{
"ce_loss_13": 3.2249048352241516,
"ce_loss_26": 2.526710030436516,
"ce_loss_39": 2.288080096244812,
"ce_loss_52": 1.3951421514153481,
"ce_loss_7": 3.6117550313472746,
"epoch": 0.26,
"grad_norm": 17.453584143766424,
"kl_loss_13": 3747.6,
"kl_loss_26": 2273.8,
"kl_loss_39": 1787.0,
"kl_loss_7": 4560.8,
"learning_rate": 0.0008507374438531607,
"loss": 6263.6,
"step": 2600
},
{
"ce_loss_13": 3.385346031188965,
"ce_loss_26": 2.689622291922569,
"ce_loss_39": 2.437500995397568,
"ce_loss_52": 1.4798223778605462,
"ce_loss_7": 3.780226916074753,
"epoch": 0.261,
"grad_norm": 17.77067876803282,
"kl_loss_13": 3881.6,
"kl_loss_26": 2399.4,
"kl_loss_39": 1889.6,
"kl_loss_7": 4706.4,
"learning_rate": 0.0008496048743490053,
"loss": 6251.8,
"step": 2610
},
{
"ce_loss_13": 3.253863149881363,
"ce_loss_26": 2.5867208421230314,
"ce_loss_39": 2.3403410583734514,
"ce_loss_52": 1.4355336636304856,
"ce_loss_7": 3.628329038619995,
"epoch": 0.262,
"grad_norm": 18.13099048831459,
"kl_loss_13": 3714.0,
"kl_loss_26": 2278.6,
"kl_loss_39": 1781.2,
"kl_loss_7": 4501.2,
"learning_rate": 0.0008484687843276469,
"loss": 6230.8,
"step": 2620
},
{
"ce_loss_13": 3.308898413181305,
"ce_loss_26": 2.6307075321674347,
"ce_loss_39": 2.381870651245117,
"ce_loss_52": 1.4658461689949036,
"ce_loss_7": 3.686649763584137,
"epoch": 0.263,
"grad_norm": 17.246075540125826,
"kl_loss_13": 3767.2,
"kl_loss_26": 2322.6,
"kl_loss_39": 1819.4,
"kl_loss_7": 4560.0,
"learning_rate": 0.0008473291852294987,
"loss": 6261.4,
"step": 2630
},
{
"ce_loss_13": 3.301677256822586,
"ce_loss_26": 2.623979777097702,
"ce_loss_39": 2.3711125582456587,
"ce_loss_52": 1.4476893723011017,
"ce_loss_7": 3.686248630285263,
"epoch": 0.264,
"grad_norm": 18.725565567002363,
"kl_loss_13": 3772.0,
"kl_loss_26": 2323.8,
"kl_loss_39": 1813.4,
"kl_loss_7": 4567.6,
"learning_rate": 0.0008461860885303114,
"loss": 6186.3,
"step": 2640
},
{
"ce_loss_13": 3.3081180572509767,
"ce_loss_26": 2.5993283450603486,
"ce_loss_39": 2.346868970990181,
"ce_loss_52": 1.4209152534604073,
"ce_loss_7": 3.700265485048294,
"epoch": 0.265,
"grad_norm": 16.39724372868559,
"kl_loss_13": 3844.8,
"kl_loss_26": 2352.2,
"kl_loss_39": 1835.0,
"kl_loss_7": 4663.2,
"learning_rate": 0.000845039505741056,
"loss": 6224.4,
"step": 2650
},
{
"ce_loss_13": 3.2767783522605898,
"ce_loss_26": 2.610367941856384,
"ce_loss_39": 2.3734692215919493,
"ce_loss_52": 1.4838453635573388,
"ce_loss_7": 3.6544183135032653,
"epoch": 0.266,
"grad_norm": 18.646431501601995,
"kl_loss_13": 3666.4,
"kl_loss_26": 2232.8,
"kl_loss_39": 1743.2,
"kl_loss_7": 4455.2,
"learning_rate": 0.0008438894484078086,
"loss": 6164.1,
"step": 2660
},
{
"ce_loss_13": 3.187056082487106,
"ce_loss_26": 2.5067271828651427,
"ce_loss_39": 2.265036514401436,
"ce_loss_52": 1.3943608120083808,
"ce_loss_7": 3.5653835415840147,
"epoch": 0.267,
"grad_norm": 17.226899122841342,
"kl_loss_13": 3695.2,
"kl_loss_26": 2239.8,
"kl_loss_39": 1740.2,
"kl_loss_7": 4491.6,
"learning_rate": 0.0008427359281116334,
"loss": 6116.4,
"step": 2670
},
{
"ce_loss_13": 3.2635743618011475,
"ce_loss_26": 2.5878145933151244,
"ce_loss_39": 2.344261533021927,
"ce_loss_52": 1.436160460114479,
"ce_loss_7": 3.642155331373215,
"epoch": 0.268,
"grad_norm": 17.358976250170773,
"kl_loss_13": 3715.6,
"kl_loss_26": 2286.6,
"kl_loss_39": 1782.2,
"kl_loss_7": 4510.8,
"learning_rate": 0.0008415789564684673,
"loss": 6120.1,
"step": 2680
},
{
"ce_loss_13": 3.217154061794281,
"ce_loss_26": 2.55003065764904,
"ce_loss_39": 2.3082681566476824,
"ce_loss_52": 1.4278038635849952,
"ce_loss_7": 3.602716547250748,
"epoch": 0.269,
"grad_norm": 17.314540535001434,
"kl_loss_13": 3655.2,
"kl_loss_26": 2225.8,
"kl_loss_39": 1731.0,
"kl_loss_7": 4458.8,
"learning_rate": 0.0008404185451290017,
"loss": 6184.5,
"step": 2690
},
{
"ce_loss_13": 3.24159716963768,
"ce_loss_26": 2.54996337890625,
"ce_loss_39": 2.3031594485044478,
"ce_loss_52": 1.4209882378578187,
"ce_loss_7": 3.6337377846241,
"epoch": 0.27,
"grad_norm": 17.492742712849175,
"kl_loss_13": 3726.4,
"kl_loss_26": 2257.8,
"kl_loss_39": 1756.2,
"kl_loss_7": 4549.6,
"learning_rate": 0.0008392547057785661,
"loss": 6062.5,
"step": 2700
},
{
"ce_loss_13": 3.227788990736008,
"ce_loss_26": 2.5358716517686846,
"ce_loss_39": 2.285151606798172,
"ce_loss_52": 1.4203684866428374,
"ce_loss_7": 3.6037886083126067,
"epoch": 0.271,
"grad_norm": 17.53543108395465,
"kl_loss_13": 3669.6,
"kl_loss_26": 2215.6,
"kl_loss_39": 1712.0,
"kl_loss_7": 4470.8,
"learning_rate": 0.0008380874501370098,
"loss": 6120.8,
"step": 2710
},
{
"ce_loss_13": 3.1636491239070894,
"ce_loss_26": 2.5045939922332763,
"ce_loss_39": 2.2759897857904434,
"ce_loss_52": 1.4353285342454911,
"ce_loss_7": 3.5319547176361086,
"epoch": 0.272,
"grad_norm": 18.586839239007904,
"kl_loss_13": 3539.2,
"kl_loss_26": 2133.8,
"kl_loss_39": 1666.0,
"kl_loss_7": 4328.8,
"learning_rate": 0.0008369167899585841,
"loss": 6051.0,
"step": 2720
},
{
"ce_loss_13": 3.221328115463257,
"ce_loss_26": 2.535122260451317,
"ce_loss_39": 2.2940177261829375,
"ce_loss_52": 1.419254219532013,
"ce_loss_7": 3.6125965118408203,
"epoch": 0.273,
"grad_norm": 17.658632608609814,
"kl_loss_13": 3688.0,
"kl_loss_26": 2233.2,
"kl_loss_39": 1733.0,
"kl_loss_7": 4510.0,
"learning_rate": 0.0008357427370318238,
"loss": 6045.8,
"step": 2730
},
{
"ce_loss_13": 3.2239345014095306,
"ce_loss_26": 2.556156021356583,
"ce_loss_39": 2.3110478937625887,
"ce_loss_52": 1.4514835059642792,
"ce_loss_7": 3.6023066878318786,
"epoch": 0.274,
"grad_norm": 18.16156755157877,
"kl_loss_13": 3623.2,
"kl_loss_26": 2184.8,
"kl_loss_39": 1687.0,
"kl_loss_7": 4421.6,
"learning_rate": 0.0008345653031794292,
"loss": 6081.8,
"step": 2740
},
{
"ce_loss_13": 3.248935067653656,
"ce_loss_26": 2.571800184249878,
"ce_loss_39": 2.3257440716028213,
"ce_loss_52": 1.4555991351604463,
"ce_loss_7": 3.6242997109889985,
"epoch": 0.275,
"grad_norm": 17.486859359677037,
"kl_loss_13": 3626.4,
"kl_loss_26": 2208.0,
"kl_loss_39": 1712.0,
"kl_loss_7": 4417.6,
"learning_rate": 0.0008333845002581458,
"loss": 5996.5,
"step": 2750
},
{
"ce_loss_13": 3.2696946620941163,
"ce_loss_26": 2.5927825689315798,
"ce_loss_39": 2.3452927708625793,
"ce_loss_52": 1.4668353974819184,
"ce_loss_7": 3.635032969713211,
"epoch": 0.276,
"grad_norm": 17.043532441273413,
"kl_loss_13": 3660.4,
"kl_loss_26": 2235.2,
"kl_loss_39": 1729.2,
"kl_loss_7": 4435.2,
"learning_rate": 0.0008322003401586462,
"loss": 5989.9,
"step": 2760
},
{
"ce_loss_13": 3.187331736087799,
"ce_loss_26": 2.537285569310188,
"ce_loss_39": 2.296916127204895,
"ce_loss_52": 1.4428926169872285,
"ce_loss_7": 3.55471009016037,
"epoch": 0.277,
"grad_norm": 18.300410919099367,
"kl_loss_13": 3558.4,
"kl_loss_26": 2165.8,
"kl_loss_39": 1682.4,
"kl_loss_7": 4338.0,
"learning_rate": 0.0008310128348054094,
"loss": 5970.6,
"step": 2770
},
{
"ce_loss_13": 3.2006444096565247,
"ce_loss_26": 2.5205237597227095,
"ce_loss_39": 2.282972750067711,
"ce_loss_52": 1.4260737136006356,
"ce_loss_7": 3.5821855068206787,
"epoch": 0.278,
"grad_norm": 17.549429821325518,
"kl_loss_13": 3649.2,
"kl_loss_26": 2206.8,
"kl_loss_39": 1708.8,
"kl_loss_7": 4441.2,
"learning_rate": 0.0008298219961566008,
"loss": 5976.9,
"step": 2780
},
{
"ce_loss_13": 3.1831609129905702,
"ce_loss_26": 2.4997926205396652,
"ce_loss_39": 2.2601052969694138,
"ce_loss_52": 1.403978604078293,
"ce_loss_7": 3.553097301721573,
"epoch": 0.279,
"grad_norm": 17.798788155683486,
"kl_loss_13": 3626.8,
"kl_loss_26": 2180.8,
"kl_loss_39": 1685.6,
"kl_loss_7": 4416.0,
"learning_rate": 0.0008286278362039527,
"loss": 5901.3,
"step": 2790
},
{
"ce_loss_13": 3.2061972856521606,
"ce_loss_26": 2.5348829567432403,
"ce_loss_39": 2.29689359664917,
"ce_loss_52": 1.4524233996868134,
"ce_loss_7": 3.576086735725403,
"epoch": 0.28,
"grad_norm": 17.920526019853103,
"kl_loss_13": 3566.0,
"kl_loss_26": 2145.8,
"kl_loss_39": 1659.0,
"kl_loss_7": 4341.6,
"learning_rate": 0.0008274303669726426,
"loss": 5875.7,
"step": 2800
},
{
"ce_loss_13": 3.2895997047424315,
"ce_loss_26": 2.6019150614738464,
"ce_loss_39": 2.3554980546236037,
"ce_loss_52": 1.4626183837652207,
"ce_loss_7": 3.680211156606674,
"epoch": 0.281,
"grad_norm": 18.339396286679403,
"kl_loss_13": 3746.0,
"kl_loss_26": 2276.4,
"kl_loss_39": 1760.4,
"kl_loss_7": 4566.0,
"learning_rate": 0.0008262296005211721,
"loss": 5938.8,
"step": 2810
},
{
"ce_loss_13": 3.1840124845504763,
"ce_loss_26": 2.5105081349611282,
"ce_loss_39": 2.2711715549230576,
"ce_loss_52": 1.4384785890579224,
"ce_loss_7": 3.5735177397727966,
"epoch": 0.282,
"grad_norm": 17.54448826978921,
"kl_loss_13": 3568.4,
"kl_loss_26": 2126.0,
"kl_loss_39": 1635.8,
"kl_loss_7": 4378.4,
"learning_rate": 0.0008250255489412463,
"loss": 5922.5,
"step": 2820
},
{
"ce_loss_13": 3.2147216141223907,
"ce_loss_26": 2.5306088238954545,
"ce_loss_39": 2.2879262059926986,
"ce_loss_52": 1.4329912930727005,
"ce_loss_7": 3.6062001168727873,
"epoch": 0.283,
"grad_norm": 16.224521909183913,
"kl_loss_13": 3634.4,
"kl_loss_26": 2182.2,
"kl_loss_39": 1686.4,
"kl_loss_7": 4456.0,
"learning_rate": 0.0008238182243576511,
"loss": 5870.9,
"step": 2830
},
{
"ce_loss_13": 3.204808014631271,
"ce_loss_26": 2.529933416843414,
"ce_loss_39": 2.2772836655378343,
"ce_loss_52": 1.4382712185382842,
"ce_loss_7": 3.593477213382721,
"epoch": 0.284,
"grad_norm": 17.85408893335792,
"kl_loss_13": 3598.4,
"kl_loss_26": 2175.4,
"kl_loss_39": 1665.8,
"kl_loss_7": 4413.6,
"learning_rate": 0.0008226076389281315,
"loss": 5857.1,
"step": 2840
},
{
"ce_loss_13": 3.137856882810593,
"ce_loss_26": 2.4886497616767884,
"ce_loss_39": 2.2476108491420748,
"ce_loss_52": 1.4388306617736817,
"ce_loss_7": 3.5120018839836122,
"epoch": 0.285,
"grad_norm": 17.527104476469322,
"kl_loss_13": 3468.4,
"kl_loss_26": 2085.8,
"kl_loss_39": 1594.4,
"kl_loss_7": 4250.0,
"learning_rate": 0.0008213938048432696,
"loss": 5806.8,
"step": 2850
},
{
"ce_loss_13": 3.134007251262665,
"ce_loss_26": 2.4605695813894273,
"ce_loss_39": 2.2274533331394197,
"ce_loss_52": 1.4070687741041183,
"ce_loss_7": 3.51087743639946,
"epoch": 0.286,
"grad_norm": 16.81428229570175,
"kl_loss_13": 3532.8,
"kl_loss_26": 2101.0,
"kl_loss_39": 1616.2,
"kl_loss_7": 4327.2,
"learning_rate": 0.0008201767343263612,
"loss": 5809.7,
"step": 2860
},
{
"ce_loss_13": 3.1623673915863035,
"ce_loss_26": 2.497657111287117,
"ce_loss_39": 2.2517473757267,
"ce_loss_52": 1.4269344687461853,
"ce_loss_7": 3.543222689628601,
"epoch": 0.287,
"grad_norm": 16.70636713163243,
"kl_loss_13": 3541.2,
"kl_loss_26": 2128.4,
"kl_loss_39": 1624.8,
"kl_loss_7": 4340.0,
"learning_rate": 0.0008189564396332927,
"loss": 5789.5,
"step": 2870
},
{
"ce_loss_13": 3.1600242078304293,
"ce_loss_26": 2.4901143670082093,
"ce_loss_39": 2.262405735254288,
"ce_loss_52": 1.447944176197052,
"ce_loss_7": 3.5380080163478853,
"epoch": 0.288,
"grad_norm": 17.918386343579222,
"kl_loss_13": 3481.6,
"kl_loss_26": 2059.8,
"kl_loss_39": 1588.8,
"kl_loss_7": 4287.6,
"learning_rate": 0.0008177329330524181,
"loss": 5812.1,
"step": 2880
},
{
"ce_loss_13": 3.174392342567444,
"ce_loss_26": 2.5159962266683578,
"ce_loss_39": 2.2794058710336684,
"ce_loss_52": 1.444807243347168,
"ce_loss_7": 3.5484564363956452,
"epoch": 0.289,
"grad_norm": 19.196721110708168,
"kl_loss_13": 3507.2,
"kl_loss_26": 2102.6,
"kl_loss_39": 1624.0,
"kl_loss_7": 4290.8,
"learning_rate": 0.0008165062269044352,
"loss": 5808.4,
"step": 2890
},
{
"ce_loss_13": 3.1698272943496706,
"ce_loss_26": 2.506984257698059,
"ce_loss_39": 2.2710763216018677,
"ce_loss_52": 1.4491374969482422,
"ce_loss_7": 3.553741979598999,
"epoch": 0.29,
"grad_norm": 16.91235339946444,
"kl_loss_13": 3511.2,
"kl_loss_26": 2109.2,
"kl_loss_39": 1627.2,
"kl_loss_7": 4317.6,
"learning_rate": 0.0008152763335422613,
"loss": 5792.2,
"step": 2900
},
{
"ce_loss_13": 3.115260285139084,
"ce_loss_26": 2.4620468825101853,
"ce_loss_39": 2.2224705785512926,
"ce_loss_52": 1.4216517835855484,
"ce_loss_7": 3.49349564909935,
"epoch": 0.291,
"grad_norm": 19.52390820528971,
"kl_loss_13": 3444.4,
"kl_loss_26": 2058.8,
"kl_loss_39": 1575.2,
"kl_loss_7": 4239.2,
"learning_rate": 0.0008140432653509088,
"loss": 5744.6,
"step": 2910
},
{
"ce_loss_13": 3.094448319077492,
"ce_loss_26": 2.4242703199386595,
"ce_loss_39": 2.1858216524124146,
"ce_loss_52": 1.4000759646296501,
"ce_loss_7": 3.4697431921958923,
"epoch": 0.292,
"grad_norm": 16.436813971821554,
"kl_loss_13": 3432.0,
"kl_loss_26": 2027.8,
"kl_loss_39": 1551.4,
"kl_loss_7": 4216.4,
"learning_rate": 0.0008128070347473608,
"loss": 5696.7,
"step": 2920
},
{
"ce_loss_13": 3.1132335126399995,
"ce_loss_26": 2.4610098242759704,
"ce_loss_39": 2.2259542405605317,
"ce_loss_52": 1.4294333100318908,
"ce_loss_7": 3.48697971701622,
"epoch": 0.293,
"grad_norm": 16.527383629623685,
"kl_loss_13": 3448.4,
"kl_loss_26": 2043.0,
"kl_loss_39": 1562.4,
"kl_loss_7": 4237.2,
"learning_rate": 0.0008115676541804455,
"loss": 5734.0,
"step": 2930
},
{
"ce_loss_13": 3.0772423684597014,
"ce_loss_26": 2.413277891278267,
"ce_loss_39": 2.179169711470604,
"ce_loss_52": 1.3933877736330031,
"ce_loss_7": 3.4470324754714965,
"epoch": 0.294,
"grad_norm": 17.18835839034016,
"kl_loss_13": 3431.6,
"kl_loss_26": 2023.0,
"kl_loss_39": 1542.8,
"kl_loss_7": 4223.6,
"learning_rate": 0.0008103251361307119,
"loss": 5705.55,
"step": 2940
},
{
"ce_loss_13": 3.093912643194199,
"ce_loss_26": 2.4372138679027557,
"ce_loss_39": 2.2068597853183745,
"ce_loss_52": 1.4339108556509017,
"ce_loss_7": 3.4713816404342652,
"epoch": 0.295,
"grad_norm": 16.78852919489637,
"kl_loss_13": 3395.2,
"kl_loss_26": 1998.6,
"kl_loss_39": 1533.6,
"kl_loss_7": 4186.4,
"learning_rate": 0.0008090794931103026,
"loss": 5641.3,
"step": 2950
},
{
"ce_loss_13": 3.1308993637561797,
"ce_loss_26": 2.4708112478256226,
"ce_loss_39": 2.22915124297142,
"ce_loss_52": 1.4374129235744477,
"ce_loss_7": 3.4991187393665313,
"epoch": 0.296,
"grad_norm": 16.768783545647338,
"kl_loss_13": 3436.4,
"kl_loss_26": 2035.0,
"kl_loss_39": 1548.6,
"kl_loss_7": 4216.8,
"learning_rate": 0.0008078307376628291,
"loss": 5645.6,
"step": 2960
},
{
"ce_loss_13": 3.0990765929222106,
"ce_loss_26": 2.452856171131134,
"ce_loss_39": 2.21165874004364,
"ce_loss_52": 1.4263496309518815,
"ce_loss_7": 3.472361743450165,
"epoch": 0.297,
"grad_norm": 16.652386900592916,
"kl_loss_13": 3401.6,
"kl_loss_26": 2024.0,
"kl_loss_39": 1532.8,
"kl_loss_7": 4181.6,
"learning_rate": 0.000806578882363245,
"loss": 5645.6,
"step": 2970
},
{
"ce_loss_13": 3.092843067646027,
"ce_loss_26": 2.423402965068817,
"ce_loss_39": 2.1854313611984253,
"ce_loss_52": 1.4033612102270125,
"ce_loss_7": 3.474502944946289,
"epoch": 0.298,
"grad_norm": 17.707727518863294,
"kl_loss_13": 3432.4,
"kl_loss_26": 2016.2,
"kl_loss_39": 1530.6,
"kl_loss_7": 4229.2,
"learning_rate": 0.0008053239398177191,
"loss": 5651.6,
"step": 2980
},
{
"ce_loss_13": 3.0975674211978914,
"ce_loss_26": 2.4374621868133546,
"ce_loss_39": 2.209611228108406,
"ce_loss_52": 1.4303795397281647,
"ce_loss_7": 3.475492590665817,
"epoch": 0.299,
"grad_norm": 17.746850084884183,
"kl_loss_13": 3416.8,
"kl_loss_26": 2002.4,
"kl_loss_39": 1530.8,
"kl_loss_7": 4214.8,
"learning_rate": 0.0008040659226635089,
"loss": 5630.2,
"step": 2990
},
{
"ce_loss_13": 3.089507430791855,
"ce_loss_26": 2.417942848801613,
"ce_loss_39": 2.1790529817342756,
"ce_loss_52": 1.3988826781511308,
"ce_loss_7": 3.468764144182205,
"epoch": 0.3,
"grad_norm": 17.0619103802906,
"kl_loss_13": 3422.4,
"kl_loss_26": 2019.4,
"kl_loss_39": 1533.2,
"kl_loss_7": 4215.2,
"learning_rate": 0.0008028048435688333,
"loss": 5562.8,
"step": 3000
},
{
"ce_loss_13": 3.1326099216938017,
"ce_loss_26": 2.471994733810425,
"ce_loss_39": 2.2380873382091524,
"ce_loss_52": 1.458441223204136,
"ce_loss_7": 3.502814435958862,
"epoch": 0.301,
"grad_norm": 17.222204651935485,
"kl_loss_13": 3436.8,
"kl_loss_26": 2039.6,
"kl_loss_39": 1559.8,
"kl_loss_7": 4214.8,
"learning_rate": 0.0008015407152327448,
"loss": 5664.1,
"step": 3010
},
{
"ce_loss_13": 3.1731098294258118,
"ce_loss_26": 2.504976212978363,
"ce_loss_39": 2.260189512372017,
"ce_loss_52": 1.4524748474359512,
"ce_loss_7": 3.555993539094925,
"epoch": 0.302,
"grad_norm": 16.72113952146357,
"kl_loss_13": 3518.4,
"kl_loss_26": 2087.0,
"kl_loss_39": 1585.0,
"kl_loss_7": 4313.2,
"learning_rate": 0.0008002735503850016,
"loss": 5589.3,
"step": 3020
},
{
"ce_loss_13": 3.118546891212463,
"ce_loss_26": 2.445932698249817,
"ce_loss_39": 2.2121699869632723,
"ce_loss_52": 1.4456641212105752,
"ce_loss_7": 3.491698741912842,
"epoch": 0.303,
"grad_norm": 16.572401194454876,
"kl_loss_13": 3394.8,
"kl_loss_26": 1984.4,
"kl_loss_39": 1511.2,
"kl_loss_7": 4167.2,
"learning_rate": 0.0007990033617859396,
"loss": 5580.5,
"step": 3030
},
{
"ce_loss_13": 3.10419015288353,
"ce_loss_26": 2.447344717383385,
"ce_loss_39": 2.212172231078148,
"ce_loss_52": 1.435505247116089,
"ce_loss_7": 3.477084743976593,
"epoch": 0.304,
"grad_norm": 18.041012229096975,
"kl_loss_13": 3420.8,
"kl_loss_26": 2017.0,
"kl_loss_39": 1525.6,
"kl_loss_7": 4208.8,
"learning_rate": 0.000797730162226344,
"loss": 5556.8,
"step": 3040
},
{
"ce_loss_13": 3.0508037239313124,
"ce_loss_26": 2.3910141468048094,
"ce_loss_39": 2.161831411719322,
"ce_loss_52": 1.3925497516989709,
"ce_loss_7": 3.434678375720978,
"epoch": 0.305,
"grad_norm": 18.2507403656971,
"kl_loss_13": 3360.8,
"kl_loss_26": 1961.2,
"kl_loss_39": 1491.3,
"kl_loss_7": 4164.0,
"learning_rate": 0.0007964539645273203,
"loss": 5538.7,
"step": 3050
},
{
"ce_loss_13": 3.143118643760681,
"ce_loss_26": 2.503050500154495,
"ce_loss_39": 2.269149711728096,
"ce_loss_52": 1.4877858996391295,
"ce_loss_7": 3.510826712846756,
"epoch": 0.306,
"grad_norm": 17.20244258760552,
"kl_loss_13": 3390.4,
"kl_loss_26": 2023.0,
"kl_loss_39": 1542.0,
"kl_loss_7": 4166.8,
"learning_rate": 0.000795174781540165,
"loss": 5547.7,
"step": 3060
},
{
"ce_loss_13": 3.0940939664840696,
"ce_loss_26": 2.4377844393253327,
"ce_loss_39": 2.2110770642757416,
"ce_loss_52": 1.453443130850792,
"ce_loss_7": 3.461427628993988,
"epoch": 0.307,
"grad_norm": 16.19915727951882,
"kl_loss_13": 3353.6,
"kl_loss_26": 1960.2,
"kl_loss_39": 1487.5,
"kl_loss_7": 4131.2,
"learning_rate": 0.0007938926261462366,
"loss": 5534.3,
"step": 3070
},
{
"ce_loss_13": 3.099123537540436,
"ce_loss_26": 2.428412067890167,
"ce_loss_39": 2.1952255785465242,
"ce_loss_52": 1.4312659561634065,
"ce_loss_7": 3.477120190858841,
"epoch": 0.308,
"grad_norm": 16.90807792363964,
"kl_loss_13": 3392.0,
"kl_loss_26": 1979.6,
"kl_loss_39": 1498.8,
"kl_loss_7": 4186.8,
"learning_rate": 0.0007926075112568258,
"loss": 5523.9,
"step": 3080
},
{
"ce_loss_13": 3.0900339841842652,
"ce_loss_26": 2.429807424545288,
"ce_loss_39": 2.1938013613224028,
"ce_loss_52": 1.4448837220668793,
"ce_loss_7": 3.460611253976822,
"epoch": 0.309,
"grad_norm": 17.278764306039758,
"kl_loss_13": 3363.2,
"kl_loss_26": 1959.6,
"kl_loss_39": 1476.2,
"kl_loss_7": 4153.6,
"learning_rate": 0.0007913194498130252,
"loss": 5481.8,
"step": 3090
},
{
"ce_loss_13": 3.0705978155136107,
"ce_loss_26": 2.415473333001137,
"ce_loss_39": 2.1859600633382796,
"ce_loss_52": 1.4387344419956207,
"ce_loss_7": 3.4460779249668123,
"epoch": 0.31,
"grad_norm": 17.744380665090848,
"kl_loss_13": 3328.8,
"kl_loss_26": 1937.2,
"kl_loss_39": 1461.6,
"kl_loss_7": 4109.6,
"learning_rate": 0.0007900284547855992,
"loss": 5494.5,
"step": 3100
},
{
"ce_loss_13": 3.1120304703712462,
"ce_loss_26": 2.437400758266449,
"ce_loss_39": 2.1967616409063337,
"ce_loss_52": 1.4447504609823227,
"ce_loss_7": 3.4870758295059203,
"epoch": 0.311,
"grad_norm": 17.196298155702294,
"kl_loss_13": 3414.8,
"kl_loss_26": 1997.2,
"kl_loss_39": 1507.4,
"kl_loss_7": 4197.6,
"learning_rate": 0.0007887345391748532,
"loss": 5492.4,
"step": 3110
},
{
"ce_loss_13": 3.0714461147785186,
"ce_loss_26": 2.410064917802811,
"ce_loss_39": 2.176686418056488,
"ce_loss_52": 1.4264434427022934,
"ce_loss_7": 3.440903478860855,
"epoch": 0.312,
"grad_norm": 17.274723589437542,
"kl_loss_13": 3352.8,
"kl_loss_26": 1965.6,
"kl_loss_39": 1480.0,
"kl_loss_7": 4134.0,
"learning_rate": 0.0007874377160105036,
"loss": 5478.5,
"step": 3120
},
{
"ce_loss_13": 3.0896170139312744,
"ce_loss_26": 2.440164825320244,
"ce_loss_39": 2.2094400197267534,
"ce_loss_52": 1.4495170325040818,
"ce_loss_7": 3.468176656961441,
"epoch": 0.313,
"grad_norm": 17.297623478505614,
"kl_loss_13": 3375.2,
"kl_loss_26": 1976.4,
"kl_loss_39": 1503.8,
"kl_loss_7": 4173.6,
"learning_rate": 0.0007861379983515449,
"loss": 5461.8,
"step": 3130
},
{
"ce_loss_13": 3.074652445316315,
"ce_loss_26": 2.4188932478427887,
"ce_loss_39": 2.1893287271261217,
"ce_loss_52": 1.440689930319786,
"ce_loss_7": 3.452376401424408,
"epoch": 0.314,
"grad_norm": 18.06264725868681,
"kl_loss_13": 3317.2,
"kl_loss_26": 1935.6,
"kl_loss_39": 1465.8,
"kl_loss_7": 4117.6,
"learning_rate": 0.0007848353992861195,
"loss": 5464.6,
"step": 3140
},
{
"ce_loss_13": 3.078055852651596,
"ce_loss_26": 2.420557659864426,
"ce_loss_39": 2.188734245300293,
"ce_loss_52": 1.4398296728730202,
"ce_loss_7": 3.457850754261017,
"epoch": 0.315,
"grad_norm": 17.00206014366557,
"kl_loss_13": 3318.4,
"kl_loss_26": 1927.6,
"kl_loss_39": 1458.6,
"kl_loss_7": 4114.0,
"learning_rate": 0.0007835299319313853,
"loss": 5381.3,
"step": 3150
},
{
"ce_loss_13": 3.059806948900223,
"ce_loss_26": 2.3743002265691757,
"ce_loss_39": 2.137469917535782,
"ce_loss_52": 1.3903418719768523,
"ce_loss_7": 3.4285161972045897,
"epoch": 0.316,
"grad_norm": 17.46330734643716,
"kl_loss_13": 3354.4,
"kl_loss_26": 1935.6,
"kl_loss_39": 1455.2,
"kl_loss_7": 4136.8,
"learning_rate": 0.0007822216094333848,
"loss": 5407.6,
"step": 3160
},
{
"ce_loss_13": 3.0990252554416657,
"ce_loss_26": 2.4341968923807142,
"ce_loss_39": 2.1952569454908373,
"ce_loss_52": 1.439093704521656,
"ce_loss_7": 3.4743688821792604,
"epoch": 0.317,
"grad_norm": 18.51989341882003,
"kl_loss_13": 3380.4,
"kl_loss_26": 1988.0,
"kl_loss_39": 1492.0,
"kl_loss_7": 4166.4,
"learning_rate": 0.0007809104449669101,
"loss": 5410.7,
"step": 3170
},
{
"ce_loss_13": 3.041397601366043,
"ce_loss_26": 2.382528102397919,
"ce_loss_39": 2.1511587262153626,
"ce_loss_52": 1.435599946975708,
"ce_loss_7": 3.4155047237873077,
"epoch": 0.318,
"grad_norm": 17.0483845332083,
"kl_loss_13": 3256.8,
"kl_loss_26": 1868.0,
"kl_loss_39": 1392.8,
"kl_loss_7": 4038.0,
"learning_rate": 0.0007795964517353734,
"loss": 5354.9,
"step": 3180
},
{
"ce_loss_13": 3.0867488861083983,
"ce_loss_26": 2.425813916325569,
"ce_loss_39": 2.1947717368602753,
"ce_loss_52": 1.4569276213645934,
"ce_loss_7": 3.4604012250900267,
"epoch": 0.319,
"grad_norm": 16.586058341187346,
"kl_loss_13": 3328.8,
"kl_loss_26": 1930.2,
"kl_loss_39": 1449.8,
"kl_loss_7": 4120.8,
"learning_rate": 0.000778279642970672,
"loss": 5344.7,
"step": 3190
},
{
"ce_loss_13": 3.0399708569049837,
"ce_loss_26": 2.3785893470048904,
"ce_loss_39": 2.138497656583786,
"ce_loss_52": 1.4135656535625458,
"ce_loss_7": 3.409178429841995,
"epoch": 0.32,
"grad_norm": 17.983547533058992,
"kl_loss_13": 3304.0,
"kl_loss_26": 1914.8,
"kl_loss_39": 1428.2,
"kl_loss_7": 4088.0,
"learning_rate": 0.0007769600319330552,
"loss": 5362.0,
"step": 3200
},
{
"ce_loss_13": 3.1184182286262514,
"ce_loss_26": 2.4707882523536684,
"ce_loss_39": 2.233083599805832,
"ce_loss_52": 1.4817634999752045,
"ce_loss_7": 3.4818074285984038,
"epoch": 0.321,
"grad_norm": 16.87308179941231,
"kl_loss_13": 3315.6,
"kl_loss_26": 1955.6,
"kl_loss_39": 1470.6,
"kl_loss_7": 4081.2,
"learning_rate": 0.0007756376319109917,
"loss": 5372.8,
"step": 3210
},
{
"ce_loss_13": 3.065703272819519,
"ce_loss_26": 2.4234554558992385,
"ce_loss_39": 2.1948377937078476,
"ce_loss_52": 1.4451974362134934,
"ce_loss_7": 3.4377165257930757,
"epoch": 0.322,
"grad_norm": 17.206055177859767,
"kl_loss_13": 3288.4,
"kl_loss_26": 1929.2,
"kl_loss_39": 1461.2,
"kl_loss_7": 4058.8,
"learning_rate": 0.0007743124562210351,
"loss": 5338.3,
"step": 3220
},
{
"ce_loss_13": 3.0654458463191987,
"ce_loss_26": 2.4097089529037476,
"ce_loss_39": 2.1828925907611847,
"ce_loss_52": 1.4646209165453912,
"ce_loss_7": 3.438837933540344,
"epoch": 0.323,
"grad_norm": 16.43379975440051,
"kl_loss_13": 3246.8,
"kl_loss_26": 1868.4,
"kl_loss_39": 1403.6,
"kl_loss_7": 4034.4,
"learning_rate": 0.0007729845182076895,
"loss": 5337.95,
"step": 3230
},
{
"ce_loss_13": 3.019025903940201,
"ce_loss_26": 2.3812606751918795,
"ce_loss_39": 2.1557460606098173,
"ce_loss_52": 1.4495598763227462,
"ce_loss_7": 3.390954166650772,
"epoch": 0.324,
"grad_norm": 17.50409394447208,
"kl_loss_13": 3213.2,
"kl_loss_26": 1857.6,
"kl_loss_39": 1393.2,
"kl_loss_7": 3999.2,
"learning_rate": 0.0007716538312432765,
"loss": 5323.8,
"step": 3240
},
{
"ce_loss_13": 3.0274185359478,
"ce_loss_26": 2.3673853039741517,
"ce_loss_39": 2.1314821422100065,
"ce_loss_52": 1.4110743701457977,
"ce_loss_7": 3.3962999522686004,
"epoch": 0.325,
"grad_norm": 17.627956174954214,
"kl_loss_13": 3281.2,
"kl_loss_26": 1900.8,
"kl_loss_39": 1419.0,
"kl_loss_7": 4056.8,
"learning_rate": 0.0007703204087277988,
"loss": 5310.9,
"step": 3250
},
{
"ce_loss_13": 2.995425891876221,
"ce_loss_26": 2.3437940657138823,
"ce_loss_39": 2.1112417429685593,
"ce_loss_52": 1.3956570625305176,
"ce_loss_7": 3.3635079681873323,
"epoch": 0.326,
"grad_norm": 17.092527088154757,
"kl_loss_13": 3270.8,
"kl_loss_26": 1881.4,
"kl_loss_39": 1402.8,
"kl_loss_7": 4054.4,
"learning_rate": 0.0007689842640888063,
"loss": 5291.9,
"step": 3260
},
{
"ce_loss_13": 3.0584332168102266,
"ce_loss_26": 2.4099347323179243,
"ce_loss_39": 2.182457607984543,
"ce_loss_52": 1.4547152355313302,
"ce_loss_7": 3.4290026843547823,
"epoch": 0.327,
"grad_norm": 17.31361789139729,
"kl_loss_13": 3256.0,
"kl_loss_26": 1887.8,
"kl_loss_39": 1426.4,
"kl_loss_7": 4026.4,
"learning_rate": 0.0007676454107812607,
"loss": 5264.3,
"step": 3270
},
{
"ce_loss_13": 3.002471148967743,
"ce_loss_26": 2.365675774216652,
"ce_loss_39": 2.1432500898838045,
"ce_loss_52": 1.4313921973109245,
"ce_loss_7": 3.3743964791297913,
"epoch": 0.328,
"grad_norm": 15.793768401685108,
"kl_loss_13": 3248.4,
"kl_loss_26": 1868.8,
"kl_loss_39": 1414.0,
"kl_loss_7": 4030.8,
"learning_rate": 0.0007663038622873999,
"loss": 5285.1,
"step": 3280
},
{
"ce_loss_13": 3.0830911457538606,
"ce_loss_26": 2.4269310742616654,
"ce_loss_39": 2.2025650680065154,
"ce_loss_52": 1.4690157890319824,
"ce_loss_7": 3.454869121313095,
"epoch": 0.329,
"grad_norm": 17.14503971571328,
"kl_loss_13": 3293.2,
"kl_loss_26": 1903.8,
"kl_loss_39": 1434.0,
"kl_loss_7": 4075.6,
"learning_rate": 0.0007649596321166025,
"loss": 5253.65,
"step": 3290
},
{
"ce_loss_13": 2.9723230481147764,
"ce_loss_26": 2.332389995455742,
"ce_loss_39": 2.1094966679811478,
"ce_loss_52": 1.4339970767498016,
"ce_loss_7": 3.333187943696976,
"epoch": 0.33,
"grad_norm": 16.633040610576913,
"kl_loss_13": 3118.0,
"kl_loss_26": 1779.4,
"kl_loss_39": 1323.0,
"kl_loss_7": 3879.6,
"learning_rate": 0.0007636127338052513,
"loss": 5233.1,
"step": 3300
},
{
"ce_loss_13": 2.9914496004581452,
"ce_loss_26": 2.3316532552242277,
"ce_loss_39": 2.0988477796316145,
"ce_loss_52": 1.400558878481388,
"ce_loss_7": 3.363678741455078,
"epoch": 0.331,
"grad_norm": 17.26677566687906,
"kl_loss_13": 3257.2,
"kl_loss_26": 1858.4,
"kl_loss_39": 1375.2,
"kl_loss_7": 4042.8,
"learning_rate": 0.0007622631809165971,
"loss": 5196.15,
"step": 3310
},
{
"ce_loss_13": 3.064756464958191,
"ce_loss_26": 2.422107365727425,
"ce_loss_39": 2.192085716128349,
"ce_loss_52": 1.4793070062994957,
"ce_loss_7": 3.430086314678192,
"epoch": 0.332,
"grad_norm": 17.330178950251707,
"kl_loss_13": 3243.6,
"kl_loss_26": 1878.8,
"kl_loss_39": 1409.8,
"kl_loss_7": 4016.4,
"learning_rate": 0.000760910987040623,
"loss": 5231.55,
"step": 3320
},
{
"ce_loss_13": 2.9637326538562774,
"ce_loss_26": 2.3162154614925385,
"ce_loss_39": 2.097182759642601,
"ce_loss_52": 1.4222271725535394,
"ce_loss_7": 3.332917684316635,
"epoch": 0.333,
"grad_norm": 17.085141982864975,
"kl_loss_13": 3141.2,
"kl_loss_26": 1768.8,
"kl_loss_39": 1318.3,
"kl_loss_7": 3920.0,
"learning_rate": 0.000759556165793906,
"loss": 5154.65,
"step": 3330
},
{
"ce_loss_13": 3.029485374689102,
"ce_loss_26": 2.3887200921773912,
"ce_loss_39": 2.1607041716575623,
"ce_loss_52": 1.4696623742580415,
"ce_loss_7": 3.3922773957252503,
"epoch": 0.334,
"grad_norm": 15.502678294546826,
"kl_loss_13": 3185.2,
"kl_loss_26": 1826.8,
"kl_loss_39": 1360.6,
"kl_loss_7": 3948.8,
"learning_rate": 0.000758198730819481,
"loss": 5180.15,
"step": 3340
},
{
"ce_loss_13": 3.03846270442009,
"ce_loss_26": 2.378660023212433,
"ce_loss_39": 2.1477699905633925,
"ce_loss_52": 1.4347332805395125,
"ce_loss_7": 3.4101031959056853,
"epoch": 0.335,
"grad_norm": 16.024720398541927,
"kl_loss_13": 3270.4,
"kl_loss_26": 1883.0,
"kl_loss_39": 1406.3,
"kl_loss_7": 4056.4,
"learning_rate": 0.0007568386957867032,
"loss": 5194.3,
"step": 3350
},
{
"ce_loss_13": 3.008663833141327,
"ce_loss_26": 2.3700191140174867,
"ce_loss_39": 2.1397975504398348,
"ce_loss_52": 1.4545943021774292,
"ce_loss_7": 3.3673401892185213,
"epoch": 0.336,
"grad_norm": 16.053283356705972,
"kl_loss_13": 3181.6,
"kl_loss_26": 1820.2,
"kl_loss_39": 1348.6,
"kl_loss_7": 3942.0,
"learning_rate": 0.0007554760743911103,
"loss": 5153.55,
"step": 3360
},
{
"ce_loss_13": 2.9829940140247344,
"ce_loss_26": 2.3409414261579515,
"ce_loss_39": 2.1123215198516845,
"ce_loss_52": 1.435232725739479,
"ce_loss_7": 3.355481207370758,
"epoch": 0.337,
"grad_norm": 16.67714591780792,
"kl_loss_13": 3152.4,
"kl_loss_26": 1792.8,
"kl_loss_39": 1321.8,
"kl_loss_7": 3938.0,
"learning_rate": 0.0007541108803542846,
"loss": 5142.8,
"step": 3370
},
{
"ce_loss_13": 3.027516704797745,
"ce_loss_26": 2.386495107412338,
"ce_loss_39": 2.1608838021755217,
"ce_loss_52": 1.4647808492183685,
"ce_loss_7": 3.3936978697776796,
"epoch": 0.338,
"grad_norm": 16.879054589986723,
"kl_loss_13": 3186.4,
"kl_loss_26": 1828.6,
"kl_loss_39": 1351.9,
"kl_loss_7": 3957.6,
"learning_rate": 0.0007527431274237149,
"loss": 5169.0,
"step": 3380
},
{
"ce_loss_13": 2.9946301877498627,
"ce_loss_26": 2.3546741545200347,
"ce_loss_39": 2.1238624840974807,
"ce_loss_52": 1.4416128873825074,
"ce_loss_7": 3.3628919243812563,
"epoch": 0.339,
"grad_norm": 18.526031342338438,
"kl_loss_13": 3157.6,
"kl_loss_26": 1810.6,
"kl_loss_39": 1339.1,
"kl_loss_7": 3932.0,
"learning_rate": 0.0007513728293726579,
"loss": 5107.45,
"step": 3390
},
{
"ce_loss_13": 2.975279802083969,
"ce_loss_26": 2.325047069787979,
"ce_loss_39": 2.1066873967647552,
"ce_loss_52": 1.4382890224456788,
"ce_loss_7": 3.3415417432785035,
"epoch": 0.34,
"grad_norm": 17.516441880993753,
"kl_loss_13": 3144.4,
"kl_loss_26": 1778.8,
"kl_loss_39": 1326.8,
"kl_loss_7": 3920.0,
"learning_rate": 0.00075,
"loss": 5107.0,
"step": 3400
},
{
"ce_loss_13": 2.9631440460681917,
"ce_loss_26": 2.318173348903656,
"ce_loss_39": 2.0847090512514113,
"ce_loss_52": 1.4200364857912064,
"ce_loss_7": 3.3265232741832733,
"epoch": 0.341,
"grad_norm": 16.134403370320147,
"kl_loss_13": 3139.2,
"kl_loss_26": 1766.8,
"kl_loss_39": 1296.8,
"kl_loss_7": 3908.8,
"learning_rate": 0.0007486246531301177,
"loss": 5097.65,
"step": 3410
},
{
"ce_loss_13": 2.99331476688385,
"ce_loss_26": 2.3580960750579836,
"ce_loss_39": 2.1382109016180038,
"ce_loss_52": 1.4535871922969819,
"ce_loss_7": 3.35606609582901,
"epoch": 0.342,
"grad_norm": 16.427769884918277,
"kl_loss_13": 3148.4,
"kl_loss_26": 1796.6,
"kl_loss_39": 1349.0,
"kl_loss_7": 3921.2,
"learning_rate": 0.0007472468026127384,
"loss": 5139.75,
"step": 3420
},
{
"ce_loss_13": 2.921882951259613,
"ce_loss_26": 2.2855687588453293,
"ce_loss_39": 2.067648893594742,
"ce_loss_52": 1.4116598561406135,
"ce_loss_7": 3.2927843034267426,
"epoch": 0.343,
"grad_norm": 16.808808535347794,
"kl_loss_13": 3095.8,
"kl_loss_26": 1756.2,
"kl_loss_39": 1301.9,
"kl_loss_7": 3878.0,
"learning_rate": 0.000745866462322802,
"loss": 5051.15,
"step": 3430
},
{
"ce_loss_13": 3.04699621796608,
"ce_loss_26": 2.399735540151596,
"ce_loss_39": 2.176995486021042,
"ce_loss_52": 1.496598380804062,
"ce_loss_7": 3.418045401573181,
"epoch": 0.344,
"grad_norm": 16.406856004421037,
"kl_loss_13": 3162.0,
"kl_loss_26": 1792.4,
"kl_loss_39": 1329.5,
"kl_loss_7": 3935.6,
"learning_rate": 0.0007444836461603195,
"loss": 5107.85,
"step": 3440
},
{
"ce_loss_13": 2.930357199907303,
"ce_loss_26": 2.3015194088220596,
"ce_loss_39": 2.078587147593498,
"ce_loss_52": 1.414345271885395,
"ce_loss_7": 3.2914562046527864,
"epoch": 0.345,
"grad_norm": 16.994168943169583,
"kl_loss_13": 3103.6,
"kl_loss_26": 1763.0,
"kl_loss_39": 1304.8,
"kl_loss_7": 3859.2,
"learning_rate": 0.0007430983680502344,
"loss": 5063.6,
"step": 3450
},
{
"ce_loss_13": 2.946500468254089,
"ce_loss_26": 2.3075433492660524,
"ce_loss_39": 2.08428935110569,
"ce_loss_52": 1.4254867061972618,
"ce_loss_7": 3.309797298908234,
"epoch": 0.346,
"grad_norm": 16.30494022816115,
"kl_loss_13": 3098.0,
"kl_loss_26": 1752.4,
"kl_loss_39": 1293.6,
"kl_loss_7": 3865.2,
"learning_rate": 0.0007417106419422819,
"loss": 5025.2,
"step": 3460
},
{
"ce_loss_13": 2.935671639442444,
"ce_loss_26": 2.2941204428672792,
"ce_loss_39": 2.071269851922989,
"ce_loss_52": 1.4085116267204285,
"ce_loss_7": 3.3008416891098022,
"epoch": 0.347,
"grad_norm": 17.30594652330039,
"kl_loss_13": 3130.4,
"kl_loss_26": 1779.4,
"kl_loss_39": 1316.9,
"kl_loss_7": 3898.4,
"learning_rate": 0.0007403204818108486,
"loss": 5043.95,
"step": 3470
},
{
"ce_loss_13": 2.934108853340149,
"ce_loss_26": 2.292804607748985,
"ce_loss_39": 2.0716417878866196,
"ce_loss_52": 1.4156519144773483,
"ce_loss_7": 3.2983541190624237,
"epoch": 0.348,
"grad_norm": 16.793884082475312,
"kl_loss_13": 3072.0,
"kl_loss_26": 1727.6,
"kl_loss_39": 1271.6,
"kl_loss_7": 3841.6,
"learning_rate": 0.0007389279016548316,
"loss": 5016.3,
"step": 3480
},
{
"ce_loss_13": 2.8679963111877442,
"ce_loss_26": 2.2394334375858307,
"ce_loss_39": 2.0239282071590425,
"ce_loss_52": 1.3932767808437347,
"ce_loss_7": 3.234779417514801,
"epoch": 0.349,
"grad_norm": 15.946114217155069,
"kl_loss_13": 3023.6,
"kl_loss_26": 1691.8,
"kl_loss_39": 1246.9,
"kl_loss_7": 3788.8,
"learning_rate": 0.0007375329154974975,
"loss": 5018.15,
"step": 3490
},
{
"ce_loss_13": 2.9285870611667635,
"ce_loss_26": 2.287761977314949,
"ce_loss_39": 2.063358634710312,
"ce_loss_52": 1.4101893305778503,
"ce_loss_7": 3.298641562461853,
"epoch": 0.35,
"grad_norm": 16.997559975693584,
"kl_loss_13": 3086.0,
"kl_loss_26": 1746.0,
"kl_loss_39": 1280.7,
"kl_loss_7": 3868.8,
"learning_rate": 0.0007361355373863414,
"loss": 5018.5,
"step": 3500
},
{
"ce_loss_13": 2.931669169664383,
"ce_loss_26": 2.288780450820923,
"ce_loss_39": 2.0723241955041884,
"ce_loss_52": 1.4271863222122192,
"ce_loss_7": 3.2927916407585145,
"epoch": 0.351,
"grad_norm": 15.980672586392506,
"kl_loss_13": 3077.6,
"kl_loss_26": 1726.6,
"kl_loss_39": 1278.3,
"kl_loss_7": 3843.6,
"learning_rate": 0.0007347357813929454,
"loss": 4989.7,
"step": 3510
},
{
"ce_loss_13": 2.928689205646515,
"ce_loss_26": 2.2940947294235228,
"ce_loss_39": 2.0670880317687987,
"ce_loss_52": 1.4149780303239823,
"ce_loss_7": 3.293194830417633,
"epoch": 0.352,
"grad_norm": 16.33808700723808,
"kl_loss_13": 3074.8,
"kl_loss_26": 1738.8,
"kl_loss_39": 1273.7,
"kl_loss_7": 3840.0,
"learning_rate": 0.0007333336616128369,
"loss": 4986.35,
"step": 3520
},
{
"ce_loss_13": 2.940303909778595,
"ce_loss_26": 2.294164848327637,
"ce_loss_39": 2.0657031387090683,
"ce_loss_52": 1.4308805465698242,
"ce_loss_7": 3.3036282479763033,
"epoch": 0.353,
"grad_norm": 16.29503578207681,
"kl_loss_13": 3067.6,
"kl_loss_26": 1716.2,
"kl_loss_39": 1247.9,
"kl_loss_7": 3828.0,
"learning_rate": 0.0007319291921653463,
"loss": 4998.85,
"step": 3530
},
{
"ce_loss_13": 2.916954427957535,
"ce_loss_26": 2.2819162607192993,
"ce_loss_39": 2.0701166808605196,
"ce_loss_52": 1.4166931748390197,
"ce_loss_7": 3.2759189188480375,
"epoch": 0.354,
"grad_norm": 17.822669536905938,
"kl_loss_13": 3073.6,
"kl_loss_26": 1723.6,
"kl_loss_39": 1286.2,
"kl_loss_7": 3837.2,
"learning_rate": 0.0007305223871934656,
"loss": 4995.55,
"step": 3540
},
{
"ce_loss_13": 2.975371015071869,
"ce_loss_26": 2.3552831768989564,
"ce_loss_39": 2.1292835503816603,
"ce_loss_52": 1.4775378912687303,
"ce_loss_7": 3.3292273938655854,
"epoch": 0.355,
"grad_norm": 16.58742338810906,
"kl_loss_13": 3054.4,
"kl_loss_26": 1739.2,
"kl_loss_39": 1279.1,
"kl_loss_7": 3808.8,
"learning_rate": 0.0007291132608637052,
"loss": 4945.1,
"step": 3550
},
{
"ce_loss_13": 2.972325986623764,
"ce_loss_26": 2.3392420560121536,
"ce_loss_39": 2.1207040429115294,
"ce_loss_52": 1.475459137558937,
"ce_loss_7": 3.3383385837078094,
"epoch": 0.356,
"grad_norm": 16.383650902730043,
"kl_loss_13": 3066.4,
"kl_loss_26": 1712.2,
"kl_loss_39": 1260.8,
"kl_loss_7": 3838.8,
"learning_rate": 0.0007277018273659516,
"loss": 4963.15,
"step": 3560
},
{
"ce_loss_13": 3.0111460268497465,
"ce_loss_26": 2.3827701687812803,
"ce_loss_39": 2.1591351449489595,
"ce_loss_52": 1.4967001289129258,
"ce_loss_7": 3.3728690683841704,
"epoch": 0.357,
"grad_norm": 16.824381349061323,
"kl_loss_13": 3101.2,
"kl_loss_26": 1770.6,
"kl_loss_39": 1310.2,
"kl_loss_7": 3856.8,
"learning_rate": 0.0007262881009133242,
"loss": 4952.95,
"step": 3570
},
{
"ce_loss_13": 2.9141285896301268,
"ce_loss_26": 2.2805556029081346,
"ce_loss_39": 2.065216201543808,
"ce_loss_52": 1.4257875666022302,
"ce_loss_7": 3.282390242815018,
"epoch": 0.358,
"grad_norm": 18.058728297237614,
"kl_loss_13": 3046.4,
"kl_loss_26": 1695.0,
"kl_loss_39": 1247.4,
"kl_loss_7": 3819.6,
"learning_rate": 0.0007248720957420329,
"loss": 4964.9,
"step": 3580
},
{
"ce_loss_13": 2.8828001439571382,
"ce_loss_26": 2.253911817073822,
"ce_loss_39": 2.030216920375824,
"ce_loss_52": 1.40321164727211,
"ce_loss_7": 3.251304441690445,
"epoch": 0.359,
"grad_norm": 16.573678980597606,
"kl_loss_13": 3047.2,
"kl_loss_26": 1698.2,
"kl_loss_39": 1242.3,
"kl_loss_7": 3817.6,
"learning_rate": 0.0007234538261112341,
"loss": 4895.95,
"step": 3590
},
{
"ce_loss_13": 2.9461396992206574,
"ce_loss_26": 2.3021911144256593,
"ce_loss_39": 2.0822067111730576,
"ce_loss_52": 1.448357391357422,
"ce_loss_7": 3.3077784180641174,
"epoch": 0.36,
"grad_norm": 17.017357286641733,
"kl_loss_13": 3060.4,
"kl_loss_26": 1708.8,
"kl_loss_39": 1253.4,
"kl_loss_7": 3833.6,
"learning_rate": 0.0007220333063028871,
"loss": 4918.35,
"step": 3600
},
{
"ce_loss_13": 2.846253049373627,
"ce_loss_26": 2.224426531791687,
"ce_loss_39": 2.0076546490192415,
"ce_loss_52": 1.395447552204132,
"ce_loss_7": 3.2072394728660583,
"epoch": 0.361,
"grad_norm": 15.74697405086667,
"kl_loss_13": 2978.0,
"kl_loss_26": 1658.4,
"kl_loss_39": 1211.2,
"kl_loss_7": 3738.4,
"learning_rate": 0.0007206105506216106,
"loss": 4871.3,
"step": 3610
},
{
"ce_loss_13": 3.0099994122982023,
"ce_loss_26": 2.373449808359146,
"ce_loss_39": 2.1529267936944962,
"ce_loss_52": 1.4900053232908248,
"ce_loss_7": 3.3753599405288695,
"epoch": 0.362,
"grad_norm": 16.970809605735944,
"kl_loss_13": 3087.6,
"kl_loss_26": 1745.2,
"kl_loss_39": 1286.7,
"kl_loss_7": 3866.8,
"learning_rate": 0.0007191855733945387,
"loss": 4947.8,
"step": 3620
},
{
"ce_loss_13": 2.937187296152115,
"ce_loss_26": 2.322328266501427,
"ce_loss_39": 2.105859735608101,
"ce_loss_52": 1.474419781565666,
"ce_loss_7": 3.2971576511859895,
"epoch": 0.363,
"grad_norm": 17.141982755892812,
"kl_loss_13": 3009.6,
"kl_loss_26": 1696.8,
"kl_loss_39": 1241.1,
"kl_loss_7": 3762.4,
"learning_rate": 0.0007177583889711762,
"loss": 4882.15,
"step": 3630
},
{
"ce_loss_13": 2.902718555927277,
"ce_loss_26": 2.260812908411026,
"ce_loss_39": 2.042543429136276,
"ce_loss_52": 1.4226751655340195,
"ce_loss_7": 3.2698469936847685,
"epoch": 0.364,
"grad_norm": 17.153862070969048,
"kl_loss_13": 3018.8,
"kl_loss_26": 1673.0,
"kl_loss_39": 1219.0,
"kl_loss_7": 3784.4,
"learning_rate": 0.0007163290117232541,
"loss": 4884.0,
"step": 3640
},
{
"ce_loss_13": 2.9109850347042086,
"ce_loss_26": 2.297417125105858,
"ce_loss_39": 2.077428176999092,
"ce_loss_52": 1.4550551682710648,
"ce_loss_7": 3.268283462524414,
"epoch": 0.365,
"grad_norm": 16.42744245211514,
"kl_loss_13": 2985.2,
"kl_loss_26": 1679.2,
"kl_loss_39": 1227.9,
"kl_loss_7": 3734.0,
"learning_rate": 0.0007148974560445859,
"loss": 4868.65,
"step": 3650
},
{
"ce_loss_13": 2.9199238896369932,
"ce_loss_26": 2.2848848432302473,
"ce_loss_39": 2.060741201043129,
"ce_loss_52": 1.4278603106737138,
"ce_loss_7": 3.2834209561347962,
"epoch": 0.366,
"grad_norm": 16.404556741779928,
"kl_loss_13": 3024.0,
"kl_loss_26": 1686.2,
"kl_loss_39": 1230.9,
"kl_loss_7": 3786.0,
"learning_rate": 0.0007134637363509209,
"loss": 4839.5,
"step": 3660
},
{
"ce_loss_13": 2.9712482690811157,
"ce_loss_26": 2.3368860691785813,
"ce_loss_39": 2.104892411828041,
"ce_loss_52": 1.4633448541164398,
"ce_loss_7": 3.332030898332596,
"epoch": 0.367,
"grad_norm": 15.961228476827497,
"kl_loss_13": 3092.4,
"kl_loss_26": 1760.2,
"kl_loss_39": 1277.5,
"kl_loss_7": 3848.0,
"learning_rate": 0.0007120278670798009,
"loss": 4858.55,
"step": 3670
},
{
"ce_loss_13": 2.951517391204834,
"ce_loss_26": 2.3281659215688704,
"ce_loss_39": 2.0995417445898057,
"ce_loss_52": 1.4656393617391585,
"ce_loss_7": 3.2964209616184235,
"epoch": 0.368,
"grad_norm": 16.089022609349872,
"kl_loss_13": 3003.6,
"kl_loss_26": 1696.8,
"kl_loss_39": 1232.8,
"kl_loss_7": 3745.6,
"learning_rate": 0.0007105898626904133,
"loss": 4774.9,
"step": 3680
},
{
"ce_loss_13": 2.870139628648758,
"ce_loss_26": 2.2511734038591387,
"ce_loss_39": 2.0341389745473863,
"ce_loss_52": 1.4250996381044387,
"ce_loss_7": 3.2268544733524323,
"epoch": 0.369,
"grad_norm": 15.247673028968622,
"kl_loss_13": 2967.2,
"kl_loss_26": 1653.8,
"kl_loss_39": 1205.5,
"kl_loss_7": 3723.2,
"learning_rate": 0.0007091497376634463,
"loss": 4807.45,
"step": 3690
},
{
"ce_loss_13": 2.8762976706027983,
"ce_loss_26": 2.256538024544716,
"ce_loss_39": 2.043423393368721,
"ce_loss_52": 1.4497251689434052,
"ce_loss_7": 3.2377980053424835,
"epoch": 0.37,
"grad_norm": 16.15904103093409,
"kl_loss_13": 2914.4,
"kl_loss_26": 1609.7,
"kl_loss_39": 1170.3,
"kl_loss_7": 3672.0,
"learning_rate": 0.0007077075065009433,
"loss": 4822.75,
"step": 3700
},
{
"ce_loss_13": 2.865807980298996,
"ce_loss_26": 2.2327334135770798,
"ce_loss_39": 2.012790763378143,
"ce_loss_52": 1.4004584282636643,
"ce_loss_7": 3.233772474527359,
"epoch": 0.371,
"grad_norm": 15.511174434634698,
"kl_loss_13": 2980.0,
"kl_loss_26": 1666.4,
"kl_loss_39": 1214.3,
"kl_loss_7": 3742.4,
"learning_rate": 0.0007062631837261557,
"loss": 4816.1,
"step": 3710
},
{
"ce_loss_13": 2.903226691484451,
"ce_loss_26": 2.2818103432655334,
"ce_loss_39": 2.059009611606598,
"ce_loss_52": 1.456637406349182,
"ce_loss_7": 3.263377320766449,
"epoch": 0.372,
"grad_norm": 17.120548608123716,
"kl_loss_13": 2952.8,
"kl_loss_26": 1642.0,
"kl_loss_39": 1187.9,
"kl_loss_7": 3710.8,
"learning_rate": 0.0007048167838833977,
"loss": 4745.55,
"step": 3720
},
{
"ce_loss_13": 2.900358548760414,
"ce_loss_26": 2.2638369113206864,
"ce_loss_39": 2.043374678492546,
"ce_loss_52": 1.4358570337295533,
"ce_loss_7": 3.272378832101822,
"epoch": 0.373,
"grad_norm": 15.762139849070088,
"kl_loss_13": 2995.6,
"kl_loss_26": 1646.4,
"kl_loss_39": 1202.7,
"kl_loss_7": 3778.4,
"learning_rate": 0.0007033683215379002,
"loss": 4819.05,
"step": 3730
},
{
"ce_loss_13": 2.891742479801178,
"ce_loss_26": 2.2577997177839277,
"ce_loss_39": 2.042544272542,
"ce_loss_52": 1.4357560023665428,
"ce_loss_7": 3.2664382100105285,
"epoch": 0.374,
"grad_norm": 17.991228767593586,
"kl_loss_13": 3005.6,
"kl_loss_26": 1661.4,
"kl_loss_39": 1210.7,
"kl_loss_7": 3790.4,
"learning_rate": 0.0007019178112756625,
"loss": 4801.4,
"step": 3740
},
{
"ce_loss_13": 2.937167102098465,
"ce_loss_26": 2.3048900216817856,
"ce_loss_39": 2.077365005016327,
"ce_loss_52": 1.4518427148461341,
"ce_loss_7": 3.2986050605773927,
"epoch": 0.375,
"grad_norm": 17.06397612135392,
"kl_loss_13": 3048.4,
"kl_loss_26": 1714.2,
"kl_loss_39": 1240.0,
"kl_loss_7": 3808.4,
"learning_rate": 0.0007004652677033068,
"loss": 4778.45,
"step": 3750
},
{
"ce_loss_13": 2.953932785987854,
"ce_loss_26": 2.3320761770009995,
"ce_loss_39": 2.1045148581266404,
"ce_loss_52": 1.472703790664673,
"ce_loss_7": 3.3274633824825286,
"epoch": 0.376,
"grad_norm": 16.845736377094994,
"kl_loss_13": 3032.0,
"kl_loss_26": 1703.8,
"kl_loss_39": 1244.7,
"kl_loss_7": 3816.0,
"learning_rate": 0.0006990107054479312,
"loss": 4794.6,
"step": 3760
},
{
"ce_loss_13": 2.8548416674137114,
"ce_loss_26": 2.240122190117836,
"ce_loss_39": 2.0189033895730972,
"ce_loss_52": 1.4262803480029107,
"ce_loss_7": 3.208429366350174,
"epoch": 0.377,
"grad_norm": 16.84130111884451,
"kl_loss_13": 2924.4,
"kl_loss_26": 1609.6,
"kl_loss_39": 1161.2,
"kl_loss_7": 3672.0,
"learning_rate": 0.000697554139156961,
"loss": 4779.2,
"step": 3770
},
{
"ce_loss_13": 2.972896063327789,
"ce_loss_26": 2.335559439659119,
"ce_loss_39": 2.111876127123833,
"ce_loss_52": 1.4984043270349503,
"ce_loss_7": 3.330926328897476,
"epoch": 0.378,
"grad_norm": 17.969038221722915,
"kl_loss_13": 3002.8,
"kl_loss_26": 1674.0,
"kl_loss_39": 1211.2,
"kl_loss_7": 3762.0,
"learning_rate": 0.0006960955834980027,
"loss": 4732.4,
"step": 3780
},
{
"ce_loss_13": 2.863754612207413,
"ce_loss_26": 2.228693225979805,
"ce_loss_39": 2.0101536750793456,
"ce_loss_52": 1.4073660969734192,
"ce_loss_7": 3.2303711056709288,
"epoch": 0.379,
"grad_norm": 15.796823584167846,
"kl_loss_13": 2960.8,
"kl_loss_26": 1639.0,
"kl_loss_39": 1188.6,
"kl_loss_7": 3734.4,
"learning_rate": 0.0006946350531586958,
"loss": 4740.55,
"step": 3790
},
{
"ce_loss_13": 2.819410902261734,
"ce_loss_26": 2.200511318445206,
"ce_loss_39": 1.9842332571744918,
"ce_loss_52": 1.400177489221096,
"ce_loss_7": 3.1923243761062623,
"epoch": 0.38,
"grad_norm": 17.863959287343352,
"kl_loss_13": 2930.0,
"kl_loss_26": 1613.6,
"kl_loss_39": 1162.1,
"kl_loss_7": 3705.2,
"learning_rate": 0.0006931725628465643,
"loss": 4745.35,
"step": 3800
},
{
"ce_loss_13": 2.845439475774765,
"ce_loss_26": 2.2171025544404985,
"ce_loss_39": 1.9986167669296264,
"ce_loss_52": 1.4112813830375672,
"ce_loss_7": 3.2001422882080077,
"epoch": 0.381,
"grad_norm": 15.509448386002845,
"kl_loss_13": 2924.0,
"kl_loss_26": 1603.8,
"kl_loss_39": 1151.4,
"kl_loss_7": 3677.6,
"learning_rate": 0.0006917081272888696,
"loss": 4686.25,
"step": 3810
},
{
"ce_loss_13": 2.875427797436714,
"ce_loss_26": 2.2557172268629073,
"ce_loss_39": 2.0311311304569246,
"ce_loss_52": 1.4279655352234841,
"ce_loss_7": 3.230677658319473,
"epoch": 0.382,
"grad_norm": 17.274488302565285,
"kl_loss_13": 2934.0,
"kl_loss_26": 1621.0,
"kl_loss_39": 1159.3,
"kl_loss_7": 3683.6,
"learning_rate": 0.0006902417612324615,
"loss": 4684.7,
"step": 3820
},
{
"ce_loss_13": 2.9117272198200226,
"ce_loss_26": 2.261174875497818,
"ce_loss_39": 2.036722195148468,
"ce_loss_52": 1.4152167439460754,
"ce_loss_7": 3.282198351621628,
"epoch": 0.383,
"grad_norm": 17.87083708364157,
"kl_loss_13": 3095.2,
"kl_loss_26": 1720.4,
"kl_loss_39": 1253.4,
"kl_loss_7": 3865.2,
"learning_rate": 0.00068877347944363,
"loss": 4739.15,
"step": 3830
},
{
"ce_loss_13": 2.8889047384262083,
"ce_loss_26": 2.2653014570474626,
"ce_loss_39": 2.0420874893665313,
"ce_loss_52": 1.4475852727890015,
"ce_loss_7": 3.253549599647522,
"epoch": 0.384,
"grad_norm": 15.6987701916489,
"kl_loss_13": 2966.0,
"kl_loss_26": 1638.2,
"kl_loss_39": 1187.2,
"kl_loss_7": 3729.2,
"learning_rate": 0.0006873032967079561,
"loss": 4730.9,
"step": 3840
},
{
"ce_loss_13": 2.9057071805000305,
"ce_loss_26": 2.2790849953889847,
"ce_loss_39": 2.0592786610126494,
"ce_loss_52": 1.452454286813736,
"ce_loss_7": 3.266382873058319,
"epoch": 0.385,
"grad_norm": 15.755925332297407,
"kl_loss_13": 2962.0,
"kl_loss_26": 1636.4,
"kl_loss_39": 1179.7,
"kl_loss_7": 3722.8,
"learning_rate": 0.0006858312278301637,
"loss": 4713.7,
"step": 3850
},
{
"ce_loss_13": 2.8342252016067504,
"ce_loss_26": 2.2319850236177445,
"ce_loss_39": 2.022706937789917,
"ce_loss_52": 1.4418139278888702,
"ce_loss_7": 3.186972415447235,
"epoch": 0.386,
"grad_norm": 17.081089442059948,
"kl_loss_13": 2855.2,
"kl_loss_26": 1568.0,
"kl_loss_39": 1131.2,
"kl_loss_7": 3603.2,
"learning_rate": 0.0006843572876339704,
"loss": 4675.25,
"step": 3860
},
{
"ce_loss_13": 2.7886572241783143,
"ce_loss_26": 2.173486915230751,
"ce_loss_39": 1.9662895441055297,
"ce_loss_52": 1.3961340665817261,
"ce_loss_7": 3.1484048068523407,
"epoch": 0.387,
"grad_norm": 18.57744828916969,
"kl_loss_13": 2842.0,
"kl_loss_26": 1551.8,
"kl_loss_39": 1125.9,
"kl_loss_7": 3587.2,
"learning_rate": 0.0006828814909619373,
"loss": 4659.8,
"step": 3870
},
{
"ce_loss_13": 2.84233677983284,
"ce_loss_26": 2.2270043969154356,
"ce_loss_39": 2.011353349685669,
"ce_loss_52": 1.44394671022892,
"ce_loss_7": 3.189998263120651,
"epoch": 0.388,
"grad_norm": 17.116859396660736,
"kl_loss_13": 2866.4,
"kl_loss_26": 1581.4,
"kl_loss_39": 1130.5,
"kl_loss_7": 3602.4,
"learning_rate": 0.0006814038526753205,
"loss": 4652.3,
"step": 3880
},
{
"ce_loss_13": 2.8899350225925446,
"ce_loss_26": 2.268605652451515,
"ce_loss_39": 2.047902289032936,
"ce_loss_52": 1.462986382842064,
"ce_loss_7": 3.2532753586769103,
"epoch": 0.389,
"grad_norm": 16.277065053757138,
"kl_loss_13": 2901.6,
"kl_loss_26": 1603.8,
"kl_loss_39": 1148.8,
"kl_loss_7": 3655.2,
"learning_rate": 0.0006799243876539213,
"loss": 4644.45,
"step": 3890
},
{
"ce_loss_13": 2.852635699510574,
"ce_loss_26": 2.225254198908806,
"ce_loss_39": 2.00534345805645,
"ce_loss_52": 1.420480152964592,
"ce_loss_7": 3.217593324184418,
"epoch": 0.39,
"grad_norm": 17.575618857452827,
"kl_loss_13": 2895.2,
"kl_loss_26": 1582.6,
"kl_loss_39": 1134.8,
"kl_loss_7": 3662.8,
"learning_rate": 0.0006784431107959359,
"loss": 4640.8,
"step": 3900
},
{
"ce_loss_13": 2.9095449209213258,
"ce_loss_26": 2.288859358429909,
"ce_loss_39": 2.069254148006439,
"ce_loss_52": 1.4762457937002182,
"ce_loss_7": 3.2724156618118285,
"epoch": 0.391,
"grad_norm": 15.314925266098216,
"kl_loss_13": 2939.6,
"kl_loss_26": 1620.2,
"kl_loss_39": 1162.8,
"kl_loss_7": 3702.8,
"learning_rate": 0.0006769600370178059,
"loss": 4625.75,
"step": 3910
},
{
"ce_loss_13": 2.79736613035202,
"ce_loss_26": 2.1872033685445786,
"ce_loss_39": 1.9660126984119415,
"ce_loss_52": 1.3993165016174316,
"ce_loss_7": 3.152447110414505,
"epoch": 0.392,
"grad_norm": 15.234701615575748,
"kl_loss_13": 2856.0,
"kl_loss_26": 1574.6,
"kl_loss_39": 1119.8,
"kl_loss_7": 3607.6,
"learning_rate": 0.0006754751812540679,
"loss": 4587.85,
"step": 3920
},
{
"ce_loss_13": 2.8410171031951905,
"ce_loss_26": 2.2249913841485975,
"ce_loss_39": 2.0135372936725617,
"ce_loss_52": 1.4371111243963242,
"ce_loss_7": 3.2084967494010925,
"epoch": 0.393,
"grad_norm": 16.62173105303993,
"kl_loss_13": 2885.6,
"kl_loss_26": 1588.2,
"kl_loss_39": 1146.8,
"kl_loss_7": 3644.4,
"learning_rate": 0.0006739885584572025,
"loss": 4635.2,
"step": 3930
},
{
"ce_loss_13": 2.7806951224803926,
"ce_loss_26": 2.1756977647542954,
"ce_loss_39": 1.96949442923069,
"ce_loss_52": 1.415724617242813,
"ce_loss_7": 3.1287400901317595,
"epoch": 0.394,
"grad_norm": 15.878619218635833,
"kl_loss_13": 2836.2,
"kl_loss_26": 1541.8,
"kl_loss_39": 1104.9,
"kl_loss_7": 3581.6,
"learning_rate": 0.0006725001835974853,
"loss": 4637.75,
"step": 3940
},
{
"ce_loss_13": 2.85609056353569,
"ce_loss_26": 2.228466436266899,
"ce_loss_39": 2.011217701435089,
"ce_loss_52": 1.4336451053619386,
"ce_loss_7": 3.212037581205368,
"epoch": 0.395,
"grad_norm": 15.588059225669095,
"kl_loss_13": 2892.8,
"kl_loss_26": 1574.8,
"kl_loss_39": 1125.7,
"kl_loss_7": 3657.6,
"learning_rate": 0.0006710100716628344,
"loss": 4584.95,
"step": 3950
},
{
"ce_loss_13": 2.820618736743927,
"ce_loss_26": 2.1797895193099976,
"ce_loss_39": 1.9612275928258895,
"ce_loss_52": 1.3932116001844406,
"ce_loss_7": 3.1924599528312685,
"epoch": 0.396,
"grad_norm": 14.878251588185849,
"kl_loss_13": 2911.2,
"kl_loss_26": 1556.2,
"kl_loss_39": 1114.5,
"kl_loss_7": 3694.0,
"learning_rate": 0.0006695182376586602,
"loss": 4607.1,
"step": 3960
},
{
"ce_loss_13": 2.7754017412662506,
"ce_loss_26": 2.1572470903396606,
"ce_loss_39": 1.9344938546419144,
"ce_loss_52": 1.3711352616548538,
"ce_loss_7": 3.1346897959709166,
"epoch": 0.397,
"grad_norm": 15.39943522658609,
"kl_loss_13": 2875.2,
"kl_loss_26": 1575.1,
"kl_loss_39": 1124.5,
"kl_loss_7": 3635.6,
"learning_rate": 0.000668024696607715,
"loss": 4546.3,
"step": 3970
},
{
"ce_loss_13": 2.7410697996616364,
"ce_loss_26": 2.1528750866651536,
"ce_loss_39": 1.944345197081566,
"ce_loss_52": 1.4029324680566788,
"ce_loss_7": 3.0945769369602205,
"epoch": 0.398,
"grad_norm": 16.69493947597699,
"kl_loss_13": 2742.0,
"kl_loss_26": 1499.6,
"kl_loss_39": 1066.1,
"kl_loss_7": 3478.0,
"learning_rate": 0.0006665294635499404,
"loss": 4509.25,
"step": 3980
},
{
"ce_loss_13": 2.7935349524021147,
"ce_loss_26": 2.191756248474121,
"ce_loss_39": 1.9830526530742645,
"ce_loss_52": 1.4325652569532394,
"ce_loss_7": 3.150054842233658,
"epoch": 0.399,
"grad_norm": 15.984763021073704,
"kl_loss_13": 2764.0,
"kl_loss_26": 1503.8,
"kl_loss_39": 1075.4,
"kl_loss_7": 3508.8,
"learning_rate": 0.0006650325535423167,
"loss": 4542.85,
"step": 3990
},
{
"ce_loss_13": 2.7841295659542085,
"ce_loss_26": 2.175816202163696,
"ce_loss_39": 1.9610484838485718,
"ce_loss_52": 1.3994766443967819,
"ce_loss_7": 3.1450257122516634,
"epoch": 0.4,
"grad_norm": 16.383690879711693,
"kl_loss_13": 2832.8,
"kl_loss_26": 1534.6,
"kl_loss_39": 1101.1,
"kl_loss_7": 3587.2,
"learning_rate": 0.0006635339816587109,
"loss": 4584.95,
"step": 4000
},
{
"ce_loss_13": 2.937473142147064,
"ce_loss_26": 2.298046553134918,
"ce_loss_39": 2.071186339855194,
"ce_loss_52": 1.4680579513311387,
"ce_loss_7": 3.2991883754730225,
"epoch": 0.401,
"grad_norm": 16.69896458470603,
"kl_loss_13": 2974.0,
"kl_loss_26": 1650.8,
"kl_loss_39": 1187.0,
"kl_loss_7": 3734.8,
"learning_rate": 0.0006620337629897252,
"loss": 4574.8,
"step": 4010
},
{
"ce_loss_13": 2.803048574924469,
"ce_loss_26": 2.1910858035087584,
"ce_loss_39": 1.977920189499855,
"ce_loss_52": 1.4274337738752365,
"ce_loss_7": 3.1627039849758147,
"epoch": 0.402,
"grad_norm": 15.21058574655926,
"kl_loss_13": 2803.0,
"kl_loss_26": 1508.9,
"kl_loss_39": 1074.1,
"kl_loss_7": 3558.8,
"learning_rate": 0.0006605319126425454,
"loss": 4546.4,
"step": 4020
},
{
"ce_loss_13": 2.8307320177555084,
"ce_loss_26": 2.208324944972992,
"ce_loss_39": 1.9950761079788208,
"ce_loss_52": 1.435056920349598,
"ce_loss_7": 3.19031218290329,
"epoch": 0.403,
"grad_norm": 14.837343102998657,
"kl_loss_13": 2876.0,
"kl_loss_26": 1550.3,
"kl_loss_39": 1112.9,
"kl_loss_7": 3638.4,
"learning_rate": 0.0006590284457407876,
"loss": 4535.35,
"step": 4030
},
{
"ce_loss_13": 2.8277206301689146,
"ce_loss_26": 2.2229607343673705,
"ce_loss_39": 2.0126491367816923,
"ce_loss_52": 1.465662133693695,
"ce_loss_7": 3.178615337610245,
"epoch": 0.404,
"grad_norm": 15.868817769840305,
"kl_loss_13": 2801.6,
"kl_loss_26": 1514.6,
"kl_loss_39": 1078.5,
"kl_loss_7": 3548.0,
"learning_rate": 0.0006575233774243465,
"loss": 4524.1,
"step": 4040
},
{
"ce_loss_13": 2.741392558813095,
"ce_loss_26": 2.1182916700839995,
"ce_loss_39": 1.9061576217412948,
"ce_loss_52": 1.3709532082080842,
"ce_loss_7": 3.1065491139888763,
"epoch": 0.405,
"grad_norm": 16.502947013390255,
"kl_loss_13": 2798.4,
"kl_loss_26": 1495.2,
"kl_loss_39": 1058.1,
"kl_loss_7": 3565.6,
"learning_rate": 0.0006560167228492435,
"loss": 4528.6,
"step": 4050
},
{
"ce_loss_13": 2.8996002614498138,
"ce_loss_26": 2.271700030565262,
"ce_loss_39": 2.045673191547394,
"ce_loss_52": 1.4674718797206878,
"ce_loss_7": 3.2622067093849183,
"epoch": 0.406,
"grad_norm": 15.215707475527795,
"kl_loss_13": 2900.0,
"kl_loss_26": 1589.6,
"kl_loss_39": 1131.4,
"kl_loss_7": 3660.8,
"learning_rate": 0.0006545084971874737,
"loss": 4547.15,
"step": 4060
},
{
"ce_loss_13": 2.8251163959503174,
"ce_loss_26": 2.1874846637248995,
"ce_loss_39": 1.9672167718410491,
"ce_loss_52": 1.4135777831077576,
"ce_loss_7": 3.1873776078224183,
"epoch": 0.407,
"grad_norm": 15.755939255613459,
"kl_loss_13": 2866.0,
"kl_loss_26": 1547.6,
"kl_loss_39": 1092.9,
"kl_loss_7": 3627.6,
"learning_rate": 0.0006529987156268526,
"loss": 4503.1,
"step": 4070
},
{
"ce_loss_13": 2.7349390149116517,
"ce_loss_26": 2.1141091108322145,
"ce_loss_39": 1.909931591153145,
"ce_loss_52": 1.3686757802963256,
"ce_loss_7": 3.0966077923774717,
"epoch": 0.408,
"grad_norm": 15.787212276524022,
"kl_loss_13": 2801.6,
"kl_loss_26": 1509.4,
"kl_loss_39": 1071.0,
"kl_loss_7": 3562.8,
"learning_rate": 0.0006514873933708637,
"loss": 4534.05,
"step": 4080
},
{
"ce_loss_13": 2.742733418941498,
"ce_loss_26": 2.1391125679016114,
"ce_loss_39": 1.9272442519664765,
"ce_loss_52": 1.387654460966587,
"ce_loss_7": 3.0977914452552797,
"epoch": 0.409,
"grad_norm": 15.727797591546214,
"kl_loss_13": 2755.6,
"kl_loss_26": 1488.4,
"kl_loss_39": 1050.3,
"kl_loss_7": 3508.0,
"learning_rate": 0.0006499745456385053,
"loss": 4444.65,
"step": 4090
},
{
"ce_loss_13": 2.7960755199193956,
"ce_loss_26": 2.184322661161423,
"ce_loss_39": 1.9677571415901185,
"ce_loss_52": 1.4271342948079109,
"ce_loss_7": 3.1514409124851226,
"epoch": 0.41,
"grad_norm": 15.52426613691677,
"kl_loss_13": 2809.8,
"kl_loss_26": 1518.3,
"kl_loss_39": 1075.2,
"kl_loss_7": 3551.6,
"learning_rate": 0.0006484601876641375,
"loss": 4500.65,
"step": 4100
},
{
"ce_loss_13": 2.8776713728904726,
"ce_loss_26": 2.257500499486923,
"ce_loss_39": 2.0303492128849028,
"ce_loss_52": 1.4582158356904984,
"ce_loss_7": 3.2387999415397646,
"epoch": 0.411,
"grad_norm": 15.93298743678484,
"kl_loss_13": 2878.8,
"kl_loss_26": 1576.0,
"kl_loss_39": 1115.6,
"kl_loss_7": 3640.8,
"learning_rate": 0.000646944334697328,
"loss": 4470.55,
"step": 4110
},
{
"ce_loss_13": 2.802631789445877,
"ce_loss_26": 2.2029493927955626,
"ce_loss_39": 2.001139259338379,
"ce_loss_52": 1.4623139530420304,
"ce_loss_7": 3.155901437997818,
"epoch": 0.412,
"grad_norm": 14.691054390726734,
"kl_loss_13": 2720.8,
"kl_loss_26": 1465.2,
"kl_loss_39": 1041.4,
"kl_loss_7": 3462.4,
"learning_rate": 0.0006454270020026995,
"loss": 4502.65,
"step": 4120
},
{
"ce_loss_13": 2.8162184596061706,
"ce_loss_26": 2.1934009909629824,
"ce_loss_39": 1.979950374364853,
"ce_loss_52": 1.4344559267163277,
"ce_loss_7": 3.1758966505527497,
"epoch": 0.413,
"grad_norm": 16.25780643806628,
"kl_loss_13": 2816.0,
"kl_loss_26": 1518.6,
"kl_loss_39": 1077.6,
"kl_loss_7": 3573.6,
"learning_rate": 0.0006439082048597755,
"loss": 4487.45,
"step": 4130
},
{
"ce_loss_13": 2.787912631034851,
"ce_loss_26": 2.1966257959604265,
"ce_loss_39": 1.9914580851793289,
"ce_loss_52": 1.4511510521173476,
"ce_loss_7": 3.1392914772033693,
"epoch": 0.414,
"grad_norm": 17.37704963704925,
"kl_loss_13": 2734.8,
"kl_loss_26": 1487.2,
"kl_loss_39": 1057.4,
"kl_loss_7": 3474.0,
"learning_rate": 0.0006423879585628261,
"loss": 4448.15,
"step": 4140
},
{
"ce_loss_13": 2.817258411645889,
"ce_loss_26": 2.1947576314210893,
"ce_loss_39": 1.9762789696455,
"ce_loss_52": 1.433014589548111,
"ce_loss_7": 3.182687884569168,
"epoch": 0.415,
"grad_norm": 15.35502556975723,
"kl_loss_13": 2826.8,
"kl_loss_26": 1522.0,
"kl_loss_39": 1072.5,
"kl_loss_7": 3595.2,
"learning_rate": 0.0006408662784207149,
"loss": 4433.75,
"step": 4150
},
{
"ce_loss_13": 2.817685341835022,
"ce_loss_26": 2.2071537256240843,
"ce_loss_39": 1.9907894372940063,
"ce_loss_52": 1.4230278193950654,
"ce_loss_7": 3.1795800507068632,
"epoch": 0.416,
"grad_norm": 15.573867614749913,
"kl_loss_13": 2866.0,
"kl_loss_26": 1558.6,
"kl_loss_39": 1107.2,
"kl_loss_7": 3632.0,
"learning_rate": 0.0006393431797567439,
"loss": 4436.3,
"step": 4160
},
{
"ce_loss_13": 2.819452613592148,
"ce_loss_26": 2.213544499874115,
"ce_loss_39": 1.9934939831495284,
"ce_loss_52": 1.4420817136764525,
"ce_loss_7": 3.1729123532772063,
"epoch": 0.417,
"grad_norm": 15.840337845359416,
"kl_loss_13": 2809.4,
"kl_loss_26": 1533.4,
"kl_loss_39": 1076.0,
"kl_loss_7": 3544.0,
"learning_rate": 0.0006378186779084996,
"loss": 4429.6,
"step": 4170
},
{
"ce_loss_13": 2.797993552684784,
"ce_loss_26": 2.2015393495559694,
"ce_loss_39": 1.986987265944481,
"ce_loss_52": 1.446770191192627,
"ce_loss_7": 3.145763796567917,
"epoch": 0.418,
"grad_norm": 16.258575254109445,
"kl_loss_13": 2768.4,
"kl_loss_26": 1520.4,
"kl_loss_39": 1076.5,
"kl_loss_7": 3502.0,
"learning_rate": 0.0006362927882276989,
"loss": 4452.8,
"step": 4180
},
{
"ce_loss_13": 2.809996685385704,
"ce_loss_26": 2.1883741706609725,
"ce_loss_39": 1.972084966301918,
"ce_loss_52": 1.4272316336631774,
"ce_loss_7": 3.1641923069953917,
"epoch": 0.419,
"grad_norm": 17.021132117568744,
"kl_loss_13": 2806.8,
"kl_loss_26": 1522.7,
"kl_loss_39": 1076.0,
"kl_loss_7": 3556.4,
"learning_rate": 0.000634765526080034,
"loss": 4434.25,
"step": 4190
},
{
"ce_loss_13": 2.7747348487377166,
"ce_loss_26": 2.1618224531412125,
"ce_loss_39": 1.9505164802074433,
"ce_loss_52": 1.4064817115664483,
"ce_loss_7": 3.1292604207992554,
"epoch": 0.42,
"grad_norm": 15.556302486325128,
"kl_loss_13": 2777.6,
"kl_loss_26": 1495.4,
"kl_loss_39": 1055.2,
"kl_loss_7": 3523.2,
"learning_rate": 0.0006332369068450174,
"loss": 4413.55,
"step": 4200
},
{
"ce_loss_13": 2.748269832134247,
"ce_loss_26": 2.145698443055153,
"ce_loss_39": 1.935601145029068,
"ce_loss_52": 1.4105115324258803,
"ce_loss_7": 3.1001435458660125,
"epoch": 0.421,
"grad_norm": 15.348610438295403,
"kl_loss_13": 2742.8,
"kl_loss_26": 1480.0,
"kl_loss_39": 1039.8,
"kl_loss_7": 3490.0,
"learning_rate": 0.0006317069459158283,
"loss": 4363.8,
"step": 4210
},
{
"ce_loss_13": 2.7747100263834,
"ce_loss_26": 2.16818388402462,
"ce_loss_39": 1.9505507349967957,
"ce_loss_52": 1.4193186193704606,
"ce_loss_7": 3.136371600627899,
"epoch": 0.422,
"grad_norm": 16.358740351868324,
"kl_loss_13": 2764.6,
"kl_loss_26": 1481.3,
"kl_loss_39": 1040.1,
"kl_loss_7": 3516.8,
"learning_rate": 0.0006301756586991561,
"loss": 4421.65,
"step": 4220
},
{
"ce_loss_13": 2.8185549050569536,
"ce_loss_26": 2.226038011908531,
"ce_loss_39": 2.013939729332924,
"ce_loss_52": 1.4788149103522301,
"ce_loss_7": 3.1706756830215452,
"epoch": 0.423,
"grad_norm": 14.82164626530813,
"kl_loss_13": 2758.0,
"kl_loss_26": 1495.6,
"kl_loss_39": 1059.3,
"kl_loss_7": 3503.2,
"learning_rate": 0.0006286430606150459,
"loss": 4398.35,
"step": 4230
},
{
"ce_loss_13": 2.7891676902770994,
"ce_loss_26": 2.1986444026231764,
"ce_loss_39": 1.9819349884986877,
"ce_loss_52": 1.4562569051980971,
"ce_loss_7": 3.1411671698093415,
"epoch": 0.424,
"grad_norm": 15.535941880253773,
"kl_loss_13": 2717.2,
"kl_loss_26": 1468.6,
"kl_loss_39": 1020.2,
"kl_loss_7": 3457.2,
"learning_rate": 0.0006271091670967436,
"loss": 4370.45,
"step": 4240
},
{
"ce_loss_13": 2.8151471495628355,
"ce_loss_26": 2.204052150249481,
"ce_loss_39": 1.9960095703601837,
"ce_loss_52": 1.45780867934227,
"ce_loss_7": 3.1626071453094484,
"epoch": 0.425,
"grad_norm": 16.39177349451075,
"kl_loss_13": 2749.2,
"kl_loss_26": 1471.2,
"kl_loss_39": 1041.9,
"kl_loss_7": 3492.8,
"learning_rate": 0.0006255739935905395,
"loss": 4354.95,
"step": 4250
},
{
"ce_loss_13": 2.7719932794570923,
"ce_loss_26": 2.1723096281290055,
"ce_loss_39": 1.9554951965808869,
"ce_loss_52": 1.4198345810174942,
"ce_loss_7": 3.134742945432663,
"epoch": 0.426,
"grad_norm": 17.215386749382045,
"kl_loss_13": 2775.6,
"kl_loss_26": 1506.6,
"kl_loss_39": 1055.7,
"kl_loss_7": 3532.8,
"learning_rate": 0.0006240375555556145,
"loss": 4360.8,
"step": 4260
},
{
"ce_loss_13": 2.7217872977256774,
"ce_loss_26": 2.1173421651124955,
"ce_loss_39": 1.9085008651018143,
"ce_loss_52": 1.400168927013874,
"ce_loss_7": 3.0799288749694824,
"epoch": 0.427,
"grad_norm": 15.867423276307166,
"kl_loss_13": 2701.0,
"kl_loss_26": 1432.6,
"kl_loss_39": 996.7,
"kl_loss_7": 3452.8,
"learning_rate": 0.000622499868463882,
"loss": 4320.5,
"step": 4270
},
{
"ce_loss_13": 2.7815617978572846,
"ce_loss_26": 2.1786680042743685,
"ce_loss_39": 1.9648784220218658,
"ce_loss_52": 1.4438522070646287,
"ce_loss_7": 3.1414669275283815,
"epoch": 0.428,
"grad_norm": 16.86028992899928,
"kl_loss_13": 2733.2,
"kl_loss_26": 1463.0,
"kl_loss_39": 1028.0,
"kl_loss_7": 3484.8,
"learning_rate": 0.0006209609477998338,
"loss": 4348.9,
"step": 4280
},
{
"ce_loss_13": 2.8184913277626036,
"ce_loss_26": 2.213253751397133,
"ce_loss_39": 1.986603057384491,
"ce_loss_52": 1.4555893182754516,
"ce_loss_7": 3.1685641705989838,
"epoch": 0.429,
"grad_norm": 15.40477364702056,
"kl_loss_13": 2779.6,
"kl_loss_26": 1503.0,
"kl_loss_39": 1049.7,
"kl_loss_7": 3514.4,
"learning_rate": 0.0006194208090603844,
"loss": 4374.7,
"step": 4290
},
{
"ce_loss_13": 2.726405268907547,
"ce_loss_26": 2.1394855052232744,
"ce_loss_39": 1.9364097625017167,
"ce_loss_52": 1.4365313708782197,
"ce_loss_7": 3.0755336761474608,
"epoch": 0.43,
"grad_norm": 14.784393649721942,
"kl_loss_13": 2680.0,
"kl_loss_26": 1434.2,
"kl_loss_39": 1002.6,
"kl_loss_7": 3415.2,
"learning_rate": 0.0006178794677547138,
"loss": 4325.15,
"step": 4300
},
{
"ce_loss_13": 2.78907487988472,
"ce_loss_26": 2.1874548703432084,
"ce_loss_39": 1.9674001038074493,
"ce_loss_52": 1.4388054758310318,
"ce_loss_7": 3.1580194234848022,
"epoch": 0.431,
"grad_norm": 15.540150658114959,
"kl_loss_13": 2772.4,
"kl_loss_26": 1489.8,
"kl_loss_39": 1036.1,
"kl_loss_7": 3534.8,
"learning_rate": 0.0006163369394041111,
"loss": 4337.1,
"step": 4310
},
{
"ce_loss_13": 2.7502326130867005,
"ce_loss_26": 2.1552721470594407,
"ce_loss_39": 1.9502787470817566,
"ce_loss_52": 1.4348126232624054,
"ce_loss_7": 3.1085386633872987,
"epoch": 0.432,
"grad_norm": 15.900486211327715,
"kl_loss_13": 2709.8,
"kl_loss_26": 1438.4,
"kl_loss_39": 1010.4,
"kl_loss_7": 3455.6,
"learning_rate": 0.0006147932395418205,
"loss": 4308.0,
"step": 4320
},
{
"ce_loss_13": 2.7637496650218965,
"ce_loss_26": 2.1625583559274673,
"ce_loss_39": 1.947121372818947,
"ce_loss_52": 1.4198297888040543,
"ce_loss_7": 3.1241161942481996,
"epoch": 0.433,
"grad_norm": 16.260371827994177,
"kl_loss_13": 2733.2,
"kl_loss_26": 1467.8,
"kl_loss_39": 1033.8,
"kl_loss_7": 3485.2,
"learning_rate": 0.0006132483837128823,
"loss": 4327.3,
"step": 4330
},
{
"ce_loss_13": 2.780191105604172,
"ce_loss_26": 2.1823483228683473,
"ce_loss_39": 1.9749175161123276,
"ce_loss_52": 1.4566338241100312,
"ce_loss_7": 3.142491656541824,
"epoch": 0.434,
"grad_norm": 16.173065879753995,
"kl_loss_13": 2713.6,
"kl_loss_26": 1446.4,
"kl_loss_39": 1012.3,
"kl_loss_7": 3465.6,
"learning_rate": 0.0006117023874739772,
"loss": 4346.0,
"step": 4340
},
{
"ce_loss_13": 2.756999599933624,
"ce_loss_26": 2.151958614587784,
"ce_loss_39": 1.9352585464715957,
"ce_loss_52": 1.4167816311120986,
"ce_loss_7": 3.1229954183101656,
"epoch": 0.435,
"grad_norm": 16.656646084830363,
"kl_loss_13": 2759.6,
"kl_loss_26": 1478.0,
"kl_loss_39": 1029.0,
"kl_loss_7": 3524.0,
"learning_rate": 0.0006101552663932703,
"loss": 4336.25,
"step": 4350
},
{
"ce_loss_13": 2.774202525615692,
"ce_loss_26": 2.172477602958679,
"ce_loss_39": 1.9620429188013078,
"ce_loss_52": 1.43767509162426,
"ce_loss_7": 3.1362563192844393,
"epoch": 0.436,
"grad_norm": 16.067338284310296,
"kl_loss_13": 2744.4,
"kl_loss_26": 1472.4,
"kl_loss_39": 1033.1,
"kl_loss_7": 3493.6,
"learning_rate": 0.0006086070360502539,
"loss": 4296.35,
"step": 4360
},
{
"ce_loss_13": 2.787889677286148,
"ce_loss_26": 2.208648791909218,
"ce_loss_39": 1.999781733751297,
"ce_loss_52": 1.4855108827352523,
"ce_loss_7": 3.1249564945697785,
"epoch": 0.437,
"grad_norm": 15.78991831926034,
"kl_loss_13": 2690.0,
"kl_loss_26": 1446.0,
"kl_loss_39": 1014.8,
"kl_loss_7": 3419.2,
"learning_rate": 0.0006070577120355903,
"loss": 4280.75,
"step": 4370
},
{
"ce_loss_13": 2.8026595056056975,
"ce_loss_26": 2.207309713959694,
"ce_loss_39": 2.0008264780044556,
"ce_loss_52": 1.4935471057891845,
"ce_loss_7": 3.1493531346321104,
"epoch": 0.438,
"grad_norm": 15.837154081953376,
"kl_loss_13": 2679.6,
"kl_loss_26": 1429.2,
"kl_loss_39": 1001.1,
"kl_loss_7": 3413.2,
"learning_rate": 0.0006055073099509549,
"loss": 4296.35,
"step": 4380
},
{
"ce_loss_13": 2.755897510051727,
"ce_loss_26": 2.1693040400743486,
"ce_loss_39": 1.9623985677957534,
"ce_loss_52": 1.4462745368480683,
"ce_loss_7": 3.1059607326984406,
"epoch": 0.439,
"grad_norm": 15.629443906703631,
"kl_loss_13": 2694.4,
"kl_loss_26": 1446.6,
"kl_loss_39": 1012.9,
"kl_loss_7": 3427.6,
"learning_rate": 0.0006039558454088796,
"loss": 4277.25,
"step": 4390
},
{
"ce_loss_13": 2.7673678040504455,
"ce_loss_26": 2.159538361430168,
"ce_loss_39": 1.9505891352891922,
"ce_loss_52": 1.4304020568728446,
"ce_loss_7": 3.1244628012180327,
"epoch": 0.44,
"grad_norm": 15.403089942991496,
"kl_loss_13": 2740.4,
"kl_loss_26": 1465.4,
"kl_loss_39": 1024.9,
"kl_loss_7": 3482.4,
"learning_rate": 0.0006024033340325954,
"loss": 4300.2,
"step": 4400
},
{
"ce_loss_13": 2.7479640781879424,
"ce_loss_26": 2.1436998754739762,
"ce_loss_39": 1.9348597198724746,
"ce_loss_52": 1.4162754774093629,
"ce_loss_7": 3.1057413816452026,
"epoch": 0.441,
"grad_norm": 16.11204916554698,
"kl_loss_13": 2726.0,
"kl_loss_26": 1457.7,
"kl_loss_39": 1024.6,
"kl_loss_7": 3474.8,
"learning_rate": 0.0006008497914558743,
"loss": 4264.9,
"step": 4410
},
{
"ce_loss_13": 2.781216788291931,
"ce_loss_26": 2.1821627736091616,
"ce_loss_39": 1.9793646305799484,
"ce_loss_52": 1.456499743461609,
"ce_loss_7": 3.1415066480636598,
"epoch": 0.442,
"grad_norm": 15.839943843413481,
"kl_loss_13": 2703.2,
"kl_loss_26": 1453.2,
"kl_loss_39": 1022.0,
"kl_loss_7": 3462.8,
"learning_rate": 0.0005992952333228728,
"loss": 4320.7,
"step": 4420
},
{
"ce_loss_13": 2.6314639270305635,
"ce_loss_26": 2.0378955364227296,
"ce_loss_39": 1.841288161277771,
"ce_loss_52": 1.367735171318054,
"ce_loss_7": 2.983852916955948,
"epoch": 0.443,
"grad_norm": 15.715297314159198,
"kl_loss_13": 2586.8,
"kl_loss_26": 1338.4,
"kl_loss_39": 929.4,
"kl_loss_7": 3328.4,
"learning_rate": 0.0005977396752879741,
"loss": 4224.0,
"step": 4430
},
{
"ce_loss_13": 2.747091996669769,
"ce_loss_26": 2.1425569266080857,
"ce_loss_39": 1.9297908574342728,
"ce_loss_52": 1.4299270451068877,
"ce_loss_7": 3.0992358028888702,
"epoch": 0.444,
"grad_norm": 15.44431585804275,
"kl_loss_13": 2688.8,
"kl_loss_26": 1428.0,
"kl_loss_39": 986.6,
"kl_loss_7": 3434.0,
"learning_rate": 0.0005961831330156305,
"loss": 4224.4,
"step": 4440
},
{
"ce_loss_13": 2.7738942086696623,
"ce_loss_26": 2.1665944904088974,
"ce_loss_39": 1.9503348082304002,
"ce_loss_52": 1.4405199617147446,
"ce_loss_7": 3.131233388185501,
"epoch": 0.445,
"grad_norm": 15.683988530213393,
"kl_loss_13": 2701.2,
"kl_loss_26": 1441.4,
"kl_loss_39": 999.9,
"kl_loss_7": 3451.6,
"learning_rate": 0.0005946256221802051,
"loss": 4233.55,
"step": 4450
},
{
"ce_loss_13": 2.6961427688598634,
"ce_loss_26": 2.1098335653543474,
"ce_loss_39": 1.8978259444236756,
"ce_loss_52": 1.415482410788536,
"ce_loss_7": 3.044248181581497,
"epoch": 0.446,
"grad_norm": 15.256632143150593,
"kl_loss_13": 2612.8,
"kl_loss_26": 1380.2,
"kl_loss_39": 951.7,
"kl_loss_7": 3350.8,
"learning_rate": 0.0005930671584658151,
"loss": 4214.65,
"step": 4460
},
{
"ce_loss_13": 2.732464927434921,
"ce_loss_26": 2.141967472434044,
"ce_loss_39": 1.929695299267769,
"ce_loss_52": 1.4190144926309585,
"ce_loss_7": 3.091973352432251,
"epoch": 0.447,
"grad_norm": 16.38296166656899,
"kl_loss_13": 2676.8,
"kl_loss_26": 1435.6,
"kl_loss_39": 1000.5,
"kl_loss_7": 3428.4,
"learning_rate": 0.0005915077575661722,
"loss": 4280.4,
"step": 4470
},
{
"ce_loss_13": 2.683997756242752,
"ce_loss_26": 2.091261792182922,
"ce_loss_39": 1.8827648997306823,
"ce_loss_52": 1.3911016047000886,
"ce_loss_7": 3.0366858661174776,
"epoch": 0.448,
"grad_norm": 15.184401397346244,
"kl_loss_13": 2642.4,
"kl_loss_26": 1399.3,
"kl_loss_39": 968.5,
"kl_loss_7": 3392.0,
"learning_rate": 0.000589947435184427,
"loss": 4194.3,
"step": 4480
},
{
"ce_loss_13": 2.7240218341350557,
"ce_loss_26": 2.128333044052124,
"ce_loss_39": 1.924179795384407,
"ce_loss_52": 1.4498969972133637,
"ce_loss_7": 3.0761671125888825,
"epoch": 0.449,
"grad_norm": 17.07642900561258,
"kl_loss_13": 2604.8,
"kl_loss_26": 1344.6,
"kl_loss_39": 924.8,
"kl_loss_7": 3354.0,
"learning_rate": 0.0005883862070330078,
"loss": 4206.7,
"step": 4490
},
{
"ce_loss_13": 2.7219722032547,
"ce_loss_26": 2.1390156149864197,
"ce_loss_39": 1.9286952793598175,
"ce_loss_52": 1.4294085174798965,
"ce_loss_7": 3.0803197801113127,
"epoch": 0.45,
"grad_norm": 15.274632326953679,
"kl_loss_13": 2621.6,
"kl_loss_26": 1403.4,
"kl_loss_39": 973.4,
"kl_loss_7": 3369.2,
"learning_rate": 0.0005868240888334653,
"loss": 4211.9,
"step": 4500
},
{
"ce_loss_13": 2.6810890555381777,
"ce_loss_26": 2.105925416946411,
"ce_loss_39": 1.9109346747398377,
"ce_loss_52": 1.4293138086795807,
"ce_loss_7": 3.033176803588867,
"epoch": 0.451,
"grad_norm": 17.03243058089965,
"kl_loss_13": 2608.2,
"kl_loss_26": 1375.3,
"kl_loss_39": 955.3,
"kl_loss_7": 3345.2,
"learning_rate": 0.0005852610963163119,
"loss": 4209.7,
"step": 4510
},
{
"ce_loss_13": 2.689431291818619,
"ce_loss_26": 2.1146853864192963,
"ce_loss_39": 1.9112016946077346,
"ce_loss_52": 1.4318486779928208,
"ce_loss_7": 3.0395568013191223,
"epoch": 0.452,
"grad_norm": 15.510330374157597,
"kl_loss_13": 2583.2,
"kl_loss_26": 1365.8,
"kl_loss_39": 947.8,
"kl_loss_7": 3313.2,
"learning_rate": 0.0005836972452208654,
"loss": 4185.25,
"step": 4520
},
{
"ce_loss_13": 2.7475471079349516,
"ce_loss_26": 2.157953730225563,
"ce_loss_39": 1.9457335144281387,
"ce_loss_52": 1.4404390811920167,
"ce_loss_7": 3.104820030927658,
"epoch": 0.453,
"grad_norm": 15.220972226004102,
"kl_loss_13": 2688.4,
"kl_loss_26": 1444.2,
"kl_loss_39": 1006.8,
"kl_loss_7": 3431.6,
"learning_rate": 0.0005821325512950885,
"loss": 4222.6,
"step": 4530
},
{
"ce_loss_13": 2.7701157510280607,
"ce_loss_26": 2.1823565661907196,
"ce_loss_39": 1.9763484060764314,
"ce_loss_52": 1.4842363893985748,
"ce_loss_7": 3.1242611587047575,
"epoch": 0.454,
"grad_norm": 16.181871779695452,
"kl_loss_13": 2641.6,
"kl_loss_26": 1398.0,
"kl_loss_39": 968.2,
"kl_loss_7": 3387.6,
"learning_rate": 0.0005805670302954321,
"loss": 4206.95,
"step": 4540
},
{
"ce_loss_13": 2.69073800444603,
"ce_loss_26": 2.1018889248371124,
"ce_loss_39": 1.8928476065397262,
"ce_loss_52": 1.4156933531165123,
"ce_loss_7": 3.051577550172806,
"epoch": 0.455,
"grad_norm": 15.802548274169151,
"kl_loss_13": 2629.2,
"kl_loss_26": 1374.2,
"kl_loss_39": 934.4,
"kl_loss_7": 3382.8,
"learning_rate": 0.000579000697986675,
"loss": 4173.65,
"step": 4550
},
{
"ce_loss_13": 2.734744447469711,
"ce_loss_26": 2.139357805252075,
"ce_loss_39": 1.9327150255441665,
"ce_loss_52": 1.44863750487566,
"ce_loss_7": 3.083251416683197,
"epoch": 0.456,
"grad_norm": 15.332335326805197,
"kl_loss_13": 2646.0,
"kl_loss_26": 1390.4,
"kl_loss_39": 959.2,
"kl_loss_7": 3391.2,
"learning_rate": 0.0005774335701417662,
"loss": 4177.45,
"step": 4560
},
{
"ce_loss_13": 2.696203714609146,
"ce_loss_26": 2.102033945918083,
"ce_loss_39": 1.8971556156873703,
"ce_loss_52": 1.4197801396250724,
"ce_loss_7": 3.047564595937729,
"epoch": 0.457,
"grad_norm": 16.076096882060348,
"kl_loss_13": 2600.0,
"kl_loss_26": 1362.8,
"kl_loss_39": 939.0,
"kl_loss_7": 3341.2,
"learning_rate": 0.0005758656625416658,
"loss": 4183.3,
"step": 4570
},
{
"ce_loss_13": 2.7472688376903536,
"ce_loss_26": 2.1439451813697814,
"ce_loss_39": 1.9378813654184341,
"ce_loss_52": 1.4498099207878112,
"ce_loss_7": 3.1056803286075594,
"epoch": 0.458,
"grad_norm": 15.434602661166036,
"kl_loss_13": 2667.6,
"kl_loss_26": 1394.6,
"kl_loss_39": 964.2,
"kl_loss_7": 3421.6,
"learning_rate": 0.0005742969909751859,
"loss": 4202.65,
"step": 4580
},
{
"ce_loss_13": 2.819844591617584,
"ce_loss_26": 2.217494735121727,
"ce_loss_39": 1.995268750190735,
"ce_loss_52": 1.4828792631626129,
"ce_loss_7": 3.183567076921463,
"epoch": 0.459,
"grad_norm": 15.16665840440692,
"kl_loss_13": 2715.2,
"kl_loss_26": 1447.8,
"kl_loss_39": 1000.9,
"kl_loss_7": 3469.2,
"learning_rate": 0.0005727275712388318,
"loss": 4159.15,
"step": 4590
},
{
"ce_loss_13": 2.757504242658615,
"ce_loss_26": 2.1517707139253615,
"ce_loss_39": 1.9407849818468095,
"ce_loss_52": 1.4452633827924728,
"ce_loss_7": 3.1194639682769774,
"epoch": 0.46,
"grad_norm": 16.189362396808324,
"kl_loss_13": 2690.8,
"kl_loss_26": 1421.0,
"kl_loss_39": 976.6,
"kl_loss_7": 3439.6,
"learning_rate": 0.0005711574191366427,
"loss": 4126.7,
"step": 4600
},
{
"ce_loss_13": 2.7185553312301636,
"ce_loss_26": 2.140464088320732,
"ce_loss_39": 1.9312822461128234,
"ce_loss_52": 1.4499182224273681,
"ce_loss_7": 3.074033808708191,
"epoch": 0.461,
"grad_norm": 15.796779340482095,
"kl_loss_13": 2590.8,
"kl_loss_26": 1373.5,
"kl_loss_39": 946.6,
"kl_loss_7": 3326.0,
"learning_rate": 0.0005695865504800327,
"loss": 4117.15,
"step": 4610
},
{
"ce_loss_13": 2.689697802066803,
"ce_loss_26": 2.1229261219501496,
"ce_loss_39": 1.9236579477787017,
"ce_loss_52": 1.4474295616149901,
"ce_loss_7": 3.0403923749923707,
"epoch": 0.462,
"grad_norm": 15.469933015809259,
"kl_loss_13": 2559.6,
"kl_loss_26": 1353.2,
"kl_loss_39": 936.3,
"kl_loss_7": 3287.2,
"learning_rate": 0.0005680149810876322,
"loss": 4141.45,
"step": 4620
},
{
"ce_loss_13": 2.709194713830948,
"ce_loss_26": 2.1067664295434954,
"ce_loss_39": 1.8838362753391267,
"ce_loss_52": 1.4009573340415955,
"ce_loss_7": 3.070843666791916,
"epoch": 0.463,
"grad_norm": 15.475096434174118,
"kl_loss_13": 2674.8,
"kl_loss_26": 1399.4,
"kl_loss_39": 943.3,
"kl_loss_7": 3422.8,
"learning_rate": 0.0005664427267851271,
"loss": 4160.8,
"step": 4630
},
{
"ce_loss_13": 2.7120527923107147,
"ce_loss_26": 2.120649069547653,
"ce_loss_39": 1.9163700252771378,
"ce_loss_52": 1.4380100429058076,
"ce_loss_7": 3.061056911945343,
"epoch": 0.464,
"grad_norm": 15.555501351653449,
"kl_loss_13": 2608.0,
"kl_loss_26": 1354.6,
"kl_loss_39": 932.5,
"kl_loss_7": 3340.4,
"learning_rate": 0.0005648698034051009,
"loss": 4170.2,
"step": 4640
},
{
"ce_loss_13": 2.7389404594898226,
"ce_loss_26": 2.1454448729753492,
"ce_loss_39": 1.9379454165697099,
"ce_loss_52": 1.45494404733181,
"ce_loss_7": 3.090787374973297,
"epoch": 0.465,
"grad_norm": 17.172011566290227,
"kl_loss_13": 2617.0,
"kl_loss_26": 1379.3,
"kl_loss_39": 945.6,
"kl_loss_7": 3354.0,
"learning_rate": 0.0005632962267868747,
"loss": 4137.2,
"step": 4650
},
{
"ce_loss_13": 2.618588683009148,
"ce_loss_26": 2.0511282205581667,
"ce_loss_39": 1.857603308558464,
"ce_loss_52": 1.3924044981598853,
"ce_loss_7": 2.9628873229026795,
"epoch": 0.466,
"grad_norm": 15.078077143393564,
"kl_loss_13": 2528.4,
"kl_loss_26": 1330.4,
"kl_loss_39": 924.6,
"kl_loss_7": 3258.4,
"learning_rate": 0.0005617220127763474,
"loss": 4108.7,
"step": 4660
},
{
"ce_loss_13": 2.706065672636032,
"ce_loss_26": 2.128128296136856,
"ce_loss_39": 1.921997308731079,
"ce_loss_52": 1.4441686987876892,
"ce_loss_7": 3.059876149892807,
"epoch": 0.467,
"grad_norm": 16.456195061500843,
"kl_loss_13": 2576.0,
"kl_loss_26": 1356.8,
"kl_loss_39": 938.3,
"kl_loss_7": 3319.6,
"learning_rate": 0.0005601471772258368,
"loss": 4092.5,
"step": 4670
},
{
"ce_loss_13": 2.691108763217926,
"ce_loss_26": 2.1213403046131134,
"ce_loss_39": 1.9077781707048416,
"ce_loss_52": 1.4377064436674118,
"ce_loss_7": 3.0406983733177184,
"epoch": 0.468,
"grad_norm": 15.284887521485958,
"kl_loss_13": 2584.8,
"kl_loss_26": 1368.0,
"kl_loss_39": 933.0,
"kl_loss_7": 3316.4,
"learning_rate": 0.0005585717359939192,
"loss": 4090.9,
"step": 4680
},
{
"ce_loss_13": 2.715133213996887,
"ce_loss_26": 2.1481328904628754,
"ce_loss_39": 1.938890340924263,
"ce_loss_52": 1.456964261829853,
"ce_loss_7": 3.063201904296875,
"epoch": 0.469,
"grad_norm": 14.95117298176539,
"kl_loss_13": 2573.6,
"kl_loss_26": 1372.5,
"kl_loss_39": 948.0,
"kl_loss_7": 3294.4,
"learning_rate": 0.0005569957049452703,
"loss": 4067.75,
"step": 4690
},
{
"ce_loss_13": 2.7355633437633515,
"ce_loss_26": 2.130109578371048,
"ce_loss_39": 1.9153006434440614,
"ce_loss_52": 1.4125859558582305,
"ce_loss_7": 3.0963422894477843,
"epoch": 0.47,
"grad_norm": 15.35914349074289,
"kl_loss_13": 2726.0,
"kl_loss_26": 1448.2,
"kl_loss_39": 1002.1,
"kl_loss_7": 3478.0,
"learning_rate": 0.0005554190999505056,
"loss": 4157.55,
"step": 4700
},
{
"ce_loss_13": 2.6879218101501463,
"ce_loss_26": 2.098815104365349,
"ce_loss_39": 1.8952988266944886,
"ce_loss_52": 1.4328953355550766,
"ce_loss_7": 3.040910530090332,
"epoch": 0.471,
"grad_norm": 15.95882741217506,
"kl_loss_13": 2568.0,
"kl_loss_26": 1323.6,
"kl_loss_39": 901.3,
"kl_loss_7": 3319.6,
"learning_rate": 0.0005538419368860196,
"loss": 4062.85,
"step": 4710
},
{
"ce_loss_13": 2.6794900715351107,
"ce_loss_26": 2.0905598402023315,
"ce_loss_39": 1.8867737114429475,
"ce_loss_52": 1.4186928808689117,
"ce_loss_7": 3.0324371635913847,
"epoch": 0.472,
"grad_norm": 15.674607259246525,
"kl_loss_13": 2569.4,
"kl_loss_26": 1340.6,
"kl_loss_39": 914.1,
"kl_loss_7": 3303.6,
"learning_rate": 0.0005522642316338268,
"loss": 4084.1,
"step": 4720
},
{
"ce_loss_13": 2.7094511866569517,
"ce_loss_26": 2.1240269035100936,
"ce_loss_39": 1.9192228257656097,
"ce_loss_52": 1.4589810997247696,
"ce_loss_7": 3.0527816653251647,
"epoch": 0.473,
"grad_norm": 15.84409146313606,
"kl_loss_13": 2555.6,
"kl_loss_26": 1330.2,
"kl_loss_39": 901.4,
"kl_loss_7": 3270.0,
"learning_rate": 0.0005506860000814017,
"loss": 4024.65,
"step": 4730
},
{
"ce_loss_13": 2.686307519674301,
"ce_loss_26": 2.1099446028470994,
"ce_loss_39": 1.9098408967256546,
"ce_loss_52": 1.4574729681015015,
"ce_loss_7": 3.03258957862854,
"epoch": 0.474,
"grad_norm": 16.290287645699628,
"kl_loss_13": 2538.4,
"kl_loss_26": 1308.0,
"kl_loss_39": 889.7,
"kl_loss_7": 3278.4,
"learning_rate": 0.0005491072581215186,
"loss": 4058.25,
"step": 4740
},
{
"ce_loss_13": 2.685838830471039,
"ce_loss_26": 2.097555673122406,
"ce_loss_39": 1.8953343421220779,
"ce_loss_52": 1.4276258319616317,
"ce_loss_7": 3.042680394649506,
"epoch": 0.475,
"grad_norm": 15.834434583556519,
"kl_loss_13": 2590.0,
"kl_loss_26": 1354.4,
"kl_loss_39": 931.0,
"kl_loss_7": 3334.4,
"learning_rate": 0.0005475280216520913,
"loss": 4057.7,
"step": 4750
},
{
"ce_loss_13": 2.645804923772812,
"ce_loss_26": 2.073691374063492,
"ce_loss_39": 1.872296717762947,
"ce_loss_52": 1.4167594254016875,
"ce_loss_7": 2.992863970994949,
"epoch": 0.476,
"grad_norm": 15.93243827944326,
"kl_loss_13": 2532.4,
"kl_loss_26": 1326.8,
"kl_loss_39": 900.3,
"kl_loss_7": 3268.8,
"learning_rate": 0.0005459483065760138,
"loss": 4104.4,
"step": 4760
},
{
"ce_loss_13": 2.7042051672935488,
"ce_loss_26": 2.1117112547159196,
"ce_loss_39": 1.9040750682353973,
"ce_loss_52": 1.4314228266477584,
"ce_loss_7": 3.0566958367824553,
"epoch": 0.477,
"grad_norm": 15.353551244219883,
"kl_loss_13": 2608.4,
"kl_loss_26": 1361.8,
"kl_loss_39": 937.4,
"kl_loss_7": 3352.4,
"learning_rate": 0.0005443681288009991,
"loss": 4078.7,
"step": 4770
},
{
"ce_loss_13": 2.6856160342693327,
"ce_loss_26": 2.0829177469015123,
"ce_loss_39": 1.8763625353574753,
"ce_loss_52": 1.4030324995517731,
"ce_loss_7": 3.0448363423347473,
"epoch": 0.478,
"grad_norm": 16.14315974332923,
"kl_loss_13": 2646.4,
"kl_loss_26": 1377.9,
"kl_loss_39": 941.4,
"kl_loss_7": 3398.4,
"learning_rate": 0.0005427875042394199,
"loss": 4031.6,
"step": 4780
},
{
"ce_loss_13": 2.6821600914001467,
"ce_loss_26": 2.1105621844530105,
"ce_loss_39": 1.9080501794815063,
"ce_loss_52": 1.4579481482505798,
"ce_loss_7": 3.0244390606880187,
"epoch": 0.479,
"grad_norm": 16.41928903258253,
"kl_loss_13": 2508.6,
"kl_loss_26": 1304.7,
"kl_loss_39": 890.8,
"kl_loss_7": 3233.6,
"learning_rate": 0.0005412064488081482,
"loss": 4041.85,
"step": 4790
},
{
"ce_loss_13": 2.644778722524643,
"ce_loss_26": 2.0624290674924852,
"ce_loss_39": 1.8686836928129196,
"ce_loss_52": 1.4204004764556886,
"ce_loss_7": 2.987608629465103,
"epoch": 0.48,
"grad_norm": 15.406655412683705,
"kl_loss_13": 2521.2,
"kl_loss_26": 1293.1,
"kl_loss_39": 883.5,
"kl_loss_7": 3258.4,
"learning_rate": 0.0005396249784283942,
"loss": 4018.65,
"step": 4800
},
{
"ce_loss_13": 2.6721664726734162,
"ce_loss_26": 2.0903854191303255,
"ce_loss_39": 1.8859550595283507,
"ce_loss_52": 1.4339767321944237,
"ce_loss_7": 3.022903573513031,
"epoch": 0.481,
"grad_norm": 15.533152676819311,
"kl_loss_13": 2549.2,
"kl_loss_26": 1311.5,
"kl_loss_39": 887.7,
"kl_loss_7": 3276.8,
"learning_rate": 0.0005380431090255476,
"loss": 4094.5,
"step": 4810
},
{
"ce_loss_13": 2.7146060168743134,
"ce_loss_26": 2.138300836086273,
"ce_loss_39": 1.9245162457227707,
"ce_loss_52": 1.4378316938877105,
"ce_loss_7": 3.0600010454654694,
"epoch": 0.482,
"grad_norm": 15.889585422309347,
"kl_loss_13": 2616.4,
"kl_loss_26": 1396.0,
"kl_loss_39": 954.3,
"kl_loss_7": 3337.2,
"learning_rate": 0.0005364608565290155,
"loss": 4019.85,
"step": 4820
},
{
"ce_loss_13": 2.7267942845821382,
"ce_loss_26": 2.1315987795591353,
"ce_loss_39": 1.92288878262043,
"ce_loss_52": 1.4603519141674042,
"ce_loss_7": 3.089752674102783,
"epoch": 0.483,
"grad_norm": 14.942603293515566,
"kl_loss_13": 2598.4,
"kl_loss_26": 1350.0,
"kl_loss_39": 913.7,
"kl_loss_7": 3351.2,
"learning_rate": 0.0005348782368720626,
"loss": 4054.35,
"step": 4830
},
{
"ce_loss_13": 2.702422133088112,
"ce_loss_26": 2.1349568367004395,
"ce_loss_39": 1.923803049325943,
"ce_loss_52": 1.4461625874042512,
"ce_loss_7": 3.0548571348190308,
"epoch": 0.484,
"grad_norm": 14.924067089524062,
"kl_loss_13": 2579.8,
"kl_loss_26": 1357.8,
"kl_loss_39": 926.9,
"kl_loss_7": 3324.0,
"learning_rate": 0.000533295265991652,
"loss": 4024.65,
"step": 4840
},
{
"ce_loss_13": 2.6280339270830155,
"ce_loss_26": 2.0501014798879624,
"ce_loss_39": 1.8476706713438034,
"ce_loss_52": 1.3977838337421418,
"ce_loss_7": 2.9806483924388885,
"epoch": 0.485,
"grad_norm": 16.07575861381767,
"kl_loss_13": 2515.0,
"kl_loss_26": 1293.9,
"kl_loss_39": 881.9,
"kl_loss_7": 3258.8,
"learning_rate": 0.0005317119598282822,
"loss": 4003.05,
"step": 4850
},
{
"ce_loss_13": 2.7017304062843324,
"ce_loss_26": 2.123658448457718,
"ce_loss_39": 1.9226309835910798,
"ce_loss_52": 1.4685954213142396,
"ce_loss_7": 3.052913784980774,
"epoch": 0.486,
"grad_norm": 14.261884924612158,
"kl_loss_13": 2552.0,
"kl_loss_26": 1330.6,
"kl_loss_39": 901.6,
"kl_loss_7": 3291.2,
"learning_rate": 0.0005301283343258293,
"loss": 4032.7,
"step": 4860
},
{
"ce_loss_13": 2.663911575078964,
"ce_loss_26": 2.0811035096645356,
"ce_loss_39": 1.8746151685714723,
"ce_loss_52": 1.4212765499949456,
"ce_loss_7": 3.0172334790229796,
"epoch": 0.487,
"grad_norm": 15.979242608660392,
"kl_loss_13": 2526.0,
"kl_loss_26": 1301.0,
"kl_loss_39": 883.9,
"kl_loss_7": 3276.4,
"learning_rate": 0.000528544405431384,
"loss": 4020.25,
"step": 4870
},
{
"ce_loss_13": 2.660174161195755,
"ce_loss_26": 2.0942499101161958,
"ce_loss_39": 1.8920150458812715,
"ce_loss_52": 1.4465530335903167,
"ce_loss_7": 3.0074438989162444,
"epoch": 0.488,
"grad_norm": 14.78308909898239,
"kl_loss_13": 2500.8,
"kl_loss_26": 1294.5,
"kl_loss_39": 873.6,
"kl_loss_7": 3228.0,
"learning_rate": 0.000526960189095093,
"loss": 4016.5,
"step": 4880
},
{
"ce_loss_13": 2.635312020778656,
"ce_loss_26": 2.0795453995466233,
"ce_loss_39": 1.8807054102420806,
"ce_loss_52": 1.4317258328199387,
"ce_loss_7": 2.9746360957622526,
"epoch": 0.489,
"grad_norm": 15.033016045239172,
"kl_loss_13": 2473.6,
"kl_loss_26": 1289.8,
"kl_loss_39": 880.7,
"kl_loss_7": 3185.2,
"learning_rate": 0.0005253757012699972,
"loss": 3996.8,
"step": 4890
},
{
"ce_loss_13": 2.68422954082489,
"ce_loss_26": 2.1027563750743865,
"ce_loss_39": 1.8974193513393403,
"ce_loss_52": 1.4426774829626083,
"ce_loss_7": 3.037678909301758,
"epoch": 0.49,
"grad_norm": 15.631967370179172,
"kl_loss_13": 2553.2,
"kl_loss_26": 1323.6,
"kl_loss_39": 894.5,
"kl_loss_7": 3295.2,
"learning_rate": 0.0005237909579118712,
"loss": 3967.65,
"step": 4900
},
{
"ce_loss_13": 2.680465018749237,
"ce_loss_26": 2.1046013057231905,
"ce_loss_39": 1.9009331673383714,
"ce_loss_52": 1.44480240046978,
"ce_loss_7": 3.036197912693024,
"epoch": 0.491,
"grad_norm": 15.759372326211485,
"kl_loss_13": 2524.0,
"kl_loss_26": 1301.7,
"kl_loss_39": 885.8,
"kl_loss_7": 3274.8,
"learning_rate": 0.0005222059749790631,
"loss": 3979.5,
"step": 4910
},
{
"ce_loss_13": 2.6961126804351805,
"ce_loss_26": 2.1182867020368574,
"ce_loss_39": 1.9074458956718445,
"ce_loss_52": 1.456271693110466,
"ce_loss_7": 3.044221115112305,
"epoch": 0.492,
"grad_norm": 16.34015666907617,
"kl_loss_13": 2536.4,
"kl_loss_26": 1322.4,
"kl_loss_39": 892.3,
"kl_loss_7": 3262.8,
"learning_rate": 0.0005206207684323337,
"loss": 3964.95,
"step": 4920
},
{
"ce_loss_13": 2.6250401854515077,
"ce_loss_26": 2.041203039884567,
"ce_loss_39": 1.8369982630014419,
"ce_loss_52": 1.4043618232011794,
"ce_loss_7": 2.976498603820801,
"epoch": 0.493,
"grad_norm": 15.643386607908784,
"kl_loss_13": 2504.8,
"kl_loss_26": 1279.6,
"kl_loss_39": 857.1,
"kl_loss_7": 3243.6,
"learning_rate": 0.000519035354234695,
"loss": 3956.4,
"step": 4930
},
{
"ce_loss_13": 2.731994906067848,
"ce_loss_26": 2.151404523849487,
"ce_loss_39": 1.9346221089363098,
"ce_loss_52": 1.4679641619324684,
"ce_loss_7": 3.0894507080316544,
"epoch": 0.494,
"grad_norm": 15.972342532535162,
"kl_loss_13": 2584.2,
"kl_loss_26": 1354.3,
"kl_loss_39": 912.1,
"kl_loss_7": 3338.8,
"learning_rate": 0.0005174497483512506,
"loss": 3986.95,
"step": 4940
},
{
"ce_loss_13": 2.692367374897003,
"ce_loss_26": 2.122538897395134,
"ce_loss_39": 1.9124073147773744,
"ce_loss_52": 1.452483794093132,
"ce_loss_7": 3.042392200231552,
"epoch": 0.495,
"grad_norm": 15.983001798116502,
"kl_loss_13": 2533.4,
"kl_loss_26": 1327.0,
"kl_loss_39": 895.7,
"kl_loss_7": 3252.0,
"learning_rate": 0.0005158639667490339,
"loss": 3967.55,
"step": 4950
},
{
"ce_loss_13": 2.6044699877500532,
"ce_loss_26": 2.0326590865850447,
"ce_loss_39": 1.8286783695220947,
"ce_loss_52": 1.3870023548603059,
"ce_loss_7": 2.947771966457367,
"epoch": 0.496,
"grad_norm": 15.42571652932802,
"kl_loss_13": 2500.2,
"kl_loss_26": 1296.2,
"kl_loss_39": 875.7,
"kl_loss_7": 3230.8,
"learning_rate": 0.0005142780253968481,
"loss": 3955.8,
"step": 4960
},
{
"ce_loss_13": 2.642567253112793,
"ce_loss_26": 2.0776680946350097,
"ce_loss_39": 1.875117465853691,
"ce_loss_52": 1.4392555862665177,
"ce_loss_7": 2.991294425725937,
"epoch": 0.497,
"grad_norm": 15.451081778073426,
"kl_loss_13": 2501.6,
"kl_loss_26": 1291.4,
"kl_loss_39": 867.8,
"kl_loss_7": 3244.8,
"learning_rate": 0.0005126919402651053,
"loss": 3945.75,
"step": 4970
},
{
"ce_loss_13": 2.6330737471580505,
"ce_loss_26": 2.0630074977874755,
"ce_loss_39": 1.856469190120697,
"ce_loss_52": 1.4164980471134185,
"ce_loss_7": 2.9769130408763886,
"epoch": 0.498,
"grad_norm": 15.34176013733142,
"kl_loss_13": 2503.2,
"kl_loss_26": 1294.2,
"kl_loss_39": 874.8,
"kl_loss_7": 3228.4,
"learning_rate": 0.0005111057273256647,
"loss": 3917.2,
"step": 4980
},
{
"ce_loss_13": 2.6576701521873476,
"ce_loss_26": 2.089341068267822,
"ce_loss_39": 1.8844720661640166,
"ce_loss_52": 1.4477695405483246,
"ce_loss_7": 3.0070637345314024,
"epoch": 0.499,
"grad_norm": 15.00660182184455,
"kl_loss_13": 2473.8,
"kl_loss_26": 1283.9,
"kl_loss_39": 853.9,
"kl_loss_7": 3207.6,
"learning_rate": 0.0005095194025516733,
"loss": 3923.7,
"step": 4990
},
{
"ce_loss_13": 2.697542816400528,
"ce_loss_26": 2.125366801023483,
"ce_loss_39": 1.9231987714767456,
"ce_loss_52": 1.4726029485464096,
"ce_loss_7": 3.0440734326839447,
"epoch": 0.5,
"grad_norm": 15.160859466153447,
"kl_loss_13": 2511.6,
"kl_loss_26": 1303.5,
"kl_loss_39": 880.0,
"kl_loss_7": 3249.2,
"learning_rate": 0.000507932981917404,
"loss": 3938.75,
"step": 5000
},
{
"ce_loss_13": 2.554905018210411,
"ce_loss_26": 1.9959456473588943,
"ce_loss_39": 1.8001023352146148,
"ce_loss_52": 1.370650653541088,
"ce_loss_7": 2.8987653195858,
"epoch": 0.501,
"grad_norm": 17.14356490933402,
"kl_loss_13": 2439.4,
"kl_loss_26": 1259.5,
"kl_loss_39": 849.2,
"kl_loss_7": 3158.0,
"learning_rate": 0.0005063464813980949,
"loss": 3915.65,
"step": 5010
},
{
"ce_loss_13": 2.5977672755718233,
"ce_loss_26": 2.0211563646793365,
"ce_loss_39": 1.8262152045965194,
"ce_loss_52": 1.3998217657208443,
"ce_loss_7": 2.950869935750961,
"epoch": 0.502,
"grad_norm": 14.97441402973133,
"kl_loss_13": 2466.8,
"kl_loss_26": 1257.6,
"kl_loss_39": 848.9,
"kl_loss_7": 3202.0,
"learning_rate": 0.0005047599169697884,
"loss": 3931.7,
"step": 5020
},
{
"ce_loss_13": 2.6368628799915315,
"ce_loss_26": 2.0560896009206773,
"ce_loss_39": 1.8569422334432601,
"ce_loss_52": 1.429998092353344,
"ce_loss_7": 2.981409990787506,
"epoch": 0.503,
"grad_norm": 15.444575545537123,
"kl_loss_13": 2486.4,
"kl_loss_26": 1264.6,
"kl_loss_39": 845.9,
"kl_loss_7": 3216.4,
"learning_rate": 0.000503173304609171,
"loss": 3936.6,
"step": 5030
},
{
"ce_loss_13": 2.694787061214447,
"ce_loss_26": 2.10608988404274,
"ce_loss_39": 1.8957834452390672,
"ce_loss_52": 1.4505297511816024,
"ce_loss_7": 3.0503607213497164,
"epoch": 0.504,
"grad_norm": 15.80764020207202,
"kl_loss_13": 2558.4,
"kl_loss_26": 1318.1,
"kl_loss_39": 884.7,
"kl_loss_7": 3306.4,
"learning_rate": 0.0005015866602934111,
"loss": 3939.75,
"step": 5040
},
{
"ce_loss_13": 2.621226805448532,
"ce_loss_26": 2.0584420263767242,
"ce_loss_39": 1.861902078986168,
"ce_loss_52": 1.4422300636768342,
"ce_loss_7": 2.951041603088379,
"epoch": 0.505,
"grad_norm": 14.881966197673897,
"kl_loss_13": 2426.4,
"kl_loss_26": 1238.3,
"kl_loss_39": 828.1,
"kl_loss_7": 3127.6,
"learning_rate": 0.0005,
"loss": 3934.2,
"step": 5050
},
{
"ce_loss_13": 2.660506749153137,
"ce_loss_26": 2.098774325847626,
"ce_loss_39": 1.886313620209694,
"ce_loss_52": 1.4474970057606698,
"ce_loss_7": 3.011888575553894,
"epoch": 0.506,
"grad_norm": 14.941506448796416,
"kl_loss_13": 2498.2,
"kl_loss_26": 1299.9,
"kl_loss_39": 867.9,
"kl_loss_7": 3221.2,
"learning_rate": 0.0004984133397065889,
"loss": 3903.9,
"step": 5060
},
{
"ce_loss_13": 2.626861798763275,
"ce_loss_26": 2.0604827493429183,
"ce_loss_39": 1.8599964112043381,
"ce_loss_52": 1.4386487394571303,
"ce_loss_7": 2.975832390785217,
"epoch": 0.507,
"grad_norm": 15.44952616226393,
"kl_loss_13": 2452.8,
"kl_loss_26": 1250.8,
"kl_loss_39": 833.3,
"kl_loss_7": 3188.4,
"learning_rate": 0.0004968266953908291,
"loss": 3885.3,
"step": 5070
},
{
"ce_loss_13": 2.562318778038025,
"ce_loss_26": 1.9999902278184891,
"ce_loss_39": 1.8035208880901337,
"ce_loss_52": 1.3915461212396623,
"ce_loss_7": 2.904841202497482,
"epoch": 0.508,
"grad_norm": 14.83407474509182,
"kl_loss_13": 2407.2,
"kl_loss_26": 1226.2,
"kl_loss_39": 815.3,
"kl_loss_7": 3134.0,
"learning_rate": 0.0004952400830302117,
"loss": 3881.25,
"step": 5080
},
{
"ce_loss_13": 2.5728674054145815,
"ce_loss_26": 2.013489532470703,
"ce_loss_39": 1.8151986598968506,
"ce_loss_52": 1.3948280960321426,
"ce_loss_7": 2.9155173718929293,
"epoch": 0.509,
"grad_norm": 14.64837783343134,
"kl_loss_13": 2420.0,
"kl_loss_26": 1236.4,
"kl_loss_39": 829.6,
"kl_loss_7": 3145.6,
"learning_rate": 0.0004936535186019053,
"loss": 3875.9,
"step": 5090
},
{
"ce_loss_13": 2.657623714208603,
"ce_loss_26": 2.0687634259462357,
"ce_loss_39": 1.8608120799064636,
"ce_loss_52": 1.418680590391159,
"ce_loss_7": 3.0090177237987517,
"epoch": 0.51,
"grad_norm": 15.50653814601008,
"kl_loss_13": 2529.6,
"kl_loss_26": 1302.4,
"kl_loss_39": 874.3,
"kl_loss_7": 3273.2,
"learning_rate": 0.000492067018082596,
"loss": 3924.45,
"step": 5100
},
{
"ce_loss_13": 2.651608294248581,
"ce_loss_26": 2.0792359739542006,
"ce_loss_39": 1.8647643625736237,
"ce_loss_52": 1.4323984265327454,
"ce_loss_7": 3.0009018778800964,
"epoch": 0.511,
"grad_norm": 14.358667358457254,
"kl_loss_13": 2496.8,
"kl_loss_26": 1292.0,
"kl_loss_39": 854.6,
"kl_loss_7": 3224.0,
"learning_rate": 0.0004904805974483267,
"loss": 3865.45,
"step": 5110
},
{
"ce_loss_13": 2.641629362106323,
"ce_loss_26": 2.072116160392761,
"ce_loss_39": 1.8716523438692092,
"ce_loss_52": 1.4481347769498825,
"ce_loss_7": 2.989072245359421,
"epoch": 0.512,
"grad_norm": 15.321352631891271,
"kl_loss_13": 2423.6,
"kl_loss_26": 1237.5,
"kl_loss_39": 826.9,
"kl_loss_7": 3150.8,
"learning_rate": 0.0004888942726743353,
"loss": 3863.15,
"step": 5120
},
{
"ce_loss_13": 2.6105270087718964,
"ce_loss_26": 2.041230320930481,
"ce_loss_39": 1.8405257403850555,
"ce_loss_52": 1.4164400100708008,
"ce_loss_7": 2.9590699791908266,
"epoch": 0.513,
"grad_norm": 15.778047819453661,
"kl_loss_13": 2444.0,
"kl_loss_26": 1244.1,
"kl_loss_39": 835.9,
"kl_loss_7": 3179.2,
"learning_rate": 0.0004873080597348947,
"loss": 3860.6,
"step": 5130
},
{
"ce_loss_13": 2.7020871877670287,
"ce_loss_26": 2.1217857897281647,
"ce_loss_39": 1.910724088549614,
"ce_loss_52": 1.4596379309892655,
"ce_loss_7": 3.0534912407398225,
"epoch": 0.514,
"grad_norm": 15.798738758695562,
"kl_loss_13": 2550.4,
"kl_loss_26": 1321.6,
"kl_loss_39": 886.4,
"kl_loss_7": 3290.0,
"learning_rate": 0.0004857219746031519,
"loss": 3877.4,
"step": 5140
},
{
"ce_loss_13": 2.629297113418579,
"ce_loss_26": 2.065913289785385,
"ce_loss_39": 1.8652951270341873,
"ce_loss_52": 1.4406857430934905,
"ce_loss_7": 2.975607615709305,
"epoch": 0.515,
"grad_norm": 15.843426047808201,
"kl_loss_13": 2425.2,
"kl_loss_26": 1235.3,
"kl_loss_39": 819.8,
"kl_loss_7": 3155.2,
"learning_rate": 0.0004841360332509663,
"loss": 3881.85,
"step": 5150
},
{
"ce_loss_13": 2.6566128492355348,
"ce_loss_26": 2.0851503133773805,
"ce_loss_39": 1.879001685976982,
"ce_loss_52": 1.4529996007680892,
"ce_loss_7": 3.001719295978546,
"epoch": 0.516,
"grad_norm": 15.230952623778938,
"kl_loss_13": 2453.2,
"kl_loss_26": 1247.2,
"kl_loss_39": 825.4,
"kl_loss_7": 3185.6,
"learning_rate": 0.0004825502516487497,
"loss": 3877.3,
"step": 5160
},
{
"ce_loss_13": 2.6446830928325653,
"ce_loss_26": 2.087994411587715,
"ce_loss_39": 1.8902768969535828,
"ce_loss_52": 1.4724875479936599,
"ce_loss_7": 2.994678741693497,
"epoch": 0.517,
"grad_norm": 16.39635866458352,
"kl_loss_13": 2415.8,
"kl_loss_26": 1232.4,
"kl_loss_39": 823.7,
"kl_loss_7": 3135.2,
"learning_rate": 0.00048096464576530507,
"loss": 3828.0,
"step": 5170
},
{
"ce_loss_13": 2.5944429993629456,
"ce_loss_26": 2.036814641952515,
"ce_loss_39": 1.841314834356308,
"ce_loss_52": 1.4209656611084938,
"ce_loss_7": 2.9290528297424316,
"epoch": 0.518,
"grad_norm": 14.479752914773284,
"kl_loss_13": 2424.2,
"kl_loss_26": 1242.2,
"kl_loss_39": 829.4,
"kl_loss_7": 3150.4,
"learning_rate": 0.00047937923156766646,
"loss": 3845.8,
"step": 5180
},
{
"ce_loss_13": 2.660686820745468,
"ce_loss_26": 2.094280996918678,
"ce_loss_39": 1.8889294683933258,
"ce_loss_52": 1.4619006276130677,
"ce_loss_7": 3.008899760246277,
"epoch": 0.519,
"grad_norm": 16.01644389337825,
"kl_loss_13": 2475.6,
"kl_loss_26": 1273.8,
"kl_loss_39": 850.7,
"kl_loss_7": 3204.4,
"learning_rate": 0.00047779402502093696,
"loss": 3846.45,
"step": 5190
},
{
"ce_loss_13": 2.5972321003675463,
"ce_loss_26": 2.0274816364049912,
"ce_loss_39": 1.8247474491596223,
"ce_loss_52": 1.4065502554178237,
"ce_loss_7": 2.9485019743442535,
"epoch": 0.52,
"grad_norm": 14.98957972258473,
"kl_loss_13": 2450.8,
"kl_loss_26": 1255.3,
"kl_loss_39": 832.3,
"kl_loss_7": 3178.4,
"learning_rate": 0.0004762090420881289,
"loss": 3895.0,
"step": 5200
},
{
"ce_loss_13": 2.665889722108841,
"ce_loss_26": 2.110061451792717,
"ce_loss_39": 1.9084287703037262,
"ce_loss_52": 1.4813150823116303,
"ce_loss_7": 3.0068661749362944,
"epoch": 0.521,
"grad_norm": 15.21430509367723,
"kl_loss_13": 2448.0,
"kl_loss_26": 1268.2,
"kl_loss_39": 849.3,
"kl_loss_7": 3164.4,
"learning_rate": 0.00047462429873000296,
"loss": 3816.95,
"step": 5210
},
{
"ce_loss_13": 2.655850923061371,
"ce_loss_26": 2.085476315021515,
"ce_loss_39": 1.874689555168152,
"ce_loss_52": 1.446556892991066,
"ce_loss_7": 3.0054982542991637,
"epoch": 0.522,
"grad_norm": 16.427127136749267,
"kl_loss_13": 2452.4,
"kl_loss_26": 1264.0,
"kl_loss_39": 836.0,
"kl_loss_7": 3188.0,
"learning_rate": 0.0004730398109049071,
"loss": 3838.95,
"step": 5220
},
{
"ce_loss_13": 2.677028661966324,
"ce_loss_26": 2.119970577955246,
"ce_loss_39": 1.92548668384552,
"ce_loss_52": 1.4896603375673294,
"ce_loss_7": 3.008550250530243,
"epoch": 0.523,
"grad_norm": 16.0659576233147,
"kl_loss_13": 2444.4,
"kl_loss_26": 1258.1,
"kl_loss_39": 844.1,
"kl_loss_7": 3161.6,
"learning_rate": 0.000471455594568616,
"loss": 3864.55,
"step": 5230
},
{
"ce_loss_13": 2.6650006234645844,
"ce_loss_26": 2.085580566525459,
"ce_loss_39": 1.884287601709366,
"ce_loss_52": 1.453607079386711,
"ce_loss_7": 3.0236206233501433,
"epoch": 0.524,
"grad_norm": 14.653166262246215,
"kl_loss_13": 2489.6,
"kl_loss_26": 1272.8,
"kl_loss_39": 845.7,
"kl_loss_7": 3232.4,
"learning_rate": 0.00046987166567417086,
"loss": 3878.7,
"step": 5240
},
{
"ce_loss_13": 2.584680277109146,
"ce_loss_26": 2.0209302097558974,
"ce_loss_39": 1.8149988621473312,
"ce_loss_52": 1.390892931818962,
"ce_loss_7": 2.9321685075759887,
"epoch": 0.525,
"grad_norm": 15.168241253700586,
"kl_loss_13": 2428.4,
"kl_loss_26": 1252.2,
"kl_loss_39": 829.9,
"kl_loss_7": 3154.8,
"learning_rate": 0.00046828804017171776,
"loss": 3851.45,
"step": 5250
},
{
"ce_loss_13": 2.6199812322854994,
"ce_loss_26": 2.072703015804291,
"ce_loss_39": 1.8745949417352676,
"ce_loss_52": 1.4573716089129447,
"ce_loss_7": 2.9569752633571627,
"epoch": 0.526,
"grad_norm": 15.019656924741254,
"kl_loss_13": 2412.4,
"kl_loss_26": 1238.7,
"kl_loss_39": 825.7,
"kl_loss_7": 3130.0,
"learning_rate": 0.00046670473400834805,
"loss": 3822.4,
"step": 5260
},
{
"ce_loss_13": 2.614406701922417,
"ce_loss_26": 2.038132056593895,
"ce_loss_39": 1.835758489370346,
"ce_loss_52": 1.4097227707505227,
"ce_loss_7": 2.953414297103882,
"epoch": 0.527,
"grad_norm": 15.237091081862786,
"kl_loss_13": 2460.0,
"kl_loss_26": 1259.0,
"kl_loss_39": 841.0,
"kl_loss_7": 3178.4,
"learning_rate": 0.00046512176312793734,
"loss": 3820.5,
"step": 5270
},
{
"ce_loss_13": 2.5675832986831666,
"ce_loss_26": 2.0170306622982026,
"ce_loss_39": 1.8148203998804093,
"ce_loss_52": 1.4015038818120957,
"ce_loss_7": 2.9166265070438384,
"epoch": 0.528,
"grad_norm": 15.622184104337453,
"kl_loss_13": 2384.4,
"kl_loss_26": 1219.7,
"kl_loss_39": 809.7,
"kl_loss_7": 3111.2,
"learning_rate": 0.00046353914347098467,
"loss": 3805.5,
"step": 5280
},
{
"ce_loss_13": 2.600316107273102,
"ce_loss_26": 2.038938209414482,
"ce_loss_39": 1.8384071439504623,
"ce_loss_52": 1.4146052584052087,
"ce_loss_7": 2.9424943923950195,
"epoch": 0.529,
"grad_norm": 16.18662441031771,
"kl_loss_13": 2437.8,
"kl_loss_26": 1252.1,
"kl_loss_39": 829.2,
"kl_loss_7": 3150.0,
"learning_rate": 0.0004619568909744524,
"loss": 3784.9,
"step": 5290
},
{
"ce_loss_13": 2.6181087523698805,
"ce_loss_26": 2.0588752537965775,
"ce_loss_39": 1.85763358771801,
"ce_loss_52": 1.4420970767736434,
"ce_loss_7": 2.962410408258438,
"epoch": 0.53,
"grad_norm": 15.067130011925716,
"kl_loss_13": 2406.6,
"kl_loss_26": 1231.8,
"kl_loss_39": 819.2,
"kl_loss_7": 3132.4,
"learning_rate": 0.00046037502157160573,
"loss": 3815.9,
"step": 5300
},
{
"ce_loss_13": 2.5704480171203614,
"ce_loss_26": 2.007723420858383,
"ce_loss_39": 1.8157259494066238,
"ce_loss_52": 1.4100188240408897,
"ce_loss_7": 2.9128104388713836,
"epoch": 0.531,
"grad_norm": 14.940801357367253,
"kl_loss_13": 2375.2,
"kl_loss_26": 1192.5,
"kl_loss_39": 790.2,
"kl_loss_7": 3097.2,
"learning_rate": 0.00045879355119185207,
"loss": 3774.75,
"step": 5310
},
{
"ce_loss_13": 2.6117980778217316,
"ce_loss_26": 2.049151787161827,
"ce_loss_39": 1.8556257247924806,
"ce_loss_52": 1.4484239488840103,
"ce_loss_7": 2.9571595907211305,
"epoch": 0.532,
"grad_norm": 15.084958990702525,
"kl_loss_13": 2412.8,
"kl_loss_26": 1214.0,
"kl_loss_39": 801.8,
"kl_loss_7": 3144.4,
"learning_rate": 0.0004572124957605803,
"loss": 3796.3,
"step": 5320
},
{
"ce_loss_13": 2.6047778964042663,
"ce_loss_26": 2.0361655563116074,
"ce_loss_39": 1.829242792725563,
"ce_loss_52": 1.4240625560283662,
"ce_loss_7": 2.9437944889068604,
"epoch": 0.533,
"grad_norm": 14.875930516711946,
"kl_loss_13": 2409.0,
"kl_loss_26": 1224.1,
"kl_loss_39": 807.1,
"kl_loss_7": 3128.8,
"learning_rate": 0.00045563187119900103,
"loss": 3772.25,
"step": 5330
},
{
"ce_loss_13": 2.5524469763040543,
"ce_loss_26": 1.9957795530557632,
"ce_loss_39": 1.7972583919763565,
"ce_loss_52": 1.3942344322800637,
"ce_loss_7": 2.899323457479477,
"epoch": 0.534,
"grad_norm": 14.888961320169138,
"kl_loss_13": 2374.0,
"kl_loss_26": 1190.2,
"kl_loss_39": 785.5,
"kl_loss_7": 3107.6,
"learning_rate": 0.00045405169342398633,
"loss": 3809.2,
"step": 5340
},
{
"ce_loss_13": 2.6405540108680725,
"ce_loss_26": 2.080012783408165,
"ce_loss_39": 1.876559317111969,
"ce_loss_52": 1.462306337058544,
"ce_loss_7": 2.981167531013489,
"epoch": 0.535,
"grad_norm": 14.563184685424444,
"kl_loss_13": 2406.4,
"kl_loss_26": 1229.1,
"kl_loss_39": 816.6,
"kl_loss_7": 3130.4,
"learning_rate": 0.0004524719783479088,
"loss": 3797.75,
"step": 5350
},
{
"ce_loss_13": 2.645623618364334,
"ce_loss_26": 2.0881788969039916,
"ce_loss_39": 1.888492900133133,
"ce_loss_52": 1.4696861803531647,
"ce_loss_7": 2.994585871696472,
"epoch": 0.536,
"grad_norm": 15.457746807868878,
"kl_loss_13": 2427.2,
"kl_loss_26": 1243.7,
"kl_loss_39": 825.7,
"kl_loss_7": 3155.2,
"learning_rate": 0.00045089274187848144,
"loss": 3837.25,
"step": 5360
},
{
"ce_loss_13": 2.607480788230896,
"ce_loss_26": 2.0540437757968903,
"ce_loss_39": 1.854740971326828,
"ce_loss_52": 1.4391999766230583,
"ce_loss_7": 2.955876570940018,
"epoch": 0.537,
"grad_norm": 15.654126474702101,
"kl_loss_13": 2406.0,
"kl_loss_26": 1230.1,
"kl_loss_39": 814.4,
"kl_loss_7": 3128.8,
"learning_rate": 0.00044931399991859835,
"loss": 3791.2,
"step": 5370
},
{
"ce_loss_13": 2.5806866496801377,
"ce_loss_26": 2.0246922999620436,
"ce_loss_39": 1.8295573323965073,
"ce_loss_52": 1.433498626947403,
"ce_loss_7": 2.932585430145264,
"epoch": 0.538,
"grad_norm": 15.050745413645357,
"kl_loss_13": 2377.8,
"kl_loss_26": 1193.3,
"kl_loss_39": 790.4,
"kl_loss_7": 3116.4,
"learning_rate": 0.00044773576836617336,
"loss": 3771.2,
"step": 5380
},
{
"ce_loss_13": 2.5772230982780457,
"ce_loss_26": 2.0075444668531417,
"ce_loss_39": 1.8076122283935547,
"ce_loss_52": 1.3957727670669555,
"ce_loss_7": 2.9269763708114622,
"epoch": 0.539,
"grad_norm": 14.739351688834923,
"kl_loss_13": 2418.2,
"kl_loss_26": 1217.6,
"kl_loss_39": 806.9,
"kl_loss_7": 3154.0,
"learning_rate": 0.00044615806311398056,
"loss": 3764.85,
"step": 5390
},
{
"ce_loss_13": 2.6164508730173113,
"ce_loss_26": 2.0489900171756745,
"ce_loss_39": 1.8477211087942123,
"ce_loss_52": 1.4378462180495262,
"ce_loss_7": 2.9610529631376266,
"epoch": 0.54,
"grad_norm": 15.257899541784896,
"kl_loss_13": 2422.8,
"kl_loss_26": 1222.4,
"kl_loss_39": 806.9,
"kl_loss_7": 3149.2,
"learning_rate": 0.00044458090004949454,
"loss": 3789.8,
"step": 5400
},
{
"ce_loss_13": 2.639938807487488,
"ce_loss_26": 2.078041157126427,
"ce_loss_39": 1.8733548551797867,
"ce_loss_52": 1.4663413792848587,
"ce_loss_7": 2.978548914194107,
"epoch": 0.541,
"grad_norm": 15.643709958040844,
"kl_loss_13": 2399.4,
"kl_loss_26": 1217.8,
"kl_loss_39": 804.5,
"kl_loss_7": 3111.2,
"learning_rate": 0.0004430042950547297,
"loss": 3775.4,
"step": 5410
},
{
"ce_loss_13": 2.6770710349082947,
"ce_loss_26": 2.110589724779129,
"ce_loss_39": 1.9134115755558014,
"ce_loss_52": 1.5031546354293823,
"ce_loss_7": 3.0216497242450715,
"epoch": 0.542,
"grad_norm": 14.919746196312753,
"kl_loss_13": 2411.6,
"kl_loss_26": 1216.6,
"kl_loss_39": 809.1,
"kl_loss_7": 3147.6,
"learning_rate": 0.0004414282640060809,
"loss": 3768.95,
"step": 5420
},
{
"ce_loss_13": 2.6296884536743166,
"ce_loss_26": 2.0768262147903442,
"ce_loss_39": 1.8766031116247177,
"ce_loss_52": 1.4701504305005073,
"ce_loss_7": 2.9714462876319887,
"epoch": 0.543,
"grad_norm": 14.109291634946612,
"kl_loss_13": 2361.6,
"kl_loss_26": 1207.6,
"kl_loss_39": 794.5,
"kl_loss_7": 3073.6,
"learning_rate": 0.0004398528227741633,
"loss": 3734.5,
"step": 5430
},
{
"ce_loss_13": 2.5891071379184725,
"ce_loss_26": 2.024920642375946,
"ce_loss_39": 1.8224879026412963,
"ce_loss_52": 1.4225929498672485,
"ce_loss_7": 2.939086824655533,
"epoch": 0.544,
"grad_norm": 15.182283952769808,
"kl_loss_13": 2384.4,
"kl_loss_26": 1201.0,
"kl_loss_39": 785.9,
"kl_loss_7": 3118.0,
"learning_rate": 0.00043827798722365264,
"loss": 3724.5,
"step": 5440
},
{
"ce_loss_13": 2.521664083003998,
"ce_loss_26": 1.9635723412036896,
"ce_loss_39": 1.776253479719162,
"ce_loss_52": 1.3777382284402848,
"ce_loss_7": 2.8684565305709837,
"epoch": 0.545,
"grad_norm": 14.678349850224247,
"kl_loss_13": 2357.4,
"kl_loss_26": 1171.5,
"kl_loss_39": 776.8,
"kl_loss_7": 3081.2,
"learning_rate": 0.00043670377321312535,
"loss": 3748.6,
"step": 5450
},
{
"ce_loss_13": 2.6282208263874054,
"ce_loss_26": 2.063512918353081,
"ce_loss_39": 1.8621816724538802,
"ce_loss_52": 1.4465220913290977,
"ce_loss_7": 2.9661788761615755,
"epoch": 0.546,
"grad_norm": 14.95327030090302,
"kl_loss_13": 2410.4,
"kl_loss_26": 1226.2,
"kl_loss_39": 814.8,
"kl_loss_7": 3130.0,
"learning_rate": 0.0004351301965948991,
"loss": 3750.5,
"step": 5460
},
{
"ce_loss_13": 2.6448668360710146,
"ce_loss_26": 2.092715525627136,
"ce_loss_39": 1.8841570675373078,
"ce_loss_52": 1.465877577662468,
"ce_loss_7": 2.983266705274582,
"epoch": 0.547,
"grad_norm": 14.464098395004918,
"kl_loss_13": 2427.2,
"kl_loss_26": 1254.5,
"kl_loss_39": 827.1,
"kl_loss_7": 3138.8,
"learning_rate": 0.000433557273214873,
"loss": 3760.7,
"step": 5470
},
{
"ce_loss_13": 2.5597914129495623,
"ce_loss_26": 2.0002847105264663,
"ce_loss_39": 1.8126664906740189,
"ce_loss_52": 1.4252239495515824,
"ce_loss_7": 2.89780033826828,
"epoch": 0.548,
"grad_norm": 14.511201984799017,
"kl_loss_13": 2336.4,
"kl_loss_26": 1154.4,
"kl_loss_39": 760.7,
"kl_loss_7": 3046.4,
"learning_rate": 0.000431985018912368,
"loss": 3744.2,
"step": 5480
},
{
"ce_loss_13": 2.556107610464096,
"ce_loss_26": 2.0089694380760195,
"ce_loss_39": 1.8094982028007507,
"ce_loss_52": 1.4210678458213806,
"ce_loss_7": 2.8992061018943787,
"epoch": 0.549,
"grad_norm": 14.88322732328834,
"kl_loss_13": 2332.4,
"kl_loss_26": 1182.7,
"kl_loss_39": 771.7,
"kl_loss_7": 3056.4,
"learning_rate": 0.0004304134495199674,
"loss": 3725.95,
"step": 5490
},
{
"ce_loss_13": 2.5557271778583526,
"ce_loss_26": 2.015131750702858,
"ce_loss_39": 1.8132081747055053,
"ce_loss_52": 1.4253456503152848,
"ce_loss_7": 2.8894282221794128,
"epoch": 0.55,
"grad_norm": 14.509766958879414,
"kl_loss_13": 2342.4,
"kl_loss_26": 1192.8,
"kl_loss_39": 776.15,
"kl_loss_7": 3041.2,
"learning_rate": 0.0004288425808633575,
"loss": 3690.5,
"step": 5500
},
{
"ce_loss_13": 2.6498306572437285,
"ce_loss_26": 2.0864265114068985,
"ce_loss_39": 1.884456393122673,
"ce_loss_52": 1.4724615901708602,
"ce_loss_7": 2.994233113527298,
"epoch": 0.551,
"grad_norm": 15.223257504626806,
"kl_loss_13": 2419.8,
"kl_loss_26": 1227.5,
"kl_loss_39": 805.6,
"kl_loss_7": 3137.2,
"learning_rate": 0.0004272724287611684,
"loss": 3719.95,
"step": 5510
},
{
"ce_loss_13": 2.6063999772071837,
"ce_loss_26": 2.0453016996383666,
"ce_loss_39": 1.8452126443386079,
"ce_loss_52": 1.4395337477326393,
"ce_loss_7": 2.9420916736125946,
"epoch": 0.552,
"grad_norm": 14.707511636553823,
"kl_loss_13": 2389.2,
"kl_loss_26": 1204.1,
"kl_loss_39": 791.2,
"kl_loss_7": 3102.4,
"learning_rate": 0.00042570300902481425,
"loss": 3704.35,
"step": 5520
},
{
"ce_loss_13": 2.563195550441742,
"ce_loss_26": 1.9973567068576812,
"ce_loss_39": 1.7899492472410201,
"ce_loss_52": 1.3841874808073045,
"ce_loss_7": 2.9130555033683776,
"epoch": 0.553,
"grad_norm": 14.788339735855649,
"kl_loss_13": 2417.4,
"kl_loss_26": 1232.3,
"kl_loss_39": 809.2,
"kl_loss_7": 3150.4,
"learning_rate": 0.00042413433745833423,
"loss": 3716.65,
"step": 5530
},
{
"ce_loss_13": 2.5711400628089907,
"ce_loss_26": 2.0185573011636735,
"ce_loss_39": 1.8240427374839783,
"ce_loss_52": 1.4254193544387816,
"ce_loss_7": 2.9173736214637755,
"epoch": 0.554,
"grad_norm": 14.905176252765152,
"kl_loss_13": 2345.2,
"kl_loss_26": 1182.6,
"kl_loss_39": 781.0,
"kl_loss_7": 3069.2,
"learning_rate": 0.0004225664298582339,
"loss": 3692.3,
"step": 5540
},
{
"ce_loss_13": 2.5974901139736177,
"ce_loss_26": 2.046999195218086,
"ce_loss_39": 1.8451066941022873,
"ce_loss_52": 1.436999562382698,
"ce_loss_7": 2.9430564284324645,
"epoch": 0.555,
"grad_norm": 16.142335203190324,
"kl_loss_13": 2402.8,
"kl_loss_26": 1219.2,
"kl_loss_39": 797.9,
"kl_loss_7": 3125.6,
"learning_rate": 0.000420999302013325,
"loss": 3720.0,
"step": 5550
},
{
"ce_loss_13": 2.5756935298442842,
"ce_loss_26": 2.00499467253685,
"ce_loss_39": 1.8017524302005767,
"ce_loss_52": 1.4018217638134955,
"ce_loss_7": 2.9280818104743958,
"epoch": 0.556,
"grad_norm": 15.838882130746192,
"kl_loss_13": 2404.2,
"kl_loss_26": 1195.7,
"kl_loss_39": 787.5,
"kl_loss_7": 3148.8,
"learning_rate": 0.000419432969704568,
"loss": 3744.35,
"step": 5560
},
{
"ce_loss_13": 2.6485643923282622,
"ce_loss_26": 2.0993672519922257,
"ce_loss_39": 1.8959094911813736,
"ce_loss_52": 1.4813728883862496,
"ce_loss_7": 2.994894337654114,
"epoch": 0.557,
"grad_norm": 14.30299821768554,
"kl_loss_13": 2404.0,
"kl_loss_26": 1234.2,
"kl_loss_39": 810.35,
"kl_loss_7": 3130.8,
"learning_rate": 0.00041786744870491154,
"loss": 3698.4,
"step": 5570
},
{
"ce_loss_13": 2.6594822227954866,
"ce_loss_26": 2.0962711691856386,
"ce_loss_39": 1.8888987362384797,
"ce_loss_52": 1.477835801243782,
"ce_loss_7": 3.003460741043091,
"epoch": 0.558,
"grad_norm": 14.927142462067188,
"kl_loss_13": 2421.4,
"kl_loss_26": 1232.3,
"kl_loss_39": 804.5,
"kl_loss_7": 3142.8,
"learning_rate": 0.0004163027547791347,
"loss": 3696.4,
"step": 5580
},
{
"ce_loss_13": 2.6033630073070526,
"ce_loss_26": 2.052795875072479,
"ce_loss_39": 1.8524764776229858,
"ce_loss_52": 1.4589103490114212,
"ce_loss_7": 2.9362357556819916,
"epoch": 0.559,
"grad_norm": 15.035071624412899,
"kl_loss_13": 2332.2,
"kl_loss_26": 1173.9,
"kl_loss_39": 769.3,
"kl_loss_7": 3041.6,
"learning_rate": 0.0004147389036836881,
"loss": 3676.85,
"step": 5590
},
{
"ce_loss_13": 2.5536737203598023,
"ce_loss_26": 2.0111388891935347,
"ce_loss_39": 1.8144014358520508,
"ce_loss_52": 1.4328835308551788,
"ce_loss_7": 2.88922523856163,
"epoch": 0.56,
"grad_norm": 14.534788987143044,
"kl_loss_13": 2316.6,
"kl_loss_26": 1164.9,
"kl_loss_39": 758.2,
"kl_loss_7": 3024.4,
"learning_rate": 0.00041317591116653486,
"loss": 3694.55,
"step": 5600
},
{
"ce_loss_13": 2.606983852386475,
"ce_loss_26": 2.032891970872879,
"ce_loss_39": 1.8340051174163818,
"ce_loss_52": 1.4310238301753997,
"ce_loss_7": 2.9573661506175997,
"epoch": 0.561,
"grad_norm": 16.216124841765133,
"kl_loss_13": 2427.2,
"kl_loss_26": 1205.0,
"kl_loss_39": 794.3,
"kl_loss_7": 3156.0,
"learning_rate": 0.0004116137929669921,
"loss": 3679.35,
"step": 5610
},
{
"ce_loss_13": 2.5370231360197066,
"ce_loss_26": 1.9963056713342666,
"ce_loss_39": 1.803014099597931,
"ce_loss_52": 1.4229481190443038,
"ce_loss_7": 2.8732429146766663,
"epoch": 0.562,
"grad_norm": 15.330886130898222,
"kl_loss_13": 2277.4,
"kl_loss_26": 1124.3,
"kl_loss_39": 729.3,
"kl_loss_7": 2988.8,
"learning_rate": 0.00041005256481557305,
"loss": 3673.5,
"step": 5620
},
{
"ce_loss_13": 2.6017677545547486,
"ce_loss_26": 2.0595314621925356,
"ce_loss_39": 1.8631951808929443,
"ce_loss_52": 1.4627325683832169,
"ce_loss_7": 2.9314393043518066,
"epoch": 0.563,
"grad_norm": 14.465862339922571,
"kl_loss_13": 2332.2,
"kl_loss_26": 1190.7,
"kl_loss_39": 781.3,
"kl_loss_7": 3032.0,
"learning_rate": 0.00040849224243382767,
"loss": 3672.9,
"step": 5630
},
{
"ce_loss_13": 2.5594456523656843,
"ce_loss_26": 2.00023832321167,
"ce_loss_39": 1.8007198423147202,
"ce_loss_52": 1.4131150737404823,
"ce_loss_7": 2.904243141412735,
"epoch": 0.564,
"grad_norm": 15.017201171045896,
"kl_loss_13": 2360.2,
"kl_loss_26": 1180.4,
"kl_loss_39": 768.1,
"kl_loss_7": 3075.6,
"learning_rate": 0.000406932841534185,
"loss": 3693.75,
"step": 5640
},
{
"ce_loss_13": 2.595109748840332,
"ce_loss_26": 2.045265626907349,
"ce_loss_39": 1.85684574842453,
"ce_loss_52": 1.4735014230012893,
"ce_loss_7": 2.943417179584503,
"epoch": 0.565,
"grad_norm": 15.364099861784982,
"kl_loss_13": 2322.0,
"kl_loss_26": 1155.7,
"kl_loss_39": 755.4,
"kl_loss_7": 3048.4,
"learning_rate": 0.0004053743778197951,
"loss": 3668.9,
"step": 5650
},
{
"ce_loss_13": 2.582511156797409,
"ce_loss_26": 2.0224455118179323,
"ce_loss_39": 1.8295785069465638,
"ce_loss_52": 1.4373956888914108,
"ce_loss_7": 2.9102243304252626,
"epoch": 0.566,
"grad_norm": 14.693957764502153,
"kl_loss_13": 2342.4,
"kl_loss_26": 1176.8,
"kl_loss_39": 776.5,
"kl_loss_7": 3046.4,
"learning_rate": 0.0004038168669843697,
"loss": 3650.65,
"step": 5660
},
{
"ce_loss_13": 2.603584831953049,
"ce_loss_26": 2.04022336602211,
"ce_loss_39": 1.8417048037052155,
"ce_loss_52": 1.447445745766163,
"ce_loss_7": 2.9460166096687317,
"epoch": 0.567,
"grad_norm": 15.203721551974317,
"kl_loss_13": 2379.4,
"kl_loss_26": 1187.3,
"kl_loss_39": 777.4,
"kl_loss_7": 3104.8,
"learning_rate": 0.000402260324712026,
"loss": 3688.75,
"step": 5670
},
{
"ce_loss_13": 2.526816266775131,
"ce_loss_26": 1.9893273174762727,
"ce_loss_39": 1.793796670436859,
"ce_loss_52": 1.4289553046226502,
"ce_loss_7": 2.8634801030159,
"epoch": 0.568,
"grad_norm": 14.842310660649245,
"kl_loss_13": 2254.2,
"kl_loss_26": 1116.4,
"kl_loss_39": 714.95,
"kl_loss_7": 2954.8,
"learning_rate": 0.00040070476667712743,
"loss": 3637.75,
"step": 5680
},
{
"ce_loss_13": 2.615302687883377,
"ce_loss_26": 2.059708908200264,
"ce_loss_39": 1.8555728137493133,
"ce_loss_52": 1.4557225406169891,
"ce_loss_7": 2.9577668845653533,
"epoch": 0.569,
"grad_norm": 14.742701130409761,
"kl_loss_13": 2387.6,
"kl_loss_26": 1214.9,
"kl_loss_39": 792.5,
"kl_loss_7": 3105.2,
"learning_rate": 0.0003991502085441259,
"loss": 3676.05,
"step": 5690
},
{
"ce_loss_13": 2.5645705699920653,
"ce_loss_26": 2.007223817706108,
"ce_loss_39": 1.8173159271478654,
"ce_loss_52": 1.4376816004514694,
"ce_loss_7": 2.895137590169907,
"epoch": 0.57,
"grad_norm": 15.460594692772787,
"kl_loss_13": 2314.8,
"kl_loss_26": 1152.0,
"kl_loss_39": 753.2,
"kl_loss_7": 3024.8,
"learning_rate": 0.0003975966659674047,
"loss": 3621.95,
"step": 5700
},
{
"ce_loss_13": 2.559953585267067,
"ce_loss_26": 2.032286322116852,
"ce_loss_39": 1.845168125629425,
"ce_loss_52": 1.461988940834999,
"ce_loss_7": 2.9015457332134247,
"epoch": 0.571,
"grad_norm": 15.171579029800053,
"kl_loss_13": 2278.8,
"kl_loss_26": 1159.2,
"kl_loss_39": 758.6,
"kl_loss_7": 2982.0,
"learning_rate": 0.0003960441545911204,
"loss": 3675.95,
"step": 5710
},
{
"ce_loss_13": 2.6008632302284242,
"ce_loss_26": 2.043924775719643,
"ce_loss_39": 1.8485127180814742,
"ce_loss_52": 1.4697089165449142,
"ce_loss_7": 2.9355547785758973,
"epoch": 0.572,
"grad_norm": 14.834558653485171,
"kl_loss_13": 2319.4,
"kl_loss_26": 1152.8,
"kl_loss_39": 750.0,
"kl_loss_7": 3027.6,
"learning_rate": 0.0003944926900490452,
"loss": 3638.65,
"step": 5720
},
{
"ce_loss_13": 2.532833296060562,
"ce_loss_26": 1.9711334377527236,
"ce_loss_39": 1.77955681681633,
"ce_loss_52": 1.4007374957203864,
"ce_loss_7": 2.8781362950801848,
"epoch": 0.573,
"grad_norm": 16.10932164493431,
"kl_loss_13": 2337.8,
"kl_loss_26": 1151.9,
"kl_loss_39": 757.3,
"kl_loss_7": 3064.8,
"learning_rate": 0.0003929422879644099,
"loss": 3650.2,
"step": 5730
},
{
"ce_loss_13": 2.5908755481243135,
"ce_loss_26": 2.0414842426776887,
"ce_loss_39": 1.8504431873559952,
"ce_loss_52": 1.4606543123722076,
"ce_loss_7": 2.926515054702759,
"epoch": 0.574,
"grad_norm": 14.72871950232802,
"kl_loss_13": 2333.4,
"kl_loss_26": 1164.1,
"kl_loss_39": 763.5,
"kl_loss_7": 3044.4,
"learning_rate": 0.0003913929639497462,
"loss": 3615.45,
"step": 5740
},
{
"ce_loss_13": 2.591219651699066,
"ce_loss_26": 2.044692638516426,
"ce_loss_39": 1.8468156188726426,
"ce_loss_52": 1.452781331539154,
"ce_loss_7": 2.932692265510559,
"epoch": 0.575,
"grad_norm": 14.536189899304834,
"kl_loss_13": 2345.4,
"kl_loss_26": 1189.2,
"kl_loss_39": 775.0,
"kl_loss_7": 3054.8,
"learning_rate": 0.00038984473360672965,
"loss": 3631.3,
"step": 5750
},
{
"ce_loss_13": 2.555169379711151,
"ce_loss_26": 2.0133784860372543,
"ce_loss_39": 1.8234185576438904,
"ce_loss_52": 1.4436144948005676,
"ce_loss_7": 2.896002060174942,
"epoch": 0.576,
"grad_norm": 15.34263044515968,
"kl_loss_13": 2285.0,
"kl_loss_26": 1136.7,
"kl_loss_39": 742.35,
"kl_loss_7": 2993.2,
"learning_rate": 0.0003882976125260229,
"loss": 3658.4,
"step": 5760
},
{
"ce_loss_13": 2.502971774339676,
"ce_loss_26": 1.9493688374757767,
"ce_loss_39": 1.7570990473031998,
"ce_loss_52": 1.3885455280542374,
"ce_loss_7": 2.8447738111019136,
"epoch": 0.577,
"grad_norm": 14.660193254554198,
"kl_loss_13": 2295.0,
"kl_loss_26": 1140.5,
"kl_loss_39": 734.0,
"kl_loss_7": 3013.6,
"learning_rate": 0.00038675161628711776,
"loss": 3632.8,
"step": 5770
},
{
"ce_loss_13": 2.5624574303627012,
"ce_loss_26": 2.0051666617393495,
"ce_loss_39": 1.8087779253721237,
"ce_loss_52": 1.41810100376606,
"ce_loss_7": 2.895737165212631,
"epoch": 0.578,
"grad_norm": 14.482088330386649,
"kl_loss_13": 2343.4,
"kl_loss_26": 1176.1,
"kl_loss_39": 766.9,
"kl_loss_7": 3047.6,
"learning_rate": 0.0003852067604581794,
"loss": 3602.85,
"step": 5780
},
{
"ce_loss_13": 2.5270320236682893,
"ce_loss_26": 1.9888224333524704,
"ce_loss_39": 1.803659090399742,
"ce_loss_52": 1.428774857521057,
"ce_loss_7": 2.8705179512500765,
"epoch": 0.579,
"grad_norm": 14.979884111866252,
"kl_loss_13": 2273.2,
"kl_loss_26": 1136.8,
"kl_loss_39": 739.9,
"kl_loss_7": 2982.8,
"learning_rate": 0.0003836630605958888,
"loss": 3603.35,
"step": 5790
},
{
"ce_loss_13": 2.5794149696826936,
"ce_loss_26": 2.0252732813358305,
"ce_loss_39": 1.8308797210454941,
"ce_loss_52": 1.4384155124425888,
"ce_loss_7": 2.9223524034023285,
"epoch": 0.58,
"grad_norm": 14.8686548037009,
"kl_loss_13": 2322.0,
"kl_loss_26": 1158.7,
"kl_loss_39": 758.3,
"kl_loss_7": 3044.0,
"learning_rate": 0.0003821205322452863,
"loss": 3636.15,
"step": 5800
},
{
"ce_loss_13": 2.6002571165561674,
"ce_loss_26": 2.0497290968894957,
"ce_loss_39": 1.8579988300800323,
"ce_loss_52": 1.4732319116592407,
"ce_loss_7": 2.9367463052272798,
"epoch": 0.581,
"grad_norm": 15.52768956484863,
"kl_loss_13": 2303.2,
"kl_loss_26": 1143.4,
"kl_loss_39": 743.0,
"kl_loss_7": 3018.4,
"learning_rate": 0.0003805791909396155,
"loss": 3651.1,
"step": 5810
},
{
"ce_loss_13": 2.5309071093797684,
"ce_loss_26": 1.9803194522857666,
"ce_loss_39": 1.7861102789640426,
"ce_loss_52": 1.410745631158352,
"ce_loss_7": 2.8677128195762633,
"epoch": 0.582,
"grad_norm": 14.66111038706468,
"kl_loss_13": 2302.0,
"kl_loss_26": 1148.3,
"kl_loss_39": 748.9,
"kl_loss_7": 3008.0,
"learning_rate": 0.0003790390522001662,
"loss": 3564.65,
"step": 5820
},
{
"ce_loss_13": 2.5017096638679504,
"ce_loss_26": 1.94625324010849,
"ce_loss_39": 1.749154046177864,
"ce_loss_52": 1.3770527362823486,
"ce_loss_7": 2.8349112212657928,
"epoch": 0.583,
"grad_norm": 14.629798672555188,
"kl_loss_13": 2294.6,
"kl_loss_26": 1141.8,
"kl_loss_39": 731.9,
"kl_loss_7": 3003.2,
"learning_rate": 0.0003775001315361183,
"loss": 3613.35,
"step": 5830
},
{
"ce_loss_13": 2.5965979039669036,
"ce_loss_26": 2.0513822197914124,
"ce_loss_39": 1.8568279683589934,
"ce_loss_52": 1.486306893825531,
"ce_loss_7": 2.932318705320358,
"epoch": 0.584,
"grad_norm": 15.483089802029363,
"kl_loss_13": 2285.6,
"kl_loss_26": 1133.6,
"kl_loss_39": 729.5,
"kl_loss_7": 2989.2,
"learning_rate": 0.0003759624444443858,
"loss": 3579.55,
"step": 5840
},
{
"ce_loss_13": 2.584410917758942,
"ce_loss_26": 2.022917777299881,
"ce_loss_39": 1.8257231026887895,
"ce_loss_52": 1.437566375732422,
"ce_loss_7": 2.9283429443836213,
"epoch": 0.585,
"grad_norm": 15.042900713620126,
"kl_loss_13": 2353.8,
"kl_loss_26": 1173.8,
"kl_loss_39": 763.0,
"kl_loss_7": 3075.6,
"learning_rate": 0.00037442600640946044,
"loss": 3619.6,
"step": 5850
},
{
"ce_loss_13": 2.5062096178531648,
"ce_loss_26": 1.9529441505670548,
"ce_loss_39": 1.756454050540924,
"ce_loss_52": 1.3877734661102294,
"ce_loss_7": 2.852471035718918,
"epoch": 0.586,
"grad_norm": 15.637281763013858,
"kl_loss_13": 2307.4,
"kl_loss_26": 1145.7,
"kl_loss_39": 736.7,
"kl_loss_7": 3028.0,
"learning_rate": 0.00037289083290325663,
"loss": 3605.0,
"step": 5860
},
{
"ce_loss_13": 2.550817745923996,
"ce_loss_26": 2.0112617135047914,
"ce_loss_39": 1.8207211345434189,
"ce_loss_52": 1.4542410910129546,
"ce_loss_7": 2.8887066781520843,
"epoch": 0.587,
"grad_norm": 14.48910345968502,
"kl_loss_13": 2241.0,
"kl_loss_26": 1095.1,
"kl_loss_39": 709.9,
"kl_loss_7": 2950.4,
"learning_rate": 0.0003713569393849543,
"loss": 3628.55,
"step": 5870
},
{
"ce_loss_13": 2.5198557287454606,
"ce_loss_26": 1.9734882295131684,
"ce_loss_39": 1.7831378549337387,
"ce_loss_52": 1.4195168539881706,
"ce_loss_7": 2.8633585631847382,
"epoch": 0.588,
"grad_norm": 14.606482629979089,
"kl_loss_13": 2270.0,
"kl_loss_26": 1121.9,
"kl_loss_39": 724.8,
"kl_loss_7": 2990.8,
"learning_rate": 0.00036982434130084397,
"loss": 3605.15,
"step": 5880
},
{
"ce_loss_13": 2.5002260982990263,
"ce_loss_26": 1.9510222643613815,
"ce_loss_39": 1.760866141319275,
"ce_loss_52": 1.3993624940514564,
"ce_loss_7": 2.8445383846759795,
"epoch": 0.589,
"grad_norm": 15.157934768830916,
"kl_loss_13": 2246.4,
"kl_loss_26": 1094.8,
"kl_loss_39": 699.8,
"kl_loss_7": 2967.6,
"learning_rate": 0.00036829305408417166,
"loss": 3580.45,
"step": 5890
},
{
"ce_loss_13": 2.4792177438735963,
"ce_loss_26": 1.9329254776239395,
"ce_loss_39": 1.7395812034606934,
"ce_loss_52": 1.3731168687343598,
"ce_loss_7": 2.8229923218488695,
"epoch": 0.59,
"grad_norm": 14.656512940113272,
"kl_loss_13": 2271.0,
"kl_loss_26": 1112.9,
"kl_loss_39": 720.45,
"kl_loss_7": 2995.2,
"learning_rate": 0.0003667630931549826,
"loss": 3601.65,
"step": 5900
},
{
"ce_loss_13": 2.6107949793338774,
"ce_loss_26": 2.0506039649248122,
"ce_loss_39": 1.8547100484371186,
"ce_loss_52": 1.4670722007751464,
"ce_loss_7": 2.947498029470444,
"epoch": 0.591,
"grad_norm": 15.535800254061792,
"kl_loss_13": 2341.4,
"kl_loss_26": 1168.3,
"kl_loss_39": 753.2,
"kl_loss_7": 3065.2,
"learning_rate": 0.00036523447391996613,
"loss": 3580.55,
"step": 5910
},
{
"ce_loss_13": 2.5217359244823454,
"ce_loss_26": 1.988610166311264,
"ce_loss_39": 1.8043963432312011,
"ce_loss_52": 1.4320271372795106,
"ce_loss_7": 2.8523899018764496,
"epoch": 0.592,
"grad_norm": 15.144566757103092,
"kl_loss_13": 2242.6,
"kl_loss_26": 1108.7,
"kl_loss_39": 722.4,
"kl_loss_7": 2935.6,
"learning_rate": 0.00036370721177230114,
"loss": 3609.65,
"step": 5920
},
{
"ce_loss_13": 2.5657771229743958,
"ce_loss_26": 2.024441570043564,
"ce_loss_39": 1.8324565082788467,
"ce_loss_52": 1.4527549773454667,
"ce_loss_7": 2.9029002487659454,
"epoch": 0.593,
"grad_norm": 14.305280391890491,
"kl_loss_13": 2257.4,
"kl_loss_26": 1128.5,
"kl_loss_39": 727.25,
"kl_loss_7": 2971.6,
"learning_rate": 0.00036218132209150044,
"loss": 3561.25,
"step": 5930
},
{
"ce_loss_13": 2.5320515751838686,
"ce_loss_26": 1.993978601694107,
"ce_loss_39": 1.7951736986637115,
"ce_loss_52": 1.4332455009222032,
"ce_loss_7": 2.8734976410865785,
"epoch": 0.594,
"grad_norm": 14.731628028248535,
"kl_loss_13": 2276.0,
"kl_loss_26": 1134.7,
"kl_loss_39": 725.65,
"kl_loss_7": 2984.8,
"learning_rate": 0.0003606568202432562,
"loss": 3568.15,
"step": 5940
},
{
"ce_loss_13": 2.467684972286224,
"ce_loss_26": 1.9258863091468812,
"ce_loss_39": 1.7374547556042672,
"ce_loss_52": 1.3822436913847924,
"ce_loss_7": 2.8027748644351957,
"epoch": 0.595,
"grad_norm": 13.952374257617315,
"kl_loss_13": 2221.6,
"kl_loss_26": 1099.3,
"kl_loss_39": 702.85,
"kl_loss_7": 2916.8,
"learning_rate": 0.0003591337215792851,
"loss": 3573.15,
"step": 5950
},
{
"ce_loss_13": 2.5612458407878878,
"ce_loss_26": 2.005467265844345,
"ce_loss_39": 1.8046582967042923,
"ce_loss_52": 1.4227981299161911,
"ce_loss_7": 2.9120794773101806,
"epoch": 0.596,
"grad_norm": 14.684621517642583,
"kl_loss_13": 2333.2,
"kl_loss_26": 1167.6,
"kl_loss_39": 756.5,
"kl_loss_7": 3067.6,
"learning_rate": 0.00035761204143717383,
"loss": 3598.3,
"step": 5960
},
{
"ce_loss_13": 2.539260357618332,
"ce_loss_26": 1.997820395231247,
"ce_loss_39": 1.7995415717363357,
"ce_loss_52": 1.4228885769844055,
"ce_loss_7": 2.8742454588413238,
"epoch": 0.597,
"grad_norm": 14.834521968922566,
"kl_loss_13": 2285.6,
"kl_loss_26": 1143.0,
"kl_loss_39": 733.0,
"kl_loss_7": 2992.4,
"learning_rate": 0.0003560917951402245,
"loss": 3549.75,
"step": 5970
},
{
"ce_loss_13": 2.514337483048439,
"ce_loss_26": 1.9719054281711579,
"ce_loss_39": 1.7803026676177978,
"ce_loss_52": 1.4242254197597504,
"ce_loss_7": 2.8526304841041563,
"epoch": 0.598,
"grad_norm": 15.184376306356116,
"kl_loss_13": 2262.4,
"kl_loss_26": 1100.6,
"kl_loss_39": 700.55,
"kl_loss_7": 2977.6,
"learning_rate": 0.00035457299799730046,
"loss": 3595.65,
"step": 5980
},
{
"ce_loss_13": 2.5518115133047106,
"ce_loss_26": 2.011714455485344,
"ce_loss_39": 1.8179199546575546,
"ce_loss_52": 1.4457479059696197,
"ce_loss_7": 2.903665816783905,
"epoch": 0.599,
"grad_norm": 17.70688578121031,
"kl_loss_13": 2293.4,
"kl_loss_26": 1146.1,
"kl_loss_39": 736.9,
"kl_loss_7": 3022.8,
"learning_rate": 0.0003530556653026721,
"loss": 3553.45,
"step": 5990
},
{
"ce_loss_13": 2.5623465538024903,
"ce_loss_26": 2.0235190600156785,
"ce_loss_39": 1.8284282714128495,
"ce_loss_52": 1.4427544534206391,
"ce_loss_7": 2.8994126319885254,
"epoch": 0.6,
"grad_norm": 14.620466600171055,
"kl_loss_13": 2310.4,
"kl_loss_26": 1152.8,
"kl_loss_39": 754.8,
"kl_loss_7": 3022.0,
"learning_rate": 0.00035153981233586274,
"loss": 3592.9,
"step": 6000
},
{
"ce_loss_13": 2.592492914199829,
"ce_loss_26": 2.0241310060024262,
"ce_loss_39": 1.8282486945390701,
"ce_loss_52": 1.4501359939575196,
"ce_loss_7": 2.934645599126816,
"epoch": 0.601,
"grad_norm": 15.179282109125554,
"kl_loss_13": 2355.8,
"kl_loss_26": 1159.4,
"kl_loss_39": 745.9,
"kl_loss_7": 3082.4,
"learning_rate": 0.00035002545436149473,
"loss": 3551.8,
"step": 6010
},
{
"ce_loss_13": 2.4968257695436478,
"ce_loss_26": 1.950178360939026,
"ce_loss_39": 1.7565111339092254,
"ce_loss_52": 1.3940304026007653,
"ce_loss_7": 2.8332657337188722,
"epoch": 0.602,
"grad_norm": 15.101566569350041,
"kl_loss_13": 2271.8,
"kl_loss_26": 1114.8,
"kl_loss_39": 717.3,
"kl_loss_7": 2986.8,
"learning_rate": 0.0003485126066291364,
"loss": 3553.3,
"step": 6020
},
{
"ce_loss_13": 2.5295323967933654,
"ce_loss_26": 1.9963963776826859,
"ce_loss_39": 1.8044916093349457,
"ce_loss_52": 1.446289749443531,
"ce_loss_7": 2.8674661338329317,
"epoch": 0.603,
"grad_norm": 14.29880611806114,
"kl_loss_13": 2227.0,
"kl_loss_26": 1097.1,
"kl_loss_39": 702.25,
"kl_loss_7": 2938.0,
"learning_rate": 0.0003470012843731476,
"loss": 3534.85,
"step": 6030
},
{
"ce_loss_13": 2.498071011900902,
"ce_loss_26": 1.9493587136268615,
"ce_loss_39": 1.76174655854702,
"ce_loss_52": 1.407905325293541,
"ce_loss_7": 2.847205549478531,
"epoch": 0.604,
"grad_norm": 14.483859928846364,
"kl_loss_13": 2244.8,
"kl_loss_26": 1096.9,
"kl_loss_39": 701.8,
"kl_loss_7": 2975.2,
"learning_rate": 0.00034549150281252633,
"loss": 3514.35,
"step": 6040
},
{
"ce_loss_13": 2.4966412246227265,
"ce_loss_26": 1.9553426146507262,
"ce_loss_39": 1.7613533914089203,
"ce_loss_52": 1.4072722673416138,
"ce_loss_7": 2.8383829057216645,
"epoch": 0.605,
"grad_norm": 14.78851376475336,
"kl_loss_13": 2235.6,
"kl_loss_26": 1090.3,
"kl_loss_39": 695.0,
"kl_loss_7": 2954.0,
"learning_rate": 0.0003439832771507565,
"loss": 3563.65,
"step": 6050
},
{
"ce_loss_13": 2.502397668361664,
"ce_loss_26": 1.9571218103170396,
"ce_loss_39": 1.7665416598320007,
"ce_loss_52": 1.412816160917282,
"ce_loss_7": 2.8454441905021666,
"epoch": 0.606,
"grad_norm": 15.4871318885597,
"kl_loss_13": 2242.0,
"kl_loss_26": 1096.6,
"kl_loss_39": 696.5,
"kl_loss_7": 2962.4,
"learning_rate": 0.0003424766225756537,
"loss": 3510.25,
"step": 6060
},
{
"ce_loss_13": 2.5223917841911314,
"ce_loss_26": 1.9723280429840089,
"ce_loss_39": 1.7729612857103347,
"ce_loss_52": 1.4048843801021575,
"ce_loss_7": 2.8645106673240663,
"epoch": 0.607,
"grad_norm": 15.573595841239559,
"kl_loss_13": 2300.6,
"kl_loss_26": 1138.2,
"kl_loss_39": 727.85,
"kl_loss_7": 3019.6,
"learning_rate": 0.00034097155425921255,
"loss": 3527.0,
"step": 6070
},
{
"ce_loss_13": 2.491760790348053,
"ce_loss_26": 1.9636650770902633,
"ce_loss_39": 1.7728582590818405,
"ce_loss_52": 1.4259491577744483,
"ce_loss_7": 2.821820729970932,
"epoch": 0.608,
"grad_norm": 14.653123046442355,
"kl_loss_13": 2168.6,
"kl_loss_26": 1063.1,
"kl_loss_39": 679.3,
"kl_loss_7": 2864.4,
"learning_rate": 0.0003394680873574546,
"loss": 3528.9,
"step": 6080
},
{
"ce_loss_13": 2.5085246324539185,
"ce_loss_26": 1.9797437161207199,
"ce_loss_39": 1.7878784984350204,
"ce_loss_52": 1.4316603004932404,
"ce_loss_7": 2.847382205724716,
"epoch": 0.609,
"grad_norm": 14.913942335206821,
"kl_loss_13": 2211.2,
"kl_loss_26": 1091.8,
"kl_loss_39": 696.95,
"kl_loss_7": 2922.0,
"learning_rate": 0.0003379662370102747,
"loss": 3549.95,
"step": 6090
},
{
"ce_loss_13": 2.4759395986795427,
"ce_loss_26": 1.949927881360054,
"ce_loss_39": 1.7604204803705215,
"ce_loss_52": 1.41001408547163,
"ce_loss_7": 2.8064420104026793,
"epoch": 0.61,
"grad_norm": 14.676610718900648,
"kl_loss_13": 2185.6,
"kl_loss_26": 1077.2,
"kl_loss_39": 688.0,
"kl_loss_7": 2875.6,
"learning_rate": 0.0003364660183412892,
"loss": 3507.8,
"step": 6100
},
{
"ce_loss_13": 2.531206899881363,
"ce_loss_26": 1.9836675137281419,
"ce_loss_39": 1.7869503110647202,
"ce_loss_52": 1.432834729552269,
"ce_loss_7": 2.873502719402313,
"epoch": 0.611,
"grad_norm": 14.829577774305747,
"kl_loss_13": 2255.0,
"kl_loss_26": 1106.2,
"kl_loss_39": 703.75,
"kl_loss_7": 2966.4,
"learning_rate": 0.0003349674464576834,
"loss": 3495.25,
"step": 6110
},
{
"ce_loss_13": 2.514421796798706,
"ce_loss_26": 1.9697007417678833,
"ce_loss_39": 1.780946272611618,
"ce_loss_52": 1.4202388614416122,
"ce_loss_7": 2.8420049071311952,
"epoch": 0.612,
"grad_norm": 14.8111788086498,
"kl_loss_13": 2230.2,
"kl_loss_26": 1101.1,
"kl_loss_39": 705.2,
"kl_loss_7": 2920.0,
"learning_rate": 0.00033347053645005966,
"loss": 3492.65,
"step": 6120
},
{
"ce_loss_13": 2.537186449766159,
"ce_loss_26": 2.004375171661377,
"ce_loss_39": 1.8157330989837646,
"ce_loss_52": 1.451986312866211,
"ce_loss_7": 2.878436690568924,
"epoch": 0.613,
"grad_norm": 15.401404522918961,
"kl_loss_13": 2242.0,
"kl_loss_26": 1113.3,
"kl_loss_39": 718.55,
"kl_loss_7": 2945.6,
"learning_rate": 0.00033197530339228485,
"loss": 3459.1,
"step": 6130
},
{
"ce_loss_13": 2.506669583916664,
"ce_loss_26": 1.9629664570093155,
"ce_loss_39": 1.78002208173275,
"ce_loss_52": 1.4117600202560425,
"ce_loss_7": 2.847608286142349,
"epoch": 0.614,
"grad_norm": 15.390199704579972,
"kl_loss_13": 2259.8,
"kl_loss_26": 1112.7,
"kl_loss_39": 727.8,
"kl_loss_7": 2986.8,
"learning_rate": 0.00033048176234133967,
"loss": 3537.45,
"step": 6140
},
{
"ce_loss_13": 2.5588534235954286,
"ce_loss_26": 2.024568209052086,
"ce_loss_39": 1.8284731358289719,
"ce_loss_52": 1.4587910890579223,
"ce_loss_7": 2.8921659886837006,
"epoch": 0.615,
"grad_norm": 14.629003698363993,
"kl_loss_13": 2259.8,
"kl_loss_26": 1126.9,
"kl_loss_39": 724.0,
"kl_loss_7": 2958.8,
"learning_rate": 0.0003289899283371657,
"loss": 3536.8,
"step": 6150
},
{
"ce_loss_13": 2.4743064284324645,
"ce_loss_26": 1.9476153373718261,
"ce_loss_39": 1.7604322880506516,
"ce_loss_52": 1.4059417188167571,
"ce_loss_7": 2.7994522780179976,
"epoch": 0.616,
"grad_norm": 15.287517673195092,
"kl_loss_13": 2207.6,
"kl_loss_26": 1094.2,
"kl_loss_39": 708.05,
"kl_loss_7": 2892.4,
"learning_rate": 0.0003274998164025148,
"loss": 3522.0,
"step": 6160
},
{
"ce_loss_13": 2.61488196849823,
"ce_loss_26": 2.067342773079872,
"ce_loss_39": 1.8675355583429336,
"ce_loss_52": 1.4927641093730926,
"ce_loss_7": 2.948505789041519,
"epoch": 0.617,
"grad_norm": 14.482727532648099,
"kl_loss_13": 2288.0,
"kl_loss_26": 1139.7,
"kl_loss_39": 729.35,
"kl_loss_7": 2993.6,
"learning_rate": 0.0003260114415427975,
"loss": 3494.95,
"step": 6170
},
{
"ce_loss_13": 2.5525584638118746,
"ce_loss_26": 1.9941669285297394,
"ce_loss_39": 1.7988057792186738,
"ce_loss_52": 1.4276691943407058,
"ce_loss_7": 2.888332962989807,
"epoch": 0.618,
"grad_norm": 15.194650585699007,
"kl_loss_13": 2319.2,
"kl_loss_26": 1148.8,
"kl_loss_39": 742.3,
"kl_loss_7": 3022.8,
"learning_rate": 0.0003245248187459323,
"loss": 3535.8,
"step": 6180
},
{
"ce_loss_13": 2.5258300840854644,
"ce_loss_26": 1.9903143167495727,
"ce_loss_39": 1.8010086834430694,
"ce_loss_52": 1.4478329718112946,
"ce_loss_7": 2.86256263256073,
"epoch": 0.619,
"grad_norm": 14.652051426511987,
"kl_loss_13": 2228.6,
"kl_loss_26": 1088.0,
"kl_loss_39": 694.0,
"kl_loss_7": 2940.4,
"learning_rate": 0.00032303996298219416,
"loss": 3513.7,
"step": 6190
},
{
"ce_loss_13": 2.5733933985233306,
"ce_loss_26": 2.0336378514766693,
"ce_loss_39": 1.8403378069400786,
"ce_loss_52": 1.4681322902441025,
"ce_loss_7": 2.920198345184326,
"epoch": 0.62,
"grad_norm": 15.056132330426859,
"kl_loss_13": 2280.4,
"kl_loss_26": 1131.3,
"kl_loss_39": 733.75,
"kl_loss_7": 3002.8,
"learning_rate": 0.00032155688920406414,
"loss": 3507.7,
"step": 6200
},
{
"ce_loss_13": 2.5029452949762345,
"ce_loss_26": 1.9643093675374985,
"ce_loss_39": 1.7774474427103997,
"ce_loss_52": 1.4176854699850083,
"ce_loss_7": 2.823590323328972,
"epoch": 0.621,
"grad_norm": 14.49052860339056,
"kl_loss_13": 2220.0,
"kl_loss_26": 1086.1,
"kl_loss_39": 699.45,
"kl_loss_7": 2902.8,
"learning_rate": 0.0003200756123460788,
"loss": 3535.45,
"step": 6210
},
{
"ce_loss_13": 2.489814931154251,
"ce_loss_26": 1.9501473933458329,
"ce_loss_39": 1.7617685228586197,
"ce_loss_52": 1.41129230260849,
"ce_loss_7": 2.816985684633255,
"epoch": 0.622,
"grad_norm": 14.505478323074753,
"kl_loss_13": 2218.6,
"kl_loss_26": 1080.9,
"kl_loss_39": 689.55,
"kl_loss_7": 2918.0,
"learning_rate": 0.00031859614732467957,
"loss": 3488.95,
"step": 6220
},
{
"ce_loss_13": 2.5316161155700683,
"ce_loss_26": 2.0058958530426025,
"ce_loss_39": 1.8137048929929733,
"ce_loss_52": 1.461652959883213,
"ce_loss_7": 2.8647646605968475,
"epoch": 0.623,
"grad_norm": 13.917606472624449,
"kl_loss_13": 2204.8,
"kl_loss_26": 1085.2,
"kl_loss_39": 685.1,
"kl_loss_7": 2902.4,
"learning_rate": 0.00031711850903806275,
"loss": 3465.2,
"step": 6230
},
{
"ce_loss_13": 2.500165891647339,
"ce_loss_26": 1.9614384204149247,
"ce_loss_39": 1.771432462334633,
"ce_loss_52": 1.4076189696788788,
"ce_loss_7": 2.832933169603348,
"epoch": 0.624,
"grad_norm": 14.223738912104434,
"kl_loss_13": 2258.6,
"kl_loss_26": 1125.7,
"kl_loss_39": 725.1,
"kl_loss_7": 2956.8,
"learning_rate": 0.0003156427123660297,
"loss": 3486.3,
"step": 6240
},
{
"ce_loss_13": 2.5448601841926575,
"ce_loss_26": 2.000015211105347,
"ce_loss_39": 1.8015096932649612,
"ce_loss_52": 1.440242400765419,
"ce_loss_7": 2.878038114309311,
"epoch": 0.625,
"grad_norm": 14.471917182701668,
"kl_loss_13": 2262.4,
"kl_loss_26": 1124.4,
"kl_loss_39": 714.0,
"kl_loss_7": 2962.4,
"learning_rate": 0.0003141687721698363,
"loss": 3490.15,
"step": 6250
},
{
"ce_loss_13": 2.5199947118759156,
"ce_loss_26": 1.986537829041481,
"ce_loss_39": 1.796593463420868,
"ce_loss_52": 1.4425756543874741,
"ce_loss_7": 2.854223221540451,
"epoch": 0.626,
"grad_norm": 14.607284979944513,
"kl_loss_13": 2212.8,
"kl_loss_26": 1085.5,
"kl_loss_39": 686.9,
"kl_loss_7": 2911.6,
"learning_rate": 0.00031269670329204396,
"loss": 3493.3,
"step": 6260
},
{
"ce_loss_13": 2.5326361417770387,
"ce_loss_26": 2.002932494878769,
"ce_loss_39": 1.8195136040449142,
"ce_loss_52": 1.4736543655395509,
"ce_loss_7": 2.8653872847557067,
"epoch": 0.627,
"grad_norm": 13.814637143502924,
"kl_loss_13": 2172.2,
"kl_loss_26": 1054.9,
"kl_loss_39": 669.6,
"kl_loss_7": 2869.6,
"learning_rate": 0.00031122652055637015,
"loss": 3492.5,
"step": 6270
},
{
"ce_loss_13": 2.482167053222656,
"ce_loss_26": 1.9512667179107666,
"ce_loss_39": 1.7665533930063249,
"ce_loss_52": 1.42098408639431,
"ce_loss_7": 2.8168558061122893,
"epoch": 0.628,
"grad_norm": 16.02072851141737,
"kl_loss_13": 2193.0,
"kl_loss_26": 1069.4,
"kl_loss_39": 682.35,
"kl_loss_7": 2894.0,
"learning_rate": 0.0003097582387675385,
"loss": 3459.75,
"step": 6280
},
{
"ce_loss_13": 2.47237606048584,
"ce_loss_26": 1.95036241710186,
"ce_loss_39": 1.764494326710701,
"ce_loss_52": 1.4227848395705223,
"ce_loss_7": 2.8021757781505583,
"epoch": 0.629,
"grad_norm": 15.255934367745192,
"kl_loss_13": 2167.4,
"kl_loss_26": 1053.3,
"kl_loss_39": 664.9,
"kl_loss_7": 2858.8,
"learning_rate": 0.00030829187271113034,
"loss": 3446.7,
"step": 6290
},
{
"ce_loss_13": 2.501003822684288,
"ce_loss_26": 1.958929392695427,
"ce_loss_39": 1.76513631939888,
"ce_loss_52": 1.4046356767416,
"ce_loss_7": 2.8350019991397857,
"epoch": 0.63,
"grad_norm": 14.88714427947574,
"kl_loss_13": 2270.2,
"kl_loss_26": 1123.2,
"kl_loss_39": 716.8,
"kl_loss_7": 2970.0,
"learning_rate": 0.00030682743715343565,
"loss": 3508.45,
"step": 6300
},
{
"ce_loss_13": 2.5800569117069245,
"ce_loss_26": 2.0348214149475097,
"ce_loss_39": 1.8403811991214751,
"ce_loss_52": 1.476933541893959,
"ce_loss_7": 2.9134635806083677,
"epoch": 0.631,
"grad_norm": 14.94722074912583,
"kl_loss_13": 2245.0,
"kl_loss_26": 1107.2,
"kl_loss_39": 706.25,
"kl_loss_7": 2952.8,
"learning_rate": 0.0003053649468413043,
"loss": 3499.45,
"step": 6310
},
{
"ce_loss_13": 2.5208792209625246,
"ce_loss_26": 1.9822687000036239,
"ce_loss_39": 1.7960426419973374,
"ce_loss_52": 1.4455793976783753,
"ce_loss_7": 2.8597992181777956,
"epoch": 0.632,
"grad_norm": 15.453003389066977,
"kl_loss_13": 2216.2,
"kl_loss_26": 1070.1,
"kl_loss_39": 678.0,
"kl_loss_7": 2927.6,
"learning_rate": 0.00030390441650199725,
"loss": 3483.5,
"step": 6320
},
{
"ce_loss_13": 2.441950124502182,
"ce_loss_26": 1.91085105240345,
"ce_loss_39": 1.7248677492141724,
"ce_loss_52": 1.3885301351547241,
"ce_loss_7": 2.777034705877304,
"epoch": 0.633,
"grad_norm": 14.901626155092913,
"kl_loss_13": 2155.8,
"kl_loss_26": 1040.1,
"kl_loss_39": 653.2,
"kl_loss_7": 2859.2,
"learning_rate": 0.00030244586084303903,
"loss": 3433.35,
"step": 6330
},
{
"ce_loss_13": 2.4565594136714934,
"ce_loss_26": 1.9154207110404968,
"ce_loss_39": 1.7311667442321776,
"ce_loss_52": 1.3880270063877105,
"ce_loss_7": 2.79474156498909,
"epoch": 0.634,
"grad_norm": 15.212752240316364,
"kl_loss_13": 2198.2,
"kl_loss_26": 1053.8,
"kl_loss_39": 669.5,
"kl_loss_7": 2908.0,
"learning_rate": 0.00030098929455206903,
"loss": 3450.2,
"step": 6340
},
{
"ce_loss_13": 2.4875996589660643,
"ce_loss_26": 1.9456024587154388,
"ce_loss_39": 1.7510357975959778,
"ce_loss_52": 1.4120293408632278,
"ce_loss_7": 2.8184066474437715,
"epoch": 0.635,
"grad_norm": 14.444701123760842,
"kl_loss_13": 2190.4,
"kl_loss_26": 1070.0,
"kl_loss_39": 670.4,
"kl_loss_7": 2884.8,
"learning_rate": 0.00029953473229669324,
"loss": 3500.6,
"step": 6350
},
{
"ce_loss_13": 2.505690813064575,
"ce_loss_26": 1.968365904688835,
"ce_loss_39": 1.7826423317193985,
"ce_loss_52": 1.4394667357206345,
"ce_loss_7": 2.8453324735164642,
"epoch": 0.636,
"grad_norm": 14.480279054372499,
"kl_loss_13": 2207.2,
"kl_loss_26": 1070.3,
"kl_loss_39": 680.95,
"kl_loss_7": 2914.4,
"learning_rate": 0.00029808218872433767,
"loss": 3473.05,
"step": 6360
},
{
"ce_loss_13": 2.462022843956947,
"ce_loss_26": 1.9284056276082993,
"ce_loss_39": 1.7475204050540925,
"ce_loss_52": 1.3994116008281707,
"ce_loss_7": 2.799399584531784,
"epoch": 0.637,
"grad_norm": 14.854420396233579,
"kl_loss_13": 2187.4,
"kl_loss_26": 1057.9,
"kl_loss_39": 676.9,
"kl_loss_7": 2908.4,
"learning_rate": 0.0002966316784621,
"loss": 3431.55,
"step": 6370
},
{
"ce_loss_13": 2.46474946141243,
"ce_loss_26": 1.9243933081626892,
"ce_loss_39": 1.7351078271865845,
"ce_loss_52": 1.3951061010360717,
"ce_loss_7": 2.809561550617218,
"epoch": 0.638,
"grad_norm": 14.398656186824772,
"kl_loss_13": 2201.4,
"kl_loss_26": 1064.6,
"kl_loss_39": 673.7,
"kl_loss_7": 2913.6,
"learning_rate": 0.0002951832161166024,
"loss": 3433.1,
"step": 6380
},
{
"ce_loss_13": 2.524071788787842,
"ce_loss_26": 1.9882585108280182,
"ce_loss_39": 1.8004582822322845,
"ce_loss_52": 1.457192499935627,
"ce_loss_7": 2.859678488969803,
"epoch": 0.639,
"grad_norm": 14.823787609750735,
"kl_loss_13": 2198.8,
"kl_loss_26": 1062.9,
"kl_loss_39": 671.2,
"kl_loss_7": 2895.2,
"learning_rate": 0.0002937368162738445,
"loss": 3448.55,
"step": 6390
},
{
"ce_loss_13": 2.476853275299072,
"ce_loss_26": 1.940243661403656,
"ce_loss_39": 1.7542554527521133,
"ce_loss_52": 1.4153559118509293,
"ce_loss_7": 2.8181783974170687,
"epoch": 0.64,
"grad_norm": 14.674283953178037,
"kl_loss_13": 2177.8,
"kl_loss_26": 1053.1,
"kl_loss_39": 664.6,
"kl_loss_7": 2899.6,
"learning_rate": 0.0002922924934990568,
"loss": 3441.6,
"step": 6400
},
{
"ce_loss_13": 2.4689641296863556,
"ce_loss_26": 1.933935484290123,
"ce_loss_39": 1.7450189381837844,
"ce_loss_52": 1.3959352299571037,
"ce_loss_7": 2.8043021619319917,
"epoch": 0.641,
"grad_norm": 16.188741684715673,
"kl_loss_13": 2210.6,
"kl_loss_26": 1080.2,
"kl_loss_39": 681.3,
"kl_loss_7": 2920.8,
"learning_rate": 0.0002908502623365536,
"loss": 3439.95,
"step": 6410
},
{
"ce_loss_13": 2.51960112452507,
"ce_loss_26": 1.9860825181007384,
"ce_loss_39": 1.792076262831688,
"ce_loss_52": 1.4388326108455658,
"ce_loss_7": 2.8559226214885713,
"epoch": 0.642,
"grad_norm": 15.164079412983817,
"kl_loss_13": 2205.0,
"kl_loss_26": 1076.5,
"kl_loss_39": 686.15,
"kl_loss_7": 2899.6,
"learning_rate": 0.0002894101373095867,
"loss": 3403.75,
"step": 6420
},
{
"ce_loss_13": 2.57558217048645,
"ce_loss_26": 2.037161833047867,
"ce_loss_39": 1.8481904029846192,
"ce_loss_52": 1.4971486061811448,
"ce_loss_7": 2.9037492871284485,
"epoch": 0.643,
"grad_norm": 14.617370011749491,
"kl_loss_13": 2241.0,
"kl_loss_26": 1099.6,
"kl_loss_39": 706.2,
"kl_loss_7": 2940.0,
"learning_rate": 0.00028797213292019926,
"loss": 3465.25,
"step": 6430
},
{
"ce_loss_13": 2.4703142285346984,
"ce_loss_26": 1.9478118807077407,
"ce_loss_39": 1.7643914371728897,
"ce_loss_52": 1.4303042769432068,
"ce_loss_7": 2.8028221487998963,
"epoch": 0.644,
"grad_norm": 14.268057235198288,
"kl_loss_13": 2150.2,
"kl_loss_26": 1039.5,
"kl_loss_39": 657.25,
"kl_loss_7": 2845.2,
"learning_rate": 0.0002865362636490791,
"loss": 3397.05,
"step": 6440
},
{
"ce_loss_13": 2.5057600528001784,
"ce_loss_26": 1.963181382417679,
"ce_loss_39": 1.766686275601387,
"ce_loss_52": 1.4219027027487754,
"ce_loss_7": 2.842684781551361,
"epoch": 0.645,
"grad_norm": 15.007302910220881,
"kl_loss_13": 2227.2,
"kl_loss_26": 1085.9,
"kl_loss_39": 688.6,
"kl_loss_7": 2943.2,
"learning_rate": 0.0002851025439554142,
"loss": 3420.9,
"step": 6450
},
{
"ce_loss_13": 2.5524505376815796,
"ce_loss_26": 2.00695119202137,
"ce_loss_39": 1.812732595205307,
"ce_loss_52": 1.4619058847427369,
"ce_loss_7": 2.89662281870842,
"epoch": 0.646,
"grad_norm": 14.944231437877365,
"kl_loss_13": 2231.2,
"kl_loss_26": 1087.1,
"kl_loss_39": 688.9,
"kl_loss_7": 2952.8,
"learning_rate": 0.00028367098827674573,
"loss": 3473.25,
"step": 6460
},
{
"ce_loss_13": 2.5118141055107115,
"ce_loss_26": 1.9752016961574554,
"ce_loss_39": 1.790860089659691,
"ce_loss_52": 1.4514876693487166,
"ce_loss_7": 2.8457858681678774,
"epoch": 0.647,
"grad_norm": 14.47327693725919,
"kl_loss_13": 2178.4,
"kl_loss_26": 1057.8,
"kl_loss_39": 666.1,
"kl_loss_7": 2874.4,
"learning_rate": 0.00028224161102882397,
"loss": 3430.95,
"step": 6470
},
{
"ce_loss_13": 2.4975059896707537,
"ce_loss_26": 1.9533880710601808,
"ce_loss_39": 1.7617440074682236,
"ce_loss_52": 1.414775413274765,
"ce_loss_7": 2.8368520498275758,
"epoch": 0.648,
"grad_norm": 14.67215053951943,
"kl_loss_13": 2218.6,
"kl_loss_26": 1078.5,
"kl_loss_39": 690.45,
"kl_loss_7": 2930.0,
"learning_rate": 0.00028081442660546124,
"loss": 3435.85,
"step": 6480
},
{
"ce_loss_13": 2.4587242364883424,
"ce_loss_26": 1.9304294764995575,
"ce_loss_39": 1.7412324339151382,
"ce_loss_52": 1.4029302895069122,
"ce_loss_7": 2.7982348799705505,
"epoch": 0.649,
"grad_norm": 14.728330622094298,
"kl_loss_13": 2170.4,
"kl_loss_26": 1055.8,
"kl_loss_39": 664.85,
"kl_loss_7": 2878.8,
"learning_rate": 0.0002793894493783892,
"loss": 3431.05,
"step": 6490
},
{
"ce_loss_13": 2.5393730461597444,
"ce_loss_26": 1.9960223108530044,
"ce_loss_39": 1.808261874318123,
"ce_loss_52": 1.45538187623024,
"ce_loss_7": 2.8760022819042206,
"epoch": 0.65,
"grad_norm": 15.357970116880674,
"kl_loss_13": 2229.4,
"kl_loss_26": 1089.7,
"kl_loss_39": 695.95,
"kl_loss_7": 2930.8,
"learning_rate": 0.0002779666936971129,
"loss": 3429.5,
"step": 6500
},
{
"ce_loss_13": 2.496321311593056,
"ce_loss_26": 1.9730540215969086,
"ce_loss_39": 1.7846842855215073,
"ce_loss_52": 1.444144432246685,
"ce_loss_7": 2.8269869565963743,
"epoch": 0.651,
"grad_norm": 14.142211217794582,
"kl_loss_13": 2159.4,
"kl_loss_26": 1043.4,
"kl_loss_39": 660.95,
"kl_loss_7": 2856.0,
"learning_rate": 0.00027654617388876614,
"loss": 3409.65,
"step": 6510
},
{
"ce_loss_13": 2.4942274272441862,
"ce_loss_26": 1.9708451181650162,
"ce_loss_39": 1.781627294421196,
"ce_loss_52": 1.436317929625511,
"ce_loss_7": 2.824290210008621,
"epoch": 0.652,
"grad_norm": 14.257728738894219,
"kl_loss_13": 2158.0,
"kl_loss_26": 1061.3,
"kl_loss_39": 676.9,
"kl_loss_7": 2841.6,
"learning_rate": 0.0002751279042579672,
"loss": 3420.3,
"step": 6520
},
{
"ce_loss_13": 2.474119412899017,
"ce_loss_26": 1.9305396527051926,
"ce_loss_39": 1.745158138871193,
"ce_loss_52": 1.4086509764194488,
"ce_loss_7": 2.8088379979133604,
"epoch": 0.653,
"grad_norm": 14.091894330635501,
"kl_loss_13": 2189.8,
"kl_loss_26": 1060.2,
"kl_loss_39": 671.45,
"kl_loss_7": 2896.4,
"learning_rate": 0.00027371189908667604,
"loss": 3430.2,
"step": 6530
},
{
"ce_loss_13": 2.5130710184574125,
"ce_loss_26": 1.9766233384609222,
"ce_loss_39": 1.7931175470352172,
"ce_loss_52": 1.4404253482818603,
"ce_loss_7": 2.8409298956394196,
"epoch": 0.654,
"grad_norm": 14.81675411336707,
"kl_loss_13": 2196.2,
"kl_loss_26": 1079.4,
"kl_loss_39": 687.65,
"kl_loss_7": 2891.2,
"learning_rate": 0.00027229817263404863,
"loss": 3395.3,
"step": 6540
},
{
"ce_loss_13": 2.489423853158951,
"ce_loss_26": 1.9515893071889878,
"ce_loss_39": 1.7590265810489654,
"ce_loss_52": 1.4197510361671448,
"ce_loss_7": 2.832774597406387,
"epoch": 0.655,
"grad_norm": 14.759093026127282,
"kl_loss_13": 2178.4,
"kl_loss_26": 1050.2,
"kl_loss_39": 655.0,
"kl_loss_7": 2893.2,
"learning_rate": 0.0002708867391362948,
"loss": 3416.7,
"step": 6550
},
{
"ce_loss_13": 2.5174727141857147,
"ce_loss_26": 1.9717289686203003,
"ce_loss_39": 1.7858693569898605,
"ce_loss_52": 1.4473539382219314,
"ce_loss_7": 2.8465367794036864,
"epoch": 0.656,
"grad_norm": 14.064047100581472,
"kl_loss_13": 2182.8,
"kl_loss_26": 1041.7,
"kl_loss_39": 656.75,
"kl_loss_7": 2877.6,
"learning_rate": 0.0002694776128065345,
"loss": 3397.05,
"step": 6560
},
{
"ce_loss_13": 2.528963714838028,
"ce_loss_26": 1.981925156712532,
"ce_loss_39": 1.7907918602228166,
"ce_loss_52": 1.4499590158462525,
"ce_loss_7": 2.871799385547638,
"epoch": 0.657,
"grad_norm": 14.569880458498675,
"kl_loss_13": 2200.6,
"kl_loss_26": 1059.1,
"kl_loss_39": 660.35,
"kl_loss_7": 2915.2,
"learning_rate": 0.00026807080783465374,
"loss": 3393.8,
"step": 6570
},
{
"ce_loss_13": 2.5217245757579803,
"ce_loss_26": 1.9828781098127366,
"ce_loss_39": 1.7871235221624375,
"ce_loss_52": 1.4383462622761727,
"ce_loss_7": 2.864704269170761,
"epoch": 0.658,
"grad_norm": 14.405026774063144,
"kl_loss_13": 2246.6,
"kl_loss_26": 1094.4,
"kl_loss_39": 694.55,
"kl_loss_7": 2961.2,
"learning_rate": 0.00026666633838716316,
"loss": 3410.55,
"step": 6580
},
{
"ce_loss_13": 2.525897091627121,
"ce_loss_26": 1.9987964391708375,
"ce_loss_39": 1.815827977657318,
"ce_loss_52": 1.4701957792043685,
"ce_loss_7": 2.8572112381458283,
"epoch": 0.659,
"grad_norm": 14.76642356275535,
"kl_loss_13": 2192.0,
"kl_loss_26": 1080.0,
"kl_loss_39": 689.6,
"kl_loss_7": 2880.4,
"learning_rate": 0.00026526421860705474,
"loss": 3403.15,
"step": 6590
},
{
"ce_loss_13": 2.516835355758667,
"ce_loss_26": 1.9933927595615386,
"ce_loss_39": 1.8040386736392975,
"ce_loss_52": 1.4699463561177253,
"ce_loss_7": 2.839303117990494,
"epoch": 0.66,
"grad_norm": 15.026727729458214,
"kl_loss_13": 2153.2,
"kl_loss_26": 1047.1,
"kl_loss_39": 659.7,
"kl_loss_7": 2838.8,
"learning_rate": 0.0002638644626136587,
"loss": 3420.9,
"step": 6600
},
{
"ce_loss_13": 2.5133367598056795,
"ce_loss_26": 1.9757141143083572,
"ce_loss_39": 1.7787566870450973,
"ce_loss_52": 1.4421678900718689,
"ce_loss_7": 2.84059277176857,
"epoch": 0.661,
"grad_norm": 14.33502800394329,
"kl_loss_13": 2167.2,
"kl_loss_26": 1050.2,
"kl_loss_39": 655.95,
"kl_loss_7": 2853.6,
"learning_rate": 0.00026246708450250255,
"loss": 3363.15,
"step": 6610
},
{
"ce_loss_13": 2.530620867013931,
"ce_loss_26": 2.015302965044975,
"ce_loss_39": 1.8304377377033234,
"ce_loss_52": 1.4897648423910141,
"ce_loss_7": 2.84687961935997,
"epoch": 0.662,
"grad_norm": 14.322777446508148,
"kl_loss_13": 2141.4,
"kl_loss_26": 1047.1,
"kl_loss_39": 661.0,
"kl_loss_7": 2818.4,
"learning_rate": 0.00026107209834516854,
"loss": 3368.65,
"step": 6620
},
{
"ce_loss_13": 2.5134909957647324,
"ce_loss_26": 1.964612963795662,
"ce_loss_39": 1.7641142904758453,
"ce_loss_52": 1.4144739270210267,
"ce_loss_7": 2.851569724082947,
"epoch": 0.663,
"grad_norm": 14.388583697005986,
"kl_loss_13": 2236.4,
"kl_loss_26": 1088.4,
"kl_loss_39": 685.8,
"kl_loss_7": 2945.2,
"learning_rate": 0.0002596795181891514,
"loss": 3390.15,
"step": 6630
},
{
"ce_loss_13": 2.4643958449363708,
"ce_loss_26": 1.9360562086105346,
"ce_loss_39": 1.7430761098861693,
"ce_loss_52": 1.4107858330011367,
"ce_loss_7": 2.7886133015155794,
"epoch": 0.664,
"grad_norm": 14.667067788196036,
"kl_loss_13": 2160.8,
"kl_loss_26": 1055.2,
"kl_loss_39": 663.05,
"kl_loss_7": 2836.8,
"learning_rate": 0.000258289358057718,
"loss": 3433.7,
"step": 6640
},
{
"ce_loss_13": 2.470621481537819,
"ce_loss_26": 1.932977157831192,
"ce_loss_39": 1.7458938509225845,
"ce_loss_52": 1.40322026014328,
"ce_loss_7": 2.8033271014690397,
"epoch": 0.665,
"grad_norm": 14.559695450600435,
"kl_loss_13": 2196.2,
"kl_loss_26": 1066.5,
"kl_loss_39": 673.3,
"kl_loss_7": 2893.6,
"learning_rate": 0.0002569016319497657,
"loss": 3385.35,
"step": 6650
},
{
"ce_loss_13": 2.523782452940941,
"ce_loss_26": 1.9726400285959245,
"ce_loss_39": 1.783732882142067,
"ce_loss_52": 1.441526584327221,
"ce_loss_7": 2.8546798706054686,
"epoch": 0.666,
"grad_norm": 14.338494844564005,
"kl_loss_13": 2201.0,
"kl_loss_26": 1066.9,
"kl_loss_39": 675.1,
"kl_loss_7": 2913.6,
"learning_rate": 0.00025551635383968066,
"loss": 3431.65,
"step": 6660
},
{
"ce_loss_13": 2.496166667342186,
"ce_loss_26": 1.9616001814603805,
"ce_loss_39": 1.7780775994062423,
"ce_loss_52": 1.4437817305326461,
"ce_loss_7": 2.8353283524513246,
"epoch": 0.667,
"grad_norm": 14.333332930511547,
"kl_loss_13": 2162.8,
"kl_loss_26": 1041.8,
"kl_loss_39": 658.3,
"kl_loss_7": 2867.2,
"learning_rate": 0.00025413353767719804,
"loss": 3373.9,
"step": 6670
},
{
"ce_loss_13": 2.4899742186069487,
"ce_loss_26": 1.9639368683099747,
"ce_loss_39": 1.7742518305778503,
"ce_loss_52": 1.4562912076711654,
"ce_loss_7": 2.813701218366623,
"epoch": 0.668,
"grad_norm": 15.020866565536496,
"kl_loss_13": 2118.8,
"kl_loss_26": 1008.9,
"kl_loss_39": 626.15,
"kl_loss_7": 2799.6,
"learning_rate": 0.0002527531973872617,
"loss": 3354.0,
"step": 6680
},
{
"ce_loss_13": 2.4495032489299775,
"ce_loss_26": 1.9232689619064331,
"ce_loss_39": 1.7322240889072418,
"ce_loss_52": 1.4083685100078582,
"ce_loss_7": 2.7809501469135283,
"epoch": 0.669,
"grad_norm": 15.157984637483661,
"kl_loss_13": 2152.4,
"kl_loss_26": 1034.8,
"kl_loss_39": 641.05,
"kl_loss_7": 2845.2,
"learning_rate": 0.0002513753468698826,
"loss": 3397.05,
"step": 6690
},
{
"ce_loss_13": 2.5416204214096068,
"ce_loss_26": 1.9909723430871964,
"ce_loss_39": 1.797818985581398,
"ce_loss_52": 1.4566215574741364,
"ce_loss_7": 2.8839422285556795,
"epoch": 0.67,
"grad_norm": 14.731917297604895,
"kl_loss_13": 2207.8,
"kl_loss_26": 1067.0,
"kl_loss_39": 664.6,
"kl_loss_7": 2918.4,
"learning_rate": 0.0002500000000000001,
"loss": 3410.05,
"step": 6700
},
{
"ce_loss_13": 2.46220725774765,
"ce_loss_26": 1.9426458358764649,
"ce_loss_39": 1.7631336867809295,
"ce_loss_52": 1.4324473321437836,
"ce_loss_7": 2.7896072566509247,
"epoch": 0.671,
"grad_norm": 14.394157937336324,
"kl_loss_13": 2145.4,
"kl_loss_26": 1030.2,
"kl_loss_39": 651.6,
"kl_loss_7": 2836.8,
"learning_rate": 0.0002486271706273421,
"loss": 3349.6,
"step": 6710
},
{
"ce_loss_13": 2.4807921826839445,
"ce_loss_26": 1.9570556044578553,
"ce_loss_39": 1.776718083024025,
"ce_loss_52": 1.4477659314870834,
"ce_loss_7": 2.8088565468788147,
"epoch": 0.672,
"grad_norm": 14.57538335602299,
"kl_loss_13": 2109.6,
"kl_loss_26": 1009.8,
"kl_loss_39": 634.25,
"kl_loss_7": 2806.0,
"learning_rate": 0.0002472568725762853,
"loss": 3376.2,
"step": 6720
},
{
"ce_loss_13": 2.4802849024534224,
"ce_loss_26": 1.9443901777267456,
"ce_loss_39": 1.7523796886205674,
"ce_loss_52": 1.4139477282762527,
"ce_loss_7": 2.8129481852054594,
"epoch": 0.673,
"grad_norm": 14.144296062088605,
"kl_loss_13": 2194.4,
"kl_loss_26": 1069.6,
"kl_loss_39": 674.95,
"kl_loss_7": 2888.8,
"learning_rate": 0.00024588911964571554,
"loss": 3364.55,
"step": 6730
},
{
"ce_loss_13": 2.5132571697235107,
"ce_loss_26": 1.9828839927911759,
"ce_loss_39": 1.7922901511192322,
"ce_loss_52": 1.4625934183597564,
"ce_loss_7": 2.841163671016693,
"epoch": 0.674,
"grad_norm": 14.199249331732203,
"kl_loss_13": 2159.4,
"kl_loss_26": 1039.5,
"kl_loss_39": 646.65,
"kl_loss_7": 2846.0,
"learning_rate": 0.00024452392560888974,
"loss": 3361.05,
"step": 6740
},
{
"ce_loss_13": 2.4865836411714555,
"ce_loss_26": 1.954461258649826,
"ce_loss_39": 1.7649456202983855,
"ce_loss_52": 1.4221005111932754,
"ce_loss_7": 2.821400898694992,
"epoch": 0.675,
"grad_norm": 14.682119193582665,
"kl_loss_13": 2206.2,
"kl_loss_26": 1077.5,
"kl_loss_39": 678.9,
"kl_loss_7": 2902.4,
"learning_rate": 0.00024316130421329695,
"loss": 3347.95,
"step": 6750
},
{
"ce_loss_13": 2.474187096953392,
"ce_loss_26": 1.9446223825216293,
"ce_loss_39": 1.7641993075609208,
"ce_loss_52": 1.436264917254448,
"ce_loss_7": 2.8015049755573274,
"epoch": 0.676,
"grad_norm": 14.72420694586694,
"kl_loss_13": 2155.0,
"kl_loss_26": 1030.2,
"kl_loss_39": 646.8,
"kl_loss_7": 2843.6,
"learning_rate": 0.00024180126918051909,
"loss": 3348.9,
"step": 6760
},
{
"ce_loss_13": 2.480317395925522,
"ce_loss_26": 1.956321433186531,
"ce_loss_39": 1.7687535285949707,
"ce_loss_52": 1.4266707986593246,
"ce_loss_7": 2.808481311798096,
"epoch": 0.677,
"grad_norm": 15.416504395614744,
"kl_loss_13": 2171.4,
"kl_loss_26": 1058.8,
"kl_loss_39": 666.6,
"kl_loss_7": 2869.2,
"learning_rate": 0.00024044383420609406,
"loss": 3413.1,
"step": 6770
},
{
"ce_loss_13": 2.5013114362955093,
"ce_loss_26": 1.9759581625461577,
"ce_loss_39": 1.7926720827817917,
"ce_loss_52": 1.4599666327238083,
"ce_loss_7": 2.8250863194465636,
"epoch": 0.678,
"grad_norm": 13.962008513939274,
"kl_loss_13": 2133.8,
"kl_loss_26": 1039.8,
"kl_loss_39": 651.85,
"kl_loss_7": 2824.8,
"learning_rate": 0.00023908901295937712,
"loss": 3372.05,
"step": 6780
},
{
"ce_loss_13": 2.489407476782799,
"ce_loss_26": 1.9609563022851944,
"ce_loss_39": 1.778808832168579,
"ce_loss_52": 1.454407089948654,
"ce_loss_7": 2.8136882543563844,
"epoch": 0.679,
"grad_norm": 14.19915367627698,
"kl_loss_13": 2110.2,
"kl_loss_26": 1016.4,
"kl_loss_39": 637.05,
"kl_loss_7": 2792.0,
"learning_rate": 0.00023773681908340283,
"loss": 3384.7,
"step": 6790
},
{
"ce_loss_13": 2.4634098410606384,
"ce_loss_26": 1.9384621411561966,
"ce_loss_39": 1.7507896840572357,
"ce_loss_52": 1.4146029382944107,
"ce_loss_7": 2.7943048059940336,
"epoch": 0.68,
"grad_norm": 14.861512399701681,
"kl_loss_13": 2166.4,
"kl_loss_26": 1051.5,
"kl_loss_39": 658.6,
"kl_loss_7": 2860.8,
"learning_rate": 0.00023638726619474876,
"loss": 3356.85,
"step": 6800
},
{
"ce_loss_13": 2.5783134520053865,
"ce_loss_26": 2.045197767019272,
"ce_loss_39": 1.8584345400333404,
"ce_loss_52": 1.5217636466026305,
"ce_loss_7": 2.9070691764354706,
"epoch": 0.681,
"grad_norm": 14.552198364638281,
"kl_loss_13": 2176.2,
"kl_loss_26": 1054.5,
"kl_loss_39": 659.7,
"kl_loss_7": 2877.6,
"learning_rate": 0.0002350403678833976,
"loss": 3347.55,
"step": 6810
},
{
"ce_loss_13": 2.4692390322685243,
"ce_loss_26": 1.943667185306549,
"ce_loss_39": 1.7523112028837204,
"ce_loss_52": 1.426128900051117,
"ce_loss_7": 2.7978816986083985,
"epoch": 0.682,
"grad_norm": 14.998678580001265,
"kl_loss_13": 2167.2,
"kl_loss_26": 1050.9,
"kl_loss_39": 652.95,
"kl_loss_7": 2852.4,
"learning_rate": 0.00023369613771260007,
"loss": 3369.6,
"step": 6820
},
{
"ce_loss_13": 2.4953604638576508,
"ce_loss_26": 1.981321769952774,
"ce_loss_39": 1.7999033033847809,
"ce_loss_52": 1.4693025022745132,
"ce_loss_7": 2.826812982559204,
"epoch": 0.683,
"grad_norm": 14.167664016838927,
"kl_loss_13": 2131.6,
"kl_loss_26": 1041.6,
"kl_loss_39": 654.4,
"kl_loss_7": 2822.8,
"learning_rate": 0.00023235458921873925,
"loss": 3334.7,
"step": 6830
},
{
"ce_loss_13": 2.5015017211437227,
"ce_loss_26": 1.9650517791509627,
"ce_loss_39": 1.7717696577310562,
"ce_loss_52": 1.4357108920812607,
"ce_loss_7": 2.827932006120682,
"epoch": 0.684,
"grad_norm": 14.585579391353733,
"kl_loss_13": 2162.0,
"kl_loss_26": 1051.0,
"kl_loss_39": 656.3,
"kl_loss_7": 2848.0,
"learning_rate": 0.0002310157359111938,
"loss": 3348.15,
"step": 6840
},
{
"ce_loss_13": 2.426406466960907,
"ce_loss_26": 1.9015664726495742,
"ce_loss_39": 1.718049594759941,
"ce_loss_52": 1.3935224622488023,
"ce_loss_7": 2.756322818994522,
"epoch": 0.685,
"grad_norm": 15.055484269746147,
"kl_loss_13": 2134.6,
"kl_loss_26": 1022.1,
"kl_loss_39": 635.6,
"kl_loss_7": 2830.0,
"learning_rate": 0.0002296795912722014,
"loss": 3335.95,
"step": 6850
},
{
"ce_loss_13": 2.414138987660408,
"ce_loss_26": 1.8910569071769714,
"ce_loss_39": 1.708191841840744,
"ce_loss_52": 1.383391012251377,
"ce_loss_7": 2.747501391172409,
"epoch": 0.686,
"grad_norm": 14.125926922706771,
"kl_loss_13": 2113.8,
"kl_loss_26": 1014.7,
"kl_loss_39": 637.05,
"kl_loss_7": 2808.8,
"learning_rate": 0.0002283461687567236,
"loss": 3303.65,
"step": 6860
},
{
"ce_loss_13": 2.464204970002174,
"ce_loss_26": 1.9375032573938369,
"ce_loss_39": 1.7498446986079217,
"ce_loss_52": 1.4226357489824295,
"ce_loss_7": 2.7877202153205873,
"epoch": 0.687,
"grad_norm": 14.738238275957917,
"kl_loss_13": 2139.2,
"kl_loss_26": 1028.9,
"kl_loss_39": 644.15,
"kl_loss_7": 2826.0,
"learning_rate": 0.00022701548179231045,
"loss": 3307.2,
"step": 6870
},
{
"ce_loss_13": 2.494007241725922,
"ce_loss_26": 1.9751898407936097,
"ce_loss_39": 1.7829045623540878,
"ce_loss_52": 1.4496880739927291,
"ce_loss_7": 2.8286533296108245,
"epoch": 0.688,
"grad_norm": 13.852391362837148,
"kl_loss_13": 2141.2,
"kl_loss_26": 1042.5,
"kl_loss_39": 651.3,
"kl_loss_7": 2830.8,
"learning_rate": 0.00022568754377896516,
"loss": 3367.25,
"step": 6880
},
{
"ce_loss_13": 2.4991670876741408,
"ce_loss_26": 1.9593406468629837,
"ce_loss_39": 1.7670493572950363,
"ce_loss_52": 1.426390787959099,
"ce_loss_7": 2.8318077862262725,
"epoch": 0.689,
"grad_norm": 14.446284686187164,
"kl_loss_13": 2202.8,
"kl_loss_26": 1070.1,
"kl_loss_39": 671.2,
"kl_loss_7": 2903.6,
"learning_rate": 0.00022436236808900844,
"loss": 3351.3,
"step": 6890
},
{
"ce_loss_13": 2.5084181427955627,
"ce_loss_26": 1.9811445116996764,
"ce_loss_39": 1.7889297604560852,
"ce_loss_52": 1.4664145559072495,
"ce_loss_7": 2.8357039868831633,
"epoch": 0.69,
"grad_norm": 14.484209525578123,
"kl_loss_13": 2151.6,
"kl_loss_26": 1045.5,
"kl_loss_39": 652.25,
"kl_loss_7": 2853.6,
"learning_rate": 0.00022303996806694487,
"loss": 3356.65,
"step": 6900
},
{
"ce_loss_13": 2.514096361398697,
"ce_loss_26": 1.9786822557449342,
"ce_loss_39": 1.7935864567756652,
"ce_loss_52": 1.4548332244157791,
"ce_loss_7": 2.839564120769501,
"epoch": 0.691,
"grad_norm": 13.76338668087359,
"kl_loss_13": 2184.2,
"kl_loss_26": 1055.1,
"kl_loss_39": 667.4,
"kl_loss_7": 2879.6,
"learning_rate": 0.00022172035702932823,
"loss": 3337.1,
"step": 6910
},
{
"ce_loss_13": 2.4764732241630556,
"ce_loss_26": 1.95515196621418,
"ce_loss_39": 1.772997224330902,
"ce_loss_52": 1.4487987339496613,
"ce_loss_7": 2.8033831179142,
"epoch": 0.692,
"grad_norm": 14.586346705034499,
"kl_loss_13": 2110.2,
"kl_loss_26": 1014.4,
"kl_loss_39": 638.2,
"kl_loss_7": 2793.2,
"learning_rate": 0.00022040354826462666,
"loss": 3310.75,
"step": 6920
},
{
"ce_loss_13": 2.468746620416641,
"ce_loss_26": 1.9468185782432557,
"ce_loss_39": 1.7679857224225999,
"ce_loss_52": 1.4481467604637146,
"ce_loss_7": 2.789846181869507,
"epoch": 0.693,
"grad_norm": 15.005402062047136,
"kl_loss_13": 2109.6,
"kl_loss_26": 1011.9,
"kl_loss_39": 635.85,
"kl_loss_7": 2789.6,
"learning_rate": 0.0002190895550330899,
"loss": 3354.5,
"step": 6930
},
{
"ce_loss_13": 2.4501267641782762,
"ce_loss_26": 1.9237078607082367,
"ce_loss_39": 1.735903450846672,
"ce_loss_52": 1.413103035092354,
"ce_loss_7": 2.7897567749023438,
"epoch": 0.694,
"grad_norm": 14.894293084006021,
"kl_loss_13": 2115.4,
"kl_loss_26": 1009.8,
"kl_loss_39": 623.75,
"kl_loss_7": 2825.6,
"learning_rate": 0.00021777839056661552,
"loss": 3328.85,
"step": 6940
},
{
"ce_loss_13": 2.5025814145803453,
"ce_loss_26": 1.9697064816951753,
"ce_loss_39": 1.7858674556016922,
"ce_loss_52": 1.459896171092987,
"ce_loss_7": 2.8366158485412596,
"epoch": 0.695,
"grad_norm": 14.62978428769736,
"kl_loss_13": 2143.8,
"kl_loss_26": 1026.4,
"kl_loss_39": 644.0,
"kl_loss_7": 2842.8,
"learning_rate": 0.0002164700680686147,
"loss": 3339.6,
"step": 6950
},
{
"ce_loss_13": 2.480437287688255,
"ce_loss_26": 1.9569555580615998,
"ce_loss_39": 1.7707047134637832,
"ce_loss_52": 1.4560858264565468,
"ce_loss_7": 2.8063796043395994,
"epoch": 0.696,
"grad_norm": 14.305917479002375,
"kl_loss_13": 2104.0,
"kl_loss_26": 1003.9,
"kl_loss_39": 621.6,
"kl_loss_7": 2791.2,
"learning_rate": 0.0002151646007138806,
"loss": 3346.15,
"step": 6960
},
{
"ce_loss_13": 2.4904795557260515,
"ce_loss_26": 1.9593748480081559,
"ce_loss_39": 1.7725317537784577,
"ce_loss_52": 1.4421725705266,
"ce_loss_7": 2.82717769742012,
"epoch": 0.697,
"grad_norm": 14.55516871467818,
"kl_loss_13": 2171.2,
"kl_loss_26": 1046.7,
"kl_loss_39": 655.3,
"kl_loss_7": 2878.4,
"learning_rate": 0.00021386200164845526,
"loss": 3321.35,
"step": 6970
},
{
"ce_loss_13": 2.4793561339378356,
"ce_loss_26": 1.9439466089010238,
"ce_loss_39": 1.7569547444581985,
"ce_loss_52": 1.4289204239845277,
"ce_loss_7": 2.807218599319458,
"epoch": 0.698,
"grad_norm": 13.778785355416604,
"kl_loss_13": 2152.0,
"kl_loss_26": 1037.7,
"kl_loss_39": 642.8,
"kl_loss_7": 2848.0,
"learning_rate": 0.0002125622839894964,
"loss": 3315.5,
"step": 6980
},
{
"ce_loss_13": 2.585531139373779,
"ce_loss_26": 2.031143417954445,
"ce_loss_39": 1.8274976074695588,
"ce_loss_52": 1.4753503799438477,
"ce_loss_7": 2.921118849515915,
"epoch": 0.699,
"grad_norm": 14.664216613597723,
"kl_loss_13": 2260.4,
"kl_loss_26": 1103.8,
"kl_loss_39": 685.9,
"kl_loss_7": 2968.0,
"learning_rate": 0.00021126546082514663,
"loss": 3365.55,
"step": 6990
},
{
"ce_loss_13": 2.478432095050812,
"ce_loss_26": 1.950699546933174,
"ce_loss_39": 1.7660550504922867,
"ce_loss_52": 1.4418641477823257,
"ce_loss_7": 2.802116149663925,
"epoch": 0.7,
"grad_norm": 14.414270184643762,
"kl_loss_13": 2128.4,
"kl_loss_26": 1022.4,
"kl_loss_39": 639.9,
"kl_loss_7": 2808.8,
"learning_rate": 0.00020997154521440098,
"loss": 3312.0,
"step": 7000
},
{
"ce_loss_13": 2.4484674006700518,
"ce_loss_26": 1.9334596753120423,
"ce_loss_39": 1.754175427556038,
"ce_loss_52": 1.4356836065649987,
"ce_loss_7": 2.7715867519378663,
"epoch": 0.701,
"grad_norm": 15.009606673101777,
"kl_loss_13": 2096.6,
"kl_loss_26": 1005.6,
"kl_loss_39": 630.9,
"kl_loss_7": 2776.0,
"learning_rate": 0.0002086805501869749,
"loss": 3296.9,
"step": 7010
},
{
"ce_loss_13": 2.500411355495453,
"ce_loss_26": 1.9781237423419953,
"ce_loss_39": 1.7995324105024337,
"ce_loss_52": 1.473289003968239,
"ce_loss_7": 2.8238317251205443,
"epoch": 0.702,
"grad_norm": 14.811490845229788,
"kl_loss_13": 2108.4,
"kl_loss_26": 1018.1,
"kl_loss_39": 642.65,
"kl_loss_7": 2786.4,
"learning_rate": 0.0002073924887431744,
"loss": 3301.85,
"step": 7020
},
{
"ce_loss_13": 2.4331007301807404,
"ce_loss_26": 1.925421604514122,
"ce_loss_39": 1.740053552389145,
"ce_loss_52": 1.4260162442922593,
"ce_loss_7": 2.750682008266449,
"epoch": 0.703,
"grad_norm": 14.093530596736626,
"kl_loss_13": 2071.0,
"kl_loss_26": 996.4,
"kl_loss_39": 613.7,
"kl_loss_7": 2741.2,
"learning_rate": 0.00020610737385376348,
"loss": 3303.45,
"step": 7030
},
{
"ce_loss_13": 2.438492274284363,
"ce_loss_26": 1.917963182926178,
"ce_loss_39": 1.731092056632042,
"ce_loss_52": 1.4118753850460053,
"ce_loss_7": 2.7640158772468566,
"epoch": 0.704,
"grad_norm": 14.575552477174394,
"kl_loss_13": 2113.4,
"kl_loss_26": 1019.3,
"kl_loss_39": 632.85,
"kl_loss_7": 2804.0,
"learning_rate": 0.00020482521845983521,
"loss": 3301.55,
"step": 7040
},
{
"ce_loss_13": 2.4910335719585417,
"ce_loss_26": 1.9715039610862732,
"ce_loss_39": 1.7800425946712495,
"ce_loss_52": 1.4519392430782319,
"ce_loss_7": 2.819239354133606,
"epoch": 0.705,
"grad_norm": 14.820051388586238,
"kl_loss_13": 2136.2,
"kl_loss_26": 1040.9,
"kl_loss_39": 651.75,
"kl_loss_7": 2824.0,
"learning_rate": 0.00020354603547267987,
"loss": 3316.6,
"step": 7050
},
{
"ce_loss_13": 2.4318030804395674,
"ce_loss_26": 1.910761234164238,
"ce_loss_39": 1.7284663796424866,
"ce_loss_52": 1.4129984229803085,
"ce_loss_7": 2.7625936210155486,
"epoch": 0.706,
"grad_norm": 14.59547057670379,
"kl_loss_13": 2113.8,
"kl_loss_26": 1004.1,
"kl_loss_39": 622.2,
"kl_loss_7": 2816.4,
"learning_rate": 0.00020226983777365604,
"loss": 3284.95,
"step": 7060
},
{
"ce_loss_13": 2.4749036192893983,
"ce_loss_26": 1.9450800210237502,
"ce_loss_39": 1.761537629365921,
"ce_loss_52": 1.4373356252908707,
"ce_loss_7": 2.809742730855942,
"epoch": 0.707,
"grad_norm": 14.651682120589488,
"kl_loss_13": 2148.8,
"kl_loss_26": 1038.1,
"kl_loss_39": 647.05,
"kl_loss_7": 2860.4,
"learning_rate": 0.00020099663821406056,
"loss": 3330.65,
"step": 7070
},
{
"ce_loss_13": 2.500520494580269,
"ce_loss_26": 1.9711394160985947,
"ce_loss_39": 1.7850598603487016,
"ce_loss_52": 1.4572303384542464,
"ce_loss_7": 2.822962099313736,
"epoch": 0.708,
"grad_norm": 14.695952402949361,
"kl_loss_13": 2140.6,
"kl_loss_26": 1032.0,
"kl_loss_39": 642.6,
"kl_loss_7": 2828.8,
"learning_rate": 0.00019972644961499853,
"loss": 3310.1,
"step": 7080
},
{
"ce_loss_13": 2.4471381455659866,
"ce_loss_26": 1.9142519533634186,
"ce_loss_39": 1.7273518294095993,
"ce_loss_52": 1.4090154066681861,
"ce_loss_7": 2.775345432758331,
"epoch": 0.709,
"grad_norm": 14.907893630817398,
"kl_loss_13": 2145.0,
"kl_loss_26": 1028.7,
"kl_loss_39": 636.3,
"kl_loss_7": 2839.2,
"learning_rate": 0.00019845928476725522,
"loss": 3284.4,
"step": 7090
},
{
"ce_loss_13": 2.484838107228279,
"ce_loss_26": 1.9752304345369338,
"ce_loss_39": 1.7935984045267106,
"ce_loss_52": 1.4694376409053802,
"ce_loss_7": 2.8051605463027953,
"epoch": 0.71,
"grad_norm": 14.813324344167642,
"kl_loss_13": 2100.4,
"kl_loss_26": 1009.5,
"kl_loss_39": 629.85,
"kl_loss_7": 2784.0,
"learning_rate": 0.00019719515643116677,
"loss": 3271.1,
"step": 7100
},
{
"ce_loss_13": 2.449986720085144,
"ce_loss_26": 1.915557289123535,
"ce_loss_39": 1.7244810461997986,
"ce_loss_52": 1.4009377419948579,
"ce_loss_7": 2.7815246999263765,
"epoch": 0.711,
"grad_norm": 14.72405028446814,
"kl_loss_13": 2128.8,
"kl_loss_26": 1018.3,
"kl_loss_39": 632.1,
"kl_loss_7": 2824.4,
"learning_rate": 0.0001959340773364911,
"loss": 3301.5,
"step": 7110
},
{
"ce_loss_13": 2.4507795870304108,
"ce_loss_26": 1.9264422208070755,
"ce_loss_39": 1.7369968056678773,
"ce_loss_52": 1.4171953916549682,
"ce_loss_7": 2.778002160787582,
"epoch": 0.712,
"grad_norm": 15.123641060610607,
"kl_loss_13": 2145.2,
"kl_loss_26": 1034.6,
"kl_loss_39": 640.6,
"kl_loss_7": 2832.0,
"learning_rate": 0.0001946760601822809,
"loss": 3307.65,
"step": 7120
},
{
"ce_loss_13": 2.4649185329675674,
"ce_loss_26": 1.9448591649532319,
"ce_loss_39": 1.7617656499147416,
"ce_loss_52": 1.4421708196401597,
"ce_loss_7": 2.7942136943340303,
"epoch": 0.713,
"grad_norm": 13.86141587264665,
"kl_loss_13": 2099.6,
"kl_loss_26": 996.3,
"kl_loss_39": 613.9,
"kl_loss_7": 2784.0,
"learning_rate": 0.00019342111763675512,
"loss": 3264.15,
"step": 7130
},
{
"ce_loss_13": 2.431650939583778,
"ce_loss_26": 1.8971330910921096,
"ce_loss_39": 1.7134798288345336,
"ce_loss_52": 1.3959884241223335,
"ce_loss_7": 2.7688992261886596,
"epoch": 0.714,
"grad_norm": 14.868179084191116,
"kl_loss_13": 2103.6,
"kl_loss_26": 997.3,
"kl_loss_39": 614.55,
"kl_loss_7": 2798.8,
"learning_rate": 0.00019216926233717085,
"loss": 3302.05,
"step": 7140
},
{
"ce_loss_13": 2.4574439406394957,
"ce_loss_26": 1.9289735972881317,
"ce_loss_39": 1.738691231608391,
"ce_loss_52": 1.4203062415122987,
"ce_loss_7": 2.7880250751972198,
"epoch": 0.715,
"grad_norm": 14.757879306344181,
"kl_loss_13": 2133.4,
"kl_loss_26": 1021.4,
"kl_loss_39": 631.75,
"kl_loss_7": 2823.2,
"learning_rate": 0.00019092050688969737,
"loss": 3296.5,
"step": 7150
},
{
"ce_loss_13": 2.4601316511631013,
"ce_loss_26": 1.9434845715761184,
"ce_loss_39": 1.7585901826620103,
"ce_loss_52": 1.4390017569065094,
"ce_loss_7": 2.7778116285800936,
"epoch": 0.716,
"grad_norm": 13.991843131427743,
"kl_loss_13": 2085.4,
"kl_loss_26": 1007.0,
"kl_loss_39": 627.45,
"kl_loss_7": 2755.6,
"learning_rate": 0.00018967486386928817,
"loss": 3286.15,
"step": 7160
},
{
"ce_loss_13": 2.451919847726822,
"ce_loss_26": 1.9279222816228867,
"ce_loss_39": 1.7440420866012574,
"ce_loss_52": 1.4374898225069046,
"ce_loss_7": 2.784500467777252,
"epoch": 0.717,
"grad_norm": 14.5708304909804,
"kl_loss_13": 2095.4,
"kl_loss_26": 992.5,
"kl_loss_39": 611.75,
"kl_loss_7": 2794.0,
"learning_rate": 0.00018843234581955443,
"loss": 3292.25,
"step": 7170
},
{
"ce_loss_13": 2.4709593683481215,
"ce_loss_26": 1.9460813373327255,
"ce_loss_39": 1.7575767368078232,
"ce_loss_52": 1.4326383203268052,
"ce_loss_7": 2.7951516568660737,
"epoch": 0.718,
"grad_norm": 14.981137787748375,
"kl_loss_13": 2117.8,
"kl_loss_26": 1019.7,
"kl_loss_39": 633.35,
"kl_loss_7": 2803.2,
"learning_rate": 0.00018719296525263924,
"loss": 3299.6,
"step": 7180
},
{
"ce_loss_13": 2.4041130542755127,
"ce_loss_26": 1.8861528187990189,
"ce_loss_39": 1.7035977393388748,
"ce_loss_52": 1.4066498517990111,
"ce_loss_7": 2.735060691833496,
"epoch": 0.719,
"grad_norm": 14.986994612654895,
"kl_loss_13": 2054.2,
"kl_loss_26": 961.6,
"kl_loss_39": 585.6,
"kl_loss_7": 2750.4,
"learning_rate": 0.0001859567346490913,
"loss": 3264.25,
"step": 7190
},
{
"ce_loss_13": 2.521838116645813,
"ce_loss_26": 2.004297485947609,
"ce_loss_39": 1.810739102959633,
"ce_loss_52": 1.4783420652151107,
"ce_loss_7": 2.849927377700806,
"epoch": 0.72,
"grad_norm": 14.181310648182276,
"kl_loss_13": 2154.6,
"kl_loss_26": 1052.9,
"kl_loss_39": 657.35,
"kl_loss_7": 2848.4,
"learning_rate": 0.0001847236664577389,
"loss": 3278.0,
"step": 7200
},
{
"ce_loss_13": 2.40320103764534,
"ce_loss_26": 1.8900187402963637,
"ce_loss_39": 1.7100308045744896,
"ce_loss_52": 1.3999869018793105,
"ce_loss_7": 2.736201885342598,
"epoch": 0.721,
"grad_norm": 14.793709205482683,
"kl_loss_13": 2080.8,
"kl_loss_26": 990.7,
"kl_loss_39": 610.95,
"kl_loss_7": 2780.0,
"learning_rate": 0.00018349377309556487,
"loss": 3283.25,
"step": 7210
},
{
"ce_loss_13": 2.441250967979431,
"ce_loss_26": 1.9231963992118835,
"ce_loss_39": 1.7388009175658226,
"ce_loss_52": 1.4282974660396577,
"ce_loss_7": 2.7717535465955736,
"epoch": 0.722,
"grad_norm": 15.59238941344996,
"kl_loss_13": 2104.0,
"kl_loss_26": 997.6,
"kl_loss_39": 616.85,
"kl_loss_7": 2801.2,
"learning_rate": 0.00018226706694758193,
"loss": 3263.75,
"step": 7220
},
{
"ce_loss_13": 2.495049071311951,
"ce_loss_26": 1.973162716627121,
"ce_loss_39": 1.7896722644567489,
"ce_loss_52": 1.4697123229503632,
"ce_loss_7": 2.8224462032318116,
"epoch": 0.723,
"grad_norm": 13.997878236797012,
"kl_loss_13": 2123.2,
"kl_loss_26": 1009.4,
"kl_loss_39": 628.15,
"kl_loss_7": 2823.2,
"learning_rate": 0.0001810435603667075,
"loss": 3267.75,
"step": 7230
},
{
"ce_loss_13": 2.4492597192525865,
"ce_loss_26": 1.9282636791467667,
"ce_loss_39": 1.7473824605345727,
"ce_loss_52": 1.4372958570718766,
"ce_loss_7": 2.782766741514206,
"epoch": 0.724,
"grad_norm": 14.73683414882718,
"kl_loss_13": 2088.6,
"kl_loss_26": 990.1,
"kl_loss_39": 613.75,
"kl_loss_7": 2799.6,
"learning_rate": 0.0001798232656736389,
"loss": 3246.35,
"step": 7240
},
{
"ce_loss_13": 2.514770272374153,
"ce_loss_26": 1.9697564780712127,
"ce_loss_39": 1.7806446701288223,
"ce_loss_52": 1.456220605969429,
"ce_loss_7": 2.8567338407039644,
"epoch": 0.725,
"grad_norm": 14.87142240672514,
"kl_loss_13": 2179.8,
"kl_loss_26": 1037.2,
"kl_loss_39": 644.4,
"kl_loss_7": 2889.2,
"learning_rate": 0.0001786061951567303,
"loss": 3273.6,
"step": 7250
},
{
"ce_loss_13": 2.4067626029253004,
"ce_loss_26": 1.8963259696960448,
"ce_loss_39": 1.7143886119127274,
"ce_loss_52": 1.4022117048501967,
"ce_loss_7": 2.7348236978054046,
"epoch": 0.726,
"grad_norm": 14.076850795507209,
"kl_loss_13": 2076.6,
"kl_loss_26": 994.5,
"kl_loss_39": 623.5,
"kl_loss_7": 2766.8,
"learning_rate": 0.00017739236107186857,
"loss": 3281.2,
"step": 7260
},
{
"ce_loss_13": 2.4501163721084596,
"ce_loss_26": 1.926158633828163,
"ce_loss_39": 1.7434884279966354,
"ce_loss_52": 1.4286866545677186,
"ce_loss_7": 2.777343970537186,
"epoch": 0.727,
"grad_norm": 13.813498461115062,
"kl_loss_13": 2114.6,
"kl_loss_26": 1012.1,
"kl_loss_39": 626.6,
"kl_loss_7": 2806.0,
"learning_rate": 0.00017618177564234904,
"loss": 3264.1,
"step": 7270
},
{
"ce_loss_13": 2.4412575274705888,
"ce_loss_26": 1.9107532769441604,
"ce_loss_39": 1.7259235098958015,
"ce_loss_52": 1.4076966106891633,
"ce_loss_7": 2.774235662817955,
"epoch": 0.728,
"grad_norm": 14.801740311988109,
"kl_loss_13": 2113.4,
"kl_loss_26": 1003.6,
"kl_loss_39": 619.55,
"kl_loss_7": 2807.4,
"learning_rate": 0.00017497445105875377,
"loss": 3298.7,
"step": 7280
},
{
"ce_loss_13": 2.445681685209274,
"ce_loss_26": 1.9345449537038804,
"ce_loss_39": 1.7532619833946228,
"ce_loss_52": 1.442472691833973,
"ce_loss_7": 2.764800661802292,
"epoch": 0.729,
"grad_norm": 14.618830047757731,
"kl_loss_13": 2063.8,
"kl_loss_26": 980.7,
"kl_loss_39": 603.55,
"kl_loss_7": 2742.0,
"learning_rate": 0.000173770399478828,
"loss": 3226.7,
"step": 7290
},
{
"ce_loss_13": 2.4301975846290587,
"ce_loss_26": 1.9138565450906753,
"ce_loss_39": 1.736141037940979,
"ce_loss_52": 1.427780945599079,
"ce_loss_7": 2.760072636604309,
"epoch": 0.73,
"grad_norm": 14.242974774472335,
"kl_loss_13": 2095.4,
"kl_loss_26": 990.5,
"kl_loss_39": 610.6,
"kl_loss_7": 2791.2,
"learning_rate": 0.0001725696330273575,
"loss": 3260.65,
"step": 7300
},
{
"ce_loss_13": 2.4727762907743456,
"ce_loss_26": 1.9557204306125642,
"ce_loss_39": 1.766261911392212,
"ce_loss_52": 1.4398792043328286,
"ce_loss_7": 2.79817710518837,
"epoch": 0.731,
"grad_norm": 14.566153451338561,
"kl_loss_13": 2113.6,
"kl_loss_26": 1018.5,
"kl_loss_39": 628.8,
"kl_loss_7": 2796.8,
"learning_rate": 0.00017137216379604724,
"loss": 3240.75,
"step": 7310
},
{
"ce_loss_13": 2.490224635601044,
"ce_loss_26": 1.954663872718811,
"ce_loss_39": 1.7594738394021987,
"ce_loss_52": 1.4360380351543427,
"ce_loss_7": 2.8263917326927186,
"epoch": 0.732,
"grad_norm": 13.205540898906253,
"kl_loss_13": 2161.8,
"kl_loss_26": 1044.6,
"kl_loss_39": 637.45,
"kl_loss_7": 2862.4,
"learning_rate": 0.00017017800384339925,
"loss": 3258.4,
"step": 7320
},
{
"ce_loss_13": 2.4344683617353438,
"ce_loss_26": 1.9195960253477096,
"ce_loss_39": 1.7325531929731368,
"ce_loss_52": 1.419457183778286,
"ce_loss_7": 2.7598242580890657,
"epoch": 0.733,
"grad_norm": 14.107781745249417,
"kl_loss_13": 2087.4,
"kl_loss_26": 1001.6,
"kl_loss_39": 618.5,
"kl_loss_7": 2774.8,
"learning_rate": 0.00016898716519459073,
"loss": 3316.4,
"step": 7330
},
{
"ce_loss_13": 2.4717041492462157,
"ce_loss_26": 1.9320402562618255,
"ce_loss_39": 1.730445721745491,
"ce_loss_52": 1.3991459339857102,
"ce_loss_7": 2.811204159259796,
"epoch": 0.734,
"grad_norm": 14.159198716486541,
"kl_loss_13": 2200.8,
"kl_loss_26": 1069.2,
"kl_loss_39": 657.15,
"kl_loss_7": 2902.8,
"learning_rate": 0.00016779965984135375,
"loss": 3266.3,
"step": 7340
},
{
"ce_loss_13": 2.4648273169994352,
"ce_loss_26": 1.9446689933538437,
"ce_loss_39": 1.7665399879217147,
"ce_loss_52": 1.4552808463573457,
"ce_loss_7": 2.7818954586982727,
"epoch": 0.735,
"grad_norm": 13.974138676843918,
"kl_loss_13": 2093.0,
"kl_loss_26": 999.2,
"kl_loss_39": 622.1,
"kl_loss_7": 2762.4,
"learning_rate": 0.00016661549974185424,
"loss": 3232.6,
"step": 7350
},
{
"ce_loss_13": 2.497272843122482,
"ce_loss_26": 1.9733565777540207,
"ce_loss_39": 1.7902508676052094,
"ce_loss_52": 1.4602745115756988,
"ce_loss_7": 2.8227945923805238,
"epoch": 0.736,
"grad_norm": 15.105414614283358,
"kl_loss_13": 2153.6,
"kl_loss_26": 1043.2,
"kl_loss_39": 652.45,
"kl_loss_7": 2843.6,
"learning_rate": 0.00016543469682057105,
"loss": 3314.1,
"step": 7360
},
{
"ce_loss_13": 2.4817920327186584,
"ce_loss_26": 1.9674188673496247,
"ce_loss_39": 1.788041964173317,
"ce_loss_52": 1.4778558552265166,
"ce_loss_7": 2.801541256904602,
"epoch": 0.737,
"grad_norm": 14.089468466172162,
"kl_loss_13": 2075.4,
"kl_loss_26": 985.2,
"kl_loss_39": 606.6,
"kl_loss_7": 2750.8,
"learning_rate": 0.00016425726296817632,
"loss": 3279.5,
"step": 7370
},
{
"ce_loss_13": 2.4628233551979064,
"ce_loss_26": 1.944148001074791,
"ce_loss_39": 1.7584406644105912,
"ce_loss_52": 1.440820676088333,
"ce_loss_7": 2.7982202231884004,
"epoch": 0.738,
"grad_norm": 14.250790129395915,
"kl_loss_13": 2096.0,
"kl_loss_26": 994.4,
"kl_loss_39": 612.4,
"kl_loss_7": 2800.4,
"learning_rate": 0.00016308321004141607,
"loss": 3270.5,
"step": 7380
},
{
"ce_loss_13": 2.4311512380838396,
"ce_loss_26": 1.910204255580902,
"ce_loss_39": 1.7292486786842347,
"ce_loss_52": 1.4260056450963021,
"ce_loss_7": 2.7644225537776945,
"epoch": 0.739,
"grad_norm": 14.26013452282849,
"kl_loss_13": 2064.2,
"kl_loss_26": 971.8,
"kl_loss_39": 596.1,
"kl_loss_7": 2766.0,
"learning_rate": 0.00016191254986299043,
"loss": 3267.55,
"step": 7390
},
{
"ce_loss_13": 2.3748191058635713,
"ce_loss_26": 1.8720220893621444,
"ce_loss_39": 1.695397737622261,
"ce_loss_52": 1.3954048216342927,
"ce_loss_7": 2.6974743723869326,
"epoch": 0.74,
"grad_norm": 14.042172223471859,
"kl_loss_13": 2036.0,
"kl_loss_26": 961.8,
"kl_loss_39": 589.15,
"kl_loss_7": 2719.6,
"learning_rate": 0.00016074529422143398,
"loss": 3237.3,
"step": 7400
},
{
"ce_loss_13": 2.504778665304184,
"ce_loss_26": 1.9736295342445374,
"ce_loss_39": 1.778898686170578,
"ce_loss_52": 1.458841660618782,
"ce_loss_7": 2.830203241109848,
"epoch": 0.741,
"grad_norm": 14.817704298873846,
"kl_loss_13": 2137.6,
"kl_loss_26": 1026.7,
"kl_loss_39": 628.9,
"kl_loss_7": 2830.4,
"learning_rate": 0.0001595814548709983,
"loss": 3256.85,
"step": 7410
},
{
"ce_loss_13": 2.457485032081604,
"ce_loss_26": 1.955030158162117,
"ce_loss_39": 1.7744358479976654,
"ce_loss_52": 1.4638055652379989,
"ce_loss_7": 2.7708797633647917,
"epoch": 0.742,
"grad_norm": 13.847929994452544,
"kl_loss_13": 2053.2,
"kl_loss_26": 989.3,
"kl_loss_39": 610.65,
"kl_loss_7": 2726.4,
"learning_rate": 0.00015842104353153285,
"loss": 3240.25,
"step": 7420
},
{
"ce_loss_13": 2.5232761919498445,
"ce_loss_26": 1.9848549604415893,
"ce_loss_39": 1.7907847046852112,
"ce_loss_52": 1.473637193441391,
"ce_loss_7": 2.8558732092380525,
"epoch": 0.743,
"grad_norm": 14.575648272616709,
"kl_loss_13": 2149.6,
"kl_loss_26": 1018.4,
"kl_loss_39": 623.4,
"kl_loss_7": 2838.8,
"learning_rate": 0.0001572640718883667,
"loss": 3254.8,
"step": 7430
},
{
"ce_loss_13": 2.4647684305906297,
"ce_loss_26": 1.9386949807405471,
"ce_loss_39": 1.7500061064958572,
"ce_loss_52": 1.4324503019452095,
"ce_loss_7": 2.7884095788002012,
"epoch": 0.744,
"grad_norm": 14.394764150644365,
"kl_loss_13": 2108.4,
"kl_loss_26": 1000.0,
"kl_loss_39": 616.55,
"kl_loss_7": 2791.2,
"learning_rate": 0.0001561105515921915,
"loss": 3224.3,
"step": 7440
},
{
"ce_loss_13": 2.441458174586296,
"ce_loss_26": 1.9374703764915466,
"ce_loss_39": 1.7534300208091735,
"ce_loss_52": 1.4378804206848144,
"ce_loss_7": 2.7636309385299684,
"epoch": 0.745,
"grad_norm": 14.678295349282738,
"kl_loss_13": 2068.2,
"kl_loss_26": 997.1,
"kl_loss_39": 620.85,
"kl_loss_7": 2745.6,
"learning_rate": 0.0001549604942589441,
"loss": 3227.25,
"step": 7450
},
{
"ce_loss_13": 2.4308183819055555,
"ce_loss_26": 1.9100747764110566,
"ce_loss_39": 1.7246440201997757,
"ce_loss_52": 1.409556159377098,
"ce_loss_7": 2.7695399791002275,
"epoch": 0.746,
"grad_norm": 14.694656979242655,
"kl_loss_13": 2094.8,
"kl_loss_26": 993.0,
"kl_loss_39": 612.55,
"kl_loss_7": 2804.8,
"learning_rate": 0.00015381391146968864,
"loss": 3249.4,
"step": 7460
},
{
"ce_loss_13": 2.462578612565994,
"ce_loss_26": 1.940753996372223,
"ce_loss_39": 1.7554692894220352,
"ce_loss_52": 1.441029006242752,
"ce_loss_7": 2.785427051782608,
"epoch": 0.747,
"grad_norm": 14.412450315252437,
"kl_loss_13": 2103.6,
"kl_loss_26": 1003.3,
"kl_loss_39": 620.4,
"kl_loss_7": 2793.2,
"learning_rate": 0.00015267081477050133,
"loss": 3242.1,
"step": 7470
},
{
"ce_loss_13": 2.436378574371338,
"ce_loss_26": 1.9284409761428833,
"ce_loss_39": 1.7525635540485383,
"ce_loss_52": 1.443886636197567,
"ce_loss_7": 2.762845513224602,
"epoch": 0.748,
"grad_norm": 14.082696745240801,
"kl_loss_13": 2056.6,
"kl_loss_26": 983.1,
"kl_loss_39": 607.7,
"kl_loss_7": 2742.6,
"learning_rate": 0.00015153121567235335,
"loss": 3260.75,
"step": 7480
},
{
"ce_loss_13": 2.4219354510307314,
"ce_loss_26": 1.9070833683013917,
"ce_loss_39": 1.727812445163727,
"ce_loss_52": 1.4214952304959296,
"ce_loss_7": 2.751905006170273,
"epoch": 0.749,
"grad_norm": 14.604071674011259,
"kl_loss_13": 2073.4,
"kl_loss_26": 983.2,
"kl_loss_39": 609.55,
"kl_loss_7": 2766.8,
"learning_rate": 0.00015039512565099468,
"loss": 3240.15,
"step": 7490
},
{
"ce_loss_13": 2.4254602432250976,
"ce_loss_26": 1.9135964632034301,
"ce_loss_39": 1.7295757800340652,
"ce_loss_52": 1.423691214621067,
"ce_loss_7": 2.7512109965085982,
"epoch": 0.75,
"grad_norm": 13.87053452241645,
"kl_loss_13": 2059.4,
"kl_loss_26": 978.6,
"kl_loss_39": 597.45,
"kl_loss_7": 2750.4,
"learning_rate": 0.00014926255614683932,
"loss": 3260.75,
"step": 7500
},
{
"ce_loss_13": 2.44639810025692,
"ce_loss_26": 1.93405482172966,
"ce_loss_39": 1.7547091454267503,
"ce_loss_52": 1.4416350960731505,
"ce_loss_7": 2.768189311027527,
"epoch": 0.751,
"grad_norm": 14.071002297078877,
"kl_loss_13": 2088.8,
"kl_loss_26": 995.6,
"kl_loss_39": 616.7,
"kl_loss_7": 2768.8,
"learning_rate": 0.0001481335185648498,
"loss": 3269.45,
"step": 7510
},
{
"ce_loss_13": 2.496452784538269,
"ce_loss_26": 1.9704186409711837,
"ce_loss_39": 1.784249845147133,
"ce_loss_52": 1.4717927530407906,
"ce_loss_7": 2.8235138654708862,
"epoch": 0.752,
"grad_norm": 14.017187066675143,
"kl_loss_13": 2081.6,
"kl_loss_26": 989.5,
"kl_loss_39": 608.0,
"kl_loss_7": 2766.4,
"learning_rate": 0.0001470080242744218,
"loss": 3222.85,
"step": 7520
},
{
"ce_loss_13": 2.4962650299072267,
"ce_loss_26": 1.98213948905468,
"ce_loss_39": 1.7954594939947128,
"ce_loss_52": 1.480476987361908,
"ce_loss_7": 2.827346932888031,
"epoch": 0.753,
"grad_norm": 14.186670012527646,
"kl_loss_13": 2094.8,
"kl_loss_26": 998.5,
"kl_loss_39": 611.15,
"kl_loss_7": 2785.6,
"learning_rate": 0.0001458860846092705,
"loss": 3232.0,
"step": 7530
},
{
"ce_loss_13": 2.4669371783733367,
"ce_loss_26": 1.9375512719154357,
"ce_loss_39": 1.7506729423999787,
"ce_loss_52": 1.4334673672914504,
"ce_loss_7": 2.7915061593055723,
"epoch": 0.754,
"grad_norm": 14.32315966365105,
"kl_loss_13": 2109.0,
"kl_loss_26": 996.8,
"kl_loss_39": 613.7,
"kl_loss_7": 2794.0,
"learning_rate": 0.00014476771086731566,
"loss": 3264.6,
"step": 7540
},
{
"ce_loss_13": 2.4759989261627195,
"ce_loss_26": 1.9555140793323518,
"ce_loss_39": 1.7739178657531738,
"ce_loss_52": 1.4679069191217422,
"ce_loss_7": 2.795573103427887,
"epoch": 0.755,
"grad_norm": 14.011488201466985,
"kl_loss_13": 2070.6,
"kl_loss_26": 974.7,
"kl_loss_39": 593.9,
"kl_loss_7": 2754.8,
"learning_rate": 0.00014365291431056872,
"loss": 3256.8,
"step": 7550
},
{
"ce_loss_13": 2.424694412946701,
"ce_loss_26": 1.903841146826744,
"ce_loss_39": 1.7249469131231308,
"ce_loss_52": 1.4185307189822196,
"ce_loss_7": 2.7595690310001375,
"epoch": 0.756,
"grad_norm": 14.79387989637837,
"kl_loss_13": 2093.2,
"kl_loss_26": 989.9,
"kl_loss_39": 610.5,
"kl_loss_7": 2791.6,
"learning_rate": 0.00014254170616501827,
"loss": 3235.5,
"step": 7560
},
{
"ce_loss_13": 2.4660239934921266,
"ce_loss_26": 1.9451529324054717,
"ce_loss_39": 1.7564547389745713,
"ce_loss_52": 1.4434847444295884,
"ce_loss_7": 2.7976350009441378,
"epoch": 0.757,
"grad_norm": 14.844517268447264,
"kl_loss_13": 2085.6,
"kl_loss_26": 981.2,
"kl_loss_39": 602.45,
"kl_loss_7": 2776.4,
"learning_rate": 0.0001414340976205183,
"loss": 3204.2,
"step": 7570
},
{
"ce_loss_13": 2.4295350134372713,
"ce_loss_26": 1.921643227338791,
"ce_loss_39": 1.738620987534523,
"ce_loss_52": 1.4355527609586716,
"ce_loss_7": 2.7507594347000124,
"epoch": 0.758,
"grad_norm": 14.398235424743639,
"kl_loss_13": 2045.8,
"kl_loss_26": 965.6,
"kl_loss_39": 594.6,
"kl_loss_7": 2732.4,
"learning_rate": 0.00014033009983067452,
"loss": 3240.7,
"step": 7580
},
{
"ce_loss_13": 2.4676227152347563,
"ce_loss_26": 1.9401549130678177,
"ce_loss_39": 1.7562287330627442,
"ce_loss_52": 1.4347741633653641,
"ce_loss_7": 2.8093821585178373,
"epoch": 0.759,
"grad_norm": 13.736433284616705,
"kl_loss_13": 2138.8,
"kl_loss_26": 1028.1,
"kl_loss_39": 641.7,
"kl_loss_7": 2851.2,
"learning_rate": 0.00013922972391273224,
"loss": 3240.15,
"step": 7590
},
{
"ce_loss_13": 2.491154599189758,
"ce_loss_26": 1.9668046951293945,
"ce_loss_39": 1.7748177736997603,
"ce_loss_52": 1.4488209426403045,
"ce_loss_7": 2.8212892413139343,
"epoch": 0.76,
"grad_norm": 14.65229593288616,
"kl_loss_13": 2140.6,
"kl_loss_26": 1032.9,
"kl_loss_39": 637.7,
"kl_loss_7": 2829.6,
"learning_rate": 0.0001381329809474649,
"loss": 3239.9,
"step": 7600
},
{
"ce_loss_13": 2.3942853659391403,
"ce_loss_26": 1.892151090502739,
"ce_loss_39": 1.7125076562166215,
"ce_loss_52": 1.4143452048301697,
"ce_loss_7": 2.720611757040024,
"epoch": 0.761,
"grad_norm": 13.295903354405345,
"kl_loss_13": 2008.0,
"kl_loss_26": 952.3,
"kl_loss_39": 583.5,
"kl_loss_7": 2686.4,
"learning_rate": 0.0001370398819790621,
"loss": 3228.6,
"step": 7610
},
{
"ce_loss_13": 2.48261901140213,
"ce_loss_26": 1.966281446814537,
"ce_loss_39": 1.7803379833698272,
"ce_loss_52": 1.4673886984586715,
"ce_loss_7": 2.805577594041824,
"epoch": 0.762,
"grad_norm": 14.322747311567188,
"kl_loss_13": 2093.4,
"kl_loss_26": 1000.3,
"kl_loss_39": 610.3,
"kl_loss_7": 2776.0,
"learning_rate": 0.00013595043801501794,
"loss": 3201.5,
"step": 7620
},
{
"ce_loss_13": 2.443099784851074,
"ce_loss_26": 1.9284921824932098,
"ce_loss_39": 1.7422718316316606,
"ce_loss_52": 1.435012650489807,
"ce_loss_7": 2.7720457434654238,
"epoch": 0.763,
"grad_norm": 14.405471822802745,
"kl_loss_13": 2082.6,
"kl_loss_26": 994.1,
"kl_loss_39": 608.55,
"kl_loss_7": 2773.2,
"learning_rate": 0.00013486466002602133,
"loss": 3225.725,
"step": 7630
},
{
"ce_loss_13": 2.37467542886734,
"ce_loss_26": 1.8506677508354188,
"ce_loss_39": 1.6710956811904907,
"ce_loss_52": 1.3840662211179733,
"ce_loss_7": 2.7066759169101715,
"epoch": 0.764,
"grad_norm": 13.948958944433121,
"kl_loss_13": 2038.6,
"kl_loss_26": 946.7,
"kl_loss_39": 574.05,
"kl_loss_7": 2722.4,
"learning_rate": 0.00013378255894584462,
"loss": 3167.8,
"step": 7640
},
{
"ce_loss_13": 2.446861132979393,
"ce_loss_26": 1.934667894244194,
"ce_loss_39": 1.7494839936494828,
"ce_loss_52": 1.4413674265146255,
"ce_loss_7": 2.7717737197875976,
"epoch": 0.765,
"grad_norm": 14.489695554621445,
"kl_loss_13": 2087.6,
"kl_loss_26": 996.2,
"kl_loss_39": 612.05,
"kl_loss_7": 2779.4,
"learning_rate": 0.0001327041456712334,
"loss": 3229.05,
"step": 7650
},
{
"ce_loss_13": 2.514678430557251,
"ce_loss_26": 1.9890475004911423,
"ce_loss_39": 1.8049181282520295,
"ce_loss_52": 1.4897184193134307,
"ce_loss_7": 2.8373226463794707,
"epoch": 0.766,
"grad_norm": 13.809410965696319,
"kl_loss_13": 2109.8,
"kl_loss_26": 1013.7,
"kl_loss_39": 623.8,
"kl_loss_7": 2784.8,
"learning_rate": 0.00013162943106179747,
"loss": 3248.2,
"step": 7660
},
{
"ce_loss_13": 2.4804063200950623,
"ce_loss_26": 1.9478756994009019,
"ce_loss_39": 1.765100008249283,
"ce_loss_52": 1.446313591301441,
"ce_loss_7": 2.814295369386673,
"epoch": 0.767,
"grad_norm": 14.599429508154355,
"kl_loss_13": 2147.0,
"kl_loss_26": 1023.2,
"kl_loss_39": 636.45,
"kl_loss_7": 2854.0,
"learning_rate": 0.00013055842593990132,
"loss": 3217.4,
"step": 7670
},
{
"ce_loss_13": 2.4887916058301927,
"ce_loss_26": 1.9737455695867538,
"ce_loss_39": 1.7851827770471573,
"ce_loss_52": 1.4613285958766937,
"ce_loss_7": 2.8164610981941225,
"epoch": 0.768,
"grad_norm": 14.229376114254006,
"kl_loss_13": 2127.6,
"kl_loss_26": 1032.8,
"kl_loss_39": 640.5,
"kl_loss_7": 2811.2,
"learning_rate": 0.00012949114109055414,
"loss": 3223.675,
"step": 7680
},
{
"ce_loss_13": 2.389929732680321,
"ce_loss_26": 1.8889827966690063,
"ce_loss_39": 1.7090455144643784,
"ce_loss_52": 1.4120649307966233,
"ce_loss_7": 2.710143965482712,
"epoch": 0.769,
"grad_norm": 13.823270022584358,
"kl_loss_13": 2025.6,
"kl_loss_26": 958.6,
"kl_loss_39": 589.95,
"kl_loss_7": 2700.0,
"learning_rate": 0.00012842758726130281,
"loss": 3247.75,
"step": 7690
},
{
"ce_loss_13": 2.444611003994942,
"ce_loss_26": 1.9318826824426651,
"ce_loss_39": 1.7527276873588562,
"ce_loss_52": 1.4516576603055,
"ce_loss_7": 2.7657779157161713,
"epoch": 0.77,
"grad_norm": 14.431273089148151,
"kl_loss_13": 2049.2,
"kl_loss_26": 963.2,
"kl_loss_39": 589.25,
"kl_loss_7": 2726.0,
"learning_rate": 0.00012736777516212267,
"loss": 3216.75,
"step": 7700
},
{
"ce_loss_13": 2.441952568292618,
"ce_loss_26": 1.9228222370147705,
"ce_loss_39": 1.7338123947381974,
"ce_loss_52": 1.4193892806768418,
"ce_loss_7": 2.7718379318714144,
"epoch": 0.771,
"grad_norm": 13.661830756949985,
"kl_loss_13": 2115.2,
"kl_loss_26": 1008.4,
"kl_loss_39": 620.0,
"kl_loss_7": 2804.8,
"learning_rate": 0.00012631171546530968,
"loss": 3199.55,
"step": 7710
},
{
"ce_loss_13": 2.4535767167806624,
"ce_loss_26": 1.9207569301128387,
"ce_loss_39": 1.7325547844171525,
"ce_loss_52": 1.4173025369644165,
"ce_loss_7": 2.7801734030246736,
"epoch": 0.772,
"grad_norm": 14.176576196561767,
"kl_loss_13": 2111.8,
"kl_loss_26": 1005.2,
"kl_loss_39": 618.15,
"kl_loss_7": 2793.6,
"learning_rate": 0.00012525941880537307,
"loss": 3214.15,
"step": 7720
},
{
"ce_loss_13": 2.4484546184539795,
"ce_loss_26": 1.9310883104801178,
"ce_loss_39": 1.7474435329437257,
"ce_loss_52": 1.4392137452960014,
"ce_loss_7": 2.779611772298813,
"epoch": 0.773,
"grad_norm": 14.626780180795521,
"kl_loss_13": 2095.4,
"kl_loss_26": 994.8,
"kl_loss_39": 607.9,
"kl_loss_7": 2786.8,
"learning_rate": 0.00012421089577892869,
"loss": 3191.6,
"step": 7730
},
{
"ce_loss_13": 2.463806739449501,
"ce_loss_26": 1.9203673034906388,
"ce_loss_39": 1.7268804877996444,
"ce_loss_52": 1.4067875519394875,
"ce_loss_7": 2.797054660320282,
"epoch": 0.774,
"grad_norm": 14.221427151080144,
"kl_loss_13": 2151.0,
"kl_loss_26": 1032.4,
"kl_loss_39": 637.25,
"kl_loss_7": 2842.0,
"learning_rate": 0.0001231661569445919,
"loss": 3214.8,
"step": 7740
},
{
"ce_loss_13": 2.4840691089630127,
"ce_loss_26": 1.9805045217275619,
"ce_loss_39": 1.7966417849063874,
"ce_loss_52": 1.4883142501115798,
"ce_loss_7": 2.8029735326766967,
"epoch": 0.775,
"grad_norm": 14.614162489546528,
"kl_loss_13": 2069.8,
"kl_loss_26": 990.4,
"kl_loss_39": 609.05,
"kl_loss_7": 2743.2,
"learning_rate": 0.00012212521282287093,
"loss": 3200.5,
"step": 7750
},
{
"ce_loss_13": 2.4842973172664644,
"ce_loss_26": 1.9586603373289109,
"ce_loss_39": 1.767923679947853,
"ce_loss_52": 1.450934961438179,
"ce_loss_7": 2.815416473150253,
"epoch": 0.776,
"grad_norm": 14.872662321169154,
"kl_loss_13": 2137.2,
"kl_loss_26": 1028.5,
"kl_loss_39": 636.95,
"kl_loss_7": 2826.4,
"learning_rate": 0.00012108807389606158,
"loss": 3221.25,
"step": 7760
},
{
"ce_loss_13": 2.430084604024887,
"ce_loss_26": 1.9105432122945785,
"ce_loss_39": 1.7237805485725404,
"ce_loss_52": 1.419256439805031,
"ce_loss_7": 2.7609162449836733,
"epoch": 0.777,
"grad_norm": 14.122349060255786,
"kl_loss_13": 2075.2,
"kl_loss_26": 984.9,
"kl_loss_39": 602.95,
"kl_loss_7": 2769.2,
"learning_rate": 0.00012005475060814159,
"loss": 3219.35,
"step": 7770
},
{
"ce_loss_13": 2.4920803755521774,
"ce_loss_26": 1.977930763363838,
"ce_loss_39": 1.792539432644844,
"ce_loss_52": 1.484375348687172,
"ce_loss_7": 2.818430072069168,
"epoch": 0.778,
"grad_norm": 14.838228117967187,
"kl_loss_13": 2079.2,
"kl_loss_26": 984.5,
"kl_loss_39": 603.55,
"kl_loss_7": 2766.8,
"learning_rate": 0.00011902525336466464,
"loss": 3193.3,
"step": 7780
},
{
"ce_loss_13": 2.4326795816421507,
"ce_loss_26": 1.9094915211200714,
"ce_loss_39": 1.7265714228153228,
"ce_loss_52": 1.4181353628635407,
"ce_loss_7": 2.757854151725769,
"epoch": 0.779,
"grad_norm": 13.98078513544715,
"kl_loss_13": 2078.4,
"kl_loss_26": 991.0,
"kl_loss_39": 610.15,
"kl_loss_7": 2758.8,
"learning_rate": 0.00011799959253265668,
"loss": 3208.85,
"step": 7790
},
{
"ce_loss_13": 2.42523156106472,
"ce_loss_26": 1.9246951520442963,
"ce_loss_39": 1.7428549587726594,
"ce_loss_52": 1.4467457503080368,
"ce_loss_7": 2.7435911536216735,
"epoch": 0.78,
"grad_norm": 13.811296718067993,
"kl_loss_13": 2022.0,
"kl_loss_26": 950.5,
"kl_loss_39": 578.7,
"kl_loss_7": 2701.6,
"learning_rate": 0.00011697777844051105,
"loss": 3204.7,
"step": 7800
},
{
"ce_loss_13": 2.496878683567047,
"ce_loss_26": 1.9677572190761565,
"ce_loss_39": 1.7739976853132249,
"ce_loss_52": 1.459720864892006,
"ce_loss_7": 2.8327562749385833,
"epoch": 0.781,
"grad_norm": 14.08625224164901,
"kl_loss_13": 2120.4,
"kl_loss_26": 1008.7,
"kl_loss_39": 615.4,
"kl_loss_7": 2822.4,
"learning_rate": 0.00011595982137788402,
"loss": 3198.55,
"step": 7810
},
{
"ce_loss_13": 2.4920260161161423,
"ce_loss_26": 1.9670526027679442,
"ce_loss_39": 1.783473041653633,
"ce_loss_52": 1.4704290598630905,
"ce_loss_7": 2.8202459871768952,
"epoch": 0.782,
"grad_norm": 14.200442027165447,
"kl_loss_13": 2103.8,
"kl_loss_26": 1004.1,
"kl_loss_39": 624.45,
"kl_loss_7": 2800.4,
"learning_rate": 0.00011494573159559212,
"loss": 3223.6,
"step": 7820
},
{
"ce_loss_13": 2.4327739059925078,
"ce_loss_26": 1.9302410751581192,
"ce_loss_39": 1.7492202669382095,
"ce_loss_52": 1.4421478152275085,
"ce_loss_7": 2.7496786177158357,
"epoch": 0.783,
"grad_norm": 13.561882382659508,
"kl_loss_13": 2046.4,
"kl_loss_26": 985.3,
"kl_loss_39": 604.05,
"kl_loss_7": 2712.8,
"learning_rate": 0.00011393551930550828,
"loss": 3172.1,
"step": 7830
},
{
"ce_loss_13": 2.456186518073082,
"ce_loss_26": 1.9265149384737015,
"ce_loss_39": 1.7347608864307404,
"ce_loss_52": 1.4248275607824326,
"ce_loss_7": 2.7833105325698853,
"epoch": 0.784,
"grad_norm": 14.152082617728679,
"kl_loss_13": 2109.0,
"kl_loss_26": 1006.1,
"kl_loss_39": 611.45,
"kl_loss_7": 2800.8,
"learning_rate": 0.00011292919468045875,
"loss": 3208.05,
"step": 7840
},
{
"ce_loss_13": 2.4463070958852766,
"ce_loss_26": 1.9245142936706543,
"ce_loss_39": 1.736380136013031,
"ce_loss_52": 1.4392430812120438,
"ce_loss_7": 2.776514196395874,
"epoch": 0.785,
"grad_norm": 13.366541015007158,
"kl_loss_13": 2080.4,
"kl_loss_26": 983.3,
"kl_loss_39": 597.9,
"kl_loss_7": 2776.4,
"learning_rate": 0.00011192676785412154,
"loss": 3185.35,
"step": 7850
},
{
"ce_loss_13": 2.421262636780739,
"ce_loss_26": 1.9249890923500061,
"ce_loss_39": 1.7500147104263306,
"ce_loss_52": 1.4561963319778441,
"ce_loss_7": 2.741842967271805,
"epoch": 0.786,
"grad_norm": 15.122779815741898,
"kl_loss_13": 1987.4,
"kl_loss_26": 937.4,
"kl_loss_39": 570.05,
"kl_loss_7": 2649.2,
"learning_rate": 0.00011092824892092374,
"loss": 3155.1,
"step": 7860
},
{
"ce_loss_13": 2.528256595134735,
"ce_loss_26": 2.005132633447647,
"ce_loss_39": 1.8220074683427812,
"ce_loss_52": 1.49842167198658,
"ce_loss_7": 2.8524305701255797,
"epoch": 0.787,
"grad_norm": 14.392088598180058,
"kl_loss_13": 2135.0,
"kl_loss_26": 1024.4,
"kl_loss_39": 633.4,
"kl_loss_7": 2828.0,
"learning_rate": 0.0001099336479359398,
"loss": 3228.45,
"step": 7870
},
{
"ce_loss_13": 2.4351921498775484,
"ce_loss_26": 1.9118896454572678,
"ce_loss_39": 1.7326159566640853,
"ce_loss_52": 1.43462935090065,
"ce_loss_7": 2.7571564972400666,
"epoch": 0.788,
"grad_norm": 14.109231297319436,
"kl_loss_13": 2083.4,
"kl_loss_26": 971.8,
"kl_loss_39": 593.25,
"kl_loss_7": 2767.2,
"learning_rate": 0.00010894297491479043,
"loss": 3224.35,
"step": 7880
},
{
"ce_loss_13": 2.4037249386310577,
"ce_loss_26": 1.8971479564905167,
"ce_loss_39": 1.713542652130127,
"ce_loss_52": 1.416736051440239,
"ce_loss_7": 2.7244735300540923,
"epoch": 0.789,
"grad_norm": 14.646464194445937,
"kl_loss_13": 2027.0,
"kl_loss_26": 959.6,
"kl_loss_39": 587.05,
"kl_loss_7": 2694.8,
"learning_rate": 0.00010795623983354214,
"loss": 3163.9,
"step": 7890
},
{
"ce_loss_13": 2.462240958213806,
"ce_loss_26": 1.9522877007722854,
"ce_loss_39": 1.7622255086898804,
"ce_loss_52": 1.447747752070427,
"ce_loss_7": 2.781180214881897,
"epoch": 0.79,
"grad_norm": 14.486453301055112,
"kl_loss_13": 2093.8,
"kl_loss_26": 999.0,
"kl_loss_39": 616.7,
"kl_loss_7": 2768.0,
"learning_rate": 0.00010697345262860636,
"loss": 3189.25,
"step": 7900
},
{
"ce_loss_13": 2.443204700946808,
"ce_loss_26": 1.9252238601446152,
"ce_loss_39": 1.7538990557193757,
"ce_loss_52": 1.4549198508262635,
"ce_loss_7": 2.762816107273102,
"epoch": 0.791,
"grad_norm": 14.756973656564183,
"kl_loss_13": 2043.6,
"kl_loss_26": 953.1,
"kl_loss_39": 582.75,
"kl_loss_7": 2730.4,
"learning_rate": 0.00010599462319663906,
"loss": 3189.25,
"step": 7910
},
{
"ce_loss_13": 2.4734450757503508,
"ce_loss_26": 1.9476019829511642,
"ce_loss_39": 1.7569423377513886,
"ce_loss_52": 1.448093169927597,
"ce_loss_7": 2.7963216602802277,
"epoch": 0.792,
"grad_norm": 14.148651159008981,
"kl_loss_13": 2096.8,
"kl_loss_26": 997.8,
"kl_loss_39": 608.35,
"kl_loss_7": 2784.0,
"learning_rate": 0.00010501976139444191,
"loss": 3199.1,
"step": 7920
},
{
"ce_loss_13": 2.460918265581131,
"ce_loss_26": 1.9421131610870361,
"ce_loss_39": 1.7556595474481582,
"ce_loss_52": 1.4451121121644974,
"ce_loss_7": 2.7843388080596925,
"epoch": 0.793,
"grad_norm": 14.590509352597412,
"kl_loss_13": 2099.4,
"kl_loss_26": 995.3,
"kl_loss_39": 605.2,
"kl_loss_7": 2780.8,
"learning_rate": 0.0001040488770388625,
"loss": 3203.35,
"step": 7930
},
{
"ce_loss_13": 2.379360908269882,
"ce_loss_26": 1.8618569314479827,
"ce_loss_39": 1.6797795861959457,
"ce_loss_52": 1.3791978135704994,
"ce_loss_7": 2.708191817998886,
"epoch": 0.794,
"grad_norm": 14.16954347608881,
"kl_loss_13": 2053.0,
"kl_loss_26": 969.8,
"kl_loss_39": 592.9,
"kl_loss_7": 2740.8,
"learning_rate": 0.00010308197990669538,
"loss": 3181.45,
"step": 7940
},
{
"ce_loss_13": 2.4218691647052766,
"ce_loss_26": 1.9039832711219788,
"ce_loss_39": 1.7168581753969192,
"ce_loss_52": 1.414185357093811,
"ce_loss_7": 2.7429304718971252,
"epoch": 0.795,
"grad_norm": 13.813266382907072,
"kl_loss_13": 2088.6,
"kl_loss_26": 990.3,
"kl_loss_39": 604.5,
"kl_loss_7": 2776.4,
"learning_rate": 0.0001021190797345839,
"loss": 3178.1,
"step": 7950
},
{
"ce_loss_13": 2.487806275486946,
"ce_loss_26": 1.9796326756477356,
"ce_loss_39": 1.7957882821559905,
"ce_loss_52": 1.484969075024128,
"ce_loss_7": 2.8111346662044525,
"epoch": 0.796,
"grad_norm": 14.164838891881972,
"kl_loss_13": 2047.4,
"kl_loss_26": 971.5,
"kl_loss_39": 591.45,
"kl_loss_7": 2717.2,
"learning_rate": 0.00010116018621892236,
"loss": 3174.95,
"step": 7960
},
{
"ce_loss_13": 2.398695731163025,
"ce_loss_26": 1.8858011841773987,
"ce_loss_39": 1.7020757973194123,
"ce_loss_52": 1.4106510564684869,
"ce_loss_7": 2.721856439113617,
"epoch": 0.797,
"grad_norm": 14.161604323369735,
"kl_loss_13": 2034.0,
"kl_loss_26": 953.8,
"kl_loss_39": 575.1,
"kl_loss_7": 2712.8,
"learning_rate": 0.00010020530901575753,
"loss": 3177.95,
"step": 7970
},
{
"ce_loss_13": 2.427662065625191,
"ce_loss_26": 1.921607220172882,
"ce_loss_39": 1.741393145918846,
"ce_loss_52": 1.438681322336197,
"ce_loss_7": 2.746969664096832,
"epoch": 0.798,
"grad_norm": 14.770550097448835,
"kl_loss_13": 2042.0,
"kl_loss_26": 965.9,
"kl_loss_39": 592.1,
"kl_loss_7": 2705.2,
"learning_rate": 9.925445774069231e-05,
"loss": 3170.6,
"step": 7980
},
{
"ce_loss_13": 2.4366293847560883,
"ce_loss_26": 1.9163338214159011,
"ce_loss_39": 1.7320117831230164,
"ce_loss_52": 1.4146809190511704,
"ce_loss_7": 2.772968965768814,
"epoch": 0.799,
"grad_norm": 13.97785133684068,
"kl_loss_13": 2098.2,
"kl_loss_26": 1008.8,
"kl_loss_39": 619.4,
"kl_loss_7": 2795.6,
"learning_rate": 9.830764196878872e-05,
"loss": 3210.25,
"step": 7990
},
{
"ce_loss_13": 2.519176536798477,
"ce_loss_26": 2.0015997767448424,
"ce_loss_39": 1.8160304486751557,
"ce_loss_52": 1.4840710669755937,
"ce_loss_7": 2.842372918128967,
"epoch": 0.8,
"grad_norm": 13.949092199595771,
"kl_loss_13": 2133.0,
"kl_loss_26": 1036.3,
"kl_loss_39": 652.95,
"kl_loss_7": 2818.8,
"learning_rate": 9.736487123447069e-05,
"loss": 3181.95,
"step": 8000
},
{
"ce_loss_13": 2.4726769655942915,
"ce_loss_26": 1.9741164237260818,
"ce_loss_39": 1.790221494436264,
"ce_loss_52": 1.482371485233307,
"ce_loss_7": 2.799379500746727,
"epoch": 0.801,
"grad_norm": 13.707660286112029,
"kl_loss_13": 2038.0,
"kl_loss_26": 977.9,
"kl_loss_39": 599.8,
"kl_loss_7": 2716.8,
"learning_rate": 9.642615503142926e-05,
"loss": 3173.65,
"step": 8010
},
{
"ce_loss_13": 2.4063422054052355,
"ce_loss_26": 1.904910859465599,
"ce_loss_39": 1.7236665695905686,
"ce_loss_52": 1.4340474352240562,
"ce_loss_7": 2.726147544384003,
"epoch": 0.802,
"grad_norm": 14.759158260471319,
"kl_loss_13": 2004.0,
"kl_loss_26": 947.5,
"kl_loss_39": 570.8,
"kl_loss_7": 2678.8,
"learning_rate": 9.549150281252633e-05,
"loss": 3210.3,
"step": 8020
},
{
"ce_loss_13": 2.439296191930771,
"ce_loss_26": 1.930625182390213,
"ce_loss_39": 1.7471411645412445,
"ce_loss_52": 1.4468423128128052,
"ce_loss_7": 2.759518486261368,
"epoch": 0.803,
"grad_norm": 14.138193136913905,
"kl_loss_13": 2029.8,
"kl_loss_26": 959.0,
"kl_loss_39": 580.8,
"kl_loss_7": 2710.0,
"learning_rate": 9.4560923989699e-05,
"loss": 3188.85,
"step": 8030
},
{
"ce_loss_13": 2.383785030245781,
"ce_loss_26": 1.8730993419885635,
"ce_loss_39": 1.689489060640335,
"ce_loss_52": 1.3906694814562797,
"ce_loss_7": 2.715326648950577,
"epoch": 0.804,
"grad_norm": 14.525642481529378,
"kl_loss_13": 2063.0,
"kl_loss_26": 966.1,
"kl_loss_39": 591.6,
"kl_loss_7": 2756.4,
"learning_rate": 9.363442793386607e-05,
"loss": 3171.0,
"step": 8040
},
{
"ce_loss_13": 2.436983805894852,
"ce_loss_26": 1.9301572561264038,
"ce_loss_39": 1.7478452265262603,
"ce_loss_52": 1.4431490540504455,
"ce_loss_7": 2.7593387603759765,
"epoch": 0.805,
"grad_norm": 14.17334080276183,
"kl_loss_13": 2038.0,
"kl_loss_26": 967.0,
"kl_loss_39": 590.6,
"kl_loss_7": 2714.4,
"learning_rate": 9.271202397483213e-05,
"loss": 3157.4,
"step": 8050
},
{
"ce_loss_13": 2.4742675691843035,
"ce_loss_26": 1.9660158514976502,
"ce_loss_39": 1.7886695712804794,
"ce_loss_52": 1.4793777346611023,
"ce_loss_7": 2.799517345428467,
"epoch": 0.806,
"grad_norm": 14.807557523499636,
"kl_loss_13": 2047.0,
"kl_loss_26": 977.2,
"kl_loss_39": 606.0,
"kl_loss_7": 2719.6,
"learning_rate": 9.179372140119524e-05,
"loss": 3197.5,
"step": 8060
},
{
"ce_loss_13": 2.404207941889763,
"ce_loss_26": 1.8871434926986694,
"ce_loss_39": 1.7082382440567017,
"ce_loss_52": 1.4106020584702492,
"ce_loss_7": 2.7318927943706512,
"epoch": 0.807,
"grad_norm": 14.241838480847589,
"kl_loss_13": 2054.2,
"kl_loss_26": 966.9,
"kl_loss_39": 589.9,
"kl_loss_7": 2745.6,
"learning_rate": 9.087952946025175e-05,
"loss": 3174.15,
"step": 8070
},
{
"ce_loss_13": 2.405471110343933,
"ce_loss_26": 1.9090194314718247,
"ce_loss_39": 1.7282894462347032,
"ce_loss_52": 1.4298398733139037,
"ce_loss_7": 2.7295302629470823,
"epoch": 0.808,
"grad_norm": 14.246105109452872,
"kl_loss_13": 2023.6,
"kl_loss_26": 956.4,
"kl_loss_39": 585.5,
"kl_loss_7": 2704.0,
"learning_rate": 8.996945735790446e-05,
"loss": 3220.95,
"step": 8080
},
{
"ce_loss_13": 2.4230452179908752,
"ce_loss_26": 1.9108994454145432,
"ce_loss_39": 1.7252773225307465,
"ce_loss_52": 1.4297530561685563,
"ce_loss_7": 2.7479528963565825,
"epoch": 0.809,
"grad_norm": 14.241539859828608,
"kl_loss_13": 2056.4,
"kl_loss_26": 974.5,
"kl_loss_39": 592.4,
"kl_loss_7": 2738.0,
"learning_rate": 8.906351425856951e-05,
"loss": 3187.2,
"step": 8090
},
{
"ce_loss_13": 2.5193986773490904,
"ce_loss_26": 2.004713475704193,
"ce_loss_39": 1.8214319556951524,
"ce_loss_52": 1.5133182466030122,
"ce_loss_7": 2.836967188119888,
"epoch": 0.81,
"grad_norm": 13.856573501498156,
"kl_loss_13": 2059.0,
"kl_loss_26": 983.5,
"kl_loss_39": 604.75,
"kl_loss_7": 2738.8,
"learning_rate": 8.816170928508365e-05,
"loss": 3199.5,
"step": 8100
},
{
"ce_loss_13": 2.463338887691498,
"ce_loss_26": 1.9509768843650819,
"ce_loss_39": 1.7714523404836655,
"ce_loss_52": 1.4671026438474655,
"ce_loss_7": 2.7907890677452087,
"epoch": 0.811,
"grad_norm": 14.442792993010306,
"kl_loss_13": 2039.6,
"kl_loss_26": 963.7,
"kl_loss_39": 594.85,
"kl_loss_7": 2727.2,
"learning_rate": 8.7264051518613e-05,
"loss": 3182.6,
"step": 8110
},
{
"ce_loss_13": 2.35435933470726,
"ce_loss_26": 1.8515879094600678,
"ce_loss_39": 1.675445196032524,
"ce_loss_52": 1.3868303269147872,
"ce_loss_7": 2.6736503660678865,
"epoch": 0.812,
"grad_norm": 15.186200546439618,
"kl_loss_13": 2007.0,
"kl_loss_26": 943.9,
"kl_loss_39": 575.7,
"kl_loss_7": 2679.2,
"learning_rate": 8.637054999856148e-05,
"loss": 3182.4,
"step": 8120
},
{
"ce_loss_13": 2.4665314495563506,
"ce_loss_26": 1.9507400870323182,
"ce_loss_39": 1.7698681026697158,
"ce_loss_52": 1.463287603855133,
"ce_loss_7": 2.790166562795639,
"epoch": 0.813,
"grad_norm": 14.702441450046226,
"kl_loss_13": 2081.6,
"kl_loss_26": 994.3,
"kl_loss_39": 606.6,
"kl_loss_7": 2757.2,
"learning_rate": 8.548121372247918e-05,
"loss": 3195.8,
"step": 8130
},
{
"ce_loss_13": 2.4175081342458724,
"ce_loss_26": 1.894961017370224,
"ce_loss_39": 1.7168689727783204,
"ce_loss_52": 1.4184614822268486,
"ce_loss_7": 2.74496705532074,
"epoch": 0.814,
"grad_norm": 14.150480690009331,
"kl_loss_13": 2042.4,
"kl_loss_26": 956.7,
"kl_loss_39": 584.15,
"kl_loss_7": 2725.2,
"learning_rate": 8.459605164597267e-05,
"loss": 3148.95,
"step": 8140
},
{
"ce_loss_13": 2.4137402385473252,
"ce_loss_26": 1.907509195804596,
"ce_loss_39": 1.7280682563781737,
"ce_loss_52": 1.4408938705921173,
"ce_loss_7": 2.734111136198044,
"epoch": 0.815,
"grad_norm": 14.607407219816483,
"kl_loss_13": 2031.8,
"kl_loss_26": 952.0,
"kl_loss_39": 576.85,
"kl_loss_7": 2711.2,
"learning_rate": 8.371507268261436e-05,
"loss": 3141.15,
"step": 8150
},
{
"ce_loss_13": 2.45993629693985,
"ce_loss_26": 1.9489454805850983,
"ce_loss_39": 1.7687684744596481,
"ce_loss_52": 1.4672614842653275,
"ce_loss_7": 2.7731840908527374,
"epoch": 0.816,
"grad_norm": 13.941573996490533,
"kl_loss_13": 2049.4,
"kl_loss_26": 971.7,
"kl_loss_39": 590.15,
"kl_loss_7": 2723.6,
"learning_rate": 8.283828570385238e-05,
"loss": 3167.65,
"step": 8160
},
{
"ce_loss_13": 2.457903391122818,
"ce_loss_26": 1.9481427311897277,
"ce_loss_39": 1.7692535519599915,
"ce_loss_52": 1.473311385512352,
"ce_loss_7": 2.7900636374950407,
"epoch": 0.817,
"grad_norm": 13.820607552825622,
"kl_loss_13": 2034.8,
"kl_loss_26": 960.7,
"kl_loss_39": 581.95,
"kl_loss_7": 2732.8,
"learning_rate": 8.196569953892202e-05,
"loss": 3175.55,
"step": 8170
},
{
"ce_loss_13": 2.424106791615486,
"ce_loss_26": 1.9179951936006545,
"ce_loss_39": 1.7393042415380477,
"ce_loss_52": 1.4501032710075379,
"ce_loss_7": 2.7422122418880464,
"epoch": 0.818,
"grad_norm": 13.95954126780279,
"kl_loss_13": 2017.4,
"kl_loss_26": 939.3,
"kl_loss_39": 570.5,
"kl_loss_7": 2684.4,
"learning_rate": 8.109732297475635e-05,
"loss": 3172.8,
"step": 8180
},
{
"ce_loss_13": 2.4480927348136903,
"ce_loss_26": 1.9403320997953415,
"ce_loss_39": 1.7644436001777648,
"ce_loss_52": 1.4613411754369736,
"ce_loss_7": 2.767207592725754,
"epoch": 0.819,
"grad_norm": 15.077790655269643,
"kl_loss_13": 2034.2,
"kl_loss_26": 959.3,
"kl_loss_39": 589.55,
"kl_loss_7": 2707.6,
"learning_rate": 8.023316475589754e-05,
"loss": 3151.8,
"step": 8190
},
{
"ce_loss_13": 2.389411324262619,
"ce_loss_26": 1.8819621950387955,
"ce_loss_39": 1.7063862174749374,
"ce_loss_52": 1.4187449038028717,
"ce_loss_7": 2.7186341762542723,
"epoch": 0.82,
"grad_norm": 14.069790186558153,
"kl_loss_13": 2015.6,
"kl_loss_26": 934.0,
"kl_loss_39": 570.4,
"kl_loss_7": 2711.2,
"learning_rate": 7.937323358440934e-05,
"loss": 3158.45,
"step": 8200
},
{
"ce_loss_13": 2.463495451211929,
"ce_loss_26": 1.9460216015577316,
"ce_loss_39": 1.7633485794067383,
"ce_loss_52": 1.459425413608551,
"ce_loss_7": 2.7843497574329374,
"epoch": 0.821,
"grad_norm": 14.129619264599885,
"kl_loss_13": 2034.2,
"kl_loss_26": 962.3,
"kl_loss_39": 588.55,
"kl_loss_7": 2710.0,
"learning_rate": 7.851753811978923e-05,
"loss": 3172.55,
"step": 8210
},
{
"ce_loss_13": 2.3558076560497283,
"ce_loss_26": 1.8546594500541687,
"ce_loss_39": 1.6799951493740082,
"ce_loss_52": 1.392863529920578,
"ce_loss_7": 2.6725959718227386,
"epoch": 0.822,
"grad_norm": 13.287512232663138,
"kl_loss_13": 1979.6,
"kl_loss_26": 919.4,
"kl_loss_39": 562.4,
"kl_loss_7": 2652.0,
"learning_rate": 7.766608697888095e-05,
"loss": 3151.0,
"step": 8220
},
{
"ce_loss_13": 2.4153092801570892,
"ce_loss_26": 1.9061992377042771,
"ce_loss_39": 1.7255131870508194,
"ce_loss_52": 1.4212424442172051,
"ce_loss_7": 2.748347020149231,
"epoch": 0.823,
"grad_norm": 14.43300896159423,
"kl_loss_13": 2076.6,
"kl_loss_26": 977.9,
"kl_loss_39": 599.3,
"kl_loss_7": 2767.2,
"learning_rate": 7.681888873578785e-05,
"loss": 3171.1,
"step": 8230
},
{
"ce_loss_13": 2.4028283417224885,
"ce_loss_26": 1.9037913769483565,
"ce_loss_39": 1.729577499628067,
"ce_loss_52": 1.4313182592391969,
"ce_loss_7": 2.7214892983436583,
"epoch": 0.824,
"grad_norm": 14.091655498494992,
"kl_loss_13": 1997.8,
"kl_loss_26": 945.5,
"kl_loss_39": 582.95,
"kl_loss_7": 2668.0,
"learning_rate": 7.597595192178702e-05,
"loss": 3129.45,
"step": 8240
},
{
"ce_loss_13": 2.385217198729515,
"ce_loss_26": 1.8788419783115387,
"ce_loss_39": 1.7016818612813949,
"ce_loss_52": 1.4092606633901597,
"ce_loss_7": 2.714561605453491,
"epoch": 0.825,
"grad_norm": 14.092565500396708,
"kl_loss_13": 2015.4,
"kl_loss_26": 950.7,
"kl_loss_39": 578.35,
"kl_loss_7": 2700.8,
"learning_rate": 7.513728502524286e-05,
"loss": 3103.55,
"step": 8250
},
{
"ce_loss_13": 2.411863788962364,
"ce_loss_26": 1.8953818708658219,
"ce_loss_39": 1.712015947699547,
"ce_loss_52": 1.4228723630309106,
"ce_loss_7": 2.7335788309574127,
"epoch": 0.826,
"grad_norm": 14.683616091837887,
"kl_loss_13": 2027.2,
"kl_loss_26": 946.2,
"kl_loss_39": 569.45,
"kl_loss_7": 2700.8,
"learning_rate": 7.430289649152156e-05,
"loss": 3186.45,
"step": 8260
},
{
"ce_loss_13": 2.4488519340753556,
"ce_loss_26": 1.9516287744045258,
"ce_loss_39": 1.7749818950891494,
"ce_loss_52": 1.479728889465332,
"ce_loss_7": 2.7712768018245697,
"epoch": 0.827,
"grad_norm": 13.955547255495208,
"kl_loss_13": 1991.0,
"kl_loss_26": 940.5,
"kl_loss_39": 573.4,
"kl_loss_7": 2658.4,
"learning_rate": 7.347279472290646e-05,
"loss": 3163.475,
"step": 8270
},
{
"ce_loss_13": 2.3786238610744475,
"ce_loss_26": 1.8683661013841628,
"ce_loss_39": 1.690899032354355,
"ce_loss_52": 1.4009160608053208,
"ce_loss_7": 2.705441731214523,
"epoch": 0.828,
"grad_norm": 14.039626698915084,
"kl_loss_13": 2016.2,
"kl_loss_26": 938.6,
"kl_loss_39": 571.15,
"kl_loss_7": 2694.4,
"learning_rate": 7.264698807851328e-05,
"loss": 3118.5,
"step": 8280
},
{
"ce_loss_13": 2.466960498690605,
"ce_loss_26": 1.946975302696228,
"ce_loss_39": 1.7688733905553817,
"ce_loss_52": 1.4637437134981155,
"ce_loss_7": 2.8006490588188173,
"epoch": 0.829,
"grad_norm": 14.106474347437752,
"kl_loss_13": 2090.4,
"kl_loss_26": 984.1,
"kl_loss_39": 609.1,
"kl_loss_7": 2790.4,
"learning_rate": 7.182548487420554e-05,
"loss": 3184.4,
"step": 8290
},
{
"ce_loss_13": 2.5116629540920257,
"ce_loss_26": 1.9985181391239166,
"ce_loss_39": 1.8124066442251205,
"ce_loss_52": 1.497176530957222,
"ce_loss_7": 2.8335696399211883,
"epoch": 0.83,
"grad_norm": 14.139665843791814,
"kl_loss_13": 2098.8,
"kl_loss_26": 1001.4,
"kl_loss_39": 614.35,
"kl_loss_7": 2780.6,
"learning_rate": 7.100829338251146e-05,
"loss": 3198.35,
"step": 8300
},
{
"ce_loss_13": 2.4611269533634186,
"ce_loss_26": 1.9527796864509583,
"ce_loss_39": 1.7704098969697952,
"ce_loss_52": 1.462582492828369,
"ce_loss_7": 2.7841490387916563,
"epoch": 0.831,
"grad_norm": 13.992596562086698,
"kl_loss_13": 2062.6,
"kl_loss_26": 988.0,
"kl_loss_39": 605.35,
"kl_loss_7": 2743.2,
"learning_rate": 7.019542183254046e-05,
"loss": 3175.4,
"step": 8310
},
{
"ce_loss_13": 2.4340964376926424,
"ce_loss_26": 1.9217961221933364,
"ce_loss_39": 1.7397069931030273,
"ce_loss_52": 1.4361872345209121,
"ce_loss_7": 2.759904479980469,
"epoch": 0.832,
"grad_norm": 14.479445741622119,
"kl_loss_13": 2039.6,
"kl_loss_26": 971.9,
"kl_loss_39": 599.4,
"kl_loss_7": 2716.0,
"learning_rate": 6.938687840989971e-05,
"loss": 3159.45,
"step": 8320
},
{
"ce_loss_13": 2.4414653837680818,
"ce_loss_26": 1.928215390443802,
"ce_loss_39": 1.7369425565004348,
"ce_loss_52": 1.4351924806833267,
"ce_loss_7": 2.76216436624527,
"epoch": 0.833,
"grad_norm": 14.95621914365995,
"kl_loss_13": 2059.8,
"kl_loss_26": 983.2,
"kl_loss_39": 594.6,
"kl_loss_7": 2738.8,
"learning_rate": 6.858267125661271e-05,
"loss": 3174.0,
"step": 8330
},
{
"ce_loss_13": 2.400873589515686,
"ce_loss_26": 1.90281642973423,
"ce_loss_39": 1.7204748094081879,
"ce_loss_52": 1.4244474336504935,
"ce_loss_7": 2.7152935564517975,
"epoch": 0.834,
"grad_norm": 14.103644864595271,
"kl_loss_13": 2033.8,
"kl_loss_26": 973.5,
"kl_loss_39": 591.7,
"kl_loss_7": 2704.8,
"learning_rate": 6.778280847103668e-05,
"loss": 3170.05,
"step": 8340
},
{
"ce_loss_13": 2.374238893389702,
"ce_loss_26": 1.8607898473739624,
"ce_loss_39": 1.6831828862428666,
"ce_loss_52": 1.396483090519905,
"ce_loss_7": 2.704102611541748,
"epoch": 0.835,
"grad_norm": 14.599308778967789,
"kl_loss_13": 2031.4,
"kl_loss_26": 934.1,
"kl_loss_39": 566.7,
"kl_loss_7": 2719.2,
"learning_rate": 6.698729810778065e-05,
"loss": 3150.95,
"step": 8350
},
{
"ce_loss_13": 2.4648724853992463,
"ce_loss_26": 1.9494601666927338,
"ce_loss_39": 1.7671974629163743,
"ce_loss_52": 1.461349506676197,
"ce_loss_7": 2.794687694311142,
"epoch": 0.836,
"grad_norm": 14.476811719238219,
"kl_loss_13": 2060.2,
"kl_loss_26": 979.5,
"kl_loss_39": 601.55,
"kl_loss_7": 2750.8,
"learning_rate": 6.619614817762538e-05,
"loss": 3140.75,
"step": 8360
},
{
"ce_loss_13": 2.4076102912425994,
"ce_loss_26": 1.9041061371564865,
"ce_loss_39": 1.7215475410223007,
"ce_loss_52": 1.426313552260399,
"ce_loss_7": 2.729493075609207,
"epoch": 0.837,
"grad_norm": 14.66675091557762,
"kl_loss_13": 2014.4,
"kl_loss_26": 958.9,
"kl_loss_39": 585.55,
"kl_loss_7": 2689.6,
"learning_rate": 6.540936664744196e-05,
"loss": 3161.6,
"step": 8370
},
{
"ce_loss_13": 2.4151067316532133,
"ce_loss_26": 1.9024922668933868,
"ce_loss_39": 1.7205139189958571,
"ce_loss_52": 1.4315023928880692,
"ce_loss_7": 2.733920103311539,
"epoch": 0.838,
"grad_norm": 13.748708793526905,
"kl_loss_13": 2020.6,
"kl_loss_26": 957.3,
"kl_loss_39": 574.45,
"kl_loss_7": 2690.8,
"learning_rate": 6.462696144011149e-05,
"loss": 3148.0,
"step": 8380
},
{
"ce_loss_13": 2.4298708856105806,
"ce_loss_26": 1.9152013957500458,
"ce_loss_39": 1.737311202287674,
"ce_loss_52": 1.441886842250824,
"ce_loss_7": 2.757741445302963,
"epoch": 0.839,
"grad_norm": 14.560412200339597,
"kl_loss_13": 2019.6,
"kl_loss_26": 946.5,
"kl_loss_39": 576.85,
"kl_loss_7": 2709.2,
"learning_rate": 6.384894043444567e-05,
"loss": 3144.45,
"step": 8390
},
{
"ce_loss_13": 2.4371220886707308,
"ce_loss_26": 1.9178409904241562,
"ce_loss_39": 1.7318467199802399,
"ce_loss_52": 1.427680206298828,
"ce_loss_7": 2.7716069161891936,
"epoch": 0.84,
"grad_norm": 13.228002541168403,
"kl_loss_13": 2067.8,
"kl_loss_26": 990.0,
"kl_loss_39": 597.05,
"kl_loss_7": 2766.8,
"learning_rate": 6.307531146510753e-05,
"loss": 3145.15,
"step": 8400
},
{
"ce_loss_13": 2.4668938338756563,
"ce_loss_26": 1.9483750283718109,
"ce_loss_39": 1.7615112096071244,
"ce_loss_52": 1.4629206866025926,
"ce_loss_7": 2.7949269711971283,
"epoch": 0.841,
"grad_norm": 14.638289805261575,
"kl_loss_13": 2070.8,
"kl_loss_26": 979.1,
"kl_loss_39": 588.85,
"kl_loss_7": 2755.2,
"learning_rate": 6.230608232253226e-05,
"loss": 3135.55,
"step": 8410
},
{
"ce_loss_13": 2.498942193388939,
"ce_loss_26": 1.9719986289739608,
"ce_loss_39": 1.7773171186447143,
"ce_loss_52": 1.4589012682437896,
"ce_loss_7": 2.8206138908863068,
"epoch": 0.842,
"grad_norm": 14.547037792354297,
"kl_loss_13": 2143.2,
"kl_loss_26": 1027.6,
"kl_loss_39": 626.5,
"kl_loss_7": 2830.0,
"learning_rate": 6.154126075284855e-05,
"loss": 3179.05,
"step": 8420
},
{
"ce_loss_13": 2.339446923136711,
"ce_loss_26": 1.836980375647545,
"ce_loss_39": 1.66352079808712,
"ce_loss_52": 1.3777535080909729,
"ce_loss_7": 2.658161628246307,
"epoch": 0.843,
"grad_norm": 13.822498461755101,
"kl_loss_13": 2006.0,
"kl_loss_26": 933.6,
"kl_loss_39": 565.3,
"kl_loss_7": 2679.6,
"learning_rate": 6.078085445780129e-05,
"loss": 3158.075,
"step": 8430
},
{
"ce_loss_13": 2.4486551761627195,
"ce_loss_26": 1.9329589813947679,
"ce_loss_39": 1.7519038885831832,
"ce_loss_52": 1.4464032799005508,
"ce_loss_7": 2.776008838415146,
"epoch": 0.844,
"grad_norm": 13.676549255063142,
"kl_loss_13": 2050.2,
"kl_loss_26": 971.9,
"kl_loss_39": 595.05,
"kl_loss_7": 2741.2,
"learning_rate": 6.002487109467347e-05,
"loss": 3155.95,
"step": 8440
},
{
"ce_loss_13": 2.467064255475998,
"ce_loss_26": 1.9566147327423096,
"ce_loss_39": 1.7722377121448516,
"ce_loss_52": 1.4749930799007416,
"ce_loss_7": 2.7902898490428925,
"epoch": 0.845,
"grad_norm": 15.195098706872598,
"kl_loss_13": 2033.8,
"kl_loss_26": 959.4,
"kl_loss_39": 581.0,
"kl_loss_7": 2712.0,
"learning_rate": 5.927331827620902e-05,
"loss": 3169.7,
"step": 8450
},
{
"ce_loss_13": 2.389095312356949,
"ce_loss_26": 1.8802162408828735,
"ce_loss_39": 1.7042785853147506,
"ce_loss_52": 1.4126853346824646,
"ce_loss_7": 2.714692497253418,
"epoch": 0.846,
"grad_norm": 14.567703807315205,
"kl_loss_13": 2016.0,
"kl_loss_26": 940.7,
"kl_loss_39": 574.25,
"kl_loss_7": 2700.8,
"learning_rate": 5.852620357053651e-05,
"loss": 3111.0,
"step": 8460
},
{
"ce_loss_13": 2.4798492193222046,
"ce_loss_26": 1.9612985998392105,
"ce_loss_39": 1.7813422173261642,
"ce_loss_52": 1.4634388938546181,
"ce_loss_7": 2.8045433819293977,
"epoch": 0.847,
"grad_norm": 13.718944289269317,
"kl_loss_13": 2114.4,
"kl_loss_26": 1008.1,
"kl_loss_39": 624.7,
"kl_loss_7": 2801.2,
"learning_rate": 5.778353450109286e-05,
"loss": 3195.2,
"step": 8470
},
{
"ce_loss_13": 2.38386265039444,
"ce_loss_26": 1.8872032672166825,
"ce_loss_39": 1.710950767993927,
"ce_loss_52": 1.432640826702118,
"ce_loss_7": 2.694201183319092,
"epoch": 0.848,
"grad_norm": 14.259266485735452,
"kl_loss_13": 1968.6,
"kl_loss_26": 914.3,
"kl_loss_39": 549.1,
"kl_loss_7": 2623.2,
"learning_rate": 5.7045318546547206e-05,
"loss": 3137.025,
"step": 8480
},
{
"ce_loss_13": 2.427330991625786,
"ce_loss_26": 1.910456082224846,
"ce_loss_39": 1.7292529791593552,
"ce_loss_52": 1.4334994465112687,
"ce_loss_7": 2.7538663387298583,
"epoch": 0.849,
"grad_norm": 13.656054783413236,
"kl_loss_13": 2033.4,
"kl_loss_26": 954.9,
"kl_loss_39": 577.85,
"kl_loss_7": 2719.6,
"learning_rate": 5.631156314072605e-05,
"loss": 3150.65,
"step": 8490
},
{
"ce_loss_13": 2.471989703178406,
"ce_loss_26": 1.9505236119031906,
"ce_loss_39": 1.7668047726154328,
"ce_loss_52": 1.4608478724956513,
"ce_loss_7": 2.8063031315803526,
"epoch": 0.85,
"grad_norm": 13.681266151650567,
"kl_loss_13": 2080.4,
"kl_loss_26": 982.8,
"kl_loss_39": 600.35,
"kl_loss_7": 2780.8,
"learning_rate": 5.5582275672538315e-05,
"loss": 3137.95,
"step": 8500
},
{
"ce_loss_13": 2.4605359852313997,
"ce_loss_26": 1.939817100763321,
"ce_loss_39": 1.753285875916481,
"ce_loss_52": 1.4498123317956924,
"ce_loss_7": 2.7863478004932403,
"epoch": 0.851,
"grad_norm": 14.038129063761419,
"kl_loss_13": 2055.6,
"kl_loss_26": 978.1,
"kl_loss_39": 593.3,
"kl_loss_7": 2740.0,
"learning_rate": 5.4857463485900484e-05,
"loss": 3144.75,
"step": 8510
},
{
"ce_loss_13": 2.450565594434738,
"ce_loss_26": 1.9354894876480102,
"ce_loss_39": 1.7570757120847702,
"ce_loss_52": 1.458202052116394,
"ce_loss_7": 2.770776855945587,
"epoch": 0.852,
"grad_norm": 13.945411873449233,
"kl_loss_13": 2028.8,
"kl_loss_26": 956.4,
"kl_loss_39": 584.4,
"kl_loss_7": 2704.4,
"learning_rate": 5.413713387966329e-05,
"loss": 3147.35,
"step": 8520
},
{
"ce_loss_13": 2.384511134028435,
"ce_loss_26": 1.885845959186554,
"ce_loss_39": 1.7117790162563324,
"ce_loss_52": 1.4215580940246582,
"ce_loss_7": 2.6971611440181733,
"epoch": 0.853,
"grad_norm": 14.280539701015499,
"kl_loss_13": 1983.0,
"kl_loss_26": 937.9,
"kl_loss_39": 571.6,
"kl_loss_7": 2646.0,
"learning_rate": 5.34212941075381e-05,
"loss": 3138.7,
"step": 8530
},
{
"ce_loss_13": 2.4187089085578917,
"ce_loss_26": 1.9180882632732392,
"ce_loss_39": 1.7486219108104706,
"ce_loss_52": 1.4668725609779358,
"ce_loss_7": 2.7367011964321137,
"epoch": 0.854,
"grad_norm": 13.894895708827713,
"kl_loss_13": 1988.6,
"kl_loss_26": 918.2,
"kl_loss_39": 556.1,
"kl_loss_7": 2660.4,
"learning_rate": 5.270995137802315e-05,
"loss": 3116.45,
"step": 8540
},
{
"ce_loss_13": 2.415088692307472,
"ce_loss_26": 1.9041693419218064,
"ce_loss_39": 1.7267356216907501,
"ce_loss_52": 1.4279317557811737,
"ce_loss_7": 2.7469893753528596,
"epoch": 0.855,
"grad_norm": 14.31134935015859,
"kl_loss_13": 2040.4,
"kl_loss_26": 959.5,
"kl_loss_39": 585.9,
"kl_loss_7": 2737.2,
"learning_rate": 5.2003112854332125e-05,
"loss": 3108.3,
"step": 8550
},
{
"ce_loss_13": 2.461696755886078,
"ce_loss_26": 1.9468512892723084,
"ce_loss_39": 1.7631896048784257,
"ce_loss_52": 1.4654896438121796,
"ce_loss_7": 2.780461609363556,
"epoch": 0.856,
"grad_norm": 14.365329511248612,
"kl_loss_13": 2070.0,
"kl_loss_26": 974.7,
"kl_loss_39": 589.2,
"kl_loss_7": 2744.0,
"learning_rate": 5.130078565432089e-05,
"loss": 3173.9,
"step": 8560
},
{
"ce_loss_13": 2.4338246136903763,
"ce_loss_26": 1.9260531306266784,
"ce_loss_39": 1.746686053276062,
"ce_loss_52": 1.4506986886262894,
"ce_loss_7": 2.7584114193916323,
"epoch": 0.857,
"grad_norm": 13.58696871299719,
"kl_loss_13": 2040.8,
"kl_loss_26": 958.4,
"kl_loss_39": 587.6,
"kl_loss_7": 2720.8,
"learning_rate": 5.060297685041659e-05,
"loss": 3124.45,
"step": 8570
},
{
"ce_loss_13": 2.465153419971466,
"ce_loss_26": 1.9324041992425918,
"ce_loss_39": 1.747180885076523,
"ce_loss_52": 1.4378513038158416,
"ce_loss_7": 2.8079187512397765,
"epoch": 0.858,
"grad_norm": 13.69346881706034,
"kl_loss_13": 2127.2,
"kl_loss_26": 1005.7,
"kl_loss_39": 610.45,
"kl_loss_7": 2842.8,
"learning_rate": 4.99096934695461e-05,
"loss": 3135.55,
"step": 8580
},
{
"ce_loss_13": 2.428412067890167,
"ce_loss_26": 1.9264894247055053,
"ce_loss_39": 1.748258227109909,
"ce_loss_52": 1.4425264418125152,
"ce_loss_7": 2.7523947954177856,
"epoch": 0.859,
"grad_norm": 14.416520228353955,
"kl_loss_13": 2024.4,
"kl_loss_26": 961.5,
"kl_loss_39": 593.95,
"kl_loss_7": 2704.0,
"learning_rate": 4.922094249306558e-05,
"loss": 3141.65,
"step": 8590
},
{
"ce_loss_13": 2.379640507698059,
"ce_loss_26": 1.8703870117664336,
"ce_loss_39": 1.6925265491008759,
"ce_loss_52": 1.4014200061559676,
"ce_loss_7": 2.7008225679397584,
"epoch": 0.86,
"grad_norm": 14.548164181811538,
"kl_loss_13": 2019.0,
"kl_loss_26": 952.8,
"kl_loss_39": 580.7,
"kl_loss_7": 2704.4,
"learning_rate": 4.853673085668947e-05,
"loss": 3164.35,
"step": 8600
},
{
"ce_loss_13": 2.3882300436496733,
"ce_loss_26": 1.8816026329994202,
"ce_loss_39": 1.701096272468567,
"ce_loss_52": 1.4169353902339936,
"ce_loss_7": 2.711241126060486,
"epoch": 0.861,
"grad_norm": 13.997789256280235,
"kl_loss_13": 2014.0,
"kl_loss_26": 941.1,
"kl_loss_39": 561.45,
"kl_loss_7": 2697.6,
"learning_rate": 4.78570654504214e-05,
"loss": 3160.75,
"step": 8610
},
{
"ce_loss_13": 2.4367538392543793,
"ce_loss_26": 1.9072220534086228,
"ce_loss_39": 1.724021741747856,
"ce_loss_52": 1.4287808299064637,
"ce_loss_7": 2.7643966376781464,
"epoch": 0.862,
"grad_norm": 13.935763093108008,
"kl_loss_13": 2067.0,
"kl_loss_26": 962.6,
"kl_loss_39": 576.05,
"kl_loss_7": 2765.6,
"learning_rate": 4.7181953118484556e-05,
"loss": 3127.35,
"step": 8620
},
{
"ce_loss_13": 2.424758407473564,
"ce_loss_26": 1.9169064074754716,
"ce_loss_39": 1.7346068799495697,
"ce_loss_52": 1.4445551723241805,
"ce_loss_7": 2.7455954015254975,
"epoch": 0.863,
"grad_norm": 14.314671068665351,
"kl_loss_13": 2012.8,
"kl_loss_26": 950.4,
"kl_loss_39": 570.95,
"kl_loss_7": 2679.2,
"learning_rate": 4.651140065925269e-05,
"loss": 3115.55,
"step": 8630
},
{
"ce_loss_13": 2.5092544078826906,
"ce_loss_26": 1.9829395413398743,
"ce_loss_39": 1.7923680394887924,
"ce_loss_52": 1.481997686624527,
"ce_loss_7": 2.838895618915558,
"epoch": 0.864,
"grad_norm": 14.264233585666156,
"kl_loss_13": 2110.2,
"kl_loss_26": 1005.9,
"kl_loss_39": 616.9,
"kl_loss_7": 2805.2,
"learning_rate": 4.58454148251814e-05,
"loss": 3142.0,
"step": 8640
},
{
"ce_loss_13": 2.43146056830883,
"ce_loss_26": 1.9165301382541657,
"ce_loss_39": 1.7361394971609116,
"ce_loss_52": 1.4455705016851426,
"ce_loss_7": 2.7573634922504424,
"epoch": 0.865,
"grad_norm": 13.3838976309547,
"kl_loss_13": 2018.4,
"kl_loss_26": 941.1,
"kl_loss_39": 567.4,
"kl_loss_7": 2702.0,
"learning_rate": 4.518400232274078e-05,
"loss": 3128.55,
"step": 8650
},
{
"ce_loss_13": 2.4056933134794236,
"ce_loss_26": 1.8982498347759247,
"ce_loss_39": 1.7155311942100524,
"ce_loss_52": 1.4283919543027879,
"ce_loss_7": 2.7339151203632355,
"epoch": 0.866,
"grad_norm": 13.776729544594009,
"kl_loss_13": 2029.0,
"kl_loss_26": 958.5,
"kl_loss_39": 575.6,
"kl_loss_7": 2712.0,
"learning_rate": 4.452716981234745e-05,
"loss": 3168.35,
"step": 8660
},
{
"ce_loss_13": 2.4304875314235685,
"ce_loss_26": 1.9086535692214965,
"ce_loss_39": 1.731420534849167,
"ce_loss_52": 1.4311698615550994,
"ce_loss_7": 2.76586651802063,
"epoch": 0.867,
"grad_norm": 14.03220486850696,
"kl_loss_13": 2043.6,
"kl_loss_26": 956.8,
"kl_loss_39": 585.05,
"kl_loss_7": 2738.0,
"learning_rate": 4.3874923908297335e-05,
"loss": 3147.75,
"step": 8670
},
{
"ce_loss_13": 2.422931173443794,
"ce_loss_26": 1.8919979512691498,
"ce_loss_39": 1.7088686615228652,
"ce_loss_52": 1.4077896371483802,
"ce_loss_7": 2.757069969177246,
"epoch": 0.868,
"grad_norm": 14.116898056864903,
"kl_loss_13": 2080.8,
"kl_loss_26": 974.7,
"kl_loss_39": 600.2,
"kl_loss_7": 2778.4,
"learning_rate": 4.322727117869951e-05,
"loss": 3132.8,
"step": 8680
},
{
"ce_loss_13": 2.4079675406217573,
"ce_loss_26": 1.9030864268541337,
"ce_loss_39": 1.7309907704591752,
"ce_loss_52": 1.4429899513721467,
"ce_loss_7": 2.725788599252701,
"epoch": 0.869,
"grad_norm": 13.611192303364593,
"kl_loss_13": 1991.0,
"kl_loss_26": 927.4,
"kl_loss_39": 558.5,
"kl_loss_7": 2668.4,
"learning_rate": 4.2584218145409916e-05,
"loss": 3135.35,
"step": 8690
},
{
"ce_loss_13": 2.3869937509298325,
"ce_loss_26": 1.8738444805145265,
"ce_loss_39": 1.6931981056928636,
"ce_loss_52": 1.400150865316391,
"ce_loss_7": 2.709615921974182,
"epoch": 0.87,
"grad_norm": 14.271447400785455,
"kl_loss_13": 2015.8,
"kl_loss_26": 944.7,
"kl_loss_39": 573.1,
"kl_loss_7": 2692.8,
"learning_rate": 4.194577128396521e-05,
"loss": 3114.95,
"step": 8700
},
{
"ce_loss_13": 2.465124714374542,
"ce_loss_26": 1.961993396282196,
"ce_loss_39": 1.7869856834411622,
"ce_loss_52": 1.4856192290782928,
"ce_loss_7": 2.785760098695755,
"epoch": 0.871,
"grad_norm": 13.70337275168833,
"kl_loss_13": 2015.8,
"kl_loss_26": 955.1,
"kl_loss_39": 582.9,
"kl_loss_7": 2693.6,
"learning_rate": 4.1311937023518264e-05,
"loss": 3146.8,
"step": 8710
},
{
"ce_loss_13": 2.432892268896103,
"ce_loss_26": 1.9152618199586868,
"ce_loss_39": 1.7295002430677413,
"ce_loss_52": 1.4359665989875794,
"ce_loss_7": 2.7681704640388487,
"epoch": 0.872,
"grad_norm": 14.713342569921842,
"kl_loss_13": 2035.8,
"kl_loss_26": 958.4,
"kl_loss_39": 578.45,
"kl_loss_7": 2721.6,
"learning_rate": 4.0682721746773344e-05,
"loss": 3128.35,
"step": 8720
},
{
"ce_loss_13": 2.3963799655437468,
"ce_loss_26": 1.8889498293399811,
"ce_loss_39": 1.7117068350315094,
"ce_loss_52": 1.4276205718517303,
"ce_loss_7": 2.7207881271839143,
"epoch": 0.873,
"grad_norm": 14.269783754551202,
"kl_loss_13": 2001.0,
"kl_loss_26": 932.4,
"kl_loss_39": 561.8,
"kl_loss_7": 2688.8,
"learning_rate": 4.0058131789920904e-05,
"loss": 3131.65,
"step": 8730
},
{
"ce_loss_13": 2.418975955247879,
"ce_loss_26": 1.8960406243801118,
"ce_loss_39": 1.7133139997720719,
"ce_loss_52": 1.4180105909705163,
"ce_loss_7": 2.7614206850528715,
"epoch": 0.874,
"grad_norm": 14.114592900051933,
"kl_loss_13": 2079.4,
"kl_loss_26": 975.5,
"kl_loss_39": 598.35,
"kl_loss_7": 2790.4,
"learning_rate": 3.9438173442575e-05,
"loss": 3100.7,
"step": 8740
},
{
"ce_loss_13": 2.4537671864032746,
"ce_loss_26": 1.946785607933998,
"ce_loss_39": 1.7634260147809981,
"ce_loss_52": 1.4690344750881195,
"ce_loss_7": 2.775768506526947,
"epoch": 0.875,
"grad_norm": 14.35027266854894,
"kl_loss_13": 2027.0,
"kl_loss_26": 955.8,
"kl_loss_39": 572.25,
"kl_loss_7": 2707.2,
"learning_rate": 3.882285294770937e-05,
"loss": 3145.7,
"step": 8750
},
{
"ce_loss_13": 2.4146986842155456,
"ce_loss_26": 1.8880977869033813,
"ce_loss_39": 1.7063632160425186,
"ce_loss_52": 1.403985047340393,
"ce_loss_7": 2.741646242141724,
"epoch": 0.876,
"grad_norm": 14.018669111873198,
"kl_loss_13": 2049.4,
"kl_loss_26": 969.5,
"kl_loss_39": 590.2,
"kl_loss_7": 2731.6,
"learning_rate": 3.821217650159453e-05,
"loss": 3139.0,
"step": 8760
},
{
"ce_loss_13": 2.3532112538814545,
"ce_loss_26": 1.8646484702825545,
"ce_loss_39": 1.6983740404248238,
"ce_loss_52": 1.4196315869688987,
"ce_loss_7": 2.666028293967247,
"epoch": 0.877,
"grad_norm": 13.69993271737963,
"kl_loss_13": 1919.2,
"kl_loss_26": 890.8,
"kl_loss_39": 542.85,
"kl_loss_7": 2582.0,
"learning_rate": 3.760615025373543e-05,
"loss": 3109.35,
"step": 8770
},
{
"ce_loss_13": 2.4570236086845396,
"ce_loss_26": 1.938426810503006,
"ce_loss_39": 1.7564183056354523,
"ce_loss_52": 1.453689630329609,
"ce_loss_7": 2.787124717235565,
"epoch": 0.878,
"grad_norm": 14.764457343001293,
"kl_loss_13": 2048.4,
"kl_loss_26": 962.8,
"kl_loss_39": 585.3,
"kl_loss_7": 2732.8,
"learning_rate": 3.700478030680987e-05,
"loss": 3153.7,
"step": 8780
},
{
"ce_loss_13": 2.4401866495609283,
"ce_loss_26": 1.9257715612649917,
"ce_loss_39": 1.7440615922212601,
"ce_loss_52": 1.4473457425832748,
"ce_loss_7": 2.76454553604126,
"epoch": 0.879,
"grad_norm": 13.624797595400036,
"kl_loss_13": 2038.4,
"kl_loss_26": 958.5,
"kl_loss_39": 579.85,
"kl_loss_7": 2717.6,
"learning_rate": 3.6408072716606344e-05,
"loss": 3158.95,
"step": 8790
},
{
"ce_loss_13": 2.3960734605789185,
"ce_loss_26": 1.8874068677425384,
"ce_loss_39": 1.7073772728443146,
"ce_loss_52": 1.4144951313734055,
"ce_loss_7": 2.7204393565654756,
"epoch": 0.88,
"grad_norm": 13.565627756275363,
"kl_loss_13": 2008.0,
"kl_loss_26": 944.3,
"kl_loss_39": 574.6,
"kl_loss_7": 2694.0,
"learning_rate": 3.5816033491963716e-05,
"loss": 3127.45,
"step": 8800
},
{
"ce_loss_13": 2.426618826389313,
"ce_loss_26": 1.921605721116066,
"ce_loss_39": 1.7422660619020462,
"ce_loss_52": 1.4409180462360383,
"ce_loss_7": 2.7458843529224395,
"epoch": 0.881,
"grad_norm": 14.680252482648882,
"kl_loss_13": 2012.6,
"kl_loss_26": 955.2,
"kl_loss_39": 587.45,
"kl_loss_7": 2678.0,
"learning_rate": 3.522866859471047e-05,
"loss": 3106.075,
"step": 8810
},
{
"ce_loss_13": 2.4589924097061155,
"ce_loss_26": 1.9394809186458588,
"ce_loss_39": 1.7563898861408234,
"ce_loss_52": 1.459865990281105,
"ce_loss_7": 2.780824285745621,
"epoch": 0.882,
"grad_norm": 13.410206661523853,
"kl_loss_13": 2065.4,
"kl_loss_26": 975.8,
"kl_loss_39": 594.35,
"kl_loss_7": 2745.6,
"learning_rate": 3.46459839396045e-05,
"loss": 3162.5,
"step": 8820
},
{
"ce_loss_13": 2.444491392374039,
"ce_loss_26": 1.919321459531784,
"ce_loss_39": 1.7317459166049958,
"ce_loss_52": 1.4241959005594254,
"ce_loss_7": 2.7687501907348633,
"epoch": 0.883,
"grad_norm": 13.604154476294898,
"kl_loss_13": 2078.6,
"kl_loss_26": 980.1,
"kl_loss_39": 596.05,
"kl_loss_7": 2764.8,
"learning_rate": 3.406798539427386e-05,
"loss": 3137.15,
"step": 8830
},
{
"ce_loss_13": 2.4456652402877808,
"ce_loss_26": 1.9403656631708146,
"ce_loss_39": 1.7609369516372682,
"ce_loss_52": 1.4744407176971435,
"ce_loss_7": 2.762556844949722,
"epoch": 0.884,
"grad_norm": 14.0628693009416,
"kl_loss_13": 1994.0,
"kl_loss_26": 932.0,
"kl_loss_39": 557.15,
"kl_loss_7": 2667.2,
"learning_rate": 3.349467877915746e-05,
"loss": 3099.2,
"step": 8840
},
{
"ce_loss_13": 2.4576884746551513,
"ce_loss_26": 1.9445500463247298,
"ce_loss_39": 1.764642345905304,
"ce_loss_52": 1.4638898521661758,
"ce_loss_7": 2.7794412195682527,
"epoch": 0.885,
"grad_norm": 13.731348532638435,
"kl_loss_13": 2048.2,
"kl_loss_26": 971.4,
"kl_loss_39": 596.3,
"kl_loss_7": 2722.8,
"learning_rate": 3.292606986744667e-05,
"loss": 3152.675,
"step": 8850
},
{
"ce_loss_13": 2.4683742761611938,
"ce_loss_26": 1.9468118786811828,
"ce_loss_39": 1.7659433901309967,
"ce_loss_52": 1.4723648518323897,
"ce_loss_7": 2.794690328836441,
"epoch": 0.886,
"grad_norm": 14.768179589766811,
"kl_loss_13": 2053.6,
"kl_loss_26": 960.6,
"kl_loss_39": 581.1,
"kl_loss_7": 2740.0,
"learning_rate": 3.23621643850267e-05,
"loss": 3135.9,
"step": 8860
},
{
"ce_loss_13": 2.3716426849365235,
"ce_loss_26": 1.8652270317077637,
"ce_loss_39": 1.6864412546157836,
"ce_loss_52": 1.4016476958990096,
"ce_loss_7": 2.6954882085323333,
"epoch": 0.887,
"grad_norm": 13.861337667665934,
"kl_loss_13": 1995.8,
"kl_loss_26": 932.7,
"kl_loss_39": 558.6,
"kl_loss_7": 2668.8,
"learning_rate": 3.180296801041971e-05,
"loss": 3116.05,
"step": 8870
},
{
"ce_loss_13": 2.406647819280624,
"ce_loss_26": 1.8954071879386902,
"ce_loss_39": 1.7191426277160644,
"ce_loss_52": 1.4318138241767884,
"ce_loss_7": 2.7277746230363844,
"epoch": 0.888,
"grad_norm": 14.18696677740657,
"kl_loss_13": 1999.0,
"kl_loss_26": 937.5,
"kl_loss_39": 568.1,
"kl_loss_7": 2671.2,
"learning_rate": 3.124848637472688e-05,
"loss": 3120.85,
"step": 8880
},
{
"ce_loss_13": 2.4183617502450945,
"ce_loss_26": 1.9087469071149825,
"ce_loss_39": 1.731605476140976,
"ce_loss_52": 1.4397760301828384,
"ce_loss_7": 2.7426804542541503,
"epoch": 0.889,
"grad_norm": 14.005614784571057,
"kl_loss_13": 2023.4,
"kl_loss_26": 938.6,
"kl_loss_39": 572.0,
"kl_loss_7": 2704.0,
"learning_rate": 3.069872506157212e-05,
"loss": 3140.35,
"step": 8890
},
{
"ce_loss_13": 2.3538796246051787,
"ce_loss_26": 1.8491210967302323,
"ce_loss_39": 1.6793664902448655,
"ce_loss_52": 1.4003751114010812,
"ce_loss_7": 2.6818648397922518,
"epoch": 0.89,
"grad_norm": 13.507964025411896,
"kl_loss_13": 1969.8,
"kl_loss_26": 909.9,
"kl_loss_39": 551.1,
"kl_loss_7": 2655.2,
"learning_rate": 3.0153689607045842e-05,
"loss": 3115.9,
"step": 8900
},
{
"ce_loss_13": 2.3964832425117493,
"ce_loss_26": 1.8831844747066497,
"ce_loss_39": 1.7046507805585862,
"ce_loss_52": 1.4187346428632737,
"ce_loss_7": 2.72187722325325,
"epoch": 0.891,
"grad_norm": 14.258655617102612,
"kl_loss_13": 2013.6,
"kl_loss_26": 929.9,
"kl_loss_39": 559.3,
"kl_loss_7": 2705.2,
"learning_rate": 2.9613385499648926e-05,
"loss": 3133.85,
"step": 8910
},
{
"ce_loss_13": 2.3858442664146424,
"ce_loss_26": 1.8839757442474365,
"ce_loss_39": 1.709816351532936,
"ce_loss_52": 1.430666272342205,
"ce_loss_7": 2.6985366463661196,
"epoch": 0.892,
"grad_norm": 14.05296041871021,
"kl_loss_13": 1977.6,
"kl_loss_26": 918.7,
"kl_loss_39": 552.4,
"kl_loss_7": 2640.0,
"learning_rate": 2.9077818180237692e-05,
"loss": 3161.85,
"step": 8920
},
{
"ce_loss_13": 2.4120604634284972,
"ce_loss_26": 1.9131643176078796,
"ce_loss_39": 1.741806897521019,
"ce_loss_52": 1.4555314972996711,
"ce_loss_7": 2.7308483004570006,
"epoch": 0.893,
"grad_norm": 14.216852449574784,
"kl_loss_13": 1989.0,
"kl_loss_26": 931.9,
"kl_loss_39": 565.55,
"kl_loss_7": 2658.8,
"learning_rate": 2.8546993041969172e-05,
"loss": 3113.0,
"step": 8930
},
{
"ce_loss_13": 2.417782390117645,
"ce_loss_26": 1.9075176060199737,
"ce_loss_39": 1.7230383425951004,
"ce_loss_52": 1.428551298379898,
"ce_loss_7": 2.737876206636429,
"epoch": 0.894,
"grad_norm": 13.81624953017635,
"kl_loss_13": 2024.2,
"kl_loss_26": 956.0,
"kl_loss_39": 579.25,
"kl_loss_7": 2694.8,
"learning_rate": 2.802091543024671e-05,
"loss": 3118.3,
"step": 8940
},
{
"ce_loss_13": 2.4281566560268404,
"ce_loss_26": 1.9209083169698715,
"ce_loss_39": 1.7325717121362687,
"ce_loss_52": 1.4276321291923524,
"ce_loss_7": 2.7494910418987275,
"epoch": 0.895,
"grad_norm": 14.667622613471202,
"kl_loss_13": 2068.0,
"kl_loss_26": 993.0,
"kl_loss_39": 599.95,
"kl_loss_7": 2738.4,
"learning_rate": 2.7499590642665774e-05,
"loss": 3152.3,
"step": 8950
},
{
"ce_loss_13": 2.4123401612043383,
"ce_loss_26": 1.9057573080062866,
"ce_loss_39": 1.725106343626976,
"ce_loss_52": 1.43211932182312,
"ce_loss_7": 2.7356060326099394,
"epoch": 0.896,
"grad_norm": 13.84770444101258,
"kl_loss_13": 2014.8,
"kl_loss_26": 952.6,
"kl_loss_39": 575.75,
"kl_loss_7": 2696.2,
"learning_rate": 2.6983023928961405e-05,
"loss": 3131.5,
"step": 8960
},
{
"ce_loss_13": 2.3709884881973267,
"ce_loss_26": 1.868075394630432,
"ce_loss_39": 1.6942670613527298,
"ce_loss_52": 1.3973538905382157,
"ce_loss_7": 2.69813577234745,
"epoch": 0.897,
"grad_norm": 14.584597117704368,
"kl_loss_13": 2004.8,
"kl_loss_26": 940.7,
"kl_loss_39": 576.3,
"kl_loss_7": 2687.2,
"learning_rate": 2.6471220490954628e-05,
"loss": 3144.15,
"step": 8970
},
{
"ce_loss_13": 2.421136862039566,
"ce_loss_26": 1.9213113605976104,
"ce_loss_39": 1.7483066588640213,
"ce_loss_52": 1.4667307168245316,
"ce_loss_7": 2.7391393184661865,
"epoch": 0.898,
"grad_norm": 14.053042951615785,
"kl_loss_13": 1968.0,
"kl_loss_26": 914.9,
"kl_loss_39": 554.05,
"kl_loss_7": 2636.4,
"learning_rate": 2.596418548250029e-05,
"loss": 3077.8,
"step": 8980
},
{
"ce_loss_13": 2.383489468693733,
"ce_loss_26": 1.8838744014501572,
"ce_loss_39": 1.7122972816228867,
"ce_loss_52": 1.4258252471685409,
"ce_loss_7": 2.7002600908279417,
"epoch": 0.899,
"grad_norm": 13.786718301064743,
"kl_loss_13": 1996.2,
"kl_loss_26": 931.7,
"kl_loss_39": 571.05,
"kl_loss_7": 2666.4,
"learning_rate": 2.5461924009435368e-05,
"loss": 3075.25,
"step": 8990
},
{
"ce_loss_13": 2.3905730485916137,
"ce_loss_26": 1.8808085292577743,
"ce_loss_39": 1.7040841788053513,
"ce_loss_52": 1.4255578130483628,
"ce_loss_7": 2.7172752916812897,
"epoch": 0.9,
"grad_norm": 14.142930179347214,
"kl_loss_13": 1983.0,
"kl_loss_26": 912.3,
"kl_loss_39": 546.25,
"kl_loss_7": 2670.0,
"learning_rate": 2.4964441129527336e-05,
"loss": 3116.85,
"step": 9000
},
{
"ce_loss_13": 2.43511378467083,
"ce_loss_26": 1.926012173295021,
"ce_loss_39": 1.7415268182754517,
"ce_loss_52": 1.4408889025449754,
"ce_loss_7": 2.761294722557068,
"epoch": 0.901,
"grad_norm": 13.928144692729873,
"kl_loss_13": 2050.6,
"kl_loss_26": 980.3,
"kl_loss_39": 592.65,
"kl_loss_7": 2732.0,
"learning_rate": 2.4471741852423235e-05,
"loss": 3132.7,
"step": 9010
},
{
"ce_loss_13": 2.3582999795675277,
"ce_loss_26": 1.8586651980876923,
"ce_loss_39": 1.6817226380109787,
"ce_loss_52": 1.3925694867968559,
"ce_loss_7": 2.6693267047405245,
"epoch": 0.902,
"grad_norm": 14.116392246566738,
"kl_loss_13": 1978.6,
"kl_loss_26": 929.3,
"kl_loss_39": 571.5,
"kl_loss_7": 2638.8,
"learning_rate": 2.3983831139599287e-05,
"loss": 3114.75,
"step": 9020
},
{
"ce_loss_13": 2.418634516000748,
"ce_loss_26": 1.9078166902065277,
"ce_loss_39": 1.7249888181686401,
"ce_loss_52": 1.4335400015115738,
"ce_loss_7": 2.7412546992301943,
"epoch": 0.903,
"grad_norm": 13.637306812880459,
"kl_loss_13": 2036.0,
"kl_loss_26": 950.4,
"kl_loss_39": 575.5,
"kl_loss_7": 2710.0,
"learning_rate": 2.3500713904311022e-05,
"loss": 3133.35,
"step": 9030
},
{
"ce_loss_13": 2.3931309431791306,
"ce_loss_26": 1.877245968580246,
"ce_loss_39": 1.7018247723579407,
"ce_loss_52": 1.4155418664216994,
"ce_loss_7": 2.7214196979999543,
"epoch": 0.904,
"grad_norm": 14.785773850168622,
"kl_loss_13": 2014.2,
"kl_loss_26": 931.6,
"kl_loss_39": 566.7,
"kl_loss_7": 2697.2,
"learning_rate": 2.3022395011543685e-05,
"loss": 3107.4,
"step": 9040
},
{
"ce_loss_13": 2.4141552269458773,
"ce_loss_26": 1.9003157913684845,
"ce_loss_39": 1.7119427561759948,
"ce_loss_52": 1.4218156844377519,
"ce_loss_7": 2.738201731443405,
"epoch": 0.905,
"grad_norm": 15.004045135842484,
"kl_loss_13": 2046.4,
"kl_loss_26": 956.8,
"kl_loss_39": 574.8,
"kl_loss_7": 2728.0,
"learning_rate": 2.2548879277963063e-05,
"loss": 3131.35,
"step": 9050
},
{
"ce_loss_13": 2.4368073105812074,
"ce_loss_26": 1.9358865648508072,
"ce_loss_39": 1.758334356546402,
"ce_loss_52": 1.4668044418096542,
"ce_loss_7": 2.7515600681304933,
"epoch": 0.906,
"grad_norm": 14.409368784921233,
"kl_loss_13": 1999.0,
"kl_loss_26": 944.8,
"kl_loss_39": 573.2,
"kl_loss_7": 2666.4,
"learning_rate": 2.208017147186736e-05,
"loss": 3129.15,
"step": 9060
},
{
"ce_loss_13": 2.4492080837488173,
"ce_loss_26": 1.9512592017650605,
"ce_loss_39": 1.7678290545940398,
"ce_loss_52": 1.4713279128074646,
"ce_loss_7": 2.772859865427017,
"epoch": 0.907,
"grad_norm": 14.409868370647818,
"kl_loss_13": 2027.6,
"kl_loss_26": 962.9,
"kl_loss_39": 580.75,
"kl_loss_7": 2708.4,
"learning_rate": 2.1616276313139227e-05,
"loss": 3136.45,
"step": 9070
},
{
"ce_loss_13": 2.3629566222429275,
"ce_loss_26": 1.866456887125969,
"ce_loss_39": 1.692075565457344,
"ce_loss_52": 1.4036286368966102,
"ce_loss_7": 2.681915229558945,
"epoch": 0.908,
"grad_norm": 13.121489500516578,
"kl_loss_13": 1981.6,
"kl_loss_26": 934.0,
"kl_loss_39": 568.15,
"kl_loss_7": 2653.6,
"learning_rate": 2.1157198473197415e-05,
"loss": 3145.35,
"step": 9080
},
{
"ce_loss_13": 2.4373748511075974,
"ce_loss_26": 1.9266161501407624,
"ce_loss_39": 1.7425854057073593,
"ce_loss_52": 1.4446089684963226,
"ce_loss_7": 2.7682818710803985,
"epoch": 0.909,
"grad_norm": 14.230511024408004,
"kl_loss_13": 2056.6,
"kl_loss_26": 973.2,
"kl_loss_39": 588.0,
"kl_loss_7": 2750.4,
"learning_rate": 2.0702942574950812e-05,
"loss": 3127.5,
"step": 9090
},
{
"ce_loss_13": 2.4287337332963945,
"ce_loss_26": 1.9172603338956833,
"ce_loss_39": 1.7392981857061387,
"ce_loss_52": 1.4425375372171403,
"ce_loss_7": 2.7510289788246154,
"epoch": 0.91,
"grad_norm": 14.057406508919666,
"kl_loss_13": 2046.6,
"kl_loss_26": 964.4,
"kl_loss_39": 593.1,
"kl_loss_7": 2722.4,
"learning_rate": 2.025351319275137e-05,
"loss": 3121.6,
"step": 9100
},
{
"ce_loss_13": 2.4320779502391816,
"ce_loss_26": 1.9104785054922104,
"ce_loss_39": 1.7267427280545236,
"ce_loss_52": 1.437354525923729,
"ce_loss_7": 2.7588888108730316,
"epoch": 0.911,
"grad_norm": 14.001627192514695,
"kl_loss_13": 2041.6,
"kl_loss_26": 960.0,
"kl_loss_39": 579.65,
"kl_loss_7": 2727.2,
"learning_rate": 1.9808914852347816e-05,
"loss": 3132.0,
"step": 9110
},
{
"ce_loss_13": 2.4577192962169647,
"ce_loss_26": 1.9406163454055787,
"ce_loss_39": 1.7553843706846237,
"ce_loss_52": 1.4540565699338912,
"ce_loss_7": 2.7937385022640226,
"epoch": 0.912,
"grad_norm": 14.03191291932469,
"kl_loss_13": 2055.4,
"kl_loss_26": 970.4,
"kl_loss_39": 590.75,
"kl_loss_7": 2749.6,
"learning_rate": 1.9369152030840554e-05,
"loss": 3137.8,
"step": 9120
},
{
"ce_loss_13": 2.37432479262352,
"ce_loss_26": 1.874118760228157,
"ce_loss_39": 1.6963829159736634,
"ce_loss_52": 1.4185152500867844,
"ce_loss_7": 2.692077511548996,
"epoch": 0.913,
"grad_norm": 14.634301127161054,
"kl_loss_13": 1966.6,
"kl_loss_26": 913.3,
"kl_loss_39": 546.15,
"kl_loss_7": 2640.0,
"learning_rate": 1.893422915663645e-05,
"loss": 3130.35,
"step": 9130
},
{
"ce_loss_13": 2.485106924176216,
"ce_loss_26": 1.9777852237224578,
"ce_loss_39": 1.7995707392692566,
"ce_loss_52": 1.4930359899997712,
"ce_loss_7": 2.800886517763138,
"epoch": 0.914,
"grad_norm": 13.987291564294976,
"kl_loss_13": 2050.4,
"kl_loss_26": 985.4,
"kl_loss_39": 605.6,
"kl_loss_7": 2732.4,
"learning_rate": 1.850415060940386e-05,
"loss": 3103.7,
"step": 9140
},
{
"ce_loss_13": 2.4284686923027037,
"ce_loss_26": 1.928224155306816,
"ce_loss_39": 1.7489481002092362,
"ce_loss_52": 1.4568757116794586,
"ce_loss_7": 2.7469853341579435,
"epoch": 0.915,
"grad_norm": 14.256565267478091,
"kl_loss_13": 2014.8,
"kl_loss_26": 950.5,
"kl_loss_39": 576.35,
"kl_loss_7": 2683.2,
"learning_rate": 1.8078920720028978e-05,
"loss": 3089.85,
"step": 9150
},
{
"ce_loss_13": 2.397159770131111,
"ce_loss_26": 1.9007071822881698,
"ce_loss_39": 1.7246300727128983,
"ce_loss_52": 1.4446155533194542,
"ce_loss_7": 2.7181631565093993,
"epoch": 0.916,
"grad_norm": 14.893106945439357,
"kl_loss_13": 1959.0,
"kl_loss_26": 905.8,
"kl_loss_39": 547.35,
"kl_loss_7": 2633.6,
"learning_rate": 1.765854377057219e-05,
"loss": 3113.25,
"step": 9160
},
{
"ce_loss_13": 2.391612654924393,
"ce_loss_26": 1.8863595336675645,
"ce_loss_39": 1.7089181810617446,
"ce_loss_52": 1.4133323535323143,
"ce_loss_7": 2.7117814838886263,
"epoch": 0.917,
"grad_norm": 13.727089751993333,
"kl_loss_13": 2026.6,
"kl_loss_26": 952.4,
"kl_loss_39": 579.55,
"kl_loss_7": 2699.2,
"learning_rate": 1.724302399422456e-05,
"loss": 3114.9,
"step": 9170
},
{
"ce_loss_13": 2.3898652464151384,
"ce_loss_26": 1.8965242326259613,
"ce_loss_39": 1.7190593391656876,
"ce_loss_52": 1.4282272264361382,
"ce_loss_7": 2.70514101088047,
"epoch": 0.918,
"grad_norm": 14.466760453933187,
"kl_loss_13": 1964.6,
"kl_loss_26": 920.2,
"kl_loss_39": 557.4,
"kl_loss_7": 2637.2,
"learning_rate": 1.683236557526574e-05,
"loss": 3117.7,
"step": 9180
},
{
"ce_loss_13": 2.363905116915703,
"ce_loss_26": 1.8732207268476486,
"ce_loss_39": 1.6985140055418015,
"ce_loss_52": 1.4114336684346198,
"ce_loss_7": 2.6808120787143705,
"epoch": 0.919,
"grad_norm": 13.740003598916001,
"kl_loss_13": 1964.6,
"kl_loss_26": 921.6,
"kl_loss_39": 557.8,
"kl_loss_7": 2632.0,
"learning_rate": 1.6426572649021475e-05,
"loss": 3121.6,
"step": 9190
},
{
"ce_loss_13": 2.4235911548137663,
"ce_loss_26": 1.911160832643509,
"ce_loss_39": 1.7327239394187928,
"ce_loss_52": 1.4489689737558364,
"ce_loss_7": 2.740164947509766,
"epoch": 0.92,
"grad_norm": 14.26925795970607,
"kl_loss_13": 2008.2,
"kl_loss_26": 933.2,
"kl_loss_39": 559.8,
"kl_loss_7": 2678.0,
"learning_rate": 1.6025649301821876e-05,
"loss": 3113.6,
"step": 9200
},
{
"ce_loss_13": 2.4683689922094345,
"ce_loss_26": 1.9488540649414063,
"ce_loss_39": 1.7631026744842528,
"ce_loss_52": 1.4620952308177948,
"ce_loss_7": 2.794441765546799,
"epoch": 0.921,
"grad_norm": 14.076823047271347,
"kl_loss_13": 2051.4,
"kl_loss_26": 970.8,
"kl_loss_39": 588.7,
"kl_loss_7": 2729.2,
"learning_rate": 1.5629599570960716e-05,
"loss": 3104.8,
"step": 9210
},
{
"ce_loss_13": 2.3585807204246523,
"ce_loss_26": 1.8531203657388686,
"ce_loss_39": 1.6820847302675248,
"ce_loss_52": 1.4014029562473298,
"ce_loss_7": 2.6779536455869675,
"epoch": 0.922,
"grad_norm": 13.947276602522905,
"kl_loss_13": 1987.2,
"kl_loss_26": 913.9,
"kl_loss_39": 557.25,
"kl_loss_7": 2664.4,
"learning_rate": 1.5238427444654367e-05,
"loss": 3096.2,
"step": 9220
},
{
"ce_loss_13": 2.3690007477998734,
"ce_loss_26": 1.8584237039089202,
"ce_loss_39": 1.6857000291347504,
"ce_loss_52": 1.3957198202610015,
"ce_loss_7": 2.697771596908569,
"epoch": 0.923,
"grad_norm": 13.90974821940606,
"kl_loss_13": 2016.4,
"kl_loss_26": 937.5,
"kl_loss_39": 574.65,
"kl_loss_7": 2710.0,
"learning_rate": 1.4852136862001764e-05,
"loss": 3120.95,
"step": 9230
},
{
"ce_loss_13": 2.386936154961586,
"ce_loss_26": 1.8795882225036622,
"ce_loss_39": 1.7048650175333022,
"ce_loss_52": 1.4252549767494203,
"ce_loss_7": 2.7094414860010145,
"epoch": 0.924,
"grad_norm": 14.075158349443141,
"kl_loss_13": 1994.4,
"kl_loss_26": 926.6,
"kl_loss_39": 558.55,
"kl_loss_7": 2674.0,
"learning_rate": 1.4470731712944884e-05,
"loss": 3095.9,
"step": 9240
},
{
"ce_loss_13": 2.480703926086426,
"ce_loss_26": 1.9693680822849273,
"ce_loss_39": 1.7812021166086196,
"ce_loss_52": 1.4721150636672973,
"ce_loss_7": 2.808130669593811,
"epoch": 0.925,
"grad_norm": 13.823292097408816,
"kl_loss_13": 2057.0,
"kl_loss_26": 989.9,
"kl_loss_39": 602.9,
"kl_loss_7": 2740.4,
"learning_rate": 1.4094215838229174e-05,
"loss": 3114.1,
"step": 9250
},
{
"ce_loss_13": 2.415296331048012,
"ce_loss_26": 1.9087094902992248,
"ce_loss_39": 1.7277994453907013,
"ce_loss_52": 1.4381081372499467,
"ce_loss_7": 2.738765448331833,
"epoch": 0.926,
"grad_norm": 14.170973390394844,
"kl_loss_13": 2029.4,
"kl_loss_26": 949.8,
"kl_loss_39": 570.55,
"kl_loss_7": 2714.0,
"learning_rate": 1.372259302936546e-05,
"loss": 3105.95,
"step": 9260
},
{
"ce_loss_13": 2.348677235841751,
"ce_loss_26": 1.8576316490769387,
"ce_loss_39": 1.6828459605574608,
"ce_loss_52": 1.3921316027641297,
"ce_loss_7": 2.6635967582464217,
"epoch": 0.927,
"grad_norm": 14.119512693749174,
"kl_loss_13": 1987.8,
"kl_loss_26": 934.2,
"kl_loss_39": 566.95,
"kl_loss_7": 2660.0,
"learning_rate": 1.3355867028591206e-05,
"loss": 3097.55,
"step": 9270
},
{
"ce_loss_13": 2.3999607056379317,
"ce_loss_26": 1.8902852058410644,
"ce_loss_39": 1.7127097964286804,
"ce_loss_52": 1.4248219341039658,
"ce_loss_7": 2.7230198085308075,
"epoch": 0.928,
"grad_norm": 14.447160188213138,
"kl_loss_13": 1992.2,
"kl_loss_26": 927.2,
"kl_loss_39": 561.6,
"kl_loss_7": 2679.2,
"learning_rate": 1.2994041528833267e-05,
"loss": 3090.65,
"step": 9280
},
{
"ce_loss_13": 2.502937263250351,
"ce_loss_26": 1.9786556929349899,
"ce_loss_39": 1.7848447173833848,
"ce_loss_52": 1.4703152477741241,
"ce_loss_7": 2.837230235338211,
"epoch": 0.929,
"grad_norm": 13.618067390259025,
"kl_loss_13": 2119.0,
"kl_loss_26": 1012.1,
"kl_loss_39": 614.9,
"kl_loss_7": 2810.8,
"learning_rate": 1.2637120173670358e-05,
"loss": 3139.05,
"step": 9290
},
{
"ce_loss_13": 2.4561884820461275,
"ce_loss_26": 1.9343136429786683,
"ce_loss_39": 1.747347640991211,
"ce_loss_52": 1.4338387340307235,
"ce_loss_7": 2.7828237235546114,
"epoch": 0.93,
"grad_norm": 14.04284428447472,
"kl_loss_13": 2098.8,
"kl_loss_26": 1005.3,
"kl_loss_39": 621.35,
"kl_loss_7": 2786.4,
"learning_rate": 1.2285106557296478e-05,
"loss": 3143.8,
"step": 9300
},
{
"ce_loss_13": 2.3713702976703646,
"ce_loss_26": 1.868654829263687,
"ce_loss_39": 1.6906698912382125,
"ce_loss_52": 1.409242296218872,
"ce_loss_7": 2.7044317960739135,
"epoch": 0.931,
"grad_norm": 14.323129542469303,
"kl_loss_13": 2006.0,
"kl_loss_26": 926.0,
"kl_loss_39": 564.0,
"kl_loss_7": 2676.4,
"learning_rate": 1.1938004224484989e-05,
"loss": 3116.75,
"step": 9310
},
{
"ce_loss_13": 2.428625673055649,
"ce_loss_26": 1.9227415055036545,
"ce_loss_39": 1.7408353060483932,
"ce_loss_52": 1.4440293073654176,
"ce_loss_7": 2.7480487704277037,
"epoch": 0.932,
"grad_norm": 13.286547502225938,
"kl_loss_13": 2016.8,
"kl_loss_26": 948.6,
"kl_loss_39": 573.45,
"kl_loss_7": 2695.6,
"learning_rate": 1.1595816670552429e-05,
"loss": 3098.9,
"step": 9320
},
{
"ce_loss_13": 2.3783950984477995,
"ce_loss_26": 1.871030893921852,
"ce_loss_39": 1.6935174107551574,
"ce_loss_52": 1.4064364448189735,
"ce_loss_7": 2.709024131298065,
"epoch": 0.933,
"grad_norm": 13.906014402683894,
"kl_loss_13": 2025.8,
"kl_loss_26": 943.2,
"kl_loss_39": 573.45,
"kl_loss_7": 2710.0,
"learning_rate": 1.1258547341323699e-05,
"loss": 3112.75,
"step": 9330
},
{
"ce_loss_13": 2.418292981386185,
"ce_loss_26": 1.9166706264019013,
"ce_loss_39": 1.7358173072338103,
"ce_loss_52": 1.4464493066072464,
"ce_loss_7": 2.7368280410766603,
"epoch": 0.934,
"grad_norm": 13.814674823123692,
"kl_loss_13": 2007.6,
"kl_loss_26": 936.8,
"kl_loss_39": 570.05,
"kl_loss_7": 2680.4,
"learning_rate": 1.0926199633097156e-05,
"loss": 3089.9,
"step": 9340
},
{
"ce_loss_13": 2.421270787715912,
"ce_loss_26": 1.9069751173257827,
"ce_loss_39": 1.7246074616909026,
"ce_loss_52": 1.4261210292577744,
"ce_loss_7": 2.7438779413700103,
"epoch": 0.935,
"grad_norm": 14.043137766458962,
"kl_loss_13": 2039.4,
"kl_loss_26": 967.3,
"kl_loss_39": 585.6,
"kl_loss_7": 2722.8,
"learning_rate": 1.0598776892610684e-05,
"loss": 3103.7,
"step": 9350
},
{
"ce_loss_13": 2.4761788189411162,
"ce_loss_26": 1.9670240104198455,
"ce_loss_39": 1.7913133591413497,
"ce_loss_52": 1.5009067565202714,
"ce_loss_7": 2.7962326526641847,
"epoch": 0.936,
"grad_norm": 13.716488729093573,
"kl_loss_13": 2008.0,
"kl_loss_26": 945.5,
"kl_loss_39": 573.65,
"kl_loss_7": 2685.6,
"learning_rate": 1.0276282417007399e-05,
"loss": 3106.15,
"step": 9360
},
{
"ce_loss_13": 2.418415975570679,
"ce_loss_26": 1.916797822713852,
"ce_loss_39": 1.741264235973358,
"ce_loss_52": 1.452787458896637,
"ce_loss_7": 2.744573098421097,
"epoch": 0.937,
"grad_norm": 13.43858517345926,
"kl_loss_13": 1999.2,
"kl_loss_26": 936.2,
"kl_loss_39": 569.25,
"kl_loss_7": 2682.0,
"learning_rate": 9.958719453803277e-06,
"loss": 3109.9,
"step": 9370
},
{
"ce_loss_13": 2.3895679712295532,
"ce_loss_26": 1.8777837812900544,
"ce_loss_39": 1.6948266059160233,
"ce_loss_52": 1.4046757638454437,
"ce_loss_7": 2.7213546216487883,
"epoch": 0.938,
"grad_norm": 14.159970740647127,
"kl_loss_13": 2015.8,
"kl_loss_26": 945.0,
"kl_loss_39": 569.8,
"kl_loss_7": 2706.4,
"learning_rate": 9.646091200853802e-06,
"loss": 3110.975,
"step": 9380
},
{
"ce_loss_13": 2.389327567815781,
"ce_loss_26": 1.8952437072992325,
"ce_loss_39": 1.7221183687448502,
"ce_loss_52": 1.4358048617839814,
"ce_loss_7": 2.7049793720245363,
"epoch": 0.939,
"grad_norm": 13.344604295880838,
"kl_loss_13": 1964.8,
"kl_loss_26": 921.8,
"kl_loss_39": 560.1,
"kl_loss_7": 2638.0,
"learning_rate": 9.338400806321978e-06,
"loss": 3087.6,
"step": 9390
},
{
"ce_loss_13": 2.4069793194532396,
"ce_loss_26": 1.8967778533697128,
"ce_loss_39": 1.7197368562221527,
"ce_loss_52": 1.436858707666397,
"ce_loss_7": 2.7245797514915466,
"epoch": 0.94,
"grad_norm": 13.71832049440062,
"kl_loss_13": 2002.0,
"kl_loss_26": 937.3,
"kl_loss_39": 566.65,
"kl_loss_7": 2677.2,
"learning_rate": 9.035651368646646e-06,
"loss": 3124.5,
"step": 9400
},
{
"ce_loss_13": 2.3785787016153335,
"ce_loss_26": 1.8779745906591416,
"ce_loss_39": 1.7003175497055054,
"ce_loss_52": 1.4152926355600357,
"ce_loss_7": 2.6983864098787307,
"epoch": 0.941,
"grad_norm": 14.37750392989465,
"kl_loss_13": 1983.0,
"kl_loss_26": 932.4,
"kl_loss_39": 563.55,
"kl_loss_7": 2649.2,
"learning_rate": 8.737845936511335e-06,
"loss": 3126.15,
"step": 9410
},
{
"ce_loss_13": 2.4120055079460143,
"ce_loss_26": 1.898421436548233,
"ce_loss_39": 1.7264380306005478,
"ce_loss_52": 1.4368474900722503,
"ce_loss_7": 2.737997555732727,
"epoch": 0.942,
"grad_norm": 13.918132865929852,
"kl_loss_13": 2037.6,
"kl_loss_26": 952.9,
"kl_loss_39": 578.25,
"kl_loss_7": 2716.0,
"learning_rate": 8.444987508813451e-06,
"loss": 3086.75,
"step": 9420
},
{
"ce_loss_13": 2.4210041254758834,
"ce_loss_26": 1.9090048849582673,
"ce_loss_39": 1.732128456234932,
"ce_loss_52": 1.4372442662715912,
"ce_loss_7": 2.743331879377365,
"epoch": 0.943,
"grad_norm": 13.763467997379733,
"kl_loss_13": 2025.8,
"kl_loss_26": 953.0,
"kl_loss_39": 582.75,
"kl_loss_7": 2709.6,
"learning_rate": 8.157079034633974e-06,
"loss": 3102.0,
"step": 9430
},
{
"ce_loss_13": 2.390827241539955,
"ce_loss_26": 1.8993241131305694,
"ce_loss_39": 1.720950961112976,
"ce_loss_52": 1.4380876436829566,
"ce_loss_7": 2.708938491344452,
"epoch": 0.944,
"grad_norm": 13.65095927372199,
"kl_loss_13": 1961.4,
"kl_loss_26": 921.2,
"kl_loss_39": 551.1,
"kl_loss_7": 2626.8,
"learning_rate": 7.874123413208145e-06,
"loss": 3097.4,
"step": 9440
},
{
"ce_loss_13": 2.377914309501648,
"ce_loss_26": 1.8749269813299179,
"ce_loss_39": 1.699908110499382,
"ce_loss_52": 1.4185950323939323,
"ce_loss_7": 2.700740724802017,
"epoch": 0.945,
"grad_norm": 13.057374043926089,
"kl_loss_13": 2000.2,
"kl_loss_26": 931.2,
"kl_loss_39": 558.45,
"kl_loss_7": 2681.6,
"learning_rate": 7.59612349389599e-06,
"loss": 3113.0,
"step": 9450
},
{
"ce_loss_13": 2.428489762544632,
"ce_loss_26": 1.9228288322687148,
"ce_loss_39": 1.7434315174818038,
"ce_loss_52": 1.4502023369073869,
"ce_loss_7": 2.745371562242508,
"epoch": 0.946,
"grad_norm": 13.467471129956634,
"kl_loss_13": 2003.8,
"kl_loss_26": 954.8,
"kl_loss_39": 579.9,
"kl_loss_7": 2674.8,
"learning_rate": 7.323082076153509e-06,
"loss": 3110.35,
"step": 9460
},
{
"ce_loss_13": 2.3993911921977995,
"ce_loss_26": 1.8969668239355086,
"ce_loss_39": 1.7201234728097916,
"ce_loss_52": 1.4261724770069122,
"ce_loss_7": 2.723515260219574,
"epoch": 0.947,
"grad_norm": 14.123562965002943,
"kl_loss_13": 1995.2,
"kl_loss_26": 940.5,
"kl_loss_39": 572.85,
"kl_loss_7": 2665.2,
"learning_rate": 7.055001909504755e-06,
"loss": 3103.5,
"step": 9470
},
{
"ce_loss_13": 2.371204599738121,
"ce_loss_26": 1.8650381177663804,
"ce_loss_39": 1.6901476740837098,
"ce_loss_52": 1.4073528528213501,
"ce_loss_7": 2.69525728225708,
"epoch": 0.948,
"grad_norm": 13.7395501584386,
"kl_loss_13": 2001.2,
"kl_loss_26": 926.3,
"kl_loss_39": 559.35,
"kl_loss_7": 2675.2,
"learning_rate": 6.791885693514133e-06,
"loss": 3117.9,
"step": 9480
},
{
"ce_loss_13": 2.3874946534633636,
"ce_loss_26": 1.8710165858268737,
"ce_loss_39": 1.6918294131755829,
"ce_loss_52": 1.4066254168748855,
"ce_loss_7": 2.718043899536133,
"epoch": 0.949,
"grad_norm": 14.444149745541107,
"kl_loss_13": 2026.6,
"kl_loss_26": 936.5,
"kl_loss_39": 563.35,
"kl_loss_7": 2720.4,
"learning_rate": 6.533736077758867e-06,
"loss": 3144.35,
"step": 9490
},
{
"ce_loss_13": 2.3802697598934173,
"ce_loss_26": 1.8696208387613296,
"ce_loss_39": 1.6913905203342439,
"ce_loss_52": 1.4053042978048325,
"ce_loss_7": 2.706346648931503,
"epoch": 0.95,
"grad_norm": 13.683323615533283,
"kl_loss_13": 2001.6,
"kl_loss_26": 931.6,
"kl_loss_39": 565.45,
"kl_loss_7": 2685.6,
"learning_rate": 6.2805556618028556e-06,
"loss": 3132.15,
"step": 9500
},
{
"ce_loss_13": 2.45430488884449,
"ce_loss_26": 1.9479888796806335,
"ce_loss_39": 1.7709876328706742,
"ce_loss_52": 1.4815271288156509,
"ce_loss_7": 2.7643910527229307,
"epoch": 0.951,
"grad_norm": 14.559144572280042,
"kl_loss_13": 2000.2,
"kl_loss_26": 946.3,
"kl_loss_39": 575.65,
"kl_loss_7": 2660.0,
"learning_rate": 6.032346995169968e-06,
"loss": 3124.85,
"step": 9510
},
{
"ce_loss_13": 2.4814863801002502,
"ce_loss_26": 1.976859924197197,
"ce_loss_39": 1.795655995607376,
"ce_loss_52": 1.4911505609750748,
"ce_loss_7": 2.796732819080353,
"epoch": 0.952,
"grad_norm": 14.307827869099665,
"kl_loss_13": 2052.0,
"kl_loss_26": 982.8,
"kl_loss_39": 599.65,
"kl_loss_7": 2722.8,
"learning_rate": 5.789112577318789e-06,
"loss": 3131.25,
"step": 9520
},
{
"ce_loss_13": 2.370393967628479,
"ce_loss_26": 1.8703551948070527,
"ce_loss_39": 1.6911178916692733,
"ce_loss_52": 1.397288253903389,
"ce_loss_7": 2.6921759128570555,
"epoch": 0.953,
"grad_norm": 13.532268906242434,
"kl_loss_13": 2004.2,
"kl_loss_26": 951.2,
"kl_loss_39": 578.45,
"kl_loss_7": 2674.4,
"learning_rate": 5.550854857617194e-06,
"loss": 3093.1,
"step": 9530
},
{
"ce_loss_13": 2.3659786969423293,
"ce_loss_26": 1.855891814827919,
"ce_loss_39": 1.6768086194992065,
"ce_loss_52": 1.3908632963895797,
"ce_loss_7": 2.6853357315063477,
"epoch": 0.954,
"grad_norm": 14.753075788642166,
"kl_loss_13": 2010.2,
"kl_loss_26": 935.5,
"kl_loss_39": 563.55,
"kl_loss_7": 2681.6,
"learning_rate": 5.317576235317756e-06,
"loss": 3120.8,
"step": 9540
},
{
"ce_loss_13": 2.4337273120880125,
"ce_loss_26": 1.9248733311891555,
"ce_loss_39": 1.7499325275421143,
"ce_loss_52": 1.4674480736255646,
"ce_loss_7": 2.7530871987342835,
"epoch": 0.955,
"grad_norm": 13.311149092546296,
"kl_loss_13": 1991.6,
"kl_loss_26": 925.5,
"kl_loss_39": 562.15,
"kl_loss_7": 2663.6,
"learning_rate": 5.089279059533658e-06,
"loss": 3080.15,
"step": 9550
},
{
"ce_loss_13": 2.477786514163017,
"ce_loss_26": 1.9535741955041885,
"ce_loss_39": 1.7720552951097488,
"ce_loss_52": 1.468274374306202,
"ce_loss_7": 2.806012988090515,
"epoch": 0.956,
"grad_norm": 13.557904970493434,
"kl_loss_13": 2082.8,
"kl_loss_26": 980.6,
"kl_loss_39": 601.5,
"kl_loss_7": 2773.6,
"learning_rate": 4.865965629214819e-06,
"loss": 3106.7,
"step": 9560
},
{
"ce_loss_13": 2.4565936863422393,
"ce_loss_26": 1.9573397368192673,
"ce_loss_39": 1.773080477118492,
"ce_loss_52": 1.4730938911437987,
"ce_loss_7": 2.7732574224472044,
"epoch": 0.957,
"grad_norm": 14.224632183872556,
"kl_loss_13": 2023.4,
"kl_loss_26": 967.5,
"kl_loss_39": 591.9,
"kl_loss_7": 2693.2,
"learning_rate": 4.6476381931251366e-06,
"loss": 3121.6,
"step": 9570
},
{
"ce_loss_13": 2.3847708880901335,
"ce_loss_26": 1.8858718812465667,
"ce_loss_39": 1.7032645136117934,
"ce_loss_52": 1.4209994703531266,
"ce_loss_7": 2.708657431602478,
"epoch": 0.958,
"grad_norm": 13.699065805322247,
"kl_loss_13": 1983.8,
"kl_loss_26": 921.9,
"kl_loss_39": 551.45,
"kl_loss_7": 2664.4,
"learning_rate": 4.434298949819449e-06,
"loss": 3097.9,
"step": 9580
},
{
"ce_loss_13": 2.417462554574013,
"ce_loss_26": 1.9159747958183289,
"ce_loss_39": 1.7397230744361878,
"ce_loss_52": 1.4456302881240846,
"ce_loss_7": 2.7354709684848784,
"epoch": 0.959,
"grad_norm": 13.043144407859455,
"kl_loss_13": 1990.0,
"kl_loss_26": 941.8,
"kl_loss_39": 574.8,
"kl_loss_7": 2662.4,
"learning_rate": 4.2259500476214406e-06,
"loss": 3095.8,
"step": 9590
},
{
"ce_loss_13": 2.427861177921295,
"ce_loss_26": 1.9228525012731552,
"ce_loss_39": 1.7392638593912124,
"ce_loss_52": 1.445976984500885,
"ce_loss_7": 2.7459777116775514,
"epoch": 0.96,
"grad_norm": 13.619745852010974,
"kl_loss_13": 2021.4,
"kl_loss_26": 954.5,
"kl_loss_39": 577.9,
"kl_loss_7": 2698.8,
"learning_rate": 4.02259358460233e-06,
"loss": 3122.9,
"step": 9600
},
{
"ce_loss_13": 2.46402502655983,
"ce_loss_26": 1.9587235629558564,
"ce_loss_39": 1.7794159650802612,
"ce_loss_52": 1.4795134991407395,
"ce_loss_7": 2.7920902401208876,
"epoch": 0.961,
"grad_norm": 13.957824607337622,
"kl_loss_13": 2034.8,
"kl_loss_26": 964.3,
"kl_loss_39": 588.35,
"kl_loss_7": 2719.6,
"learning_rate": 3.8242316085594916e-06,
"loss": 3106.9,
"step": 9610
},
{
"ce_loss_13": 2.4026571094989775,
"ce_loss_26": 1.8842627108097076,
"ce_loss_39": 1.6962791502475738,
"ce_loss_52": 1.4012041926383971,
"ce_loss_7": 2.7310706257820128,
"epoch": 0.962,
"grad_norm": 13.847893098332637,
"kl_loss_13": 2054.8,
"kl_loss_26": 965.5,
"kl_loss_39": 579.7,
"kl_loss_7": 2743.2,
"learning_rate": 3.630866116995757e-06,
"loss": 3149.6,
"step": 9620
},
{
"ce_loss_13": 2.3699823945760725,
"ce_loss_26": 1.8775872141122818,
"ce_loss_39": 1.7078934848308562,
"ce_loss_52": 1.427069191634655,
"ce_loss_7": 2.68268860578537,
"epoch": 0.963,
"grad_norm": 14.006927808523123,
"kl_loss_13": 1949.4,
"kl_loss_26": 909.6,
"kl_loss_39": 550.8,
"kl_loss_7": 2613.6,
"learning_rate": 3.4424990570994797e-06,
"loss": 3088.75,
"step": 9630
},
{
"ce_loss_13": 2.4296331614255906,
"ce_loss_26": 1.923089200258255,
"ce_loss_39": 1.7481043189764023,
"ce_loss_52": 1.458402395248413,
"ce_loss_7": 2.7494013249874114,
"epoch": 0.964,
"grad_norm": 13.971614829348757,
"kl_loss_13": 1997.2,
"kl_loss_26": 933.6,
"kl_loss_39": 570.15,
"kl_loss_7": 2669.6,
"learning_rate": 3.2591323257248896e-06,
"loss": 3114.2,
"step": 9640
},
{
"ce_loss_13": 2.4253605216741563,
"ce_loss_26": 1.920077031850815,
"ce_loss_39": 1.7424649715423584,
"ce_loss_52": 1.4570231169462204,
"ce_loss_7": 2.744251537322998,
"epoch": 0.965,
"grad_norm": 13.868560987861438,
"kl_loss_13": 2000.2,
"kl_loss_26": 937.9,
"kl_loss_39": 567.9,
"kl_loss_7": 2672.0,
"learning_rate": 3.0807677693729385e-06,
"loss": 3117.15,
"step": 9650
},
{
"ce_loss_13": 2.445168226957321,
"ce_loss_26": 1.9308179676532746,
"ce_loss_39": 1.7556968212127686,
"ce_loss_52": 1.462259876728058,
"ce_loss_7": 2.7607338547706606,
"epoch": 0.966,
"grad_norm": 13.868630790191208,
"kl_loss_13": 2030.0,
"kl_loss_26": 955.1,
"kl_loss_39": 582.55,
"kl_loss_7": 2710.4,
"learning_rate": 2.9074071841727055e-06,
"loss": 3136.15,
"step": 9660
},
{
"ce_loss_13": 2.3746090680360794,
"ce_loss_26": 1.8696930974721908,
"ce_loss_39": 1.6932952284812928,
"ce_loss_52": 1.4037827536463738,
"ce_loss_7": 2.6961917489767075,
"epoch": 0.967,
"grad_norm": 13.741681034653173,
"kl_loss_13": 2003.4,
"kl_loss_26": 943.6,
"kl_loss_39": 570.8,
"kl_loss_7": 2675.6,
"learning_rate": 2.739052315863355e-06,
"loss": 3123.325,
"step": 9670
},
{
"ce_loss_13": 2.4609276592731475,
"ce_loss_26": 1.9465218961238862,
"ce_loss_39": 1.755302396416664,
"ce_loss_52": 1.4513660728931428,
"ce_loss_7": 2.783074140548706,
"epoch": 0.968,
"grad_norm": 13.849931878044563,
"kl_loss_13": 2075.0,
"kl_loss_26": 988.8,
"kl_loss_39": 596.8,
"kl_loss_7": 2755.6,
"learning_rate": 2.5757048597765396e-06,
"loss": 3108.55,
"step": 9680
},
{
"ce_loss_13": 2.3671315789222716,
"ce_loss_26": 1.8652487874031067,
"ce_loss_39": 1.6895152300596237,
"ce_loss_52": 1.4115738093852996,
"ce_loss_7": 2.6859952569007874,
"epoch": 0.969,
"grad_norm": 14.271644916152136,
"kl_loss_13": 1975.0,
"kl_loss_26": 910.5,
"kl_loss_39": 544.7,
"kl_loss_7": 2649.2,
"learning_rate": 2.417366460819359e-06,
"loss": 3094.15,
"step": 9690
},
{
"ce_loss_13": 2.4030214190483092,
"ce_loss_26": 1.8979378938674927,
"ce_loss_39": 1.7221683353185653,
"ce_loss_52": 1.4381350710988046,
"ce_loss_7": 2.723929351568222,
"epoch": 0.97,
"grad_norm": 13.991735266099393,
"kl_loss_13": 1985.0,
"kl_loss_26": 927.7,
"kl_loss_39": 554.55,
"kl_loss_7": 2656.0,
"learning_rate": 2.2640387134577057e-06,
"loss": 3121.05,
"step": 9700
},
{
"ce_loss_13": 2.388926792144775,
"ce_loss_26": 1.8821999937295915,
"ce_loss_39": 1.7065987050533296,
"ce_loss_52": 1.427994754910469,
"ce_loss_7": 2.7048760533332823,
"epoch": 0.971,
"grad_norm": 14.179862402525806,
"kl_loss_13": 1973.6,
"kl_loss_26": 911.0,
"kl_loss_39": 548.95,
"kl_loss_7": 2646.4,
"learning_rate": 2.115723161700278e-06,
"loss": 3136.0,
"step": 9710
},
{
"ce_loss_13": 2.448368564248085,
"ce_loss_26": 1.9373161673545838,
"ce_loss_39": 1.7496683716773986,
"ce_loss_52": 1.4501112252473831,
"ce_loss_7": 2.7681332349777223,
"epoch": 0.972,
"grad_norm": 13.312305733104958,
"kl_loss_13": 2063.8,
"kl_loss_26": 986.5,
"kl_loss_39": 593.8,
"kl_loss_7": 2737.6,
"learning_rate": 1.9724212990830937e-06,
"loss": 3096.45,
"step": 9720
},
{
"ce_loss_13": 2.40165196955204,
"ce_loss_26": 1.9097970753908158,
"ce_loss_39": 1.732149314880371,
"ce_loss_52": 1.4418910443782806,
"ce_loss_7": 2.7182520925998688,
"epoch": 0.973,
"grad_norm": 13.186413860645287,
"kl_loss_13": 1986.6,
"kl_loss_26": 939.2,
"kl_loss_39": 575.8,
"kl_loss_7": 2649.6,
"learning_rate": 1.8341345686543331e-06,
"loss": 3096.7,
"step": 9730
},
{
"ce_loss_13": 2.4751327097415925,
"ce_loss_26": 1.9718121886253357,
"ce_loss_39": 1.7938049882650375,
"ce_loss_52": 1.510325726866722,
"ce_loss_7": 2.790462166070938,
"epoch": 0.974,
"grad_norm": 13.542368115510277,
"kl_loss_13": 1991.0,
"kl_loss_26": 919.4,
"kl_loss_39": 555.25,
"kl_loss_7": 2656.0,
"learning_rate": 1.7008643629596864e-06,
"loss": 3139.35,
"step": 9740
},
{
"ce_loss_13": 2.448350805044174,
"ce_loss_26": 1.9393187165260315,
"ce_loss_39": 1.756307190656662,
"ce_loss_52": 1.4625631257891656,
"ce_loss_7": 2.7839869439601896,
"epoch": 0.975,
"grad_norm": 14.153531935288711,
"kl_loss_13": 2030.2,
"kl_loss_26": 953.5,
"kl_loss_39": 573.5,
"kl_loss_7": 2731.6,
"learning_rate": 1.5726120240288633e-06,
"loss": 3091.65,
"step": 9750
},
{
"ce_loss_13": 2.493607670068741,
"ce_loss_26": 1.9704292267560959,
"ce_loss_39": 1.7810406684875488,
"ce_loss_52": 1.4734347879886627,
"ce_loss_7": 2.8225875020027162,
"epoch": 0.976,
"grad_norm": 13.917924014106907,
"kl_loss_13": 2092.8,
"kl_loss_26": 995.9,
"kl_loss_39": 599.65,
"kl_loss_7": 2777.6,
"learning_rate": 1.4493788433612708e-06,
"loss": 3106.15,
"step": 9760
},
{
"ce_loss_13": 2.389453822374344,
"ce_loss_26": 1.889643257856369,
"ce_loss_39": 1.7158276617527009,
"ce_loss_52": 1.4275161743164062,
"ce_loss_7": 2.7086060285568236,
"epoch": 0.977,
"grad_norm": 13.614062458586098,
"kl_loss_13": 1971.2,
"kl_loss_26": 924.0,
"kl_loss_39": 561.0,
"kl_loss_7": 2646.0,
"learning_rate": 1.3311660619138578e-06,
"loss": 3083.9,
"step": 9770
},
{
"ce_loss_13": 2.387269985675812,
"ce_loss_26": 1.8734027475118638,
"ce_loss_39": 1.6912487357854844,
"ce_loss_52": 1.4031418770551682,
"ce_loss_7": 2.7191080808639527,
"epoch": 0.978,
"grad_norm": 14.36678395992836,
"kl_loss_13": 2023.2,
"kl_loss_26": 943.6,
"kl_loss_39": 565.3,
"kl_loss_7": 2722.0,
"learning_rate": 1.2179748700879012e-06,
"loss": 3100.55,
"step": 9780
},
{
"ce_loss_13": 2.3631068110466003,
"ce_loss_26": 1.861675202846527,
"ce_loss_39": 1.6827853351831437,
"ce_loss_52": 1.3983743026852609,
"ce_loss_7": 2.6796926259994507,
"epoch": 0.979,
"grad_norm": 14.06766234963321,
"kl_loss_13": 1993.6,
"kl_loss_26": 930.1,
"kl_loss_39": 561.8,
"kl_loss_7": 2668.4,
"learning_rate": 1.1098064077174619e-06,
"loss": 3119.95,
"step": 9790
},
{
"ce_loss_13": 2.4609683632850645,
"ce_loss_26": 1.948616126179695,
"ce_loss_39": 1.7647934973239898,
"ce_loss_52": 1.454009547829628,
"ce_loss_7": 2.785689663887024,
"epoch": 0.98,
"grad_norm": 13.39999724987333,
"kl_loss_13": 2070.0,
"kl_loss_26": 989.6,
"kl_loss_39": 607.8,
"kl_loss_7": 2749.6,
"learning_rate": 1.006661764057837e-06,
"loss": 3101.0,
"step": 9800
},
{
"ce_loss_13": 2.3849492847919462,
"ce_loss_26": 1.8622053205966949,
"ce_loss_39": 1.6866901487112045,
"ce_loss_52": 1.3904988124966622,
"ce_loss_7": 2.713945233821869,
"epoch": 0.981,
"grad_norm": 13.776704297392317,
"kl_loss_13": 2060.2,
"kl_loss_26": 959.2,
"kl_loss_39": 584.2,
"kl_loss_7": 2753.2,
"learning_rate": 9.085419777743465e-07,
"loss": 3145.375,
"step": 9810
},
{
"ce_loss_13": 2.423547920584679,
"ce_loss_26": 1.9212449431419372,
"ce_loss_39": 1.7484615802764893,
"ce_loss_52": 1.4525489255785942,
"ce_loss_7": 2.7415110945701597,
"epoch": 0.982,
"grad_norm": 13.800146507088886,
"kl_loss_13": 2026.2,
"kl_loss_26": 956.5,
"kl_loss_39": 585.85,
"kl_loss_7": 2700.8,
"learning_rate": 8.15448036932176e-07,
"loss": 3140.075,
"step": 9820
},
{
"ce_loss_13": 2.4265142381191254,
"ce_loss_26": 1.9230076640844345,
"ce_loss_39": 1.7403117150068284,
"ce_loss_52": 1.44933120906353,
"ce_loss_7": 2.743628019094467,
"epoch": 0.983,
"grad_norm": 13.731982955757704,
"kl_loss_13": 2035.6,
"kl_loss_26": 967.5,
"kl_loss_39": 589.65,
"kl_loss_7": 2710.4,
"learning_rate": 7.273808789862724e-07,
"loss": 3097.325,
"step": 9830
},
{
"ce_loss_13": 2.432727184891701,
"ce_loss_26": 1.9222358494997025,
"ce_loss_39": 1.7392481476068498,
"ce_loss_52": 1.4496447369456291,
"ce_loss_7": 2.7563742280006407,
"epoch": 0.984,
"grad_norm": 14.26920700265725,
"kl_loss_13": 2031.0,
"kl_loss_26": 951.8,
"kl_loss_39": 579.1,
"kl_loss_7": 2718.0,
"learning_rate": 6.443413907720186e-07,
"loss": 3091.6,
"step": 9840
},
{
"ce_loss_13": 2.3425186455249785,
"ce_loss_26": 1.8573100596666337,
"ce_loss_39": 1.6887821286916733,
"ce_loss_52": 1.3970566481351852,
"ce_loss_7": 2.6601881802082064,
"epoch": 0.985,
"grad_norm": 14.017763045011195,
"kl_loss_13": 1953.8,
"kl_loss_26": 920.7,
"kl_loss_39": 567.3,
"kl_loss_7": 2625.2,
"learning_rate": 5.663304084960185e-07,
"loss": 3110.05,
"step": 9850
},
{
"ce_loss_13": 2.3719563096761704,
"ce_loss_26": 1.8737152755260467,
"ce_loss_39": 1.698423257470131,
"ce_loss_52": 1.4208501130342484,
"ce_loss_7": 2.6962190210819243,
"epoch": 0.986,
"grad_norm": 14.355657718500265,
"kl_loss_13": 1944.8,
"kl_loss_26": 901.7,
"kl_loss_39": 537.7,
"kl_loss_7": 2617.8,
"learning_rate": 4.933487177280482e-07,
"loss": 3084.175,
"step": 9860
},
{
"ce_loss_13": 2.4431921422481535,
"ce_loss_26": 1.9350731909275054,
"ce_loss_39": 1.7554681122303009,
"ce_loss_52": 1.4598491072654725,
"ce_loss_7": 2.761775279045105,
"epoch": 0.987,
"grad_norm": 14.504857971707292,
"kl_loss_13": 2021.8,
"kl_loss_26": 959.6,
"kl_loss_39": 582.75,
"kl_loss_7": 2686.8,
"learning_rate": 4.2539705339295075e-07,
"loss": 3095.3,
"step": 9870
},
{
"ce_loss_13": 2.3912162601947786,
"ce_loss_26": 1.8867509424686433,
"ce_loss_39": 1.7147331923246383,
"ce_loss_52": 1.4274780035018921,
"ce_loss_7": 2.7152205407619476,
"epoch": 0.988,
"grad_norm": 13.898651253722077,
"kl_loss_13": 1978.6,
"kl_loss_26": 918.4,
"kl_loss_39": 560.0,
"kl_loss_7": 2653.6,
"learning_rate": 3.6247609976319816e-07,
"loss": 3111.5,
"step": 9880
},
{
"ce_loss_13": 2.4592268586158754,
"ce_loss_26": 1.9460572868585586,
"ce_loss_39": 1.7624834805727005,
"ce_loss_52": 1.4708713114261627,
"ce_loss_7": 2.780032974481583,
"epoch": 0.989,
"grad_norm": 13.162597089199776,
"kl_loss_13": 2017.4,
"kl_loss_26": 950.2,
"kl_loss_39": 574.95,
"kl_loss_7": 2680.0,
"learning_rate": 3.0458649045211895e-07,
"loss": 3104.75,
"step": 9890
},
{
"ce_loss_13": 2.318621850013733,
"ce_loss_26": 1.8226055085659028,
"ce_loss_39": 1.6485673993825913,
"ce_loss_52": 1.3747537702322006,
"ce_loss_7": 2.6328552305698394,
"epoch": 0.99,
"grad_norm": 14.177492293626315,
"kl_loss_13": 1957.6,
"kl_loss_26": 901.5,
"kl_loss_39": 543.2,
"kl_loss_7": 2623.6,
"learning_rate": 2.517288084074587e-07,
"loss": 3097.8,
"step": 9900
},
{
"ce_loss_13": 2.450270253419876,
"ce_loss_26": 1.9614087045192719,
"ce_loss_39": 1.820476683974266,
"ce_loss_52": 1.4863917350769043,
"ce_loss_7": 2.776776838302612,
"epoch": 0.991,
"grad_norm": 13.637096409193225,
"kl_loss_13": 2041.0,
"kl_loss_26": 993.3,
"kl_loss_39": 622.6,
"kl_loss_7": 2724.0,
"learning_rate": 2.0390358590538505e-07,
"loss": 3133.55,
"step": 9910
},
{
"ce_loss_13": 2.4164200723171234,
"ce_loss_26": 1.9075238525867462,
"ce_loss_39": 1.7269282668828965,
"ce_loss_52": 1.4252729326486588,
"ce_loss_7": 2.7410891175270082,
"epoch": 0.992,
"grad_norm": 14.06088505281845,
"kl_loss_13": 2041.6,
"kl_loss_26": 973.6,
"kl_loss_39": 593.7,
"kl_loss_7": 2722.8,
"learning_rate": 1.61111304545436e-07,
"loss": 3101.25,
"step": 9920
},
{
"ce_loss_13": 2.4260194152593613,
"ce_loss_26": 1.9236795336008072,
"ce_loss_39": 1.7412341982126236,
"ce_loss_52": 1.445477369427681,
"ce_loss_7": 2.750396305322647,
"epoch": 0.993,
"grad_norm": 13.813190658092928,
"kl_loss_13": 2043.4,
"kl_loss_26": 963.8,
"kl_loss_39": 591.1,
"kl_loss_7": 2727.2,
"learning_rate": 1.2335239524541298e-07,
"loss": 3113.2,
"step": 9930
},
{
"ce_loss_13": 2.4003034621477126,
"ce_loss_26": 1.8995478272438049,
"ce_loss_39": 1.7172971665859222,
"ce_loss_52": 1.4267783105373382,
"ce_loss_7": 2.730258399248123,
"epoch": 0.994,
"grad_norm": 14.088698553106907,
"kl_loss_13": 2017.2,
"kl_loss_26": 955.6,
"kl_loss_39": 580.2,
"kl_loss_7": 2698.8,
"learning_rate": 9.06272382371065e-08,
"loss": 3112.45,
"step": 9940
},
{
"ce_loss_13": 2.3436710268259047,
"ce_loss_26": 1.860148760676384,
"ce_loss_39": 1.6837237626314163,
"ce_loss_52": 1.405136799812317,
"ce_loss_7": 2.659210926294327,
"epoch": 0.995,
"grad_norm": 13.69734193648067,
"kl_loss_13": 1922.2,
"kl_loss_26": 893.6,
"kl_loss_39": 537.7,
"kl_loss_7": 2583.2,
"learning_rate": 6.293616306246586e-08,
"loss": 3122.45,
"step": 9950
},
{
"ce_loss_13": 2.4028525710105897,
"ce_loss_26": 1.9053959518671035,
"ce_loss_39": 1.734974354505539,
"ce_loss_52": 1.4462752103805543,
"ce_loss_7": 2.7197851181030273,
"epoch": 0.996,
"grad_norm": 14.052327245568536,
"kl_loss_13": 1985.8,
"kl_loss_26": 933.9,
"kl_loss_39": 572.05,
"kl_loss_7": 2659.2,
"learning_rate": 4.027944857032395e-08,
"loss": 3115.75,
"step": 9960
},
{
"ce_loss_13": 2.3983709454536437,
"ce_loss_26": 1.897152093052864,
"ce_loss_39": 1.7151427894830704,
"ce_loss_52": 1.4271936371922493,
"ce_loss_7": 2.7171947032213213,
"epoch": 0.997,
"grad_norm": 13.53314291583234,
"kl_loss_13": 1999.0,
"kl_loss_26": 942.7,
"kl_loss_39": 568.8,
"kl_loss_7": 2670.0,
"learning_rate": 2.265732291356626e-08,
"loss": 3094.175,
"step": 9970
},
{
"ce_loss_13": 2.3423147082328795,
"ce_loss_26": 1.8376368135213852,
"ce_loss_39": 1.6704195857048034,
"ce_loss_52": 1.3995309814810752,
"ce_loss_7": 2.6571378737688063,
"epoch": 0.998,
"grad_norm": 13.139494465989356,
"kl_loss_13": 1954.0,
"kl_loss_26": 894.2,
"kl_loss_39": 540.1,
"kl_loss_7": 2622.6,
"learning_rate": 1.0069963546743833e-08,
"loss": 3091.8,
"step": 9980
},
{
"ce_loss_13": 2.371971958875656,
"ce_loss_26": 1.8749190032482148,
"ce_loss_39": 1.7022694885730743,
"ce_loss_52": 1.4237666621804237,
"ce_loss_7": 2.694683998823166,
"epoch": 0.999,
"grad_norm": 13.948245708314273,
"kl_loss_13": 1952.8,
"kl_loss_26": 907.5,
"kl_loss_39": 547.75,
"kl_loss_7": 2620.8,
"learning_rate": 2.517497224463483e-09,
"loss": 3089.0,
"step": 9990
},
{
"ce_loss_13": 2.4080661326646804,
"ce_loss_26": 1.896571347117424,
"ce_loss_39": 1.7096484139561654,
"ce_loss_52": 1.4170419454574585,
"ce_loss_7": 2.7374835878610613,
"epoch": 1.0,
"grad_norm": 13.857134097975196,
"kl_loss_13": 2044.2,
"kl_loss_26": 961.3,
"kl_loss_39": 577.05,
"kl_loss_7": 2731.6,
"learning_rate": 0.0,
"loss": 3103.8,
"step": 10000
}
],
"logging_steps": 10,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0167830278176768e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}