Balcony-Model32 / trainer_state.json
adpretko's picture
Upload folder using huggingface_hub
744d064 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"ce_loss_13": 6.402231216430664,
"ce_loss_17": 3.2689143419265747,
"ce_loss_2": 9.390046119689941,
"ce_loss_4": 9.192162036895752,
"ce_loss_9": 6.634675979614258,
"epoch": 0.0001,
"grad_norm": 37632.0,
"kl_loss_13": 7664.65576171875,
"kl_loss_2": 12785.50146484375,
"kl_loss_4": 12609.859375,
"kl_loss_9": 7861.16552734375,
"learning_rate": 1e-05,
"loss": 10554.3262,
"step": 1
},
{
"ce_loss_13": 5.711873133977254,
"ce_loss_17": 3.3221818341149225,
"ce_loss_2": 8.177524778578016,
"ce_loss_4": 7.773941384421454,
"ce_loss_9": 5.815383354822795,
"epoch": 0.001,
"grad_norm": 16640.0,
"kl_loss_13": 5945.98235405816,
"kl_loss_2": 9901.66468641493,
"kl_loss_4": 9238.261610243055,
"kl_loss_9": 5542.482516818576,
"learning_rate": 0.0001,
"loss": 7684.5868,
"step": 10
},
{
"ce_loss_13": 4.2000329971313475,
"ce_loss_17": 3.3278347134590147,
"ce_loss_2": 6.55702052116394,
"ce_loss_4": 6.119993329048157,
"ce_loss_9": 4.6535967826843265,
"epoch": 0.002,
"grad_norm": 1968.0,
"kl_loss_13": 1805.488510131836,
"kl_loss_2": 6092.596118164062,
"kl_loss_4": 5236.483740234375,
"kl_loss_9": 2427.8350463867187,
"learning_rate": 0.0002,
"loss": 3912.6133,
"step": 20
},
{
"ce_loss_13": 3.526861870288849,
"ce_loss_17": 3.1244616389274595,
"ce_loss_2": 5.735381245613098,
"ce_loss_4": 5.213837122917175,
"ce_loss_9": 3.934248983860016,
"epoch": 0.003,
"grad_norm": 1336.0,
"kl_loss_13": 756.9810455322265,
"kl_loss_2": 4979.42373046875,
"kl_loss_4": 3980.5005615234377,
"kl_loss_9": 1485.3927185058594,
"learning_rate": 0.0003,
"loss": 2754.8352,
"step": 30
},
{
"ce_loss_13": 3.5736655235290526,
"ce_loss_17": 3.290929138660431,
"ce_loss_2": 5.478212141990662,
"ce_loss_4": 4.976801490783691,
"ce_loss_9": 3.9243278384208677,
"epoch": 0.004,
"grad_norm": 1720.0,
"kl_loss_13": 516.0072662353516,
"kl_loss_2": 4206.617651367187,
"kl_loss_4": 3240.3652221679686,
"kl_loss_9": 1167.3858154296875,
"learning_rate": 0.0004,
"loss": 2292.9277,
"step": 40
},
{
"ce_loss_13": 3.492569625377655,
"ce_loss_17": 3.263450765609741,
"ce_loss_2": 5.301641321182251,
"ce_loss_4": 4.762786316871643,
"ce_loss_9": 3.813670790195465,
"epoch": 0.005,
"grad_norm": 1336.0,
"kl_loss_13": 408.9797164916992,
"kl_loss_2": 3949.739465332031,
"kl_loss_4": 2919.3654174804688,
"kl_loss_9": 1008.5517547607421,
"learning_rate": 0.0005,
"loss": 2071.7504,
"step": 50
},
{
"ce_loss_13": 3.468132793903351,
"ce_loss_17": 3.281778705120087,
"ce_loss_2": 5.174832701683044,
"ce_loss_4": 4.657959699630737,
"ce_loss_9": 3.7767446517944334,
"epoch": 0.006,
"grad_norm": 1264.0,
"kl_loss_13": 346.4393112182617,
"kl_loss_2": 3695.2262329101563,
"kl_loss_4": 2685.748815917969,
"kl_loss_9": 924.1326385498047,
"learning_rate": 0.0006,
"loss": 1916.6861,
"step": 60
},
{
"ce_loss_13": 3.3564661741256714,
"ce_loss_17": 3.1969870805740355,
"ce_loss_2": 5.081459856033325,
"ce_loss_4": 4.552425503730774,
"ce_loss_9": 3.6519309878349304,
"epoch": 0.007,
"grad_norm": 1640.0,
"kl_loss_13": 296.28175048828126,
"kl_loss_2": 3685.4558715820312,
"kl_loss_4": 2671.9430419921873,
"kl_loss_9": 849.3832427978516,
"learning_rate": 0.0007,
"loss": 1864.6594,
"step": 70
},
{
"ce_loss_13": 3.3460593104362486,
"ce_loss_17": 3.1964768290519716,
"ce_loss_2": 5.04540433883667,
"ce_loss_4": 4.464562845230103,
"ce_loss_9": 3.6478141069412233,
"epoch": 0.008,
"grad_norm": 2192.0,
"kl_loss_13": 271.24009094238284,
"kl_loss_2": 3629.17392578125,
"kl_loss_4": 2484.8527954101564,
"kl_loss_9": 836.3974060058594,
"learning_rate": 0.0008,
"loss": 1818.5662,
"step": 80
},
{
"ce_loss_13": 3.345805835723877,
"ce_loss_17": 3.1532609820365907,
"ce_loss_2": 5.009881067276001,
"ce_loss_4": 4.3814632296562195,
"ce_loss_9": 3.5915980696678163,
"epoch": 0.009,
"grad_norm": 1680.0,
"kl_loss_13": 358.3774284362793,
"kl_loss_2": 3638.965710449219,
"kl_loss_4": 2418.6386596679686,
"kl_loss_9": 819.6164855957031,
"learning_rate": 0.0009000000000000001,
"loss": 1801.5828,
"step": 90
},
{
"ce_loss_13": 3.487445020675659,
"ce_loss_17": 3.269311249256134,
"ce_loss_2": 5.072077345848084,
"ce_loss_4": 4.4792849779129025,
"ce_loss_9": 3.7141328930854796,
"epoch": 0.01,
"grad_norm": 2016.0,
"kl_loss_13": 425.99823303222655,
"kl_loss_2": 3543.0379638671875,
"kl_loss_4": 2380.678662109375,
"kl_loss_9": 826.6310424804688,
"learning_rate": 0.001,
"loss": 1796.9578,
"step": 100
},
{
"ce_loss_13": 3.42359539270401,
"ce_loss_17": 3.226667749881744,
"ce_loss_2": 5.0204795598983765,
"ce_loss_4": 4.459975528717041,
"ce_loss_9": 3.6708388924598694,
"epoch": 0.011,
"grad_norm": 1488.0,
"kl_loss_13": 365.5156448364258,
"kl_loss_2": 3526.2210327148437,
"kl_loss_4": 2427.8171630859374,
"kl_loss_9": 829.8687622070313,
"learning_rate": 0.0009999974825027757,
"loss": 1782.9496,
"step": 110
},
{
"ce_loss_13": 3.4499420881271363,
"ce_loss_17": 3.289404010772705,
"ce_loss_2": 5.006643867492675,
"ce_loss_4": 4.4474083423614506,
"ce_loss_9": 3.717779290676117,
"epoch": 0.012,
"grad_norm": 1424.0,
"kl_loss_13": 293.02129516601565,
"kl_loss_2": 3400.0647583007812,
"kl_loss_4": 2294.327313232422,
"kl_loss_9": 819.5202423095703,
"learning_rate": 0.0009999899300364532,
"loss": 1681.502,
"step": 120
},
{
"ce_loss_13": 3.4073472142219545,
"ce_loss_17": 3.2614750146865843,
"ce_loss_2": 4.989273881912231,
"ce_loss_4": 4.405853271484375,
"ce_loss_9": 3.69451265335083,
"epoch": 0.013,
"grad_norm": 1072.0,
"kl_loss_13": 266.0638145446777,
"kl_loss_2": 3416.4965698242186,
"kl_loss_4": 2282.6978149414062,
"kl_loss_9": 795.0189605712891,
"learning_rate": 0.0009999773426770863,
"loss": 1705.7545,
"step": 130
},
{
"ce_loss_13": 3.4283769369125365,
"ce_loss_17": 3.297203207015991,
"ce_loss_2": 4.972061061859131,
"ce_loss_4": 4.393543720245361,
"ce_loss_9": 3.7087190985679626,
"epoch": 0.014,
"grad_norm": 1192.0,
"kl_loss_13": 239.3195457458496,
"kl_loss_2": 3318.6891845703126,
"kl_loss_4": 2176.9557983398436,
"kl_loss_9": 783.5355072021484,
"learning_rate": 0.0009999597205514296,
"loss": 1645.6031,
"step": 140
},
{
"ce_loss_13": 3.377828299999237,
"ce_loss_17": 3.253422975540161,
"ce_loss_2": 4.94508650302887,
"ce_loss_4": 4.358719778060913,
"ce_loss_9": 3.6640469789505006,
"epoch": 0.015,
"grad_norm": 1056.0,
"kl_loss_13": 223.69755477905272,
"kl_loss_2": 3351.4960815429686,
"kl_loss_4": 2190.3794067382814,
"kl_loss_9": 776.6426544189453,
"learning_rate": 0.0009999370638369377,
"loss": 1640.4398,
"step": 150
},
{
"ce_loss_13": 3.405010259151459,
"ce_loss_17": 3.291413652896881,
"ce_loss_2": 4.949583244323731,
"ce_loss_4": 4.344371438026428,
"ce_loss_9": 3.69311705827713,
"epoch": 0.016,
"grad_norm": 1024.0,
"kl_loss_13": 205.55703506469726,
"kl_loss_2": 3293.4473510742187,
"kl_loss_4": 2095.886193847656,
"kl_loss_9": 763.1888641357422,
"learning_rate": 0.000999909372761763,
"loss": 1593.7709,
"step": 160
},
{
"ce_loss_13": 3.3376732587814333,
"ce_loss_17": 3.2281859874725343,
"ce_loss_2": 4.898326992988586,
"ce_loss_4": 4.281079268455505,
"ce_loss_9": 3.615650844573975,
"epoch": 0.017,
"grad_norm": 960.0,
"kl_loss_13": 205.50527114868163,
"kl_loss_2": 3335.3987670898437,
"kl_loss_4": 2112.4116760253905,
"kl_loss_9": 738.5396911621094,
"learning_rate": 0.0009998766476047546,
"loss": 1608.7034,
"step": 170
},
{
"ce_loss_13": 3.385698843002319,
"ce_loss_17": 3.2674369692802427,
"ce_loss_2": 4.935103106498718,
"ce_loss_4": 4.331924676895142,
"ce_loss_9": 3.6581133723258974,
"epoch": 0.018,
"grad_norm": 984.0,
"kl_loss_13": 214.01333236694336,
"kl_loss_2": 3320.2940185546877,
"kl_loss_4": 2132.7986267089846,
"kl_loss_9": 732.3475860595703,
"learning_rate": 0.0009998388886954545,
"loss": 1609.165,
"step": 180
},
{
"ce_loss_13": 3.344286847114563,
"ce_loss_17": 3.2329752802848817,
"ce_loss_2": 4.9058679103851315,
"ce_loss_4": 4.2970584511756895,
"ce_loss_9": 3.6102590441703795,
"epoch": 0.019,
"grad_norm": 1416.0,
"kl_loss_13": 207.53194122314454,
"kl_loss_2": 3344.6282348632812,
"kl_loss_4": 2104.0942565917967,
"kl_loss_9": 712.7711883544922,
"learning_rate": 0.0009997960964140947,
"loss": 1580.0558,
"step": 190
},
{
"ce_loss_13": 3.3503451228141783,
"ce_loss_17": 3.230080413818359,
"ce_loss_2": 4.908375334739685,
"ce_loss_4": 4.296644294261933,
"ce_loss_9": 3.5918402433395387,
"epoch": 0.02,
"grad_norm": 912.0,
"kl_loss_13": 222.13727722167968,
"kl_loss_2": 3336.8043701171873,
"kl_loss_4": 2133.3271484375,
"kl_loss_9": 681.4191864013671,
"learning_rate": 0.0009997482711915926,
"loss": 1573.9847,
"step": 200
},
{
"ce_loss_13": 3.3222963929176332,
"ce_loss_17": 3.204137551784515,
"ce_loss_2": 4.812633442878723,
"ce_loss_4": 4.211560595035553,
"ce_loss_9": 3.5349189162254335,
"epoch": 0.021,
"grad_norm": 816.0,
"kl_loss_13": 220.91544876098632,
"kl_loss_2": 3218.9809814453124,
"kl_loss_4": 2025.6634887695313,
"kl_loss_9": 634.4633239746094,
"learning_rate": 0.0009996954135095479,
"loss": 1521.7721,
"step": 210
},
{
"ce_loss_13": 3.4005984902381896,
"ce_loss_17": 3.280700922012329,
"ce_loss_2": 4.841261887550354,
"ce_loss_4": 4.240457427501679,
"ce_loss_9": 3.607716429233551,
"epoch": 0.022,
"grad_norm": 836.0,
"kl_loss_13": 223.7593849182129,
"kl_loss_2": 3111.2940673828125,
"kl_loss_4": 1915.070245361328,
"kl_loss_9": 620.2754699707032,
"learning_rate": 0.0009996375239002368,
"loss": 1465.0939,
"step": 220
},
{
"ce_loss_13": 3.463943064212799,
"ce_loss_17": 3.35058434009552,
"ce_loss_2": 4.868658900260925,
"ce_loss_4": 4.278674197196961,
"ce_loss_9": 3.6834152340888977,
"epoch": 0.023,
"grad_norm": 1008.0,
"kl_loss_13": 208.8836555480957,
"kl_loss_2": 3045.0566284179686,
"kl_loss_4": 1886.1902648925782,
"kl_loss_9": 636.5732116699219,
"learning_rate": 0.0009995746029466072,
"loss": 1450.0878,
"step": 230
},
{
"ce_loss_13": 3.252587044239044,
"ce_loss_17": 3.1401400566101074,
"ce_loss_2": 4.778895163536072,
"ce_loss_4": 4.140607786178589,
"ce_loss_9": 3.4961634039878846,
"epoch": 0.024,
"grad_norm": 1072.0,
"kl_loss_13": 202.95092086791993,
"kl_loss_2": 3285.569482421875,
"kl_loss_4": 2015.8346618652345,
"kl_loss_9": 684.3923858642578,
"learning_rate": 0.0009995066512822719,
"loss": 1495.9869,
"step": 240
},
{
"ce_loss_13": 3.339576780796051,
"ce_loss_17": 3.2416956186294557,
"ce_loss_2": 4.896433734893799,
"ce_loss_4": 4.256370925903321,
"ce_loss_9": 3.572561264038086,
"epoch": 0.025,
"grad_norm": 880.0,
"kl_loss_13": 188.93093719482422,
"kl_loss_2": 3313.684326171875,
"kl_loss_4": 2032.2442749023437,
"kl_loss_9": 634.844645690918,
"learning_rate": 0.000999433669591504,
"loss": 1474.7885,
"step": 250
},
{
"ce_loss_13": 3.2495914578437803,
"ce_loss_17": 3.148615872859955,
"ce_loss_2": 4.763438987731933,
"ce_loss_4": 4.118508851528167,
"ce_loss_9": 3.4872807264328003,
"epoch": 0.026,
"grad_norm": 932.0,
"kl_loss_13": 180.5644905090332,
"kl_loss_2": 3247.513635253906,
"kl_loss_4": 1957.0520812988282,
"kl_loss_9": 634.1553527832032,
"learning_rate": 0.000999355658609228,
"loss": 1475.6226,
"step": 260
},
{
"ce_loss_13": 3.278519403934479,
"ce_loss_17": 3.1758210182189943,
"ce_loss_2": 4.820499777793884,
"ce_loss_4": 4.18280280828476,
"ce_loss_9": 3.5275792717933654,
"epoch": 0.027,
"grad_norm": 740.0,
"kl_loss_13": 180.34210205078125,
"kl_loss_2": 3261.2527465820312,
"kl_loss_4": 1987.8143920898438,
"kl_loss_9": 652.8159881591797,
"learning_rate": 0.0009992726191210138,
"loss": 1498.7066,
"step": 270
},
{
"ce_loss_13": 3.3170214653015138,
"ce_loss_17": 3.2177631139755247,
"ce_loss_2": 4.788047671318054,
"ce_loss_4": 4.174385607242584,
"ce_loss_9": 3.556040048599243,
"epoch": 0.028,
"grad_norm": 716.0,
"kl_loss_13": 179.22331695556642,
"kl_loss_2": 3124.784069824219,
"kl_loss_4": 1924.5135559082032,
"kl_loss_9": 634.743994140625,
"learning_rate": 0.0009991845519630679,
"loss": 1450.6994,
"step": 280
},
{
"ce_loss_13": 3.197556221485138,
"ce_loss_17": 3.1011061906814574,
"ce_loss_2": 4.678141450881958,
"ce_loss_4": 4.070699083805084,
"ce_loss_9": 3.4350595355033873,
"epoch": 0.029,
"grad_norm": 752.0,
"kl_loss_13": 174.9555892944336,
"kl_loss_2": 3147.2247314453125,
"kl_loss_4": 1939.0474731445313,
"kl_loss_9": 625.6185363769531,
"learning_rate": 0.0009990914580222257,
"loss": 1464.1248,
"step": 290
},
{
"ce_loss_13": 3.335582661628723,
"ce_loss_17": 3.245238924026489,
"ce_loss_2": 4.727395129203797,
"ce_loss_4": 4.135890567302704,
"ce_loss_9": 3.5657243847846987,
"epoch": 0.03,
"grad_norm": 856.0,
"kl_loss_13": 169.86073608398436,
"kl_loss_2": 3007.0453857421876,
"kl_loss_4": 1823.50703125,
"kl_loss_9": 620.2317199707031,
"learning_rate": 0.0009989933382359422,
"loss": 1438.8438,
"step": 300
},
{
"ce_loss_13": 3.344461226463318,
"ce_loss_17": 3.253080868721008,
"ce_loss_2": 4.734828901290894,
"ce_loss_4": 4.140229022502899,
"ce_loss_9": 3.570216417312622,
"epoch": 0.031,
"grad_norm": 868.0,
"kl_loss_13": 170.47278060913087,
"kl_loss_2": 2986.3127563476564,
"kl_loss_4": 1806.9783508300782,
"kl_loss_9": 611.4689514160157,
"learning_rate": 0.0009988901935922825,
"loss": 1407.2166,
"step": 310
},
{
"ce_loss_13": 3.1942640423774717,
"ce_loss_17": 3.098420965671539,
"ce_loss_2": 4.6632726192474365,
"ce_loss_4": 4.045630240440369,
"ce_loss_9": 3.4386914730072022,
"epoch": 0.032,
"grad_norm": 712.0,
"kl_loss_13": 176.81885147094727,
"kl_loss_2": 3156.0201416015625,
"kl_loss_4": 1910.4445190429688,
"kl_loss_9": 635.6860961914062,
"learning_rate": 0.0009987820251299122,
"loss": 1429.7891,
"step": 320
},
{
"ce_loss_13": 3.323316276073456,
"ce_loss_17": 3.2272282600402833,
"ce_loss_2": 4.688269329071045,
"ce_loss_4": 4.095155251026154,
"ce_loss_9": 3.5452579855918884,
"epoch": 0.033,
"grad_norm": 716.0,
"kl_loss_13": 168.06323318481446,
"kl_loss_2": 2952.4156982421873,
"kl_loss_4": 1773.4942504882813,
"kl_loss_9": 594.2050231933594,
"learning_rate": 0.0009986688339380862,
"loss": 1369.0148,
"step": 330
},
{
"ce_loss_13": 3.263345181941986,
"ce_loss_17": 3.1782546758651735,
"ce_loss_2": 4.621670579910278,
"ce_loss_4": 4.041196537017822,
"ce_loss_9": 3.4756152391433717,
"epoch": 0.034,
"grad_norm": 672.0,
"kl_loss_13": 158.30765762329102,
"kl_loss_2": 2897.716516113281,
"kl_loss_4": 1739.8407348632813,
"kl_loss_9": 566.780419921875,
"learning_rate": 0.0009985506211566387,
"loss": 1351.8492,
"step": 340
},
{
"ce_loss_13": 3.297460699081421,
"ce_loss_17": 3.2095221042633058,
"ce_loss_2": 4.629465413093567,
"ce_loss_4": 4.048249614238739,
"ce_loss_9": 3.4922284483909607,
"epoch": 0.035,
"grad_norm": 708.0,
"kl_loss_13": 154.92753982543945,
"kl_loss_2": 2867.41044921875,
"kl_loss_4": 1712.3839904785157,
"kl_loss_9": 556.999772644043,
"learning_rate": 0.0009984273879759713,
"loss": 1332.5782,
"step": 350
},
{
"ce_loss_13": 3.3271041035652162,
"ce_loss_17": 3.237209177017212,
"ce_loss_2": 4.693679356575013,
"ce_loss_4": 4.1152663230896,
"ce_loss_9": 3.5451638102531433,
"epoch": 0.036,
"grad_norm": 688.0,
"kl_loss_13": 161.71177139282227,
"kl_loss_2": 2938.1607299804687,
"kl_loss_4": 1775.6144470214845,
"kl_loss_9": 578.8257720947265,
"learning_rate": 0.0009982991356370402,
"loss": 1381.892,
"step": 360
},
{
"ce_loss_13": 3.3030256986618043,
"ce_loss_17": 3.216581439971924,
"ce_loss_2": 4.672011542320251,
"ce_loss_4": 4.09724098443985,
"ce_loss_9": 3.513698399066925,
"epoch": 0.037,
"grad_norm": 800.0,
"kl_loss_13": 158.75547256469727,
"kl_loss_2": 2935.6953857421877,
"kl_loss_4": 1770.2281066894532,
"kl_loss_9": 569.4476699829102,
"learning_rate": 0.0009981658654313456,
"loss": 1366.0152,
"step": 370
},
{
"ce_loss_13": 3.377531623840332,
"ce_loss_17": 3.2923927903175354,
"ce_loss_2": 4.704947686195373,
"ce_loss_4": 4.1200734853744505,
"ce_loss_9": 3.582323205471039,
"epoch": 0.038,
"grad_norm": 648.0,
"kl_loss_13": 156.85157165527343,
"kl_loss_2": 2856.6430053710938,
"kl_loss_4": 1691.1573059082032,
"kl_loss_9": 549.3141067504882,
"learning_rate": 0.000998027578700917,
"loss": 1336.6225,
"step": 380
},
{
"ce_loss_13": 3.3232869505882263,
"ce_loss_17": 3.2330244302749636,
"ce_loss_2": 4.671757459640503,
"ce_loss_4": 4.081779193878174,
"ce_loss_9": 3.530698537826538,
"epoch": 0.039,
"grad_norm": 716.0,
"kl_loss_13": 158.69569473266603,
"kl_loss_2": 2900.0949096679688,
"kl_loss_4": 1729.8722045898437,
"kl_loss_9": 555.8867797851562,
"learning_rate": 0.0009978842768382998,
"loss": 1345.1803,
"step": 390
},
{
"ce_loss_13": 3.3343419432640076,
"ce_loss_17": 3.2496871948242188,
"ce_loss_2": 4.6395210981369015,
"ce_loss_4": 4.081280362606049,
"ce_loss_9": 3.536642611026764,
"epoch": 0.04,
"grad_norm": 804.0,
"kl_loss_13": 152.12560119628907,
"kl_loss_2": 2803.1874267578123,
"kl_loss_4": 1715.3488342285157,
"kl_loss_9": 557.2631088256836,
"learning_rate": 0.0009977359612865424,
"loss": 1322.9314,
"step": 400
},
{
"ce_loss_13": 3.3324403047561644,
"ce_loss_17": 3.2530759453773497,
"ce_loss_2": 4.658740139007568,
"ce_loss_4": 4.140996336936951,
"ce_loss_9": 3.5597803592681885,
"epoch": 0.041,
"grad_norm": 628.0,
"kl_loss_13": 150.96018981933594,
"kl_loss_2": 2836.8416381835937,
"kl_loss_4": 1791.5275756835938,
"kl_loss_9": 580.1767044067383,
"learning_rate": 0.0009975826335391806,
"loss": 1326.1738,
"step": 410
},
{
"ce_loss_13": 3.3575534582138062,
"ce_loss_17": 3.2765774965286254,
"ce_loss_2": 4.641478061676025,
"ce_loss_4": 4.087486684322357,
"ce_loss_9": 3.5588002443313598,
"epoch": 0.042,
"grad_norm": 648.0,
"kl_loss_13": 153.96022338867186,
"kl_loss_2": 2786.4625732421873,
"kl_loss_4": 1680.8336120605468,
"kl_loss_9": 544.7720199584961,
"learning_rate": 0.0009974242951402235,
"loss": 1299.9223,
"step": 420
},
{
"ce_loss_13": 3.369288170337677,
"ce_loss_17": 3.2808522462844847,
"ce_loss_2": 4.661474442481994,
"ce_loss_4": 4.091749536991119,
"ce_loss_9": 3.5591038703918456,
"epoch": 0.043,
"grad_norm": 648.0,
"kl_loss_13": 159.96430053710938,
"kl_loss_2": 2818.1042724609374,
"kl_loss_4": 1665.9103454589845,
"kl_loss_9": 548.6190933227539,
"learning_rate": 0.0009972609476841367,
"loss": 1282.6076,
"step": 430
},
{
"ce_loss_13": 3.2764453291893005,
"ce_loss_17": 3.1939987540245056,
"ce_loss_2": 4.619550776481629,
"ce_loss_4": 4.023915350437164,
"ce_loss_9": 3.48334219455719,
"epoch": 0.044,
"grad_norm": 592.0,
"kl_loss_13": 153.44165267944337,
"kl_loss_2": 2867.0172119140625,
"kl_loss_4": 1682.9174743652343,
"kl_loss_9": 539.6375167846679,
"learning_rate": 0.0009970925928158272,
"loss": 1309.0494,
"step": 440
},
{
"ce_loss_13": 3.228921616077423,
"ce_loss_17": 3.13667711019516,
"ce_loss_2": 4.5878925085067745,
"ce_loss_4": 3.9889357686042786,
"ce_loss_9": 3.4261950492858886,
"epoch": 0.045,
"grad_norm": 664.0,
"kl_loss_13": 168.95373001098633,
"kl_loss_2": 2954.3875366210937,
"kl_loss_4": 1740.5748657226563,
"kl_loss_9": 553.5774597167969,
"learning_rate": 0.000996919232230627,
"loss": 1337.5051,
"step": 450
},
{
"ce_loss_13": 3.3128785133361816,
"ce_loss_17": 3.225386643409729,
"ce_loss_2": 4.60572247505188,
"ce_loss_4": 4.035604953765869,
"ce_loss_9": 3.499460959434509,
"epoch": 0.046,
"grad_norm": 668.0,
"kl_loss_13": 157.04938583374025,
"kl_loss_2": 2803.0527587890624,
"kl_loss_4": 1673.0932739257812,
"kl_loss_9": 525.0963729858398,
"learning_rate": 0.0009967408676742752,
"loss": 1261.277,
"step": 460
},
{
"ce_loss_13": 3.445576179027557,
"ce_loss_17": 3.3635016322135924,
"ce_loss_2": 4.702920699119568,
"ce_loss_4": 4.1346688032150265,
"ce_loss_9": 3.6396512627601623,
"epoch": 0.047,
"grad_norm": 692.0,
"kl_loss_13": 152.30962295532225,
"kl_loss_2": 2728.0104858398436,
"kl_loss_4": 1598.9078186035156,
"kl_loss_9": 538.0588653564453,
"learning_rate": 0.0009965575009429006,
"loss": 1291.6952,
"step": 470
},
{
"ce_loss_13": 3.2310086250305177,
"ce_loss_17": 3.14866179227829,
"ce_loss_2": 4.563737893104554,
"ce_loss_4": 3.956212747097015,
"ce_loss_9": 3.436033844947815,
"epoch": 0.048,
"grad_norm": 632.0,
"kl_loss_13": 146.651318359375,
"kl_loss_2": 2878.5802001953125,
"kl_loss_4": 1666.7166137695312,
"kl_loss_9": 539.7155288696289,
"learning_rate": 0.0009963691338830043,
"loss": 1287.7481,
"step": 480
},
{
"ce_loss_13": 3.3250165462493895,
"ce_loss_17": 3.2457290410995485,
"ce_loss_2": 4.615313935279846,
"ce_loss_4": 4.033603394031525,
"ce_loss_9": 3.5086933016777038,
"epoch": 0.049,
"grad_norm": 720.0,
"kl_loss_13": 143.4732437133789,
"kl_loss_2": 2802.4427124023437,
"kl_loss_4": 1628.5113647460937,
"kl_loss_9": 516.1418380737305,
"learning_rate": 0.0009961757683914405,
"loss": 1255.4686,
"step": 490
},
{
"ce_loss_13": 3.3277801632881165,
"ce_loss_17": 3.235699439048767,
"ce_loss_2": 4.563870191574097,
"ce_loss_4": 4.019959104061127,
"ce_loss_9": 3.5216776847839357,
"epoch": 0.05,
"grad_norm": 708.0,
"kl_loss_13": 167.92219467163085,
"kl_loss_2": 2734.481298828125,
"kl_loss_4": 1638.4034912109375,
"kl_loss_9": 555.2191772460938,
"learning_rate": 0.0009959774064153978,
"loss": 1278.8344,
"step": 500
},
{
"ce_loss_13": 3.3241262197494508,
"ce_loss_17": 3.2492061614990235,
"ce_loss_2": 4.542808580398559,
"ce_loss_4": 3.9918776512145997,
"ce_loss_9": 3.5068222641944886,
"epoch": 0.051,
"grad_norm": 624.0,
"kl_loss_13": 140.5283073425293,
"kl_loss_2": 2661.6624389648437,
"kl_loss_4": 1559.182373046875,
"kl_loss_9": 509.0259124755859,
"learning_rate": 0.0009957740499523787,
"loss": 1242.5957,
"step": 510
},
{
"ce_loss_13": 3.347599446773529,
"ce_loss_17": 3.2621068120002747,
"ce_loss_2": 4.5844261169433596,
"ce_loss_4": 4.026308608055115,
"ce_loss_9": 3.5309693813323975,
"epoch": 0.052,
"grad_norm": 656.0,
"kl_loss_13": 155.69999313354492,
"kl_loss_2": 2678.771203613281,
"kl_loss_4": 1568.6827453613282,
"kl_loss_9": 516.6230758666992,
"learning_rate": 0.0009955657010501807,
"loss": 1238.2941,
"step": 520
},
{
"ce_loss_13": 3.31034996509552,
"ce_loss_17": 3.2209251523017883,
"ce_loss_2": 4.5603924751281735,
"ce_loss_4": 4.001909756660462,
"ce_loss_9": 3.494111442565918,
"epoch": 0.053,
"grad_norm": 828.0,
"kl_loss_13": 159.72130584716797,
"kl_loss_2": 2732.4492431640624,
"kl_loss_4": 1603.6663513183594,
"kl_loss_9": 522.7829132080078,
"learning_rate": 0.000995352361806875,
"loss": 1243.36,
"step": 530
},
{
"ce_loss_13": 3.349541389942169,
"ce_loss_17": 3.259581971168518,
"ce_loss_2": 4.599472379684448,
"ce_loss_4": 4.029084694385529,
"ce_loss_9": 3.5391013860702514,
"epoch": 0.054,
"grad_norm": 660.0,
"kl_loss_13": 157.69116287231446,
"kl_loss_2": 2756.20224609375,
"kl_loss_4": 1600.1287353515625,
"kl_loss_9": 529.4489334106445,
"learning_rate": 0.0009951340343707852,
"loss": 1271.6059,
"step": 540
},
{
"ce_loss_13": 3.390316832065582,
"ce_loss_17": 3.311418342590332,
"ce_loss_2": 4.65280933380127,
"ce_loss_4": 4.078594005107879,
"ce_loss_9": 3.573176646232605,
"epoch": 0.055,
"grad_norm": 628.0,
"kl_loss_13": 147.9464485168457,
"kl_loss_2": 2711.2746459960936,
"kl_loss_4": 1571.1894104003907,
"kl_loss_9": 499.5439422607422,
"learning_rate": 0.0009949107209404665,
"loss": 1244.4903,
"step": 550
},
{
"ce_loss_13": 3.3250513792037966,
"ce_loss_17": 3.2271708369255068,
"ce_loss_2": 4.5295474290847775,
"ce_loss_4": 3.989085817337036,
"ce_loss_9": 3.4934698700904847,
"epoch": 0.056,
"grad_norm": 664.0,
"kl_loss_13": 166.64837646484375,
"kl_loss_2": 2681.1214965820313,
"kl_loss_4": 1583.4888671875,
"kl_loss_9": 496.79695892333984,
"learning_rate": 0.0009946824237646824,
"loss": 1236.8229,
"step": 560
},
{
"ce_loss_13": 3.267991077899933,
"ce_loss_17": 3.179435741901398,
"ce_loss_2": 4.517114925384521,
"ce_loss_4": 3.9495364665985107,
"ce_loss_9": 3.4468095779418944,
"epoch": 0.057,
"grad_norm": 596.0,
"kl_loss_13": 168.04478225708007,
"kl_loss_2": 2752.01181640625,
"kl_loss_4": 1609.467059326172,
"kl_loss_9": 518.3810485839844,
"learning_rate": 0.0009944491451423828,
"loss": 1277.3712,
"step": 570
},
{
"ce_loss_13": 3.2650999069213866,
"ce_loss_17": 3.1744595170021057,
"ce_loss_2": 4.5301364183425905,
"ce_loss_4": 3.957633888721466,
"ce_loss_9": 3.445773887634277,
"epoch": 0.058,
"grad_norm": 640.0,
"kl_loss_13": 162.0532470703125,
"kl_loss_2": 2773.3030639648437,
"kl_loss_4": 1614.8263305664063,
"kl_loss_9": 521.4058013916016,
"learning_rate": 0.0009942108874226813,
"loss": 1244.9752,
"step": 580
},
{
"ce_loss_13": 3.3783419728279114,
"ce_loss_17": 3.290017545223236,
"ce_loss_2": 4.564514803886413,
"ce_loss_4": 4.015147042274475,
"ce_loss_9": 3.547626256942749,
"epoch": 0.059,
"grad_norm": 676.0,
"kl_loss_13": 158.40567779541016,
"kl_loss_2": 2587.4620849609373,
"kl_loss_4": 1504.674383544922,
"kl_loss_9": 500.83845977783204,
"learning_rate": 0.00099396765300483,
"loss": 1187.66,
"step": 590
},
{
"ce_loss_13": 3.35656076669693,
"ce_loss_17": 3.2701781749725343,
"ce_loss_2": 4.545877623558044,
"ce_loss_4": 4.003049039840699,
"ce_loss_9": 3.5325764417648315,
"epoch": 0.06,
"grad_norm": 712.0,
"kl_loss_13": 157.77887191772462,
"kl_loss_2": 2607.5798461914064,
"kl_loss_4": 1525.541796875,
"kl_loss_9": 505.99817657470703,
"learning_rate": 0.0009937194443381972,
"loss": 1206.6717,
"step": 600
},
{
"ce_loss_13": 3.3773241877555846,
"ce_loss_17": 3.2941529393196105,
"ce_loss_2": 4.549521040916443,
"ce_loss_4": 4.002827751636505,
"ce_loss_9": 3.5425670981407165,
"epoch": 0.061,
"grad_norm": 600.0,
"kl_loss_13": 156.81272048950194,
"kl_loss_2": 2586.9384765625,
"kl_loss_4": 1490.7980834960938,
"kl_loss_9": 492.3127700805664,
"learning_rate": 0.0009934662639222412,
"loss": 1210.4365,
"step": 610
},
{
"ce_loss_13": 3.325599718093872,
"ce_loss_17": 3.246423053741455,
"ce_loss_2": 4.564360666275024,
"ce_loss_4": 3.9915852546691895,
"ce_loss_9": 3.5050143718719484,
"epoch": 0.062,
"grad_norm": 684.0,
"kl_loss_13": 145.37355117797853,
"kl_loss_2": 2708.588037109375,
"kl_loss_4": 1551.0524780273438,
"kl_loss_9": 509.78137054443357,
"learning_rate": 0.000993208114306486,
"loss": 1216.7808,
"step": 620
},
{
"ce_loss_13": 3.248555541038513,
"ce_loss_17": 3.1725520133972167,
"ce_loss_2": 4.505973243713379,
"ce_loss_4": 3.927742075920105,
"ce_loss_9": 3.437049090862274,
"epoch": 0.063,
"grad_norm": 664.0,
"kl_loss_13": 141.67784690856934,
"kl_loss_2": 2725.5945434570312,
"kl_loss_4": 1559.2127990722656,
"kl_loss_9": 512.6172515869141,
"learning_rate": 0.0009929449980904952,
"loss": 1202.8824,
"step": 630
},
{
"ce_loss_13": 3.3062828302383425,
"ce_loss_17": 3.232676351070404,
"ce_loss_2": 4.524214553833008,
"ce_loss_4": 3.968895471096039,
"ce_loss_9": 3.490144872665405,
"epoch": 0.064,
"grad_norm": 568.0,
"kl_loss_13": 135.26087493896483,
"kl_loss_2": 2661.7224609375,
"kl_loss_4": 1541.601055908203,
"kl_loss_9": 506.95104217529297,
"learning_rate": 0.0009926769179238466,
"loss": 1202.9591,
"step": 640
},
{
"ce_loss_13": 3.346583092212677,
"ce_loss_17": 3.2691202402114867,
"ce_loss_2": 4.551297330856324,
"ce_loss_4": 4.013254928588867,
"ce_loss_9": 3.567150342464447,
"epoch": 0.065,
"grad_norm": 884.0,
"kl_loss_13": 140.0088653564453,
"kl_loss_2": 2635.9703979492188,
"kl_loss_4": 1548.76943359375,
"kl_loss_9": 584.322395324707,
"learning_rate": 0.000992403876506104,
"loss": 1221.5454,
"step": 650
},
{
"ce_loss_13": 3.283110725879669,
"ce_loss_17": 3.20692378282547,
"ce_loss_2": 4.5054912805557255,
"ce_loss_4": 3.93732008934021,
"ce_loss_9": 3.500643289089203,
"epoch": 0.066,
"grad_norm": 612.0,
"kl_loss_13": 133.65259857177733,
"kl_loss_2": 2645.805798339844,
"kl_loss_4": 1509.3870361328125,
"kl_loss_9": 559.6844909667968,
"learning_rate": 0.0009921258765867918,
"loss": 1215.4954,
"step": 660
},
{
"ce_loss_13": 3.252826678752899,
"ce_loss_17": 3.1817604064941407,
"ce_loss_2": 4.496033573150635,
"ce_loss_4": 3.9209199070930483,
"ce_loss_9": 3.449157202243805,
"epoch": 0.067,
"grad_norm": 636.0,
"kl_loss_13": 128.94365310668945,
"kl_loss_2": 2730.5625854492187,
"kl_loss_4": 1557.8232788085938,
"kl_loss_9": 518.3787063598633,
"learning_rate": 0.0009918429209653662,
"loss": 1209.2983,
"step": 670
},
{
"ce_loss_13": 3.3021888971328734,
"ce_loss_17": 3.233319938182831,
"ce_loss_2": 4.521537590026855,
"ce_loss_4": 3.9559776186943054,
"ce_loss_9": 3.4880390763282776,
"epoch": 0.068,
"grad_norm": 628.0,
"kl_loss_13": 127.2779327392578,
"kl_loss_2": 2667.96015625,
"kl_loss_4": 1519.8798095703125,
"kl_loss_9": 498.82932434082034,
"learning_rate": 0.0009915550124911866,
"loss": 1176.0943,
"step": 680
},
{
"ce_loss_13": 3.315458345413208,
"ce_loss_17": 3.2440866470336913,
"ce_loss_2": 4.5074906826019285,
"ce_loss_4": 3.9628826498985292,
"ce_loss_9": 3.500523793697357,
"epoch": 0.069,
"grad_norm": 608.0,
"kl_loss_13": 127.5838581085205,
"kl_loss_2": 2602.7898559570312,
"kl_loss_4": 1498.2195983886718,
"kl_loss_9": 492.14330291748047,
"learning_rate": 0.0009912621540634887,
"loss": 1182.3338,
"step": 690
},
{
"ce_loss_13": 3.3509831070899962,
"ce_loss_17": 3.280188775062561,
"ce_loss_2": 4.51383101940155,
"ce_loss_4": 3.958842468261719,
"ce_loss_9": 3.520415389537811,
"epoch": 0.07,
"grad_norm": 644.0,
"kl_loss_13": 123.37131958007812,
"kl_loss_2": 2550.901086425781,
"kl_loss_4": 1436.4951538085938,
"kl_loss_9": 472.02699279785156,
"learning_rate": 0.0009909643486313534,
"loss": 1159.7512,
"step": 700
},
{
"ce_loss_13": 3.2315321445465086,
"ce_loss_17": 3.160666084289551,
"ce_loss_2": 4.456056094169616,
"ce_loss_4": 3.886583685874939,
"ce_loss_9": 3.4124111890792848,
"epoch": 0.071,
"grad_norm": 616.0,
"kl_loss_13": 127.64315071105958,
"kl_loss_2": 2668.676965332031,
"kl_loss_4": 1520.346856689453,
"kl_loss_9": 484.3673568725586,
"learning_rate": 0.000990661599193678,
"loss": 1218.8443,
"step": 710
},
{
"ce_loss_13": 3.353931748867035,
"ce_loss_17": 3.2803229689598083,
"ce_loss_2": 4.531432890892029,
"ce_loss_4": 4.003308618068695,
"ce_loss_9": 3.526760494709015,
"epoch": 0.072,
"grad_norm": 676.0,
"kl_loss_13": 131.8408176422119,
"kl_loss_2": 2572.8404541015625,
"kl_loss_4": 1497.0823486328125,
"kl_loss_9": 465.3206527709961,
"learning_rate": 0.0009903539087991462,
"loss": 1167.6391,
"step": 720
},
{
"ce_loss_13": 3.332451033592224,
"ce_loss_17": 3.2614585876464846,
"ce_loss_2": 4.516555666923523,
"ce_loss_4": 3.976523768901825,
"ce_loss_9": 3.5042252063751222,
"epoch": 0.073,
"grad_norm": 612.0,
"kl_loss_13": 127.52910423278809,
"kl_loss_2": 2588.143310546875,
"kl_loss_4": 1491.805487060547,
"kl_loss_9": 474.08206634521486,
"learning_rate": 0.0009900412805461966,
"loss": 1178.3563,
"step": 730
},
{
"ce_loss_13": 3.4028484225273132,
"ce_loss_17": 3.3355210065841674,
"ce_loss_2": 4.5544538497924805,
"ce_loss_4": 4.019611120223999,
"ce_loss_9": 3.577867567539215,
"epoch": 0.074,
"grad_norm": 712.0,
"kl_loss_13": 126.20948028564453,
"kl_loss_2": 2530.4266357421875,
"kl_loss_4": 1446.06923828125,
"kl_loss_9": 465.1686264038086,
"learning_rate": 0.0009897237175829927,
"loss": 1162.8408,
"step": 740
},
{
"ce_loss_13": 3.3002208709716796,
"ce_loss_17": 3.2221954345703123,
"ce_loss_2": 4.4848315715789795,
"ce_loss_4": 3.9426618337631227,
"ce_loss_9": 3.4645259857177733,
"epoch": 0.075,
"grad_norm": 636.0,
"kl_loss_13": 136.19006538391113,
"kl_loss_2": 2616.3106323242187,
"kl_loss_4": 1516.6720825195312,
"kl_loss_9": 474.8717376708984,
"learning_rate": 0.0009894012231073895,
"loss": 1176.7424,
"step": 750
},
{
"ce_loss_13": 3.3391860246658327,
"ce_loss_17": 3.2671860218048097,
"ce_loss_2": 4.515418291091919,
"ce_loss_4": 3.9612865805625916,
"ce_loss_9": 3.5085081100463866,
"epoch": 0.076,
"grad_norm": 604.0,
"kl_loss_13": 137.293404006958,
"kl_loss_2": 2573.784558105469,
"kl_loss_4": 1466.8589294433593,
"kl_loss_9": 468.3266311645508,
"learning_rate": 0.0009890738003669028,
"loss": 1172.3907,
"step": 760
},
{
"ce_loss_13": 3.312580978870392,
"ce_loss_17": 3.2395971775054933,
"ce_loss_2": 4.523102068901062,
"ce_loss_4": 3.9617782711982725,
"ce_loss_9": 3.4849427342414856,
"epoch": 0.077,
"grad_norm": 600.0,
"kl_loss_13": 135.65264129638672,
"kl_loss_2": 2657.336071777344,
"kl_loss_4": 1531.0789855957032,
"kl_loss_9": 475.88781585693357,
"learning_rate": 0.0009887414526586764,
"loss": 1164.7951,
"step": 770
},
{
"ce_loss_13": 3.363868010044098,
"ce_loss_17": 3.2938335061073305,
"ce_loss_2": 4.5332248449325565,
"ce_loss_4": 3.9797385334968567,
"ce_loss_9": 3.52528201341629,
"epoch": 0.078,
"grad_norm": 624.0,
"kl_loss_13": 123.87691078186035,
"kl_loss_2": 2563.1283447265623,
"kl_loss_4": 1439.5722351074219,
"kl_loss_9": 451.4059295654297,
"learning_rate": 0.0009884041833294476,
"loss": 1131.074,
"step": 780
},
{
"ce_loss_13": 3.366063630580902,
"ce_loss_17": 3.2975459814071657,
"ce_loss_2": 4.51762444972992,
"ce_loss_4": 3.9779760360717775,
"ce_loss_9": 3.530991554260254,
"epoch": 0.079,
"grad_norm": 672.0,
"kl_loss_13": 121.06926002502442,
"kl_loss_2": 2539.3625732421874,
"kl_loss_4": 1439.5869018554688,
"kl_loss_9": 458.55997467041016,
"learning_rate": 0.000988061995775515,
"loss": 1162.3846,
"step": 790
},
{
"ce_loss_13": 3.29613002538681,
"ce_loss_17": 3.2262673020362853,
"ce_loss_2": 4.4409942626953125,
"ce_loss_4": 3.9230281710624695,
"ce_loss_9": 3.4621812462806703,
"epoch": 0.08,
"grad_norm": 596.0,
"kl_loss_13": 126.79234085083007,
"kl_loss_2": 2531.2521362304688,
"kl_loss_4": 1482.3507019042968,
"kl_loss_9": 465.85094757080077,
"learning_rate": 0.0009877148934427035,
"loss": 1146.9053,
"step": 800
},
{
"ce_loss_13": 3.3456916570663453,
"ce_loss_17": 3.2675469279289246,
"ce_loss_2": 4.4991215467453,
"ce_loss_4": 3.941481518745422,
"ce_loss_9": 3.500859725475311,
"epoch": 0.081,
"grad_norm": 644.0,
"kl_loss_13": 133.1513931274414,
"kl_loss_2": 2553.1867919921874,
"kl_loss_4": 1434.2838317871094,
"kl_loss_9": 449.13096771240237,
"learning_rate": 0.0009873628798263297,
"loss": 1131.1455,
"step": 810
},
{
"ce_loss_13": 3.294445109367371,
"ce_loss_17": 3.223985254764557,
"ce_loss_2": 4.428789710998535,
"ce_loss_4": 3.88575097322464,
"ce_loss_9": 3.4480647444725037,
"epoch": 0.082,
"grad_norm": 576.0,
"kl_loss_13": 131.85743446350097,
"kl_loss_2": 2488.2187744140624,
"kl_loss_4": 1409.9474609375,
"kl_loss_9": 438.8127838134766,
"learning_rate": 0.0009870059584711668,
"loss": 1150.1537,
"step": 820
},
{
"ce_loss_13": 3.3084290266036986,
"ce_loss_17": 3.2389563798904417,
"ce_loss_2": 4.449469542503357,
"ce_loss_4": 3.916575002670288,
"ce_loss_9": 3.4716209053993223,
"epoch": 0.083,
"grad_norm": 628.0,
"kl_loss_13": 122.97907829284668,
"kl_loss_2": 2519.50517578125,
"kl_loss_4": 1433.108642578125,
"kl_loss_9": 449.61902923583983,
"learning_rate": 0.000986644132971409,
"loss": 1128.2717,
"step": 830
},
{
"ce_loss_13": 3.295887219905853,
"ce_loss_17": 3.2251924514770507,
"ce_loss_2": 4.464026820659638,
"ce_loss_4": 3.9351455330848695,
"ce_loss_9": 3.4617645859718325,
"epoch": 0.084,
"grad_norm": 616.0,
"kl_loss_13": 126.87683448791503,
"kl_loss_2": 2557.2621459960938,
"kl_loss_4": 1477.8490234375,
"kl_loss_9": 461.0807281494141,
"learning_rate": 0.0009862774069706345,
"loss": 1142.0318,
"step": 840
},
{
"ce_loss_13": 3.4207282423973084,
"ce_loss_17": 3.355008363723755,
"ce_loss_2": 4.518818354606628,
"ce_loss_4": 4.004429590702057,
"ce_loss_9": 3.5739036083221434,
"epoch": 0.085,
"grad_norm": 704.0,
"kl_loss_13": 124.23913230895997,
"kl_loss_2": 2454.3523193359374,
"kl_loss_4": 1417.0260314941406,
"kl_loss_9": 446.0335723876953,
"learning_rate": 0.000985905784161771,
"loss": 1124.835,
"step": 850
},
{
"ce_loss_13": 3.347672176361084,
"ce_loss_17": 3.281836974620819,
"ce_loss_2": 4.472033071517944,
"ce_loss_4": 3.9360882163047792,
"ce_loss_9": 3.499380612373352,
"epoch": 0.086,
"grad_norm": 644.0,
"kl_loss_13": 118.70066299438477,
"kl_loss_2": 2478.5536499023438,
"kl_loss_4": 1409.6584350585938,
"kl_loss_9": 433.6291168212891,
"learning_rate": 0.000985529268287055,
"loss": 1107.5163,
"step": 860
},
{
"ce_loss_13": 3.276761054992676,
"ce_loss_17": 3.2099615335464478,
"ce_loss_2": 4.449923253059387,
"ce_loss_4": 3.8927905440330504,
"ce_loss_9": 3.438116121292114,
"epoch": 0.087,
"grad_norm": 672.0,
"kl_loss_13": 119.32656173706054,
"kl_loss_2": 2563.2444702148437,
"kl_loss_4": 1447.4735412597656,
"kl_loss_9": 440.7087692260742,
"learning_rate": 0.0009851478631379982,
"loss": 1138.2113,
"step": 870
},
{
"ce_loss_13": 3.338916289806366,
"ce_loss_17": 3.2712655782699587,
"ce_loss_2": 4.4808837890625,
"ce_loss_4": 3.9395508766174316,
"ce_loss_9": 3.500311553478241,
"epoch": 0.088,
"grad_norm": 588.0,
"kl_loss_13": 117.52669868469238,
"kl_loss_2": 2510.5021240234373,
"kl_loss_4": 1403.144793701172,
"kl_loss_9": 436.85083923339846,
"learning_rate": 0.0009847615725553456,
"loss": 1117.7988,
"step": 880
},
{
"ce_loss_13": 3.391915798187256,
"ce_loss_17": 3.326561784744263,
"ce_loss_2": 4.47606725692749,
"ce_loss_4": 3.9601453065872194,
"ce_loss_9": 3.5397919535636904,
"epoch": 0.089,
"grad_norm": 620.0,
"kl_loss_13": 113.68915100097657,
"kl_loss_2": 2387.3920288085938,
"kl_loss_4": 1350.6962036132813,
"kl_loss_9": 414.06656951904296,
"learning_rate": 0.0009843704004290394,
"loss": 1110.0278,
"step": 890
},
{
"ce_loss_13": 3.298319697380066,
"ce_loss_17": 3.2356975436210633,
"ce_loss_2": 4.430491948127747,
"ce_loss_4": 3.9021120190620424,
"ce_loss_9": 3.455435812473297,
"epoch": 0.09,
"grad_norm": 652.0,
"kl_loss_13": 119.12698974609376,
"kl_loss_2": 2516.066857910156,
"kl_loss_4": 1438.2920654296875,
"kl_loss_9": 444.22851409912107,
"learning_rate": 0.0009839743506981783,
"loss": 1126.7336,
"step": 900
},
{
"ce_loss_13": 3.2220940709114076,
"ce_loss_17": 3.1536363482475283,
"ce_loss_2": 4.412917733192444,
"ce_loss_4": 3.8601535081863405,
"ce_loss_9": 3.38604793548584,
"epoch": 0.091,
"grad_norm": 644.0,
"kl_loss_13": 120.22522850036621,
"kl_loss_2": 2636.6438842773437,
"kl_loss_4": 1504.226806640625,
"kl_loss_9": 457.072492980957,
"learning_rate": 0.0009835734273509786,
"loss": 1149.7051,
"step": 910
},
{
"ce_loss_13": 3.3127182841300966,
"ce_loss_17": 3.246254229545593,
"ce_loss_2": 4.4611786842346195,
"ce_loss_4": 3.927472507953644,
"ce_loss_9": 3.4767918229103087,
"epoch": 0.092,
"grad_norm": 616.0,
"kl_loss_13": 116.29349250793457,
"kl_loss_2": 2488.740368652344,
"kl_loss_4": 1402.4096435546876,
"kl_loss_9": 434.8053451538086,
"learning_rate": 0.0009831676344247342,
"loss": 1119.2193,
"step": 920
},
{
"ce_loss_13": 3.3306243419647217,
"ce_loss_17": 3.269066500663757,
"ce_loss_2": 4.432948970794678,
"ce_loss_4": 3.9210447788238527,
"ce_loss_9": 3.4840231418609617,
"epoch": 0.093,
"grad_norm": 616.0,
"kl_loss_13": 113.15401573181153,
"kl_loss_2": 2442.289599609375,
"kl_loss_4": 1401.3232055664062,
"kl_loss_9": 428.3973327636719,
"learning_rate": 0.0009827569760057755,
"loss": 1116.0459,
"step": 930
},
{
"ce_loss_13": 3.252093029022217,
"ce_loss_17": 3.18474862575531,
"ce_loss_2": 4.465179300308227,
"ce_loss_4": 3.8863181352615355,
"ce_loss_9": 3.418609654903412,
"epoch": 0.094,
"grad_norm": 740.0,
"kl_loss_13": 118.65354614257812,
"kl_loss_2": 2651.7362548828123,
"kl_loss_4": 1487.0295043945312,
"kl_loss_9": 452.91857757568357,
"learning_rate": 0.000982341456229428,
"loss": 1138.2396,
"step": 940
},
{
"ce_loss_13": 3.34140704870224,
"ce_loss_17": 3.277027201652527,
"ce_loss_2": 4.492311930656433,
"ce_loss_4": 3.9498108267784118,
"ce_loss_9": 3.5043185353279114,
"epoch": 0.095,
"grad_norm": 588.0,
"kl_loss_13": 114.85740127563477,
"kl_loss_2": 2544.40927734375,
"kl_loss_4": 1438.683056640625,
"kl_loss_9": 444.57925872802736,
"learning_rate": 0.000981921079279971,
"loss": 1106.5577,
"step": 950
},
{
"ce_loss_13": 3.3622724056243896,
"ce_loss_17": 3.300556206703186,
"ce_loss_2": 4.42045681476593,
"ce_loss_4": 3.9144128561019897,
"ce_loss_9": 3.5092538595199585,
"epoch": 0.096,
"grad_norm": 608.0,
"kl_loss_13": 112.38622970581055,
"kl_loss_2": 2378.4241333007812,
"kl_loss_4": 1349.9152770996093,
"kl_loss_9": 425.52757568359374,
"learning_rate": 0.0009814958493905962,
"loss": 1082.7883,
"step": 960
},
{
"ce_loss_13": 3.314443755149841,
"ce_loss_17": 3.249697279930115,
"ce_loss_2": 4.458523607254028,
"ce_loss_4": 3.919697678089142,
"ce_loss_9": 3.47161465883255,
"epoch": 0.097,
"grad_norm": 640.0,
"kl_loss_13": 115.82365684509277,
"kl_loss_2": 2514.8928100585936,
"kl_loss_4": 1421.7318969726562,
"kl_loss_9": 442.53113708496096,
"learning_rate": 0.0009810657708433637,
"loss": 1140.2196,
"step": 970
},
{
"ce_loss_13": 3.394575309753418,
"ce_loss_17": 3.3281876921653746,
"ce_loss_2": 4.4481121301651,
"ce_loss_4": 3.9479944825172426,
"ce_loss_9": 3.539068377017975,
"epoch": 0.098,
"grad_norm": 600.0,
"kl_loss_13": 122.24889755249023,
"kl_loss_2": 2346.068981933594,
"kl_loss_4": 1342.0567199707032,
"kl_loss_9": 424.09859161376954,
"learning_rate": 0.0009806308479691594,
"loss": 1072.8215,
"step": 980
},
{
"ce_loss_13": 3.402330148220062,
"ce_loss_17": 3.331631302833557,
"ce_loss_2": 4.497003102302552,
"ce_loss_4": 3.9845388293266297,
"ce_loss_9": 3.5640967011451723,
"epoch": 0.099,
"grad_norm": 668.0,
"kl_loss_13": 126.92496185302734,
"kl_loss_2": 2429.3766479492188,
"kl_loss_4": 1386.66455078125,
"kl_loss_9": 448.76985321044924,
"learning_rate": 0.0009801910851476522,
"loss": 1093.9625,
"step": 990
},
{
"ce_loss_13": 3.318433976173401,
"ce_loss_17": 3.2502185463905335,
"ce_loss_2": 4.4634592771530155,
"ce_loss_4": 3.917101538181305,
"ce_loss_9": 3.477748930454254,
"epoch": 0.1,
"grad_norm": 628.0,
"kl_loss_13": 123.46217765808106,
"kl_loss_2": 2552.455029296875,
"kl_loss_4": 1429.8879516601562,
"kl_loss_9": 456.84090118408204,
"learning_rate": 0.0009797464868072487,
"loss": 1110.9061,
"step": 1000
},
{
"ce_loss_13": 3.3047823667526246,
"ce_loss_17": 3.238838028907776,
"ce_loss_2": 4.421839463710785,
"ce_loss_4": 3.9042311549186706,
"ce_loss_9": 3.4642874002456665,
"epoch": 0.101,
"grad_norm": 600.0,
"kl_loss_13": 120.86031990051269,
"kl_loss_2": 2483.5817749023436,
"kl_loss_4": 1423.0612670898438,
"kl_loss_9": 453.3130477905273,
"learning_rate": 0.0009792970574250492,
"loss": 1113.9136,
"step": 1010
},
{
"ce_loss_13": 3.3296589612960816,
"ce_loss_17": 3.264625906944275,
"ce_loss_2": 4.431443309783935,
"ce_loss_4": 3.9199029326438906,
"ce_loss_9": 3.49114305973053,
"epoch": 0.102,
"grad_norm": 628.0,
"kl_loss_13": 115.92987747192383,
"kl_loss_2": 2445.0037353515627,
"kl_loss_4": 1405.2209777832031,
"kl_loss_9": 437.9608520507812,
"learning_rate": 0.0009788428015268028,
"loss": 1084.3144,
"step": 1020
},
{
"ce_loss_13": 3.330202805995941,
"ce_loss_17": 3.264923906326294,
"ce_loss_2": 4.413852691650391,
"ce_loss_4": 3.8963095426559446,
"ce_loss_9": 3.476387345790863,
"epoch": 0.103,
"grad_norm": 668.0,
"kl_loss_13": 121.63944854736329,
"kl_loss_2": 2417.3347045898436,
"kl_loss_4": 1368.259210205078,
"kl_loss_9": 420.54846343994143,
"learning_rate": 0.0009783837236868609,
"loss": 1081.6119,
"step": 1030
},
{
"ce_loss_13": 3.2946935057640077,
"ce_loss_17": 3.2287474393844606,
"ce_loss_2": 4.396254158020019,
"ce_loss_4": 3.8797624588012694,
"ce_loss_9": 3.4512138962745667,
"epoch": 0.104,
"grad_norm": 616.0,
"kl_loss_13": 116.54061508178711,
"kl_loss_2": 2410.5272583007813,
"kl_loss_4": 1376.435107421875,
"kl_loss_9": 430.11146240234376,
"learning_rate": 0.0009779198285281327,
"loss": 1080.1969,
"step": 1040
},
{
"ce_loss_13": 3.285519337654114,
"ce_loss_17": 3.2236326575279235,
"ce_loss_2": 4.41151978969574,
"ce_loss_4": 3.881605124473572,
"ce_loss_9": 3.4398640751838685,
"epoch": 0.105,
"grad_norm": 608.0,
"kl_loss_13": 111.27123718261718,
"kl_loss_2": 2481.2762451171875,
"kl_loss_4": 1399.8256469726562,
"kl_loss_9": 421.35252685546874,
"learning_rate": 0.0009774511207220368,
"loss": 1095.8346,
"step": 1050
},
{
"ce_loss_13": 3.3345521807670595,
"ce_loss_17": 3.2681204080581665,
"ce_loss_2": 4.447642612457275,
"ce_loss_4": 3.9166941165924074,
"ce_loss_9": 3.4915797114372253,
"epoch": 0.106,
"grad_norm": 640.0,
"kl_loss_13": 113.97708396911621,
"kl_loss_2": 2457.1487548828127,
"kl_loss_4": 1378.8427001953125,
"kl_loss_9": 430.92195892333984,
"learning_rate": 0.0009769776049884564,
"loss": 1091.3941,
"step": 1060
},
{
"ce_loss_13": 3.2423794984817507,
"ce_loss_17": 3.1796736478805543,
"ce_loss_2": 4.388264894485474,
"ce_loss_4": 3.845630931854248,
"ce_loss_9": 3.4036087512969972,
"epoch": 0.107,
"grad_norm": 700.0,
"kl_loss_13": 112.91108932495118,
"kl_loss_2": 2523.034704589844,
"kl_loss_4": 1431.7101135253906,
"kl_loss_9": 439.0070556640625,
"learning_rate": 0.0009764992860956889,
"loss": 1128.451,
"step": 1070
},
{
"ce_loss_13": 3.393851613998413,
"ce_loss_17": 3.3325206995010377,
"ce_loss_2": 4.425472116470337,
"ce_loss_4": 3.9404685735702514,
"ce_loss_9": 3.532415735721588,
"epoch": 0.108,
"grad_norm": 592.0,
"kl_loss_13": 109.46063995361328,
"kl_loss_2": 2309.0443603515623,
"kl_loss_4": 1306.1042724609374,
"kl_loss_9": 404.70620880126955,
"learning_rate": 0.0009760161688604008,
"loss": 1054.0623,
"step": 1080
},
{
"ce_loss_13": 3.391258454322815,
"ce_loss_17": 3.329734480381012,
"ce_loss_2": 4.481240391731262,
"ce_loss_4": 3.973934280872345,
"ce_loss_9": 3.543959391117096,
"epoch": 0.109,
"grad_norm": 712.0,
"kl_loss_13": 110.02838973999023,
"kl_loss_2": 2395.6950866699217,
"kl_loss_4": 1360.7293579101563,
"kl_loss_9": 417.4671920776367,
"learning_rate": 0.0009755282581475768,
"loss": 1081.7303,
"step": 1090
},
{
"ce_loss_13": 3.4368482708930967,
"ce_loss_17": 3.3751877665519716,
"ce_loss_2": 4.501154685020447,
"ce_loss_4": 3.994403803348541,
"ce_loss_9": 3.5841227054595945,
"epoch": 0.11,
"grad_norm": 644.0,
"kl_loss_13": 116.24259529113769,
"kl_loss_2": 2357.111456298828,
"kl_loss_4": 1332.5497741699219,
"kl_loss_9": 419.2577301025391,
"learning_rate": 0.0009750355588704727,
"loss": 1050.8812,
"step": 1100
},
{
"ce_loss_13": 3.2831246733665465,
"ce_loss_17": 3.2204254388809206,
"ce_loss_2": 4.386692416667938,
"ce_loss_4": 3.8651437640190123,
"ce_loss_9": 3.4342461109161375,
"epoch": 0.111,
"grad_norm": 636.0,
"kl_loss_13": 112.50919799804687,
"kl_loss_2": 2414.4412109375,
"kl_loss_4": 1366.421612548828,
"kl_loss_9": 416.9934906005859,
"learning_rate": 0.0009745380759905647,
"loss": 1097.4557,
"step": 1110
},
{
"ce_loss_13": 3.2317030668258666,
"ce_loss_17": 3.1717841029167175,
"ce_loss_2": 4.353304851055145,
"ce_loss_4": 3.8201396465301514,
"ce_loss_9": 3.3812498927116392,
"epoch": 0.112,
"grad_norm": 668.0,
"kl_loss_13": 109.91761131286621,
"kl_loss_2": 2468.8907836914063,
"kl_loss_4": 1382.1518798828124,
"kl_loss_9": 416.75934143066405,
"learning_rate": 0.0009740358145174998,
"loss": 1106.3107,
"step": 1120
},
{
"ce_loss_13": 3.3809526681900026,
"ce_loss_17": 3.3203678369522094,
"ce_loss_2": 4.416502618789673,
"ce_loss_4": 3.9220649361610413,
"ce_loss_9": 3.5257248759269713,
"epoch": 0.113,
"grad_norm": 588.0,
"kl_loss_13": 108.38679389953613,
"kl_loss_2": 2327.7776733398437,
"kl_loss_4": 1309.3333618164063,
"kl_loss_9": 405.4641632080078,
"learning_rate": 0.0009735287795090455,
"loss": 1055.8825,
"step": 1130
},
{
"ce_loss_13": 3.275561845302582,
"ce_loss_17": 3.2169222950935366,
"ce_loss_2": 4.37807993888855,
"ce_loss_4": 3.8567439675331117,
"ce_loss_9": 3.4257020950317383,
"epoch": 0.114,
"grad_norm": 768.0,
"kl_loss_13": 110.35262985229492,
"kl_loss_2": 2425.5908203125,
"kl_loss_4": 1374.9910888671875,
"kl_loss_9": 407.8123291015625,
"learning_rate": 0.0009730169760710386,
"loss": 1073.9307,
"step": 1140
},
{
"ce_loss_13": 3.359781765937805,
"ce_loss_17": 3.2924877524375917,
"ce_loss_2": 4.439531350135804,
"ce_loss_4": 3.921218383312225,
"ce_loss_9": 3.499290108680725,
"epoch": 0.115,
"grad_norm": 676.0,
"kl_loss_13": 120.64373970031738,
"kl_loss_2": 2390.1439697265623,
"kl_loss_4": 1339.1471130371094,
"kl_loss_9": 405.00170745849607,
"learning_rate": 0.0009725004093573342,
"loss": 1071.8246,
"step": 1150
},
{
"ce_loss_13": 3.3038479328155517,
"ce_loss_17": 3.2363712787628174,
"ce_loss_2": 4.378937613964081,
"ce_loss_4": 3.8651722073554993,
"ce_loss_9": 3.457583713531494,
"epoch": 0.116,
"grad_norm": 792.0,
"kl_loss_13": 119.20105934143066,
"kl_loss_2": 2358.399365234375,
"kl_loss_4": 1332.2092834472655,
"kl_loss_9": 415.86485595703124,
"learning_rate": 0.0009719790845697534,
"loss": 1053.0682,
"step": 1160
},
{
"ce_loss_13": 3.25511212348938,
"ce_loss_17": 3.195575773715973,
"ce_loss_2": 4.293286681175232,
"ce_loss_4": 3.800669753551483,
"ce_loss_9": 3.396199142932892,
"epoch": 0.117,
"grad_norm": 748.0,
"kl_loss_13": 112.51908836364746,
"kl_loss_2": 2321.992822265625,
"kl_loss_4": 1310.780859375,
"kl_loss_9": 399.5624725341797,
"learning_rate": 0.0009714530069580309,
"loss": 1042.5032,
"step": 1170
},
{
"ce_loss_13": 3.349726128578186,
"ce_loss_17": 3.282489287853241,
"ce_loss_2": 4.418605291843415,
"ce_loss_4": 3.918584477901459,
"ce_loss_9": 3.4994783282279966,
"epoch": 0.118,
"grad_norm": 792.0,
"kl_loss_13": 124.80969619750977,
"kl_loss_2": 2370.46015625,
"kl_loss_4": 1353.2211547851562,
"kl_loss_9": 426.6817199707031,
"learning_rate": 0.0009709221818197624,
"loss": 1064.6255,
"step": 1180
},
{
"ce_loss_13": 3.392550361156464,
"ce_loss_17": 3.3208803057670595,
"ce_loss_2": 4.470854425430298,
"ce_loss_4": 3.958542823791504,
"ce_loss_9": 3.533000814914703,
"epoch": 0.119,
"grad_norm": 796.0,
"kl_loss_13": 131.01845664978026,
"kl_loss_2": 2407.263720703125,
"kl_loss_4": 1353.18388671875,
"kl_loss_9": 423.02639923095705,
"learning_rate": 0.0009703866145003512,
"loss": 1079.4176,
"step": 1190
},
{
"ce_loss_13": 3.3545722484588625,
"ce_loss_17": 3.2938650846481323,
"ce_loss_2": 4.412098550796509,
"ce_loss_4": 3.91774640083313,
"ce_loss_9": 3.509088408946991,
"epoch": 0.12,
"grad_norm": 780.0,
"kl_loss_13": 115.15438537597656,
"kl_loss_2": 2365.2002075195314,
"kl_loss_4": 1348.0827087402345,
"kl_loss_9": 422.36648712158205,
"learning_rate": 0.0009698463103929542,
"loss": 1080.3512,
"step": 1200
},
{
"ce_loss_13": 3.3183128476142882,
"ce_loss_17": 3.2566781520843504,
"ce_loss_2": 4.403877472877502,
"ce_loss_4": 3.8917479276657105,
"ce_loss_9": 3.478835713863373,
"epoch": 0.121,
"grad_norm": 580.0,
"kl_loss_13": 121.92639770507813,
"kl_loss_2": 2379.6752685546876,
"kl_loss_4": 1344.7351135253907,
"kl_loss_9": 425.500993347168,
"learning_rate": 0.0009693012749384279,
"loss": 1078.7273,
"step": 1210
},
{
"ce_loss_13": 3.3381242394447326,
"ce_loss_17": 3.2690067172050474,
"ce_loss_2": 4.387892174720764,
"ce_loss_4": 3.8845514178276064,
"ce_loss_9": 3.4794100522994995,
"epoch": 0.122,
"grad_norm": 712.0,
"kl_loss_13": 125.97309150695801,
"kl_loss_2": 2354.9815185546877,
"kl_loss_4": 1331.7310668945313,
"kl_loss_9": 414.407373046875,
"learning_rate": 0.0009687515136252732,
"loss": 1053.1852,
"step": 1220
},
{
"ce_loss_13": 3.291333258152008,
"ce_loss_17": 3.2255096673965453,
"ce_loss_2": 4.409121894836426,
"ce_loss_4": 3.860208344459534,
"ce_loss_9": 3.434011220932007,
"epoch": 0.123,
"grad_norm": 752.0,
"kl_loss_13": 118.75255470275879,
"kl_loss_2": 2480.691931152344,
"kl_loss_4": 1372.1910034179687,
"kl_loss_9": 420.0564437866211,
"learning_rate": 0.0009681970319895803,
"loss": 1096.9516,
"step": 1230
},
{
"ce_loss_13": 3.367513132095337,
"ce_loss_17": 3.3113282799720762,
"ce_loss_2": 4.422825336456299,
"ce_loss_4": 3.913813602924347,
"ce_loss_9": 3.5158339619636534,
"epoch": 0.124,
"grad_norm": 636.0,
"kl_loss_13": 111.10784111022949,
"kl_loss_2": 2337.798858642578,
"kl_loss_4": 1311.4879638671875,
"kl_loss_9": 407.9518844604492,
"learning_rate": 0.0009676378356149733,
"loss": 1043.2992,
"step": 1240
},
{
"ce_loss_13": 3.3376001119613647,
"ce_loss_17": 3.2791189193725585,
"ce_loss_2": 4.370033013820648,
"ce_loss_4": 3.8703083753585816,
"ce_loss_9": 3.4755295276641847,
"epoch": 0.125,
"grad_norm": 688.0,
"kl_loss_13": 106.5986442565918,
"kl_loss_2": 2303.264697265625,
"kl_loss_4": 1290.0798889160155,
"kl_loss_9": 397.0188186645508,
"learning_rate": 0.0009670739301325534,
"loss": 1032.0988,
"step": 1250
},
{
"ce_loss_13": 3.3012579560279844,
"ce_loss_17": 3.2405815005302427,
"ce_loss_2": 4.3439412117004395,
"ce_loss_4": 3.8545931935310365,
"ce_loss_9": 3.4449319005012513,
"epoch": 0.126,
"grad_norm": 648.0,
"kl_loss_13": 107.16764678955079,
"kl_loss_2": 2308.397021484375,
"kl_loss_4": 1315.3893249511718,
"kl_loss_9": 401.0367233276367,
"learning_rate": 0.0009665053212208426,
"loss": 1047.9859,
"step": 1260
},
{
"ce_loss_13": 3.3390562295913697,
"ce_loss_17": 3.2840550541877747,
"ce_loss_2": 4.403453612327576,
"ce_loss_4": 3.890084075927734,
"ce_loss_9": 3.48423947095871,
"epoch": 0.127,
"grad_norm": 692.0,
"kl_loss_13": 106.97278785705566,
"kl_loss_2": 2372.2412658691405,
"kl_loss_4": 1327.5496520996094,
"kl_loss_9": 406.0030044555664,
"learning_rate": 0.0009659320146057262,
"loss": 1051.2734,
"step": 1270
},
{
"ce_loss_13": 3.3499784231185914,
"ce_loss_17": 3.2922070741653444,
"ce_loss_2": 4.3855064630508425,
"ce_loss_4": 3.894232487678528,
"ce_loss_9": 3.4922064304351808,
"epoch": 0.128,
"grad_norm": 612.0,
"kl_loss_13": 103.24569854736328,
"kl_loss_2": 2309.586083984375,
"kl_loss_4": 1304.8849609375,
"kl_loss_9": 395.5906951904297,
"learning_rate": 0.0009653540160603955,
"loss": 1033.1272,
"step": 1280
},
{
"ce_loss_13": 3.3521631360054016,
"ce_loss_17": 3.2958574414253237,
"ce_loss_2": 4.3798192024230955,
"ce_loss_4": 3.8924019932746887,
"ce_loss_9": 3.48896187543869,
"epoch": 0.129,
"grad_norm": 724.0,
"kl_loss_13": 103.10880165100097,
"kl_loss_2": 2327.2175170898436,
"kl_loss_4": 1318.9538818359374,
"kl_loss_9": 393.9052215576172,
"learning_rate": 0.0009647713314052896,
"loss": 1023.6325,
"step": 1290
},
{
"ce_loss_13": 3.300455665588379,
"ce_loss_17": 3.238513100147247,
"ce_loss_2": 4.398746824264526,
"ce_loss_4": 3.8899983763694763,
"ce_loss_9": 3.4518206238746645,
"epoch": 0.13,
"grad_norm": 592.0,
"kl_loss_13": 105.94969062805175,
"kl_loss_2": 2423.0098388671877,
"kl_loss_4": 1370.7003967285157,
"kl_loss_9": 407.3703353881836,
"learning_rate": 0.0009641839665080363,
"loss": 1067.2344,
"step": 1300
},
{
"ce_loss_13": 3.263159728050232,
"ce_loss_17": 3.2069602012634277,
"ce_loss_2": 4.329943525791168,
"ce_loss_4": 3.8214871406555178,
"ce_loss_9": 3.410001218318939,
"epoch": 0.131,
"grad_norm": 732.0,
"kl_loss_13": 102.1422721862793,
"kl_loss_2": 2348.4533203125,
"kl_loss_4": 1318.508233642578,
"kl_loss_9": 402.711003112793,
"learning_rate": 0.0009635919272833937,
"loss": 1031.7428,
"step": 1310
},
{
"ce_loss_13": 3.2946990489959718,
"ce_loss_17": 3.2358811616897585,
"ce_loss_2": 4.366981887817383,
"ce_loss_4": 3.8651524066925047,
"ce_loss_9": 3.4487127661705017,
"epoch": 0.132,
"grad_norm": 704.0,
"kl_loss_13": 104.14997596740723,
"kl_loss_2": 2343.723876953125,
"kl_loss_4": 1323.0115295410155,
"kl_loss_9": 405.50807189941406,
"learning_rate": 0.0009629952196931902,
"loss": 1025.0818,
"step": 1320
},
{
"ce_loss_13": 3.286488246917725,
"ce_loss_17": 3.2299813985824586,
"ce_loss_2": 4.354435992240906,
"ce_loss_4": 3.831107270717621,
"ce_loss_9": 3.4259880065917967,
"epoch": 0.133,
"grad_norm": 620.0,
"kl_loss_13": 102.17385025024414,
"kl_loss_2": 2366.3618713378905,
"kl_loss_4": 1307.6370422363282,
"kl_loss_9": 393.3925552368164,
"learning_rate": 0.0009623938497462645,
"loss": 1037.0735,
"step": 1330
},
{
"ce_loss_13": 3.2744598984718323,
"ce_loss_17": 3.2186703085899353,
"ce_loss_2": 4.334010004997253,
"ce_loss_4": 3.831056308746338,
"ce_loss_9": 3.4211880207061767,
"epoch": 0.134,
"grad_norm": 604.0,
"kl_loss_13": 102.095463180542,
"kl_loss_2": 2336.8879577636717,
"kl_loss_4": 1313.4937805175782,
"kl_loss_9": 402.02156829833984,
"learning_rate": 0.0009617878234984055,
"loss": 1048.8602,
"step": 1340
},
{
"ce_loss_13": 3.3661810636520384,
"ce_loss_17": 3.3122437596321106,
"ce_loss_2": 4.380954694747925,
"ce_loss_4": 3.882122778892517,
"ce_loss_9": 3.5056236147880555,
"epoch": 0.135,
"grad_norm": 716.0,
"kl_loss_13": 99.42040252685547,
"kl_loss_2": 2261.796173095703,
"kl_loss_4": 1254.1968139648438,
"kl_loss_9": 388.378955078125,
"learning_rate": 0.0009611771470522907,
"loss": 1019.8208,
"step": 1350
},
{
"ce_loss_13": 3.2976178526878357,
"ce_loss_17": 3.2403480648994445,
"ce_loss_2": 4.354464423656464,
"ce_loss_4": 3.8516308665275574,
"ce_loss_9": 3.4469806432723997,
"epoch": 0.136,
"grad_norm": 664.0,
"kl_loss_13": 100.53326721191407,
"kl_loss_2": 2299.1686767578126,
"kl_loss_4": 1291.9524169921874,
"kl_loss_9": 397.22457122802734,
"learning_rate": 0.0009605618265574251,
"loss": 1014.9305,
"step": 1360
},
{
"ce_loss_13": 3.268050992488861,
"ce_loss_17": 3.2077465653419495,
"ce_loss_2": 4.356378340721131,
"ce_loss_4": 3.8343679308891296,
"ce_loss_9": 3.4144388794898988,
"epoch": 0.137,
"grad_norm": 696.0,
"kl_loss_13": 110.57073059082032,
"kl_loss_2": 2421.8905639648438,
"kl_loss_4": 1353.3382202148437,
"kl_loss_9": 410.3678497314453,
"learning_rate": 0.0009599418682100792,
"loss": 1051.8801,
"step": 1370
},
{
"ce_loss_13": 3.299811065196991,
"ce_loss_17": 3.246116352081299,
"ce_loss_2": 4.357406163215638,
"ce_loss_4": 3.846174156665802,
"ce_loss_9": 3.4445716023445128,
"epoch": 0.138,
"grad_norm": 768.0,
"kl_loss_13": 101.31869430541992,
"kl_loss_2": 2340.7691650390625,
"kl_loss_4": 1298.7441711425781,
"kl_loss_9": 394.77845764160156,
"learning_rate": 0.0009593172782532268,
"loss": 1037.185,
"step": 1380
},
{
"ce_loss_13": 3.3381003975868224,
"ce_loss_17": 3.2823010683059692,
"ce_loss_2": 4.379236328601837,
"ce_loss_4": 3.878574550151825,
"ce_loss_9": 3.4841482758522035,
"epoch": 0.139,
"grad_norm": 668.0,
"kl_loss_13": 102.40975189208984,
"kl_loss_2": 2298.5057006835937,
"kl_loss_4": 1290.5589904785156,
"kl_loss_9": 396.7320785522461,
"learning_rate": 0.0009586880629764817,
"loss": 1020.4529,
"step": 1390
},
{
"ce_loss_13": 3.2770840644836428,
"ce_loss_17": 3.219211196899414,
"ce_loss_2": 4.331016874313354,
"ce_loss_4": 3.833348023891449,
"ce_loss_9": 3.4286335825920107,
"epoch": 0.14,
"grad_norm": 784.0,
"kl_loss_13": 107.02684516906739,
"kl_loss_2": 2320.654132080078,
"kl_loss_4": 1304.0484802246094,
"kl_loss_9": 410.73537292480466,
"learning_rate": 0.0009580542287160348,
"loss": 1020.2037,
"step": 1400
},
{
"ce_loss_13": 3.23485426902771,
"ce_loss_17": 3.1792452573776244,
"ce_loss_2": 4.287278437614441,
"ce_loss_4": 3.780201089382172,
"ce_loss_9": 3.387134313583374,
"epoch": 0.141,
"grad_norm": 700.0,
"kl_loss_13": 102.53819580078125,
"kl_loss_2": 2329.6465576171877,
"kl_loss_4": 1294.6370422363282,
"kl_loss_9": 414.023176574707,
"learning_rate": 0.0009574157818545901,
"loss": 1019.7475,
"step": 1410
},
{
"ce_loss_13": 3.307619261741638,
"ce_loss_17": 3.252472710609436,
"ce_loss_2": 4.327524995803833,
"ce_loss_4": 3.8272714972496034,
"ce_loss_9": 3.4502039551734924,
"epoch": 0.142,
"grad_norm": 660.0,
"kl_loss_13": 99.3475284576416,
"kl_loss_2": 2264.283074951172,
"kl_loss_4": 1250.3487060546875,
"kl_loss_9": 398.88404846191406,
"learning_rate": 0.0009567727288213005,
"loss": 1023.8275,
"step": 1420
},
{
"ce_loss_13": 3.2860882759094237,
"ce_loss_17": 3.229217326641083,
"ce_loss_2": 4.3413330078125,
"ce_loss_4": 3.826633703708649,
"ce_loss_9": 3.4349179744720457,
"epoch": 0.143,
"grad_norm": 700.0,
"kl_loss_13": 104.36613082885742,
"kl_loss_2": 2341.603790283203,
"kl_loss_4": 1282.8856384277344,
"kl_loss_9": 409.2802230834961,
"learning_rate": 0.0009561250760917027,
"loss": 1023.3168,
"step": 1430
},
{
"ce_loss_13": 3.307588982582092,
"ce_loss_17": 3.2452077507972716,
"ce_loss_2": 4.344717502593994,
"ce_loss_4": 3.8385209679603576,
"ce_loss_9": 3.448259747028351,
"epoch": 0.144,
"grad_norm": 1048.0,
"kl_loss_13": 114.14393768310546,
"kl_loss_2": 2334.0638671875,
"kl_loss_4": 1302.6134704589845,
"kl_loss_9": 412.03075408935547,
"learning_rate": 0.0009554728301876525,
"loss": 1013.7202,
"step": 1440
},
{
"ce_loss_13": 3.35393488407135,
"ce_loss_17": 3.290667915344238,
"ce_loss_2": 4.3726026058197025,
"ce_loss_4": 3.886983585357666,
"ce_loss_9": 3.49219263792038,
"epoch": 0.145,
"grad_norm": 800.0,
"kl_loss_13": 114.66305503845214,
"kl_loss_2": 2275.094317626953,
"kl_loss_4": 1288.3170532226563,
"kl_loss_9": 399.14944915771486,
"learning_rate": 0.0009548159976772592,
"loss": 1047.163,
"step": 1450
},
{
"ce_loss_13": 3.3060134768486025,
"ce_loss_17": 3.2398659944534303,
"ce_loss_2": 4.361707043647766,
"ce_loss_4": 3.8484949827194215,
"ce_loss_9": 3.446886456012726,
"epoch": 0.146,
"grad_norm": 712.0,
"kl_loss_13": 117.4349208831787,
"kl_loss_2": 2349.9953674316407,
"kl_loss_4": 1312.7109619140624,
"kl_loss_9": 406.10399169921874,
"learning_rate": 0.0009541545851748186,
"loss": 1035.0002,
"step": 1460
},
{
"ce_loss_13": 3.1733034372329714,
"ce_loss_17": 3.11181960105896,
"ce_loss_2": 4.268103861808777,
"ce_loss_4": 3.7309717416763304,
"ce_loss_9": 3.3166835069656373,
"epoch": 0.147,
"grad_norm": 764.0,
"kl_loss_13": 117.60221443176269,
"kl_loss_2": 2404.8810546875,
"kl_loss_4": 1318.489569091797,
"kl_loss_9": 399.5591079711914,
"learning_rate": 0.0009534885993407473,
"loss": 1042.8139,
"step": 1470
},
{
"ce_loss_13": 3.353926646709442,
"ce_loss_17": 3.2724978923797607,
"ce_loss_2": 4.392566514015198,
"ce_loss_4": 3.892224359512329,
"ce_loss_9": 3.4726366877555845,
"epoch": 0.148,
"grad_norm": 724.0,
"kl_loss_13": 139.65031089782715,
"kl_loss_2": 2350.8017578125,
"kl_loss_4": 1321.2150573730469,
"kl_loss_9": 393.81555328369143,
"learning_rate": 0.0009528180468815154,
"loss": 1047.985,
"step": 1480
},
{
"ce_loss_13": 3.3915168404579163,
"ce_loss_17": 3.3253281235694887,
"ce_loss_2": 4.396991300582886,
"ce_loss_4": 3.911241555213928,
"ce_loss_9": 3.520040047168732,
"epoch": 0.149,
"grad_norm": 636.0,
"kl_loss_13": 137.39116935729982,
"kl_loss_2": 2266.0220336914062,
"kl_loss_4": 1272.7680358886719,
"kl_loss_9": 399.7412567138672,
"learning_rate": 0.0009521429345495787,
"loss": 1021.5193,
"step": 1490
},
{
"ce_loss_13": 3.377981424331665,
"ce_loss_17": 3.3089918971061705,
"ce_loss_2": 4.36256160736084,
"ce_loss_4": 3.874511981010437,
"ce_loss_9": 3.499216449260712,
"epoch": 0.15,
"grad_norm": 680.0,
"kl_loss_13": 119.94844093322754,
"kl_loss_2": 2246.3206298828127,
"kl_loss_4": 1248.2180053710938,
"kl_loss_9": 388.26966400146483,
"learning_rate": 0.0009514632691433108,
"loss": 1014.2158,
"step": 1500
},
{
"ce_loss_13": 3.329502213001251,
"ce_loss_17": 3.2669502973556517,
"ce_loss_2": 4.347873878479004,
"ce_loss_4": 3.857915127277374,
"ce_loss_9": 3.464929723739624,
"epoch": 0.151,
"grad_norm": 688.0,
"kl_loss_13": 113.11929283142089,
"kl_loss_2": 2280.291882324219,
"kl_loss_4": 1273.6213623046874,
"kl_loss_9": 394.5995330810547,
"learning_rate": 0.0009507790575069346,
"loss": 1013.6383,
"step": 1510
},
{
"ce_loss_13": 3.297570192813873,
"ce_loss_17": 3.232813537120819,
"ce_loss_2": 4.349590492248535,
"ce_loss_4": 3.8426506996154783,
"ce_loss_9": 3.443753886222839,
"epoch": 0.152,
"grad_norm": 712.0,
"kl_loss_13": 110.85068626403809,
"kl_loss_2": 2324.2668701171874,
"kl_loss_4": 1295.7248596191407,
"kl_loss_9": 401.2438629150391,
"learning_rate": 0.0009500903065304539,
"loss": 1043.6035,
"step": 1520
},
{
"ce_loss_13": 3.3399811387062073,
"ce_loss_17": 3.2841987729072573,
"ce_loss_2": 4.330187225341797,
"ce_loss_4": 3.8528904795646666,
"ce_loss_9": 3.471581506729126,
"epoch": 0.153,
"grad_norm": 744.0,
"kl_loss_13": 102.34607124328613,
"kl_loss_2": 2202.66767578125,
"kl_loss_4": 1236.2300415039062,
"kl_loss_9": 375.545426940918,
"learning_rate": 0.0009493970231495835,
"loss": 1002.7671,
"step": 1530
},
{
"ce_loss_13": 3.2758294701576234,
"ce_loss_17": 3.222549247741699,
"ce_loss_2": 4.2692118883132935,
"ce_loss_4": 3.7932890176773073,
"ce_loss_9": 3.4076798796653747,
"epoch": 0.154,
"grad_norm": 740.0,
"kl_loss_13": 97.60850028991699,
"kl_loss_2": 2240.9700256347655,
"kl_loss_4": 1261.6067199707031,
"kl_loss_9": 374.13759460449216,
"learning_rate": 0.0009486992143456792,
"loss": 992.1441,
"step": 1540
},
{
"ce_loss_13": 3.2965641260147094,
"ce_loss_17": 3.2360586524009705,
"ce_loss_2": 4.394272017478943,
"ce_loss_4": 3.863716244697571,
"ce_loss_9": 3.440040957927704,
"epoch": 0.155,
"grad_norm": 636.0,
"kl_loss_13": 105.34446182250977,
"kl_loss_2": 2420.0051208496093,
"kl_loss_4": 1341.6831604003905,
"kl_loss_9": 404.8099853515625,
"learning_rate": 0.0009479968871456679,
"loss": 1036.6555,
"step": 1550
},
{
"ce_loss_13": 3.2709760665893555,
"ce_loss_17": 3.215757656097412,
"ce_loss_2": 4.324735450744629,
"ce_loss_4": 3.8133442997932434,
"ce_loss_9": 3.4105438113212587,
"epoch": 0.156,
"grad_norm": 736.0,
"kl_loss_13": 99.87029724121093,
"kl_loss_2": 2349.607080078125,
"kl_loss_4": 1300.8240966796875,
"kl_loss_9": 390.3031494140625,
"learning_rate": 0.0009472900486219768,
"loss": 1013.3098,
"step": 1560
},
{
"ce_loss_13": 3.2618454694747925,
"ce_loss_17": 3.205492925643921,
"ce_loss_2": 4.283135652542114,
"ce_loss_4": 3.7982996821403505,
"ce_loss_9": 3.400241732597351,
"epoch": 0.157,
"grad_norm": 728.0,
"kl_loss_13": 99.15305404663086,
"kl_loss_2": 2290.1863708496094,
"kl_loss_4": 1283.8908752441407,
"kl_loss_9": 386.46409759521487,
"learning_rate": 0.000946578705892462,
"loss": 1012.301,
"step": 1570
},
{
"ce_loss_13": 3.300940454006195,
"ce_loss_17": 3.245930051803589,
"ce_loss_2": 4.308735108375549,
"ce_loss_4": 3.815536880493164,
"ce_loss_9": 3.439196026325226,
"epoch": 0.158,
"grad_norm": 628.0,
"kl_loss_13": 97.29884643554688,
"kl_loss_2": 2229.8876831054686,
"kl_loss_4": 1225.6342895507812,
"kl_loss_9": 376.42786712646483,
"learning_rate": 0.0009458628661203367,
"loss": 1001.9973,
"step": 1580
},
{
"ce_loss_13": 3.3031871557235717,
"ce_loss_17": 3.248406708240509,
"ce_loss_2": 4.366739439964294,
"ce_loss_4": 3.854965567588806,
"ce_loss_9": 3.4469083189964294,
"epoch": 0.159,
"grad_norm": 780.0,
"kl_loss_13": 98.17051391601562,
"kl_loss_2": 2354.595355224609,
"kl_loss_4": 1316.9095825195313,
"kl_loss_9": 394.9068069458008,
"learning_rate": 0.0009451425365140996,
"loss": 1001.7581,
"step": 1590
},
{
"ce_loss_13": 3.3753222942352297,
"ce_loss_17": 3.3201666951179503,
"ce_loss_2": 4.361679863929749,
"ce_loss_4": 3.8934988975524902,
"ce_loss_9": 3.5151140213012697,
"epoch": 0.16,
"grad_norm": 668.0,
"kl_loss_13": 98.52090950012207,
"kl_loss_2": 2197.9241455078127,
"kl_loss_4": 1234.2187805175781,
"kl_loss_9": 380.74073638916013,
"learning_rate": 0.0009444177243274617,
"loss": 975.1326,
"step": 1600
},
{
"ce_loss_13": 3.2342332720756533,
"ce_loss_17": 3.175268256664276,
"ce_loss_2": 4.2756345748901365,
"ce_loss_4": 3.775297236442566,
"ce_loss_9": 3.3749369978904724,
"epoch": 0.161,
"grad_norm": 776.0,
"kl_loss_13": 105.07829132080079,
"kl_loss_2": 2310.253662109375,
"kl_loss_4": 1293.7348571777343,
"kl_loss_9": 394.623030090332,
"learning_rate": 0.0009436884368592739,
"loss": 1016.6045,
"step": 1610
},
{
"ce_loss_13": 3.282170295715332,
"ce_loss_17": 3.227897846698761,
"ce_loss_2": 4.281191802024841,
"ce_loss_4": 3.8088948130607605,
"ce_loss_9": 3.4205331802368164,
"epoch": 0.162,
"grad_norm": 684.0,
"kl_loss_13": 101.84013671875,
"kl_loss_2": 2230.205743408203,
"kl_loss_4": 1252.5165954589843,
"kl_loss_9": 380.3778244018555,
"learning_rate": 0.0009429546814534529,
"loss": 1012.8927,
"step": 1620
},
{
"ce_loss_13": 3.2896040558815,
"ce_loss_17": 3.2361085057258605,
"ce_loss_2": 4.290119099617004,
"ce_loss_4": 3.80860458612442,
"ce_loss_9": 3.422525429725647,
"epoch": 0.163,
"grad_norm": 596.0,
"kl_loss_13": 101.63015174865723,
"kl_loss_2": 2214.722314453125,
"kl_loss_4": 1243.6845581054688,
"kl_loss_9": 376.9969970703125,
"learning_rate": 0.0009422164654989072,
"loss": 974.724,
"step": 1630
},
{
"ce_loss_13": 3.408917820453644,
"ce_loss_17": 3.3504751324653625,
"ce_loss_2": 4.3929561376571655,
"ce_loss_4": 3.9180466413497923,
"ce_loss_9": 3.539123165607452,
"epoch": 0.164,
"grad_norm": 672.0,
"kl_loss_13": 102.51958618164062,
"kl_loss_2": 2216.1884216308595,
"kl_loss_4": 1243.4335388183595,
"kl_loss_9": 376.5258193969727,
"learning_rate": 0.0009414737964294635,
"loss": 992.4483,
"step": 1640
},
{
"ce_loss_13": 3.336865794658661,
"ce_loss_17": 3.2845290422439577,
"ce_loss_2": 4.295578408241272,
"ce_loss_4": 3.8279440999031067,
"ce_loss_9": 3.464097237586975,
"epoch": 0.165,
"grad_norm": 724.0,
"kl_loss_13": 95.76305503845215,
"kl_loss_2": 2155.4929138183593,
"kl_loss_4": 1196.320068359375,
"kl_loss_9": 359.5951324462891,
"learning_rate": 0.000940726681723791,
"loss": 982.7427,
"step": 1650
},
{
"ce_loss_13": 3.1780314207077027,
"ce_loss_17": 3.1239538311958315,
"ce_loss_2": 4.237276661396026,
"ce_loss_4": 3.7238500833511354,
"ce_loss_9": 3.3144403100013733,
"epoch": 0.166,
"grad_norm": 724.0,
"kl_loss_13": 99.09452362060547,
"kl_loss_2": 2357.867547607422,
"kl_loss_4": 1307.1837219238282,
"kl_loss_9": 384.91309967041013,
"learning_rate": 0.0009399751289053266,
"loss": 991.817,
"step": 1660
},
{
"ce_loss_13": 3.3884536027908325,
"ce_loss_17": 3.3351887702941894,
"ce_loss_2": 4.372224187850952,
"ce_loss_4": 3.897164463996887,
"ce_loss_9": 3.519719648361206,
"epoch": 0.167,
"grad_norm": 744.0,
"kl_loss_13": 97.94064750671387,
"kl_loss_2": 2201.8851684570313,
"kl_loss_4": 1226.9512145996093,
"kl_loss_9": 369.67835388183596,
"learning_rate": 0.0009392191455421988,
"loss": 992.1326,
"step": 1670
},
{
"ce_loss_13": 3.361027550697327,
"ce_loss_17": 3.30734326839447,
"ce_loss_2": 4.355885291099549,
"ce_loss_4": 3.8685357093811037,
"ce_loss_9": 3.4965213537216187,
"epoch": 0.168,
"grad_norm": 712.0,
"kl_loss_13": 98.55237617492676,
"kl_loss_2": 2243.932727050781,
"kl_loss_4": 1255.4773864746094,
"kl_loss_9": 382.85614013671875,
"learning_rate": 0.0009384587392471515,
"loss": 974.9587,
"step": 1680
},
{
"ce_loss_13": 3.3498239159584045,
"ce_loss_17": 3.299537754058838,
"ce_loss_2": 4.313471567630768,
"ce_loss_4": 3.8637603282928468,
"ce_loss_9": 3.483183753490448,
"epoch": 0.169,
"grad_norm": 764.0,
"kl_loss_13": 94.6076271057129,
"kl_loss_2": 2159.169714355469,
"kl_loss_4": 1226.2245056152344,
"kl_loss_9": 369.7983459472656,
"learning_rate": 0.0009376939176774678,
"loss": 962.5213,
"step": 1690
},
{
"ce_loss_13": 3.3299458265304565,
"ce_loss_17": 3.274555242061615,
"ce_loss_2": 4.31501624584198,
"ce_loss_4": 3.8454368233680727,
"ce_loss_9": 3.461405873298645,
"epoch": 0.17,
"grad_norm": 684.0,
"kl_loss_13": 94.80207786560058,
"kl_loss_2": 2201.3943298339846,
"kl_loss_4": 1233.8527954101562,
"kl_loss_9": 372.47161865234375,
"learning_rate": 0.0009369246885348925,
"loss": 994.4191,
"step": 1700
},
{
"ce_loss_13": 3.316060709953308,
"ce_loss_17": 3.263388156890869,
"ce_loss_2": 4.352077054977417,
"ce_loss_4": 3.863167035579681,
"ce_loss_9": 3.456589198112488,
"epoch": 0.171,
"grad_norm": 708.0,
"kl_loss_13": 96.03619194030762,
"kl_loss_2": 2295.2953247070313,
"kl_loss_4": 1286.23359375,
"kl_loss_9": 378.4852294921875,
"learning_rate": 0.0009361510595655545,
"loss": 1000.7859,
"step": 1710
},
{
"ce_loss_13": 3.2737643480300904,
"ce_loss_17": 3.217836821079254,
"ce_loss_2": 4.278836250305176,
"ce_loss_4": 3.8066452503204347,
"ce_loss_9": 3.4152570962905884,
"epoch": 0.172,
"grad_norm": 740.0,
"kl_loss_13": 97.65170364379883,
"kl_loss_2": 2254.1198181152345,
"kl_loss_4": 1270.6001098632812,
"kl_loss_9": 385.4350952148437,
"learning_rate": 0.0009353730385598887,
"loss": 993.403,
"step": 1720
},
{
"ce_loss_13": 3.2072998046875,
"ce_loss_17": 3.155010712146759,
"ce_loss_2": 4.253859066963196,
"ce_loss_4": 3.7527849435806275,
"ce_loss_9": 3.343551850318909,
"epoch": 0.173,
"grad_norm": 636.0,
"kl_loss_13": 94.61254920959473,
"kl_loss_2": 2303.8027160644533,
"kl_loss_4": 1281.1050354003905,
"kl_loss_9": 379.4188980102539,
"learning_rate": 0.0009345906333525581,
"loss": 1012.3521,
"step": 1730
},
{
"ce_loss_13": 3.2443289399147033,
"ce_loss_17": 3.191619837284088,
"ce_loss_2": 4.257315444946289,
"ce_loss_4": 3.7754439234733583,
"ce_loss_9": 3.386739265918732,
"epoch": 0.174,
"grad_norm": 648.0,
"kl_loss_13": 95.79471549987792,
"kl_loss_2": 2262.873565673828,
"kl_loss_4": 1266.0321838378907,
"kl_loss_9": 385.38279876708987,
"learning_rate": 0.0009338038518223745,
"loss": 986.0615,
"step": 1740
},
{
"ce_loss_13": 3.3160131573677063,
"ce_loss_17": 3.258601987361908,
"ce_loss_2": 4.321772587299347,
"ce_loss_4": 3.8466910123825073,
"ce_loss_9": 3.4559434175491335,
"epoch": 0.175,
"grad_norm": 676.0,
"kl_loss_13": 98.67384262084961,
"kl_loss_2": 2265.1025451660157,
"kl_loss_4": 1277.967919921875,
"kl_loss_9": 389.96258544921875,
"learning_rate": 0.0009330127018922195,
"loss": 1019.1023,
"step": 1750
},
{
"ce_loss_13": 3.2658791184425353,
"ce_loss_17": 3.211934673786163,
"ce_loss_2": 4.287610340118408,
"ce_loss_4": 3.787616419792175,
"ce_loss_9": 3.4017861127853393,
"epoch": 0.176,
"grad_norm": 732.0,
"kl_loss_13": 98.68330879211426,
"kl_loss_2": 2283.4465515136717,
"kl_loss_4": 1258.4863525390624,
"kl_loss_9": 387.55812377929686,
"learning_rate": 0.0009322171915289634,
"loss": 1005.191,
"step": 1760
},
{
"ce_loss_13": 3.297553205490112,
"ce_loss_17": 3.2483973145484923,
"ce_loss_2": 4.274235343933105,
"ce_loss_4": 3.803940236568451,
"ce_loss_9": 3.440148711204529,
"epoch": 0.177,
"grad_norm": 676.0,
"kl_loss_13": 93.90616378784179,
"kl_loss_2": 2205.721160888672,
"kl_loss_4": 1229.010723876953,
"kl_loss_9": 393.6399291992187,
"learning_rate": 0.0009314173287433873,
"loss": 975.3521,
"step": 1770
},
{
"ce_loss_13": 3.287331283092499,
"ce_loss_17": 3.234244930744171,
"ce_loss_2": 4.282831525802612,
"ce_loss_4": 3.8102983593940736,
"ce_loss_9": 3.4339799761772154,
"epoch": 0.178,
"grad_norm": 812.0,
"kl_loss_13": 96.42430877685547,
"kl_loss_2": 2225.7175170898436,
"kl_loss_4": 1256.8188354492188,
"kl_loss_9": 393.8082046508789,
"learning_rate": 0.0009306131215901003,
"loss": 974.4363,
"step": 1780
},
{
"ce_loss_13": 3.3164478540420532,
"ce_loss_17": 3.2623019337654116,
"ce_loss_2": 4.310870909690857,
"ce_loss_4": 3.826959025859833,
"ce_loss_9": 3.4496538281440734,
"epoch": 0.179,
"grad_norm": 716.0,
"kl_loss_13": 95.39816360473633,
"kl_loss_2": 2206.229522705078,
"kl_loss_4": 1226.5503540039062,
"kl_loss_9": 378.513117980957,
"learning_rate": 0.0009298045781674596,
"loss": 959.6615,
"step": 1790
},
{
"ce_loss_13": 3.3004026770591737,
"ce_loss_17": 3.2483393430709837,
"ce_loss_2": 4.272047340869904,
"ce_loss_4": 3.8030646204948426,
"ce_loss_9": 3.4380339860916136,
"epoch": 0.18,
"grad_norm": 672.0,
"kl_loss_13": 96.91108589172363,
"kl_loss_2": 2171.518029785156,
"kl_loss_4": 1204.9924926757812,
"kl_loss_9": 371.96835174560545,
"learning_rate": 0.0009289917066174886,
"loss": 977.4246,
"step": 1800
},
{
"ce_loss_13": 3.295607936382294,
"ce_loss_17": 3.246348476409912,
"ce_loss_2": 4.23402863740921,
"ce_loss_4": 3.781065320968628,
"ce_loss_9": 3.4213656067848204,
"epoch": 0.181,
"grad_norm": 616.0,
"kl_loss_13": 92.53666381835937,
"kl_loss_2": 2120.001788330078,
"kl_loss_4": 1176.5944580078126,
"kl_loss_9": 360.6382293701172,
"learning_rate": 0.0009281745151257945,
"loss": 949.8141,
"step": 1810
},
{
"ce_loss_13": 3.318648707866669,
"ce_loss_17": 3.2640655994415284,
"ce_loss_2": 4.306107974052429,
"ce_loss_4": 3.822345507144928,
"ce_loss_9": 3.4511567831039427,
"epoch": 0.182,
"grad_norm": 688.0,
"kl_loss_13": 96.26609077453614,
"kl_loss_2": 2194.0708435058596,
"kl_loss_4": 1218.6878967285156,
"kl_loss_9": 369.2196868896484,
"learning_rate": 0.0009273530119214868,
"loss": 976.1288,
"step": 1820
},
{
"ce_loss_13": 3.4107258915901184,
"ce_loss_17": 3.355591583251953,
"ce_loss_2": 4.380393171310425,
"ce_loss_4": 3.9092475295066835,
"ce_loss_9": 3.5371835827827454,
"epoch": 0.183,
"grad_norm": 680.0,
"kl_loss_13": 100.99568901062011,
"kl_loss_2": 2185.2503173828127,
"kl_loss_4": 1211.135675048828,
"kl_loss_9": 362.42479248046874,
"learning_rate": 0.0009265272052770935,
"loss": 951.5828,
"step": 1830
},
{
"ce_loss_13": 3.2406865477561952,
"ce_loss_17": 3.1844029545784,
"ce_loss_2": 4.2559812545776365,
"ce_loss_4": 3.7556538224220275,
"ce_loss_9": 3.374473440647125,
"epoch": 0.184,
"grad_norm": 840.0,
"kl_loss_13": 100.13202857971191,
"kl_loss_2": 2238.2178466796877,
"kl_loss_4": 1228.1046081542968,
"kl_loss_9": 364.12084045410154,
"learning_rate": 0.0009256971035084784,
"loss": 979.8647,
"step": 1840
},
{
"ce_loss_13": 3.1806642532348635,
"ce_loss_17": 3.124430739879608,
"ce_loss_2": 4.21564245223999,
"ce_loss_4": 3.7193616032600403,
"ce_loss_9": 3.3188551664352417,
"epoch": 0.185,
"grad_norm": 692.0,
"kl_loss_13": 98.3440185546875,
"kl_loss_2": 2289.003448486328,
"kl_loss_4": 1277.9115295410156,
"kl_loss_9": 379.96221313476565,
"learning_rate": 0.0009248627149747573,
"loss": 990.2439,
"step": 1850
},
{
"ce_loss_13": 3.377814221382141,
"ce_loss_17": 3.322389805316925,
"ce_loss_2": 4.341128206253051,
"ce_loss_4": 3.8737547159194947,
"ce_loss_9": 3.507482278347015,
"epoch": 0.186,
"grad_norm": 624.0,
"kl_loss_13": 97.9429141998291,
"kl_loss_2": 2181.674755859375,
"kl_loss_4": 1211.4905883789063,
"kl_loss_9": 364.52012329101564,
"learning_rate": 0.0009240240480782129,
"loss": 964.9949,
"step": 1860
},
{
"ce_loss_13": 3.284755754470825,
"ce_loss_17": 3.228792119026184,
"ce_loss_2": 4.2813700318336485,
"ce_loss_4": 3.8053017973899843,
"ce_loss_9": 3.413658332824707,
"epoch": 0.187,
"grad_norm": 724.0,
"kl_loss_13": 101.41178092956542,
"kl_loss_2": 2236.3501953125,
"kl_loss_4": 1250.810723876953,
"kl_loss_9": 368.65606689453125,
"learning_rate": 0.0009231811112642122,
"loss": 970.5515,
"step": 1870
},
{
"ce_loss_13": 3.323897731304169,
"ce_loss_17": 3.271409976482391,
"ce_loss_2": 4.2732173204422,
"ce_loss_4": 3.8129101991653442,
"ce_loss_9": 3.452033507823944,
"epoch": 0.188,
"grad_norm": 680.0,
"kl_loss_13": 101.72588729858398,
"kl_loss_2": 2143.6392456054687,
"kl_loss_4": 1202.8489196777343,
"kl_loss_9": 365.70509033203126,
"learning_rate": 0.0009223339130211192,
"loss": 955.3392,
"step": 1880
},
{
"ce_loss_13": 3.1829244017601015,
"ce_loss_17": 3.131213593482971,
"ce_loss_2": 4.204755032062531,
"ce_loss_4": 3.696164774894714,
"ce_loss_9": 3.3124831914901733,
"epoch": 0.189,
"grad_norm": 648.0,
"kl_loss_13": 97.00320014953613,
"kl_loss_2": 2264.2671142578124,
"kl_loss_4": 1233.0195007324219,
"kl_loss_9": 358.06385650634763,
"learning_rate": 0.0009214824618802108,
"loss": 981.127,
"step": 1890
},
{
"ce_loss_13": 3.36339693069458,
"ce_loss_17": 3.3088223576545714,
"ce_loss_2": 4.347205972671508,
"ce_loss_4": 3.864045512676239,
"ce_loss_9": 3.4887256264686584,
"epoch": 0.19,
"grad_norm": 648.0,
"kl_loss_13": 98.71356239318848,
"kl_loss_2": 2180.965411376953,
"kl_loss_4": 1211.7989624023437,
"kl_loss_9": 367.0889587402344,
"learning_rate": 0.0009206267664155906,
"loss": 988.14,
"step": 1900
},
{
"ce_loss_13": 3.287778210639954,
"ce_loss_17": 3.2305564880371094,
"ce_loss_2": 4.282608389854431,
"ce_loss_4": 3.797332525253296,
"ce_loss_9": 3.418061101436615,
"epoch": 0.191,
"grad_norm": 712.0,
"kl_loss_13": 97.9106990814209,
"kl_loss_2": 2215.468609619141,
"kl_loss_4": 1221.6683959960938,
"kl_loss_9": 366.5027740478516,
"learning_rate": 0.0009197668352441024,
"loss": 975.1162,
"step": 1910
},
{
"ce_loss_13": 3.3348501682281495,
"ce_loss_17": 3.2833409309387207,
"ce_loss_2": 4.30408536195755,
"ce_loss_4": 3.832443726062775,
"ce_loss_9": 3.462253785133362,
"epoch": 0.192,
"grad_norm": 764.0,
"kl_loss_13": 96.12831916809083,
"kl_loss_2": 2158.1401611328124,
"kl_loss_4": 1201.4192199707031,
"kl_loss_9": 362.71641845703124,
"learning_rate": 0.0009189026770252437,
"loss": 962.7703,
"step": 1920
},
{
"ce_loss_13": 3.3621666431427,
"ce_loss_17": 3.309820866584778,
"ce_loss_2": 4.322707390785217,
"ce_loss_4": 3.8623532176017763,
"ce_loss_9": 3.489624488353729,
"epoch": 0.193,
"grad_norm": 732.0,
"kl_loss_13": 94.84638748168945,
"kl_loss_2": 2159.771588134766,
"kl_loss_4": 1210.8668151855468,
"kl_loss_9": 364.78248138427733,
"learning_rate": 0.000918034300461078,
"loss": 993.4104,
"step": 1930
},
{
"ce_loss_13": 3.3837892532348635,
"ce_loss_17": 3.3325113892555236,
"ce_loss_2": 4.336338758468628,
"ce_loss_4": 3.8793725252151487,
"ce_loss_9": 3.5151267051696777,
"epoch": 0.194,
"grad_norm": 744.0,
"kl_loss_13": 94.3999252319336,
"kl_loss_2": 2147.145361328125,
"kl_loss_4": 1198.3227783203124,
"kl_loss_9": 364.49977416992186,
"learning_rate": 0.0009171617142961477,
"loss": 954.6803,
"step": 1940
},
{
"ce_loss_13": 3.3508265733718874,
"ce_loss_17": 3.2987019777297975,
"ce_loss_2": 4.311159837245941,
"ce_loss_4": 3.844617450237274,
"ce_loss_9": 3.4781469225883486,
"epoch": 0.195,
"grad_norm": 712.0,
"kl_loss_13": 91.6006866455078,
"kl_loss_2": 2154.9497253417967,
"kl_loss_4": 1195.2054809570313,
"kl_loss_9": 360.1728317260742,
"learning_rate": 0.0009162849273173857,
"loss": 956.5812,
"step": 1950
},
{
"ce_loss_13": 3.2878409743309023,
"ce_loss_17": 3.23888703584671,
"ce_loss_2": 4.251902055740357,
"ce_loss_4": 3.778362774848938,
"ce_loss_9": 3.416101062297821,
"epoch": 0.196,
"grad_norm": 732.0,
"kl_loss_13": 89.0274787902832,
"kl_loss_2": 2143.692755126953,
"kl_loss_4": 1174.520458984375,
"kl_loss_9": 357.48160552978516,
"learning_rate": 0.0009154039483540273,
"loss": 954.3681,
"step": 1960
},
{
"ce_loss_13": 3.268400990962982,
"ce_loss_17": 3.217691791057587,
"ce_loss_2": 4.245136177539825,
"ce_loss_4": 3.7626100063323973,
"ce_loss_9": 3.4000661253929136,
"epoch": 0.197,
"grad_norm": 640.0,
"kl_loss_13": 90.23016510009765,
"kl_loss_2": 2190.5732482910157,
"kl_loss_4": 1200.616796875,
"kl_loss_9": 359.080143737793,
"learning_rate": 0.0009145187862775209,
"loss": 958.1281,
"step": 1970
},
{
"ce_loss_13": 3.3009396314620973,
"ce_loss_17": 3.251687526702881,
"ce_loss_2": 4.2589087128639225,
"ce_loss_4": 3.8026643872261046,
"ce_loss_9": 3.4303215622901915,
"epoch": 0.198,
"grad_norm": 740.0,
"kl_loss_13": 90.92903099060058,
"kl_loss_2": 2159.524542236328,
"kl_loss_4": 1213.1752502441407,
"kl_loss_9": 358.58487396240236,
"learning_rate": 0.0009136294500014386,
"loss": 949.0615,
"step": 1980
},
{
"ce_loss_13": 3.2511113047599793,
"ce_loss_17": 3.2019614815711974,
"ce_loss_2": 4.281300258636475,
"ce_loss_4": 3.773765230178833,
"ce_loss_9": 3.385185647010803,
"epoch": 0.199,
"grad_norm": 824.0,
"kl_loss_13": 91.62615509033203,
"kl_loss_2": 2262.9648010253904,
"kl_loss_4": 1240.5688110351562,
"kl_loss_9": 362.73115997314454,
"learning_rate": 0.000912735948481387,
"loss": 979.9354,
"step": 1990
},
{
"ce_loss_13": 3.283021128177643,
"ce_loss_17": 3.232093572616577,
"ce_loss_2": 4.249727368354797,
"ce_loss_4": 3.7801328182220457,
"ce_loss_9": 3.4098451495170594,
"epoch": 0.2,
"grad_norm": 756.0,
"kl_loss_13": 92.56528701782227,
"kl_loss_2": 2181.098516845703,
"kl_loss_4": 1216.8801025390626,
"kl_loss_9": 364.4568374633789,
"learning_rate": 0.0009118382907149164,
"loss": 946.6853,
"step": 2000
},
{
"ce_loss_13": 3.3126049399375916,
"ce_loss_17": 3.261287009716034,
"ce_loss_2": 4.2733923435211185,
"ce_loss_4": 3.816398227214813,
"ce_loss_9": 3.441442632675171,
"epoch": 0.201,
"grad_norm": 676.0,
"kl_loss_13": 91.95423011779785,
"kl_loss_2": 2152.3748168945312,
"kl_loss_4": 1209.0665100097656,
"kl_loss_9": 361.6376892089844,
"learning_rate": 0.0009109364857414306,
"loss": 946.5832,
"step": 2010
},
{
"ce_loss_13": 3.2770270705223083,
"ce_loss_17": 3.228403162956238,
"ce_loss_2": 4.242778646945953,
"ce_loss_4": 3.7632916808128356,
"ce_loss_9": 3.402626168727875,
"epoch": 0.202,
"grad_norm": 724.0,
"kl_loss_13": 89.61309394836425,
"kl_loss_2": 2187.4436950683594,
"kl_loss_4": 1202.5332946777344,
"kl_loss_9": 360.0750030517578,
"learning_rate": 0.0009100305426420956,
"loss": 978.4272,
"step": 2020
},
{
"ce_loss_13": 3.239075553417206,
"ce_loss_17": 3.1912439107894897,
"ce_loss_2": 4.264526665210724,
"ce_loss_4": 3.7580632090568544,
"ce_loss_9": 3.373551666736603,
"epoch": 0.203,
"grad_norm": 900.0,
"kl_loss_13": 89.87334213256835,
"kl_loss_2": 2277.4216430664064,
"kl_loss_4": 1234.525457763672,
"kl_loss_9": 359.5686401367187,
"learning_rate": 0.0009091204705397484,
"loss": 967.3219,
"step": 2030
},
{
"ce_loss_13": 3.2265689611434936,
"ce_loss_17": 3.175269639492035,
"ce_loss_2": 4.258231091499328,
"ce_loss_4": 3.74735563993454,
"ce_loss_9": 3.358277690410614,
"epoch": 0.204,
"grad_norm": 720.0,
"kl_loss_13": 93.58229713439941,
"kl_loss_2": 2282.3120971679687,
"kl_loss_4": 1241.5177368164063,
"kl_loss_9": 361.5880889892578,
"learning_rate": 0.0009082062785988049,
"loss": 977.5709,
"step": 2040
},
{
"ce_loss_13": 3.3685425758361816,
"ce_loss_17": 3.3171595454216005,
"ce_loss_2": 4.292959356307984,
"ce_loss_4": 3.8451284885406496,
"ce_loss_9": 3.4920889496803285,
"epoch": 0.205,
"grad_norm": 800.0,
"kl_loss_13": 91.8431282043457,
"kl_loss_2": 2107.161785888672,
"kl_loss_4": 1178.7062255859375,
"kl_loss_9": 356.6432815551758,
"learning_rate": 0.0009072879760251679,
"loss": 950.2936,
"step": 2050
},
{
"ce_loss_13": 3.3089524507522583,
"ce_loss_17": 3.259350800514221,
"ce_loss_2": 4.307748210430145,
"ce_loss_4": 3.8217745780944825,
"ce_loss_9": 3.4480879664421082,
"epoch": 0.206,
"grad_norm": 680.0,
"kl_loss_13": 93.10884094238281,
"kl_loss_2": 2242.360632324219,
"kl_loss_4": 1230.9670166015626,
"kl_loss_9": 371.57481536865237,
"learning_rate": 0.0009063655720661341,
"loss": 966.1661,
"step": 2060
},
{
"ce_loss_13": 3.3583059072494508,
"ce_loss_17": 3.304564726352692,
"ce_loss_2": 4.292792272567749,
"ce_loss_4": 3.845988953113556,
"ce_loss_9": 3.491236174106598,
"epoch": 0.207,
"grad_norm": 784.0,
"kl_loss_13": 95.84091911315917,
"kl_loss_2": 2116.5053833007814,
"kl_loss_4": 1191.6633178710938,
"kl_loss_9": 373.36004333496095,
"learning_rate": 0.000905439076010301,
"loss": 950.7002,
"step": 2070
},
{
"ce_loss_13": 3.310245227813721,
"ce_loss_17": 3.258601188659668,
"ce_loss_2": 4.283760368824005,
"ce_loss_4": 3.8175146102905275,
"ce_loss_9": 3.4438483238220217,
"epoch": 0.208,
"grad_norm": 676.0,
"kl_loss_13": 94.41566009521485,
"kl_loss_2": 2157.372424316406,
"kl_loss_4": 1198.5336791992188,
"kl_loss_9": 370.09442749023435,
"learning_rate": 0.0009045084971874737,
"loss": 939.0064,
"step": 2080
},
{
"ce_loss_13": 3.2930172204971315,
"ce_loss_17": 3.240478348731995,
"ce_loss_2": 4.2477871656417845,
"ce_loss_4": 3.788450598716736,
"ce_loss_9": 3.4249788761138915,
"epoch": 0.209,
"grad_norm": 812.0,
"kl_loss_13": 94.39330787658692,
"kl_loss_2": 2149.9981384277344,
"kl_loss_4": 1204.7449340820312,
"kl_loss_9": 367.69925994873046,
"learning_rate": 0.0009035738449685707,
"loss": 967.0873,
"step": 2090
},
{
"ce_loss_13": 3.233154094219208,
"ce_loss_17": 3.178975594043732,
"ce_loss_2": 4.249279487133026,
"ce_loss_4": 3.756052219867706,
"ce_loss_9": 3.373828673362732,
"epoch": 0.21,
"grad_norm": 732.0,
"kl_loss_13": 93.99942321777344,
"kl_loss_2": 2255.170574951172,
"kl_loss_4": 1242.3643951416016,
"kl_loss_9": 374.80386505126955,
"learning_rate": 0.0009026351287655293,
"loss": 962.5345,
"step": 2100
},
{
"ce_loss_13": 3.4293829798698425,
"ce_loss_17": 3.3792282700538636,
"ce_loss_2": 4.324404907226563,
"ce_loss_4": 3.8898934006690977,
"ce_loss_9": 3.5465378046035765,
"epoch": 0.211,
"grad_norm": 720.0,
"kl_loss_13": 90.24806594848633,
"kl_loss_2": 2025.4352600097657,
"kl_loss_4": 1124.5046112060547,
"kl_loss_9": 350.2180938720703,
"learning_rate": 0.0009016923580312113,
"loss": 906.7486,
"step": 2110
},
{
"ce_loss_13": 3.289664113521576,
"ce_loss_17": 3.2374181866645815,
"ce_loss_2": 4.234306645393372,
"ce_loss_4": 3.777394378185272,
"ce_loss_9": 3.4207195043563843,
"epoch": 0.212,
"grad_norm": 828.0,
"kl_loss_13": 103.11422386169434,
"kl_loss_2": 2126.588299560547,
"kl_loss_4": 1178.11142578125,
"kl_loss_9": 365.55635528564454,
"learning_rate": 0.0009007455422593077,
"loss": 961.6285,
"step": 2120
},
{
"ce_loss_13": 3.303319585323334,
"ce_loss_17": 3.2473248481750487,
"ce_loss_2": 4.286237025260926,
"ce_loss_4": 3.8030712366104127,
"ce_loss_9": 3.4316128849983216,
"epoch": 0.213,
"grad_norm": 664.0,
"kl_loss_13": 103.71499290466309,
"kl_loss_2": 2218.969274902344,
"kl_loss_4": 1228.2110961914063,
"kl_loss_9": 379.88182525634767,
"learning_rate": 0.0008997946909842425,
"loss": 973.0553,
"step": 2130
},
{
"ce_loss_13": 3.3144681215286256,
"ce_loss_17": 3.2573557019233705,
"ce_loss_2": 4.3399817943573,
"ce_loss_4": 3.8451735019683837,
"ce_loss_9": 3.447383201122284,
"epoch": 0.214,
"grad_norm": 860.0,
"kl_loss_13": 103.66337776184082,
"kl_loss_2": 2279.8397216796875,
"kl_loss_4": 1256.1508544921876,
"kl_loss_9": 377.6374938964844,
"learning_rate": 0.0008988398137810777,
"loss": 966.7542,
"step": 2140
},
{
"ce_loss_13": 3.352087438106537,
"ce_loss_17": 3.3003304481506346,
"ce_loss_2": 4.290535891056061,
"ce_loss_4": 3.8377557158470155,
"ce_loss_9": 3.4746557235717774,
"epoch": 0.215,
"grad_norm": 804.0,
"kl_loss_13": 95.54061927795411,
"kl_loss_2": 2124.250836181641,
"kl_loss_4": 1186.1236877441406,
"kl_loss_9": 357.02324829101565,
"learning_rate": 0.0008978809202654162,
"loss": 934.1131,
"step": 2150
},
{
"ce_loss_13": 3.3268388390541075,
"ce_loss_17": 3.2740718841552736,
"ce_loss_2": 4.268651235103607,
"ce_loss_4": 3.8112971067428587,
"ce_loss_9": 3.4508150935173036,
"epoch": 0.216,
"grad_norm": 952.0,
"kl_loss_13": 94.60899848937989,
"kl_loss_2": 2104.24765625,
"kl_loss_4": 1169.8356872558593,
"kl_loss_9": 357.7300857543945,
"learning_rate": 0.0008969180200933046,
"loss": 948.2873,
"step": 2160
},
{
"ce_loss_13": 3.288419497013092,
"ce_loss_17": 3.2334158539772035,
"ce_loss_2": 4.276037967205047,
"ce_loss_4": 3.8012101531028746,
"ce_loss_9": 3.4172295808792112,
"epoch": 0.217,
"grad_norm": 768.0,
"kl_loss_13": 100.75381240844726,
"kl_loss_2": 2174.3433227539062,
"kl_loss_4": 1217.892315673828,
"kl_loss_9": 363.94210357666014,
"learning_rate": 0.0008959511229611376,
"loss": 967.2604,
"step": 2170
},
{
"ce_loss_13": 3.3669892072677614,
"ce_loss_17": 3.3125024795532227,
"ce_loss_2": 4.324894380569458,
"ce_loss_4": 3.858024787902832,
"ce_loss_9": 3.4884867787361147,
"epoch": 0.218,
"grad_norm": 876.0,
"kl_loss_13": 102.30676040649413,
"kl_loss_2": 2149.9006591796874,
"kl_loss_4": 1193.063946533203,
"kl_loss_9": 354.54992218017577,
"learning_rate": 0.0008949802386055581,
"loss": 947.3659,
"step": 2180
},
{
"ce_loss_13": 3.233458602428436,
"ce_loss_17": 3.176294243335724,
"ce_loss_2": 4.1978265881538395,
"ce_loss_4": 3.7252599954605103,
"ce_loss_9": 3.3537106990814207,
"epoch": 0.219,
"grad_norm": 792.0,
"kl_loss_13": 107.00150070190429,
"kl_loss_2": 2126.529364013672,
"kl_loss_4": 1180.3070739746095,
"kl_loss_9": 348.7339752197266,
"learning_rate": 0.0008940053768033609,
"loss": 965.7881,
"step": 2190
},
{
"ce_loss_13": 3.3277897477149962,
"ce_loss_17": 3.2635307192802427,
"ce_loss_2": 4.257868647575378,
"ce_loss_4": 3.7924643039703367,
"ce_loss_9": 3.431203293800354,
"epoch": 0.22,
"grad_norm": 724.0,
"kl_loss_13": 116.66446228027344,
"kl_loss_2": 2138.7681518554687,
"kl_loss_4": 1179.5128143310546,
"kl_loss_9": 348.44380340576174,
"learning_rate": 0.0008930265473713938,
"loss": 944.1561,
"step": 2200
},
{
"ce_loss_13": 3.287194538116455,
"ce_loss_17": 3.223456788063049,
"ce_loss_2": 4.236075389385223,
"ce_loss_4": 3.766448366641998,
"ce_loss_9": 3.403588020801544,
"epoch": 0.221,
"grad_norm": 744.0,
"kl_loss_13": 112.54497184753419,
"kl_loss_2": 2148.6867065429688,
"kl_loss_4": 1180.5968322753906,
"kl_loss_9": 352.51245880126953,
"learning_rate": 0.0008920437601664579,
"loss": 932.1693,
"step": 2210
},
{
"ce_loss_13": 3.2727954030036925,
"ce_loss_17": 3.2170267343521117,
"ce_loss_2": 4.227043080329895,
"ce_loss_4": 3.7663298726081846,
"ce_loss_9": 3.396084713935852,
"epoch": 0.222,
"grad_norm": 684.0,
"kl_loss_13": 102.1230541229248,
"kl_loss_2": 2138.1913513183595,
"kl_loss_4": 1192.1333435058593,
"kl_loss_9": 356.1757873535156,
"learning_rate": 0.0008910570250852097,
"loss": 928.835,
"step": 2220
},
{
"ce_loss_13": 3.3732096195220946,
"ce_loss_17": 3.3188398361206053,
"ce_loss_2": 4.279922914505005,
"ce_loss_4": 3.832516062259674,
"ce_loss_9": 3.4941823482513428,
"epoch": 0.223,
"grad_norm": 752.0,
"kl_loss_13": 96.15999031066895,
"kl_loss_2": 2059.2168518066405,
"kl_loss_4": 1137.6443603515625,
"kl_loss_9": 346.1767837524414,
"learning_rate": 0.0008900663520640604,
"loss": 915.2586,
"step": 2230
},
{
"ce_loss_13": 3.323720395565033,
"ce_loss_17": 3.2688003540039063,
"ce_loss_2": 4.268552005290985,
"ce_loss_4": 3.8018130540847777,
"ce_loss_9": 3.4431727170944213,
"epoch": 0.224,
"grad_norm": 760.0,
"kl_loss_13": 95.7872543334961,
"kl_loss_2": 2138.9270263671874,
"kl_loss_4": 1177.120379638672,
"kl_loss_9": 353.89476165771487,
"learning_rate": 0.0008890717510790764,
"loss": 941.8158,
"step": 2240
},
{
"ce_loss_13": 3.279287552833557,
"ce_loss_17": 3.228074884414673,
"ce_loss_2": 4.244151520729065,
"ce_loss_4": 3.76785945892334,
"ce_loss_9": 3.4018285393714907,
"epoch": 0.225,
"grad_norm": 712.0,
"kl_loss_13": 94.25489349365235,
"kl_loss_2": 2163.629754638672,
"kl_loss_4": 1185.219903564453,
"kl_loss_9": 352.8628692626953,
"learning_rate": 0.0008880732321458784,
"loss": 950.535,
"step": 2250
},
{
"ce_loss_13": 3.31189683675766,
"ce_loss_17": 3.261502683162689,
"ce_loss_2": 4.251094925403595,
"ce_loss_4": 3.7883784532547,
"ce_loss_9": 3.4407161712646483,
"epoch": 0.226,
"grad_norm": 852.0,
"kl_loss_13": 94.58582611083985,
"kl_loss_2": 2109.0529968261717,
"kl_loss_4": 1163.501885986328,
"kl_loss_9": 358.63934631347655,
"learning_rate": 0.0008870708053195413,
"loss": 947.5316,
"step": 2260
},
{
"ce_loss_13": 3.3335476756095885,
"ce_loss_17": 3.285297894477844,
"ce_loss_2": 4.256447744369507,
"ce_loss_4": 3.8071651816368104,
"ce_loss_9": 3.4511924386024475,
"epoch": 0.227,
"grad_norm": 780.0,
"kl_loss_13": 89.15117149353027,
"kl_loss_2": 2087.6072204589846,
"kl_loss_4": 1158.7156860351563,
"kl_loss_9": 345.3010711669922,
"learning_rate": 0.0008860644806944918,
"loss": 925.3416,
"step": 2270
},
{
"ce_loss_13": 3.2728632211685182,
"ce_loss_17": 3.2228413224220276,
"ce_loss_2": 4.235958611965179,
"ce_loss_4": 3.7654478549957275,
"ce_loss_9": 3.402116930484772,
"epoch": 0.228,
"grad_norm": 736.0,
"kl_loss_13": 90.10444946289063,
"kl_loss_2": 2145.073895263672,
"kl_loss_4": 1184.8575927734375,
"kl_loss_9": 358.54766235351565,
"learning_rate": 0.0008850542684044079,
"loss": 921.0351,
"step": 2280
},
{
"ce_loss_13": 3.232874131202698,
"ce_loss_17": 3.182513749599457,
"ce_loss_2": 4.245934855937958,
"ce_loss_4": 3.7508826732635496,
"ce_loss_9": 3.3689802169799803,
"epoch": 0.229,
"grad_norm": 764.0,
"kl_loss_13": 92.43191909790039,
"kl_loss_2": 2234.869909667969,
"kl_loss_4": 1230.782974243164,
"kl_loss_9": 370.2389450073242,
"learning_rate": 0.0008840401786221159,
"loss": 952.509,
"step": 2290
},
{
"ce_loss_13": 3.3808755040168763,
"ce_loss_17": 3.334891474246979,
"ce_loss_2": 4.299492859840393,
"ce_loss_4": 3.846787965297699,
"ce_loss_9": 3.5042584419250487,
"epoch": 0.23,
"grad_norm": 720.0,
"kl_loss_13": 86.99340934753418,
"kl_loss_2": 2062.409875488281,
"kl_loss_4": 1137.3367095947265,
"kl_loss_9": 343.86165924072264,
"learning_rate": 0.000883022221559489,
"loss": 908.996,
"step": 2300
},
{
"ce_loss_13": 3.338899517059326,
"ce_loss_17": 3.289691352844238,
"ce_loss_2": 4.295863151550293,
"ce_loss_4": 3.8208630442619325,
"ce_loss_9": 3.463158071041107,
"epoch": 0.231,
"grad_norm": 792.0,
"kl_loss_13": 89.7489730834961,
"kl_loss_2": 2138.119091796875,
"kl_loss_4": 1177.0338073730468,
"kl_loss_9": 349.2715377807617,
"learning_rate": 0.0008820004074673434,
"loss": 959.0157,
"step": 2310
},
{
"ce_loss_13": 3.2497766852378844,
"ce_loss_17": 3.2020439863204957,
"ce_loss_2": 4.193786537647247,
"ce_loss_4": 3.731072425842285,
"ce_loss_9": 3.3763773918151854,
"epoch": 0.232,
"grad_norm": 768.0,
"kl_loss_13": 86.36487617492676,
"kl_loss_2": 2146.440118408203,
"kl_loss_4": 1183.485543823242,
"kl_loss_9": 348.9352600097656,
"learning_rate": 0.0008809747466353355,
"loss": 926.4646,
"step": 2320
},
{
"ce_loss_13": 3.2526813745498657,
"ce_loss_17": 3.202362859249115,
"ce_loss_2": 4.212542414665222,
"ce_loss_4": 3.7356587052345276,
"ce_loss_9": 3.3759160161018373,
"epoch": 0.233,
"grad_norm": 812.0,
"kl_loss_13": 88.14268188476562,
"kl_loss_2": 2149.218145751953,
"kl_loss_4": 1156.888134765625,
"kl_loss_9": 342.9568435668945,
"learning_rate": 0.0008799452493918585,
"loss": 937.2152,
"step": 2330
},
{
"ce_loss_13": 3.3354029417037965,
"ce_loss_17": 3.2855377793312073,
"ce_loss_2": 4.26744898557663,
"ce_loss_4": 3.8138632655143736,
"ce_loss_9": 3.4569711685180664,
"epoch": 0.234,
"grad_norm": 844.0,
"kl_loss_13": 88.94196243286133,
"kl_loss_2": 2110.511163330078,
"kl_loss_4": 1166.7712677001953,
"kl_loss_9": 346.45865325927736,
"learning_rate": 0.0008789119261039385,
"loss": 954.3018,
"step": 2340
},
{
"ce_loss_13": 3.2477207660675047,
"ce_loss_17": 3.198263096809387,
"ce_loss_2": 4.197646915912628,
"ce_loss_4": 3.740631628036499,
"ce_loss_9": 3.37202308177948,
"epoch": 0.235,
"grad_norm": 644.0,
"kl_loss_13": 88.45519828796387,
"kl_loss_2": 2121.0713439941405,
"kl_loss_4": 1174.4280090332031,
"kl_loss_9": 346.70821990966795,
"learning_rate": 0.0008778747871771292,
"loss": 921.8231,
"step": 2350
},
{
"ce_loss_13": 3.2942930817604066,
"ce_loss_17": 3.2476529479026794,
"ce_loss_2": 4.214564323425293,
"ce_loss_4": 3.7642132163047792,
"ce_loss_9": 3.419190752506256,
"epoch": 0.236,
"grad_norm": 760.0,
"kl_loss_13": 85.13226013183593,
"kl_loss_2": 2072.194140625,
"kl_loss_4": 1148.5518371582032,
"kl_loss_9": 336.86150512695315,
"learning_rate": 0.0008768338430554083,
"loss": 909.3094,
"step": 2360
},
{
"ce_loss_13": 3.3023596882820128,
"ce_loss_17": 3.2569980144500734,
"ce_loss_2": 4.23673312664032,
"ce_loss_4": 3.785304081439972,
"ce_loss_9": 3.4277263641357423,
"epoch": 0.237,
"grad_norm": 768.0,
"kl_loss_13": 88.49767379760742,
"kl_loss_2": 2089.8222717285157,
"kl_loss_4": 1157.4951171875,
"kl_loss_9": 347.0603393554687,
"learning_rate": 0.0008757891042210713,
"loss": 927.942,
"step": 2370
},
{
"ce_loss_13": 3.3248913526535033,
"ce_loss_17": 3.277131223678589,
"ce_loss_2": 4.254247784614563,
"ce_loss_4": 3.8001523494720457,
"ce_loss_9": 3.4470866441726686,
"epoch": 0.238,
"grad_norm": 752.0,
"kl_loss_13": 87.96815452575683,
"kl_loss_2": 2071.3140380859377,
"kl_loss_4": 1145.922280883789,
"kl_loss_9": 339.0485443115234,
"learning_rate": 0.0008747405811946271,
"loss": 920.3144,
"step": 2380
},
{
"ce_loss_13": 3.224561131000519,
"ce_loss_17": 3.1759562492370605,
"ce_loss_2": 4.214334380626679,
"ce_loss_4": 3.7256261229515077,
"ce_loss_9": 3.353095328807831,
"epoch": 0.239,
"grad_norm": 676.0,
"kl_loss_13": 89.64075889587403,
"kl_loss_2": 2206.202288818359,
"kl_loss_4": 1206.3945404052733,
"kl_loss_9": 354.2266357421875,
"learning_rate": 0.0008736882845346905,
"loss": 926.8486,
"step": 2390
},
{
"ce_loss_13": 3.3134991407394407,
"ce_loss_17": 3.262081706523895,
"ce_loss_2": 4.258616781234741,
"ce_loss_4": 3.791792631149292,
"ce_loss_9": 3.4383889198303224,
"epoch": 0.24,
"grad_norm": 692.0,
"kl_loss_13": 91.79788208007812,
"kl_loss_2": 2099.705029296875,
"kl_loss_4": 1137.2147552490235,
"kl_loss_9": 351.1961166381836,
"learning_rate": 0.0008726322248378774,
"loss": 915.9145,
"step": 2400
},
{
"ce_loss_13": 3.3168442487716674,
"ce_loss_17": 3.2658654928207396,
"ce_loss_2": 4.291086602210998,
"ce_loss_4": 3.798007917404175,
"ce_loss_9": 3.4393696665763853,
"epoch": 0.241,
"grad_norm": 724.0,
"kl_loss_13": 91.18320655822754,
"kl_loss_2": 2183.9902954101562,
"kl_loss_4": 1179.52744140625,
"kl_loss_9": 348.21985015869143,
"learning_rate": 0.0008715724127386971,
"loss": 954.3727,
"step": 2410
},
{
"ce_loss_13": 3.388163149356842,
"ce_loss_17": 3.3374621152877806,
"ce_loss_2": 4.30515775680542,
"ce_loss_4": 3.8508872628211974,
"ce_loss_9": 3.503561234474182,
"epoch": 0.242,
"grad_norm": 632.0,
"kl_loss_13": 93.10037879943847,
"kl_loss_2": 2093.0827880859374,
"kl_loss_4": 1143.714126586914,
"kl_loss_9": 340.34727783203124,
"learning_rate": 0.0008705088589094458,
"loss": 926.0434,
"step": 2420
},
{
"ce_loss_13": 3.397733283042908,
"ce_loss_17": 3.3486640214920045,
"ce_loss_2": 4.326961147785187,
"ce_loss_4": 3.8702669978141784,
"ce_loss_9": 3.5161495447158813,
"epoch": 0.243,
"grad_norm": 692.0,
"kl_loss_13": 94.67743148803712,
"kl_loss_2": 2098.773126220703,
"kl_loss_4": 1147.025424194336,
"kl_loss_9": 343.21431121826174,
"learning_rate": 0.0008694415740600988,
"loss": 930.1328,
"step": 2430
},
{
"ce_loss_13": 3.251414442062378,
"ce_loss_17": 3.204081356525421,
"ce_loss_2": 4.2343505144119264,
"ce_loss_4": 3.746074843406677,
"ce_loss_9": 3.3755380511283875,
"epoch": 0.244,
"grad_norm": 832.0,
"kl_loss_13": 91.5265941619873,
"kl_loss_2": 2183.54365234375,
"kl_loss_4": 1185.4872619628907,
"kl_loss_9": 347.7381362915039,
"learning_rate": 0.0008683705689382025,
"loss": 934.1233,
"step": 2440
},
{
"ce_loss_13": 3.3352036356925963,
"ce_loss_17": 3.2870277643203734,
"ce_loss_2": 4.254025983810425,
"ce_loss_4": 3.8016448616981506,
"ce_loss_9": 3.452436113357544,
"epoch": 0.245,
"grad_norm": 672.0,
"kl_loss_13": 88.54696655273438,
"kl_loss_2": 2080.4288696289063,
"kl_loss_4": 1150.784130859375,
"kl_loss_9": 338.89248809814455,
"learning_rate": 0.0008672958543287666,
"loss": 933.8865,
"step": 2450
},
{
"ce_loss_13": 3.3462072253227233,
"ce_loss_17": 3.2961216807365417,
"ce_loss_2": 4.249486815929413,
"ce_loss_4": 3.8145977020263673,
"ce_loss_9": 3.4672706604003904,
"epoch": 0.246,
"grad_norm": 736.0,
"kl_loss_13": 87.74930000305176,
"kl_loss_2": 2045.5015930175782,
"kl_loss_4": 1146.6499877929687,
"kl_loss_9": 341.08093719482423,
"learning_rate": 0.0008662174410541554,
"loss": 908.4095,
"step": 2460
},
{
"ce_loss_13": 3.3104076981544495,
"ce_loss_17": 3.2633822441101072,
"ce_loss_2": 4.217393136024475,
"ce_loss_4": 3.769331526756287,
"ce_loss_9": 3.432016408443451,
"epoch": 0.247,
"grad_norm": 932.0,
"kl_loss_13": 85.89468612670899,
"kl_loss_2": 2047.9596923828126,
"kl_loss_4": 1122.4431182861329,
"kl_loss_9": 336.0746810913086,
"learning_rate": 0.0008651353399739787,
"loss": 926.9571,
"step": 2470
},
{
"ce_loss_13": 3.337866926193237,
"ce_loss_17": 3.2926376819610597,
"ce_loss_2": 4.262590432167054,
"ce_loss_4": 3.8054956436157226,
"ce_loss_9": 3.4577380299568174,
"epoch": 0.248,
"grad_norm": 880.0,
"kl_loss_13": 86.62553329467774,
"kl_loss_2": 2073.530798339844,
"kl_loss_4": 1142.5131256103516,
"kl_loss_9": 337.92457427978513,
"learning_rate": 0.0008640495619849821,
"loss": 917.6547,
"step": 2480
},
{
"ce_loss_13": 3.2964303255081178,
"ce_loss_17": 3.2484665513038635,
"ce_loss_2": 4.203482365608215,
"ce_loss_4": 3.7624764442443848,
"ce_loss_9": 3.4128145456314085,
"epoch": 0.249,
"grad_norm": 996.0,
"kl_loss_13": 86.48510246276855,
"kl_loss_2": 2064.852166748047,
"kl_loss_4": 1136.9110778808595,
"kl_loss_9": 336.74344635009766,
"learning_rate": 0.0008629601180209381,
"loss": 909.9742,
"step": 2490
},
{
"ce_loss_13": 3.29126091003418,
"ce_loss_17": 3.2420788168907166,
"ce_loss_2": 4.217225980758667,
"ce_loss_4": 3.7575623631477355,
"ce_loss_9": 3.417812240123749,
"epoch": 0.25,
"grad_norm": 784.0,
"kl_loss_13": 88.03675422668456,
"kl_loss_2": 2057.998211669922,
"kl_loss_4": 1128.8810241699218,
"kl_loss_9": 349.3929382324219,
"learning_rate": 0.000861867019052535,
"loss": 927.5346,
"step": 2500
},
{
"ce_loss_13": 3.215069842338562,
"ce_loss_17": 3.1639512538909913,
"ce_loss_2": 4.189428377151489,
"ce_loss_4": 3.7026462078094484,
"ce_loss_9": 3.348872888088226,
"epoch": 0.251,
"grad_norm": 732.0,
"kl_loss_13": 88.78349227905274,
"kl_loss_2": 2161.862829589844,
"kl_loss_4": 1179.3965728759765,
"kl_loss_9": 363.96420135498045,
"learning_rate": 0.0008607702760872678,
"loss": 946.0758,
"step": 2510
},
{
"ce_loss_13": 3.3202267050743104,
"ce_loss_17": 3.2739442229270934,
"ce_loss_2": 4.225949347019196,
"ce_loss_4": 3.7909236431121824,
"ce_loss_9": 3.441824221611023,
"epoch": 0.252,
"grad_norm": 828.0,
"kl_loss_13": 87.51590347290039,
"kl_loss_2": 2030.7117736816406,
"kl_loss_4": 1132.7180267333983,
"kl_loss_9": 340.58583374023436,
"learning_rate": 0.0008596699001693256,
"loss": 924.2797,
"step": 2520
},
{
"ce_loss_13": 3.337341475486755,
"ce_loss_17": 3.2882779955863954,
"ce_loss_2": 4.240027713775635,
"ce_loss_4": 3.7827760577201843,
"ce_loss_9": 3.448138189315796,
"epoch": 0.253,
"grad_norm": 788.0,
"kl_loss_13": 88.29233741760254,
"kl_loss_2": 2056.710101318359,
"kl_loss_4": 1113.5805114746095,
"kl_loss_9": 336.41419525146483,
"learning_rate": 0.0008585659023794818,
"loss": 924.9172,
"step": 2530
},
{
"ce_loss_13": 3.2948466777801513,
"ce_loss_17": 3.247093677520752,
"ce_loss_2": 4.2667999267578125,
"ce_loss_4": 3.7884334683418275,
"ce_loss_9": 3.4190890908241274,
"epoch": 0.254,
"grad_norm": 836.0,
"kl_loss_13": 88.6147274017334,
"kl_loss_2": 2149.3486206054686,
"kl_loss_4": 1175.339892578125,
"kl_loss_9": 347.41344451904297,
"learning_rate": 0.0008574582938349817,
"loss": 931.8402,
"step": 2540
},
{
"ce_loss_13": 3.27849794626236,
"ce_loss_17": 3.228435230255127,
"ce_loss_2": 4.23468028306961,
"ce_loss_4": 3.780582332611084,
"ce_loss_9": 3.4099076628684997,
"epoch": 0.255,
"grad_norm": 684.0,
"kl_loss_13": 90.46429252624512,
"kl_loss_2": 2132.9173889160156,
"kl_loss_4": 1190.2234680175782,
"kl_loss_9": 354.2894012451172,
"learning_rate": 0.0008563470856894315,
"loss": 912.2641,
"step": 2550
},
{
"ce_loss_13": 3.276999664306641,
"ce_loss_17": 3.231548249721527,
"ce_loss_2": 4.2210803627967834,
"ce_loss_4": 3.7623754858970644,
"ce_loss_9": 3.401135301589966,
"epoch": 0.256,
"grad_norm": 880.0,
"kl_loss_13": 85.1612335205078,
"kl_loss_2": 2104.3537841796874,
"kl_loss_4": 1163.8737365722657,
"kl_loss_9": 342.23360748291014,
"learning_rate": 0.0008552322891326845,
"loss": 921.068,
"step": 2560
},
{
"ce_loss_13": 3.2485652446746824,
"ce_loss_17": 3.199524128437042,
"ce_loss_2": 4.210166561603546,
"ce_loss_4": 3.7297805309295655,
"ce_loss_9": 3.370358681678772,
"epoch": 0.257,
"grad_norm": 728.0,
"kl_loss_13": 85.70927581787109,
"kl_loss_2": 2145.8478149414063,
"kl_loss_4": 1164.0143463134766,
"kl_loss_9": 339.1722915649414,
"learning_rate": 0.0008541139153907296,
"loss": 913.6647,
"step": 2570
},
{
"ce_loss_13": 3.207658565044403,
"ce_loss_17": 3.1607483625411987,
"ce_loss_2": 4.142305660247803,
"ce_loss_4": 3.683729314804077,
"ce_loss_9": 3.3294119954109194,
"epoch": 0.258,
"grad_norm": 856.0,
"kl_loss_13": 83.65748710632325,
"kl_loss_2": 2100.0630798339844,
"kl_loss_4": 1151.6668060302734,
"kl_loss_9": 331.9220596313477,
"learning_rate": 0.0008529919757255782,
"loss": 923.8511,
"step": 2580
},
{
"ce_loss_13": 3.2459914326667785,
"ce_loss_17": 3.2003490686416627,
"ce_loss_2": 4.13244149684906,
"ce_loss_4": 3.6941211462020873,
"ce_loss_9": 3.3592882633209227,
"epoch": 0.259,
"grad_norm": 684.0,
"kl_loss_13": 83.24060859680176,
"kl_loss_2": 2014.1521484375,
"kl_loss_4": 1104.7504852294921,
"kl_loss_9": 324.8483596801758,
"learning_rate": 0.0008518664814351503,
"loss": 889.9527,
"step": 2590
},
{
"ce_loss_13": 3.2060733318328856,
"ce_loss_17": 3.159405970573425,
"ce_loss_2": 4.157498776912689,
"ce_loss_4": 3.696187674999237,
"ce_loss_9": 3.331186580657959,
"epoch": 0.26,
"grad_norm": 780.0,
"kl_loss_13": 86.2954875946045,
"kl_loss_2": 2130.951513671875,
"kl_loss_4": 1179.1319793701173,
"kl_loss_9": 346.7232162475586,
"learning_rate": 0.0008507374438531607,
"loss": 958.9413,
"step": 2600
},
{
"ce_loss_13": 3.1857144951820375,
"ce_loss_17": 3.1399152994155886,
"ce_loss_2": 4.121321547031402,
"ce_loss_4": 3.6632447361946108,
"ce_loss_9": 3.3082396030426025,
"epoch": 0.261,
"grad_norm": 592.0,
"kl_loss_13": 84.55880928039551,
"kl_loss_2": 2090.511358642578,
"kl_loss_4": 1147.8235229492188,
"kl_loss_9": 337.9015914916992,
"learning_rate": 0.0008496048743490053,
"loss": 911.0207,
"step": 2610
},
{
"ce_loss_13": 3.3373757362365724,
"ce_loss_17": 3.2904435873031614,
"ce_loss_2": 4.235588300228119,
"ce_loss_4": 3.7929789900779722,
"ce_loss_9": 3.458786392211914,
"epoch": 0.262,
"grad_norm": 788.0,
"kl_loss_13": 86.32050971984863,
"kl_loss_2": 2027.3483154296875,
"kl_loss_4": 1120.2023498535157,
"kl_loss_9": 338.79857177734374,
"learning_rate": 0.0008484687843276469,
"loss": 902.0202,
"step": 2620
},
{
"ce_loss_13": 3.26626056432724,
"ce_loss_17": 3.219456911087036,
"ce_loss_2": 4.196949517726898,
"ce_loss_4": 3.740860116481781,
"ce_loss_9": 3.390087342262268,
"epoch": 0.263,
"grad_norm": 776.0,
"kl_loss_13": 86.06944999694824,
"kl_loss_2": 2084.330389404297,
"kl_loss_4": 1144.0339233398438,
"kl_loss_9": 347.79761810302733,
"learning_rate": 0.0008473291852294987,
"loss": 932.2067,
"step": 2630
},
{
"ce_loss_13": 3.275907301902771,
"ce_loss_17": 3.2278727412223818,
"ce_loss_2": 4.200026178359986,
"ce_loss_4": 3.751169514656067,
"ce_loss_9": 3.400949764251709,
"epoch": 0.264,
"grad_norm": 800.0,
"kl_loss_13": 86.0791404724121,
"kl_loss_2": 2092.987713623047,
"kl_loss_4": 1152.6259460449219,
"kl_loss_9": 343.6297607421875,
"learning_rate": 0.0008461860885303114,
"loss": 909.5027,
"step": 2640
},
{
"ce_loss_13": 3.3062209129333495,
"ce_loss_17": 3.2608874440193176,
"ce_loss_2": 4.206380689144135,
"ce_loss_4": 3.7589853048324584,
"ce_loss_9": 3.4244667410850527,
"epoch": 0.265,
"grad_norm": 732.0,
"kl_loss_13": 85.16450424194336,
"kl_loss_2": 2021.2992797851562,
"kl_loss_4": 1111.3934143066406,
"kl_loss_9": 331.31748352050784,
"learning_rate": 0.000845039505741056,
"loss": 905.8066,
"step": 2650
},
{
"ce_loss_13": 3.285935068130493,
"ce_loss_17": 3.2384222030639647,
"ce_loss_2": 4.207594358921051,
"ce_loss_4": 3.7571651339530945,
"ce_loss_9": 3.406543481349945,
"epoch": 0.266,
"grad_norm": 836.0,
"kl_loss_13": 88.26923942565918,
"kl_loss_2": 2102.5131591796876,
"kl_loss_4": 1163.2055114746095,
"kl_loss_9": 345.05494384765626,
"learning_rate": 0.0008438894484078086,
"loss": 944.8143,
"step": 2660
},
{
"ce_loss_13": 3.2971001744270323,
"ce_loss_17": 3.248241627216339,
"ce_loss_2": 4.204394841194153,
"ce_loss_4": 3.7537492752075194,
"ce_loss_9": 3.414425790309906,
"epoch": 0.267,
"grad_norm": 748.0,
"kl_loss_13": 87.35046119689942,
"kl_loss_2": 2058.653430175781,
"kl_loss_4": 1127.5207611083983,
"kl_loss_9": 336.9515045166016,
"learning_rate": 0.0008427359281116334,
"loss": 906.4635,
"step": 2670
},
{
"ce_loss_13": 3.1938817858695985,
"ce_loss_17": 3.1482104778289797,
"ce_loss_2": 4.149834764003754,
"ce_loss_4": 3.681869113445282,
"ce_loss_9": 3.321103000640869,
"epoch": 0.268,
"grad_norm": 652.0,
"kl_loss_13": 87.67700462341308,
"kl_loss_2": 2123.220263671875,
"kl_loss_4": 1154.3934997558595,
"kl_loss_9": 337.71882781982424,
"learning_rate": 0.0008415789564684673,
"loss": 921.3191,
"step": 2680
},
{
"ce_loss_13": 3.4389868855476378,
"ce_loss_17": 3.3913365483283995,
"ce_loss_2": 4.32799916267395,
"ce_loss_4": 3.896670234203339,
"ce_loss_9": 3.5553231596946717,
"epoch": 0.269,
"grad_norm": 692.0,
"kl_loss_13": 90.58032188415527,
"kl_loss_2": 1989.9671325683594,
"kl_loss_4": 1106.718344116211,
"kl_loss_9": 339.4188598632812,
"learning_rate": 0.0008404185451290017,
"loss": 887.4835,
"step": 2690
},
{
"ce_loss_13": 3.3077424645423887,
"ce_loss_17": 3.262969934940338,
"ce_loss_2": 4.220689129829407,
"ce_loss_4": 3.7679991483688355,
"ce_loss_9": 3.426233208179474,
"epoch": 0.27,
"grad_norm": 824.0,
"kl_loss_13": 86.1768726348877,
"kl_loss_2": 2058.965264892578,
"kl_loss_4": 1120.5839050292968,
"kl_loss_9": 334.05356750488284,
"learning_rate": 0.0008392547057785661,
"loss": 902.1295,
"step": 2700
},
{
"ce_loss_13": 3.239164745807648,
"ce_loss_17": 3.1922030210494996,
"ce_loss_2": 4.191896677017212,
"ce_loss_4": 3.7292771220207213,
"ce_loss_9": 3.364377760887146,
"epoch": 0.271,
"grad_norm": 828.0,
"kl_loss_13": 87.49733276367188,
"kl_loss_2": 2164.0575744628904,
"kl_loss_4": 1183.8456909179688,
"kl_loss_9": 345.6566589355469,
"learning_rate": 0.0008380874501370098,
"loss": 906.6885,
"step": 2710
},
{
"ce_loss_13": 3.229522907733917,
"ce_loss_17": 3.1830254077911375,
"ce_loss_2": 4.188966703414917,
"ce_loss_4": 3.710645842552185,
"ce_loss_9": 3.3521727085113526,
"epoch": 0.272,
"grad_norm": 852.0,
"kl_loss_13": 87.24810829162598,
"kl_loss_2": 2145.7540588378906,
"kl_loss_4": 1171.6363494873046,
"kl_loss_9": 345.628955078125,
"learning_rate": 0.0008369167899585841,
"loss": 922.7014,
"step": 2720
},
{
"ce_loss_13": 3.3517627239227297,
"ce_loss_17": 3.307138133049011,
"ce_loss_2": 4.216698122024536,
"ce_loss_4": 3.801157605648041,
"ce_loss_9": 3.4630095601081847,
"epoch": 0.273,
"grad_norm": 732.0,
"kl_loss_13": 85.38688659667969,
"kl_loss_2": 1976.8460754394532,
"kl_loss_4": 1105.086737060547,
"kl_loss_9": 329.17638092041017,
"learning_rate": 0.0008357427370318238,
"loss": 909.2509,
"step": 2730
},
{
"ce_loss_13": 3.30746066570282,
"ce_loss_17": 3.2615819692611696,
"ce_loss_2": 4.235583579540252,
"ce_loss_4": 3.773705613613129,
"ce_loss_9": 3.4275959730148315,
"epoch": 0.274,
"grad_norm": 760.0,
"kl_loss_13": 86.52706832885742,
"kl_loss_2": 2090.7098693847656,
"kl_loss_4": 1139.3353454589844,
"kl_loss_9": 335.4715240478516,
"learning_rate": 0.0008345653031794292,
"loss": 915.5144,
"step": 2740
},
{
"ce_loss_13": 3.301362681388855,
"ce_loss_17": 3.254427659511566,
"ce_loss_2": 4.222723615169525,
"ce_loss_4": 3.7712342381477355,
"ce_loss_9": 3.42340407371521,
"epoch": 0.275,
"grad_norm": 824.0,
"kl_loss_13": 86.17152290344238,
"kl_loss_2": 2044.7248291015626,
"kl_loss_4": 1119.8676971435548,
"kl_loss_9": 336.4943618774414,
"learning_rate": 0.0008333845002581458,
"loss": 901.5023,
"step": 2750
},
{
"ce_loss_13": 3.233277463912964,
"ce_loss_17": 3.186070966720581,
"ce_loss_2": 4.180034554004669,
"ce_loss_4": 3.7184282660484316,
"ce_loss_9": 3.3558387875556948,
"epoch": 0.276,
"grad_norm": 800.0,
"kl_loss_13": 87.02991333007813,
"kl_loss_2": 2143.9214416503905,
"kl_loss_4": 1183.9332061767577,
"kl_loss_9": 343.59161682128905,
"learning_rate": 0.0008322003401586462,
"loss": 932.0782,
"step": 2760
},
{
"ce_loss_13": 3.2686554908752443,
"ce_loss_17": 3.223686730861664,
"ce_loss_2": 4.163093483448028,
"ce_loss_4": 3.717913568019867,
"ce_loss_9": 3.3815023064613343,
"epoch": 0.277,
"grad_norm": 744.0,
"kl_loss_13": 84.46449165344238,
"kl_loss_2": 2016.767022705078,
"kl_loss_4": 1100.048599243164,
"kl_loss_9": 325.1136108398438,
"learning_rate": 0.0008310128348054094,
"loss": 873.1914,
"step": 2770
},
{
"ce_loss_13": 3.2334190130233766,
"ce_loss_17": 3.1884844064712525,
"ce_loss_2": 4.153000319004059,
"ce_loss_4": 3.696218252182007,
"ce_loss_9": 3.3487244963645937,
"epoch": 0.278,
"grad_norm": 896.0,
"kl_loss_13": 84.94119606018066,
"kl_loss_2": 2061.5789306640627,
"kl_loss_4": 1125.7439544677734,
"kl_loss_9": 335.49708251953126,
"learning_rate": 0.0008298219961566008,
"loss": 900.3799,
"step": 2780
},
{
"ce_loss_13": 3.2026249885559084,
"ce_loss_17": 3.1564487338066103,
"ce_loss_2": 4.1475555300712585,
"ce_loss_4": 3.6848009705543516,
"ce_loss_9": 3.322298550605774,
"epoch": 0.279,
"grad_norm": 736.0,
"kl_loss_13": 85.35611419677734,
"kl_loss_2": 2141.4679565429688,
"kl_loss_4": 1182.722882080078,
"kl_loss_9": 340.7112503051758,
"learning_rate": 0.0008286278362039527,
"loss": 908.9242,
"step": 2790
},
{
"ce_loss_13": 3.2312260270118713,
"ce_loss_17": 3.185028612613678,
"ce_loss_2": 4.187989091873169,
"ce_loss_4": 3.719260597229004,
"ce_loss_9": 3.346155607700348,
"epoch": 0.28,
"grad_norm": 776.0,
"kl_loss_13": 86.70144271850586,
"kl_loss_2": 2161.746417236328,
"kl_loss_4": 1184.7936767578126,
"kl_loss_9": 335.33875274658203,
"learning_rate": 0.0008274303669726426,
"loss": 907.5166,
"step": 2800
},
{
"ce_loss_13": 3.139736700057983,
"ce_loss_17": 3.08803471326828,
"ce_loss_2": 4.11551411151886,
"ce_loss_4": 3.6313345670700072,
"ce_loss_9": 3.25755854845047,
"epoch": 0.281,
"grad_norm": 864.0,
"kl_loss_13": 90.18624916076661,
"kl_loss_2": 2182.975939941406,
"kl_loss_4": 1170.9007141113282,
"kl_loss_9": 333.8880386352539,
"learning_rate": 0.0008262296005211721,
"loss": 907.3379,
"step": 2810
},
{
"ce_loss_13": 3.263656198978424,
"ce_loss_17": 3.214433252811432,
"ce_loss_2": 4.202375113964081,
"ce_loss_4": 3.741171956062317,
"ce_loss_9": 3.38106654882431,
"epoch": 0.282,
"grad_norm": 752.0,
"kl_loss_13": 89.46879920959472,
"kl_loss_2": 2107.242321777344,
"kl_loss_4": 1156.6708587646485,
"kl_loss_9": 336.96667022705077,
"learning_rate": 0.0008250255489412463,
"loss": 906.6707,
"step": 2820
},
{
"ce_loss_13": 3.363376832008362,
"ce_loss_17": 3.3147770762443542,
"ce_loss_2": 4.277840995788575,
"ce_loss_4": 3.8263082027435305,
"ce_loss_9": 3.477301073074341,
"epoch": 0.283,
"grad_norm": 920.0,
"kl_loss_13": 89.64902610778809,
"kl_loss_2": 2062.3288635253907,
"kl_loss_4": 1124.7566528320312,
"kl_loss_9": 328.69190521240233,
"learning_rate": 0.0008238182243576511,
"loss": 903.5663,
"step": 2830
},
{
"ce_loss_13": 3.3324272274971007,
"ce_loss_17": 3.2846037268638613,
"ce_loss_2": 4.176836204528809,
"ce_loss_4": 3.761064279079437,
"ce_loss_9": 3.4376333355903625,
"epoch": 0.284,
"grad_norm": 728.0,
"kl_loss_13": 88.71090469360351,
"kl_loss_2": 1930.1790100097655,
"kl_loss_4": 1080.3647155761719,
"kl_loss_9": 321.2514114379883,
"learning_rate": 0.0008226076389281315,
"loss": 872.3219,
"step": 2840
},
{
"ce_loss_13": 3.376649188995361,
"ce_loss_17": 3.328803825378418,
"ce_loss_2": 4.255143117904663,
"ce_loss_4": 3.813234579563141,
"ce_loss_9": 3.4860986709594726,
"epoch": 0.285,
"grad_norm": 796.0,
"kl_loss_13": 91.1908805847168,
"kl_loss_2": 2033.793310546875,
"kl_loss_4": 1108.385009765625,
"kl_loss_9": 328.55289459228516,
"learning_rate": 0.0008213938048432696,
"loss": 877.8479,
"step": 2850
},
{
"ce_loss_13": 3.2986589074134827,
"ce_loss_17": 3.248919093608856,
"ce_loss_2": 4.190428733825684,
"ce_loss_4": 3.7510254859924315,
"ce_loss_9": 3.4153574228286745,
"epoch": 0.286,
"grad_norm": 928.0,
"kl_loss_13": 90.48378639221191,
"kl_loss_2": 2016.7945983886718,
"kl_loss_4": 1100.9656066894531,
"kl_loss_9": 332.1938949584961,
"learning_rate": 0.0008201767343263612,
"loss": 895.5659,
"step": 2860
},
{
"ce_loss_13": 3.240786147117615,
"ce_loss_17": 3.1924198985099794,
"ce_loss_2": 4.166657328605652,
"ce_loss_4": 3.718879294395447,
"ce_loss_9": 3.3594149589538573,
"epoch": 0.287,
"grad_norm": 940.0,
"kl_loss_13": 87.9628921508789,
"kl_loss_2": 2091.728826904297,
"kl_loss_4": 1153.0123413085937,
"kl_loss_9": 334.0530242919922,
"learning_rate": 0.0008189564396332927,
"loss": 880.5472,
"step": 2870
},
{
"ce_loss_13": 3.223262095451355,
"ce_loss_17": 3.17556711435318,
"ce_loss_2": 4.1609018564224245,
"ce_loss_4": 3.69830641746521,
"ce_loss_9": 3.3397215843200683,
"epoch": 0.288,
"grad_norm": 692.0,
"kl_loss_13": 86.13037757873535,
"kl_loss_2": 2080.58564453125,
"kl_loss_4": 1139.816616821289,
"kl_loss_9": 329.157487487793,
"learning_rate": 0.0008177329330524181,
"loss": 909.0221,
"step": 2880
},
{
"ce_loss_13": 3.283345627784729,
"ce_loss_17": 3.2363038182258608,
"ce_loss_2": 4.173475062847137,
"ce_loss_4": 3.7359678983688354,
"ce_loss_9": 3.396084928512573,
"epoch": 0.289,
"grad_norm": 772.0,
"kl_loss_13": 84.94896850585937,
"kl_loss_2": 2006.2187255859376,
"kl_loss_4": 1102.736669921875,
"kl_loss_9": 324.60801391601564,
"learning_rate": 0.0008165062269044352,
"loss": 888.0768,
"step": 2890
},
{
"ce_loss_13": 3.231962251663208,
"ce_loss_17": 3.1847190499305724,
"ce_loss_2": 4.167845821380615,
"ce_loss_4": 3.69975209236145,
"ce_loss_9": 3.3504262447357176,
"epoch": 0.29,
"grad_norm": 828.0,
"kl_loss_13": 85.72283172607422,
"kl_loss_2": 2110.886083984375,
"kl_loss_4": 1136.065966796875,
"kl_loss_9": 334.0262481689453,
"learning_rate": 0.0008152763335422613,
"loss": 916.1252,
"step": 2900
},
{
"ce_loss_13": 3.222399044036865,
"ce_loss_17": 3.1736102938652038,
"ce_loss_2": 4.142154896259308,
"ce_loss_4": 3.690603697299957,
"ce_loss_9": 3.3418049931526186,
"epoch": 0.291,
"grad_norm": 888.0,
"kl_loss_13": 86.23341636657715,
"kl_loss_2": 2069.0516357421875,
"kl_loss_4": 1140.7152221679687,
"kl_loss_9": 336.0206954956055,
"learning_rate": 0.0008140432653509088,
"loss": 895.5287,
"step": 2910
},
{
"ce_loss_13": 3.27564138174057,
"ce_loss_17": 3.228243827819824,
"ce_loss_2": 4.174822735786438,
"ce_loss_4": 3.733404290676117,
"ce_loss_9": 3.393027651309967,
"epoch": 0.292,
"grad_norm": 752.0,
"kl_loss_13": 86.92860412597656,
"kl_loss_2": 2049.1183898925783,
"kl_loss_4": 1127.7726440429688,
"kl_loss_9": 341.3429443359375,
"learning_rate": 0.0008128070347473608,
"loss": 889.8033,
"step": 2920
},
{
"ce_loss_13": 3.2838281273841856,
"ce_loss_17": 3.238075816631317,
"ce_loss_2": 4.215974903106689,
"ce_loss_4": 3.7511423945426943,
"ce_loss_9": 3.4039663076400757,
"epoch": 0.293,
"grad_norm": 724.0,
"kl_loss_13": 87.57083511352539,
"kl_loss_2": 2116.1800048828127,
"kl_loss_4": 1153.1954772949218,
"kl_loss_9": 348.8721435546875,
"learning_rate": 0.0008115676541804455,
"loss": 910.3477,
"step": 2930
},
{
"ce_loss_13": 3.2814828038215635,
"ce_loss_17": 3.235085892677307,
"ce_loss_2": 4.168118190765381,
"ce_loss_4": 3.734774100780487,
"ce_loss_9": 3.3973249793052673,
"epoch": 0.294,
"grad_norm": 756.0,
"kl_loss_13": 86.96043701171875,
"kl_loss_2": 2015.842791748047,
"kl_loss_4": 1110.1989379882812,
"kl_loss_9": 337.41949768066405,
"learning_rate": 0.0008103251361307119,
"loss": 902.8018,
"step": 2940
},
{
"ce_loss_13": 3.3166361451148987,
"ce_loss_17": 3.2672725558280944,
"ce_loss_2": 4.226753163337707,
"ce_loss_4": 3.778488886356354,
"ce_loss_9": 3.4341904520988464,
"epoch": 0.295,
"grad_norm": 792.0,
"kl_loss_13": 88.19808197021484,
"kl_loss_2": 2055.312109375,
"kl_loss_4": 1131.3970184326172,
"kl_loss_9": 334.36364898681643,
"learning_rate": 0.0008090794931103026,
"loss": 892.6072,
"step": 2950
},
{
"ce_loss_13": 3.296609079837799,
"ce_loss_17": 3.2501901030540465,
"ce_loss_2": 4.189459836483001,
"ce_loss_4": 3.748788833618164,
"ce_loss_9": 3.412396025657654,
"epoch": 0.296,
"grad_norm": 688.0,
"kl_loss_13": 84.80472068786621,
"kl_loss_2": 1989.0398803710937,
"kl_loss_4": 1095.225326538086,
"kl_loss_9": 323.63614959716796,
"learning_rate": 0.0008078307376628291,
"loss": 883.5747,
"step": 2960
},
{
"ce_loss_13": 3.3625754952430724,
"ce_loss_17": 3.3169750809669494,
"ce_loss_2": 4.21538405418396,
"ce_loss_4": 3.798343324661255,
"ce_loss_9": 3.4755647778511047,
"epoch": 0.297,
"grad_norm": 912.0,
"kl_loss_13": 85.28404312133789,
"kl_loss_2": 1934.4125427246095,
"kl_loss_4": 1069.026303100586,
"kl_loss_9": 319.52669219970704,
"learning_rate": 0.000806578882363245,
"loss": 857.3016,
"step": 2970
},
{
"ce_loss_13": 3.2705488443374633,
"ce_loss_17": 3.2260661005973814,
"ce_loss_2": 4.157830369472504,
"ce_loss_4": 3.72437344789505,
"ce_loss_9": 3.38711701631546,
"epoch": 0.298,
"grad_norm": 952.0,
"kl_loss_13": 84.83495826721192,
"kl_loss_2": 2000.8730346679688,
"kl_loss_4": 1108.6398498535157,
"kl_loss_9": 326.1505355834961,
"learning_rate": 0.0008053239398177191,
"loss": 903.929,
"step": 2980
},
{
"ce_loss_13": 3.263384389877319,
"ce_loss_17": 3.2167693614959716,
"ce_loss_2": 4.172792506217957,
"ce_loss_4": 3.720265972614288,
"ce_loss_9": 3.373223125934601,
"epoch": 0.299,
"grad_norm": 860.0,
"kl_loss_13": 86.03630332946777,
"kl_loss_2": 2041.31611328125,
"kl_loss_4": 1108.6524291992187,
"kl_loss_9": 325.85936584472654,
"learning_rate": 0.0008040659226635089,
"loss": 910.0854,
"step": 2990
},
{
"ce_loss_13": 3.385720467567444,
"ce_loss_17": 3.3382191181182863,
"ce_loss_2": 4.266795706748963,
"ce_loss_4": 3.8300618648529055,
"ce_loss_9": 3.507095968723297,
"epoch": 0.3,
"grad_norm": 820.0,
"kl_loss_13": 89.02688827514649,
"kl_loss_2": 2004.2498901367187,
"kl_loss_4": 1094.0719848632812,
"kl_loss_9": 335.1713592529297,
"learning_rate": 0.0008028048435688333,
"loss": 878.5109,
"step": 3000
},
{
"ce_loss_13": 3.257288599014282,
"ce_loss_17": 3.2109145641326906,
"ce_loss_2": 4.18526531457901,
"ce_loss_4": 3.730650246143341,
"ce_loss_9": 3.3745195031166078,
"epoch": 0.301,
"grad_norm": 864.0,
"kl_loss_13": 84.54983787536621,
"kl_loss_2": 2080.8784423828124,
"kl_loss_4": 1137.5463317871095,
"kl_loss_9": 330.99841461181643,
"learning_rate": 0.0008015407152327448,
"loss": 897.2885,
"step": 3010
},
{
"ce_loss_13": 3.30250426530838,
"ce_loss_17": 3.255344307422638,
"ce_loss_2": 4.208631324768066,
"ce_loss_4": 3.7579152584075928,
"ce_loss_9": 3.418510675430298,
"epoch": 0.302,
"grad_norm": 852.0,
"kl_loss_13": 85.88929748535156,
"kl_loss_2": 2057.7919921875,
"kl_loss_4": 1116.4963287353517,
"kl_loss_9": 330.4980941772461,
"learning_rate": 0.0008002735503850016,
"loss": 898.668,
"step": 3020
},
{
"ce_loss_13": 3.2031028985977175,
"ce_loss_17": 3.1481196999549867,
"ce_loss_2": 4.1365126967430115,
"ce_loss_4": 3.6649728655815124,
"ce_loss_9": 3.31855708360672,
"epoch": 0.303,
"grad_norm": 680.0,
"kl_loss_13": 95.30129356384278,
"kl_loss_2": 2118.992889404297,
"kl_loss_4": 1142.1922302246094,
"kl_loss_9": 337.14517364501955,
"learning_rate": 0.0007990033617859396,
"loss": 915.553,
"step": 3030
},
{
"ce_loss_13": 3.2547006607055664,
"ce_loss_17": 3.2032246708869936,
"ce_loss_2": 4.142168891429901,
"ce_loss_4": 3.6976324796676634,
"ce_loss_9": 3.3654952168464662,
"epoch": 0.304,
"grad_norm": 720.0,
"kl_loss_13": 95.32271614074708,
"kl_loss_2": 2014.362725830078,
"kl_loss_4": 1092.9575225830079,
"kl_loss_9": 326.83533630371096,
"learning_rate": 0.000797730162226344,
"loss": 867.2573,
"step": 3040
},
{
"ce_loss_13": 3.2801309704780577,
"ce_loss_17": 3.227008855342865,
"ce_loss_2": 4.175163698196411,
"ce_loss_4": 3.7368531465530395,
"ce_loss_9": 3.3903521060943604,
"epoch": 0.305,
"grad_norm": 824.0,
"kl_loss_13": 94.85666313171387,
"kl_loss_2": 2027.4653747558593,
"kl_loss_4": 1120.939682006836,
"kl_loss_9": 327.00482788085935,
"learning_rate": 0.0007964539645273203,
"loss": 881.1213,
"step": 3050
},
{
"ce_loss_13": 3.2920580983161924,
"ce_loss_17": 3.241214370727539,
"ce_loss_2": 4.164921832084656,
"ce_loss_4": 3.726666307449341,
"ce_loss_9": 3.396126222610474,
"epoch": 0.306,
"grad_norm": 748.0,
"kl_loss_13": 87.54969520568848,
"kl_loss_2": 1971.8020629882812,
"kl_loss_4": 1079.1253387451172,
"kl_loss_9": 318.69213104248047,
"learning_rate": 0.000795174781540165,
"loss": 880.1559,
"step": 3060
},
{
"ce_loss_13": 3.363193917274475,
"ce_loss_17": 3.311252546310425,
"ce_loss_2": 4.219256675243377,
"ce_loss_4": 3.792584311962128,
"ce_loss_9": 3.466492402553558,
"epoch": 0.307,
"grad_norm": 668.0,
"kl_loss_13": 94.67599182128906,
"kl_loss_2": 1946.3895568847656,
"kl_loss_4": 1068.7306396484375,
"kl_loss_9": 316.1515609741211,
"learning_rate": 0.0007938926261462366,
"loss": 880.2222,
"step": 3070
},
{
"ce_loss_13": 3.311126208305359,
"ce_loss_17": 3.261291027069092,
"ce_loss_2": 4.170499920845032,
"ce_loss_4": 3.73604336977005,
"ce_loss_9": 3.4188188672065736,
"epoch": 0.308,
"grad_norm": 972.0,
"kl_loss_13": 91.15548439025879,
"kl_loss_2": 1990.57578125,
"kl_loss_4": 1087.4571441650392,
"kl_loss_9": 321.47308959960935,
"learning_rate": 0.0007926075112568258,
"loss": 890.7071,
"step": 3080
},
{
"ce_loss_13": 3.3046496868133546,
"ce_loss_17": 3.2562206268310545,
"ce_loss_2": 4.18972373008728,
"ce_loss_4": 3.7526055812835692,
"ce_loss_9": 3.417297029495239,
"epoch": 0.309,
"grad_norm": 776.0,
"kl_loss_13": 90.14528007507325,
"kl_loss_2": 2015.0728637695313,
"kl_loss_4": 1102.9831237792969,
"kl_loss_9": 323.41277465820315,
"learning_rate": 0.0007913194498130252,
"loss": 870.3509,
"step": 3090
},
{
"ce_loss_13": 3.2310577392578126,
"ce_loss_17": 3.1834172368049622,
"ce_loss_2": 4.143940055370331,
"ce_loss_4": 3.6971134185791015,
"ce_loss_9": 3.347603809833527,
"epoch": 0.31,
"grad_norm": 736.0,
"kl_loss_13": 88.47995147705078,
"kl_loss_2": 2047.0616760253906,
"kl_loss_4": 1117.8453430175782,
"kl_loss_9": 326.5718460083008,
"learning_rate": 0.0007900284547855992,
"loss": 898.5285,
"step": 3100
},
{
"ce_loss_13": 3.247792375087738,
"ce_loss_17": 3.2005126953125,
"ce_loss_2": 4.1182458877563475,
"ce_loss_4": 3.6890514135360717,
"ce_loss_9": 3.359893488883972,
"epoch": 0.311,
"grad_norm": 688.0,
"kl_loss_13": 87.62182350158692,
"kl_loss_2": 1989.3281494140624,
"kl_loss_4": 1094.8817047119142,
"kl_loss_9": 322.1224609375,
"learning_rate": 0.0007887345391748532,
"loss": 891.0457,
"step": 3110
},
{
"ce_loss_13": 3.3665268778800965,
"ce_loss_17": 3.3208903670310974,
"ce_loss_2": 4.215417850017547,
"ce_loss_4": 3.7931267857551574,
"ce_loss_9": 3.4738640189170837,
"epoch": 0.312,
"grad_norm": 816.0,
"kl_loss_13": 87.93111915588379,
"kl_loss_2": 1944.576806640625,
"kl_loss_4": 1068.2371887207032,
"kl_loss_9": 314.94420928955077,
"learning_rate": 0.0007874377160105036,
"loss": 852.7338,
"step": 3120
},
{
"ce_loss_13": 3.266544818878174,
"ce_loss_17": 3.2198745250701903,
"ce_loss_2": 4.183292400836945,
"ce_loss_4": 3.7293641209602355,
"ce_loss_9": 3.3768165946006774,
"epoch": 0.313,
"grad_norm": 812.0,
"kl_loss_13": 92.22348403930664,
"kl_loss_2": 2072.977648925781,
"kl_loss_4": 1130.5628204345703,
"kl_loss_9": 317.1712127685547,
"learning_rate": 0.0007861379983515449,
"loss": 918.9357,
"step": 3130
},
{
"ce_loss_13": 3.346944522857666,
"ce_loss_17": 3.3014272809028626,
"ce_loss_2": 4.223881769180298,
"ce_loss_4": 3.801562249660492,
"ce_loss_9": 3.460020422935486,
"epoch": 0.314,
"grad_norm": 1004.0,
"kl_loss_13": 87.69670944213867,
"kl_loss_2": 2007.4992248535157,
"kl_loss_4": 1115.0575561523438,
"kl_loss_9": 323.3450332641602,
"learning_rate": 0.0007848353992861195,
"loss": 879.2082,
"step": 3140
},
{
"ce_loss_13": 3.426660692691803,
"ce_loss_17": 3.37130469083786,
"ce_loss_2": 4.312441456317901,
"ce_loss_4": 3.8890052437782288,
"ce_loss_9": 3.546935868263245,
"epoch": 0.315,
"grad_norm": 824.0,
"kl_loss_13": 94.69918479919434,
"kl_loss_2": 1997.0293395996093,
"kl_loss_4": 1118.933282470703,
"kl_loss_9": 335.3253112792969,
"learning_rate": 0.0007835299319313853,
"loss": 891.7405,
"step": 3150
},
{
"ce_loss_13": 3.3141162276268004,
"ce_loss_17": 3.266682839393616,
"ce_loss_2": 4.179789280891418,
"ce_loss_4": 3.750796389579773,
"ce_loss_9": 3.4229891777038572,
"epoch": 0.316,
"grad_norm": 852.0,
"kl_loss_13": 88.69958419799805,
"kl_loss_2": 1990.604473876953,
"kl_loss_4": 1091.772930908203,
"kl_loss_9": 319.4409896850586,
"learning_rate": 0.0007822216094333848,
"loss": 903.952,
"step": 3160
},
{
"ce_loss_13": 3.3159342288970945,
"ce_loss_17": 3.266482150554657,
"ce_loss_2": 4.214651012420655,
"ce_loss_4": 3.7684731245040894,
"ce_loss_9": 3.425386905670166,
"epoch": 0.317,
"grad_norm": 828.0,
"kl_loss_13": 89.16406593322753,
"kl_loss_2": 2032.8277465820313,
"kl_loss_4": 1108.2530944824218,
"kl_loss_9": 323.86759490966796,
"learning_rate": 0.0007809104449669101,
"loss": 878.7519,
"step": 3170
},
{
"ce_loss_13": 3.270819342136383,
"ce_loss_17": 3.223061752319336,
"ce_loss_2": 4.13521009683609,
"ce_loss_4": 3.7123499155044555,
"ce_loss_9": 3.377288591861725,
"epoch": 0.318,
"grad_norm": 864.0,
"kl_loss_13": 85.74185523986816,
"kl_loss_2": 1959.4578186035155,
"kl_loss_4": 1078.3698852539062,
"kl_loss_9": 318.6862518310547,
"learning_rate": 0.0007795964517353734,
"loss": 862.148,
"step": 3180
},
{
"ce_loss_13": 3.2628143787384034,
"ce_loss_17": 3.218585216999054,
"ce_loss_2": 4.16016184091568,
"ce_loss_4": 3.711937928199768,
"ce_loss_9": 3.3730796337127686,
"epoch": 0.319,
"grad_norm": 680.0,
"kl_loss_13": 86.5244125366211,
"kl_loss_2": 2051.4399475097657,
"kl_loss_4": 1119.6945068359375,
"kl_loss_9": 325.8485305786133,
"learning_rate": 0.000778279642970672,
"loss": 867.9717,
"step": 3190
},
{
"ce_loss_13": 3.2614962220191956,
"ce_loss_17": 3.216829836368561,
"ce_loss_2": 4.138477158546448,
"ce_loss_4": 3.704724645614624,
"ce_loss_9": 3.3741748690605164,
"epoch": 0.32,
"grad_norm": 1112.0,
"kl_loss_13": 84.40923233032227,
"kl_loss_2": 1991.6298278808595,
"kl_loss_4": 1090.3296936035156,
"kl_loss_9": 322.5443389892578,
"learning_rate": 0.0007769600319330552,
"loss": 859.8841,
"step": 3200
},
{
"ce_loss_13": 3.288937306404114,
"ce_loss_17": 3.2442583322525023,
"ce_loss_2": 4.222093415260315,
"ce_loss_4": 3.7567214012145995,
"ce_loss_9": 3.4035842776298524,
"epoch": 0.321,
"grad_norm": 1088.0,
"kl_loss_13": 83.93023834228515,
"kl_loss_2": 2080.5573974609374,
"kl_loss_4": 1127.0830047607421,
"kl_loss_9": 325.4534255981445,
"learning_rate": 0.0007756376319109917,
"loss": 887.2564,
"step": 3210
},
{
"ce_loss_13": 3.339513421058655,
"ce_loss_17": 3.2954869866371155,
"ce_loss_2": 4.208236837387085,
"ce_loss_4": 3.7821726322174074,
"ce_loss_9": 3.451899802684784,
"epoch": 0.322,
"grad_norm": 936.0,
"kl_loss_13": 82.67616577148438,
"kl_loss_2": 1983.3537475585938,
"kl_loss_4": 1092.1045135498048,
"kl_loss_9": 322.47215576171874,
"learning_rate": 0.0007743124562210351,
"loss": 852.4111,
"step": 3220
},
{
"ce_loss_13": 3.350459098815918,
"ce_loss_17": 3.3049745678901674,
"ce_loss_2": 4.223370480537414,
"ce_loss_4": 3.792885971069336,
"ce_loss_9": 3.4620816826820375,
"epoch": 0.323,
"grad_norm": 952.0,
"kl_loss_13": 84.83108139038086,
"kl_loss_2": 1979.5841796875,
"kl_loss_4": 1089.1873504638672,
"kl_loss_9": 320.7507629394531,
"learning_rate": 0.0007729845182076895,
"loss": 872.6604,
"step": 3230
},
{
"ce_loss_13": 3.280298352241516,
"ce_loss_17": 3.239156460762024,
"ce_loss_2": 4.138239550590515,
"ce_loss_4": 3.7194194436073302,
"ce_loss_9": 3.3929233312606812,
"epoch": 0.324,
"grad_norm": 848.0,
"kl_loss_13": 82.10867538452149,
"kl_loss_2": 1946.6383728027345,
"kl_loss_4": 1077.661312866211,
"kl_loss_9": 320.58251495361327,
"learning_rate": 0.0007716538312432765,
"loss": 880.8723,
"step": 3240
},
{
"ce_loss_13": 3.242268109321594,
"ce_loss_17": 3.1970906615257264,
"ce_loss_2": 4.153493225574493,
"ce_loss_4": 3.6985382556915285,
"ce_loss_9": 3.3589555382728578,
"epoch": 0.325,
"grad_norm": 744.0,
"kl_loss_13": 83.0129451751709,
"kl_loss_2": 2044.7987487792968,
"kl_loss_4": 1114.1810821533204,
"kl_loss_9": 324.7056488037109,
"learning_rate": 0.0007703204087277988,
"loss": 886.2568,
"step": 3250
},
{
"ce_loss_13": 3.340661096572876,
"ce_loss_17": 3.2950722336769105,
"ce_loss_2": 4.182095468044281,
"ce_loss_4": 3.757711577415466,
"ce_loss_9": 3.44431414604187,
"epoch": 0.326,
"grad_norm": 740.0,
"kl_loss_13": 81.64471549987793,
"kl_loss_2": 1907.3132751464843,
"kl_loss_4": 1040.2453918457031,
"kl_loss_9": 309.3230773925781,
"learning_rate": 0.0007689842640888063,
"loss": 847.0235,
"step": 3260
},
{
"ce_loss_13": 3.3351841807365417,
"ce_loss_17": 3.2892799735069276,
"ce_loss_2": 4.196241044998169,
"ce_loss_4": 3.774498987197876,
"ce_loss_9": 3.448796498775482,
"epoch": 0.327,
"grad_norm": 768.0,
"kl_loss_13": 83.10033264160157,
"kl_loss_2": 1935.7306579589845,
"kl_loss_4": 1063.3290313720704,
"kl_loss_9": 319.72888641357423,
"learning_rate": 0.0007676454107812607,
"loss": 860.6641,
"step": 3270
},
{
"ce_loss_13": 3.27805734872818,
"ce_loss_17": 3.2325761914253235,
"ce_loss_2": 4.163552284240723,
"ce_loss_4": 3.7228221774101256,
"ce_loss_9": 3.3905189394950868,
"epoch": 0.328,
"grad_norm": 1200.0,
"kl_loss_13": 83.2603816986084,
"kl_loss_2": 2022.139111328125,
"kl_loss_4": 1090.8947479248047,
"kl_loss_9": 321.4178863525391,
"learning_rate": 0.0007663038622873999,
"loss": 865.2281,
"step": 3280
},
{
"ce_loss_13": 3.31582168340683,
"ce_loss_17": 3.2736997604370117,
"ce_loss_2": 4.193931818008423,
"ce_loss_4": 3.752781319618225,
"ce_loss_9": 3.42306672334671,
"epoch": 0.329,
"grad_norm": 772.0,
"kl_loss_13": 82.19000549316407,
"kl_loss_2": 1988.5528625488282,
"kl_loss_4": 1076.8862548828124,
"kl_loss_9": 316.0207717895508,
"learning_rate": 0.0007649596321166025,
"loss": 855.2568,
"step": 3290
},
{
"ce_loss_13": 3.2226877927780153,
"ce_loss_17": 3.178584325313568,
"ce_loss_2": 4.08319239616394,
"ce_loss_4": 3.6656685709953307,
"ce_loss_9": 3.3301095366477966,
"epoch": 0.33,
"grad_norm": 724.0,
"kl_loss_13": 78.87243385314942,
"kl_loss_2": 1926.275946044922,
"kl_loss_4": 1067.525653076172,
"kl_loss_9": 306.3053573608398,
"learning_rate": 0.0007636127338052513,
"loss": 860.6953,
"step": 3300
},
{
"ce_loss_13": 3.320180690288544,
"ce_loss_17": 3.276493513584137,
"ce_loss_2": 4.227478420734405,
"ce_loss_4": 3.7749785423278808,
"ce_loss_9": 3.435419762134552,
"epoch": 0.331,
"grad_norm": 632.0,
"kl_loss_13": 84.00535354614257,
"kl_loss_2": 2043.9775085449219,
"kl_loss_4": 1105.5403198242188,
"kl_loss_9": 321.6100845336914,
"learning_rate": 0.0007622631809165971,
"loss": 869.9645,
"step": 3310
},
{
"ce_loss_13": 3.3199275851249697,
"ce_loss_17": 3.2771979689598085,
"ce_loss_2": 4.15064367055893,
"ce_loss_4": 3.7406180262565614,
"ce_loss_9": 3.4273388624191283,
"epoch": 0.332,
"grad_norm": 900.0,
"kl_loss_13": 78.16543731689453,
"kl_loss_2": 1873.4186096191406,
"kl_loss_4": 1031.5828399658203,
"kl_loss_9": 303.8035720825195,
"learning_rate": 0.000760910987040623,
"loss": 841.3832,
"step": 3320
},
{
"ce_loss_13": 3.301027572154999,
"ce_loss_17": 3.2573514699935915,
"ce_loss_2": 4.202074742317199,
"ce_loss_4": 3.752170813083649,
"ce_loss_9": 3.4156262040138246,
"epoch": 0.333,
"grad_norm": 820.0,
"kl_loss_13": 82.72586593627929,
"kl_loss_2": 2041.1314575195313,
"kl_loss_4": 1107.1857055664063,
"kl_loss_9": 323.77696228027344,
"learning_rate": 0.000759556165793906,
"loss": 863.6803,
"step": 3330
},
{
"ce_loss_13": 3.3216323256492615,
"ce_loss_17": 3.2765379667282106,
"ce_loss_2": 4.196370267868042,
"ce_loss_4": 3.769469165802002,
"ce_loss_9": 3.4320739030838014,
"epoch": 0.334,
"grad_norm": 748.0,
"kl_loss_13": 82.56733436584473,
"kl_loss_2": 1987.8205200195312,
"kl_loss_4": 1094.0633605957032,
"kl_loss_9": 319.1539001464844,
"learning_rate": 0.000758198730819481,
"loss": 876.0232,
"step": 3340
},
{
"ce_loss_13": 3.2749180793762207,
"ce_loss_17": 3.2325748682022093,
"ce_loss_2": 4.154002678394318,
"ce_loss_4": 3.724340319633484,
"ce_loss_9": 3.386329698562622,
"epoch": 0.335,
"grad_norm": 880.0,
"kl_loss_13": 80.55043601989746,
"kl_loss_2": 2001.4410034179687,
"kl_loss_4": 1099.623828125,
"kl_loss_9": 314.9535186767578,
"learning_rate": 0.0007568386957867032,
"loss": 869.0264,
"step": 3350
},
{
"ce_loss_13": 3.3380845189094543,
"ce_loss_17": 3.2921542525291443,
"ce_loss_2": 4.198313045501709,
"ce_loss_4": 3.776810646057129,
"ce_loss_9": 3.4500930070877076,
"epoch": 0.336,
"grad_norm": 836.0,
"kl_loss_13": 81.9162425994873,
"kl_loss_2": 1956.8607299804687,
"kl_loss_4": 1075.7296630859375,
"kl_loss_9": 317.184814453125,
"learning_rate": 0.0007554760743911103,
"loss": 869.276,
"step": 3360
},
{
"ce_loss_13": 3.249480664730072,
"ce_loss_17": 3.207807409763336,
"ce_loss_2": 4.117359912395477,
"ce_loss_4": 3.6858110547065737,
"ce_loss_9": 3.3595375537872316,
"epoch": 0.337,
"grad_norm": 752.0,
"kl_loss_13": 78.22098541259766,
"kl_loss_2": 1978.9128662109374,
"kl_loss_4": 1069.7276184082032,
"kl_loss_9": 306.45787353515624,
"learning_rate": 0.0007541108803542846,
"loss": 883.0004,
"step": 3370
},
{
"ce_loss_13": 3.292482590675354,
"ce_loss_17": 3.24836003780365,
"ce_loss_2": 4.1632227301597595,
"ce_loss_4": 3.725235164165497,
"ce_loss_9": 3.399733805656433,
"epoch": 0.338,
"grad_norm": 900.0,
"kl_loss_13": 81.92625350952149,
"kl_loss_2": 1998.8309326171875,
"kl_loss_4": 1077.7982391357423,
"kl_loss_9": 313.54881439208987,
"learning_rate": 0.0007527431274237149,
"loss": 907.5791,
"step": 3380
},
{
"ce_loss_13": 3.2658942699432374,
"ce_loss_17": 3.224719560146332,
"ce_loss_2": 4.125988805294037,
"ce_loss_4": 3.6935991287231444,
"ce_loss_9": 3.3738891005516054,
"epoch": 0.339,
"grad_norm": 776.0,
"kl_loss_13": 80.2432315826416,
"kl_loss_2": 1971.8451171875,
"kl_loss_4": 1068.139666748047,
"kl_loss_9": 310.88482971191405,
"learning_rate": 0.0007513728293726579,
"loss": 861.7771,
"step": 3390
},
{
"ce_loss_13": 3.378425621986389,
"ce_loss_17": 3.3328167319297792,
"ce_loss_2": 4.222032153606415,
"ce_loss_4": 3.809156823158264,
"ce_loss_9": 3.4891692996025085,
"epoch": 0.34,
"grad_norm": 768.0,
"kl_loss_13": 81.9792694091797,
"kl_loss_2": 1940.3130737304687,
"kl_loss_4": 1070.0313262939453,
"kl_loss_9": 317.02344970703126,
"learning_rate": 0.00075,
"loss": 849.7743,
"step": 3400
},
{
"ce_loss_13": 3.3663819909095762,
"ce_loss_17": 3.3200625658035277,
"ce_loss_2": 4.24148062467575,
"ce_loss_4": 3.816836142539978,
"ce_loss_9": 3.4848857522010803,
"epoch": 0.341,
"grad_norm": 912.0,
"kl_loss_13": 83.7885841369629,
"kl_loss_2": 1977.0342834472656,
"kl_loss_4": 1090.5568939208983,
"kl_loss_9": 327.1791015625,
"learning_rate": 0.0007486246531301177,
"loss": 858.6299,
"step": 3410
},
{
"ce_loss_13": 3.1796913862228395,
"ce_loss_17": 3.1363569140434264,
"ce_loss_2": 4.062297952175141,
"ce_loss_4": 3.633475697040558,
"ce_loss_9": 3.2941571712493896,
"epoch": 0.342,
"grad_norm": 788.0,
"kl_loss_13": 79.56804389953614,
"kl_loss_2": 1977.1298095703125,
"kl_loss_4": 1089.7484497070313,
"kl_loss_9": 323.80016326904297,
"learning_rate": 0.0007472468026127384,
"loss": 850.588,
"step": 3420
},
{
"ce_loss_13": 3.315054738521576,
"ce_loss_17": 3.2690629601478576,
"ce_loss_2": 4.222481107711792,
"ce_loss_4": 3.777264642715454,
"ce_loss_9": 3.4370264291763304,
"epoch": 0.343,
"grad_norm": 836.0,
"kl_loss_13": 84.76142272949218,
"kl_loss_2": 2066.071649169922,
"kl_loss_4": 1127.61123046875,
"kl_loss_9": 338.3886428833008,
"learning_rate": 0.000745866462322802,
"loss": 888.6806,
"step": 3430
},
{
"ce_loss_13": 3.3010619044303895,
"ce_loss_17": 3.2584898352622984,
"ce_loss_2": 4.155515027046204,
"ce_loss_4": 3.732189679145813,
"ce_loss_9": 3.4116573691368104,
"epoch": 0.344,
"grad_norm": 792.0,
"kl_loss_13": 80.68924560546876,
"kl_loss_2": 1923.6429443359375,
"kl_loss_4": 1055.2689880371095,
"kl_loss_9": 314.93599700927734,
"learning_rate": 0.0007444836461603195,
"loss": 852.7298,
"step": 3440
},
{
"ce_loss_13": 3.3628268957138063,
"ce_loss_17": 3.3158179044723513,
"ce_loss_2": 4.236175012588501,
"ce_loss_4": 3.80575430393219,
"ce_loss_9": 3.4840613842010497,
"epoch": 0.345,
"grad_norm": 768.0,
"kl_loss_13": 86.91317672729492,
"kl_loss_2": 1995.2550659179688,
"kl_loss_4": 1098.6545684814453,
"kl_loss_9": 338.66367797851564,
"learning_rate": 0.0007430983680502344,
"loss": 884.6416,
"step": 3450
},
{
"ce_loss_13": 3.204190742969513,
"ce_loss_17": 3.160679280757904,
"ce_loss_2": 4.097968113422394,
"ce_loss_4": 3.6541949272155763,
"ce_loss_9": 3.3195021510124207,
"epoch": 0.346,
"grad_norm": 828.0,
"kl_loss_13": 82.72286491394043,
"kl_loss_2": 2016.9924682617188,
"kl_loss_4": 1099.164813232422,
"kl_loss_9": 331.8776428222656,
"learning_rate": 0.0007417106419422819,
"loss": 877.3746,
"step": 3460
},
{
"ce_loss_13": 3.3090489149093627,
"ce_loss_17": 3.262884783744812,
"ce_loss_2": 4.165620517730713,
"ce_loss_4": 3.7416993379592896,
"ce_loss_9": 3.4226615190505982,
"epoch": 0.347,
"grad_norm": 804.0,
"kl_loss_13": 82.66714973449707,
"kl_loss_2": 1941.4239135742187,
"kl_loss_4": 1066.0562683105468,
"kl_loss_9": 318.9803466796875,
"learning_rate": 0.0007403204818108486,
"loss": 867.3693,
"step": 3470
},
{
"ce_loss_13": 3.2859779000282288,
"ce_loss_17": 3.2418403744697573,
"ce_loss_2": 4.1588115215301515,
"ce_loss_4": 3.7164050340652466,
"ce_loss_9": 3.3979223132133485,
"epoch": 0.348,
"grad_norm": 660.0,
"kl_loss_13": 82.70349998474121,
"kl_loss_2": 2008.7912902832031,
"kl_loss_4": 1082.7954498291015,
"kl_loss_9": 321.50720062255857,
"learning_rate": 0.0007389279016548316,
"loss": 845.0812,
"step": 3480
},
{
"ce_loss_13": 3.2886459708213804,
"ce_loss_17": 3.2422160506248474,
"ce_loss_2": 4.201461863517761,
"ce_loss_4": 3.7479749441146852,
"ce_loss_9": 3.4051220774650575,
"epoch": 0.349,
"grad_norm": 812.0,
"kl_loss_13": 82.32789001464843,
"kl_loss_2": 2051.374365234375,
"kl_loss_4": 1109.6338958740234,
"kl_loss_9": 325.52098541259767,
"learning_rate": 0.0007375329154974975,
"loss": 881.8524,
"step": 3490
},
{
"ce_loss_13": 3.2568347334861754,
"ce_loss_17": 3.213172948360443,
"ce_loss_2": 4.103061079978943,
"ce_loss_4": 3.6816585183143617,
"ce_loss_9": 3.364275133609772,
"epoch": 0.35,
"grad_norm": 700.0,
"kl_loss_13": 83.60759010314942,
"kl_loss_2": 1942.3072021484375,
"kl_loss_4": 1067.110647583008,
"kl_loss_9": 311.56414184570315,
"learning_rate": 0.0007361355373863414,
"loss": 870.9309,
"step": 3500
},
{
"ce_loss_13": 3.2974486470222475,
"ce_loss_17": 3.256460976600647,
"ce_loss_2": 4.147087705135346,
"ce_loss_4": 3.7272114038467405,
"ce_loss_9": 3.406346929073334,
"epoch": 0.351,
"grad_norm": 988.0,
"kl_loss_13": 78.86546592712402,
"kl_loss_2": 1917.3578735351562,
"kl_loss_4": 1045.9387237548829,
"kl_loss_9": 310.67402801513674,
"learning_rate": 0.0007347357813929454,
"loss": 867.2453,
"step": 3510
},
{
"ce_loss_13": 3.2474450349807737,
"ce_loss_17": 3.203974211215973,
"ce_loss_2": 4.1023600697517395,
"ce_loss_4": 3.6744820713996886,
"ce_loss_9": 3.3537165760993957,
"epoch": 0.352,
"grad_norm": 924.0,
"kl_loss_13": 79.74258193969726,
"kl_loss_2": 1930.6052978515625,
"kl_loss_4": 1048.7113800048828,
"kl_loss_9": 309.16983489990236,
"learning_rate": 0.0007333336616128369,
"loss": 867.3182,
"step": 3520
},
{
"ce_loss_13": 3.224319100379944,
"ce_loss_17": 3.177726924419403,
"ce_loss_2": 4.1144388794898985,
"ce_loss_4": 3.667049491405487,
"ce_loss_9": 3.3337238669395446,
"epoch": 0.353,
"grad_norm": 852.0,
"kl_loss_13": 81.09270210266114,
"kl_loss_2": 1998.5930908203125,
"kl_loss_4": 1079.8505889892579,
"kl_loss_9": 319.79501190185545,
"learning_rate": 0.0007319291921653463,
"loss": 869.1441,
"step": 3530
},
{
"ce_loss_13": 3.306344509124756,
"ce_loss_17": 3.260231626033783,
"ce_loss_2": 4.194766342639923,
"ce_loss_4": 3.753794813156128,
"ce_loss_9": 3.4233739256858824,
"epoch": 0.354,
"grad_norm": 884.0,
"kl_loss_13": 83.32369499206543,
"kl_loss_2": 1991.9574462890625,
"kl_loss_4": 1089.6318115234376,
"kl_loss_9": 322.709977722168,
"learning_rate": 0.0007305223871934656,
"loss": 855.8797,
"step": 3540
},
{
"ce_loss_13": 3.275039529800415,
"ce_loss_17": 3.228920245170593,
"ce_loss_2": 4.137136328220367,
"ce_loss_4": 3.708537828922272,
"ce_loss_9": 3.3849629878997805,
"epoch": 0.355,
"grad_norm": 948.0,
"kl_loss_13": 81.76578102111816,
"kl_loss_2": 1950.9606018066406,
"kl_loss_4": 1065.2773895263672,
"kl_loss_9": 313.07694854736326,
"learning_rate": 0.0007291132608637052,
"loss": 857.2044,
"step": 3550
},
{
"ce_loss_13": 3.2366208672523498,
"ce_loss_17": 3.193993294239044,
"ce_loss_2": 4.182784128189087,
"ce_loss_4": 3.6868955850601197,
"ce_loss_9": 3.3444697976112367,
"epoch": 0.356,
"grad_norm": 812.0,
"kl_loss_13": 78.44016952514649,
"kl_loss_2": 2117.178143310547,
"kl_loss_4": 1090.2738372802735,
"kl_loss_9": 307.27020263671875,
"learning_rate": 0.0007277018273659516,
"loss": 891.6365,
"step": 3560
},
{
"ce_loss_13": 3.361915683746338,
"ce_loss_17": 3.3157687306404116,
"ce_loss_2": 4.230954706668854,
"ce_loss_4": 3.804699718952179,
"ce_loss_9": 3.4764284491539,
"epoch": 0.357,
"grad_norm": 772.0,
"kl_loss_13": 82.67460632324219,
"kl_loss_2": 1993.5298583984375,
"kl_loss_4": 1096.8324890136719,
"kl_loss_9": 325.3048751831055,
"learning_rate": 0.0007262881009133242,
"loss": 869.1084,
"step": 3570
},
{
"ce_loss_13": 3.2845988869667053,
"ce_loss_17": 3.2426820397377014,
"ce_loss_2": 4.143541800975799,
"ce_loss_4": 3.7120046854019164,
"ce_loss_9": 3.3966021656990053,
"epoch": 0.358,
"grad_norm": 1064.0,
"kl_loss_13": 78.48812313079834,
"kl_loss_2": 1937.2622436523438,
"kl_loss_4": 1051.5308563232422,
"kl_loss_9": 308.0796234130859,
"learning_rate": 0.0007248720957420329,
"loss": 843.0527,
"step": 3580
},
{
"ce_loss_13": 3.2862144470214845,
"ce_loss_17": 3.244723927974701,
"ce_loss_2": 4.132400393486023,
"ce_loss_4": 3.7120439767837525,
"ce_loss_9": 3.390040838718414,
"epoch": 0.359,
"grad_norm": 884.0,
"kl_loss_13": 79.27053871154786,
"kl_loss_2": 1919.6581420898438,
"kl_loss_4": 1045.3176635742188,
"kl_loss_9": 308.4083541870117,
"learning_rate": 0.0007234538261112341,
"loss": 876.4783,
"step": 3590
},
{
"ce_loss_13": 3.320645642280579,
"ce_loss_17": 3.2759631395339968,
"ce_loss_2": 4.197598016262054,
"ce_loss_4": 3.76350314617157,
"ce_loss_9": 3.4308587551116942,
"epoch": 0.36,
"grad_norm": 904.0,
"kl_loss_13": 79.74393005371094,
"kl_loss_2": 1974.381982421875,
"kl_loss_4": 1078.2549438476562,
"kl_loss_9": 316.0038696289063,
"learning_rate": 0.0007220333063028871,
"loss": 854.5963,
"step": 3600
},
{
"ce_loss_13": 3.3596386075019837,
"ce_loss_17": 3.3134003400802614,
"ce_loss_2": 4.298530578613281,
"ce_loss_4": 3.835520887374878,
"ce_loss_9": 3.474874699115753,
"epoch": 0.361,
"grad_norm": 896.0,
"kl_loss_13": 82.07156906127929,
"kl_loss_2": 2145.89267578125,
"kl_loss_4": 1164.2722106933593,
"kl_loss_9": 322.6014999389648,
"learning_rate": 0.0007206105506216106,
"loss": 904.726,
"step": 3610
},
{
"ce_loss_13": 3.2352831959724426,
"ce_loss_17": 3.192703652381897,
"ce_loss_2": 4.088595056533814,
"ce_loss_4": 3.6704824805259704,
"ce_loss_9": 3.344395172595978,
"epoch": 0.362,
"grad_norm": 792.0,
"kl_loss_13": 79.15930404663087,
"kl_loss_2": 1925.1086791992188,
"kl_loss_4": 1052.2397583007812,
"kl_loss_9": 310.4357452392578,
"learning_rate": 0.0007191855733945387,
"loss": 838.3771,
"step": 3620
},
{
"ce_loss_13": 3.3250008821487427,
"ce_loss_17": 3.282253110408783,
"ce_loss_2": 4.182705891132355,
"ce_loss_4": 3.760298418998718,
"ce_loss_9": 3.4336714267730715,
"epoch": 0.363,
"grad_norm": 1032.0,
"kl_loss_13": 79.23486404418945,
"kl_loss_2": 1932.1587280273438,
"kl_loss_4": 1057.2986968994142,
"kl_loss_9": 309.3342681884766,
"learning_rate": 0.0007177583889711762,
"loss": 845.6563,
"step": 3630
},
{
"ce_loss_13": 3.245380866527557,
"ce_loss_17": 3.202890765666962,
"ce_loss_2": 4.11917644739151,
"ce_loss_4": 3.6847108244895934,
"ce_loss_9": 3.356135535240173,
"epoch": 0.364,
"grad_norm": 716.0,
"kl_loss_13": 80.34892082214355,
"kl_loss_2": 1982.9852783203125,
"kl_loss_4": 1083.8937103271485,
"kl_loss_9": 318.4195266723633,
"learning_rate": 0.0007163290117232541,
"loss": 863.3904,
"step": 3640
},
{
"ce_loss_13": 3.359238362312317,
"ce_loss_17": 3.3169869422912597,
"ce_loss_2": 4.1737536787986755,
"ce_loss_4": 3.7707980632781983,
"ce_loss_9": 3.464288854598999,
"epoch": 0.365,
"grad_norm": 832.0,
"kl_loss_13": 78.68237228393555,
"kl_loss_2": 1882.076104736328,
"kl_loss_4": 1027.501986694336,
"kl_loss_9": 305.38031616210935,
"learning_rate": 0.0007148974560445859,
"loss": 837.6867,
"step": 3650
},
{
"ce_loss_13": 3.2792360305786135,
"ce_loss_17": 3.238192355632782,
"ce_loss_2": 4.114881753921509,
"ce_loss_4": 3.706641066074371,
"ce_loss_9": 3.3858341693878176,
"epoch": 0.366,
"grad_norm": 792.0,
"kl_loss_13": 77.95560035705566,
"kl_loss_2": 1890.5672485351563,
"kl_loss_4": 1040.546405029297,
"kl_loss_9": 307.1755569458008,
"learning_rate": 0.0007134637363509209,
"loss": 831.2457,
"step": 3660
},
{
"ce_loss_13": 3.3924476146697997,
"ce_loss_17": 3.3501150131225588,
"ce_loss_2": 4.223927116394043,
"ce_loss_4": 3.811612105369568,
"ce_loss_9": 3.4943058609962465,
"epoch": 0.367,
"grad_norm": 736.0,
"kl_loss_13": 78.58778686523438,
"kl_loss_2": 1878.4443969726562,
"kl_loss_4": 1033.2739837646484,
"kl_loss_9": 305.39368057250977,
"learning_rate": 0.0007120278670798009,
"loss": 845.4932,
"step": 3670
},
{
"ce_loss_13": 3.1926652908325197,
"ce_loss_17": 3.147311043739319,
"ce_loss_2": 4.120092225074768,
"ce_loss_4": 3.6587387681007386,
"ce_loss_9": 3.308837962150574,
"epoch": 0.368,
"grad_norm": 1000.0,
"kl_loss_13": 80.63431091308594,
"kl_loss_2": 2088.163299560547,
"kl_loss_4": 1126.5058013916016,
"kl_loss_9": 326.61964416503906,
"learning_rate": 0.0007105898626904133,
"loss": 900.6866,
"step": 3680
},
{
"ce_loss_13": 3.2909374833106995,
"ce_loss_17": 3.2479761719703673,
"ce_loss_2": 4.165724217891693,
"ce_loss_4": 3.7259296655654905,
"ce_loss_9": 3.401779890060425,
"epoch": 0.369,
"grad_norm": 708.0,
"kl_loss_13": 79.0161735534668,
"kl_loss_2": 1963.41201171875,
"kl_loss_4": 1061.9493225097656,
"kl_loss_9": 310.74631652832034,
"learning_rate": 0.0007091497376634463,
"loss": 845.885,
"step": 3690
},
{
"ce_loss_13": 3.242671525478363,
"ce_loss_17": 3.2003350973129274,
"ce_loss_2": 4.090977871418,
"ce_loss_4": 3.670491564273834,
"ce_loss_9": 3.348677897453308,
"epoch": 0.37,
"grad_norm": 636.0,
"kl_loss_13": 79.4022216796875,
"kl_loss_2": 1923.3630676269531,
"kl_loss_4": 1051.1070129394532,
"kl_loss_9": 309.0306625366211,
"learning_rate": 0.0007077075065009433,
"loss": 857.3949,
"step": 3700
},
{
"ce_loss_13": 3.339042866230011,
"ce_loss_17": 3.2937220692634583,
"ce_loss_2": 4.216702568531036,
"ce_loss_4": 3.786496937274933,
"ce_loss_9": 3.452583134174347,
"epoch": 0.371,
"grad_norm": 808.0,
"kl_loss_13": 83.84554100036621,
"kl_loss_2": 1977.7676391601562,
"kl_loss_4": 1081.7481719970704,
"kl_loss_9": 318.5555221557617,
"learning_rate": 0.0007062631837261557,
"loss": 858.5238,
"step": 3710
},
{
"ce_loss_13": 3.214540886878967,
"ce_loss_17": 3.1731338500976562,
"ce_loss_2": 4.084117066860199,
"ce_loss_4": 3.6512245535850525,
"ce_loss_9": 3.3241248726844788,
"epoch": 0.372,
"grad_norm": 836.0,
"kl_loss_13": 78.77836952209472,
"kl_loss_2": 1959.5369201660155,
"kl_loss_4": 1061.3888275146485,
"kl_loss_9": 308.37495574951174,
"learning_rate": 0.0007048167838833977,
"loss": 871.3439,
"step": 3720
},
{
"ce_loss_13": 3.305317234992981,
"ce_loss_17": 3.2632120013237,
"ce_loss_2": 4.142421758174896,
"ce_loss_4": 3.7283589124679564,
"ce_loss_9": 3.4138307213783263,
"epoch": 0.373,
"grad_norm": 836.0,
"kl_loss_13": 79.6880443572998,
"kl_loss_2": 1905.605615234375,
"kl_loss_4": 1035.902392578125,
"kl_loss_9": 308.8265640258789,
"learning_rate": 0.0007033683215379002,
"loss": 839.7719,
"step": 3730
},
{
"ce_loss_13": 3.2911507964134215,
"ce_loss_17": 3.2498687624931337,
"ce_loss_2": 4.154629027843475,
"ce_loss_4": 3.7290974378585817,
"ce_loss_9": 3.404646301269531,
"epoch": 0.374,
"grad_norm": 840.0,
"kl_loss_13": 79.13087692260743,
"kl_loss_2": 1930.552557373047,
"kl_loss_4": 1047.307080078125,
"kl_loss_9": 307.69998626708986,
"learning_rate": 0.0007019178112756625,
"loss": 854.4352,
"step": 3740
},
{
"ce_loss_13": 3.257925236225128,
"ce_loss_17": 3.217692804336548,
"ce_loss_2": 4.112984991073608,
"ce_loss_4": 3.68343665599823,
"ce_loss_9": 3.3633124232292175,
"epoch": 0.375,
"grad_norm": 884.0,
"kl_loss_13": 77.43262634277343,
"kl_loss_2": 1928.0837707519531,
"kl_loss_4": 1048.2009521484374,
"kl_loss_9": 304.83057403564453,
"learning_rate": 0.0007004652677033068,
"loss": 851.567,
"step": 3750
},
{
"ce_loss_13": 3.3331055283546447,
"ce_loss_17": 3.2939743518829347,
"ce_loss_2": 4.160905277729034,
"ce_loss_4": 3.75036461353302,
"ce_loss_9": 3.4350261807441713,
"epoch": 0.376,
"grad_norm": 820.0,
"kl_loss_13": 76.70789985656738,
"kl_loss_2": 1890.9389343261719,
"kl_loss_4": 1032.068994140625,
"kl_loss_9": 300.15679931640625,
"learning_rate": 0.0006990107054479312,
"loss": 838.1568,
"step": 3760
},
{
"ce_loss_13": 3.3143543720245363,
"ce_loss_17": 3.270555055141449,
"ce_loss_2": 4.165234935283661,
"ce_loss_4": 3.746877431869507,
"ce_loss_9": 3.4235692620277405,
"epoch": 0.377,
"grad_norm": 876.0,
"kl_loss_13": 80.17977790832519,
"kl_loss_2": 1924.0489685058594,
"kl_loss_4": 1052.3332916259765,
"kl_loss_9": 309.0487655639648,
"learning_rate": 0.000697554139156961,
"loss": 849.1367,
"step": 3770
},
{
"ce_loss_13": 3.3027577638626098,
"ce_loss_17": 3.2585461258888246,
"ce_loss_2": 4.1655859589576725,
"ce_loss_4": 3.736509621143341,
"ce_loss_9": 3.4114188671112062,
"epoch": 0.378,
"grad_norm": 824.0,
"kl_loss_13": 79.86643676757812,
"kl_loss_2": 1961.685479736328,
"kl_loss_4": 1064.6449981689452,
"kl_loss_9": 314.34642333984374,
"learning_rate": 0.0006960955834980027,
"loss": 838.839,
"step": 3780
},
{
"ce_loss_13": 3.2778510212898255,
"ce_loss_17": 3.231376898288727,
"ce_loss_2": 4.127654790878296,
"ce_loss_4": 3.7043236970901487,
"ce_loss_9": 3.3832536220550535,
"epoch": 0.379,
"grad_norm": 788.0,
"kl_loss_13": 79.7079948425293,
"kl_loss_2": 1923.5986022949219,
"kl_loss_4": 1057.3545501708984,
"kl_loss_9": 307.5535095214844,
"learning_rate": 0.0006946350531586958,
"loss": 846.7726,
"step": 3790
},
{
"ce_loss_13": 3.3021407604217528,
"ce_loss_17": 3.2583566904067993,
"ce_loss_2": 4.157404685020447,
"ce_loss_4": 3.733813750743866,
"ce_loss_9": 3.4081004738807676,
"epoch": 0.38,
"grad_norm": 884.0,
"kl_loss_13": 79.01305770874023,
"kl_loss_2": 1941.484326171875,
"kl_loss_4": 1055.3097930908202,
"kl_loss_9": 307.8282974243164,
"learning_rate": 0.0006931725628465643,
"loss": 862.8076,
"step": 3800
},
{
"ce_loss_13": 3.315520977973938,
"ce_loss_17": 3.273395228385925,
"ce_loss_2": 4.174019360542298,
"ce_loss_4": 3.7471574902534486,
"ce_loss_9": 3.42430157661438,
"epoch": 0.381,
"grad_norm": 792.0,
"kl_loss_13": 78.43453598022461,
"kl_loss_2": 1927.9592224121093,
"kl_loss_4": 1050.5472503662108,
"kl_loss_9": 307.24270095825193,
"learning_rate": 0.0006917081272888696,
"loss": 846.8363,
"step": 3810
},
{
"ce_loss_13": 3.2242579102516173,
"ce_loss_17": 3.181358051300049,
"ce_loss_2": 4.110445141792297,
"ce_loss_4": 3.653716266155243,
"ce_loss_9": 3.3308806896209715,
"epoch": 0.382,
"grad_norm": 960.0,
"kl_loss_13": 79.20247192382813,
"kl_loss_2": 2012.6092102050782,
"kl_loss_4": 1057.345932006836,
"kl_loss_9": 308.32613067626954,
"learning_rate": 0.0006902417612324615,
"loss": 848.2767,
"step": 3820
},
{
"ce_loss_13": 3.355782425403595,
"ce_loss_17": 3.3093623876571656,
"ce_loss_2": 4.239986300468445,
"ce_loss_4": 3.8019633531570434,
"ce_loss_9": 3.467235004901886,
"epoch": 0.383,
"grad_norm": 908.0,
"kl_loss_13": 82.72949295043945,
"kl_loss_2": 2004.3609008789062,
"kl_loss_4": 1097.67138671875,
"kl_loss_9": 321.2957168579102,
"learning_rate": 0.00068877347944363,
"loss": 866.2252,
"step": 3830
},
{
"ce_loss_13": 3.3492382287979128,
"ce_loss_17": 3.306003439426422,
"ce_loss_2": 4.181626462936402,
"ce_loss_4": 3.7661064505577087,
"ce_loss_9": 3.4544392824172974,
"epoch": 0.384,
"grad_norm": 952.0,
"kl_loss_13": 78.85240173339844,
"kl_loss_2": 1910.4473876953125,
"kl_loss_4": 1039.4304443359374,
"kl_loss_9": 308.61584320068357,
"learning_rate": 0.0006873032967079561,
"loss": 853.8964,
"step": 3840
},
{
"ce_loss_13": 3.336069393157959,
"ce_loss_17": 3.2946664333343505,
"ce_loss_2": 4.151469981670379,
"ce_loss_4": 3.745041239261627,
"ce_loss_9": 3.437327229976654,
"epoch": 0.385,
"grad_norm": 768.0,
"kl_loss_13": 77.62555274963378,
"kl_loss_2": 1888.4370483398438,
"kl_loss_4": 1033.6312164306642,
"kl_loss_9": 302.03850708007815,
"learning_rate": 0.0006858312278301637,
"loss": 829.2557,
"step": 3850
},
{
"ce_loss_13": 3.3677077651023866,
"ce_loss_17": 3.3257739305496217,
"ce_loss_2": 4.178003787994385,
"ce_loss_4": 3.772132909297943,
"ce_loss_9": 3.4704753518104554,
"epoch": 0.386,
"grad_norm": 844.0,
"kl_loss_13": 79.3912525177002,
"kl_loss_2": 1875.7722900390625,
"kl_loss_4": 1026.8799957275392,
"kl_loss_9": 306.31392059326174,
"learning_rate": 0.0006843572876339704,
"loss": 827.2038,
"step": 3860
},
{
"ce_loss_13": 3.2855607509613036,
"ce_loss_17": 3.244386303424835,
"ce_loss_2": 4.093652892112732,
"ce_loss_4": 3.692332851886749,
"ce_loss_9": 3.385841965675354,
"epoch": 0.387,
"grad_norm": 916.0,
"kl_loss_13": 77.1948143005371,
"kl_loss_2": 1841.5903930664062,
"kl_loss_4": 1009.3366516113281,
"kl_loss_9": 300.8722259521484,
"learning_rate": 0.0006828814909619373,
"loss": 847.6008,
"step": 3870
},
{
"ce_loss_13": 3.411894679069519,
"ce_loss_17": 3.3655677914619444,
"ce_loss_2": 4.245130181312561,
"ce_loss_4": 3.8279731273651123,
"ce_loss_9": 3.5144017219543455,
"epoch": 0.388,
"grad_norm": 836.0,
"kl_loss_13": 81.73871154785157,
"kl_loss_2": 1895.018212890625,
"kl_loss_4": 1038.8522857666017,
"kl_loss_9": 310.09679260253904,
"learning_rate": 0.0006814038526753205,
"loss": 824.0662,
"step": 3880
},
{
"ce_loss_13": 3.3071584939956664,
"ce_loss_17": 3.2654017090797423,
"ce_loss_2": 4.141039967536926,
"ce_loss_4": 3.729535710811615,
"ce_loss_9": 3.412937140464783,
"epoch": 0.389,
"grad_norm": 756.0,
"kl_loss_13": 78.23615188598633,
"kl_loss_2": 1893.238671875,
"kl_loss_4": 1039.030029296875,
"kl_loss_9": 307.1902275085449,
"learning_rate": 0.0006799243876539213,
"loss": 834.9811,
"step": 3890
},
{
"ce_loss_13": 3.2321924924850465,
"ce_loss_17": 3.191261577606201,
"ce_loss_2": 4.11304601430893,
"ce_loss_4": 3.6646535277366636,
"ce_loss_9": 3.3411396503448487,
"epoch": 0.39,
"grad_norm": 1096.0,
"kl_loss_13": 80.79813156127929,
"kl_loss_2": 1971.1149475097657,
"kl_loss_4": 1049.593035888672,
"kl_loss_9": 309.8133071899414,
"learning_rate": 0.0006784431107959359,
"loss": 854.1135,
"step": 3900
},
{
"ce_loss_13": 3.294963073730469,
"ce_loss_17": 3.250784397125244,
"ce_loss_2": 4.176942825317383,
"ce_loss_4": 3.7329927563667296,
"ce_loss_9": 3.4081822991371156,
"epoch": 0.391,
"grad_norm": 924.0,
"kl_loss_13": 79.89174423217773,
"kl_loss_2": 1994.7768249511719,
"kl_loss_4": 1062.798147583008,
"kl_loss_9": 311.36636505126955,
"learning_rate": 0.0006769600370178059,
"loss": 849.3525,
"step": 3910
},
{
"ce_loss_13": 3.2559167623519896,
"ce_loss_17": 3.2112300753593446,
"ce_loss_2": 4.119880974292755,
"ce_loss_4": 3.6950501561164857,
"ce_loss_9": 3.3670750975608827,
"epoch": 0.392,
"grad_norm": 724.0,
"kl_loss_13": 78.87548294067383,
"kl_loss_2": 1928.7585510253907,
"kl_loss_4": 1055.2064392089844,
"kl_loss_9": 310.10369262695315,
"learning_rate": 0.0006754751812540679,
"loss": 827.0956,
"step": 3920
},
{
"ce_loss_13": 3.3033456802368164,
"ce_loss_17": 3.2604467153549193,
"ce_loss_2": 4.161333394050598,
"ce_loss_4": 3.7346084475517274,
"ce_loss_9": 3.412954103946686,
"epoch": 0.393,
"grad_norm": 1004.0,
"kl_loss_13": 80.26275672912598,
"kl_loss_2": 1960.345928955078,
"kl_loss_4": 1062.8235137939453,
"kl_loss_9": 310.36848907470704,
"learning_rate": 0.0006739885584572025,
"loss": 854.8727,
"step": 3930
},
{
"ce_loss_13": 3.328982615470886,
"ce_loss_17": 3.285259997844696,
"ce_loss_2": 4.213145470619201,
"ce_loss_4": 3.7565238237380982,
"ce_loss_9": 3.436812424659729,
"epoch": 0.394,
"grad_norm": 1272.0,
"kl_loss_13": 81.48662643432617,
"kl_loss_2": 2019.0970703125,
"kl_loss_4": 1064.3614410400392,
"kl_loss_9": 310.75197448730466,
"learning_rate": 0.0006725001835974853,
"loss": 848.3544,
"step": 3940
},
{
"ce_loss_13": 3.319366729259491,
"ce_loss_17": 3.2750706672668457,
"ce_loss_2": 4.18541134595871,
"ce_loss_4": 3.758718478679657,
"ce_loss_9": 3.430189275741577,
"epoch": 0.395,
"grad_norm": 956.0,
"kl_loss_13": 81.0606185913086,
"kl_loss_2": 1969.6244079589844,
"kl_loss_4": 1066.606381225586,
"kl_loss_9": 310.7843353271484,
"learning_rate": 0.0006710100716628344,
"loss": 835.6175,
"step": 3950
},
{
"ce_loss_13": 3.3074681639671324,
"ce_loss_17": 3.2640940666198732,
"ce_loss_2": 4.156752181053162,
"ce_loss_4": 3.7395651817321776,
"ce_loss_9": 3.412280297279358,
"epoch": 0.396,
"grad_norm": 744.0,
"kl_loss_13": 78.18349380493164,
"kl_loss_2": 1931.2714904785157,
"kl_loss_4": 1062.4384399414062,
"kl_loss_9": 304.9924346923828,
"learning_rate": 0.0006695182376586602,
"loss": 849.6749,
"step": 3960
},
{
"ce_loss_13": 3.3401866912841798,
"ce_loss_17": 3.297827625274658,
"ce_loss_2": 4.143641221523285,
"ce_loss_4": 3.744636571407318,
"ce_loss_9": 3.4408209681510926,
"epoch": 0.397,
"grad_norm": 876.0,
"kl_loss_13": 77.44681243896484,
"kl_loss_2": 1831.1161743164062,
"kl_loss_4": 1002.5733123779297,
"kl_loss_9": 296.31993103027344,
"learning_rate": 0.000668024696607715,
"loss": 842.8701,
"step": 3970
},
{
"ce_loss_13": 3.295796203613281,
"ce_loss_17": 3.2550071120262145,
"ce_loss_2": 4.131110274791718,
"ce_loss_4": 3.7154141306877135,
"ce_loss_9": 3.400242578983307,
"epoch": 0.398,
"grad_norm": 788.0,
"kl_loss_13": 77.02307968139648,
"kl_loss_2": 1907.8619750976563,
"kl_loss_4": 1040.749215698242,
"kl_loss_9": 303.7198455810547,
"learning_rate": 0.0006665294635499404,
"loss": 836.6309,
"step": 3980
},
{
"ce_loss_13": 3.2978402376174927,
"ce_loss_17": 3.2530077695846558,
"ce_loss_2": 4.187583088874817,
"ce_loss_4": 3.7399232268333433,
"ce_loss_9": 3.4086226344108583,
"epoch": 0.399,
"grad_norm": 832.0,
"kl_loss_13": 81.76366004943847,
"kl_loss_2": 2014.4826110839845,
"kl_loss_4": 1080.946337890625,
"kl_loss_9": 316.02596893310545,
"learning_rate": 0.0006650325535423167,
"loss": 853.916,
"step": 3990
},
{
"ce_loss_13": 3.330570673942566,
"ce_loss_17": 3.288055944442749,
"ce_loss_2": 4.126987683773041,
"ce_loss_4": 3.733007574081421,
"ce_loss_9": 3.4324370622634888,
"epoch": 0.4,
"grad_norm": 764.0,
"kl_loss_13": 75.5315990447998,
"kl_loss_2": 1814.0561645507812,
"kl_loss_4": 997.540869140625,
"kl_loss_9": 293.8647766113281,
"learning_rate": 0.0006635339816587109,
"loss": 819.9658,
"step": 4000
},
{
"ce_loss_13": 3.2647815465927126,
"ce_loss_17": 3.22164990901947,
"ce_loss_2": 4.133316695690155,
"ce_loss_4": 3.6849881410598755,
"ce_loss_9": 3.368342387676239,
"epoch": 0.401,
"grad_norm": 928.0,
"kl_loss_13": 79.31331672668458,
"kl_loss_2": 1972.0010803222656,
"kl_loss_4": 1051.2727966308594,
"kl_loss_9": 306.35632476806643,
"learning_rate": 0.0006620337629897252,
"loss": 838.599,
"step": 4010
},
{
"ce_loss_13": 3.2707459211349486,
"ce_loss_17": 3.2277172327041628,
"ce_loss_2": 4.11969610452652,
"ce_loss_4": 3.6980610370635985,
"ce_loss_9": 3.377561020851135,
"epoch": 0.402,
"grad_norm": 876.0,
"kl_loss_13": 77.62689361572265,
"kl_loss_2": 1930.257879638672,
"kl_loss_4": 1045.306314086914,
"kl_loss_9": 304.64718475341795,
"learning_rate": 0.0006605319126425454,
"loss": 851.9811,
"step": 4020
},
{
"ce_loss_13": 3.1794585824012755,
"ce_loss_17": 3.135476553440094,
"ce_loss_2": 4.056850111484527,
"ce_loss_4": 3.6122671961784363,
"ce_loss_9": 3.2904020071029665,
"epoch": 0.403,
"grad_norm": 788.0,
"kl_loss_13": 79.04550437927246,
"kl_loss_2": 1997.5563049316406,
"kl_loss_4": 1072.4226806640625,
"kl_loss_9": 309.31011962890625,
"learning_rate": 0.0006590284457407876,
"loss": 856.1703,
"step": 4030
},
{
"ce_loss_13": 3.271203708648682,
"ce_loss_17": 3.228550398349762,
"ce_loss_2": 4.125647473335266,
"ce_loss_4": 3.696555233001709,
"ce_loss_9": 3.3810480356216432,
"epoch": 0.404,
"grad_norm": 944.0,
"kl_loss_13": 79.07201461791992,
"kl_loss_2": 1917.312451171875,
"kl_loss_4": 1031.7962280273437,
"kl_loss_9": 304.8022430419922,
"learning_rate": 0.0006575233774243465,
"loss": 834.16,
"step": 4040
},
{
"ce_loss_13": 3.2648462891578673,
"ce_loss_17": 3.2220491051673887,
"ce_loss_2": 4.121234345436096,
"ce_loss_4": 3.693406546115875,
"ce_loss_9": 3.374107909202576,
"epoch": 0.405,
"grad_norm": 944.0,
"kl_loss_13": 77.91585922241211,
"kl_loss_2": 1947.8253234863282,
"kl_loss_4": 1051.3325927734375,
"kl_loss_9": 307.1045608520508,
"learning_rate": 0.0006560167228492435,
"loss": 842.7441,
"step": 4050
},
{
"ce_loss_13": 3.3149160623550413,
"ce_loss_17": 3.274205195903778,
"ce_loss_2": 4.132644689083099,
"ce_loss_4": 3.7271281838417054,
"ce_loss_9": 3.419368267059326,
"epoch": 0.406,
"grad_norm": 1012.0,
"kl_loss_13": 77.59754943847656,
"kl_loss_2": 1871.8706176757812,
"kl_loss_4": 1024.0221588134766,
"kl_loss_9": 299.1046112060547,
"learning_rate": 0.0006545084971874737,
"loss": 838.234,
"step": 4060
},
{
"ce_loss_13": 3.2789357542991637,
"ce_loss_17": 3.2333378076553343,
"ce_loss_2": 4.15515753030777,
"ce_loss_4": 3.717312145233154,
"ce_loss_9": 3.3853928685188293,
"epoch": 0.407,
"grad_norm": 768.0,
"kl_loss_13": 81.99721145629883,
"kl_loss_2": 1989.4007446289063,
"kl_loss_4": 1078.338934326172,
"kl_loss_9": 316.0284622192383,
"learning_rate": 0.0006529987156268526,
"loss": 843.0453,
"step": 4070
},
{
"ce_loss_13": 3.1949838519096376,
"ce_loss_17": 3.1512240171432495,
"ce_loss_2": 4.062924098968506,
"ce_loss_4": 3.629937028884888,
"ce_loss_9": 3.305235779285431,
"epoch": 0.408,
"grad_norm": 1200.0,
"kl_loss_13": 80.35381736755372,
"kl_loss_2": 1956.441827392578,
"kl_loss_4": 1060.228433227539,
"kl_loss_9": 309.562190246582,
"learning_rate": 0.0006514873933708637,
"loss": 865.9684,
"step": 4080
},
{
"ce_loss_13": 3.307583379745483,
"ce_loss_17": 3.2647162079811096,
"ce_loss_2": 4.147261226177216,
"ce_loss_4": 3.7207722663879395,
"ce_loss_9": 3.4137214183807374,
"epoch": 0.409,
"grad_norm": 904.0,
"kl_loss_13": 77.66700706481933,
"kl_loss_2": 1919.4348022460938,
"kl_loss_4": 1029.5156524658203,
"kl_loss_9": 300.5586654663086,
"learning_rate": 0.0006499745456385053,
"loss": 828.9072,
"step": 4090
},
{
"ce_loss_13": 3.2745136857032775,
"ce_loss_17": 3.230354392528534,
"ce_loss_2": 4.120101284980774,
"ce_loss_4": 3.7019402742385865,
"ce_loss_9": 3.3798617243766786,
"epoch": 0.41,
"grad_norm": 820.0,
"kl_loss_13": 77.67239074707031,
"kl_loss_2": 1915.2319641113281,
"kl_loss_4": 1042.1595092773437,
"kl_loss_9": 302.0523025512695,
"learning_rate": 0.0006484601876641375,
"loss": 846.3459,
"step": 4100
},
{
"ce_loss_13": 3.2652450799942017,
"ce_loss_17": 3.2237433433532714,
"ce_loss_2": 4.077217090129852,
"ce_loss_4": 3.6771028876304626,
"ce_loss_9": 3.367217707633972,
"epoch": 0.411,
"grad_norm": 824.0,
"kl_loss_13": 76.87406425476074,
"kl_loss_2": 1852.5192993164062,
"kl_loss_4": 1016.7843078613281,
"kl_loss_9": 296.21001586914065,
"learning_rate": 0.000646944334697328,
"loss": 818.729,
"step": 4110
},
{
"ce_loss_13": 3.3710275053977967,
"ce_loss_17": 3.3288058638572693,
"ce_loss_2": 4.173830461502075,
"ce_loss_4": 3.7750932216644286,
"ce_loss_9": 3.4707067608833313,
"epoch": 0.412,
"grad_norm": 1112.0,
"kl_loss_13": 77.32941474914551,
"kl_loss_2": 1825.318096923828,
"kl_loss_4": 1005.8795227050781,
"kl_loss_9": 300.3182144165039,
"learning_rate": 0.0006454270020026995,
"loss": 810.5407,
"step": 4120
},
{
"ce_loss_13": 3.3413079261779783,
"ce_loss_17": 3.300805962085724,
"ce_loss_2": 4.138811671733857,
"ce_loss_4": 3.7457743525505065,
"ce_loss_9": 3.4431130647659303,
"epoch": 0.413,
"grad_norm": 832.0,
"kl_loss_13": 74.75608367919922,
"kl_loss_2": 1811.4225952148438,
"kl_loss_4": 992.4546264648437,
"kl_loss_9": 291.7052032470703,
"learning_rate": 0.0006439082048597755,
"loss": 804.2014,
"step": 4130
},
{
"ce_loss_13": 3.325836753845215,
"ce_loss_17": 3.2841463565826414,
"ce_loss_2": 4.166321110725403,
"ce_loss_4": 3.746056628227234,
"ce_loss_9": 3.4339433550834655,
"epoch": 0.414,
"grad_norm": 1104.0,
"kl_loss_13": 77.54204635620117,
"kl_loss_2": 1915.472186279297,
"kl_loss_4": 1034.2969024658203,
"kl_loss_9": 303.5854751586914,
"learning_rate": 0.0006423879585628261,
"loss": 836.3888,
"step": 4140
},
{
"ce_loss_13": 3.2876922488212585,
"ce_loss_17": 3.244413447380066,
"ce_loss_2": 4.1571802496910095,
"ce_loss_4": 3.7206246495246886,
"ce_loss_9": 3.397969675064087,
"epoch": 0.415,
"grad_norm": 844.0,
"kl_loss_13": 79.64313621520996,
"kl_loss_2": 1972.5984680175782,
"kl_loss_4": 1062.806448364258,
"kl_loss_9": 311.07977294921875,
"learning_rate": 0.0006408662784207149,
"loss": 852.0227,
"step": 4150
},
{
"ce_loss_13": 3.2557998657226563,
"ce_loss_17": 3.213127410411835,
"ce_loss_2": 4.102911353111267,
"ce_loss_4": 3.679526710510254,
"ce_loss_9": 3.3603752493858337,
"epoch": 0.416,
"grad_norm": 1032.0,
"kl_loss_13": 75.84726219177246,
"kl_loss_2": 1918.3287048339844,
"kl_loss_4": 1042.9201538085938,
"kl_loss_9": 302.7628631591797,
"learning_rate": 0.0006393431797567439,
"loss": 837.298,
"step": 4160
},
{
"ce_loss_13": 3.3369251608848574,
"ce_loss_17": 3.2943072080612184,
"ce_loss_2": 4.126527976989746,
"ce_loss_4": 3.724642038345337,
"ce_loss_9": 3.4380351662635804,
"epoch": 0.417,
"grad_norm": 776.0,
"kl_loss_13": 76.58052368164063,
"kl_loss_2": 1837.2471862792968,
"kl_loss_4": 998.3685485839844,
"kl_loss_9": 296.33739318847654,
"learning_rate": 0.0006378186779084996,
"loss": 796.7759,
"step": 4170
},
{
"ce_loss_13": 3.1708509802818297,
"ce_loss_17": 3.129050099849701,
"ce_loss_2": 4.039690005779266,
"ce_loss_4": 3.605795121192932,
"ce_loss_9": 3.276896905899048,
"epoch": 0.418,
"grad_norm": 760.0,
"kl_loss_13": 76.8579605102539,
"kl_loss_2": 1934.0495178222657,
"kl_loss_4": 1046.7709106445313,
"kl_loss_9": 305.6838653564453,
"learning_rate": 0.0006362927882276989,
"loss": 846.643,
"step": 4180
},
{
"ce_loss_13": 3.3557766914367675,
"ce_loss_17": 3.3138336539268494,
"ce_loss_2": 4.165496897697449,
"ce_loss_4": 3.756271946430206,
"ce_loss_9": 3.4560358047485353,
"epoch": 0.419,
"grad_norm": 704.0,
"kl_loss_13": 77.31939125061035,
"kl_loss_2": 1856.0389343261718,
"kl_loss_4": 1000.2576049804687,
"kl_loss_9": 295.1786392211914,
"learning_rate": 0.000634765526080034,
"loss": 801.9405,
"step": 4190
},
{
"ce_loss_13": 3.3638531088829042,
"ce_loss_17": 3.3213589787483215,
"ce_loss_2": 4.186532127857208,
"ce_loss_4": 3.7772738099098206,
"ce_loss_9": 3.466420602798462,
"epoch": 0.42,
"grad_norm": 768.0,
"kl_loss_13": 78.90635223388672,
"kl_loss_2": 1869.4422790527344,
"kl_loss_4": 1022.9750732421875,
"kl_loss_9": 301.8726806640625,
"learning_rate": 0.0006332369068450174,
"loss": 815.1037,
"step": 4200
},
{
"ce_loss_13": 3.2963593363761903,
"ce_loss_17": 3.2548921585083006,
"ce_loss_2": 4.137203359603882,
"ce_loss_4": 3.7190677642822267,
"ce_loss_9": 3.397758388519287,
"epoch": 0.421,
"grad_norm": 1512.0,
"kl_loss_13": 77.09004592895508,
"kl_loss_2": 1897.3307983398438,
"kl_loss_4": 1035.5769775390625,
"kl_loss_9": 300.7089096069336,
"learning_rate": 0.0006317069459158283,
"loss": 823.016,
"step": 4210
},
{
"ce_loss_13": 3.4075854420661926,
"ce_loss_17": 3.3641351580619814,
"ce_loss_2": 4.199348616600036,
"ce_loss_4": 3.7940779328346252,
"ce_loss_9": 3.5064744114875794,
"epoch": 0.422,
"grad_norm": 688.0,
"kl_loss_13": 77.74232749938965,
"kl_loss_2": 1831.5187866210938,
"kl_loss_4": 995.7357452392578,
"kl_loss_9": 296.18746490478514,
"learning_rate": 0.0006301756586991561,
"loss": 812.6053,
"step": 4220
},
{
"ce_loss_13": 3.1881481528282167,
"ce_loss_17": 3.1477632880210877,
"ce_loss_2": 4.054125678539276,
"ce_loss_4": 3.6175846815109254,
"ce_loss_9": 3.2955458760261536,
"epoch": 0.423,
"grad_norm": 852.0,
"kl_loss_13": 78.09756584167481,
"kl_loss_2": 1970.5815002441407,
"kl_loss_4": 1058.3021545410156,
"kl_loss_9": 304.6946334838867,
"learning_rate": 0.0006286430606150459,
"loss": 843.0814,
"step": 4230
},
{
"ce_loss_13": 3.3874191522598265,
"ce_loss_17": 3.3446141123771667,
"ce_loss_2": 4.204588401317596,
"ce_loss_4": 3.800839841365814,
"ce_loss_9": 3.4927867531776426,
"epoch": 0.424,
"grad_norm": 704.0,
"kl_loss_13": 78.68111152648926,
"kl_loss_2": 1860.591729736328,
"kl_loss_4": 1016.1077117919922,
"kl_loss_9": 299.66543731689455,
"learning_rate": 0.0006271091670967436,
"loss": 816.7326,
"step": 4240
},
{
"ce_loss_13": 3.3006239771842956,
"ce_loss_17": 3.254270374774933,
"ce_loss_2": 4.166652262210846,
"ce_loss_4": 3.745281684398651,
"ce_loss_9": 3.409282112121582,
"epoch": 0.425,
"grad_norm": 820.0,
"kl_loss_13": 81.14281044006347,
"kl_loss_2": 1977.9890625,
"kl_loss_4": 1089.311703491211,
"kl_loss_9": 315.2414260864258,
"learning_rate": 0.0006255739935905395,
"loss": 843.2561,
"step": 4250
},
{
"ce_loss_13": 3.3398287296295166,
"ce_loss_17": 3.2982711791992188,
"ce_loss_2": 4.1548157095909115,
"ce_loss_4": 3.7448269844055178,
"ce_loss_9": 3.444038677215576,
"epoch": 0.426,
"grad_norm": 788.0,
"kl_loss_13": 78.06872749328613,
"kl_loss_2": 1849.7942260742188,
"kl_loss_4": 1005.2631042480468,
"kl_loss_9": 299.2171264648438,
"learning_rate": 0.0006240375555556145,
"loss": 841.797,
"step": 4260
},
{
"ce_loss_13": 3.3420338153839113,
"ce_loss_17": 3.298211193084717,
"ce_loss_2": 4.206630194187165,
"ce_loss_4": 3.7783311367034913,
"ce_loss_9": 3.450661611557007,
"epoch": 0.427,
"grad_norm": 820.0,
"kl_loss_13": 80.10629310607911,
"kl_loss_2": 1955.883984375,
"kl_loss_4": 1052.7101623535157,
"kl_loss_9": 304.01519165039065,
"learning_rate": 0.000622499868463882,
"loss": 838.1461,
"step": 4270
},
{
"ce_loss_13": 3.315020728111267,
"ce_loss_17": 3.2741780042648316,
"ce_loss_2": 4.107786476612091,
"ce_loss_4": 3.705011510848999,
"ce_loss_9": 3.4146546721458435,
"epoch": 0.428,
"grad_norm": 872.0,
"kl_loss_13": 77.37769355773926,
"kl_loss_2": 1848.4179931640624,
"kl_loss_4": 997.9491729736328,
"kl_loss_9": 291.1655242919922,
"learning_rate": 0.0006209609477998338,
"loss": 815.0931,
"step": 4280
},
{
"ce_loss_13": 3.3685535907745363,
"ce_loss_17": 3.324531579017639,
"ce_loss_2": 4.187193953990937,
"ce_loss_4": 3.7842114925384522,
"ce_loss_9": 3.468222963809967,
"epoch": 0.429,
"grad_norm": 832.0,
"kl_loss_13": 79.78409614562989,
"kl_loss_2": 1874.082763671875,
"kl_loss_4": 1024.7485290527343,
"kl_loss_9": 300.9065643310547,
"learning_rate": 0.0006194208090603844,
"loss": 831.918,
"step": 4290
},
{
"ce_loss_13": 3.2854601979255675,
"ce_loss_17": 3.243454360961914,
"ce_loss_2": 4.113818681240081,
"ce_loss_4": 3.696045386791229,
"ce_loss_9": 3.3911391139030456,
"epoch": 0.43,
"grad_norm": 816.0,
"kl_loss_13": 75.71716423034668,
"kl_loss_2": 1863.8755798339844,
"kl_loss_4": 1003.1294708251953,
"kl_loss_9": 294.57244262695315,
"learning_rate": 0.0006178794677547138,
"loss": 806.2747,
"step": 4300
},
{
"ce_loss_13": 3.309733045101166,
"ce_loss_17": 3.2677249670028687,
"ce_loss_2": 4.158813059329987,
"ce_loss_4": 3.729399120807648,
"ce_loss_9": 3.4140867710113527,
"epoch": 0.431,
"grad_norm": 812.0,
"kl_loss_13": 78.08214874267578,
"kl_loss_2": 1935.1852661132812,
"kl_loss_4": 1048.1672271728517,
"kl_loss_9": 306.0022201538086,
"learning_rate": 0.0006163369394041111,
"loss": 833.0965,
"step": 4310
},
{
"ce_loss_13": 3.248022508621216,
"ce_loss_17": 3.207003927230835,
"ce_loss_2": 4.110338830947876,
"ce_loss_4": 3.6842548727989195,
"ce_loss_9": 3.357089364528656,
"epoch": 0.432,
"grad_norm": 1008.0,
"kl_loss_13": 75.94662075042724,
"kl_loss_2": 1934.8188049316407,
"kl_loss_4": 1051.1523498535157,
"kl_loss_9": 300.2815399169922,
"learning_rate": 0.0006147932395418205,
"loss": 852.7858,
"step": 4320
},
{
"ce_loss_13": 3.28361519575119,
"ce_loss_17": 3.241096353530884,
"ce_loss_2": 4.102269923686981,
"ce_loss_4": 3.6967898845672607,
"ce_loss_9": 3.3902220487594605,
"epoch": 0.433,
"grad_norm": 888.0,
"kl_loss_13": 76.6314250946045,
"kl_loss_2": 1858.9971618652344,
"kl_loss_4": 1016.6501831054687,
"kl_loss_9": 300.2827835083008,
"learning_rate": 0.0006132483837128823,
"loss": 814.2979,
"step": 4330
},
{
"ce_loss_13": 3.2630454659461976,
"ce_loss_17": 3.2222825288772583,
"ce_loss_2": 4.10932183265686,
"ce_loss_4": 3.676711046695709,
"ce_loss_9": 3.36719868183136,
"epoch": 0.434,
"grad_norm": 732.0,
"kl_loss_13": 76.80081672668457,
"kl_loss_2": 1923.487744140625,
"kl_loss_4": 1023.3612548828125,
"kl_loss_9": 300.2582061767578,
"learning_rate": 0.0006117023874739772,
"loss": 827.8298,
"step": 4340
},
{
"ce_loss_13": 3.2588926672935488,
"ce_loss_17": 3.218635880947113,
"ce_loss_2": 4.1072376608848575,
"ce_loss_4": 3.680481123924255,
"ce_loss_9": 3.3696163058280946,
"epoch": 0.435,
"grad_norm": 976.0,
"kl_loss_13": 76.15673713684082,
"kl_loss_2": 1920.554864501953,
"kl_loss_4": 1039.4910980224608,
"kl_loss_9": 304.0564468383789,
"learning_rate": 0.0006101552663932703,
"loss": 841.0854,
"step": 4350
},
{
"ce_loss_13": 3.2911000847816467,
"ce_loss_17": 3.249547779560089,
"ce_loss_2": 4.114426076412201,
"ce_loss_4": 3.7044010758399963,
"ce_loss_9": 3.395657777786255,
"epoch": 0.436,
"grad_norm": 768.0,
"kl_loss_13": 77.62348098754883,
"kl_loss_2": 1890.3679443359374,
"kl_loss_4": 1025.9166900634766,
"kl_loss_9": 301.8690780639648,
"learning_rate": 0.0006086070360502539,
"loss": 825.298,
"step": 4360
},
{
"ce_loss_13": 3.299925887584686,
"ce_loss_17": 3.2557137250900268,
"ce_loss_2": 4.13572096824646,
"ce_loss_4": 3.7072188019752503,
"ce_loss_9": 3.3994361639022825,
"epoch": 0.437,
"grad_norm": 840.0,
"kl_loss_13": 77.6753143310547,
"kl_loss_2": 1916.3807983398438,
"kl_loss_4": 1023.9032165527344,
"kl_loss_9": 297.97623443603516,
"learning_rate": 0.0006070577120355903,
"loss": 831.7244,
"step": 4370
},
{
"ce_loss_13": 3.295057225227356,
"ce_loss_17": 3.2547211170196535,
"ce_loss_2": 4.113643145561218,
"ce_loss_4": 3.712255358695984,
"ce_loss_9": 3.404051995277405,
"epoch": 0.438,
"grad_norm": 848.0,
"kl_loss_13": 76.0235752105713,
"kl_loss_2": 1835.069677734375,
"kl_loss_4": 1011.377474975586,
"kl_loss_9": 294.67051849365237,
"learning_rate": 0.0006055073099509549,
"loss": 816.6621,
"step": 4380
},
{
"ce_loss_13": 3.3560105204582213,
"ce_loss_17": 3.3139033555984496,
"ce_loss_2": 4.159651386737823,
"ce_loss_4": 3.754903721809387,
"ce_loss_9": 3.455800485610962,
"epoch": 0.439,
"grad_norm": 700.0,
"kl_loss_13": 77.7121353149414,
"kl_loss_2": 1852.9252380371095,
"kl_loss_4": 1004.2511535644531,
"kl_loss_9": 295.7561569213867,
"learning_rate": 0.0006039558454088796,
"loss": 824.942,
"step": 4390
},
{
"ce_loss_13": 3.329780697822571,
"ce_loss_17": 3.285024857521057,
"ce_loss_2": 4.164557564258575,
"ce_loss_4": 3.748790717124939,
"ce_loss_9": 3.4335277676582336,
"epoch": 0.44,
"grad_norm": 804.0,
"kl_loss_13": 77.75255661010742,
"kl_loss_2": 1905.2563781738281,
"kl_loss_4": 1030.5379302978515,
"kl_loss_9": 299.5456596374512,
"learning_rate": 0.0006024033340325954,
"loss": 811.2349,
"step": 4400
},
{
"ce_loss_13": 3.3937047004699705,
"ce_loss_17": 3.3539093375205993,
"ce_loss_2": 4.187200272083283,
"ce_loss_4": 3.796113383769989,
"ce_loss_9": 3.4941094875335694,
"epoch": 0.441,
"grad_norm": 804.0,
"kl_loss_13": 75.44027671813964,
"kl_loss_2": 1791.9597412109374,
"kl_loss_4": 984.2767913818359,
"kl_loss_9": 288.6034698486328,
"learning_rate": 0.0006008497914558743,
"loss": 799.1658,
"step": 4410
},
{
"ce_loss_13": 3.3351574301719666,
"ce_loss_17": 3.291703939437866,
"ce_loss_2": 4.171803283691406,
"ce_loss_4": 3.7514198541641237,
"ce_loss_9": 3.4398239731788633,
"epoch": 0.442,
"grad_norm": 848.0,
"kl_loss_13": 79.80851287841797,
"kl_loss_2": 1910.2213806152345,
"kl_loss_4": 1033.5369079589843,
"kl_loss_9": 304.91014556884767,
"learning_rate": 0.0005992952333228728,
"loss": 827.2408,
"step": 4420
},
{
"ce_loss_13": 3.2761381387710573,
"ce_loss_17": 3.234825384616852,
"ce_loss_2": 4.106686508655548,
"ce_loss_4": 3.6917160749435425,
"ce_loss_9": 3.378207635879517,
"epoch": 0.443,
"grad_norm": 964.0,
"kl_loss_13": 75.30015296936035,
"kl_loss_2": 1906.8137512207031,
"kl_loss_4": 1027.6942047119142,
"kl_loss_9": 293.7550811767578,
"learning_rate": 0.0005977396752879741,
"loss": 824.6945,
"step": 4430
},
{
"ce_loss_13": 3.2037268400192263,
"ce_loss_17": 3.1642729163169863,
"ce_loss_2": 4.042637968063355,
"ce_loss_4": 3.6258600473403932,
"ce_loss_9": 3.312602734565735,
"epoch": 0.444,
"grad_norm": 828.0,
"kl_loss_13": 74.46896743774414,
"kl_loss_2": 1914.5224304199219,
"kl_loss_4": 1037.9090118408203,
"kl_loss_9": 298.5860763549805,
"learning_rate": 0.0005961831330156305,
"loss": 819.113,
"step": 4440
},
{
"ce_loss_13": 3.3450072526931764,
"ce_loss_17": 3.3041136980056764,
"ce_loss_2": 4.193412566184998,
"ce_loss_4": 3.768865776062012,
"ce_loss_9": 3.4508578300476076,
"epoch": 0.445,
"grad_norm": 1576.0,
"kl_loss_13": 76.84782943725585,
"kl_loss_2": 1931.1100830078126,
"kl_loss_4": 1040.968194580078,
"kl_loss_9": 299.5110198974609,
"learning_rate": 0.0005946256221802051,
"loss": 841.9498,
"step": 4450
},
{
"ce_loss_13": 3.322051453590393,
"ce_loss_17": 3.2811499357223513,
"ce_loss_2": 4.1088451743125916,
"ce_loss_4": 3.7157408595085144,
"ce_loss_9": 3.4206302642822264,
"epoch": 0.446,
"grad_norm": 912.0,
"kl_loss_13": 75.70633506774902,
"kl_loss_2": 1808.1333068847657,
"kl_loss_4": 980.8681457519531,
"kl_loss_9": 288.6044303894043,
"learning_rate": 0.0005930671584658151,
"loss": 832.3179,
"step": 4460
},
{
"ce_loss_13": 3.3276433110237122,
"ce_loss_17": 3.2855786204338076,
"ce_loss_2": 4.14680814743042,
"ce_loss_4": 3.737990176677704,
"ce_loss_9": 3.429105854034424,
"epoch": 0.447,
"grad_norm": 852.0,
"kl_loss_13": 76.35856971740722,
"kl_loss_2": 1891.0329956054688,
"kl_loss_4": 1026.7948364257813,
"kl_loss_9": 295.6607940673828,
"learning_rate": 0.0005915077575661722,
"loss": 830.7908,
"step": 4470
},
{
"ce_loss_13": 3.3396317839622496,
"ce_loss_17": 3.295839321613312,
"ce_loss_2": 4.168763411045075,
"ce_loss_4": 3.7516340255737304,
"ce_loss_9": 3.442499566078186,
"epoch": 0.448,
"grad_norm": 808.0,
"kl_loss_13": 79.23018226623535,
"kl_loss_2": 1906.8447692871093,
"kl_loss_4": 1035.3365295410156,
"kl_loss_9": 304.0178924560547,
"learning_rate": 0.000589947435184427,
"loss": 816.7979,
"step": 4480
},
{
"ce_loss_13": 3.4035409450531007,
"ce_loss_17": 3.3630342364311216,
"ce_loss_2": 4.180931556224823,
"ce_loss_4": 3.7933268666267397,
"ce_loss_9": 3.5012203454971313,
"epoch": 0.449,
"grad_norm": 824.0,
"kl_loss_13": 78.1723731994629,
"kl_loss_2": 1819.143408203125,
"kl_loss_4": 1002.5709869384766,
"kl_loss_9": 295.7471405029297,
"learning_rate": 0.0005883862070330078,
"loss": 811.6958,
"step": 4490
},
{
"ce_loss_13": 3.3434889793395994,
"ce_loss_17": 3.300837182998657,
"ce_loss_2": 4.154209268093109,
"ce_loss_4": 3.7567609906196595,
"ce_loss_9": 3.4478923201560976,
"epoch": 0.45,
"grad_norm": 884.0,
"kl_loss_13": 77.33296165466308,
"kl_loss_2": 1873.624853515625,
"kl_loss_4": 1029.102764892578,
"kl_loss_9": 299.1194305419922,
"learning_rate": 0.0005868240888334653,
"loss": 812.2415,
"step": 4500
},
{
"ce_loss_13": 3.2234660267829893,
"ce_loss_17": 3.1812546253204346,
"ce_loss_2": 4.079589867591858,
"ce_loss_4": 3.6489335775375364,
"ce_loss_9": 3.3286787033081056,
"epoch": 0.451,
"grad_norm": 1008.0,
"kl_loss_13": 76.51410064697265,
"kl_loss_2": 1931.3193603515624,
"kl_loss_4": 1046.2738800048828,
"kl_loss_9": 303.19763259887696,
"learning_rate": 0.0005852610963163119,
"loss": 831.9842,
"step": 4510
},
{
"ce_loss_13": 3.2492877006530763,
"ce_loss_17": 3.209109592437744,
"ce_loss_2": 4.060594689846039,
"ce_loss_4": 3.654378688335419,
"ce_loss_9": 3.349640953540802,
"epoch": 0.452,
"grad_norm": 940.0,
"kl_loss_13": 74.64283828735351,
"kl_loss_2": 1859.8475830078125,
"kl_loss_4": 1011.8156707763671,
"kl_loss_9": 294.5323791503906,
"learning_rate": 0.0005836972452208654,
"loss": 804.4784,
"step": 4520
},
{
"ce_loss_13": 3.2525294959545135,
"ce_loss_17": 3.2113110303878782,
"ce_loss_2": 4.088192105293274,
"ce_loss_4": 3.667824113368988,
"ce_loss_9": 3.3553738236427306,
"epoch": 0.453,
"grad_norm": 864.0,
"kl_loss_13": 76.30174942016602,
"kl_loss_2": 1897.0127502441405,
"kl_loss_4": 1024.7700744628905,
"kl_loss_9": 298.32971572875977,
"learning_rate": 0.0005821325512950885,
"loss": 824.1601,
"step": 4530
},
{
"ce_loss_13": 3.279394602775574,
"ce_loss_17": 3.2389808893203735,
"ce_loss_2": 4.09222549200058,
"ce_loss_4": 3.687285912036896,
"ce_loss_9": 3.3819608807563784,
"epoch": 0.454,
"grad_norm": 780.0,
"kl_loss_13": 74.611399269104,
"kl_loss_2": 1832.5005798339844,
"kl_loss_4": 993.5156280517579,
"kl_loss_9": 288.8341064453125,
"learning_rate": 0.0005805670302954321,
"loss": 815.0844,
"step": 4540
},
{
"ce_loss_13": 3.288856554031372,
"ce_loss_17": 3.2489148497581484,
"ce_loss_2": 4.091669178009033,
"ce_loss_4": 3.6877204060554503,
"ce_loss_9": 3.3897912740707397,
"epoch": 0.455,
"grad_norm": 828.0,
"kl_loss_13": 73.65193538665771,
"kl_loss_2": 1845.4981689453125,
"kl_loss_4": 1002.7583282470703,
"kl_loss_9": 293.07848052978517,
"learning_rate": 0.000579000697986675,
"loss": 804.0725,
"step": 4550
},
{
"ce_loss_13": 3.2434154629707335,
"ce_loss_17": 3.199486696720123,
"ce_loss_2": 4.098711669445038,
"ce_loss_4": 3.681813371181488,
"ce_loss_9": 3.3549178719520567,
"epoch": 0.456,
"grad_norm": 744.0,
"kl_loss_13": 77.70943298339844,
"kl_loss_2": 1935.5743041992187,
"kl_loss_4": 1063.0270935058593,
"kl_loss_9": 307.4092712402344,
"learning_rate": 0.0005774335701417662,
"loss": 828.9596,
"step": 4560
},
{
"ce_loss_13": 3.2368232369422913,
"ce_loss_17": 3.196345341205597,
"ce_loss_2": 4.089473974704743,
"ce_loss_4": 3.661019968986511,
"ce_loss_9": 3.3422934889793394,
"epoch": 0.457,
"grad_norm": 816.0,
"kl_loss_13": 73.5301586151123,
"kl_loss_2": 1949.0679992675782,
"kl_loss_4": 1040.8331512451173,
"kl_loss_9": 298.1072036743164,
"learning_rate": 0.0005758656625416658,
"loss": 832.1058,
"step": 4570
},
{
"ce_loss_13": 3.2876862049102784,
"ce_loss_17": 3.2457969188690186,
"ce_loss_2": 4.112439560890198,
"ce_loss_4": 3.695971500873566,
"ce_loss_9": 3.3895508646965027,
"epoch": 0.458,
"grad_norm": 776.0,
"kl_loss_13": 75.88962821960449,
"kl_loss_2": 1860.5553161621094,
"kl_loss_4": 1020.166015625,
"kl_loss_9": 297.5071060180664,
"learning_rate": 0.0005742969909751859,
"loss": 804.1062,
"step": 4580
},
{
"ce_loss_13": 3.3020657181739805,
"ce_loss_17": 3.2607773661613466,
"ce_loss_2": 4.131641614437103,
"ce_loss_4": 3.709846580028534,
"ce_loss_9": 3.405165731906891,
"epoch": 0.459,
"grad_norm": 1048.0,
"kl_loss_13": 75.65441131591797,
"kl_loss_2": 1891.6655334472657,
"kl_loss_4": 1009.7342956542968,
"kl_loss_9": 294.8964126586914,
"learning_rate": 0.0005727275712388318,
"loss": 823.5099,
"step": 4590
},
{
"ce_loss_13": 3.3285714864730833,
"ce_loss_17": 3.287725341320038,
"ce_loss_2": 4.124272167682648,
"ce_loss_4": 3.7236576795578005,
"ce_loss_9": 3.430411958694458,
"epoch": 0.46,
"grad_norm": 956.0,
"kl_loss_13": 75.18080139160156,
"kl_loss_2": 1820.7663513183593,
"kl_loss_4": 986.5203704833984,
"kl_loss_9": 289.905224609375,
"learning_rate": 0.0005711574191366427,
"loss": 804.5652,
"step": 4600
},
{
"ce_loss_13": 3.2801861643791197,
"ce_loss_17": 3.2401907563209535,
"ce_loss_2": 4.089137363433838,
"ce_loss_4": 3.684733271598816,
"ce_loss_9": 3.379390871524811,
"epoch": 0.461,
"grad_norm": 688.0,
"kl_loss_13": 75.0859489440918,
"kl_loss_2": 1857.538348388672,
"kl_loss_4": 999.1473907470703,
"kl_loss_9": 290.6371444702148,
"learning_rate": 0.0005695865504800327,
"loss": 803.9468,
"step": 4610
},
{
"ce_loss_13": 3.214577782154083,
"ce_loss_17": 3.171001160144806,
"ce_loss_2": 4.123684239387512,
"ce_loss_4": 3.668971860408783,
"ce_loss_9": 3.325447380542755,
"epoch": 0.462,
"grad_norm": 968.0,
"kl_loss_13": 77.4129825592041,
"kl_loss_2": 2041.3165405273437,
"kl_loss_4": 1094.7594757080078,
"kl_loss_9": 311.20372467041017,
"learning_rate": 0.0005680149810876322,
"loss": 845.5946,
"step": 4620
},
{
"ce_loss_13": 3.2732278943061828,
"ce_loss_17": 3.2324661135673525,
"ce_loss_2": 4.097072160243988,
"ce_loss_4": 3.676388144493103,
"ce_loss_9": 3.374810588359833,
"epoch": 0.463,
"grad_norm": 776.0,
"kl_loss_13": 74.99469413757325,
"kl_loss_2": 1880.4112670898437,
"kl_loss_4": 1009.9831604003906,
"kl_loss_9": 291.9598579406738,
"learning_rate": 0.0005664427267851271,
"loss": 808.8515,
"step": 4630
},
{
"ce_loss_13": 3.1931955575942994,
"ce_loss_17": 3.1502653479576113,
"ce_loss_2": 4.024770927429199,
"ce_loss_4": 3.610238516330719,
"ce_loss_9": 3.297864890098572,
"epoch": 0.464,
"grad_norm": 1000.0,
"kl_loss_13": 74.38558692932129,
"kl_loss_2": 1880.835107421875,
"kl_loss_4": 1015.0424072265625,
"kl_loss_9": 292.6310218811035,
"learning_rate": 0.0005648698034051009,
"loss": 812.3297,
"step": 4640
},
{
"ce_loss_13": 3.3028565406799317,
"ce_loss_17": 3.259902000427246,
"ce_loss_2": 4.152322435379029,
"ce_loss_4": 3.719174313545227,
"ce_loss_9": 3.406006133556366,
"epoch": 0.465,
"grad_norm": 760.0,
"kl_loss_13": 75.3498405456543,
"kl_loss_2": 1921.487548828125,
"kl_loss_4": 1028.3840270996093,
"kl_loss_9": 291.388444519043,
"learning_rate": 0.0005632962267868747,
"loss": 807.8761,
"step": 4650
},
{
"ce_loss_13": 3.2430774450302122,
"ce_loss_17": 3.2033018827438355,
"ce_loss_2": 4.058165490627289,
"ce_loss_4": 3.6541751623153687,
"ce_loss_9": 3.3456701397895814,
"epoch": 0.466,
"grad_norm": 840.0,
"kl_loss_13": 73.02407569885254,
"kl_loss_2": 1852.6263793945313,
"kl_loss_4": 1005.7894439697266,
"kl_loss_9": 287.7598388671875,
"learning_rate": 0.0005617220127763474,
"loss": 814.0826,
"step": 4660
},
{
"ce_loss_13": 3.3238396286964416,
"ce_loss_17": 3.281664049625397,
"ce_loss_2": 4.129569494724274,
"ce_loss_4": 3.7306875705718996,
"ce_loss_9": 3.4238651275634764,
"epoch": 0.467,
"grad_norm": 924.0,
"kl_loss_13": 75.31039695739746,
"kl_loss_2": 1845.7906188964844,
"kl_loss_4": 1007.4520416259766,
"kl_loss_9": 292.87747955322266,
"learning_rate": 0.0005601471772258368,
"loss": 814.8657,
"step": 4670
},
{
"ce_loss_13": 3.3064895629882813,
"ce_loss_17": 3.2665050864219665,
"ce_loss_2": 4.108851706981659,
"ce_loss_4": 3.703833544254303,
"ce_loss_9": 3.4077421545982363,
"epoch": 0.468,
"grad_norm": 964.0,
"kl_loss_13": 74.9879051208496,
"kl_loss_2": 1819.450994873047,
"kl_loss_4": 974.3571472167969,
"kl_loss_9": 288.794450378418,
"learning_rate": 0.0005585717359939192,
"loss": 812.0005,
"step": 4680
},
{
"ce_loss_13": 3.2169051647186278,
"ce_loss_17": 3.1767580628395082,
"ce_loss_2": 4.030018448829651,
"ce_loss_4": 3.6190481901168825,
"ce_loss_9": 3.3161608457565306,
"epoch": 0.469,
"grad_norm": 1032.0,
"kl_loss_13": 73.6907470703125,
"kl_loss_2": 1837.1917602539063,
"kl_loss_4": 995.9050445556641,
"kl_loss_9": 288.79329681396484,
"learning_rate": 0.0005569957049452703,
"loss": 820.1777,
"step": 4690
},
{
"ce_loss_13": 3.2727556824684143,
"ce_loss_17": 3.233135461807251,
"ce_loss_2": 4.109477472305298,
"ce_loss_4": 3.689884305000305,
"ce_loss_9": 3.3748852491378782,
"epoch": 0.47,
"grad_norm": 856.0,
"kl_loss_13": 76.08234329223633,
"kl_loss_2": 1908.498553466797,
"kl_loss_4": 1032.0423736572266,
"kl_loss_9": 300.4012725830078,
"learning_rate": 0.0005554190999505056,
"loss": 825.159,
"step": 4700
},
{
"ce_loss_13": 3.3975147128105165,
"ce_loss_17": 3.3547534942626953,
"ce_loss_2": 4.215018939971924,
"ce_loss_4": 3.8030583024024964,
"ce_loss_9": 3.4997135996818542,
"epoch": 0.471,
"grad_norm": 660.0,
"kl_loss_13": 77.31126403808594,
"kl_loss_2": 1889.2184509277345,
"kl_loss_4": 1020.566683959961,
"kl_loss_9": 302.34742279052733,
"learning_rate": 0.0005538419368860196,
"loss": 794.3334,
"step": 4710
},
{
"ce_loss_13": 3.3169658899307253,
"ce_loss_17": 3.277206563949585,
"ce_loss_2": 4.13512532711029,
"ce_loss_4": 3.7266653776168823,
"ce_loss_9": 3.4202173471450807,
"epoch": 0.472,
"grad_norm": 732.0,
"kl_loss_13": 76.7188003540039,
"kl_loss_2": 1864.9093627929688,
"kl_loss_4": 1011.0238464355468,
"kl_loss_9": 296.90992279052733,
"learning_rate": 0.0005522642316338268,
"loss": 827.9762,
"step": 4720
},
{
"ce_loss_13": 3.3335940837860107,
"ce_loss_17": 3.2930097460746763,
"ce_loss_2": 4.1359561562538145,
"ce_loss_4": 3.7283730030059816,
"ce_loss_9": 3.433068335056305,
"epoch": 0.473,
"grad_norm": 1032.0,
"kl_loss_13": 76.10838012695312,
"kl_loss_2": 1853.8134521484376,
"kl_loss_4": 1004.4875946044922,
"kl_loss_9": 296.6020965576172,
"learning_rate": 0.0005506860000814017,
"loss": 829.242,
"step": 4730
},
{
"ce_loss_13": 3.355704641342163,
"ce_loss_17": 3.3165703058242797,
"ce_loss_2": 4.14022581577301,
"ce_loss_4": 3.7492562770843505,
"ce_loss_9": 3.4540576934814453,
"epoch": 0.474,
"grad_norm": 952.0,
"kl_loss_13": 74.14773750305176,
"kl_loss_2": 1809.160467529297,
"kl_loss_4": 994.4486236572266,
"kl_loss_9": 289.21658477783205,
"learning_rate": 0.0005491072581215186,
"loss": 810.8421,
"step": 4740
},
{
"ce_loss_13": 3.347885513305664,
"ce_loss_17": 3.305254805088043,
"ce_loss_2": 4.148220801353455,
"ce_loss_4": 3.7457194209098814,
"ce_loss_9": 3.4504737854003906,
"epoch": 0.475,
"grad_norm": 956.0,
"kl_loss_13": 76.87275047302246,
"kl_loss_2": 1862.5593688964843,
"kl_loss_4": 1007.2896026611328,
"kl_loss_9": 299.8138198852539,
"learning_rate": 0.0005475280216520913,
"loss": 793.7902,
"step": 4750
},
{
"ce_loss_13": 3.2711644768714905,
"ce_loss_17": 3.2309195160865785,
"ce_loss_2": 4.070014572143554,
"ce_loss_4": 3.6676210045814512,
"ce_loss_9": 3.3719236969947817,
"epoch": 0.476,
"grad_norm": 752.0,
"kl_loss_13": 73.26801719665528,
"kl_loss_2": 1814.8681579589843,
"kl_loss_4": 981.7904296875,
"kl_loss_9": 286.1250335693359,
"learning_rate": 0.0005459483065760138,
"loss": 816.5074,
"step": 4760
},
{
"ce_loss_13": 3.2079075932502747,
"ce_loss_17": 3.1671441316604616,
"ce_loss_2": 4.089412212371826,
"ce_loss_4": 3.642701601982117,
"ce_loss_9": 3.309967613220215,
"epoch": 0.477,
"grad_norm": 884.0,
"kl_loss_13": 74.20464248657227,
"kl_loss_2": 1979.9038391113281,
"kl_loss_4": 1055.5522186279297,
"kl_loss_9": 293.0203689575195,
"learning_rate": 0.0005443681288009991,
"loss": 826.5146,
"step": 4770
},
{
"ce_loss_13": 3.266829860210419,
"ce_loss_17": 3.2285204768180846,
"ce_loss_2": 4.086682987213135,
"ce_loss_4": 3.6759202241897584,
"ce_loss_9": 3.36565181016922,
"epoch": 0.478,
"grad_norm": 672.0,
"kl_loss_13": 73.82023429870605,
"kl_loss_2": 1879.6912658691406,
"kl_loss_4": 1018.4355865478516,
"kl_loss_9": 290.7284698486328,
"learning_rate": 0.0005427875042394199,
"loss": 820.2439,
"step": 4780
},
{
"ce_loss_13": 3.2985992431640625,
"ce_loss_17": 3.2553022503852844,
"ce_loss_2": 4.103298151493073,
"ce_loss_4": 3.704231595993042,
"ce_loss_9": 3.3976196885108947,
"epoch": 0.479,
"grad_norm": 764.0,
"kl_loss_13": 74.87098045349121,
"kl_loss_2": 1833.0201904296875,
"kl_loss_4": 1016.7945831298828,
"kl_loss_9": 290.94402770996095,
"learning_rate": 0.0005412064488081482,
"loss": 817.3122,
"step": 4790
},
{
"ce_loss_13": 3.3004804491996764,
"ce_loss_17": 3.260096788406372,
"ce_loss_2": 4.096733212471008,
"ce_loss_4": 3.698592507839203,
"ce_loss_9": 3.3992846846580504,
"epoch": 0.48,
"grad_norm": 1072.0,
"kl_loss_13": 73.09108085632325,
"kl_loss_2": 1830.7623352050782,
"kl_loss_4": 989.9618072509766,
"kl_loss_9": 285.9252304077148,
"learning_rate": 0.0005396249784283942,
"loss": 797.4109,
"step": 4800
},
{
"ce_loss_13": 3.316614365577698,
"ce_loss_17": 3.2748780608177186,
"ce_loss_2": 4.164385390281677,
"ce_loss_4": 3.7438156366348267,
"ce_loss_9": 3.4193318128585815,
"epoch": 0.481,
"grad_norm": 760.0,
"kl_loss_13": 76.38266716003417,
"kl_loss_2": 1923.5742980957032,
"kl_loss_4": 1041.6750427246093,
"kl_loss_9": 298.795263671875,
"learning_rate": 0.0005380431090255476,
"loss": 824.89,
"step": 4810
},
{
"ce_loss_13": 3.3142627000808718,
"ce_loss_17": 3.275036060810089,
"ce_loss_2": 4.110712015628815,
"ce_loss_4": 3.7121946454048156,
"ce_loss_9": 3.4122665405273436,
"epoch": 0.482,
"grad_norm": 808.0,
"kl_loss_13": 72.91415100097656,
"kl_loss_2": 1832.0138732910157,
"kl_loss_4": 991.3748962402344,
"kl_loss_9": 284.617170715332,
"learning_rate": 0.0005364608565290155,
"loss": 798.3713,
"step": 4820
},
{
"ce_loss_13": 3.3208075881004335,
"ce_loss_17": 3.2798139452934265,
"ce_loss_2": 4.140064144134522,
"ce_loss_4": 3.727204477787018,
"ce_loss_9": 3.4234581351280213,
"epoch": 0.483,
"grad_norm": 800.0,
"kl_loss_13": 75.80227832794189,
"kl_loss_2": 1862.9129943847656,
"kl_loss_4": 1001.9588317871094,
"kl_loss_9": 291.62636489868163,
"learning_rate": 0.0005348782368720626,
"loss": 808.1032,
"step": 4830
},
{
"ce_loss_13": 3.252118909358978,
"ce_loss_17": 3.2115795969963075,
"ce_loss_2": 4.059725773334503,
"ce_loss_4": 3.6502609133720396,
"ce_loss_9": 3.353605532646179,
"epoch": 0.484,
"grad_norm": 784.0,
"kl_loss_13": 72.63689727783203,
"kl_loss_2": 1815.8565368652344,
"kl_loss_4": 977.6638732910156,
"kl_loss_9": 285.4698852539062,
"learning_rate": 0.000533295265991652,
"loss": 804.2033,
"step": 4840
},
{
"ce_loss_13": 3.3277907848358153,
"ce_loss_17": 3.2861191630363464,
"ce_loss_2": 4.117743873596192,
"ce_loss_4": 3.725143051147461,
"ce_loss_9": 3.4288668394088746,
"epoch": 0.485,
"grad_norm": 836.0,
"kl_loss_13": 73.85648956298829,
"kl_loss_2": 1807.0551513671876,
"kl_loss_4": 990.4627105712891,
"kl_loss_9": 289.78437957763674,
"learning_rate": 0.0005317119598282822,
"loss": 794.5045,
"step": 4850
},
{
"ce_loss_13": 3.3274188876152038,
"ce_loss_17": 3.2868924021720884,
"ce_loss_2": 4.134918856620788,
"ce_loss_4": 3.7305999279022215,
"ce_loss_9": 3.4288418531417846,
"epoch": 0.486,
"grad_norm": 1352.0,
"kl_loss_13": 75.01168785095214,
"kl_loss_2": 1836.2504455566407,
"kl_loss_4": 1000.2125915527344,
"kl_loss_9": 291.90661773681643,
"learning_rate": 0.0005301283343258293,
"loss": 804.1091,
"step": 4860
},
{
"ce_loss_13": 3.3871153354644776,
"ce_loss_17": 3.3473598957061768,
"ce_loss_2": 4.170122230052948,
"ce_loss_4": 3.782205104827881,
"ce_loss_9": 3.4865453600883485,
"epoch": 0.487,
"grad_norm": 928.0,
"kl_loss_13": 75.0543228149414,
"kl_loss_2": 1801.48056640625,
"kl_loss_4": 986.4563842773438,
"kl_loss_9": 290.7179382324219,
"learning_rate": 0.000528544405431384,
"loss": 791.7803,
"step": 4870
},
{
"ce_loss_13": 3.270020830631256,
"ce_loss_17": 3.2277884244918824,
"ce_loss_2": 4.086709308624267,
"ce_loss_4": 3.6876027822494506,
"ce_loss_9": 3.3727181911468507,
"epoch": 0.488,
"grad_norm": 820.0,
"kl_loss_13": 75.41398048400879,
"kl_loss_2": 1875.3604064941405,
"kl_loss_4": 1033.8077758789063,
"kl_loss_9": 298.76770172119143,
"learning_rate": 0.000526960189095093,
"loss": 817.7302,
"step": 4880
},
{
"ce_loss_13": 3.2510303258895874,
"ce_loss_17": 3.211883878707886,
"ce_loss_2": 4.061688530445099,
"ce_loss_4": 3.65454957485199,
"ce_loss_9": 3.3511975765228272,
"epoch": 0.489,
"grad_norm": 888.0,
"kl_loss_13": 72.43558959960937,
"kl_loss_2": 1828.6154235839845,
"kl_loss_4": 991.6234741210938,
"kl_loss_9": 285.3493843078613,
"learning_rate": 0.0005253757012699972,
"loss": 798.6789,
"step": 4890
},
{
"ce_loss_13": 3.3291762590408327,
"ce_loss_17": 3.290039026737213,
"ce_loss_2": 4.127644944190979,
"ce_loss_4": 3.724576246738434,
"ce_loss_9": 3.4292059659957888,
"epoch": 0.49,
"grad_norm": 740.0,
"kl_loss_13": 73.65130043029785,
"kl_loss_2": 1827.6836364746093,
"kl_loss_4": 992.8492248535156,
"kl_loss_9": 288.521134185791,
"learning_rate": 0.0005237909579118712,
"loss": 811.9515,
"step": 4900
},
{
"ce_loss_13": 3.2921934723854065,
"ce_loss_17": 3.249121403694153,
"ce_loss_2": 4.118033158779144,
"ce_loss_4": 3.7062087774276735,
"ce_loss_9": 3.3943717956542967,
"epoch": 0.491,
"grad_norm": 756.0,
"kl_loss_13": 75.26352424621582,
"kl_loss_2": 1878.7477905273438,
"kl_loss_4": 1020.291845703125,
"kl_loss_9": 296.14775390625,
"learning_rate": 0.0005222059749790631,
"loss": 816.4149,
"step": 4910
},
{
"ce_loss_13": 3.3588847875595094,
"ce_loss_17": 3.318212938308716,
"ce_loss_2": 4.122260499000549,
"ce_loss_4": 3.7380555152893065,
"ce_loss_9": 3.456986737251282,
"epoch": 0.492,
"grad_norm": 720.0,
"kl_loss_13": 74.36726608276368,
"kl_loss_2": 1770.406268310547,
"kl_loss_4": 970.4561462402344,
"kl_loss_9": 286.1612060546875,
"learning_rate": 0.0005206207684323337,
"loss": 776.1451,
"step": 4920
},
{
"ce_loss_13": 3.3389157891273498,
"ce_loss_17": 3.2978055119514464,
"ce_loss_2": 4.135409045219421,
"ce_loss_4": 3.736425042152405,
"ce_loss_9": 3.439645564556122,
"epoch": 0.493,
"grad_norm": 920.0,
"kl_loss_13": 75.09701538085938,
"kl_loss_2": 1837.3790893554688,
"kl_loss_4": 992.6164123535157,
"kl_loss_9": 291.5115341186523,
"learning_rate": 0.000519035354234695,
"loss": 816.9218,
"step": 4930
},
{
"ce_loss_13": 3.314695417881012,
"ce_loss_17": 3.2720402002334597,
"ce_loss_2": 4.1193211555480955,
"ce_loss_4": 3.7305904626846313,
"ce_loss_9": 3.4197781324386596,
"epoch": 0.494,
"grad_norm": 800.0,
"kl_loss_13": 75.44659748077393,
"kl_loss_2": 1821.7885375976562,
"kl_loss_4": 1008.4688659667969,
"kl_loss_9": 292.37608642578124,
"learning_rate": 0.0005174497483512506,
"loss": 792.1003,
"step": 4940
},
{
"ce_loss_13": 3.3612324833869933,
"ce_loss_17": 3.3229804396629334,
"ce_loss_2": 4.14638044834137,
"ce_loss_4": 3.750081944465637,
"ce_loss_9": 3.457711327075958,
"epoch": 0.495,
"grad_norm": 800.0,
"kl_loss_13": 74.3962791442871,
"kl_loss_2": 1826.6995910644532,
"kl_loss_4": 992.9568328857422,
"kl_loss_9": 289.2128746032715,
"learning_rate": 0.0005158639667490339,
"loss": 814.1679,
"step": 4950
},
{
"ce_loss_13": 3.2649693608284,
"ce_loss_17": 3.224947285652161,
"ce_loss_2": 4.075822901725769,
"ce_loss_4": 3.669500434398651,
"ce_loss_9": 3.3687703609466553,
"epoch": 0.496,
"grad_norm": 1104.0,
"kl_loss_13": 74.10564365386963,
"kl_loss_2": 1856.5388305664062,
"kl_loss_4": 1008.420068359375,
"kl_loss_9": 294.5977310180664,
"learning_rate": 0.0005142780253968481,
"loss": 804.6434,
"step": 4960
},
{
"ce_loss_13": 3.2142897129058836,
"ce_loss_17": 3.174994421005249,
"ce_loss_2": 4.017913889884949,
"ce_loss_4": 3.6049700498580934,
"ce_loss_9": 3.312189483642578,
"epoch": 0.497,
"grad_norm": 976.0,
"kl_loss_13": 71.54103832244873,
"kl_loss_2": 1836.7127624511718,
"kl_loss_4": 976.3125457763672,
"kl_loss_9": 280.7961006164551,
"learning_rate": 0.0005126919402651053,
"loss": 781.3013,
"step": 4970
},
{
"ce_loss_13": 3.278981614112854,
"ce_loss_17": 3.2370316624641418,
"ce_loss_2": 4.111584794521332,
"ce_loss_4": 3.703745257854462,
"ce_loss_9": 3.381889748573303,
"epoch": 0.498,
"grad_norm": 844.0,
"kl_loss_13": 76.156498336792,
"kl_loss_2": 1858.4030456542969,
"kl_loss_4": 1016.5751037597656,
"kl_loss_9": 293.5657470703125,
"learning_rate": 0.0005111057273256647,
"loss": 810.4161,
"step": 4980
},
{
"ce_loss_13": 3.382327103614807,
"ce_loss_17": 3.343675124645233,
"ce_loss_2": 4.123418486118316,
"ce_loss_4": 3.752922296524048,
"ce_loss_9": 3.4781156182289124,
"epoch": 0.499,
"grad_norm": 700.0,
"kl_loss_13": 72.45312232971192,
"kl_loss_2": 1723.856219482422,
"kl_loss_4": 949.0105102539062,
"kl_loss_9": 278.47458724975587,
"learning_rate": 0.0005095194025516733,
"loss": 772.5709,
"step": 4990
},
{
"ce_loss_13": 3.308869647979736,
"ce_loss_17": 3.270881199836731,
"ce_loss_2": 4.0973071455955505,
"ce_loss_4": 3.697914254665375,
"ce_loss_9": 3.404145121574402,
"epoch": 0.5,
"grad_norm": 844.0,
"kl_loss_13": 72.70045700073243,
"kl_loss_2": 1794.5357177734375,
"kl_loss_4": 970.1500518798828,
"kl_loss_9": 283.13077850341796,
"learning_rate": 0.000507932981917404,
"loss": 812.2307,
"step": 5000
},
{
"ce_loss_13": 3.262911152839661,
"ce_loss_17": 3.2198340773582457,
"ce_loss_2": 4.109212005138398,
"ce_loss_4": 3.6863863348960875,
"ce_loss_9": 3.3666589736938475,
"epoch": 0.501,
"grad_norm": 1080.0,
"kl_loss_13": 77.47729015350342,
"kl_loss_2": 1929.3942321777345,
"kl_loss_4": 1039.4493103027344,
"kl_loss_9": 300.106640625,
"learning_rate": 0.0005063464813980949,
"loss": 830.4402,
"step": 5010
},
{
"ce_loss_13": 3.2479494333267214,
"ce_loss_17": 3.2078561663627623,
"ce_loss_2": 4.062610566616058,
"ce_loss_4": 3.6508336186409,
"ce_loss_9": 3.343203544616699,
"epoch": 0.502,
"grad_norm": 672.0,
"kl_loss_13": 73.65824165344239,
"kl_loss_2": 1878.8285278320313,
"kl_loss_4": 1018.11328125,
"kl_loss_9": 289.3094146728516,
"learning_rate": 0.0005047599169697884,
"loss": 805.0937,
"step": 5020
},
{
"ce_loss_13": 3.185950815677643,
"ce_loss_17": 3.14703825712204,
"ce_loss_2": 4.00815167427063,
"ce_loss_4": 3.5929755568504333,
"ce_loss_9": 3.288442540168762,
"epoch": 0.503,
"grad_norm": 1088.0,
"kl_loss_13": 71.29309349060058,
"kl_loss_2": 1850.965869140625,
"kl_loss_4": 994.7696960449218,
"kl_loss_9": 285.5740135192871,
"learning_rate": 0.000503173304609171,
"loss": 785.2191,
"step": 5030
},
{
"ce_loss_13": 3.305916726589203,
"ce_loss_17": 3.266102612018585,
"ce_loss_2": 4.113384175300598,
"ce_loss_4": 3.70848388671875,
"ce_loss_9": 3.4055826902389525,
"epoch": 0.504,
"grad_norm": 888.0,
"kl_loss_13": 73.97796363830567,
"kl_loss_2": 1829.7077392578126,
"kl_loss_4": 996.4985748291016,
"kl_loss_9": 288.2816551208496,
"learning_rate": 0.0005015866602934111,
"loss": 787.8277,
"step": 5040
},
{
"ce_loss_13": 3.2743287205696108,
"ce_loss_17": 3.233179819583893,
"ce_loss_2": 4.111311686038971,
"ce_loss_4": 3.698232448101044,
"ce_loss_9": 3.3796739101409914,
"epoch": 0.505,
"grad_norm": 780.0,
"kl_loss_13": 76.13735866546631,
"kl_loss_2": 1898.339990234375,
"kl_loss_4": 1037.7702270507812,
"kl_loss_9": 300.18348693847656,
"learning_rate": 0.0005,
"loss": 813.3053,
"step": 5050
},
{
"ce_loss_13": 3.2703390598297117,
"ce_loss_17": 3.2306999921798707,
"ce_loss_2": 4.07856274843216,
"ce_loss_4": 3.668815791606903,
"ce_loss_9": 3.3710572361946105,
"epoch": 0.506,
"grad_norm": 856.0,
"kl_loss_13": 75.43934326171875,
"kl_loss_2": 1849.766552734375,
"kl_loss_4": 1009.7808013916016,
"kl_loss_9": 293.8470733642578,
"learning_rate": 0.0004984133397065889,
"loss": 794.9061,
"step": 5060
},
{
"ce_loss_13": 3.272858726978302,
"ce_loss_17": 3.2313432335853576,
"ce_loss_2": 4.103818774223328,
"ce_loss_4": 3.7008283495903016,
"ce_loss_9": 3.37606999874115,
"epoch": 0.507,
"grad_norm": 752.0,
"kl_loss_13": 74.73295059204102,
"kl_loss_2": 1863.9148071289062,
"kl_loss_4": 1017.3056182861328,
"kl_loss_9": 292.6587875366211,
"learning_rate": 0.0004968266953908291,
"loss": 795.0145,
"step": 5070
},
{
"ce_loss_13": 3.3145520091056824,
"ce_loss_17": 3.2744415640830993,
"ce_loss_2": 4.133978307247162,
"ce_loss_4": 3.7164300322532653,
"ce_loss_9": 3.4116795897483825,
"epoch": 0.508,
"grad_norm": 820.0,
"kl_loss_13": 74.0131664276123,
"kl_loss_2": 1865.4527099609375,
"kl_loss_4": 999.7883392333985,
"kl_loss_9": 286.20834197998045,
"learning_rate": 0.0004952400830302117,
"loss": 803.6775,
"step": 5080
},
{
"ce_loss_13": 3.2411397218704225,
"ce_loss_17": 3.200059103965759,
"ce_loss_2": 4.0809555649757385,
"ce_loss_4": 3.655611753463745,
"ce_loss_9": 3.343540573120117,
"epoch": 0.509,
"grad_norm": 916.0,
"kl_loss_13": 75.35558547973633,
"kl_loss_2": 1893.0482055664063,
"kl_loss_4": 1019.6383666992188,
"kl_loss_9": 296.9830749511719,
"learning_rate": 0.0004936535186019053,
"loss": 804.5086,
"step": 5090
},
{
"ce_loss_13": 3.3392978429794313,
"ce_loss_17": 3.300644409656525,
"ce_loss_2": 4.120908486843109,
"ce_loss_4": 3.7291369080543517,
"ce_loss_9": 3.4352852582931517,
"epoch": 0.51,
"grad_norm": 676.0,
"kl_loss_13": 72.64418449401856,
"kl_loss_2": 1780.3622009277344,
"kl_loss_4": 962.9150421142579,
"kl_loss_9": 280.7718734741211,
"learning_rate": 0.000492067018082596,
"loss": 785.5723,
"step": 5100
},
{
"ce_loss_13": 3.2748634576797486,
"ce_loss_17": 3.233682465553284,
"ce_loss_2": 4.130126976966858,
"ce_loss_4": 3.698560190200806,
"ce_loss_9": 3.3825439453125,
"epoch": 0.511,
"grad_norm": 844.0,
"kl_loss_13": 75.8413314819336,
"kl_loss_2": 1924.5922424316407,
"kl_loss_4": 1034.881884765625,
"kl_loss_9": 296.8105667114258,
"learning_rate": 0.0004904805974483267,
"loss": 832.7997,
"step": 5110
},
{
"ce_loss_13": 3.3867220759391783,
"ce_loss_17": 3.3429943084716798,
"ce_loss_2": 4.215401363372803,
"ce_loss_4": 3.8140215635299684,
"ce_loss_9": 3.4924907803535463,
"epoch": 0.512,
"grad_norm": 836.0,
"kl_loss_13": 79.06632575988769,
"kl_loss_2": 1902.0935119628907,
"kl_loss_4": 1052.306689453125,
"kl_loss_9": 305.49594650268557,
"learning_rate": 0.0004888942726743353,
"loss": 841.9122,
"step": 5120
},
{
"ce_loss_13": 3.260837697982788,
"ce_loss_17": 3.2186120748519897,
"ce_loss_2": 4.087520980834961,
"ce_loss_4": 3.6748701691627503,
"ce_loss_9": 3.3637019872665403,
"epoch": 0.513,
"grad_norm": 748.0,
"kl_loss_13": 74.35917491912842,
"kl_loss_2": 1883.8236267089844,
"kl_loss_4": 1020.7645202636719,
"kl_loss_9": 293.17310638427733,
"learning_rate": 0.0004873080597348947,
"loss": 815.0167,
"step": 5130
},
{
"ce_loss_13": 3.1523805379867555,
"ce_loss_17": 3.1087000250816343,
"ce_loss_2": 4.018757474422455,
"ce_loss_4": 3.583875072002411,
"ce_loss_9": 3.2562185406684874,
"epoch": 0.514,
"grad_norm": 780.0,
"kl_loss_13": 74.4741828918457,
"kl_loss_2": 1964.23408203125,
"kl_loss_4": 1050.2632995605468,
"kl_loss_9": 292.9966156005859,
"learning_rate": 0.0004857219746031519,
"loss": 820.3717,
"step": 5140
},
{
"ce_loss_13": 3.3229424476623537,
"ce_loss_17": 3.282235884666443,
"ce_loss_2": 4.113417172431946,
"ce_loss_4": 3.716164600849152,
"ce_loss_9": 3.420932078361511,
"epoch": 0.515,
"grad_norm": 856.0,
"kl_loss_13": 76.73463134765625,
"kl_loss_2": 1819.937060546875,
"kl_loss_4": 990.5772918701172,
"kl_loss_9": 289.09563064575195,
"learning_rate": 0.0004841360332509663,
"loss": 801.6995,
"step": 5150
},
{
"ce_loss_13": 3.2782339096069335,
"ce_loss_17": 3.236938774585724,
"ce_loss_2": 4.07007886171341,
"ce_loss_4": 3.6759052515029906,
"ce_loss_9": 3.375661253929138,
"epoch": 0.516,
"grad_norm": 760.0,
"kl_loss_13": 72.78237037658691,
"kl_loss_2": 1811.8954650878907,
"kl_loss_4": 982.9941467285156,
"kl_loss_9": 282.3444313049316,
"learning_rate": 0.0004825502516487497,
"loss": 769.3042,
"step": 5160
},
{
"ce_loss_13": 3.23654762506485,
"ce_loss_17": 3.19771009683609,
"ce_loss_2": 4.066962695121765,
"ce_loss_4": 3.6531906723976135,
"ce_loss_9": 3.339952623844147,
"epoch": 0.517,
"grad_norm": 1024.0,
"kl_loss_13": 74.12141265869141,
"kl_loss_2": 1892.0356384277343,
"kl_loss_4": 1020.4204345703125,
"kl_loss_9": 293.0897346496582,
"learning_rate": 0.00048096464576530507,
"loss": 815.3729,
"step": 5170
},
{
"ce_loss_13": 3.341633379459381,
"ce_loss_17": 3.301531136035919,
"ce_loss_2": 4.103941702842713,
"ce_loss_4": 3.724453830718994,
"ce_loss_9": 3.438535213470459,
"epoch": 0.518,
"grad_norm": 716.0,
"kl_loss_13": 73.60916862487792,
"kl_loss_2": 1762.4796569824218,
"kl_loss_4": 962.1791381835938,
"kl_loss_9": 282.87510833740237,
"learning_rate": 0.00047937923156766646,
"loss": 780.2356,
"step": 5180
},
{
"ce_loss_13": 3.3900951504707337,
"ce_loss_17": 3.3490119099617006,
"ce_loss_2": 4.146072888374329,
"ce_loss_4": 3.7663703680038454,
"ce_loss_9": 3.481011140346527,
"epoch": 0.519,
"grad_norm": 736.0,
"kl_loss_13": 73.94559173583984,
"kl_loss_2": 1764.4251647949218,
"kl_loss_4": 968.4230743408203,
"kl_loss_9": 282.1041458129883,
"learning_rate": 0.00047779402502093696,
"loss": 785.2774,
"step": 5190
},
{
"ce_loss_13": 3.3521920800209046,
"ce_loss_17": 3.311307668685913,
"ce_loss_2": 4.136666762828827,
"ce_loss_4": 3.745848596096039,
"ce_loss_9": 3.4521227836608888,
"epoch": 0.52,
"grad_norm": 656.0,
"kl_loss_13": 73.88220329284668,
"kl_loss_2": 1792.12763671875,
"kl_loss_4": 979.6051513671875,
"kl_loss_9": 286.3393188476563,
"learning_rate": 0.0004762090420881289,
"loss": 793.2979,
"step": 5200
},
{
"ce_loss_13": 3.2692319631576536,
"ce_loss_17": 3.2304707527160645,
"ce_loss_2": 4.053920090198517,
"ce_loss_4": 3.664601814746857,
"ce_loss_9": 3.366422188282013,
"epoch": 0.521,
"grad_norm": 728.0,
"kl_loss_13": 74.0851318359375,
"kl_loss_2": 1796.7323913574219,
"kl_loss_4": 979.0724670410157,
"kl_loss_9": 283.4450454711914,
"learning_rate": 0.00047462429873000296,
"loss": 778.6974,
"step": 5210
},
{
"ce_loss_13": 3.353558099269867,
"ce_loss_17": 3.3126341223716738,
"ce_loss_2": 4.132767391204834,
"ce_loss_4": 3.734746587276459,
"ce_loss_9": 3.451547932624817,
"epoch": 0.522,
"grad_norm": 700.0,
"kl_loss_13": 74.0196662902832,
"kl_loss_2": 1803.9498779296875,
"kl_loss_4": 971.2668426513671,
"kl_loss_9": 285.3925193786621,
"learning_rate": 0.0004730398109049071,
"loss": 784.4695,
"step": 5220
},
{
"ce_loss_13": 3.2815186381340027,
"ce_loss_17": 3.2411299347877502,
"ce_loss_2": 4.117280209064484,
"ce_loss_4": 3.7015105962753294,
"ce_loss_9": 3.3863202929496765,
"epoch": 0.523,
"grad_norm": 1328.0,
"kl_loss_13": 75.13538360595703,
"kl_loss_2": 1900.709716796875,
"kl_loss_4": 1028.2422943115234,
"kl_loss_9": 298.4895965576172,
"learning_rate": 0.000471455594568616,
"loss": 806.2253,
"step": 5230
},
{
"ce_loss_13": 3.35224426984787,
"ce_loss_17": 3.313410794734955,
"ce_loss_2": 4.1167085528373715,
"ce_loss_4": 3.735312795639038,
"ce_loss_9": 3.450315523147583,
"epoch": 0.524,
"grad_norm": 992.0,
"kl_loss_13": 74.43409156799316,
"kl_loss_2": 1761.6297302246094,
"kl_loss_4": 960.3011840820312,
"kl_loss_9": 284.1883819580078,
"learning_rate": 0.00046987166567417086,
"loss": 786.022,
"step": 5240
},
{
"ce_loss_13": 3.2736294507980346,
"ce_loss_17": 3.2341216087341307,
"ce_loss_2": 4.069810843467712,
"ce_loss_4": 3.6714844346046447,
"ce_loss_9": 3.3713714718818664,
"epoch": 0.525,
"grad_norm": 828.0,
"kl_loss_13": 72.30632076263427,
"kl_loss_2": 1832.9207580566406,
"kl_loss_4": 987.8688446044922,
"kl_loss_9": 285.9743133544922,
"learning_rate": 0.00046828804017171776,
"loss": 772.2806,
"step": 5250
},
{
"ce_loss_13": 3.316581404209137,
"ce_loss_17": 3.273679721355438,
"ce_loss_2": 4.146453988552094,
"ce_loss_4": 3.733655881881714,
"ce_loss_9": 3.4181808471679687,
"epoch": 0.526,
"grad_norm": 936.0,
"kl_loss_13": 74.23440246582031,
"kl_loss_2": 1845.0445129394532,
"kl_loss_4": 1004.0903381347656,
"kl_loss_9": 291.0963500976562,
"learning_rate": 0.00046670473400834805,
"loss": 808.1028,
"step": 5260
},
{
"ce_loss_13": 3.2513556122779845,
"ce_loss_17": 3.2132788777351378,
"ce_loss_2": 4.033259403705597,
"ce_loss_4": 3.635775065422058,
"ce_loss_9": 3.3464043021202086,
"epoch": 0.527,
"grad_norm": 840.0,
"kl_loss_13": 71.38737449645996,
"kl_loss_2": 1778.4729431152343,
"kl_loss_4": 960.8607696533203,
"kl_loss_9": 277.62567749023435,
"learning_rate": 0.00046512176312793734,
"loss": 809.4104,
"step": 5270
},
{
"ce_loss_13": 3.2459253072738647,
"ce_loss_17": 3.204818546772003,
"ce_loss_2": 4.041217648983002,
"ce_loss_4": 3.645413875579834,
"ce_loss_9": 3.3453406572341917,
"epoch": 0.528,
"grad_norm": 796.0,
"kl_loss_13": 72.22671623229981,
"kl_loss_2": 1825.0807006835937,
"kl_loss_4": 988.5527099609375,
"kl_loss_9": 285.0157127380371,
"learning_rate": 0.00046353914347098467,
"loss": 801.4178,
"step": 5280
},
{
"ce_loss_13": 3.345044183731079,
"ce_loss_17": 3.306043243408203,
"ce_loss_2": 4.1388083577156065,
"ce_loss_4": 3.741454613208771,
"ce_loss_9": 3.4415024399757383,
"epoch": 0.529,
"grad_norm": 1040.0,
"kl_loss_13": 73.76498985290527,
"kl_loss_2": 1808.4031982421875,
"kl_loss_4": 979.0365051269531,
"kl_loss_9": 281.8998863220215,
"learning_rate": 0.0004619568909744524,
"loss": 797.9169,
"step": 5290
},
{
"ce_loss_13": 3.347410809993744,
"ce_loss_17": 3.308348262310028,
"ce_loss_2": 4.13295624256134,
"ce_loss_4": 3.740420389175415,
"ce_loss_9": 3.444485080242157,
"epoch": 0.53,
"grad_norm": 720.0,
"kl_loss_13": 73.7395851135254,
"kl_loss_2": 1801.3715942382812,
"kl_loss_4": 977.6004638671875,
"kl_loss_9": 286.12501373291013,
"learning_rate": 0.00046037502157160573,
"loss": 796.7553,
"step": 5300
},
{
"ce_loss_13": 3.2193509340286255,
"ce_loss_17": 3.1804965138435364,
"ce_loss_2": 4.027743196487426,
"ce_loss_4": 3.6299861907958983,
"ce_loss_9": 3.3204323410987855,
"epoch": 0.531,
"grad_norm": 860.0,
"kl_loss_13": 73.14422378540038,
"kl_loss_2": 1833.43486328125,
"kl_loss_4": 996.2987762451172,
"kl_loss_9": 287.3218566894531,
"learning_rate": 0.00045879355119185207,
"loss": 798.9685,
"step": 5310
},
{
"ce_loss_13": 3.301614260673523,
"ce_loss_17": 3.2612352848052977,
"ce_loss_2": 4.112736761569977,
"ce_loss_4": 3.7088679313659667,
"ce_loss_9": 3.403615081310272,
"epoch": 0.532,
"grad_norm": 796.0,
"kl_loss_13": 73.1871494293213,
"kl_loss_2": 1856.7558044433595,
"kl_loss_4": 1009.262451171875,
"kl_loss_9": 291.91101989746096,
"learning_rate": 0.0004572124957605803,
"loss": 814.7009,
"step": 5320
},
{
"ce_loss_13": 3.321007227897644,
"ce_loss_17": 3.2809830784797667,
"ce_loss_2": 4.108720934391021,
"ce_loss_4": 3.7170739650726317,
"ce_loss_9": 3.4239102602005005,
"epoch": 0.533,
"grad_norm": 732.0,
"kl_loss_13": 73.42384452819825,
"kl_loss_2": 1818.5907470703125,
"kl_loss_4": 989.3438262939453,
"kl_loss_9": 290.179621887207,
"learning_rate": 0.00045563187119900103,
"loss": 784.1296,
"step": 5330
},
{
"ce_loss_13": 3.1643845319747923,
"ce_loss_17": 3.1251938700675965,
"ce_loss_2": 3.995231831073761,
"ce_loss_4": 3.5754489064216615,
"ce_loss_9": 3.2659531235694885,
"epoch": 0.534,
"grad_norm": 1000.0,
"kl_loss_13": 72.27840385437011,
"kl_loss_2": 1878.682550048828,
"kl_loss_4": 1004.5179229736328,
"kl_loss_9": 287.7483283996582,
"learning_rate": 0.00045405169342398633,
"loss": 807.4451,
"step": 5340
},
{
"ce_loss_13": 3.2520262837409972,
"ce_loss_17": 3.2115379691123964,
"ce_loss_2": 4.080039012432098,
"ce_loss_4": 3.657986414432526,
"ce_loss_9": 3.3535806059837343,
"epoch": 0.535,
"grad_norm": 868.0,
"kl_loss_13": 74.16668548583985,
"kl_loss_2": 1865.976123046875,
"kl_loss_4": 997.759033203125,
"kl_loss_9": 288.57018051147463,
"learning_rate": 0.0004524719783479088,
"loss": 789.2814,
"step": 5350
},
{
"ce_loss_13": 3.206182086467743,
"ce_loss_17": 3.1656386494636535,
"ce_loss_2": 4.046193289756775,
"ce_loss_4": 3.624490666389465,
"ce_loss_9": 3.309402322769165,
"epoch": 0.536,
"grad_norm": 808.0,
"kl_loss_13": 74.21720085144042,
"kl_loss_2": 1901.2041381835938,
"kl_loss_4": 1022.456185913086,
"kl_loss_9": 292.5477523803711,
"learning_rate": 0.00045089274187848144,
"loss": 797.1016,
"step": 5360
},
{
"ce_loss_13": 3.330158460140228,
"ce_loss_17": 3.2923543214797975,
"ce_loss_2": 4.103685748577118,
"ce_loss_4": 3.7177846431732178,
"ce_loss_9": 3.4239140868186952,
"epoch": 0.537,
"grad_norm": 1144.0,
"kl_loss_13": 73.13119049072266,
"kl_loss_2": 1799.9715087890625,
"kl_loss_4": 975.4336334228516,
"kl_loss_9": 285.69232788085935,
"learning_rate": 0.00044931399991859835,
"loss": 781.6728,
"step": 5370
},
{
"ce_loss_13": 3.1870726346969604,
"ce_loss_17": 3.1471142411231994,
"ce_loss_2": 3.9945030927658083,
"ce_loss_4": 3.5893633484840395,
"ce_loss_9": 3.2885418176651,
"epoch": 0.538,
"grad_norm": 820.0,
"kl_loss_13": 72.292848777771,
"kl_loss_2": 1837.003564453125,
"kl_loss_4": 990.8762390136719,
"kl_loss_9": 285.67307739257814,
"learning_rate": 0.00044773576836617336,
"loss": 785.0116,
"step": 5380
},
{
"ce_loss_13": 3.277537798881531,
"ce_loss_17": 3.2368523240089417,
"ce_loss_2": 4.089139556884765,
"ce_loss_4": 3.6902815222740175,
"ce_loss_9": 3.380195736885071,
"epoch": 0.539,
"grad_norm": 820.0,
"kl_loss_13": 73.99409484863281,
"kl_loss_2": 1863.6287780761718,
"kl_loss_4": 1013.0081237792969,
"kl_loss_9": 291.7620361328125,
"learning_rate": 0.00044615806311398056,
"loss": 817.4792,
"step": 5390
},
{
"ce_loss_13": 3.3569162130355834,
"ce_loss_17": 3.3178048491477967,
"ce_loss_2": 4.094481468200684,
"ce_loss_4": 3.7208905458450316,
"ce_loss_9": 3.448286509513855,
"epoch": 0.54,
"grad_norm": 848.0,
"kl_loss_13": 72.22508735656739,
"kl_loss_2": 1719.27236328125,
"kl_loss_4": 941.8825927734375,
"kl_loss_9": 278.6324577331543,
"learning_rate": 0.00044458090004949454,
"loss": 786.7616,
"step": 5400
},
{
"ce_loss_13": 3.2138561010360718,
"ce_loss_17": 3.172481417655945,
"ce_loss_2": 4.065579998493194,
"ce_loss_4": 3.6457912802696226,
"ce_loss_9": 3.3184029817581178,
"epoch": 0.541,
"grad_norm": 760.0,
"kl_loss_13": 75.91326332092285,
"kl_loss_2": 1954.336328125,
"kl_loss_4": 1064.4479553222657,
"kl_loss_9": 301.6357475280762,
"learning_rate": 0.0004430042950547297,
"loss": 810.616,
"step": 5410
},
{
"ce_loss_13": 3.306875801086426,
"ce_loss_17": 3.2644275188446046,
"ce_loss_2": 4.119654250144959,
"ce_loss_4": 3.7173089504241945,
"ce_loss_9": 3.411296534538269,
"epoch": 0.542,
"grad_norm": 1072.0,
"kl_loss_13": 76.02639694213867,
"kl_loss_2": 1850.6163391113282,
"kl_loss_4": 1003.1622009277344,
"kl_loss_9": 294.12415466308596,
"learning_rate": 0.0004414282640060809,
"loss": 800.0967,
"step": 5420
},
{
"ce_loss_13": 3.4018345355987547,
"ce_loss_17": 3.357880413532257,
"ce_loss_2": 4.178731369972229,
"ce_loss_4": 3.801996040344238,
"ce_loss_9": 3.5053011655807493,
"epoch": 0.543,
"grad_norm": 844.0,
"kl_loss_13": 77.79082794189453,
"kl_loss_2": 1773.267041015625,
"kl_loss_4": 982.6691650390625,
"kl_loss_9": 305.6775268554687,
"learning_rate": 0.0004398528227741633,
"loss": 816.218,
"step": 5430
},
{
"ce_loss_13": 3.2647178173065186,
"ce_loss_17": 3.222849798202515,
"ce_loss_2": 4.06882221698761,
"ce_loss_4": 3.671201431751251,
"ce_loss_9": 3.3631152153015136,
"epoch": 0.544,
"grad_norm": 1128.0,
"kl_loss_13": 76.07216796875,
"kl_loss_2": 1808.1482543945312,
"kl_loss_4": 997.9567108154297,
"kl_loss_9": 296.2308380126953,
"learning_rate": 0.00043827798722365264,
"loss": 805.9838,
"step": 5440
},
{
"ce_loss_13": 3.387359392642975,
"ce_loss_17": 3.3467159628868104,
"ce_loss_2": 4.153163635730744,
"ce_loss_4": 3.7647279858589173,
"ce_loss_9": 3.48160742521286,
"epoch": 0.545,
"grad_norm": 988.0,
"kl_loss_13": 76.58334579467774,
"kl_loss_2": 1779.9947448730468,
"kl_loss_4": 964.375357055664,
"kl_loss_9": 291.64692840576174,
"learning_rate": 0.00043670377321312535,
"loss": 777.6915,
"step": 5450
},
{
"ce_loss_13": 3.3918582439422607,
"ce_loss_17": 3.3527199625968933,
"ce_loss_2": 4.149947786331177,
"ce_loss_4": 3.7716354727745056,
"ce_loss_9": 3.481933128833771,
"epoch": 0.546,
"grad_norm": 788.0,
"kl_loss_13": 74.72479209899902,
"kl_loss_2": 1766.5358337402345,
"kl_loss_4": 963.9497955322265,
"kl_loss_9": 286.72948455810547,
"learning_rate": 0.0004351301965948991,
"loss": 788.3238,
"step": 5460
},
{
"ce_loss_13": 3.29829740524292,
"ce_loss_17": 3.257559609413147,
"ce_loss_2": 4.066121160984039,
"ce_loss_4": 3.677996289730072,
"ce_loss_9": 3.3944310784339904,
"epoch": 0.547,
"grad_norm": 804.0,
"kl_loss_13": 73.52764205932617,
"kl_loss_2": 1759.8200561523438,
"kl_loss_4": 955.4488830566406,
"kl_loss_9": 282.16875,
"learning_rate": 0.000433557273214873,
"loss": 784.6819,
"step": 5470
},
{
"ce_loss_13": 3.289206564426422,
"ce_loss_17": 3.250258719921112,
"ce_loss_2": 4.070839929580688,
"ce_loss_4": 3.6803099632263185,
"ce_loss_9": 3.38953412771225,
"epoch": 0.548,
"grad_norm": 784.0,
"kl_loss_13": 73.20038318634033,
"kl_loss_2": 1785.5767761230468,
"kl_loss_4": 964.192514038086,
"kl_loss_9": 285.82276916503906,
"learning_rate": 0.000431985018912368,
"loss": 773.3384,
"step": 5480
},
{
"ce_loss_13": 3.2603097796440124,
"ce_loss_17": 3.2189146876335144,
"ce_loss_2": 4.083324456214905,
"ce_loss_4": 3.6726097226142884,
"ce_loss_9": 3.3604193806648253,
"epoch": 0.549,
"grad_norm": 928.0,
"kl_loss_13": 74.56349067687988,
"kl_loss_2": 1874.79150390625,
"kl_loss_4": 1015.2865112304687,
"kl_loss_9": 294.37069549560545,
"learning_rate": 0.0004304134495199674,
"loss": 787.2493,
"step": 5490
},
{
"ce_loss_13": 3.285221612453461,
"ce_loss_17": 3.244589960575104,
"ce_loss_2": 4.08959436416626,
"ce_loss_4": 3.694540321826935,
"ce_loss_9": 3.3886062860488892,
"epoch": 0.55,
"grad_norm": 800.0,
"kl_loss_13": 75.10189781188964,
"kl_loss_2": 1862.0597717285157,
"kl_loss_4": 1024.3334533691407,
"kl_loss_9": 298.7258102416992,
"learning_rate": 0.0004288425808633575,
"loss": 800.5307,
"step": 5500
},
{
"ce_loss_13": 3.2647858023643495,
"ce_loss_17": 3.2276356697082518,
"ce_loss_2": 4.072724485397339,
"ce_loss_4": 3.6583107709884644,
"ce_loss_9": 3.3632304668426514,
"epoch": 0.551,
"grad_norm": 1304.0,
"kl_loss_13": 72.74190845489503,
"kl_loss_2": 1834.6743041992188,
"kl_loss_4": 987.31357421875,
"kl_loss_9": 286.421395111084,
"learning_rate": 0.0004272724287611684,
"loss": 796.3049,
"step": 5510
},
{
"ce_loss_13": 3.238948404788971,
"ce_loss_17": 3.1981234192848205,
"ce_loss_2": 4.060285580158234,
"ce_loss_4": 3.635609674453735,
"ce_loss_9": 3.3366718769073485,
"epoch": 0.552,
"grad_norm": 848.0,
"kl_loss_13": 73.77195167541504,
"kl_loss_2": 1879.8147033691407,
"kl_loss_4": 999.0303314208984,
"kl_loss_9": 289.79567489624026,
"learning_rate": 0.00042570300902481425,
"loss": 798.8034,
"step": 5520
},
{
"ce_loss_13": 3.272075152397156,
"ce_loss_17": 3.234650266170502,
"ce_loss_2": 4.0568290710449215,
"ce_loss_4": 3.6595101237297056,
"ce_loss_9": 3.365482270717621,
"epoch": 0.553,
"grad_norm": 988.0,
"kl_loss_13": 72.45253143310546,
"kl_loss_2": 1820.305108642578,
"kl_loss_4": 980.4863983154297,
"kl_loss_9": 285.43726501464846,
"learning_rate": 0.00042413433745833423,
"loss": 787.6417,
"step": 5530
},
{
"ce_loss_13": 3.26614705324173,
"ce_loss_17": 3.2261630058288575,
"ce_loss_2": 4.0755760908126835,
"ce_loss_4": 3.669107723236084,
"ce_loss_9": 3.364769494533539,
"epoch": 0.554,
"grad_norm": 700.0,
"kl_loss_13": 73.44537525177002,
"kl_loss_2": 1826.7767822265625,
"kl_loss_4": 984.6950897216797,
"kl_loss_9": 285.97044677734374,
"learning_rate": 0.0004225664298582339,
"loss": 771.5904,
"step": 5540
},
{
"ce_loss_13": 3.349853444099426,
"ce_loss_17": 3.3096522808074953,
"ce_loss_2": 4.12833468914032,
"ce_loss_4": 3.7342480659484862,
"ce_loss_9": 3.444065606594086,
"epoch": 0.555,
"grad_norm": 896.0,
"kl_loss_13": 73.12475662231445,
"kl_loss_2": 1772.5195373535157,
"kl_loss_4": 959.9620391845704,
"kl_loss_9": 281.3079231262207,
"learning_rate": 0.000420999302013325,
"loss": 776.8577,
"step": 5550
},
{
"ce_loss_13": 3.253776121139526,
"ce_loss_17": 3.212263309955597,
"ce_loss_2": 4.09758027791977,
"ce_loss_4": 3.661249279975891,
"ce_loss_9": 3.355074954032898,
"epoch": 0.556,
"grad_norm": 868.0,
"kl_loss_13": 75.87601051330566,
"kl_loss_2": 1901.3112548828126,
"kl_loss_4": 1008.3065582275391,
"kl_loss_9": 296.084765625,
"learning_rate": 0.000419432969704568,
"loss": 795.7606,
"step": 5560
},
{
"ce_loss_13": 3.289377510547638,
"ce_loss_17": 3.250764536857605,
"ce_loss_2": 4.073163557052612,
"ce_loss_4": 3.680362272262573,
"ce_loss_9": 3.388530659675598,
"epoch": 0.557,
"grad_norm": 828.0,
"kl_loss_13": 73.03952255249024,
"kl_loss_2": 1783.5357543945313,
"kl_loss_4": 967.2474578857422,
"kl_loss_9": 283.408634185791,
"learning_rate": 0.00041786744870491154,
"loss": 801.2744,
"step": 5570
},
{
"ce_loss_13": 3.2308903098106385,
"ce_loss_17": 3.1919775366783143,
"ce_loss_2": 4.034533071517944,
"ce_loss_4": 3.6301452279090882,
"ce_loss_9": 3.3295583367347716,
"epoch": 0.558,
"grad_norm": 980.0,
"kl_loss_13": 74.31400451660156,
"kl_loss_2": 1842.5608154296874,
"kl_loss_4": 1004.8509399414063,
"kl_loss_9": 293.26367645263673,
"learning_rate": 0.0004163027547791347,
"loss": 794.3833,
"step": 5580
},
{
"ce_loss_13": 3.211347496509552,
"ce_loss_17": 3.1711423277854918,
"ce_loss_2": 4.050317776203156,
"ce_loss_4": 3.624680197238922,
"ce_loss_9": 3.310125172138214,
"epoch": 0.559,
"grad_norm": 880.0,
"kl_loss_13": 73.63075923919678,
"kl_loss_2": 1894.5034240722657,
"kl_loss_4": 1011.8344757080079,
"kl_loss_9": 293.5899291992188,
"learning_rate": 0.0004147389036836881,
"loss": 801.4451,
"step": 5590
},
{
"ce_loss_13": 3.259829878807068,
"ce_loss_17": 3.220303547382355,
"ce_loss_2": 4.069304740428924,
"ce_loss_4": 3.671776497364044,
"ce_loss_9": 3.359443461894989,
"epoch": 0.56,
"grad_norm": 1296.0,
"kl_loss_13": 74.24327507019044,
"kl_loss_2": 1844.3182250976563,
"kl_loss_4": 1008.930209350586,
"kl_loss_9": 290.1443237304687,
"learning_rate": 0.00041317591116653486,
"loss": 810.9732,
"step": 5600
},
{
"ce_loss_13": 3.2949826955795287,
"ce_loss_17": 3.254460871219635,
"ce_loss_2": 4.106885957717895,
"ce_loss_4": 3.699885070323944,
"ce_loss_9": 3.3967799067497255,
"epoch": 0.561,
"grad_norm": 968.0,
"kl_loss_13": 75.05366821289063,
"kl_loss_2": 1856.025897216797,
"kl_loss_4": 1003.9119995117187,
"kl_loss_9": 293.60422973632814,
"learning_rate": 0.0004116137929669921,
"loss": 791.4979,
"step": 5610
},
{
"ce_loss_13": 3.287285017967224,
"ce_loss_17": 3.2479687929153442,
"ce_loss_2": 4.0837935447692875,
"ce_loss_4": 3.680448615550995,
"ce_loss_9": 3.386300873756409,
"epoch": 0.562,
"grad_norm": 1056.0,
"kl_loss_13": 72.04925346374512,
"kl_loss_2": 1821.5766967773438,
"kl_loss_4": 986.7944122314453,
"kl_loss_9": 286.0955436706543,
"learning_rate": 0.00041005256481557305,
"loss": 780.1872,
"step": 5620
},
{
"ce_loss_13": 3.3894767999649047,
"ce_loss_17": 3.3497204542160035,
"ce_loss_2": 4.132556700706482,
"ce_loss_4": 3.76358847618103,
"ce_loss_9": 3.4814849138259887,
"epoch": 0.563,
"grad_norm": 816.0,
"kl_loss_13": 71.50232734680176,
"kl_loss_2": 1727.2944641113281,
"kl_loss_4": 946.6339752197266,
"kl_loss_9": 276.58668060302733,
"learning_rate": 0.00040849224243382767,
"loss": 766.1647,
"step": 5630
},
{
"ce_loss_13": 3.242488920688629,
"ce_loss_17": 3.202657175064087,
"ce_loss_2": 4.0449035406112674,
"ce_loss_4": 3.6400243639945984,
"ce_loss_9": 3.3369064807891844,
"epoch": 0.564,
"grad_norm": 1020.0,
"kl_loss_13": 72.28278465270996,
"kl_loss_2": 1823.1846130371093,
"kl_loss_4": 994.4835876464844,
"kl_loss_9": 287.82414321899415,
"learning_rate": 0.000406932841534185,
"loss": 780.4899,
"step": 5640
},
{
"ce_loss_13": 3.20475310087204,
"ce_loss_17": 3.1642747282981873,
"ce_loss_2": 4.017904949188233,
"ce_loss_4": 3.613080847263336,
"ce_loss_9": 3.3053141593933106,
"epoch": 0.565,
"grad_norm": 1004.0,
"kl_loss_13": 72.92307014465332,
"kl_loss_2": 1857.2842529296875,
"kl_loss_4": 1004.6457885742187,
"kl_loss_9": 287.81507263183596,
"learning_rate": 0.0004053743778197951,
"loss": 813.7131,
"step": 5650
},
{
"ce_loss_13": 3.3094315767288207,
"ce_loss_17": 3.2693715691566467,
"ce_loss_2": 4.100631475448608,
"ce_loss_4": 3.7085015058517454,
"ce_loss_9": 3.40853990316391,
"epoch": 0.566,
"grad_norm": 916.0,
"kl_loss_13": 75.08573303222656,
"kl_loss_2": 1802.2984436035156,
"kl_loss_4": 985.8848937988281,
"kl_loss_9": 289.80066299438477,
"learning_rate": 0.0004038168669843697,
"loss": 799.6597,
"step": 5660
},
{
"ce_loss_13": 3.266611671447754,
"ce_loss_17": 3.226332187652588,
"ce_loss_2": 4.034893548488617,
"ce_loss_4": 3.64892041683197,
"ce_loss_9": 3.363729107379913,
"epoch": 0.567,
"grad_norm": 1328.0,
"kl_loss_13": 72.34437274932861,
"kl_loss_2": 1765.5274291992187,
"kl_loss_4": 955.3150238037109,
"kl_loss_9": 282.6461380004883,
"learning_rate": 0.000402260324712026,
"loss": 790.4768,
"step": 5670
},
{
"ce_loss_13": 3.3097354412078857,
"ce_loss_17": 3.2704842329025268,
"ce_loss_2": 4.124593257904053,
"ce_loss_4": 3.7098461508750917,
"ce_loss_9": 3.411205291748047,
"epoch": 0.568,
"grad_norm": 1224.0,
"kl_loss_13": 72.51471862792968,
"kl_loss_2": 1852.9243041992188,
"kl_loss_4": 988.7502838134766,
"kl_loss_9": 285.3108932495117,
"learning_rate": 0.00040070476667712743,
"loss": 783.2475,
"step": 5680
},
{
"ce_loss_13": 3.340377914905548,
"ce_loss_17": 3.3000412344932557,
"ce_loss_2": 4.13344761133194,
"ce_loss_4": 3.7344311833381654,
"ce_loss_9": 3.4361794233322143,
"epoch": 0.569,
"grad_norm": 688.0,
"kl_loss_13": 74.35391159057617,
"kl_loss_2": 1810.0054870605468,
"kl_loss_4": 979.7866760253906,
"kl_loss_9": 283.6129066467285,
"learning_rate": 0.0003991502085441259,
"loss": 791.0041,
"step": 5690
},
{
"ce_loss_13": 3.3778745532035828,
"ce_loss_17": 3.33834547996521,
"ce_loss_2": 4.130846786499023,
"ce_loss_4": 3.750100874900818,
"ce_loss_9": 3.4685394763946533,
"epoch": 0.57,
"grad_norm": 848.0,
"kl_loss_13": 72.54917221069336,
"kl_loss_2": 1723.6391296386719,
"kl_loss_4": 939.2174407958985,
"kl_loss_9": 277.1242614746094,
"learning_rate": 0.0003975966659674047,
"loss": 776.0271,
"step": 5700
},
{
"ce_loss_13": 3.3454225778579714,
"ce_loss_17": 3.3044866919517517,
"ce_loss_2": 4.126947188377381,
"ce_loss_4": 3.7350040912628173,
"ce_loss_9": 3.4427593469619753,
"epoch": 0.571,
"grad_norm": 848.0,
"kl_loss_13": 73.82932510375977,
"kl_loss_2": 1794.7004455566407,
"kl_loss_4": 971.1542999267579,
"kl_loss_9": 282.82764434814453,
"learning_rate": 0.0003960441545911204,
"loss": 776.3346,
"step": 5710
},
{
"ce_loss_13": 3.338672161102295,
"ce_loss_17": 3.298799526691437,
"ce_loss_2": 4.117006981372834,
"ce_loss_4": 3.7270028710365297,
"ce_loss_9": 3.4329198598861694,
"epoch": 0.572,
"grad_norm": 840.0,
"kl_loss_13": 73.2137435913086,
"kl_loss_2": 1802.4476318359375,
"kl_loss_4": 983.7936096191406,
"kl_loss_9": 285.7454093933105,
"learning_rate": 0.0003944926900490452,
"loss": 782.9495,
"step": 5720
},
{
"ce_loss_13": 3.248178768157959,
"ce_loss_17": 3.2080833554267882,
"ce_loss_2": 4.0701495885849,
"ce_loss_4": 3.6604724168777465,
"ce_loss_9": 3.3536651849746706,
"epoch": 0.573,
"grad_norm": 672.0,
"kl_loss_13": 73.6166389465332,
"kl_loss_2": 1849.4944641113282,
"kl_loss_4": 999.0758972167969,
"kl_loss_9": 290.0587493896484,
"learning_rate": 0.0003929422879644099,
"loss": 784.3274,
"step": 5730
},
{
"ce_loss_13": 3.259362077713013,
"ce_loss_17": 3.218967521190643,
"ce_loss_2": 4.038165628910065,
"ce_loss_4": 3.63946738243103,
"ce_loss_9": 3.353592646121979,
"epoch": 0.574,
"grad_norm": 940.0,
"kl_loss_13": 72.40253295898438,
"kl_loss_2": 1799.1306030273438,
"kl_loss_4": 966.6379364013671,
"kl_loss_9": 281.5230438232422,
"learning_rate": 0.0003913929639497462,
"loss": 765.5293,
"step": 5740
},
{
"ce_loss_13": 3.2139196157455445,
"ce_loss_17": 3.1727853059768676,
"ce_loss_2": 4.041042923927307,
"ce_loss_4": 3.619505214691162,
"ce_loss_9": 3.312306559085846,
"epoch": 0.575,
"grad_norm": 744.0,
"kl_loss_13": 71.97178192138672,
"kl_loss_2": 1872.7219848632812,
"kl_loss_4": 992.9176544189453,
"kl_loss_9": 282.19856185913085,
"learning_rate": 0.00038984473360672965,
"loss": 783.3154,
"step": 5750
},
{
"ce_loss_13": 3.219143533706665,
"ce_loss_17": 3.1789993166923525,
"ce_loss_2": 4.0345776915550235,
"ce_loss_4": 3.621943485736847,
"ce_loss_9": 3.3177709102630617,
"epoch": 0.576,
"grad_norm": 792.0,
"kl_loss_13": 71.56662025451661,
"kl_loss_2": 1847.7013122558594,
"kl_loss_4": 994.68759765625,
"kl_loss_9": 283.5988624572754,
"learning_rate": 0.0003882976125260229,
"loss": 778.3832,
"step": 5760
},
{
"ce_loss_13": 3.286661374568939,
"ce_loss_17": 3.2458075642585755,
"ce_loss_2": 4.075940239429474,
"ce_loss_4": 3.6776213526725767,
"ce_loss_9": 3.387130117416382,
"epoch": 0.577,
"grad_norm": 748.0,
"kl_loss_13": 72.54135475158691,
"kl_loss_2": 1799.4101135253907,
"kl_loss_4": 966.9556396484375,
"kl_loss_9": 282.7928466796875,
"learning_rate": 0.00038675161628711776,
"loss": 783.4904,
"step": 5770
},
{
"ce_loss_13": 3.3262439131736756,
"ce_loss_17": 3.285924530029297,
"ce_loss_2": 4.106383979320526,
"ce_loss_4": 3.714357054233551,
"ce_loss_9": 3.4214155793190004,
"epoch": 0.578,
"grad_norm": 700.0,
"kl_loss_13": 72.68965663909913,
"kl_loss_2": 1772.8457275390624,
"kl_loss_4": 961.9623809814453,
"kl_loss_9": 283.20936126708983,
"learning_rate": 0.0003852067604581794,
"loss": 798.5637,
"step": 5780
},
{
"ce_loss_13": 3.2768237948417664,
"ce_loss_17": 3.237665665149689,
"ce_loss_2": 4.080949985980988,
"ce_loss_4": 3.669422745704651,
"ce_loss_9": 3.370028007030487,
"epoch": 0.579,
"grad_norm": 1072.0,
"kl_loss_13": 72.39575462341308,
"kl_loss_2": 1839.0868286132813,
"kl_loss_4": 986.4563354492187,
"kl_loss_9": 280.40548477172854,
"learning_rate": 0.0003836630605958888,
"loss": 783.4337,
"step": 5790
},
{
"ce_loss_13": 3.3321749329566956,
"ce_loss_17": 3.2925341963768004,
"ce_loss_2": 4.109150612354279,
"ce_loss_4": 3.7234968543052673,
"ce_loss_9": 3.428684854507446,
"epoch": 0.58,
"grad_norm": 1240.0,
"kl_loss_13": 74.08155212402343,
"kl_loss_2": 1816.310040283203,
"kl_loss_4": 986.648226928711,
"kl_loss_9": 286.85925216674804,
"learning_rate": 0.0003821205322452863,
"loss": 815.1135,
"step": 5800
},
{
"ce_loss_13": 3.304441678524017,
"ce_loss_17": 3.2667020320892335,
"ce_loss_2": 4.0839027762413025,
"ce_loss_4": 3.6912376165390013,
"ce_loss_9": 3.399633002281189,
"epoch": 0.581,
"grad_norm": 888.0,
"kl_loss_13": 72.40485496520996,
"kl_loss_2": 1802.7967712402344,
"kl_loss_4": 969.6277923583984,
"kl_loss_9": 282.01318740844727,
"learning_rate": 0.0003805791909396155,
"loss": 785.8026,
"step": 5810
},
{
"ce_loss_13": 3.2645418524742125,
"ce_loss_17": 3.2266998291015625,
"ce_loss_2": 4.0642083287239075,
"ce_loss_4": 3.650386297702789,
"ce_loss_9": 3.3594146370887756,
"epoch": 0.582,
"grad_norm": 932.0,
"kl_loss_13": 71.84885711669922,
"kl_loss_2": 1824.7707580566407,
"kl_loss_4": 971.6696868896485,
"kl_loss_9": 280.12883377075195,
"learning_rate": 0.0003790390522001662,
"loss": 793.4603,
"step": 5820
},
{
"ce_loss_13": 3.199913036823273,
"ce_loss_17": 3.1629793882369994,
"ce_loss_2": 3.999752473831177,
"ce_loss_4": 3.594913971424103,
"ce_loss_9": 3.2937525868415833,
"epoch": 0.583,
"grad_norm": 848.0,
"kl_loss_13": 70.37647552490235,
"kl_loss_2": 1852.6868469238282,
"kl_loss_4": 989.9863128662109,
"kl_loss_9": 280.64996337890625,
"learning_rate": 0.0003775001315361183,
"loss": 780.3628,
"step": 5830
},
{
"ce_loss_13": 3.3063352584838865,
"ce_loss_17": 3.264518618583679,
"ce_loss_2": 4.108177971839905,
"ce_loss_4": 3.697060787677765,
"ce_loss_9": 3.4061919689178466,
"epoch": 0.584,
"grad_norm": 660.0,
"kl_loss_13": 73.38811836242675,
"kl_loss_2": 1819.9228637695312,
"kl_loss_4": 975.4645874023438,
"kl_loss_9": 284.6202987670898,
"learning_rate": 0.0003759624444443858,
"loss": 786.6172,
"step": 5840
},
{
"ce_loss_13": 3.34173401594162,
"ce_loss_17": 3.3037470102310182,
"ce_loss_2": 4.114887666702271,
"ce_loss_4": 3.7185524225234987,
"ce_loss_9": 3.435133862495422,
"epoch": 0.585,
"grad_norm": 956.0,
"kl_loss_13": 73.05100364685059,
"kl_loss_2": 1791.394512939453,
"kl_loss_4": 958.487109375,
"kl_loss_9": 280.1448257446289,
"learning_rate": 0.00037442600640946044,
"loss": 771.5148,
"step": 5850
},
{
"ce_loss_13": 3.2984007954597474,
"ce_loss_17": 3.2617080211639404,
"ce_loss_2": 4.0688137888908384,
"ce_loss_4": 3.67920743227005,
"ce_loss_9": 3.3929454565048216,
"epoch": 0.586,
"grad_norm": 1056.0,
"kl_loss_13": 71.55231246948242,
"kl_loss_2": 1775.8209228515625,
"kl_loss_4": 966.913232421875,
"kl_loss_9": 281.4110786437988,
"learning_rate": 0.00037289083290325663,
"loss": 761.8062,
"step": 5860
},
{
"ce_loss_13": 3.282070851325989,
"ce_loss_17": 3.2435006380081175,
"ce_loss_2": 4.057844626903534,
"ce_loss_4": 3.6678024888038636,
"ce_loss_9": 3.3763722658157347,
"epoch": 0.587,
"grad_norm": 1200.0,
"kl_loss_13": 71.80185241699219,
"kl_loss_2": 1767.1731994628906,
"kl_loss_4": 954.13974609375,
"kl_loss_9": 277.7298324584961,
"learning_rate": 0.0003713569393849543,
"loss": 768.4436,
"step": 5870
},
{
"ce_loss_13": 3.3335991501808167,
"ce_loss_17": 3.2933664560317992,
"ce_loss_2": 4.114494931697846,
"ce_loss_4": 3.718398153781891,
"ce_loss_9": 3.4264504432678224,
"epoch": 0.588,
"grad_norm": 1120.0,
"kl_loss_13": 73.1373176574707,
"kl_loss_2": 1792.4062377929688,
"kl_loss_4": 973.3330139160156,
"kl_loss_9": 283.1918441772461,
"learning_rate": 0.00036982434130084397,
"loss": 782.7954,
"step": 5880
},
{
"ce_loss_13": 3.2465739846229553,
"ce_loss_17": 3.2064101934432983,
"ce_loss_2": 4.027490735054016,
"ce_loss_4": 3.6307048916816713,
"ce_loss_9": 3.3453976035118105,
"epoch": 0.589,
"grad_norm": 820.0,
"kl_loss_13": 72.75676975250244,
"kl_loss_2": 1803.2782409667968,
"kl_loss_4": 972.25625,
"kl_loss_9": 285.99529190063475,
"learning_rate": 0.00036829305408417166,
"loss": 787.7693,
"step": 5890
},
{
"ce_loss_13": 3.2280359148979185,
"ce_loss_17": 3.188660192489624,
"ce_loss_2": 4.046233725547791,
"ce_loss_4": 3.6307573556900024,
"ce_loss_9": 3.327609062194824,
"epoch": 0.59,
"grad_norm": 824.0,
"kl_loss_13": 73.30173187255859,
"kl_loss_2": 1848.1465759277344,
"kl_loss_4": 992.1645568847656,
"kl_loss_9": 286.77287139892576,
"learning_rate": 0.0003667630931549826,
"loss": 788.664,
"step": 5900
},
{
"ce_loss_13": 3.2037394762039186,
"ce_loss_17": 3.164516258239746,
"ce_loss_2": 4.040177667140961,
"ce_loss_4": 3.6129900217056274,
"ce_loss_9": 3.3020408153533936,
"epoch": 0.591,
"grad_norm": 1192.0,
"kl_loss_13": 71.7927785873413,
"kl_loss_2": 1909.3319885253907,
"kl_loss_4": 1010.0443572998047,
"kl_loss_9": 286.49772720336915,
"learning_rate": 0.00036523447391996613,
"loss": 803.2408,
"step": 5910
},
{
"ce_loss_13": 3.286428999900818,
"ce_loss_17": 3.250199830532074,
"ce_loss_2": 4.064352822303772,
"ce_loss_4": 3.67577840089798,
"ce_loss_9": 3.3807467460632323,
"epoch": 0.592,
"grad_norm": 848.0,
"kl_loss_13": 71.02851467132568,
"kl_loss_2": 1782.7820373535155,
"kl_loss_4": 960.0286407470703,
"kl_loss_9": 279.03868942260743,
"learning_rate": 0.00036370721177230114,
"loss": 770.0045,
"step": 5920
},
{
"ce_loss_13": 3.2896131038665772,
"ce_loss_17": 3.2510338187217713,
"ce_loss_2": 4.09786479473114,
"ce_loss_4": 3.6828097462654115,
"ce_loss_9": 3.388933300971985,
"epoch": 0.593,
"grad_norm": 728.0,
"kl_loss_13": 72.76485977172851,
"kl_loss_2": 1831.8380004882813,
"kl_loss_4": 980.9190948486328,
"kl_loss_9": 285.5613159179687,
"learning_rate": 0.00036218132209150044,
"loss": 786.7089,
"step": 5930
},
{
"ce_loss_13": 3.239467215538025,
"ce_loss_17": 3.1968210339546204,
"ce_loss_2": 4.074165380001068,
"ce_loss_4": 3.661141836643219,
"ce_loss_9": 3.3476688265800476,
"epoch": 0.594,
"grad_norm": 984.0,
"kl_loss_13": 75.23736305236817,
"kl_loss_2": 1897.803240966797,
"kl_loss_4": 1030.2247222900392,
"kl_loss_9": 297.45814361572263,
"learning_rate": 0.0003606568202432562,
"loss": 802.2447,
"step": 5940
},
{
"ce_loss_13": 3.316815996170044,
"ce_loss_17": 3.277123522758484,
"ce_loss_2": 4.12262943983078,
"ce_loss_4": 3.7138426423072817,
"ce_loss_9": 3.412475216388702,
"epoch": 0.595,
"grad_norm": 876.0,
"kl_loss_13": 73.52718086242676,
"kl_loss_2": 1856.3543212890625,
"kl_loss_4": 992.5437133789062,
"kl_loss_9": 285.9757568359375,
"learning_rate": 0.0003591337215792851,
"loss": 781.4097,
"step": 5950
},
{
"ce_loss_13": 3.3531678915023804,
"ce_loss_17": 3.314804708957672,
"ce_loss_2": 4.109200823307037,
"ce_loss_4": 3.7312793254852297,
"ce_loss_9": 3.447885584831238,
"epoch": 0.596,
"grad_norm": 628.0,
"kl_loss_13": 71.84714088439941,
"kl_loss_2": 1773.0002014160157,
"kl_loss_4": 959.8357330322266,
"kl_loss_9": 279.19155349731443,
"learning_rate": 0.00035761204143717383,
"loss": 782.717,
"step": 5960
},
{
"ce_loss_13": 3.3036818742752074,
"ce_loss_17": 3.263713240623474,
"ce_loss_2": 4.092971360683441,
"ce_loss_4": 3.698699343204498,
"ce_loss_9": 3.4006860733032225,
"epoch": 0.597,
"grad_norm": 908.0,
"kl_loss_13": 72.98650817871093,
"kl_loss_2": 1813.5230285644532,
"kl_loss_4": 986.8791961669922,
"kl_loss_9": 283.56762619018554,
"learning_rate": 0.0003560917951402245,
"loss": 802.3848,
"step": 5970
},
{
"ce_loss_13": 3.285984826087952,
"ce_loss_17": 3.247555208206177,
"ce_loss_2": 4.072379291057587,
"ce_loss_4": 3.6760414361953737,
"ce_loss_9": 3.3791685938835143,
"epoch": 0.598,
"grad_norm": 900.0,
"kl_loss_13": 71.70736351013184,
"kl_loss_2": 1795.4314697265625,
"kl_loss_4": 970.2411376953125,
"kl_loss_9": 280.68557662963866,
"learning_rate": 0.00035457299799730046,
"loss": 778.7829,
"step": 5980
},
{
"ce_loss_13": 3.3455660462379457,
"ce_loss_17": 3.306633234024048,
"ce_loss_2": 4.122979462146759,
"ce_loss_4": 3.7328011989593506,
"ce_loss_9": 3.4436651349067686,
"epoch": 0.599,
"grad_norm": 792.0,
"kl_loss_13": 71.79905586242675,
"kl_loss_2": 1778.4928039550782,
"kl_loss_4": 960.0759826660156,
"kl_loss_9": 281.6575355529785,
"learning_rate": 0.0003530556653026721,
"loss": 784.7722,
"step": 5990
},
{
"ce_loss_13": 3.266416549682617,
"ce_loss_17": 3.228486883640289,
"ce_loss_2": 4.066090321540832,
"ce_loss_4": 3.647917056083679,
"ce_loss_9": 3.361822855472565,
"epoch": 0.6,
"grad_norm": 1344.0,
"kl_loss_13": 70.68609580993652,
"kl_loss_2": 1819.8283996582031,
"kl_loss_4": 955.7968322753907,
"kl_loss_9": 277.94916610717775,
"learning_rate": 0.00035153981233586274,
"loss": 786.5068,
"step": 6000
},
{
"ce_loss_13": 3.2435776352882386,
"ce_loss_17": 3.203437614440918,
"ce_loss_2": 4.040588653087616,
"ce_loss_4": 3.6361332893371583,
"ce_loss_9": 3.338526356220245,
"epoch": 0.601,
"grad_norm": 1072.0,
"kl_loss_13": 70.50964450836182,
"kl_loss_2": 1814.0566650390624,
"kl_loss_4": 977.1874572753907,
"kl_loss_9": 278.71122283935546,
"learning_rate": 0.00035002545436149473,
"loss": 807.4567,
"step": 6010
},
{
"ce_loss_13": 3.250414502620697,
"ce_loss_17": 3.211272585391998,
"ce_loss_2": 4.0653922200202945,
"ce_loss_4": 3.655634081363678,
"ce_loss_9": 3.3470831632614138,
"epoch": 0.602,
"grad_norm": 664.0,
"kl_loss_13": 74.24386062622071,
"kl_loss_2": 1860.7207580566405,
"kl_loss_4": 1006.5490264892578,
"kl_loss_9": 289.0743743896484,
"learning_rate": 0.0003485126066291364,
"loss": 781.3338,
"step": 6020
},
{
"ce_loss_13": 3.294699478149414,
"ce_loss_17": 3.253976845741272,
"ce_loss_2": 4.0994375348091125,
"ce_loss_4": 3.6909204363822936,
"ce_loss_9": 3.3906335711479185,
"epoch": 0.603,
"grad_norm": 916.0,
"kl_loss_13": 72.10275173187256,
"kl_loss_2": 1822.8707214355468,
"kl_loss_4": 977.2399291992188,
"kl_loss_9": 279.21849060058594,
"learning_rate": 0.0003470012843731476,
"loss": 787.5461,
"step": 6030
},
{
"ce_loss_13": 3.24105304479599,
"ce_loss_17": 3.2001903772354128,
"ce_loss_2": 4.045105612277984,
"ce_loss_4": 3.6382987022399904,
"ce_loss_9": 3.3370439410209656,
"epoch": 0.604,
"grad_norm": 836.0,
"kl_loss_13": 72.2528564453125,
"kl_loss_2": 1836.884100341797,
"kl_loss_4": 988.0165435791016,
"kl_loss_9": 281.7273681640625,
"learning_rate": 0.00034549150281252633,
"loss": 804.6245,
"step": 6040
},
{
"ce_loss_13": 3.2170220375061036,
"ce_loss_17": 3.1779794812202455,
"ce_loss_2": 3.995331084728241,
"ce_loss_4": 3.6100341081619263,
"ce_loss_9": 3.3146474242210386,
"epoch": 0.605,
"grad_norm": 836.0,
"kl_loss_13": 71.00680580139161,
"kl_loss_2": 1757.2800537109374,
"kl_loss_4": 960.993408203125,
"kl_loss_9": 278.24233474731443,
"learning_rate": 0.0003439832771507565,
"loss": 766.7023,
"step": 6050
},
{
"ce_loss_13": 3.228524351119995,
"ce_loss_17": 3.187285077571869,
"ce_loss_2": 4.024105882644653,
"ce_loss_4": 3.625711727142334,
"ce_loss_9": 3.323964500427246,
"epoch": 0.606,
"grad_norm": 712.0,
"kl_loss_13": 71.69733200073242,
"kl_loss_2": 1829.7399353027345,
"kl_loss_4": 992.6620208740235,
"kl_loss_9": 280.5502082824707,
"learning_rate": 0.0003424766225756537,
"loss": 780.052,
"step": 6060
},
{
"ce_loss_13": 3.285083842277527,
"ce_loss_17": 3.2451503396034242,
"ce_loss_2": 4.077549755573273,
"ce_loss_4": 3.676045870780945,
"ce_loss_9": 3.380914330482483,
"epoch": 0.607,
"grad_norm": 772.0,
"kl_loss_13": 72.76493911743164,
"kl_loss_2": 1803.9171203613282,
"kl_loss_4": 968.7603179931641,
"kl_loss_9": 281.1850296020508,
"learning_rate": 0.00034097155425921255,
"loss": 769.2323,
"step": 6070
},
{
"ce_loss_13": 3.1820239067077636,
"ce_loss_17": 3.142336893081665,
"ce_loss_2": 3.9852845311164855,
"ce_loss_4": 3.57771520614624,
"ce_loss_9": 3.278271722793579,
"epoch": 0.608,
"grad_norm": 840.0,
"kl_loss_13": 72.62495079040528,
"kl_loss_2": 1856.206787109375,
"kl_loss_4": 989.25322265625,
"kl_loss_9": 283.5368026733398,
"learning_rate": 0.0003394680873574546,
"loss": 785.9624,
"step": 6080
},
{
"ce_loss_13": 3.2822832107543944,
"ce_loss_17": 3.239939785003662,
"ce_loss_2": 4.101767385005951,
"ce_loss_4": 3.685703420639038,
"ce_loss_9": 3.382130753993988,
"epoch": 0.609,
"grad_norm": 1024.0,
"kl_loss_13": 73.62976379394532,
"kl_loss_2": 1866.8404479980468,
"kl_loss_4": 996.1072570800782,
"kl_loss_9": 283.988956451416,
"learning_rate": 0.0003379662370102747,
"loss": 784.3762,
"step": 6090
},
{
"ce_loss_13": 3.296636772155762,
"ce_loss_17": 3.2578766703605653,
"ce_loss_2": 4.068477404117584,
"ce_loss_4": 3.681435239315033,
"ce_loss_9": 3.391046917438507,
"epoch": 0.61,
"grad_norm": 820.0,
"kl_loss_13": 71.44107837677002,
"kl_loss_2": 1790.297344970703,
"kl_loss_4": 970.0491180419922,
"kl_loss_9": 278.9227890014648,
"learning_rate": 0.0003364660183412892,
"loss": 782.6667,
"step": 6100
},
{
"ce_loss_13": 3.279218602180481,
"ce_loss_17": 3.239975702762604,
"ce_loss_2": 4.057258355617523,
"ce_loss_4": 3.662726712226868,
"ce_loss_9": 3.3751845836639403,
"epoch": 0.611,
"grad_norm": 940.0,
"kl_loss_13": 71.98993682861328,
"kl_loss_2": 1803.81904296875,
"kl_loss_4": 971.1891784667969,
"kl_loss_9": 282.17127532958983,
"learning_rate": 0.0003349674464576834,
"loss": 789.9217,
"step": 6110
},
{
"ce_loss_13": 3.228991961479187,
"ce_loss_17": 3.1899941682815554,
"ce_loss_2": 4.037423419952392,
"ce_loss_4": 3.6243603825569153,
"ce_loss_9": 3.326341247558594,
"epoch": 0.612,
"grad_norm": 772.0,
"kl_loss_13": 72.28171844482422,
"kl_loss_2": 1837.7317016601562,
"kl_loss_4": 978.088656616211,
"kl_loss_9": 281.76746063232423,
"learning_rate": 0.00033347053645005966,
"loss": 770.0652,
"step": 6120
},
{
"ce_loss_13": 3.331780731678009,
"ce_loss_17": 3.292486870288849,
"ce_loss_2": 4.094799375534057,
"ce_loss_4": 3.7155101776123045,
"ce_loss_9": 3.4249905228614805,
"epoch": 0.613,
"grad_norm": 1040.0,
"kl_loss_13": 71.58653755187989,
"kl_loss_2": 1745.4302124023438,
"kl_loss_4": 957.3974456787109,
"kl_loss_9": 277.33105697631834,
"learning_rate": 0.00033197530339228485,
"loss": 779.0169,
"step": 6130
},
{
"ce_loss_13": 3.2880977034568786,
"ce_loss_17": 3.248146677017212,
"ce_loss_2": 4.074892008304596,
"ce_loss_4": 3.6834922194480897,
"ce_loss_9": 3.387561297416687,
"epoch": 0.614,
"grad_norm": 756.0,
"kl_loss_13": 72.65009574890136,
"kl_loss_2": 1790.276544189453,
"kl_loss_4": 975.4449920654297,
"kl_loss_9": 283.23338775634767,
"learning_rate": 0.00033048176234133967,
"loss": 775.951,
"step": 6140
},
{
"ce_loss_13": 3.2806970953941343,
"ce_loss_17": 3.2423112869262694,
"ce_loss_2": 4.056227326393127,
"ce_loss_4": 3.6680848240852355,
"ce_loss_9": 3.37451308965683,
"epoch": 0.615,
"grad_norm": 944.0,
"kl_loss_13": 72.70654010772705,
"kl_loss_2": 1784.218359375,
"kl_loss_4": 975.8216552734375,
"kl_loss_9": 283.3952438354492,
"learning_rate": 0.0003289899283371657,
"loss": 784.4755,
"step": 6150
},
{
"ce_loss_13": 3.2955815315246584,
"ce_loss_17": 3.255915582180023,
"ce_loss_2": 4.095048213005066,
"ce_loss_4": 3.6850364565849305,
"ce_loss_9": 3.3905885100364683,
"epoch": 0.616,
"grad_norm": 1240.0,
"kl_loss_13": 72.33249015808106,
"kl_loss_2": 1809.0255187988282,
"kl_loss_4": 961.4775451660156,
"kl_loss_9": 279.44675674438474,
"learning_rate": 0.0003274998164025148,
"loss": 792.1812,
"step": 6160
},
{
"ce_loss_13": 3.328199291229248,
"ce_loss_17": 3.2899797320365907,
"ce_loss_2": 4.103036093711853,
"ce_loss_4": 3.7185376644134522,
"ce_loss_9": 3.4223578333854676,
"epoch": 0.617,
"grad_norm": 952.0,
"kl_loss_13": 73.33154029846192,
"kl_loss_2": 1780.0444702148438,
"kl_loss_4": 974.3755706787109,
"kl_loss_9": 282.3663688659668,
"learning_rate": 0.0003260114415427975,
"loss": 793.5954,
"step": 6170
},
{
"ce_loss_13": 3.252111566066742,
"ce_loss_17": 3.212690567970276,
"ce_loss_2": 4.058794844150543,
"ce_loss_4": 3.6559091925621034,
"ce_loss_9": 3.350780153274536,
"epoch": 0.618,
"grad_norm": 928.0,
"kl_loss_13": 72.44158325195312,
"kl_loss_2": 1846.1583312988282,
"kl_loss_4": 986.2647827148437,
"kl_loss_9": 280.55818099975585,
"learning_rate": 0.0003245248187459323,
"loss": 801.7746,
"step": 6180
},
{
"ce_loss_13": 3.240708112716675,
"ce_loss_17": 3.2041526675224303,
"ce_loss_2": 4.004272377490997,
"ce_loss_4": 3.614217388629913,
"ce_loss_9": 3.3318390011787415,
"epoch": 0.619,
"grad_norm": 728.0,
"kl_loss_13": 68.6929853439331,
"kl_loss_2": 1759.1183471679688,
"kl_loss_4": 946.2114318847656,
"kl_loss_9": 271.18813400268556,
"learning_rate": 0.00032303996298219416,
"loss": 764.9716,
"step": 6190
},
{
"ce_loss_13": 3.3225024580955504,
"ce_loss_17": 3.2831245183944704,
"ce_loss_2": 4.081291854381561,
"ce_loss_4": 3.7007321953773498,
"ce_loss_9": 3.4160247802734376,
"epoch": 0.62,
"grad_norm": 836.0,
"kl_loss_13": 71.11275367736816,
"kl_loss_2": 1734.1385681152344,
"kl_loss_4": 945.437710571289,
"kl_loss_9": 272.4640754699707,
"learning_rate": 0.00032155688920406414,
"loss": 760.9354,
"step": 6200
},
{
"ce_loss_13": 3.236972117424011,
"ce_loss_17": 3.1968539357185364,
"ce_loss_2": 4.053024530410767,
"ce_loss_4": 3.636395263671875,
"ce_loss_9": 3.3320537924766542,
"epoch": 0.621,
"grad_norm": 912.0,
"kl_loss_13": 72.77989196777344,
"kl_loss_2": 1853.1631958007813,
"kl_loss_4": 989.524203491211,
"kl_loss_9": 284.0096466064453,
"learning_rate": 0.0003200756123460788,
"loss": 803.549,
"step": 6210
},
{
"ce_loss_13": 3.2674758195877076,
"ce_loss_17": 3.226785624027252,
"ce_loss_2": 4.08230230808258,
"ce_loss_4": 3.671753633022308,
"ce_loss_9": 3.367294156551361,
"epoch": 0.622,
"grad_norm": 1064.0,
"kl_loss_13": 73.93520374298096,
"kl_loss_2": 1856.7218139648437,
"kl_loss_4": 1001.2637176513672,
"kl_loss_9": 288.70165252685547,
"learning_rate": 0.00031859614732467957,
"loss": 802.2571,
"step": 6220
},
{
"ce_loss_13": 3.3192309975624084,
"ce_loss_17": 3.2794345736503603,
"ce_loss_2": 4.087488758563995,
"ce_loss_4": 3.6964677214622497,
"ce_loss_9": 3.4120842099189757,
"epoch": 0.623,
"grad_norm": 992.0,
"kl_loss_13": 71.44276008605956,
"kl_loss_2": 1754.6969421386718,
"kl_loss_4": 947.3564575195312,
"kl_loss_9": 274.8693054199219,
"learning_rate": 0.00031711850903806275,
"loss": 765.0162,
"step": 6230
},
{
"ce_loss_13": 3.224324369430542,
"ce_loss_17": 3.1861729264259337,
"ce_loss_2": 4.034242498874664,
"ce_loss_4": 3.627521336078644,
"ce_loss_9": 3.32495493888855,
"epoch": 0.624,
"grad_norm": 812.0,
"kl_loss_13": 73.42914161682128,
"kl_loss_2": 1844.0267211914063,
"kl_loss_4": 993.9605987548828,
"kl_loss_9": 288.5894271850586,
"learning_rate": 0.0003156427123660297,
"loss": 782.3298,
"step": 6240
},
{
"ce_loss_13": 3.308632123470306,
"ce_loss_17": 3.269201564788818,
"ce_loss_2": 4.075583910942077,
"ce_loss_4": 3.693323624134064,
"ce_loss_9": 3.4053150177001954,
"epoch": 0.625,
"grad_norm": 964.0,
"kl_loss_13": 71.81601142883301,
"kl_loss_2": 1760.6226501464844,
"kl_loss_4": 959.6227325439453,
"kl_loss_9": 277.5846717834473,
"learning_rate": 0.0003141687721698363,
"loss": 778.7586,
"step": 6250
},
{
"ce_loss_13": 3.285798990726471,
"ce_loss_17": 3.246725380420685,
"ce_loss_2": 4.028945982456207,
"ce_loss_4": 3.649042856693268,
"ce_loss_9": 3.3741363763809202,
"epoch": 0.626,
"grad_norm": 932.0,
"kl_loss_13": 69.40913047790528,
"kl_loss_2": 1709.059881591797,
"kl_loss_4": 922.3430023193359,
"kl_loss_9": 265.99488525390626,
"learning_rate": 0.00031269670329204396,
"loss": 764.9511,
"step": 6260
},
{
"ce_loss_13": 3.318275809288025,
"ce_loss_17": 3.2813219666481017,
"ce_loss_2": 4.072272872924804,
"ce_loss_4": 3.6981289267539976,
"ce_loss_9": 3.4113965749740602,
"epoch": 0.627,
"grad_norm": 904.0,
"kl_loss_13": 71.68196277618408,
"kl_loss_2": 1742.3362915039063,
"kl_loss_4": 950.3534515380859,
"kl_loss_9": 277.0380157470703,
"learning_rate": 0.00031122652055637015,
"loss": 774.6862,
"step": 6270
},
{
"ce_loss_13": 3.2794430017471314,
"ce_loss_17": 3.2412443280220034,
"ce_loss_2": 4.075256025791168,
"ce_loss_4": 3.671683204174042,
"ce_loss_9": 3.3764753937721252,
"epoch": 0.628,
"grad_norm": 932.0,
"kl_loss_13": 72.09937229156495,
"kl_loss_2": 1823.369482421875,
"kl_loss_4": 986.5869415283203,
"kl_loss_9": 282.3317504882813,
"learning_rate": 0.0003097582387675385,
"loss": 774.2291,
"step": 6280
},
{
"ce_loss_13": 3.3189268708229065,
"ce_loss_17": 3.2824313163757326,
"ce_loss_2": 4.103376257419586,
"ce_loss_4": 3.7095051646232604,
"ce_loss_9": 3.415855610370636,
"epoch": 0.629,
"grad_norm": 856.0,
"kl_loss_13": 71.44106979370117,
"kl_loss_2": 1805.623046875,
"kl_loss_4": 971.4082427978516,
"kl_loss_9": 279.6691734313965,
"learning_rate": 0.00030829187271113034,
"loss": 774.3185,
"step": 6290
},
{
"ce_loss_13": 3.313884997367859,
"ce_loss_17": 3.2744386076927183,
"ce_loss_2": 4.0814232468605045,
"ce_loss_4": 3.692212975025177,
"ce_loss_9": 3.40540634393692,
"epoch": 0.63,
"grad_norm": 988.0,
"kl_loss_13": 71.26968612670899,
"kl_loss_2": 1759.0361206054688,
"kl_loss_4": 956.8941284179688,
"kl_loss_9": 275.8732261657715,
"learning_rate": 0.00030682743715343565,
"loss": 782.9761,
"step": 6300
},
{
"ce_loss_13": 3.2659210562705994,
"ce_loss_17": 3.223229229450226,
"ce_loss_2": 4.058343923091888,
"ce_loss_4": 3.6658867835998534,
"ce_loss_9": 3.3643734216690064,
"epoch": 0.631,
"grad_norm": 660.0,
"kl_loss_13": 74.21347732543946,
"kl_loss_2": 1803.3361572265626,
"kl_loss_4": 977.7171142578125,
"kl_loss_9": 284.70903701782225,
"learning_rate": 0.0003053649468413043,
"loss": 791.4564,
"step": 6310
},
{
"ce_loss_13": 3.373299503326416,
"ce_loss_17": 3.3342126488685606,
"ce_loss_2": 4.143603193759918,
"ce_loss_4": 3.7604181289672853,
"ce_loss_9": 3.4721192598342894,
"epoch": 0.632,
"grad_norm": 1032.0,
"kl_loss_13": 72.66786975860596,
"kl_loss_2": 1778.5544677734374,
"kl_loss_4": 964.4415985107422,
"kl_loss_9": 281.6828071594238,
"learning_rate": 0.00030390441650199725,
"loss": 774.1748,
"step": 6320
},
{
"ce_loss_13": 3.277493715286255,
"ce_loss_17": 3.2391820192337035,
"ce_loss_2": 4.05932000875473,
"ce_loss_4": 3.664527249336243,
"ce_loss_9": 3.3744643330574036,
"epoch": 0.633,
"grad_norm": 1016.0,
"kl_loss_13": 70.67644805908203,
"kl_loss_2": 1778.2772399902344,
"kl_loss_4": 962.4036499023438,
"kl_loss_9": 278.81672897338865,
"learning_rate": 0.00030244586084303903,
"loss": 770.6784,
"step": 6330
},
{
"ce_loss_13": 3.244217538833618,
"ce_loss_17": 3.206530499458313,
"ce_loss_2": 4.046750092506409,
"ce_loss_4": 3.65035218000412,
"ce_loss_9": 3.341222083568573,
"epoch": 0.634,
"grad_norm": 756.0,
"kl_loss_13": 71.81494064331055,
"kl_loss_2": 1835.2865966796876,
"kl_loss_4": 1000.7807067871094,
"kl_loss_9": 283.35120391845703,
"learning_rate": 0.00030098929455206903,
"loss": 775.5071,
"step": 6340
},
{
"ce_loss_13": 3.252631652355194,
"ce_loss_17": 3.215214204788208,
"ce_loss_2": 4.041942119598389,
"ce_loss_4": 3.6305495619773867,
"ce_loss_9": 3.3447246193885802,
"epoch": 0.635,
"grad_norm": 784.0,
"kl_loss_13": 70.43671970367431,
"kl_loss_2": 1820.333349609375,
"kl_loss_4": 969.7455963134765,
"kl_loss_9": 277.15399780273435,
"learning_rate": 0.00029953473229669324,
"loss": 797.8032,
"step": 6350
},
{
"ce_loss_13": 3.281371021270752,
"ce_loss_17": 3.2443987011909483,
"ce_loss_2": 4.0675465822219845,
"ce_loss_4": 3.678115463256836,
"ce_loss_9": 3.3796138882637026,
"epoch": 0.636,
"grad_norm": 1304.0,
"kl_loss_13": 71.03854198455811,
"kl_loss_2": 1794.914044189453,
"kl_loss_4": 974.7052612304688,
"kl_loss_9": 282.9618148803711,
"learning_rate": 0.00029808218872433767,
"loss": 771.7335,
"step": 6360
},
{
"ce_loss_13": 3.3407238960266112,
"ce_loss_17": 3.3002508759498594,
"ce_loss_2": 4.118294715881348,
"ce_loss_4": 3.7223580479621887,
"ce_loss_9": 3.4371691823005674,
"epoch": 0.637,
"grad_norm": 996.0,
"kl_loss_13": 71.89597148895264,
"kl_loss_2": 1791.3907897949218,
"kl_loss_4": 955.3046051025391,
"kl_loss_9": 277.70820846557615,
"learning_rate": 0.0002966316784621,
"loss": 765.5395,
"step": 6370
},
{
"ce_loss_13": 3.2532244086265565,
"ce_loss_17": 3.213400721549988,
"ce_loss_2": 4.059457647800445,
"ce_loss_4": 3.6510413765907286,
"ce_loss_9": 3.349591112136841,
"epoch": 0.638,
"grad_norm": 836.0,
"kl_loss_13": 72.27867050170899,
"kl_loss_2": 1827.22919921875,
"kl_loss_4": 989.6232208251953,
"kl_loss_9": 285.76849670410155,
"learning_rate": 0.0002951832161166024,
"loss": 776.5623,
"step": 6380
},
{
"ce_loss_13": 3.3263792872428892,
"ce_loss_17": 3.287059986591339,
"ce_loss_2": 4.109494614601135,
"ce_loss_4": 3.719343626499176,
"ce_loss_9": 3.425124990940094,
"epoch": 0.639,
"grad_norm": 812.0,
"kl_loss_13": 73.35769863128662,
"kl_loss_2": 1783.822198486328,
"kl_loss_4": 967.2407745361328,
"kl_loss_9": 281.5798004150391,
"learning_rate": 0.0002937368162738445,
"loss": 763.2839,
"step": 6390
},
{
"ce_loss_13": 3.2767720222473145,
"ce_loss_17": 3.238836967945099,
"ce_loss_2": 4.0461488485336305,
"ce_loss_4": 3.654835057258606,
"ce_loss_9": 3.3681108474731447,
"epoch": 0.64,
"grad_norm": 948.0,
"kl_loss_13": 69.1386381149292,
"kl_loss_2": 1787.5851501464845,
"kl_loss_4": 960.3883850097657,
"kl_loss_9": 271.1270004272461,
"learning_rate": 0.0002922924934990568,
"loss": 779.4924,
"step": 6400
},
{
"ce_loss_13": 3.21414737701416,
"ce_loss_17": 3.173914396762848,
"ce_loss_2": 4.030556344985962,
"ce_loss_4": 3.611742639541626,
"ce_loss_9": 3.309198236465454,
"epoch": 0.641,
"grad_norm": 804.0,
"kl_loss_13": 71.55045490264892,
"kl_loss_2": 1865.8428894042968,
"kl_loss_4": 997.1090881347657,
"kl_loss_9": 281.19031829833983,
"learning_rate": 0.0002908502623365536,
"loss": 787.647,
"step": 6410
},
{
"ce_loss_13": 3.142470693588257,
"ce_loss_17": 3.10512797832489,
"ce_loss_2": 3.9646692752838133,
"ce_loss_4": 3.547313940525055,
"ce_loss_9": 3.2446085810661316,
"epoch": 0.642,
"grad_norm": 848.0,
"kl_loss_13": 69.80724754333497,
"kl_loss_2": 1873.81376953125,
"kl_loss_4": 1001.1044128417968,
"kl_loss_9": 281.8624671936035,
"learning_rate": 0.0002894101373095867,
"loss": 791.2895,
"step": 6420
},
{
"ce_loss_13": 3.355082380771637,
"ce_loss_17": 3.3160241842269897,
"ce_loss_2": 4.120398831367493,
"ce_loss_4": 3.737993931770325,
"ce_loss_9": 3.4494681477546694,
"epoch": 0.643,
"grad_norm": 1040.0,
"kl_loss_13": 72.17458419799804,
"kl_loss_2": 1769.3141662597657,
"kl_loss_4": 963.356201171875,
"kl_loss_9": 278.2576614379883,
"learning_rate": 0.00028797213292019926,
"loss": 770.181,
"step": 6430
},
{
"ce_loss_13": 3.3366548657417296,
"ce_loss_17": 3.295503115653992,
"ce_loss_2": 4.1048006296157835,
"ce_loss_4": 3.7235260963439942,
"ce_loss_9": 3.4315634846687315,
"epoch": 0.644,
"grad_norm": 940.0,
"kl_loss_13": 72.5972915649414,
"kl_loss_2": 1776.367205810547,
"kl_loss_4": 970.1185089111328,
"kl_loss_9": 280.8289596557617,
"learning_rate": 0.0002865362636490791,
"loss": 788.4355,
"step": 6440
},
{
"ce_loss_13": 3.3477864146232603,
"ce_loss_17": 3.310180294513702,
"ce_loss_2": 4.12326066493988,
"ce_loss_4": 3.7306790232658384,
"ce_loss_9": 3.44403954744339,
"epoch": 0.645,
"grad_norm": 924.0,
"kl_loss_13": 71.45477027893067,
"kl_loss_2": 1776.1947631835938,
"kl_loss_4": 955.2560546875,
"kl_loss_9": 276.38500061035154,
"learning_rate": 0.0002851025439554142,
"loss": 764.4826,
"step": 6450
},
{
"ce_loss_13": 3.3339331269264223,
"ce_loss_17": 3.2946556568145753,
"ce_loss_2": 4.094459521770477,
"ce_loss_4": 3.7216048359870912,
"ce_loss_9": 3.4312340378761292,
"epoch": 0.646,
"grad_norm": 904.0,
"kl_loss_13": 72.28151435852051,
"kl_loss_2": 1738.59453125,
"kl_loss_4": 967.1679748535156,
"kl_loss_9": 280.8125907897949,
"learning_rate": 0.00028367098827674573,
"loss": 767.1638,
"step": 6460
},
{
"ce_loss_13": 3.262351620197296,
"ce_loss_17": 3.225614595413208,
"ce_loss_2": 4.04227089881897,
"ce_loss_4": 3.643923032283783,
"ce_loss_9": 3.3568783521652223,
"epoch": 0.647,
"grad_norm": 824.0,
"kl_loss_13": 70.0902738571167,
"kl_loss_2": 1774.7165893554688,
"kl_loss_4": 950.2580627441406,
"kl_loss_9": 275.03301162719725,
"learning_rate": 0.00028224161102882397,
"loss": 771.2599,
"step": 6470
},
{
"ce_loss_13": 3.2347336411476135,
"ce_loss_17": 3.200877916812897,
"ce_loss_2": 4.004123604297638,
"ce_loss_4": 3.6204243063926698,
"ce_loss_9": 3.328202188014984,
"epoch": 0.648,
"grad_norm": 1464.0,
"kl_loss_13": 69.69904022216797,
"kl_loss_2": 1754.0138366699218,
"kl_loss_4": 955.4148651123047,
"kl_loss_9": 272.5667182922363,
"learning_rate": 0.00028081442660546124,
"loss": 771.9115,
"step": 6480
},
{
"ce_loss_13": 3.305151629447937,
"ce_loss_17": 3.2654871940612793,
"ce_loss_2": 4.065175533294678,
"ce_loss_4": 3.6872007489204406,
"ce_loss_9": 3.3978879928588865,
"epoch": 0.649,
"grad_norm": 788.0,
"kl_loss_13": 72.08305015563965,
"kl_loss_2": 1752.1914611816405,
"kl_loss_4": 951.0301177978515,
"kl_loss_9": 277.8635765075684,
"learning_rate": 0.0002793894493783892,
"loss": 769.7433,
"step": 6490
},
{
"ce_loss_13": 3.322451424598694,
"ce_loss_17": 3.284602701663971,
"ce_loss_2": 4.086758828163147,
"ce_loss_4": 3.695036458969116,
"ce_loss_9": 3.4139219641685488,
"epoch": 0.65,
"grad_norm": 820.0,
"kl_loss_13": 70.80225067138672,
"kl_loss_2": 1760.5633239746094,
"kl_loss_4": 943.2197174072265,
"kl_loss_9": 270.98013763427736,
"learning_rate": 0.0002779666936971129,
"loss": 761.3337,
"step": 6500
},
{
"ce_loss_13": 3.3305124044418335,
"ce_loss_17": 3.2916394114494323,
"ce_loss_2": 4.117040920257568,
"ce_loss_4": 3.7206697344779966,
"ce_loss_9": 3.4260012030601503,
"epoch": 0.651,
"grad_norm": 912.0,
"kl_loss_13": 71.9400619506836,
"kl_loss_2": 1809.14013671875,
"kl_loss_4": 977.0250427246094,
"kl_loss_9": 281.1179428100586,
"learning_rate": 0.00027654617388876614,
"loss": 785.1118,
"step": 6510
},
{
"ce_loss_13": 3.3561911463737486,
"ce_loss_17": 3.3163033366203307,
"ce_loss_2": 4.123410153388977,
"ce_loss_4": 3.734295201301575,
"ce_loss_9": 3.4477269887924193,
"epoch": 0.652,
"grad_norm": 792.0,
"kl_loss_13": 72.00549278259277,
"kl_loss_2": 1767.6321655273437,
"kl_loss_4": 956.7952453613282,
"kl_loss_9": 276.63285598754885,
"learning_rate": 0.0002751279042579672,
"loss": 769.8588,
"step": 6520
},
{
"ce_loss_13": 3.2965205788612364,
"ce_loss_17": 3.258060562610626,
"ce_loss_2": 4.066187536716461,
"ce_loss_4": 3.6735449194908143,
"ce_loss_9": 3.3904851913452148,
"epoch": 0.653,
"grad_norm": 1016.0,
"kl_loss_13": 70.93943119049072,
"kl_loss_2": 1765.4940246582032,
"kl_loss_4": 948.104296875,
"kl_loss_9": 273.96680755615233,
"learning_rate": 0.00027371189908667604,
"loss": 780.4364,
"step": 6530
},
{
"ce_loss_13": 3.3481835246086122,
"ce_loss_17": 3.3083571434020995,
"ce_loss_2": 4.160701036453247,
"ce_loss_4": 3.751006531715393,
"ce_loss_9": 3.4436028599739075,
"epoch": 0.654,
"grad_norm": 864.0,
"kl_loss_13": 73.45802383422851,
"kl_loss_2": 1832.6454040527344,
"kl_loss_4": 981.995571899414,
"kl_loss_9": 284.39140167236326,
"learning_rate": 0.00027229817263404863,
"loss": 796.8684,
"step": 6540
},
{
"ce_loss_13": 3.330435812473297,
"ce_loss_17": 3.2924392580986024,
"ce_loss_2": 4.067032384872436,
"ce_loss_4": 3.698940932750702,
"ce_loss_9": 3.423515808582306,
"epoch": 0.655,
"grad_norm": 840.0,
"kl_loss_13": 71.07789249420166,
"kl_loss_2": 1709.2124084472657,
"kl_loss_4": 935.6491851806641,
"kl_loss_9": 273.2137001037598,
"learning_rate": 0.0002708867391362948,
"loss": 762.9658,
"step": 6550
},
{
"ce_loss_13": 3.3154842495918273,
"ce_loss_17": 3.277799355983734,
"ce_loss_2": 4.051917147636414,
"ce_loss_4": 3.676171064376831,
"ce_loss_9": 3.402910828590393,
"epoch": 0.656,
"grad_norm": 856.0,
"kl_loss_13": 69.70827026367188,
"kl_loss_2": 1708.5784423828125,
"kl_loss_4": 915.2664428710938,
"kl_loss_9": 267.1766471862793,
"learning_rate": 0.0002694776128065345,
"loss": 762.0438,
"step": 6560
},
{
"ce_loss_13": 3.248313331604004,
"ce_loss_17": 3.210926651954651,
"ce_loss_2": 4.031670546531677,
"ce_loss_4": 3.640976977348328,
"ce_loss_9": 3.343873071670532,
"epoch": 0.657,
"grad_norm": 696.0,
"kl_loss_13": 70.72250576019287,
"kl_loss_2": 1804.789971923828,
"kl_loss_4": 979.3676391601563,
"kl_loss_9": 282.7828567504883,
"learning_rate": 0.00026807080783465374,
"loss": 769.7072,
"step": 6570
},
{
"ce_loss_13": 3.353252899646759,
"ce_loss_17": 3.3145036935806274,
"ce_loss_2": 4.141715168952942,
"ce_loss_4": 3.7431769251823424,
"ce_loss_9": 3.450781464576721,
"epoch": 0.658,
"grad_norm": 724.0,
"kl_loss_13": 72.77521324157715,
"kl_loss_2": 1798.7067626953126,
"kl_loss_4": 975.6858001708985,
"kl_loss_9": 282.0204391479492,
"learning_rate": 0.00026666633838716316,
"loss": 785.6488,
"step": 6580
},
{
"ce_loss_13": 3.2512298226356506,
"ce_loss_17": 3.209996056556702,
"ce_loss_2": 4.05036506652832,
"ce_loss_4": 3.6514967322349547,
"ce_loss_9": 3.35011887550354,
"epoch": 0.659,
"grad_norm": 780.0,
"kl_loss_13": 72.99883079528809,
"kl_loss_2": 1824.5407653808593,
"kl_loss_4": 993.5420104980469,
"kl_loss_9": 284.9843505859375,
"learning_rate": 0.00026526421860705474,
"loss": 793.5008,
"step": 6590
},
{
"ce_loss_13": 3.275307810306549,
"ce_loss_17": 3.234655296802521,
"ce_loss_2": 4.0622793674469,
"ce_loss_4": 3.665205705165863,
"ce_loss_9": 3.372453773021698,
"epoch": 0.66,
"grad_norm": 800.0,
"kl_loss_13": 72.58459072113037,
"kl_loss_2": 1791.7223022460937,
"kl_loss_4": 965.9852325439454,
"kl_loss_9": 280.37607803344724,
"learning_rate": 0.0002638644626136587,
"loss": 769.9592,
"step": 6600
},
{
"ce_loss_13": 3.2888129353523254,
"ce_loss_17": 3.251253354549408,
"ce_loss_2": 4.072038042545318,
"ce_loss_4": 3.6752144932746886,
"ce_loss_9": 3.3817059636116027,
"epoch": 0.661,
"grad_norm": 816.0,
"kl_loss_13": 70.77959651947022,
"kl_loss_2": 1782.0645324707032,
"kl_loss_4": 968.8410400390625,
"kl_loss_9": 276.794002532959,
"learning_rate": 0.00026246708450250255,
"loss": 774.9196,
"step": 6610
},
{
"ce_loss_13": 3.2809179782867433,
"ce_loss_17": 3.2430922031402587,
"ce_loss_2": 4.0411675572395325,
"ce_loss_4": 3.660265898704529,
"ce_loss_9": 3.372435915470123,
"epoch": 0.662,
"grad_norm": 972.0,
"kl_loss_13": 71.35952968597412,
"kl_loss_2": 1753.348388671875,
"kl_loss_4": 947.7372283935547,
"kl_loss_9": 274.24386978149414,
"learning_rate": 0.00026107209834516854,
"loss": 769.384,
"step": 6620
},
{
"ce_loss_13": 3.2341336011886597,
"ce_loss_17": 3.195348930358887,
"ce_loss_2": 4.057987570762634,
"ce_loss_4": 3.636424171924591,
"ce_loss_9": 3.3336671113967897,
"epoch": 0.663,
"grad_norm": 636.0,
"kl_loss_13": 72.059228515625,
"kl_loss_2": 1867.8700805664062,
"kl_loss_4": 991.2097351074219,
"kl_loss_9": 283.4357620239258,
"learning_rate": 0.0002596795181891514,
"loss": 794.6483,
"step": 6630
},
{
"ce_loss_13": 3.23643182516098,
"ce_loss_17": 3.1967314720153808,
"ce_loss_2": 4.034952008724213,
"ce_loss_4": 3.6355783104896546,
"ce_loss_9": 3.336190640926361,
"epoch": 0.664,
"grad_norm": 1104.0,
"kl_loss_13": 72.54963207244873,
"kl_loss_2": 1816.6078491210938,
"kl_loss_4": 983.9069519042969,
"kl_loss_9": 283.86315460205077,
"learning_rate": 0.000258289358057718,
"loss": 812.8871,
"step": 6640
},
{
"ce_loss_13": 3.312631404399872,
"ce_loss_17": 3.2709787249565125,
"ce_loss_2": 4.105361819267273,
"ce_loss_4": 3.7066175818443297,
"ce_loss_9": 3.4137498497962953,
"epoch": 0.665,
"grad_norm": 880.0,
"kl_loss_13": 73.74160442352294,
"kl_loss_2": 1820.5200256347657,
"kl_loss_4": 976.478402709961,
"kl_loss_9": 286.28767547607424,
"learning_rate": 0.0002569016319497657,
"loss": 788.9465,
"step": 6650
},
{
"ce_loss_13": 3.2948471903800964,
"ce_loss_17": 3.2543782114982607,
"ce_loss_2": 4.078556907176972,
"ce_loss_4": 3.686791181564331,
"ce_loss_9": 3.3982311129570006,
"epoch": 0.666,
"grad_norm": 812.0,
"kl_loss_13": 73.74290733337402,
"kl_loss_2": 1815.1135498046874,
"kl_loss_4": 980.0942779541016,
"kl_loss_9": 287.9010688781738,
"learning_rate": 0.00025551635383968066,
"loss": 793.4953,
"step": 6660
},
{
"ce_loss_13": 3.213699662685394,
"ce_loss_17": 3.1740002036094666,
"ce_loss_2": 4.008650445938111,
"ce_loss_4": 3.6066946148872376,
"ce_loss_9": 3.3078731894493103,
"epoch": 0.667,
"grad_norm": 920.0,
"kl_loss_13": 71.81263065338135,
"kl_loss_2": 1831.4727355957032,
"kl_loss_4": 988.4091125488281,
"kl_loss_9": 283.7192153930664,
"learning_rate": 0.00025413353767719804,
"loss": 787.2924,
"step": 6670
},
{
"ce_loss_13": 3.2675450086593627,
"ce_loss_17": 3.230876660346985,
"ce_loss_2": 4.050916016101837,
"ce_loss_4": 3.6535902619361877,
"ce_loss_9": 3.3607989072799684,
"epoch": 0.668,
"grad_norm": 876.0,
"kl_loss_13": 70.10343036651611,
"kl_loss_2": 1803.5049194335938,
"kl_loss_4": 975.379360961914,
"kl_loss_9": 275.31723861694337,
"learning_rate": 0.0002527531973872617,
"loss": 783.0383,
"step": 6680
},
{
"ce_loss_13": 3.2812340021133424,
"ce_loss_17": 3.2442690253257753,
"ce_loss_2": 4.050098311901093,
"ce_loss_4": 3.6664043426513673,
"ce_loss_9": 3.3740264654159544,
"epoch": 0.669,
"grad_norm": 1096.0,
"kl_loss_13": 70.2721643447876,
"kl_loss_2": 1771.3466003417968,
"kl_loss_4": 963.0172088623046,
"kl_loss_9": 275.112654876709,
"learning_rate": 0.0002513753468698826,
"loss": 772.7548,
"step": 6690
},
{
"ce_loss_13": 3.251974892616272,
"ce_loss_17": 3.2128549456596374,
"ce_loss_2": 4.043203258514405,
"ce_loss_4": 3.643755042552948,
"ce_loss_9": 3.345508944988251,
"epoch": 0.67,
"grad_norm": 880.0,
"kl_loss_13": 71.74250259399415,
"kl_loss_2": 1831.1538635253905,
"kl_loss_4": 983.5442596435547,
"kl_loss_9": 282.8565361022949,
"learning_rate": 0.0002500000000000001,
"loss": 786.5076,
"step": 6700
},
{
"ce_loss_13": 3.3693856596946716,
"ce_loss_17": 3.3300490021705627,
"ce_loss_2": 4.101691889762878,
"ce_loss_4": 3.727284145355225,
"ce_loss_9": 3.4568962335586546,
"epoch": 0.671,
"grad_norm": 772.0,
"kl_loss_13": 70.97701282501221,
"kl_loss_2": 1705.0662841796875,
"kl_loss_4": 922.4608154296875,
"kl_loss_9": 270.672705078125,
"learning_rate": 0.0002486271706273421,
"loss": 782.0349,
"step": 6710
},
{
"ce_loss_13": 3.3036414980888367,
"ce_loss_17": 3.267821168899536,
"ce_loss_2": 4.045556378364563,
"ce_loss_4": 3.670479393005371,
"ce_loss_9": 3.392921674251556,
"epoch": 0.672,
"grad_norm": 1536.0,
"kl_loss_13": 70.02148818969727,
"kl_loss_2": 1703.5993408203126,
"kl_loss_4": 918.4036315917969,
"kl_loss_9": 267.0767433166504,
"learning_rate": 0.0002472568725762853,
"loss": 763.772,
"step": 6720
},
{
"ce_loss_13": 3.2973303318023683,
"ce_loss_17": 3.25869699716568,
"ce_loss_2": 4.035550963878632,
"ce_loss_4": 3.6621091604232787,
"ce_loss_9": 3.388404607772827,
"epoch": 0.673,
"grad_norm": 972.0,
"kl_loss_13": 69.29530754089356,
"kl_loss_2": 1722.9706298828125,
"kl_loss_4": 927.8394775390625,
"kl_loss_9": 266.9626518249512,
"learning_rate": 0.00024588911964571554,
"loss": 758.6145,
"step": 6730
},
{
"ce_loss_13": 3.3077115178108216,
"ce_loss_17": 3.2654620885848997,
"ce_loss_2": 4.116430401802063,
"ce_loss_4": 3.7116676926612855,
"ce_loss_9": 3.4078907489776613,
"epoch": 0.674,
"grad_norm": 1072.0,
"kl_loss_13": 75.20856895446778,
"kl_loss_2": 1833.9399047851562,
"kl_loss_4": 993.9445495605469,
"kl_loss_9": 289.59678802490237,
"learning_rate": 0.00024452392560888974,
"loss": 779.83,
"step": 6740
},
{
"ce_loss_13": 3.202228772640228,
"ce_loss_17": 3.1627680897712707,
"ce_loss_2": 3.9784467577934266,
"ce_loss_4": 3.589304792881012,
"ce_loss_9": 3.301189970970154,
"epoch": 0.675,
"grad_norm": 840.0,
"kl_loss_13": 69.91371078491211,
"kl_loss_2": 1786.9942443847656,
"kl_loss_4": 964.2825653076172,
"kl_loss_9": 273.65016479492186,
"learning_rate": 0.00024316130421329695,
"loss": 766.0736,
"step": 6750
},
{
"ce_loss_13": 3.2838388085365295,
"ce_loss_17": 3.246099424362183,
"ce_loss_2": 4.045943284034729,
"ce_loss_4": 3.6641584515571592,
"ce_loss_9": 3.377654695510864,
"epoch": 0.676,
"grad_norm": 700.0,
"kl_loss_13": 70.78494644165039,
"kl_loss_2": 1752.5845092773438,
"kl_loss_4": 948.2985046386718,
"kl_loss_9": 273.8147804260254,
"learning_rate": 0.00024180126918051909,
"loss": 767.4725,
"step": 6760
},
{
"ce_loss_13": 3.3248173236846923,
"ce_loss_17": 3.2853858709335326,
"ce_loss_2": 4.0837649464607235,
"ce_loss_4": 3.7026811838150024,
"ce_loss_9": 3.419490098953247,
"epoch": 0.677,
"grad_norm": 1000.0,
"kl_loss_13": 71.17267417907715,
"kl_loss_2": 1750.3757202148438,
"kl_loss_4": 943.9945373535156,
"kl_loss_9": 275.70510177612306,
"learning_rate": 0.00024044383420609406,
"loss": 761.4471,
"step": 6770
},
{
"ce_loss_13": 3.3338271021842956,
"ce_loss_17": 3.2949508309364317,
"ce_loss_2": 4.071341669559478,
"ce_loss_4": 3.6980969429016115,
"ce_loss_9": 3.4230048298835754,
"epoch": 0.678,
"grad_norm": 1144.0,
"kl_loss_13": 70.32768592834472,
"kl_loss_2": 1737.1424133300782,
"kl_loss_4": 944.9273620605469,
"kl_loss_9": 273.1325569152832,
"learning_rate": 0.00023908901295937712,
"loss": 776.1757,
"step": 6780
},
{
"ce_loss_13": 3.329776632785797,
"ce_loss_17": 3.2886913418769836,
"ce_loss_2": 4.091409718990326,
"ce_loss_4": 3.708410894870758,
"ce_loss_9": 3.4214085578918456,
"epoch": 0.679,
"grad_norm": 1088.0,
"kl_loss_13": 71.47972583770752,
"kl_loss_2": 1743.7018920898438,
"kl_loss_4": 947.4816619873047,
"kl_loss_9": 272.5430030822754,
"learning_rate": 0.00023773681908340283,
"loss": 779.8013,
"step": 6790
},
{
"ce_loss_13": 3.304816460609436,
"ce_loss_17": 3.2629502773284913,
"ce_loss_2": 4.09833025932312,
"ce_loss_4": 3.7017361760139464,
"ce_loss_9": 3.404166114330292,
"epoch": 0.68,
"grad_norm": 764.0,
"kl_loss_13": 75.02878093719482,
"kl_loss_2": 1834.7515686035156,
"kl_loss_4": 995.4694244384766,
"kl_loss_9": 290.82249908447267,
"learning_rate": 0.00023638726619474876,
"loss": 799.3655,
"step": 6800
},
{
"ce_loss_13": 3.286585295200348,
"ce_loss_17": 3.2471044182777407,
"ce_loss_2": 4.1025919079780575,
"ce_loss_4": 3.694387984275818,
"ce_loss_9": 3.3894974827766418,
"epoch": 0.681,
"grad_norm": 936.0,
"kl_loss_13": 71.55782508850098,
"kl_loss_2": 1831.8867919921875,
"kl_loss_4": 990.6810455322266,
"kl_loss_9": 282.49219741821287,
"learning_rate": 0.0002350403678833976,
"loss": 785.4149,
"step": 6810
},
{
"ce_loss_13": 3.217432200908661,
"ce_loss_17": 3.1785250067710877,
"ce_loss_2": 4.012353837490082,
"ce_loss_4": 3.611852240562439,
"ce_loss_9": 3.313712215423584,
"epoch": 0.682,
"grad_norm": 732.0,
"kl_loss_13": 70.16158847808838,
"kl_loss_2": 1814.9119995117187,
"kl_loss_4": 981.8430053710938,
"kl_loss_9": 277.7752067565918,
"learning_rate": 0.00023369613771260007,
"loss": 774.9444,
"step": 6820
},
{
"ce_loss_13": 3.336046314239502,
"ce_loss_17": 3.295581614971161,
"ce_loss_2": 4.117106795310974,
"ce_loss_4": 3.7286535143852233,
"ce_loss_9": 3.4304684519767763,
"epoch": 0.683,
"grad_norm": 1056.0,
"kl_loss_13": 73.21032981872558,
"kl_loss_2": 1802.8844848632812,
"kl_loss_4": 978.7585845947266,
"kl_loss_9": 281.62718200683594,
"learning_rate": 0.00023235458921873925,
"loss": 786.7231,
"step": 6830
},
{
"ce_loss_13": 3.289022660255432,
"ce_loss_17": 3.248283493518829,
"ce_loss_2": 4.117696058750153,
"ce_loss_4": 3.704160213470459,
"ce_loss_9": 3.3890608310699464,
"epoch": 0.684,
"grad_norm": 1032.0,
"kl_loss_13": 74.40035495758056,
"kl_loss_2": 1897.1766357421875,
"kl_loss_4": 1015.8494293212891,
"kl_loss_9": 289.44225082397463,
"learning_rate": 0.0002310157359111938,
"loss": 812.3593,
"step": 6840
},
{
"ce_loss_13": 3.17997887134552,
"ce_loss_17": 3.140820550918579,
"ce_loss_2": 4.045794034004212,
"ce_loss_4": 3.602759265899658,
"ce_loss_9": 3.283980429172516,
"epoch": 0.685,
"grad_norm": 1872.0,
"kl_loss_13": 71.50569801330566,
"kl_loss_2": 1944.2283752441406,
"kl_loss_4": 1027.8053771972657,
"kl_loss_9": 287.1462005615234,
"learning_rate": 0.0002296795912722014,
"loss": 809.6538,
"step": 6850
},
{
"ce_loss_13": 3.3196268558502195,
"ce_loss_17": 3.2815457463264464,
"ce_loss_2": 4.0760578393936155,
"ce_loss_4": 3.6949382424354553,
"ce_loss_9": 3.417670750617981,
"epoch": 0.686,
"grad_norm": 684.0,
"kl_loss_13": 70.69927768707275,
"kl_loss_2": 1754.9938415527345,
"kl_loss_4": 945.5229858398437,
"kl_loss_9": 274.71204833984376,
"learning_rate": 0.0002283461687567236,
"loss": 755.9741,
"step": 6860
},
{
"ce_loss_13": 3.378780448436737,
"ce_loss_17": 3.339313018321991,
"ce_loss_2": 4.116203796863556,
"ce_loss_4": 3.7454458117485045,
"ce_loss_9": 3.4698862433433533,
"epoch": 0.687,
"grad_norm": 764.0,
"kl_loss_13": 71.76581039428712,
"kl_loss_2": 1704.88330078125,
"kl_loss_4": 929.5515380859375,
"kl_loss_9": 270.881037902832,
"learning_rate": 0.00022701548179231045,
"loss": 771.088,
"step": 6870
},
{
"ce_loss_13": 3.330306875705719,
"ce_loss_17": 3.2887945055961607,
"ce_loss_2": 4.113952279090881,
"ce_loss_4": 3.714215672016144,
"ce_loss_9": 3.425825297832489,
"epoch": 0.688,
"grad_norm": 820.0,
"kl_loss_13": 72.66767101287842,
"kl_loss_2": 1807.6771850585938,
"kl_loss_4": 971.4168090820312,
"kl_loss_9": 281.0403091430664,
"learning_rate": 0.00022568754377896516,
"loss": 766.6914,
"step": 6880
},
{
"ce_loss_13": 3.321236217021942,
"ce_loss_17": 3.2834801197052004,
"ce_loss_2": 4.078975677490234,
"ce_loss_4": 3.7029571533203125,
"ce_loss_9": 3.4183809041976927,
"epoch": 0.689,
"grad_norm": 1104.0,
"kl_loss_13": 71.08349094390869,
"kl_loss_2": 1754.2367858886719,
"kl_loss_4": 954.1943084716797,
"kl_loss_9": 278.3357391357422,
"learning_rate": 0.00022436236808900844,
"loss": 765.88,
"step": 6890
},
{
"ce_loss_13": 3.2175270676612855,
"ce_loss_17": 3.17860347032547,
"ce_loss_2": 4.004483795166015,
"ce_loss_4": 3.607356011867523,
"ce_loss_9": 3.3122127175331117,
"epoch": 0.69,
"grad_norm": 816.0,
"kl_loss_13": 70.88688526153564,
"kl_loss_2": 1818.0381103515624,
"kl_loss_4": 978.0747833251953,
"kl_loss_9": 279.3812728881836,
"learning_rate": 0.00022303996806694487,
"loss": 776.254,
"step": 6900
},
{
"ce_loss_13": 3.2926143050193786,
"ce_loss_17": 3.2546900272369386,
"ce_loss_2": 4.065900957584381,
"ce_loss_4": 3.676529657840729,
"ce_loss_9": 3.386691427230835,
"epoch": 0.691,
"grad_norm": 792.0,
"kl_loss_13": 70.55873546600341,
"kl_loss_2": 1799.1060607910156,
"kl_loss_4": 966.7993988037109,
"kl_loss_9": 275.59252166748047,
"learning_rate": 0.00022172035702932823,
"loss": 775.97,
"step": 6910
},
{
"ce_loss_13": 3.337917852401733,
"ce_loss_17": 3.3001177310943604,
"ce_loss_2": 4.088469135761261,
"ce_loss_4": 3.7172390818595886,
"ce_loss_9": 3.426655948162079,
"epoch": 0.692,
"grad_norm": 1304.0,
"kl_loss_13": 71.22828941345215,
"kl_loss_2": 1717.326348876953,
"kl_loss_4": 943.609243774414,
"kl_loss_9": 274.3278610229492,
"learning_rate": 0.00022040354826462666,
"loss": 759.6712,
"step": 6920
},
{
"ce_loss_13": 3.2716445088386537,
"ce_loss_17": 3.2332261562347413,
"ce_loss_2": 4.051820051670075,
"ce_loss_4": 3.653087794780731,
"ce_loss_9": 3.3642398953437804,
"epoch": 0.693,
"grad_norm": 1020.0,
"kl_loss_13": 70.62501392364501,
"kl_loss_2": 1778.5100158691407,
"kl_loss_4": 947.1623840332031,
"kl_loss_9": 271.7375068664551,
"learning_rate": 0.0002190895550330899,
"loss": 776.3381,
"step": 6930
},
{
"ce_loss_13": 3.2025559544563293,
"ce_loss_17": 3.1621774196624757,
"ce_loss_2": 4.008305644989013,
"ce_loss_4": 3.609113883972168,
"ce_loss_9": 3.301098358631134,
"epoch": 0.694,
"grad_norm": 1208.0,
"kl_loss_13": 71.739794921875,
"kl_loss_2": 1830.299560546875,
"kl_loss_4": 996.0399810791016,
"kl_loss_9": 283.3093772888184,
"learning_rate": 0.00021777839056661552,
"loss": 775.2254,
"step": 6940
},
{
"ce_loss_13": 3.2846802711486816,
"ce_loss_17": 3.2477630376815796,
"ce_loss_2": 4.051619839668274,
"ce_loss_4": 3.6623515844345094,
"ce_loss_9": 3.3748562932014465,
"epoch": 0.695,
"grad_norm": 712.0,
"kl_loss_13": 70.80554389953613,
"kl_loss_2": 1764.8639221191406,
"kl_loss_4": 951.2530120849609,
"kl_loss_9": 274.18594512939455,
"learning_rate": 0.0002164700680686147,
"loss": 760.6781,
"step": 6950
},
{
"ce_loss_13": 3.334276628494263,
"ce_loss_17": 3.2962963581085205,
"ce_loss_2": 4.088710451126099,
"ce_loss_4": 3.7073917031288146,
"ce_loss_9": 3.4302581310272218,
"epoch": 0.696,
"grad_norm": 1152.0,
"kl_loss_13": 72.1072265625,
"kl_loss_2": 1724.0063781738281,
"kl_loss_4": 936.3488128662109,
"kl_loss_9": 275.84831008911135,
"learning_rate": 0.0002151646007138806,
"loss": 761.3031,
"step": 6960
},
{
"ce_loss_13": 3.209850013256073,
"ce_loss_17": 3.1725902438163756,
"ce_loss_2": 4.002887773513794,
"ce_loss_4": 3.606672966480255,
"ce_loss_9": 3.3055623888969423,
"epoch": 0.697,
"grad_norm": 772.0,
"kl_loss_13": 72.02248516082764,
"kl_loss_2": 1826.7546997070312,
"kl_loss_4": 989.4366668701172,
"kl_loss_9": 279.758309173584,
"learning_rate": 0.00021386200164845526,
"loss": 778.6626,
"step": 6970
},
{
"ce_loss_13": 3.3914801478385925,
"ce_loss_17": 3.3524523973464966,
"ce_loss_2": 4.114296960830688,
"ce_loss_4": 3.755442941188812,
"ce_loss_9": 3.4810468912124635,
"epoch": 0.698,
"grad_norm": 636.0,
"kl_loss_13": 71.40635414123535,
"kl_loss_2": 1709.2556945800782,
"kl_loss_4": 944.0305419921875,
"kl_loss_9": 274.5273628234863,
"learning_rate": 0.0002125622839894964,
"loss": 755.8117,
"step": 6980
},
{
"ce_loss_13": 3.329510045051575,
"ce_loss_17": 3.290641796588898,
"ce_loss_2": 4.083862352371216,
"ce_loss_4": 3.706824839115143,
"ce_loss_9": 3.4205021142959593,
"epoch": 0.699,
"grad_norm": 780.0,
"kl_loss_13": 71.35521602630615,
"kl_loss_2": 1725.0163208007812,
"kl_loss_4": 938.1391387939453,
"kl_loss_9": 272.0928482055664,
"learning_rate": 0.00021126546082514663,
"loss": 758.0833,
"step": 6990
},
{
"ce_loss_13": 3.3541366338729857,
"ce_loss_17": 3.316024458408356,
"ce_loss_2": 4.095506346225738,
"ce_loss_4": 3.722522163391113,
"ce_loss_9": 3.4431343197822573,
"epoch": 0.7,
"grad_norm": 760.0,
"kl_loss_13": 71.6666015625,
"kl_loss_2": 1725.105419921875,
"kl_loss_4": 942.9446807861328,
"kl_loss_9": 274.0315826416016,
"learning_rate": 0.00020997154521440098,
"loss": 755.1295,
"step": 7000
},
{
"ce_loss_13": 3.2969266891479494,
"ce_loss_17": 3.2600142121315003,
"ce_loss_2": 4.0547042965888975,
"ce_loss_4": 3.6686680912971497,
"ce_loss_9": 3.386641597747803,
"epoch": 0.701,
"grad_norm": 1080.0,
"kl_loss_13": 70.24392051696778,
"kl_loss_2": 1758.719580078125,
"kl_loss_4": 950.8714721679687,
"kl_loss_9": 272.28409042358396,
"learning_rate": 0.0002086805501869749,
"loss": 755.9173,
"step": 7010
},
{
"ce_loss_13": 3.264370834827423,
"ce_loss_17": 3.226525294780731,
"ce_loss_2": 4.065334379673004,
"ce_loss_4": 3.667639875411987,
"ce_loss_9": 3.3651340007781982,
"epoch": 0.702,
"grad_norm": 780.0,
"kl_loss_13": 71.73685340881347,
"kl_loss_2": 1839.5365051269532,
"kl_loss_4": 993.7012023925781,
"kl_loss_9": 285.7314987182617,
"learning_rate": 0.0002073924887431744,
"loss": 780.9709,
"step": 7020
},
{
"ce_loss_13": 3.2721614122390745,
"ce_loss_17": 3.2347609400749207,
"ce_loss_2": 4.044773375988006,
"ce_loss_4": 3.6571881771087646,
"ce_loss_9": 3.369364929199219,
"epoch": 0.703,
"grad_norm": 728.0,
"kl_loss_13": 71.0749008178711,
"kl_loss_2": 1778.470965576172,
"kl_loss_4": 964.4342071533204,
"kl_loss_9": 277.28941345214844,
"learning_rate": 0.00020610737385376348,
"loss": 786.926,
"step": 7030
},
{
"ce_loss_13": 3.332412588596344,
"ce_loss_17": 3.294099271297455,
"ce_loss_2": 4.061883521080017,
"ce_loss_4": 3.698824071884155,
"ce_loss_9": 3.4222095012664795,
"epoch": 0.704,
"grad_norm": 876.0,
"kl_loss_13": 70.61465110778809,
"kl_loss_2": 1688.5162780761718,
"kl_loss_4": 928.5938690185546,
"kl_loss_9": 269.72947082519534,
"learning_rate": 0.00020482521845983521,
"loss": 767.581,
"step": 7040
},
{
"ce_loss_13": 3.3323171854019167,
"ce_loss_17": 3.2925487518310548,
"ce_loss_2": 4.104314303398132,
"ce_loss_4": 3.7108531475067137,
"ce_loss_9": 3.427067816257477,
"epoch": 0.705,
"grad_norm": 1160.0,
"kl_loss_13": 72.79951210021973,
"kl_loss_2": 1783.0196350097656,
"kl_loss_4": 961.49697265625,
"kl_loss_9": 278.71324615478517,
"learning_rate": 0.00020354603547267987,
"loss": 781.7756,
"step": 7050
},
{
"ce_loss_13": 3.3129538536071776,
"ce_loss_17": 3.2723392963409426,
"ce_loss_2": 4.110062229633331,
"ce_loss_4": 3.7134634494781493,
"ce_loss_9": 3.410679376125336,
"epoch": 0.706,
"grad_norm": 736.0,
"kl_loss_13": 72.24404449462891,
"kl_loss_2": 1795.7587646484376,
"kl_loss_4": 975.1206726074219,
"kl_loss_9": 279.9232604980469,
"learning_rate": 0.00020226983777365604,
"loss": 797.2193,
"step": 7060
},
{
"ce_loss_13": 3.224683380126953,
"ce_loss_17": 3.186890208721161,
"ce_loss_2": 4.026706266403198,
"ce_loss_4": 3.619293999671936,
"ce_loss_9": 3.318616247177124,
"epoch": 0.707,
"grad_norm": 760.0,
"kl_loss_13": 68.36813163757324,
"kl_loss_2": 1822.1632385253906,
"kl_loss_4": 961.1641967773437,
"kl_loss_9": 269.3346450805664,
"learning_rate": 0.00020099663821406056,
"loss": 771.0888,
"step": 7070
},
{
"ce_loss_13": 3.3214773178100585,
"ce_loss_17": 3.283447802066803,
"ce_loss_2": 4.071311795711518,
"ce_loss_4": 3.695223093032837,
"ce_loss_9": 3.413736712932587,
"epoch": 0.708,
"grad_norm": 1704.0,
"kl_loss_13": 70.34514198303222,
"kl_loss_2": 1715.6263061523437,
"kl_loss_4": 928.1057037353515,
"kl_loss_9": 269.1682067871094,
"learning_rate": 0.00019972644961499853,
"loss": 767.0122,
"step": 7080
},
{
"ce_loss_13": 3.2867833733558656,
"ce_loss_17": 3.247623884677887,
"ce_loss_2": 4.082489454746247,
"ce_loss_4": 3.6845450401306152,
"ce_loss_9": 3.3835286021232607,
"epoch": 0.709,
"grad_norm": 720.0,
"kl_loss_13": 72.10076866149902,
"kl_loss_2": 1813.9811889648438,
"kl_loss_4": 983.6025085449219,
"kl_loss_9": 281.21164016723634,
"learning_rate": 0.00019845928476725522,
"loss": 779.535,
"step": 7090
},
{
"ce_loss_13": 3.3654056072235106,
"ce_loss_17": 3.3253042817115785,
"ce_loss_2": 4.129805624485016,
"ce_loss_4": 3.7489982724189757,
"ce_loss_9": 3.4594806432724,
"epoch": 0.71,
"grad_norm": 872.0,
"kl_loss_13": 72.55214748382568,
"kl_loss_2": 1756.3334106445313,
"kl_loss_4": 960.7546752929687,
"kl_loss_9": 277.62411270141604,
"learning_rate": 0.00019719515643116677,
"loss": 792.3727,
"step": 7100
},
{
"ce_loss_13": 3.309456527233124,
"ce_loss_17": 3.27019317150116,
"ce_loss_2": 4.06982558965683,
"ce_loss_4": 3.6808470845222474,
"ce_loss_9": 3.402991759777069,
"epoch": 0.711,
"grad_norm": 952.0,
"kl_loss_13": 71.02709293365479,
"kl_loss_2": 1760.2141357421874,
"kl_loss_4": 942.3459259033203,
"kl_loss_9": 273.5096015930176,
"learning_rate": 0.0001959340773364911,
"loss": 773.76,
"step": 7110
},
{
"ce_loss_13": 3.3214038491249083,
"ce_loss_17": 3.2831618547439576,
"ce_loss_2": 4.088900756835938,
"ce_loss_4": 3.7054635286331177,
"ce_loss_9": 3.4160606980323793,
"epoch": 0.712,
"grad_norm": 744.0,
"kl_loss_13": 71.1388599395752,
"kl_loss_2": 1771.7801818847656,
"kl_loss_4": 960.0322570800781,
"kl_loss_9": 276.5096008300781,
"learning_rate": 0.0001946760601822809,
"loss": 758.2503,
"step": 7120
},
{
"ce_loss_13": 3.37441748380661,
"ce_loss_17": 3.336512637138367,
"ce_loss_2": 4.1263908505439755,
"ce_loss_4": 3.7414964199066163,
"ce_loss_9": 3.4689518332481386,
"epoch": 0.713,
"grad_norm": 812.0,
"kl_loss_13": 70.92561588287353,
"kl_loss_2": 1740.7447570800782,
"kl_loss_4": 931.0078186035156,
"kl_loss_9": 273.3617431640625,
"learning_rate": 0.00019342111763675512,
"loss": 746.9193,
"step": 7130
},
{
"ce_loss_13": 3.373600149154663,
"ce_loss_17": 3.334926092624664,
"ce_loss_2": 4.108856225013733,
"ce_loss_4": 3.73887095451355,
"ce_loss_9": 3.46598562002182,
"epoch": 0.714,
"grad_norm": 1240.0,
"kl_loss_13": 72.69193038940429,
"kl_loss_2": 1716.9721557617188,
"kl_loss_4": 944.558950805664,
"kl_loss_9": 276.685969543457,
"learning_rate": 0.00019216926233717085,
"loss": 753.1709,
"step": 7140
},
{
"ce_loss_13": 3.263760483264923,
"ce_loss_17": 3.2253942251205445,
"ce_loss_2": 4.082875895500183,
"ce_loss_4": 3.6528494596481322,
"ce_loss_9": 3.356517326831818,
"epoch": 0.715,
"grad_norm": 840.0,
"kl_loss_13": 69.8310625076294,
"kl_loss_2": 1855.0406066894532,
"kl_loss_4": 966.3692352294922,
"kl_loss_9": 269.43797836303713,
"learning_rate": 0.00019092050688969737,
"loss": 785.2349,
"step": 7150
},
{
"ce_loss_13": 3.332518148422241,
"ce_loss_17": 3.2948469281196595,
"ce_loss_2": 4.081777715682984,
"ce_loss_4": 3.7075140953063963,
"ce_loss_9": 3.4232553839683533,
"epoch": 0.716,
"grad_norm": 696.0,
"kl_loss_13": 70.54215602874756,
"kl_loss_2": 1763.9240600585938,
"kl_loss_4": 959.4861602783203,
"kl_loss_9": 273.6377548217773,
"learning_rate": 0.00018967486386928817,
"loss": 763.6613,
"step": 7160
},
{
"ce_loss_13": 3.205380403995514,
"ce_loss_17": 3.165660285949707,
"ce_loss_2": 3.997770869731903,
"ce_loss_4": 3.5989809036254883,
"ce_loss_9": 3.3011061429977415,
"epoch": 0.717,
"grad_norm": 752.0,
"kl_loss_13": 70.15151481628418,
"kl_loss_2": 1823.458935546875,
"kl_loss_4": 981.4985076904297,
"kl_loss_9": 280.47804107666013,
"learning_rate": 0.00018843234581955443,
"loss": 801.8435,
"step": 7170
},
{
"ce_loss_13": 3.2218088269233705,
"ce_loss_17": 3.182110404968262,
"ce_loss_2": 4.012683880329132,
"ce_loss_4": 3.6198145747184753,
"ce_loss_9": 3.318508338928223,
"epoch": 0.718,
"grad_norm": 884.0,
"kl_loss_13": 71.75006408691407,
"kl_loss_2": 1810.262518310547,
"kl_loss_4": 978.4985412597656,
"kl_loss_9": 281.0864456176758,
"learning_rate": 0.00018719296525263924,
"loss": 781.7542,
"step": 7180
},
{
"ce_loss_13": 3.316895771026611,
"ce_loss_17": 3.2792091727256776,
"ce_loss_2": 4.051657652854919,
"ce_loss_4": 3.6806211829185487,
"ce_loss_9": 3.407132875919342,
"epoch": 0.719,
"grad_norm": 780.0,
"kl_loss_13": 71.19637126922608,
"kl_loss_2": 1715.8442016601562,
"kl_loss_4": 928.7437896728516,
"kl_loss_9": 271.83887710571287,
"learning_rate": 0.0001859567346490913,
"loss": 752.6343,
"step": 7190
},
{
"ce_loss_13": 3.2919270396232605,
"ce_loss_17": 3.2515564560890198,
"ce_loss_2": 4.07712619304657,
"ce_loss_4": 3.681968426704407,
"ce_loss_9": 3.386718785762787,
"epoch": 0.72,
"grad_norm": 1020.0,
"kl_loss_13": 72.11697540283203,
"kl_loss_2": 1800.6586608886719,
"kl_loss_4": 971.411962890625,
"kl_loss_9": 280.454451751709,
"learning_rate": 0.0001847236664577389,
"loss": 766.3369,
"step": 7200
},
{
"ce_loss_13": 3.3172996997833253,
"ce_loss_17": 3.278312313556671,
"ce_loss_2": 4.056863987445832,
"ce_loss_4": 3.686291182041168,
"ce_loss_9": 3.4074368476867676,
"epoch": 0.721,
"grad_norm": 740.0,
"kl_loss_13": 70.0596134185791,
"kl_loss_2": 1703.831787109375,
"kl_loss_4": 926.7533538818359,
"kl_loss_9": 269.8570755004883,
"learning_rate": 0.00018349377309556487,
"loss": 744.4033,
"step": 7210
},
{
"ce_loss_13": 3.2637869834899904,
"ce_loss_17": 3.2251646637916567,
"ce_loss_2": 4.072896242141724,
"ce_loss_4": 3.6544003009796144,
"ce_loss_9": 3.36046644449234,
"epoch": 0.722,
"grad_norm": 1168.0,
"kl_loss_13": 71.50100860595703,
"kl_loss_2": 1873.2872375488282,
"kl_loss_4": 992.8349487304688,
"kl_loss_9": 280.7294204711914,
"learning_rate": 0.00018226706694758193,
"loss": 789.8526,
"step": 7220
},
{
"ce_loss_13": 3.334029030799866,
"ce_loss_17": 3.297024190425873,
"ce_loss_2": 4.095572996139526,
"ce_loss_4": 3.712455523014069,
"ce_loss_9": 3.4249449014663695,
"epoch": 0.723,
"grad_norm": 856.0,
"kl_loss_13": 71.00616970062256,
"kl_loss_2": 1758.424822998047,
"kl_loss_4": 952.4993621826172,
"kl_loss_9": 276.48938674926757,
"learning_rate": 0.0001810435603667075,
"loss": 786.2653,
"step": 7230
},
{
"ce_loss_13": 3.191468024253845,
"ce_loss_17": 3.153413712978363,
"ce_loss_2": 3.9690565705299377,
"ce_loss_4": 3.5743834137916566,
"ce_loss_9": 3.2823387384414673,
"epoch": 0.724,
"grad_norm": 1008.0,
"kl_loss_13": 68.86694049835205,
"kl_loss_2": 1788.1756591796875,
"kl_loss_4": 959.1340911865234,
"kl_loss_9": 270.8941192626953,
"learning_rate": 0.0001798232656736389,
"loss": 784.757,
"step": 7240
},
{
"ce_loss_13": 3.3603003859519958,
"ce_loss_17": 3.322044885158539,
"ce_loss_2": 4.09637211561203,
"ce_loss_4": 3.72024085521698,
"ce_loss_9": 3.451583540439606,
"epoch": 0.725,
"grad_norm": 836.0,
"kl_loss_13": 70.9534122467041,
"kl_loss_2": 1698.4436950683594,
"kl_loss_4": 915.228955078125,
"kl_loss_9": 270.6935722351074,
"learning_rate": 0.0001786061951567303,
"loss": 755.1846,
"step": 7250
},
{
"ce_loss_13": 3.274634397029877,
"ce_loss_17": 3.2367483615875243,
"ce_loss_2": 4.048952245712281,
"ce_loss_4": 3.656607174873352,
"ce_loss_9": 3.369658660888672,
"epoch": 0.726,
"grad_norm": 804.0,
"kl_loss_13": 71.53393859863282,
"kl_loss_2": 1769.5117797851562,
"kl_loss_4": 957.7466979980469,
"kl_loss_9": 276.7366523742676,
"learning_rate": 0.00017739236107186857,
"loss": 776.2261,
"step": 7260
},
{
"ce_loss_13": 3.3679779410362243,
"ce_loss_17": 3.329963207244873,
"ce_loss_2": 4.0942219495773315,
"ce_loss_4": 3.7239694952964784,
"ce_loss_9": 3.4533618807792665,
"epoch": 0.727,
"grad_norm": 1464.0,
"kl_loss_13": 70.42549552917481,
"kl_loss_2": 1689.8609008789062,
"kl_loss_4": 919.8603881835937,
"kl_loss_9": 267.1950225830078,
"learning_rate": 0.00017618177564234904,
"loss": 749.2871,
"step": 7270
},
{
"ce_loss_13": 3.3417418003082275,
"ce_loss_17": 3.3037155866622925,
"ce_loss_2": 4.073198449611664,
"ce_loss_4": 3.7028775930404665,
"ce_loss_9": 3.4276382446289064,
"epoch": 0.728,
"grad_norm": 760.0,
"kl_loss_13": 69.97520542144775,
"kl_loss_2": 1678.5777709960937,
"kl_loss_4": 915.3362426757812,
"kl_loss_9": 263.50071105957034,
"learning_rate": 0.00017497445105875377,
"loss": 749.8755,
"step": 7280
},
{
"ce_loss_13": 3.247866189479828,
"ce_loss_17": 3.2090843081474305,
"ce_loss_2": 4.045860695838928,
"ce_loss_4": 3.6415121078491213,
"ce_loss_9": 3.3421841621398927,
"epoch": 0.729,
"grad_norm": 992.0,
"kl_loss_13": 70.66439056396484,
"kl_loss_2": 1826.7165100097657,
"kl_loss_4": 977.5601196289062,
"kl_loss_9": 280.64800415039065,
"learning_rate": 0.000173770399478828,
"loss": 779.078,
"step": 7290
},
{
"ce_loss_13": 3.173096179962158,
"ce_loss_17": 3.1370855569839478,
"ce_loss_2": 3.9375186562538147,
"ce_loss_4": 3.5511539220809936,
"ce_loss_9": 3.263213336467743,
"epoch": 0.73,
"grad_norm": 1032.0,
"kl_loss_13": 68.80027599334717,
"kl_loss_2": 1763.6752075195313,
"kl_loss_4": 956.6611968994141,
"kl_loss_9": 271.0298553466797,
"learning_rate": 0.0001725696330273575,
"loss": 787.5494,
"step": 7300
},
{
"ce_loss_13": 3.3573803544044494,
"ce_loss_17": 3.3180949211120607,
"ce_loss_2": 4.097061896324158,
"ce_loss_4": 3.7262882709503176,
"ce_loss_9": 3.448265993595123,
"epoch": 0.731,
"grad_norm": 836.0,
"kl_loss_13": 69.84502964019775,
"kl_loss_2": 1707.2565124511718,
"kl_loss_4": 931.3904113769531,
"kl_loss_9": 270.31018524169923,
"learning_rate": 0.00017137216379604724,
"loss": 749.7715,
"step": 7310
},
{
"ce_loss_13": 3.241015446186066,
"ce_loss_17": 3.2024821400642396,
"ce_loss_2": 4.016211712360382,
"ce_loss_4": 3.619802701473236,
"ce_loss_9": 3.3317625522613525,
"epoch": 0.732,
"grad_norm": 984.0,
"kl_loss_13": 70.11958351135254,
"kl_loss_2": 1761.5307678222657,
"kl_loss_4": 948.8353576660156,
"kl_loss_9": 269.7608154296875,
"learning_rate": 0.00017017800384339925,
"loss": 766.8838,
"step": 7320
},
{
"ce_loss_13": 3.193186175823212,
"ce_loss_17": 3.154158186912537,
"ce_loss_2": 4.000348472595215,
"ce_loss_4": 3.593831789493561,
"ce_loss_9": 3.289596879482269,
"epoch": 0.733,
"grad_norm": 868.0,
"kl_loss_13": 69.91508522033692,
"kl_loss_2": 1842.14296875,
"kl_loss_4": 985.5766326904297,
"kl_loss_9": 278.39381408691406,
"learning_rate": 0.00016898716519459073,
"loss": 763.6745,
"step": 7330
},
{
"ce_loss_13": 3.313058865070343,
"ce_loss_17": 3.2736072659492494,
"ce_loss_2": 4.118879449367523,
"ce_loss_4": 3.7130431056022646,
"ce_loss_9": 3.4101476788520815,
"epoch": 0.734,
"grad_norm": 844.0,
"kl_loss_13": 72.03692722320557,
"kl_loss_2": 1814.1436828613282,
"kl_loss_4": 976.4756866455078,
"kl_loss_9": 282.7153793334961,
"learning_rate": 0.00016779965984135375,
"loss": 774.9484,
"step": 7340
},
{
"ce_loss_13": 3.225519323348999,
"ce_loss_17": 3.1890533685684206,
"ce_loss_2": 4.00371423959732,
"ce_loss_4": 3.6043476462364197,
"ce_loss_9": 3.3188387513160706,
"epoch": 0.735,
"grad_norm": 776.0,
"kl_loss_13": 68.39549350738525,
"kl_loss_2": 1771.8436584472656,
"kl_loss_4": 936.6364837646485,
"kl_loss_9": 268.1987022399902,
"learning_rate": 0.00016661549974185424,
"loss": 763.8604,
"step": 7350
},
{
"ce_loss_13": 3.2641552090644836,
"ce_loss_17": 3.226039266586304,
"ce_loss_2": 4.0332492113113405,
"ce_loss_4": 3.6448875188827516,
"ce_loss_9": 3.356839954853058,
"epoch": 0.736,
"grad_norm": 824.0,
"kl_loss_13": 71.28998851776123,
"kl_loss_2": 1765.07041015625,
"kl_loss_4": 949.3002319335938,
"kl_loss_9": 274.99008255004884,
"learning_rate": 0.00016543469682057105,
"loss": 755.6967,
"step": 7360
},
{
"ce_loss_13": 3.2904924869537355,
"ce_loss_17": 3.2508055090904238,
"ce_loss_2": 4.065948736667633,
"ce_loss_4": 3.678385245800018,
"ce_loss_9": 3.3866549491882325,
"epoch": 0.737,
"grad_norm": 816.0,
"kl_loss_13": 71.95163116455078,
"kl_loss_2": 1769.7718627929687,
"kl_loss_4": 962.1701629638671,
"kl_loss_9": 277.88295516967776,
"learning_rate": 0.00016425726296817632,
"loss": 766.5517,
"step": 7370
},
{
"ce_loss_13": 3.3040218114852906,
"ce_loss_17": 3.2673555254936217,
"ce_loss_2": 4.0545696258544925,
"ce_loss_4": 3.682534599304199,
"ce_loss_9": 3.3967278480529783,
"epoch": 0.738,
"grad_norm": 740.0,
"kl_loss_13": 69.9533712387085,
"kl_loss_2": 1727.5446228027345,
"kl_loss_4": 941.7377288818359,
"kl_loss_9": 268.64248809814455,
"learning_rate": 0.00016308321004141607,
"loss": 756.726,
"step": 7380
},
{
"ce_loss_13": 3.2565553188323975,
"ce_loss_17": 3.2167924642562866,
"ce_loss_2": 4.043980371952057,
"ce_loss_4": 3.6463310599327086,
"ce_loss_9": 3.356897795200348,
"epoch": 0.739,
"grad_norm": 1056.0,
"kl_loss_13": 72.12949066162109,
"kl_loss_2": 1785.9837768554687,
"kl_loss_4": 958.8319763183594,
"kl_loss_9": 281.2048934936523,
"learning_rate": 0.00016191254986299043,
"loss": 763.5404,
"step": 7390
},
{
"ce_loss_13": 3.305033326148987,
"ce_loss_17": 3.267452096939087,
"ce_loss_2": 4.053663873672486,
"ce_loss_4": 3.6732759952545164,
"ce_loss_9": 3.3918822526931764,
"epoch": 0.74,
"grad_norm": 884.0,
"kl_loss_13": 70.24519844055176,
"kl_loss_2": 1748.568621826172,
"kl_loss_4": 941.6577880859375,
"kl_loss_9": 269.2971351623535,
"learning_rate": 0.00016074529422143398,
"loss": 772.0949,
"step": 7400
},
{
"ce_loss_13": 3.2546313643455504,
"ce_loss_17": 3.216951239109039,
"ce_loss_2": 4.049028539657593,
"ce_loss_4": 3.6475675821304323,
"ce_loss_9": 3.34945809841156,
"epoch": 0.741,
"grad_norm": 1176.0,
"kl_loss_13": 70.28467655181885,
"kl_loss_2": 1803.59521484375,
"kl_loss_4": 959.3927185058594,
"kl_loss_9": 274.4232048034668,
"learning_rate": 0.0001595814548709983,
"loss": 781.1009,
"step": 7410
},
{
"ce_loss_13": 3.3178256511688233,
"ce_loss_17": 3.2781765699386596,
"ce_loss_2": 4.100561285018921,
"ce_loss_4": 3.7127739071846007,
"ce_loss_9": 3.417123830318451,
"epoch": 0.742,
"grad_norm": 760.0,
"kl_loss_13": 72.4094009399414,
"kl_loss_2": 1799.5929931640626,
"kl_loss_4": 974.4059661865234,
"kl_loss_9": 284.3437103271484,
"learning_rate": 0.00015842104353153285,
"loss": 776.3952,
"step": 7420
},
{
"ce_loss_13": 3.3347533464431764,
"ce_loss_17": 3.297309637069702,
"ce_loss_2": 4.107506430149078,
"ce_loss_4": 3.7230252385139466,
"ce_loss_9": 3.428642463684082,
"epoch": 0.743,
"grad_norm": 784.0,
"kl_loss_13": 71.99139099121093,
"kl_loss_2": 1769.0837463378907,
"kl_loss_4": 962.6025939941406,
"kl_loss_9": 277.31082305908205,
"learning_rate": 0.0001572640718883667,
"loss": 785.4607,
"step": 7430
},
{
"ce_loss_13": 3.2768795490264893,
"ce_loss_17": 3.2394585609436035,
"ce_loss_2": 4.0312792420387265,
"ce_loss_4": 3.6453895688056948,
"ce_loss_9": 3.366493320465088,
"epoch": 0.744,
"grad_norm": 844.0,
"kl_loss_13": 69.40192909240723,
"kl_loss_2": 1744.5479919433594,
"kl_loss_4": 938.4931243896484,
"kl_loss_9": 269.1444465637207,
"learning_rate": 0.0001561105515921915,
"loss": 776.6132,
"step": 7440
},
{
"ce_loss_13": 3.1278898119926453,
"ce_loss_17": 3.090610134601593,
"ce_loss_2": 3.942419695854187,
"ce_loss_4": 3.5280018210411073,
"ce_loss_9": 3.223814272880554,
"epoch": 0.745,
"grad_norm": 1104.0,
"kl_loss_13": 68.1939962387085,
"kl_loss_2": 1857.2819396972657,
"kl_loss_4": 978.3077850341797,
"kl_loss_9": 272.3531295776367,
"learning_rate": 0.0001549604942589441,
"loss": 772.6961,
"step": 7450
},
{
"ce_loss_13": 3.313250517845154,
"ce_loss_17": 3.276069223880768,
"ce_loss_2": 4.036946654319763,
"ce_loss_4": 3.6701719045639036,
"ce_loss_9": 3.400228428840637,
"epoch": 0.746,
"grad_norm": 736.0,
"kl_loss_13": 69.25663681030274,
"kl_loss_2": 1669.5231079101563,
"kl_loss_4": 901.7798034667969,
"kl_loss_9": 264.0007888793945,
"learning_rate": 0.00015381391146968864,
"loss": 746.5012,
"step": 7460
},
{
"ce_loss_13": 3.287136948108673,
"ce_loss_17": 3.2516398191452027,
"ce_loss_2": 4.058957719802857,
"ce_loss_4": 3.6662606596946716,
"ce_loss_9": 3.379166769981384,
"epoch": 0.747,
"grad_norm": 1312.0,
"kl_loss_13": 68.81890754699707,
"kl_loss_2": 1759.797265625,
"kl_loss_4": 944.7041290283203,
"kl_loss_9": 268.30206604003905,
"learning_rate": 0.00015267081477050133,
"loss": 769.2019,
"step": 7470
},
{
"ce_loss_13": 3.3837154507637024,
"ce_loss_17": 3.3446292996406557,
"ce_loss_2": 4.122562909126282,
"ce_loss_4": 3.7535754799842835,
"ce_loss_9": 3.4761662602424623,
"epoch": 0.748,
"grad_norm": 712.0,
"kl_loss_13": 72.16933269500733,
"kl_loss_2": 1710.0824951171876,
"kl_loss_4": 931.889291381836,
"kl_loss_9": 273.9661148071289,
"learning_rate": 0.00015153121567235335,
"loss": 746.5872,
"step": 7480
},
{
"ce_loss_13": 3.283895766735077,
"ce_loss_17": 3.244150185585022,
"ce_loss_2": 4.0544509291648865,
"ce_loss_4": 3.661958086490631,
"ce_loss_9": 3.374240827560425,
"epoch": 0.749,
"grad_norm": 704.0,
"kl_loss_13": 69.87776603698731,
"kl_loss_2": 1792.9642272949218,
"kl_loss_4": 965.1159057617188,
"kl_loss_9": 272.7176933288574,
"learning_rate": 0.00015039512565099468,
"loss": 751.652,
"step": 7490
},
{
"ce_loss_13": 3.346974790096283,
"ce_loss_17": 3.307431769371033,
"ce_loss_2": 4.096277952194214,
"ce_loss_4": 3.716240656375885,
"ce_loss_9": 3.435083281993866,
"epoch": 0.75,
"grad_norm": 824.0,
"kl_loss_13": 70.67133522033691,
"kl_loss_2": 1747.3429260253906,
"kl_loss_4": 947.5887268066406,
"kl_loss_9": 273.0171401977539,
"learning_rate": 0.00014926255614683932,
"loss": 788.2286,
"step": 7500
},
{
"ce_loss_13": 3.2786112070083617,
"ce_loss_17": 3.2404692649841307,
"ce_loss_2": 4.0347212433815,
"ce_loss_4": 3.6455458045005797,
"ce_loss_9": 3.3715378761291506,
"epoch": 0.751,
"grad_norm": 732.0,
"kl_loss_13": 70.24715995788574,
"kl_loss_2": 1751.265087890625,
"kl_loss_4": 936.3882263183593,
"kl_loss_9": 272.389094543457,
"learning_rate": 0.0001481335185648498,
"loss": 765.7348,
"step": 7510
},
{
"ce_loss_13": 3.2962894678115844,
"ce_loss_17": 3.2575737595558167,
"ce_loss_2": 4.060336661338806,
"ce_loss_4": 3.676244294643402,
"ce_loss_9": 3.391019034385681,
"epoch": 0.752,
"grad_norm": 876.0,
"kl_loss_13": 70.00695781707763,
"kl_loss_2": 1750.6751708984375,
"kl_loss_4": 946.6380157470703,
"kl_loss_9": 275.6133514404297,
"learning_rate": 0.0001470080242744218,
"loss": 756.1388,
"step": 7520
},
{
"ce_loss_13": 3.291015386581421,
"ce_loss_17": 3.253824019432068,
"ce_loss_2": 4.068705368041992,
"ce_loss_4": 3.669373023509979,
"ce_loss_9": 3.381570076942444,
"epoch": 0.753,
"grad_norm": 872.0,
"kl_loss_13": 69.27585773468017,
"kl_loss_2": 1781.7065185546876,
"kl_loss_4": 955.3471130371094,
"kl_loss_9": 270.36650772094725,
"learning_rate": 0.0001458860846092705,
"loss": 771.5325,
"step": 7530
},
{
"ce_loss_13": 3.3303808689117433,
"ce_loss_17": 3.2930827856063845,
"ce_loss_2": 4.065006506443024,
"ce_loss_4": 3.6996832132339477,
"ce_loss_9": 3.420238471031189,
"epoch": 0.754,
"grad_norm": 840.0,
"kl_loss_13": 69.98066291809081,
"kl_loss_2": 1698.9991882324218,
"kl_loss_4": 931.16552734375,
"kl_loss_9": 268.8388671875,
"learning_rate": 0.00014476771086731566,
"loss": 742.7994,
"step": 7540
},
{
"ce_loss_13": 3.423077344894409,
"ce_loss_17": 3.3821932554244993,
"ce_loss_2": 4.17581205368042,
"ce_loss_4": 3.7954395055770873,
"ce_loss_9": 3.5166402459144592,
"epoch": 0.755,
"grad_norm": 788.0,
"kl_loss_13": 73.9686882019043,
"kl_loss_2": 1728.0974670410155,
"kl_loss_4": 930.5460083007813,
"kl_loss_9": 276.084765625,
"learning_rate": 0.00014365291431056872,
"loss": 776.2555,
"step": 7550
},
{
"ce_loss_13": 3.2581419229507445,
"ce_loss_17": 3.2202382564544676,
"ce_loss_2": 4.035905528068542,
"ce_loss_4": 3.6437175273895264,
"ce_loss_9": 3.3557262897491453,
"epoch": 0.756,
"grad_norm": 948.0,
"kl_loss_13": 72.2882619857788,
"kl_loss_2": 1792.8220336914062,
"kl_loss_4": 968.6297149658203,
"kl_loss_9": 281.67093353271486,
"learning_rate": 0.00014254170616501827,
"loss": 770.4111,
"step": 7560
},
{
"ce_loss_13": 3.195040285587311,
"ce_loss_17": 3.1560064554214478,
"ce_loss_2": 4.0122485756874084,
"ce_loss_4": 3.609569180011749,
"ce_loss_9": 3.297301399707794,
"epoch": 0.757,
"grad_norm": 1064.0,
"kl_loss_13": 71.20706291198731,
"kl_loss_2": 1849.8933044433593,
"kl_loss_4": 1010.1989196777344,
"kl_loss_9": 285.5230583190918,
"learning_rate": 0.0001414340976205183,
"loss": 802.3704,
"step": 7570
},
{
"ce_loss_13": 3.2119127988815306,
"ce_loss_17": 3.174885427951813,
"ce_loss_2": 4.007180690765381,
"ce_loss_4": 3.6032917380332945,
"ce_loss_9": 3.3073129057884216,
"epoch": 0.758,
"grad_norm": 800.0,
"kl_loss_13": 70.78567810058594,
"kl_loss_2": 1807.2638732910157,
"kl_loss_4": 963.6115509033203,
"kl_loss_9": 274.7288459777832,
"learning_rate": 0.00014033009983067452,
"loss": 770.9219,
"step": 7580
},
{
"ce_loss_13": 3.371620202064514,
"ce_loss_17": 3.33318635225296,
"ce_loss_2": 4.107252395153045,
"ce_loss_4": 3.7321237087249757,
"ce_loss_9": 3.459933066368103,
"epoch": 0.759,
"grad_norm": 908.0,
"kl_loss_13": 69.97175979614258,
"kl_loss_2": 1699.2034057617188,
"kl_loss_4": 914.4719757080078,
"kl_loss_9": 266.3812965393066,
"learning_rate": 0.00013922972391273224,
"loss": 751.4869,
"step": 7590
},
{
"ce_loss_13": 3.376069927215576,
"ce_loss_17": 3.337877118587494,
"ce_loss_2": 4.137043678760529,
"ce_loss_4": 3.7415095448493956,
"ce_loss_9": 3.465313446521759,
"epoch": 0.76,
"grad_norm": 1072.0,
"kl_loss_13": 70.89640998840332,
"kl_loss_2": 1749.950537109375,
"kl_loss_4": 923.0523040771484,
"kl_loss_9": 269.26611404418946,
"learning_rate": 0.0001381329809474649,
"loss": 764.8768,
"step": 7600
},
{
"ce_loss_13": 3.272611165046692,
"ce_loss_17": 3.232573592662811,
"ce_loss_2": 4.078569507598877,
"ce_loss_4": 3.673508608341217,
"ce_loss_9": 3.371352481842041,
"epoch": 0.761,
"grad_norm": 900.0,
"kl_loss_13": 72.22393836975098,
"kl_loss_2": 1831.3680297851563,
"kl_loss_4": 984.5613494873047,
"kl_loss_9": 281.33798599243164,
"learning_rate": 0.0001370398819790621,
"loss": 784.8211,
"step": 7610
},
{
"ce_loss_13": 3.412788820266724,
"ce_loss_17": 3.372195541858673,
"ce_loss_2": 4.15156922340393,
"ce_loss_4": 3.778453195095062,
"ce_loss_9": 3.5044657707214357,
"epoch": 0.762,
"grad_norm": 680.0,
"kl_loss_13": 71.23473243713379,
"kl_loss_2": 1704.420654296875,
"kl_loss_4": 923.1366424560547,
"kl_loss_9": 270.8112342834473,
"learning_rate": 0.00013595043801501794,
"loss": 740.0381,
"step": 7620
},
{
"ce_loss_13": 3.2104438662528993,
"ce_loss_17": 3.1701866149902345,
"ce_loss_2": 4.04260162115097,
"ce_loss_4": 3.6209548711776733,
"ce_loss_9": 3.3081387162208555,
"epoch": 0.763,
"grad_norm": 1240.0,
"kl_loss_13": 71.1663465499878,
"kl_loss_2": 1879.5203369140625,
"kl_loss_4": 1001.1256713867188,
"kl_loss_9": 282.31495513916013,
"learning_rate": 0.00013486466002602133,
"loss": 786.8135,
"step": 7630
},
{
"ce_loss_13": 3.3238829135894776,
"ce_loss_17": 3.2859877705574037,
"ce_loss_2": 4.06448130607605,
"ce_loss_4": 3.6895562171936036,
"ce_loss_9": 3.414611339569092,
"epoch": 0.764,
"grad_norm": 844.0,
"kl_loss_13": 70.97187271118165,
"kl_loss_2": 1728.2161987304687,
"kl_loss_4": 934.7195190429687,
"kl_loss_9": 270.5533042907715,
"learning_rate": 0.00013378255894584462,
"loss": 777.6823,
"step": 7640
},
{
"ce_loss_13": 3.2600265622138975,
"ce_loss_17": 3.2197046518325805,
"ce_loss_2": 4.051760137081146,
"ce_loss_4": 3.6472837567329406,
"ce_loss_9": 3.3538818955421448,
"epoch": 0.765,
"grad_norm": 924.0,
"kl_loss_13": 71.37140407562256,
"kl_loss_2": 1799.4996520996094,
"kl_loss_4": 965.9475006103515,
"kl_loss_9": 278.5297454833984,
"learning_rate": 0.0001327041456712334,
"loss": 776.7692,
"step": 7650
},
{
"ce_loss_13": 3.298428177833557,
"ce_loss_17": 3.2592820644378664,
"ce_loss_2": 4.06556681394577,
"ce_loss_4": 3.6799978971481324,
"ce_loss_9": 3.3925637722015383,
"epoch": 0.766,
"grad_norm": 980.0,
"kl_loss_13": 71.00599212646485,
"kl_loss_2": 1772.591778564453,
"kl_loss_4": 963.7923431396484,
"kl_loss_9": 276.6527587890625,
"learning_rate": 0.00013162943106179747,
"loss": 778.0987,
"step": 7660
},
{
"ce_loss_13": 3.281181848049164,
"ce_loss_17": 3.24316520690918,
"ce_loss_2": 4.04365484714508,
"ce_loss_4": 3.651939356327057,
"ce_loss_9": 3.3700647592544555,
"epoch": 0.767,
"grad_norm": 736.0,
"kl_loss_13": 70.76591453552246,
"kl_loss_2": 1750.6124877929688,
"kl_loss_4": 946.7601104736328,
"kl_loss_9": 271.6425071716309,
"learning_rate": 0.00013055842593990132,
"loss": 760.269,
"step": 7670
},
{
"ce_loss_13": 3.2225858211517333,
"ce_loss_17": 3.1849503040313722,
"ce_loss_2": 3.9886361718177796,
"ce_loss_4": 3.603174602985382,
"ce_loss_9": 3.316388738155365,
"epoch": 0.768,
"grad_norm": 904.0,
"kl_loss_13": 68.82950401306152,
"kl_loss_2": 1732.0895629882812,
"kl_loss_4": 934.3409393310546,
"kl_loss_9": 269.33024139404296,
"learning_rate": 0.00012949114109055414,
"loss": 772.0984,
"step": 7680
},
{
"ce_loss_13": 3.2685733318328856,
"ce_loss_17": 3.230886149406433,
"ce_loss_2": 4.043664062023163,
"ce_loss_4": 3.6572733879089356,
"ce_loss_9": 3.366124129295349,
"epoch": 0.769,
"grad_norm": 768.0,
"kl_loss_13": 69.85037384033203,
"kl_loss_2": 1776.966748046875,
"kl_loss_4": 955.4688507080078,
"kl_loss_9": 275.6646896362305,
"learning_rate": 0.00012842758726130281,
"loss": 776.2871,
"step": 7690
},
{
"ce_loss_13": 3.3045955061912538,
"ce_loss_17": 3.265139162540436,
"ce_loss_2": 4.105229377746582,
"ce_loss_4": 3.698564016819,
"ce_loss_9": 3.4019991993904113,
"epoch": 0.77,
"grad_norm": 1032.0,
"kl_loss_13": 71.63125114440918,
"kl_loss_2": 1811.1598754882812,
"kl_loss_4": 967.3991241455078,
"kl_loss_9": 280.4630210876465,
"learning_rate": 0.00012736777516212267,
"loss": 765.7549,
"step": 7700
},
{
"ce_loss_13": 3.305664074420929,
"ce_loss_17": 3.2651020765304564,
"ce_loss_2": 4.0766695737838745,
"ce_loss_4": 3.6917852401733398,
"ce_loss_9": 3.40242223739624,
"epoch": 0.771,
"grad_norm": 892.0,
"kl_loss_13": 71.45978965759278,
"kl_loss_2": 1781.9480834960937,
"kl_loss_4": 961.7313262939454,
"kl_loss_9": 278.37108001708987,
"learning_rate": 0.00012631171546530968,
"loss": 759.236,
"step": 7710
},
{
"ce_loss_13": 3.313371980190277,
"ce_loss_17": 3.272728908061981,
"ce_loss_2": 4.079601192474366,
"ce_loss_4": 3.702372431755066,
"ce_loss_9": 3.4114127159118652,
"epoch": 0.772,
"grad_norm": 872.0,
"kl_loss_13": 72.43958892822266,
"kl_loss_2": 1763.9843139648438,
"kl_loss_4": 964.8334869384765,
"kl_loss_9": 279.30160369873045,
"learning_rate": 0.00012525941880537307,
"loss": 779.6013,
"step": 7720
},
{
"ce_loss_13": 3.350739538669586,
"ce_loss_17": 3.3108729243278505,
"ce_loss_2": 4.1055583477020265,
"ce_loss_4": 3.7289539337158204,
"ce_loss_9": 3.4414247274398804,
"epoch": 0.773,
"grad_norm": 2352.0,
"kl_loss_13": 71.25534629821777,
"kl_loss_2": 1736.2442199707032,
"kl_loss_4": 947.9007110595703,
"kl_loss_9": 272.0314407348633,
"learning_rate": 0.00012421089577892869,
"loss": 762.51,
"step": 7730
},
{
"ce_loss_13": 3.2997055292129516,
"ce_loss_17": 3.2605361342430115,
"ce_loss_2": 4.074028778076172,
"ce_loss_4": 3.6856356263160706,
"ce_loss_9": 3.398003101348877,
"epoch": 0.774,
"grad_norm": 1080.0,
"kl_loss_13": 70.94810771942139,
"kl_loss_2": 1778.2087951660155,
"kl_loss_4": 962.2582336425781,
"kl_loss_9": 278.71899871826173,
"learning_rate": 0.0001231661569445919,
"loss": 771.0827,
"step": 7740
},
{
"ce_loss_13": 3.1601253509521485,
"ce_loss_17": 3.121994066238403,
"ce_loss_2": 3.9486268877983095,
"ce_loss_4": 3.5523595929145815,
"ce_loss_9": 3.25460444688797,
"epoch": 0.775,
"grad_norm": 808.0,
"kl_loss_13": 69.09685134887695,
"kl_loss_2": 1793.243145751953,
"kl_loss_4": 957.2483337402343,
"kl_loss_9": 272.4994636535645,
"learning_rate": 0.00012212521282287093,
"loss": 784.6723,
"step": 7750
},
{
"ce_loss_13": 3.3081719994544985,
"ce_loss_17": 3.269343101978302,
"ce_loss_2": 4.065685200691223,
"ce_loss_4": 3.689472830295563,
"ce_loss_9": 3.405069386959076,
"epoch": 0.776,
"grad_norm": 748.0,
"kl_loss_13": 72.37533378601074,
"kl_loss_2": 1743.4986999511718,
"kl_loss_4": 953.6687194824219,
"kl_loss_9": 277.7476356506348,
"learning_rate": 0.00012108807389606158,
"loss": 779.1344,
"step": 7760
},
{
"ce_loss_13": 3.3046793580055236,
"ce_loss_17": 3.2680047869682314,
"ce_loss_2": 4.072437536716461,
"ce_loss_4": 3.6793755173683165,
"ce_loss_9": 3.398095953464508,
"epoch": 0.777,
"grad_norm": 948.0,
"kl_loss_13": 68.8588794708252,
"kl_loss_2": 1749.9745239257813,
"kl_loss_4": 931.3860107421875,
"kl_loss_9": 265.7514595031738,
"learning_rate": 0.00012005475060814159,
"loss": 758.0013,
"step": 7770
},
{
"ce_loss_13": 3.24513418674469,
"ce_loss_17": 3.205347514152527,
"ce_loss_2": 4.035976684093475,
"ce_loss_4": 3.631837856769562,
"ce_loss_9": 3.3390307068824767,
"epoch": 0.778,
"grad_norm": 1112.0,
"kl_loss_13": 71.06086196899415,
"kl_loss_2": 1820.938134765625,
"kl_loss_4": 973.9619323730469,
"kl_loss_9": 278.1083953857422,
"learning_rate": 0.00011902525336466464,
"loss": 777.8083,
"step": 7780
},
{
"ce_loss_13": 3.2333223700523375,
"ce_loss_17": 3.1922889590263366,
"ce_loss_2": 4.0394504308700565,
"ce_loss_4": 3.6332613825798035,
"ce_loss_9": 3.3305997610092164,
"epoch": 0.779,
"grad_norm": 996.0,
"kl_loss_13": 71.51114463806152,
"kl_loss_2": 1842.697412109375,
"kl_loss_4": 987.1933624267579,
"kl_loss_9": 281.46219329833986,
"learning_rate": 0.00011799959253265668,
"loss": 777.4664,
"step": 7790
},
{
"ce_loss_13": 3.293571639060974,
"ce_loss_17": 3.2538684844970702,
"ce_loss_2": 4.073249363899231,
"ce_loss_4": 3.674374008178711,
"ce_loss_9": 3.3861318469047545,
"epoch": 0.78,
"grad_norm": 992.0,
"kl_loss_13": 72.05066471099853,
"kl_loss_2": 1801.1092712402344,
"kl_loss_4": 965.7812194824219,
"kl_loss_9": 278.4226219177246,
"learning_rate": 0.00011697777844051105,
"loss": 774.5723,
"step": 7800
},
{
"ce_loss_13": 3.2768689155578614,
"ce_loss_17": 3.2374550819396974,
"ce_loss_2": 4.0919262886047365,
"ce_loss_4": 3.6746891021728514,
"ce_loss_9": 3.37604638338089,
"epoch": 0.781,
"grad_norm": 1056.0,
"kl_loss_13": 71.38128356933593,
"kl_loss_2": 1849.8265380859375,
"kl_loss_4": 979.9144989013672,
"kl_loss_9": 278.7414749145508,
"learning_rate": 0.00011595982137788402,
"loss": 781.9421,
"step": 7810
},
{
"ce_loss_13": 3.2553463697433473,
"ce_loss_17": 3.2170026302337646,
"ce_loss_2": 4.000703608989715,
"ce_loss_4": 3.627398419380188,
"ce_loss_9": 3.346537911891937,
"epoch": 0.782,
"grad_norm": 876.0,
"kl_loss_13": 69.83545207977295,
"kl_loss_2": 1720.7229370117188,
"kl_loss_4": 933.8058410644531,
"kl_loss_9": 268.9400184631348,
"learning_rate": 0.00011494573159559212,
"loss": 762.1486,
"step": 7820
},
{
"ce_loss_13": 3.240817403793335,
"ce_loss_17": 3.2020283579826354,
"ce_loss_2": 4.018700480461121,
"ce_loss_4": 3.6315406918525697,
"ce_loss_9": 3.3373527765274047,
"epoch": 0.783,
"grad_norm": 788.0,
"kl_loss_13": 70.06516742706299,
"kl_loss_2": 1784.2691650390625,
"kl_loss_4": 969.1496063232422,
"kl_loss_9": 275.128840637207,
"learning_rate": 0.00011393551930550828,
"loss": 784.6266,
"step": 7830
},
{
"ce_loss_13": 3.3773342847824095,
"ce_loss_17": 3.337771785259247,
"ce_loss_2": 4.118187880516052,
"ce_loss_4": 3.749299919605255,
"ce_loss_9": 3.4668367743492126,
"epoch": 0.784,
"grad_norm": 976.0,
"kl_loss_13": 72.88168487548828,
"kl_loss_2": 1722.021337890625,
"kl_loss_4": 940.9131134033203,
"kl_loss_9": 276.2025405883789,
"learning_rate": 0.00011292919468045875,
"loss": 759.5694,
"step": 7840
},
{
"ce_loss_13": 3.331298661231995,
"ce_loss_17": 3.293097233772278,
"ce_loss_2": 4.090363371372223,
"ce_loss_4": 3.708501470088959,
"ce_loss_9": 3.4256391763687133,
"epoch": 0.785,
"grad_norm": 716.0,
"kl_loss_13": 70.71659488677979,
"kl_loss_2": 1758.3239807128907,
"kl_loss_4": 952.2195709228515,
"kl_loss_9": 275.8377281188965,
"learning_rate": 0.00011192676785412154,
"loss": 758.0768,
"step": 7850
},
{
"ce_loss_13": 3.2701008558273315,
"ce_loss_17": 3.2280179619789124,
"ce_loss_2": 4.070445036888122,
"ce_loss_4": 3.6703123331069945,
"ce_loss_9": 3.367347037792206,
"epoch": 0.786,
"grad_norm": 1312.0,
"kl_loss_13": 71.71622810363769,
"kl_loss_2": 1807.7423217773437,
"kl_loss_4": 965.1001220703125,
"kl_loss_9": 275.57702484130857,
"learning_rate": 0.00011092824892092374,
"loss": 775.3067,
"step": 7860
},
{
"ce_loss_13": 3.2034618616104127,
"ce_loss_17": 3.16623512506485,
"ce_loss_2": 4.00084820985794,
"ce_loss_4": 3.602157771587372,
"ce_loss_9": 3.298418068885803,
"epoch": 0.787,
"grad_norm": 664.0,
"kl_loss_13": 70.10608253479003,
"kl_loss_2": 1820.81669921875,
"kl_loss_4": 982.6229064941406,
"kl_loss_9": 274.26637802124026,
"learning_rate": 0.0001099336479359398,
"loss": 770.4278,
"step": 7870
},
{
"ce_loss_13": 3.327226400375366,
"ce_loss_17": 3.289413809776306,
"ce_loss_2": 4.066450893878937,
"ce_loss_4": 3.6946874141693113,
"ce_loss_9": 3.4174214959144593,
"epoch": 0.788,
"grad_norm": 760.0,
"kl_loss_13": 70.14546699523926,
"kl_loss_2": 1729.8601928710937,
"kl_loss_4": 936.3539489746094,
"kl_loss_9": 270.65756072998045,
"learning_rate": 0.00010894297491479043,
"loss": 763.074,
"step": 7880
},
{
"ce_loss_13": 3.314143991470337,
"ce_loss_17": 3.27738618850708,
"ce_loss_2": 4.085150945186615,
"ce_loss_4": 3.6851558089256287,
"ce_loss_9": 3.4041661500930784,
"epoch": 0.789,
"grad_norm": 736.0,
"kl_loss_13": 70.67064800262452,
"kl_loss_2": 1768.3765869140625,
"kl_loss_4": 944.1422546386718,
"kl_loss_9": 274.3588096618652,
"learning_rate": 0.00010795623983354214,
"loss": 759.3542,
"step": 7890
},
{
"ce_loss_13": 3.2119219303131104,
"ce_loss_17": 3.1734250664710997,
"ce_loss_2": 3.9959056615829467,
"ce_loss_4": 3.6035253524780275,
"ce_loss_9": 3.309849750995636,
"epoch": 0.79,
"grad_norm": 972.0,
"kl_loss_13": 70.98051776885987,
"kl_loss_2": 1806.2532165527343,
"kl_loss_4": 973.5648162841796,
"kl_loss_9": 282.1634231567383,
"learning_rate": 0.00010697345262860636,
"loss": 769.9423,
"step": 7900
},
{
"ce_loss_13": 3.346919822692871,
"ce_loss_17": 3.3095391273498533,
"ce_loss_2": 4.095262408256531,
"ce_loss_4": 3.709192931652069,
"ce_loss_9": 3.437203562259674,
"epoch": 0.791,
"grad_norm": 1072.0,
"kl_loss_13": 70.88391819000245,
"kl_loss_2": 1729.3513916015625,
"kl_loss_4": 928.2963439941407,
"kl_loss_9": 271.72196731567385,
"learning_rate": 0.00010599462319663906,
"loss": 752.6117,
"step": 7910
},
{
"ce_loss_13": 3.3200356721878053,
"ce_loss_17": 3.2812179923057556,
"ce_loss_2": 4.047398543357849,
"ce_loss_4": 3.6830509424209597,
"ce_loss_9": 3.4092038750648497,
"epoch": 0.792,
"grad_norm": 784.0,
"kl_loss_13": 69.72565822601318,
"kl_loss_2": 1688.08408203125,
"kl_loss_4": 914.8679321289062,
"kl_loss_9": 266.0558052062988,
"learning_rate": 0.00010501976139444191,
"loss": 744.4802,
"step": 7920
},
{
"ce_loss_13": 3.3473254561424257,
"ce_loss_17": 3.3079727411270143,
"ce_loss_2": 4.098410105705261,
"ce_loss_4": 3.719019615650177,
"ce_loss_9": 3.43493013381958,
"epoch": 0.793,
"grad_norm": 1472.0,
"kl_loss_13": 71.42715454101562,
"kl_loss_2": 1740.3159423828124,
"kl_loss_4": 933.7770782470703,
"kl_loss_9": 269.4993034362793,
"learning_rate": 0.0001040488770388625,
"loss": 768.4107,
"step": 7930
},
{
"ce_loss_13": 3.2970604300498962,
"ce_loss_17": 3.2603843569755555,
"ce_loss_2": 4.063302505016327,
"ce_loss_4": 3.673347556591034,
"ce_loss_9": 3.387963795661926,
"epoch": 0.794,
"grad_norm": 968.0,
"kl_loss_13": 70.85658683776856,
"kl_loss_2": 1781.4008056640625,
"kl_loss_4": 959.971401977539,
"kl_loss_9": 274.77855758666993,
"learning_rate": 0.00010308197990669538,
"loss": 766.2293,
"step": 7940
},
{
"ce_loss_13": 3.4099225759506226,
"ce_loss_17": 3.3691091775894164,
"ce_loss_2": 4.165284764766693,
"ce_loss_4": 3.784447419643402,
"ce_loss_9": 3.500907635688782,
"epoch": 0.795,
"grad_norm": 880.0,
"kl_loss_13": 73.56418590545654,
"kl_loss_2": 1752.826593017578,
"kl_loss_4": 951.5711303710938,
"kl_loss_9": 278.11546325683594,
"learning_rate": 0.0001021190797345839,
"loss": 760.0999,
"step": 7950
},
{
"ce_loss_13": 3.135342812538147,
"ce_loss_17": 3.0957832813262938,
"ce_loss_2": 3.9510748744010926,
"ce_loss_4": 3.5491525173187255,
"ce_loss_9": 3.233706223964691,
"epoch": 0.796,
"grad_norm": 1040.0,
"kl_loss_13": 71.97039318084717,
"kl_loss_2": 1850.091290283203,
"kl_loss_4": 1004.5883972167969,
"kl_loss_9": 287.36984634399414,
"learning_rate": 0.00010116018621892236,
"loss": 782.3515,
"step": 7960
},
{
"ce_loss_13": 3.341820991039276,
"ce_loss_17": 3.3028963685035704,
"ce_loss_2": 4.1250855922698975,
"ce_loss_4": 3.729242372512817,
"ce_loss_9": 3.4384153485298157,
"epoch": 0.797,
"grad_norm": 836.0,
"kl_loss_13": 74.41549568176269,
"kl_loss_2": 1800.5012512207031,
"kl_loss_4": 978.0294982910157,
"kl_loss_9": 285.6070960998535,
"learning_rate": 0.00010020530901575753,
"loss": 762.0277,
"step": 7970
},
{
"ce_loss_13": 3.369021511077881,
"ce_loss_17": 3.3300336837768554,
"ce_loss_2": 4.121864223480225,
"ce_loss_4": 3.742775762081146,
"ce_loss_9": 3.4580808758735655,
"epoch": 0.798,
"grad_norm": 688.0,
"kl_loss_13": 71.71014213562012,
"kl_loss_2": 1744.4852905273438,
"kl_loss_4": 954.4276824951172,
"kl_loss_9": 276.1943420410156,
"learning_rate": 9.925445774069231e-05,
"loss": 751.1472,
"step": 7980
},
{
"ce_loss_13": 3.3181877017021177,
"ce_loss_17": 3.2799045562744142,
"ce_loss_2": 4.085768818855286,
"ce_loss_4": 3.7050577878952025,
"ce_loss_9": 3.41319922208786,
"epoch": 0.799,
"grad_norm": 908.0,
"kl_loss_13": 71.46154289245605,
"kl_loss_2": 1742.6618408203126,
"kl_loss_4": 944.5083557128906,
"kl_loss_9": 273.0162544250488,
"learning_rate": 9.830764196878872e-05,
"loss": 749.2419,
"step": 7990
},
{
"ce_loss_13": 3.263114702701569,
"ce_loss_17": 3.2257192015647886,
"ce_loss_2": 4.035439610481262,
"ce_loss_4": 3.6414616227149965,
"ce_loss_9": 3.3571359753608703,
"epoch": 0.8,
"grad_norm": 748.0,
"kl_loss_13": 69.72845554351807,
"kl_loss_2": 1804.6130981445312,
"kl_loss_4": 963.6896820068359,
"kl_loss_9": 273.2717483520508,
"learning_rate": 9.736487123447069e-05,
"loss": 769.4352,
"step": 8000
},
{
"ce_loss_13": 3.215253698825836,
"ce_loss_17": 3.175714838504791,
"ce_loss_2": 4.042688941955566,
"ce_loss_4": 3.6105726838111876,
"ce_loss_9": 3.3094853520393372,
"epoch": 0.801,
"grad_norm": 692.0,
"kl_loss_13": 70.90237617492676,
"kl_loss_2": 1899.3806396484374,
"kl_loss_4": 990.8334075927735,
"kl_loss_9": 274.07350997924806,
"learning_rate": 9.642615503142926e-05,
"loss": 789.7463,
"step": 8010
},
{
"ce_loss_13": 3.2806339621543885,
"ce_loss_17": 3.2416765809059145,
"ce_loss_2": 4.056443822383881,
"ce_loss_4": 3.6604487776756285,
"ce_loss_9": 3.376413810253143,
"epoch": 0.802,
"grad_norm": 884.0,
"kl_loss_13": 70.28888111114502,
"kl_loss_2": 1780.5822021484375,
"kl_loss_4": 944.0170684814453,
"kl_loss_9": 270.7223930358887,
"learning_rate": 9.549150281252633e-05,
"loss": 755.7942,
"step": 8020
},
{
"ce_loss_13": 3.3055353283882143,
"ce_loss_17": 3.2666314482688903,
"ce_loss_2": 4.071729218959808,
"ce_loss_4": 3.6816184878349305,
"ce_loss_9": 3.398531424999237,
"epoch": 0.803,
"grad_norm": 852.0,
"kl_loss_13": 71.74902591705322,
"kl_loss_2": 1772.0785400390625,
"kl_loss_4": 942.818148803711,
"kl_loss_9": 274.69599533081055,
"learning_rate": 9.4560923989699e-05,
"loss": 776.9241,
"step": 8030
},
{
"ce_loss_13": 3.2983041524887087,
"ce_loss_17": 3.260856258869171,
"ce_loss_2": 4.068617153167724,
"ce_loss_4": 3.6743965983390807,
"ce_loss_9": 3.3917889475822447,
"epoch": 0.804,
"grad_norm": 900.0,
"kl_loss_13": 70.64652194976807,
"kl_loss_2": 1753.8257873535156,
"kl_loss_4": 938.5328674316406,
"kl_loss_9": 273.6092224121094,
"learning_rate": 9.363442793386607e-05,
"loss": 777.0836,
"step": 8040
},
{
"ce_loss_13": 3.2714555144309996,
"ce_loss_17": 3.231795275211334,
"ce_loss_2": 4.066785430908203,
"ce_loss_4": 3.6732001066207887,
"ce_loss_9": 3.3679249763488768,
"epoch": 0.805,
"grad_norm": 984.0,
"kl_loss_13": 72.06962203979492,
"kl_loss_2": 1801.3849243164063,
"kl_loss_4": 984.603271484375,
"kl_loss_9": 283.51671295166017,
"learning_rate": 9.271202397483213e-05,
"loss": 761.9754,
"step": 8050
},
{
"ce_loss_13": 3.3052454233169555,
"ce_loss_17": 3.2672803163528443,
"ce_loss_2": 4.0526569247245785,
"ce_loss_4": 3.667846715450287,
"ce_loss_9": 3.3939131259918214,
"epoch": 0.806,
"grad_norm": 832.0,
"kl_loss_13": 70.0618745803833,
"kl_loss_2": 1736.9171203613282,
"kl_loss_4": 927.4877319335938,
"kl_loss_9": 268.56889114379885,
"learning_rate": 9.179372140119524e-05,
"loss": 771.3464,
"step": 8060
},
{
"ce_loss_13": 3.250062882900238,
"ce_loss_17": 3.2121487855911255,
"ce_loss_2": 3.9981072664260866,
"ce_loss_4": 3.618996787071228,
"ce_loss_9": 3.339201021194458,
"epoch": 0.807,
"grad_norm": 968.0,
"kl_loss_13": 69.4912302017212,
"kl_loss_2": 1744.63623046875,
"kl_loss_4": 938.9472076416016,
"kl_loss_9": 271.9808044433594,
"learning_rate": 9.087952946025175e-05,
"loss": 772.0036,
"step": 8070
},
{
"ce_loss_13": 3.3562884092330934,
"ce_loss_17": 3.3175664067268373,
"ce_loss_2": 4.0800391912460325,
"ce_loss_4": 3.7106515407562255,
"ce_loss_9": 3.4446619272232057,
"epoch": 0.808,
"grad_norm": 704.0,
"kl_loss_13": 70.61629962921143,
"kl_loss_2": 1696.0239379882812,
"kl_loss_4": 904.205810546875,
"kl_loss_9": 266.76557540893555,
"learning_rate": 8.996945735790446e-05,
"loss": 758.6735,
"step": 8080
},
{
"ce_loss_13": 3.2506332993507385,
"ce_loss_17": 3.212847054004669,
"ce_loss_2": 4.017482662200928,
"ce_loss_4": 3.631740617752075,
"ce_loss_9": 3.3438016176223755,
"epoch": 0.809,
"grad_norm": 756.0,
"kl_loss_13": 70.42041702270508,
"kl_loss_2": 1766.6884155273438,
"kl_loss_4": 954.8519500732422,
"kl_loss_9": 271.8013496398926,
"learning_rate": 8.906351425856951e-05,
"loss": 775.4984,
"step": 8090
},
{
"ce_loss_13": 3.2348312735557556,
"ce_loss_17": 3.196401846408844,
"ce_loss_2": 4.024988865852356,
"ce_loss_4": 3.624850368499756,
"ce_loss_9": 3.3307148933410646,
"epoch": 0.81,
"grad_norm": 1096.0,
"kl_loss_13": 70.86194591522217,
"kl_loss_2": 1827.4809509277343,
"kl_loss_4": 976.5987243652344,
"kl_loss_9": 276.3275917053223,
"learning_rate": 8.816170928508365e-05,
"loss": 782.7902,
"step": 8100
},
{
"ce_loss_13": 3.202365779876709,
"ce_loss_17": 3.164315390586853,
"ce_loss_2": 4.0153038740158085,
"ce_loss_4": 3.5968923449516295,
"ce_loss_9": 3.3008163809776305,
"epoch": 0.811,
"grad_norm": 700.0,
"kl_loss_13": 70.01141414642333,
"kl_loss_2": 1855.7152587890625,
"kl_loss_4": 981.6325653076171,
"kl_loss_9": 277.6419342041016,
"learning_rate": 8.7264051518613e-05,
"loss": 779.224,
"step": 8110
},
{
"ce_loss_13": 3.290323805809021,
"ce_loss_17": 3.2542584896087647,
"ce_loss_2": 4.046256864070893,
"ce_loss_4": 3.662633013725281,
"ce_loss_9": 3.3823811054229735,
"epoch": 0.812,
"grad_norm": 948.0,
"kl_loss_13": 68.87970733642578,
"kl_loss_2": 1737.5715454101562,
"kl_loss_4": 926.0573944091797,
"kl_loss_9": 267.5809020996094,
"learning_rate": 8.637054999856148e-05,
"loss": 759.8539,
"step": 8120
},
{
"ce_loss_13": 3.2804335355758667,
"ce_loss_17": 3.2413512229919434,
"ce_loss_2": 4.054854559898376,
"ce_loss_4": 3.6653516054153443,
"ce_loss_9": 3.3767602682113647,
"epoch": 0.813,
"grad_norm": 800.0,
"kl_loss_13": 70.57294750213623,
"kl_loss_2": 1777.4925964355468,
"kl_loss_4": 956.8676177978516,
"kl_loss_9": 274.3025924682617,
"learning_rate": 8.548121372247918e-05,
"loss": 779.4507,
"step": 8130
},
{
"ce_loss_13": 3.349512314796448,
"ce_loss_17": 3.3119771838188172,
"ce_loss_2": 4.094702482223511,
"ce_loss_4": 3.7121527791023254,
"ce_loss_9": 3.4385990619659426,
"epoch": 0.814,
"grad_norm": 960.0,
"kl_loss_13": 70.90967864990235,
"kl_loss_2": 1739.1879577636719,
"kl_loss_4": 929.7545379638672,
"kl_loss_9": 271.06103591918946,
"learning_rate": 8.459605164597267e-05,
"loss": 759.1175,
"step": 8140
},
{
"ce_loss_13": 3.2360759139060975,
"ce_loss_17": 3.197964668273926,
"ce_loss_2": 4.00811208486557,
"ce_loss_4": 3.6193493843078612,
"ce_loss_9": 3.3254319429397583,
"epoch": 0.815,
"grad_norm": 900.0,
"kl_loss_13": 69.83507804870605,
"kl_loss_2": 1777.051202392578,
"kl_loss_4": 955.8897918701172,
"kl_loss_9": 271.30548400878905,
"learning_rate": 8.371507268261436e-05,
"loss": 771.6216,
"step": 8150
},
{
"ce_loss_13": 3.3079094648361207,
"ce_loss_17": 3.2703657865524294,
"ce_loss_2": 4.0787346959114075,
"ce_loss_4": 3.692156195640564,
"ce_loss_9": 3.4013009071350098,
"epoch": 0.816,
"grad_norm": 640.0,
"kl_loss_13": 70.76464653015137,
"kl_loss_2": 1769.2670593261719,
"kl_loss_4": 954.974008178711,
"kl_loss_9": 275.3981559753418,
"learning_rate": 8.283828570385238e-05,
"loss": 750.3438,
"step": 8160
},
{
"ce_loss_13": 3.3092649817466735,
"ce_loss_17": 3.2702112197875977,
"ce_loss_2": 4.073802161216736,
"ce_loss_4": 3.688433313369751,
"ce_loss_9": 3.4022815227508545,
"epoch": 0.817,
"grad_norm": 728.0,
"kl_loss_13": 71.31716861724854,
"kl_loss_2": 1728.0110778808594,
"kl_loss_4": 938.0338684082031,
"kl_loss_9": 272.21169128417966,
"learning_rate": 8.196569953892202e-05,
"loss": 758.1972,
"step": 8170
},
{
"ce_loss_13": 3.230646347999573,
"ce_loss_17": 3.1929505705833434,
"ce_loss_2": 4.0089329242706295,
"ce_loss_4": 3.6149255514144896,
"ce_loss_9": 3.3274157643318176,
"epoch": 0.818,
"grad_norm": 888.0,
"kl_loss_13": 70.58127098083496,
"kl_loss_2": 1758.4310180664063,
"kl_loss_4": 950.9697113037109,
"kl_loss_9": 274.587052154541,
"learning_rate": 8.109732297475635e-05,
"loss": 758.4494,
"step": 8180
},
{
"ce_loss_13": 3.1995676875114443,
"ce_loss_17": 3.1593656182289123,
"ce_loss_2": 4.04011173248291,
"ce_loss_4": 3.627755606174469,
"ce_loss_9": 3.3016679525375365,
"epoch": 0.819,
"grad_norm": 796.0,
"kl_loss_13": 72.1693660736084,
"kl_loss_2": 1876.0396484375,
"kl_loss_4": 1012.8940368652344,
"kl_loss_9": 286.2684616088867,
"learning_rate": 8.023316475589754e-05,
"loss": 792.0807,
"step": 8190
},
{
"ce_loss_13": 3.168242931365967,
"ce_loss_17": 3.1274430990219115,
"ce_loss_2": 4.0250523686409,
"ce_loss_4": 3.5892801761627195,
"ce_loss_9": 3.2700828194618223,
"epoch": 0.82,
"grad_norm": 1320.0,
"kl_loss_13": 73.71393585205078,
"kl_loss_2": 1926.69619140625,
"kl_loss_4": 1025.3121826171875,
"kl_loss_9": 291.9488471984863,
"learning_rate": 7.937323358440934e-05,
"loss": 808.6539,
"step": 8200
},
{
"ce_loss_13": 3.291755425930023,
"ce_loss_17": 3.256150817871094,
"ce_loss_2": 4.031732153892517,
"ce_loss_4": 3.6551050424575804,
"ce_loss_9": 3.379641282558441,
"epoch": 0.821,
"grad_norm": 708.0,
"kl_loss_13": 69.81248378753662,
"kl_loss_2": 1714.6736938476563,
"kl_loss_4": 931.0094848632813,
"kl_loss_9": 268.1382438659668,
"learning_rate": 7.851753811978923e-05,
"loss": 758.6292,
"step": 8210
},
{
"ce_loss_13": 3.306903636455536,
"ce_loss_17": 3.2677652716636656,
"ce_loss_2": 4.0906357884407045,
"ce_loss_4": 3.69010044336319,
"ce_loss_9": 3.4002402305603026,
"epoch": 0.822,
"grad_norm": 868.0,
"kl_loss_13": 71.25135803222656,
"kl_loss_2": 1789.5710693359374,
"kl_loss_4": 954.998403930664,
"kl_loss_9": 273.52307586669923,
"learning_rate": 7.766608697888095e-05,
"loss": 763.3939,
"step": 8220
},
{
"ce_loss_13": 3.3179776072502136,
"ce_loss_17": 3.2780291557312013,
"ce_loss_2": 4.08966873884201,
"ce_loss_4": 3.69758517742157,
"ce_loss_9": 3.4125046730041504,
"epoch": 0.823,
"grad_norm": 1112.0,
"kl_loss_13": 72.05121192932128,
"kl_loss_2": 1786.785711669922,
"kl_loss_4": 958.7785858154297,
"kl_loss_9": 277.2726997375488,
"learning_rate": 7.681888873578785e-05,
"loss": 778.8477,
"step": 8230
},
{
"ce_loss_13": 3.2508747458457945,
"ce_loss_17": 3.2092309474945067,
"ce_loss_2": 4.041729521751404,
"ce_loss_4": 3.643390250205994,
"ce_loss_9": 3.349603259563446,
"epoch": 0.824,
"grad_norm": 884.0,
"kl_loss_13": 72.39484767913818,
"kl_loss_2": 1815.295880126953,
"kl_loss_4": 973.4565734863281,
"kl_loss_9": 281.2719497680664,
"learning_rate": 7.597595192178702e-05,
"loss": 769.4248,
"step": 8240
},
{
"ce_loss_13": 3.250980806350708,
"ce_loss_17": 3.2109704494476317,
"ce_loss_2": 4.054522025585174,
"ce_loss_4": 3.645539367198944,
"ce_loss_9": 3.34756623506546,
"epoch": 0.825,
"grad_norm": 768.0,
"kl_loss_13": 72.21156806945801,
"kl_loss_2": 1856.1005126953125,
"kl_loss_4": 995.8562927246094,
"kl_loss_9": 282.67078399658203,
"learning_rate": 7.513728502524286e-05,
"loss": 790.4012,
"step": 8250
},
{
"ce_loss_13": 3.250458598136902,
"ce_loss_17": 3.2125913977622984,
"ce_loss_2": 4.01266096830368,
"ce_loss_4": 3.622589910030365,
"ce_loss_9": 3.3403961658477783,
"epoch": 0.826,
"grad_norm": 1040.0,
"kl_loss_13": 68.84297733306884,
"kl_loss_2": 1746.9054443359375,
"kl_loss_4": 937.1652435302734,
"kl_loss_9": 268.0669677734375,
"learning_rate": 7.430289649152156e-05,
"loss": 771.2598,
"step": 8260
},
{
"ce_loss_13": 3.1503018498420716,
"ce_loss_17": 3.1115196704864503,
"ce_loss_2": 3.9706924200057983,
"ce_loss_4": 3.5614026308059694,
"ce_loss_9": 3.2497955560684204,
"epoch": 0.827,
"grad_norm": 900.0,
"kl_loss_13": 70.41055603027344,
"kl_loss_2": 1863.2432922363282,
"kl_loss_4": 1000.8920288085938,
"kl_loss_9": 280.12621841430666,
"learning_rate": 7.347279472290646e-05,
"loss": 778.9465,
"step": 8270
},
{
"ce_loss_13": 3.2909953236579894,
"ce_loss_17": 3.2529433488845827,
"ce_loss_2": 4.078157913684845,
"ce_loss_4": 3.674878942966461,
"ce_loss_9": 3.3848023533821108,
"epoch": 0.828,
"grad_norm": 744.0,
"kl_loss_13": 71.25290489196777,
"kl_loss_2": 1796.028582763672,
"kl_loss_4": 962.6837554931641,
"kl_loss_9": 272.6476791381836,
"learning_rate": 7.264698807851328e-05,
"loss": 779.5428,
"step": 8280
},
{
"ce_loss_13": 3.263857388496399,
"ce_loss_17": 3.2270354986190797,
"ce_loss_2": 4.014006841182709,
"ce_loss_4": 3.6293476581573487,
"ce_loss_9": 3.351469397544861,
"epoch": 0.829,
"grad_norm": 840.0,
"kl_loss_13": 68.5794059753418,
"kl_loss_2": 1724.5256225585938,
"kl_loss_4": 925.4598114013672,
"kl_loss_9": 267.2643295288086,
"learning_rate": 7.182548487420554e-05,
"loss": 759.1056,
"step": 8290
},
{
"ce_loss_13": 3.311834120750427,
"ce_loss_17": 3.272626531124115,
"ce_loss_2": 4.066555631160736,
"ce_loss_4": 3.6880828857421877,
"ce_loss_9": 3.4068371653556824,
"epoch": 0.83,
"grad_norm": 692.0,
"kl_loss_13": 71.82446117401123,
"kl_loss_2": 1767.156024169922,
"kl_loss_4": 956.2914825439453,
"kl_loss_9": 277.0776992797852,
"learning_rate": 7.100829338251146e-05,
"loss": 766.3485,
"step": 8300
},
{
"ce_loss_13": 3.249209761619568,
"ce_loss_17": 3.209535229206085,
"ce_loss_2": 4.045548975467682,
"ce_loss_4": 3.6445645332336425,
"ce_loss_9": 3.344308114051819,
"epoch": 0.831,
"grad_norm": 984.0,
"kl_loss_13": 71.40059185028076,
"kl_loss_2": 1808.8923950195312,
"kl_loss_4": 975.227816772461,
"kl_loss_9": 279.40746307373047,
"learning_rate": 7.019542183254046e-05,
"loss": 768.1299,
"step": 8310
},
{
"ce_loss_13": 3.285986268520355,
"ce_loss_17": 3.24474892616272,
"ce_loss_2": 4.044291996955872,
"ce_loss_4": 3.65914249420166,
"ce_loss_9": 3.379481887817383,
"epoch": 0.832,
"grad_norm": 1136.0,
"kl_loss_13": 74.21089000701905,
"kl_loss_2": 1763.9286071777344,
"kl_loss_4": 951.864404296875,
"kl_loss_9": 279.7374038696289,
"learning_rate": 6.938687840989971e-05,
"loss": 768.0131,
"step": 8320
},
{
"ce_loss_13": 3.227405917644501,
"ce_loss_17": 3.1868462920188905,
"ce_loss_2": 3.9979352712631226,
"ce_loss_4": 3.6121195673942568,
"ce_loss_9": 3.3207536339759827,
"epoch": 0.833,
"grad_norm": 992.0,
"kl_loss_13": 71.92461414337158,
"kl_loss_2": 1749.97158203125,
"kl_loss_4": 951.9216796875,
"kl_loss_9": 275.5987823486328,
"learning_rate": 6.858267125661271e-05,
"loss": 773.1094,
"step": 8330
},
{
"ce_loss_13": 3.283363175392151,
"ce_loss_17": 3.2460866570472717,
"ce_loss_2": 4.060705983638764,
"ce_loss_4": 3.6730141282081603,
"ce_loss_9": 3.380064380168915,
"epoch": 0.834,
"grad_norm": 1192.0,
"kl_loss_13": 69.57496089935303,
"kl_loss_2": 1759.7617492675781,
"kl_loss_4": 951.299429321289,
"kl_loss_9": 272.3924873352051,
"learning_rate": 6.778280847103668e-05,
"loss": 782.0201,
"step": 8340
},
{
"ce_loss_13": 3.2931501269340515,
"ce_loss_17": 3.2552053213119505,
"ce_loss_2": 4.059948515892029,
"ce_loss_4": 3.6772831797599794,
"ce_loss_9": 3.386395478248596,
"epoch": 0.835,
"grad_norm": 756.0,
"kl_loss_13": 71.99072380065918,
"kl_loss_2": 1784.4557189941406,
"kl_loss_4": 968.9391723632813,
"kl_loss_9": 280.3265556335449,
"learning_rate": 6.698729810778065e-05,
"loss": 768.9306,
"step": 8350
},
{
"ce_loss_13": 3.2062022924423217,
"ce_loss_17": 3.167280352115631,
"ce_loss_2": 3.992838132381439,
"ce_loss_4": 3.594524645805359,
"ce_loss_9": 3.3012266397476195,
"epoch": 0.836,
"grad_norm": 1392.0,
"kl_loss_13": 68.42381629943847,
"kl_loss_2": 1783.9000061035156,
"kl_loss_4": 959.2278900146484,
"kl_loss_9": 273.95938339233396,
"learning_rate": 6.619614817762538e-05,
"loss": 774.9958,
"step": 8360
},
{
"ce_loss_13": 3.170720875263214,
"ce_loss_17": 3.131775438785553,
"ce_loss_2": 4.007088744640351,
"ce_loss_4": 3.580740916728973,
"ce_loss_9": 3.26929292678833,
"epoch": 0.837,
"grad_norm": 808.0,
"kl_loss_13": 69.11339492797852,
"kl_loss_2": 1886.8944580078125,
"kl_loss_4": 999.9775939941406,
"kl_loss_9": 279.94310607910154,
"learning_rate": 6.540936664744196e-05,
"loss": 790.6644,
"step": 8370
},
{
"ce_loss_13": 3.3124632954597475,
"ce_loss_17": 3.272116649150848,
"ce_loss_2": 4.103182435035706,
"ce_loss_4": 3.7041156768798826,
"ce_loss_9": 3.4053141117095946,
"epoch": 0.838,
"grad_norm": 652.0,
"kl_loss_13": 72.02493133544922,
"kl_loss_2": 1790.3144104003907,
"kl_loss_4": 962.9958099365234,
"kl_loss_9": 275.7400604248047,
"learning_rate": 6.462696144011149e-05,
"loss": 764.9788,
"step": 8380
},
{
"ce_loss_13": 3.2692295789718626,
"ce_loss_17": 3.230850112438202,
"ce_loss_2": 4.035164856910706,
"ce_loss_4": 3.6583542227745056,
"ce_loss_9": 3.3649362325668335,
"epoch": 0.839,
"grad_norm": 964.0,
"kl_loss_13": 72.52585830688477,
"kl_loss_2": 1767.6659545898438,
"kl_loss_4": 971.1108917236328,
"kl_loss_9": 279.3069702148438,
"learning_rate": 6.384894043444567e-05,
"loss": 762.8141,
"step": 8390
},
{
"ce_loss_13": 3.295042598247528,
"ce_loss_17": 3.257158863544464,
"ce_loss_2": 4.079833257198334,
"ce_loss_4": 3.6833842754364015,
"ce_loss_9": 3.39192214012146,
"epoch": 0.84,
"grad_norm": 1008.0,
"kl_loss_13": 71.43204746246337,
"kl_loss_2": 1781.3298583984374,
"kl_loss_4": 956.8086730957032,
"kl_loss_9": 275.4544845581055,
"learning_rate": 6.307531146510753e-05,
"loss": 764.5186,
"step": 8400
},
{
"ce_loss_13": 3.270242619514465,
"ce_loss_17": 3.231538414955139,
"ce_loss_2": 4.02031763792038,
"ce_loss_4": 3.6488274216651915,
"ce_loss_9": 3.361456108093262,
"epoch": 0.841,
"grad_norm": 992.0,
"kl_loss_13": 70.6785327911377,
"kl_loss_2": 1725.7596313476563,
"kl_loss_4": 943.61474609375,
"kl_loss_9": 273.6351890563965,
"learning_rate": 6.230608232253226e-05,
"loss": 751.6663,
"step": 8410
},
{
"ce_loss_13": 3.229662823677063,
"ce_loss_17": 3.19106924533844,
"ce_loss_2": 4.043300378322601,
"ce_loss_4": 3.634285008907318,
"ce_loss_9": 3.324590063095093,
"epoch": 0.842,
"grad_norm": 1056.0,
"kl_loss_13": 71.09436721801758,
"kl_loss_2": 1843.1129028320313,
"kl_loss_4": 992.1776947021484,
"kl_loss_9": 281.0560470581055,
"learning_rate": 6.154126075284855e-05,
"loss": 770.8019,
"step": 8420
},
{
"ce_loss_13": 3.3237983703613283,
"ce_loss_17": 3.2856169939041138,
"ce_loss_2": 4.068436872959137,
"ce_loss_4": 3.697681736946106,
"ce_loss_9": 3.4135434150695803,
"epoch": 0.843,
"grad_norm": 964.0,
"kl_loss_13": 69.21185264587402,
"kl_loss_2": 1709.265692138672,
"kl_loss_4": 943.1334503173828,
"kl_loss_9": 268.73162231445315,
"learning_rate": 6.078085445780129e-05,
"loss": 747.3139,
"step": 8430
},
{
"ce_loss_13": 3.326972723007202,
"ce_loss_17": 3.2875264525413512,
"ce_loss_2": 4.109538185596466,
"ce_loss_4": 3.707826542854309,
"ce_loss_9": 3.420196759700775,
"epoch": 0.844,
"grad_norm": 988.0,
"kl_loss_13": 72.10048751831054,
"kl_loss_2": 1796.7720275878905,
"kl_loss_4": 955.2393432617188,
"kl_loss_9": 276.44380645751954,
"learning_rate": 6.002487109467347e-05,
"loss": 757.8997,
"step": 8440
},
{
"ce_loss_13": 3.3340129971504213,
"ce_loss_17": 3.2949753284454344,
"ce_loss_2": 4.08952693939209,
"ce_loss_4": 3.7143755912780763,
"ce_loss_9": 3.4288853764533997,
"epoch": 0.845,
"grad_norm": 788.0,
"kl_loss_13": 72.3598985671997,
"kl_loss_2": 1756.4138244628907,
"kl_loss_4": 955.092416381836,
"kl_loss_9": 280.5437271118164,
"learning_rate": 5.927331827620902e-05,
"loss": 761.8184,
"step": 8450
},
{
"ce_loss_13": 3.3196855664253233,
"ce_loss_17": 3.281851589679718,
"ce_loss_2": 4.05379341840744,
"ce_loss_4": 3.686984992027283,
"ce_loss_9": 3.4138710618019106,
"epoch": 0.846,
"grad_norm": 776.0,
"kl_loss_13": 69.6445505142212,
"kl_loss_2": 1699.683575439453,
"kl_loss_4": 928.4897003173828,
"kl_loss_9": 270.78993072509763,
"learning_rate": 5.852620357053651e-05,
"loss": 758.1724,
"step": 8460
},
{
"ce_loss_13": 3.3563570737838746,
"ce_loss_17": 3.3188074350357057,
"ce_loss_2": 4.095889627933502,
"ce_loss_4": 3.7254838228225706,
"ce_loss_9": 3.4483654141426086,
"epoch": 0.847,
"grad_norm": 980.0,
"kl_loss_13": 69.58336372375489,
"kl_loss_2": 1713.8079406738282,
"kl_loss_4": 931.7415771484375,
"kl_loss_9": 271.48472900390624,
"learning_rate": 5.778353450109286e-05,
"loss": 753.4014,
"step": 8470
},
{
"ce_loss_13": 3.3939215540885925,
"ce_loss_17": 3.3531723380088807,
"ce_loss_2": 4.165327072143555,
"ce_loss_4": 3.7743943333625793,
"ce_loss_9": 3.4884645462036135,
"epoch": 0.848,
"grad_norm": 932.0,
"kl_loss_13": 72.31379737854004,
"kl_loss_2": 1771.1787109375,
"kl_loss_4": 953.3261779785156,
"kl_loss_9": 278.8832099914551,
"learning_rate": 5.7045318546547206e-05,
"loss": 763.3915,
"step": 8480
},
{
"ce_loss_13": 3.2871436715126037,
"ce_loss_17": 3.2488813638687133,
"ce_loss_2": 4.066386353969574,
"ce_loss_4": 3.6715317487716677,
"ce_loss_9": 3.381435012817383,
"epoch": 0.849,
"grad_norm": 900.0,
"kl_loss_13": 71.21525192260742,
"kl_loss_2": 1788.6527099609375,
"kl_loss_4": 959.9057922363281,
"kl_loss_9": 273.3738731384277,
"learning_rate": 5.631156314072605e-05,
"loss": 761.9224,
"step": 8490
},
{
"ce_loss_13": 3.310305631160736,
"ce_loss_17": 3.2725605964660645,
"ce_loss_2": 4.052412235736847,
"ce_loss_4": 3.6782548785209657,
"ce_loss_9": 3.400036323070526,
"epoch": 0.85,
"grad_norm": 940.0,
"kl_loss_13": 70.49066848754883,
"kl_loss_2": 1720.9379455566407,
"kl_loss_4": 929.7799072265625,
"kl_loss_9": 269.5110496520996,
"learning_rate": 5.5582275672538315e-05,
"loss": 752.5031,
"step": 8500
},
{
"ce_loss_13": 3.2241469621658325,
"ce_loss_17": 3.1831390619277955,
"ce_loss_2": 4.047346830368042,
"ce_loss_4": 3.6304913997650146,
"ce_loss_9": 3.3220279693603514,
"epoch": 0.851,
"grad_norm": 752.0,
"kl_loss_13": 72.84465465545654,
"kl_loss_2": 1857.0891418457031,
"kl_loss_4": 999.8686706542969,
"kl_loss_9": 281.9967445373535,
"learning_rate": 5.4857463485900484e-05,
"loss": 786.0194,
"step": 8510
},
{
"ce_loss_13": 3.2842156052589417,
"ce_loss_17": 3.2467063069343567,
"ce_loss_2": 4.037990629673004,
"ce_loss_4": 3.6584272980690002,
"ce_loss_9": 3.3805489897727967,
"epoch": 0.852,
"grad_norm": 1232.0,
"kl_loss_13": 69.57874660491943,
"kl_loss_2": 1749.3746459960937,
"kl_loss_4": 943.728482055664,
"kl_loss_9": 272.5829315185547,
"learning_rate": 5.413713387966329e-05,
"loss": 759.3067,
"step": 8520
},
{
"ce_loss_13": 3.2964441180229187,
"ce_loss_17": 3.2589587688446047,
"ce_loss_2": 4.080042326450348,
"ce_loss_4": 3.6818908572196962,
"ce_loss_9": 3.3896549224853514,
"epoch": 0.853,
"grad_norm": 1216.0,
"kl_loss_13": 72.38710823059083,
"kl_loss_2": 1792.3215270996093,
"kl_loss_4": 959.0531524658203,
"kl_loss_9": 274.66367111206057,
"learning_rate": 5.34212941075381e-05,
"loss": 770.6358,
"step": 8530
},
{
"ce_loss_13": 3.307350420951843,
"ce_loss_17": 3.2698061943054197,
"ce_loss_2": 4.054671609401703,
"ce_loss_4": 3.672791314125061,
"ce_loss_9": 3.392826998233795,
"epoch": 0.854,
"grad_norm": 728.0,
"kl_loss_13": 69.6476016998291,
"kl_loss_2": 1730.8017272949219,
"kl_loss_4": 920.2828186035156,
"kl_loss_9": 264.1802864074707,
"learning_rate": 5.270995137802315e-05,
"loss": 754.1865,
"step": 8540
},
{
"ce_loss_13": 3.243489348888397,
"ce_loss_17": 3.2078842401504515,
"ce_loss_2": 4.012664020061493,
"ce_loss_4": 3.621446192264557,
"ce_loss_9": 3.3361738204956053,
"epoch": 0.855,
"grad_norm": 736.0,
"kl_loss_13": 68.58904209136963,
"kl_loss_2": 1769.7939025878907,
"kl_loss_4": 945.7029113769531,
"kl_loss_9": 271.47595977783203,
"learning_rate": 5.2003112854332125e-05,
"loss": 767.745,
"step": 8550
},
{
"ce_loss_13": 3.2445086240768433,
"ce_loss_17": 3.206673777103424,
"ce_loss_2": 4.001877117156982,
"ce_loss_4": 3.617502176761627,
"ce_loss_9": 3.334064221382141,
"epoch": 0.856,
"grad_norm": 752.0,
"kl_loss_13": 69.51929016113282,
"kl_loss_2": 1765.8089904785156,
"kl_loss_4": 951.4055114746094,
"kl_loss_9": 270.62868423461913,
"learning_rate": 5.130078565432089e-05,
"loss": 750.355,
"step": 8560
},
{
"ce_loss_13": 3.312978744506836,
"ce_loss_17": 3.2759904861450195,
"ce_loss_2": 4.05236177444458,
"ce_loss_4": 3.6772449254989623,
"ce_loss_9": 3.400533843040466,
"epoch": 0.857,
"grad_norm": 1184.0,
"kl_loss_13": 69.48748779296875,
"kl_loss_2": 1731.421649169922,
"kl_loss_4": 937.3368255615235,
"kl_loss_9": 268.3721839904785,
"learning_rate": 5.060297685041659e-05,
"loss": 745.1645,
"step": 8570
},
{
"ce_loss_13": 3.2397432565689086,
"ce_loss_17": 3.2003438830375672,
"ce_loss_2": 4.030338478088379,
"ce_loss_4": 3.628654670715332,
"ce_loss_9": 3.3365538835525514,
"epoch": 0.858,
"grad_norm": 812.0,
"kl_loss_13": 72.3453441619873,
"kl_loss_2": 1798.8057678222656,
"kl_loss_4": 965.1929901123046,
"kl_loss_9": 279.8908744812012,
"learning_rate": 4.99096934695461e-05,
"loss": 778.915,
"step": 8580
},
{
"ce_loss_13": 3.3056263446807863,
"ce_loss_17": 3.266205370426178,
"ce_loss_2": 4.073195433616638,
"ce_loss_4": 3.687685859203339,
"ce_loss_9": 3.395856535434723,
"epoch": 0.859,
"grad_norm": 584.0,
"kl_loss_13": 70.5243480682373,
"kl_loss_2": 1767.046942138672,
"kl_loss_4": 954.3834838867188,
"kl_loss_9": 272.76340255737307,
"learning_rate": 4.922094249306558e-05,
"loss": 754.2901,
"step": 8590
},
{
"ce_loss_13": 3.3336752533912657,
"ce_loss_17": 3.294740152359009,
"ce_loss_2": 4.099012005329132,
"ce_loss_4": 3.713496136665344,
"ce_loss_9": 3.427042770385742,
"epoch": 0.86,
"grad_norm": 1000.0,
"kl_loss_13": 72.1313404083252,
"kl_loss_2": 1759.39765625,
"kl_loss_4": 944.4590118408203,
"kl_loss_9": 275.7927375793457,
"learning_rate": 4.853673085668947e-05,
"loss": 750.4285,
"step": 8600
},
{
"ce_loss_13": 3.351103699207306,
"ce_loss_17": 3.312099051475525,
"ce_loss_2": 4.120914161205292,
"ce_loss_4": 3.7291797757148744,
"ce_loss_9": 3.4441885232925413,
"epoch": 0.861,
"grad_norm": 824.0,
"kl_loss_13": 71.3762767791748,
"kl_loss_2": 1763.5246337890626,
"kl_loss_4": 944.3128356933594,
"kl_loss_9": 272.16346435546876,
"learning_rate": 4.78570654504214e-05,
"loss": 764.83,
"step": 8610
},
{
"ce_loss_13": 3.2950334191322326,
"ce_loss_17": 3.2570829153060914,
"ce_loss_2": 4.068877625465393,
"ce_loss_4": 3.6807502269744874,
"ce_loss_9": 3.386111795902252,
"epoch": 0.862,
"grad_norm": 800.0,
"kl_loss_13": 70.43105850219726,
"kl_loss_2": 1788.814599609375,
"kl_loss_4": 968.8083038330078,
"kl_loss_9": 274.6896087646484,
"learning_rate": 4.7181953118484556e-05,
"loss": 771.1818,
"step": 8620
},
{
"ce_loss_13": 3.3184030055999756,
"ce_loss_17": 3.2805970191955565,
"ce_loss_2": 4.080092549324036,
"ce_loss_4": 3.693860375881195,
"ce_loss_9": 3.4120346784591673,
"epoch": 0.863,
"grad_norm": 848.0,
"kl_loss_13": 70.71467151641846,
"kl_loss_2": 1714.6408752441407,
"kl_loss_4": 927.5413970947266,
"kl_loss_9": 269.1174514770508,
"learning_rate": 4.651140065925269e-05,
"loss": 769.4998,
"step": 8630
},
{
"ce_loss_13": 3.2543252825737,
"ce_loss_17": 3.2153276324272158,
"ce_loss_2": 4.0182312488555905,
"ce_loss_4": 3.625260126590729,
"ce_loss_9": 3.346437680721283,
"epoch": 0.864,
"grad_norm": 820.0,
"kl_loss_13": 70.5960153579712,
"kl_loss_2": 1772.5950439453125,
"kl_loss_4": 949.2362548828125,
"kl_loss_9": 272.66626968383787,
"learning_rate": 4.58454148251814e-05,
"loss": 770.8148,
"step": 8640
},
{
"ce_loss_13": 3.2696855068206787,
"ce_loss_17": 3.229102146625519,
"ce_loss_2": 4.069872319698334,
"ce_loss_4": 3.668764066696167,
"ce_loss_9": 3.3631327509880067,
"epoch": 0.865,
"grad_norm": 868.0,
"kl_loss_13": 70.95263595581055,
"kl_loss_2": 1812.3966369628906,
"kl_loss_4": 973.0469512939453,
"kl_loss_9": 273.66270751953124,
"learning_rate": 4.518400232274078e-05,
"loss": 767.7344,
"step": 8650
},
{
"ce_loss_13": 3.2935158729553224,
"ce_loss_17": 3.253829777240753,
"ce_loss_2": 4.04986002445221,
"ce_loss_4": 3.6717586398124693,
"ce_loss_9": 3.38398414850235,
"epoch": 0.866,
"grad_norm": 800.0,
"kl_loss_13": 71.67686386108399,
"kl_loss_2": 1741.0731506347656,
"kl_loss_4": 944.9275756835938,
"kl_loss_9": 272.5065689086914,
"learning_rate": 4.452716981234745e-05,
"loss": 742.4328,
"step": 8660
},
{
"ce_loss_13": 3.267419683933258,
"ce_loss_17": 3.2313178896903993,
"ce_loss_2": 4.023157560825348,
"ce_loss_4": 3.637619066238403,
"ce_loss_9": 3.358725380897522,
"epoch": 0.867,
"grad_norm": 860.0,
"kl_loss_13": 68.30587711334229,
"kl_loss_2": 1740.3493225097657,
"kl_loss_4": 934.4726257324219,
"kl_loss_9": 267.2579002380371,
"learning_rate": 4.3874923908297335e-05,
"loss": 746.8162,
"step": 8670
},
{
"ce_loss_13": 3.31860488653183,
"ce_loss_17": 3.2785238146781923,
"ce_loss_2": 4.093624210357666,
"ce_loss_4": 3.705119812488556,
"ce_loss_9": 3.413517880439758,
"epoch": 0.868,
"grad_norm": 1016.0,
"kl_loss_13": 72.45567092895507,
"kl_loss_2": 1782.8880004882812,
"kl_loss_4": 965.7743316650391,
"kl_loss_9": 274.9748229980469,
"learning_rate": 4.322727117869951e-05,
"loss": 764.1808,
"step": 8680
},
{
"ce_loss_13": 3.3246540427207947,
"ce_loss_17": 3.285372281074524,
"ce_loss_2": 4.089873361587524,
"ce_loss_4": 3.703606963157654,
"ce_loss_9": 3.4189262986183167,
"epoch": 0.869,
"grad_norm": 892.0,
"kl_loss_13": 70.91092338562012,
"kl_loss_2": 1777.360284423828,
"kl_loss_4": 954.4211364746094,
"kl_loss_9": 273.534854888916,
"learning_rate": 4.2584218145409916e-05,
"loss": 760.0275,
"step": 8690
},
{
"ce_loss_13": 3.3691744565963746,
"ce_loss_17": 3.3325124382972717,
"ce_loss_2": 4.093606245517731,
"ce_loss_4": 3.727621281147003,
"ce_loss_9": 3.458368957042694,
"epoch": 0.87,
"grad_norm": 840.0,
"kl_loss_13": 70.12667446136474,
"kl_loss_2": 1697.2558349609376,
"kl_loss_4": 926.7784637451172,
"kl_loss_9": 266.85135955810546,
"learning_rate": 4.194577128396521e-05,
"loss": 742.6759,
"step": 8700
},
{
"ce_loss_13": 3.251688504219055,
"ce_loss_17": 3.213710355758667,
"ce_loss_2": 4.014662778377533,
"ce_loss_4": 3.626564681529999,
"ce_loss_9": 3.3434009909629823,
"epoch": 0.871,
"grad_norm": 768.0,
"kl_loss_13": 69.05314979553222,
"kl_loss_2": 1765.7520751953125,
"kl_loss_4": 944.2459381103515,
"kl_loss_9": 267.29578399658203,
"learning_rate": 4.1311937023518264e-05,
"loss": 768.9531,
"step": 8710
},
{
"ce_loss_13": 3.27214492559433,
"ce_loss_17": 3.2352864265441896,
"ce_loss_2": 4.068167233467102,
"ce_loss_4": 3.640877389907837,
"ce_loss_9": 3.360073173046112,
"epoch": 0.872,
"grad_norm": 796.0,
"kl_loss_13": 68.70249729156494,
"kl_loss_2": 1826.7659301757812,
"kl_loss_4": 930.3711578369141,
"kl_loss_9": 261.41344299316404,
"learning_rate": 4.0682721746773344e-05,
"loss": 761.2753,
"step": 8720
},
{
"ce_loss_13": 3.1379726767539977,
"ce_loss_17": 3.099522340297699,
"ce_loss_2": 3.9501651883125306,
"ce_loss_4": 3.5429770708084107,
"ce_loss_9": 3.2354901313781737,
"epoch": 0.873,
"grad_norm": 1208.0,
"kl_loss_13": 68.82833328247071,
"kl_loss_2": 1811.7914794921876,
"kl_loss_4": 972.6370147705078,
"kl_loss_9": 273.3907341003418,
"learning_rate": 4.0058131789920904e-05,
"loss": 757.3973,
"step": 8730
},
{
"ce_loss_13": 3.289105761051178,
"ce_loss_17": 3.2499382734298705,
"ce_loss_2": 4.047884678840637,
"ce_loss_4": 3.6669172763824465,
"ce_loss_9": 3.3774807810783387,
"epoch": 0.874,
"grad_norm": 832.0,
"kl_loss_13": 69.40928993225097,
"kl_loss_2": 1764.611505126953,
"kl_loss_4": 951.75166015625,
"kl_loss_9": 268.1239875793457,
"learning_rate": 3.9438173442575e-05,
"loss": 781.4809,
"step": 8740
},
{
"ce_loss_13": 3.320170760154724,
"ce_loss_17": 3.2803778886795043,
"ce_loss_2": 4.067899703979492,
"ce_loss_4": 3.691530239582062,
"ce_loss_9": 3.4129509568214416,
"epoch": 0.875,
"grad_norm": 872.0,
"kl_loss_13": 69.87012176513672,
"kl_loss_2": 1723.4821533203126,
"kl_loss_4": 929.8530395507812,
"kl_loss_9": 268.4810333251953,
"learning_rate": 3.882285294770937e-05,
"loss": 754.1662,
"step": 8750
},
{
"ce_loss_13": 3.283842885494232,
"ce_loss_17": 3.245735538005829,
"ce_loss_2": 4.028535318374634,
"ce_loss_4": 3.652010107040405,
"ce_loss_9": 3.373887574672699,
"epoch": 0.876,
"grad_norm": 740.0,
"kl_loss_13": 70.28794174194336,
"kl_loss_2": 1731.2416015625,
"kl_loss_4": 935.3825469970703,
"kl_loss_9": 269.52797775268556,
"learning_rate": 3.821217650159453e-05,
"loss": 763.3816,
"step": 8760
},
{
"ce_loss_13": 3.158191645145416,
"ce_loss_17": 3.1189328789711,
"ce_loss_2": 3.9809357166290282,
"ce_loss_4": 3.5679367542266847,
"ce_loss_9": 3.2566331028938293,
"epoch": 0.877,
"grad_norm": 1304.0,
"kl_loss_13": 69.88195037841797,
"kl_loss_2": 1841.284228515625,
"kl_loss_4": 990.7837341308593,
"kl_loss_9": 277.99508438110354,
"learning_rate": 3.760615025373543e-05,
"loss": 778.5645,
"step": 8770
},
{
"ce_loss_13": 3.330407190322876,
"ce_loss_17": 3.2909754276275636,
"ce_loss_2": 4.113665688037872,
"ce_loss_4": 3.7197707295417786,
"ce_loss_9": 3.427734637260437,
"epoch": 0.878,
"grad_norm": 980.0,
"kl_loss_13": 73.42694778442383,
"kl_loss_2": 1789.1897705078125,
"kl_loss_4": 958.443881225586,
"kl_loss_9": 277.80702514648436,
"learning_rate": 3.700478030680987e-05,
"loss": 777.7257,
"step": 8780
},
{
"ce_loss_13": 3.3253095865249636,
"ce_loss_17": 3.287730133533478,
"ce_loss_2": 4.088598692417145,
"ce_loss_4": 3.6988863945007324,
"ce_loss_9": 3.4193095088005068,
"epoch": 0.879,
"grad_norm": 772.0,
"kl_loss_13": 70.39273853302002,
"kl_loss_2": 1752.1946899414063,
"kl_loss_4": 939.2807342529297,
"kl_loss_9": 270.5202178955078,
"learning_rate": 3.6408072716606344e-05,
"loss": 755.5979,
"step": 8790
},
{
"ce_loss_13": 3.2510732650756835,
"ce_loss_17": 3.2137936115264893,
"ce_loss_2": 4.049818813800812,
"ce_loss_4": 3.6421456575393676,
"ce_loss_9": 3.3426763296127318,
"epoch": 0.88,
"grad_norm": 720.0,
"kl_loss_13": 70.29140071868896,
"kl_loss_2": 1819.5076477050782,
"kl_loss_4": 972.0310791015625,
"kl_loss_9": 276.21380920410155,
"learning_rate": 3.5816033491963716e-05,
"loss": 787.8986,
"step": 8800
},
{
"ce_loss_13": 3.1142725348472595,
"ce_loss_17": 3.075247824192047,
"ce_loss_2": 3.9192667841911315,
"ce_loss_4": 3.5038288831710815,
"ce_loss_9": 3.2087692499160765,
"epoch": 0.881,
"grad_norm": 636.0,
"kl_loss_13": 69.20652503967285,
"kl_loss_2": 1830.6102783203125,
"kl_loss_4": 967.6713928222656,
"kl_loss_9": 269.281755065918,
"learning_rate": 3.522866859471047e-05,
"loss": 767.3122,
"step": 8810
},
{
"ce_loss_13": 3.34773451089859,
"ce_loss_17": 3.3099034905433653,
"ce_loss_2": 4.070950448513031,
"ce_loss_4": 3.7069722771644593,
"ce_loss_9": 3.4367212653160095,
"epoch": 0.882,
"grad_norm": 968.0,
"kl_loss_13": 68.84079093933106,
"kl_loss_2": 1672.555108642578,
"kl_loss_4": 900.7840759277344,
"kl_loss_9": 262.96395416259764,
"learning_rate": 3.46459839396045e-05,
"loss": 743.6263,
"step": 8820
},
{
"ce_loss_13": 3.26260107755661,
"ce_loss_17": 3.223562455177307,
"ce_loss_2": 4.047707068920135,
"ce_loss_4": 3.6520734786987306,
"ce_loss_9": 3.356485903263092,
"epoch": 0.883,
"grad_norm": 808.0,
"kl_loss_13": 70.78400783538818,
"kl_loss_2": 1764.89443359375,
"kl_loss_4": 950.646841430664,
"kl_loss_9": 271.79892044067384,
"learning_rate": 3.406798539427386e-05,
"loss": 779.9631,
"step": 8830
},
{
"ce_loss_13": 3.3257412910461426,
"ce_loss_17": 3.287832188606262,
"ce_loss_2": 4.086572694778442,
"ce_loss_4": 3.70034202337265,
"ce_loss_9": 3.4149993896484374,
"epoch": 0.884,
"grad_norm": 1040.0,
"kl_loss_13": 70.3533836364746,
"kl_loss_2": 1780.2422241210938,
"kl_loss_4": 953.5696105957031,
"kl_loss_9": 272.3615303039551,
"learning_rate": 3.349467877915746e-05,
"loss": 766.4402,
"step": 8840
},
{
"ce_loss_13": 3.285241413116455,
"ce_loss_17": 3.2471628308296205,
"ce_loss_2": 4.070745611190796,
"ce_loss_4": 3.6734976291656496,
"ce_loss_9": 3.3793052554130556,
"epoch": 0.885,
"grad_norm": 1120.0,
"kl_loss_13": 70.29715118408203,
"kl_loss_2": 1816.7627563476562,
"kl_loss_4": 976.2758880615235,
"kl_loss_9": 275.21948318481446,
"learning_rate": 3.292606986744667e-05,
"loss": 792.1923,
"step": 8850
},
{
"ce_loss_13": 3.2434609055519106,
"ce_loss_17": 3.206607627868652,
"ce_loss_2": 4.017269504070282,
"ce_loss_4": 3.620222342014313,
"ce_loss_9": 3.331614947319031,
"epoch": 0.886,
"grad_norm": 760.0,
"kl_loss_13": 69.23771839141845,
"kl_loss_2": 1774.5667114257812,
"kl_loss_4": 956.0857360839843,
"kl_loss_9": 266.13800888061525,
"learning_rate": 3.23621643850267e-05,
"loss": 764.0102,
"step": 8860
},
{
"ce_loss_13": 3.3150471806526185,
"ce_loss_17": 3.2769230365753175,
"ce_loss_2": 4.073702692985535,
"ce_loss_4": 3.692244303226471,
"ce_loss_9": 3.410197043418884,
"epoch": 0.887,
"grad_norm": 976.0,
"kl_loss_13": 71.57453002929688,
"kl_loss_2": 1763.2067504882812,
"kl_loss_4": 960.6644958496094,
"kl_loss_9": 275.66893005371094,
"learning_rate": 3.180296801041971e-05,
"loss": 753.6317,
"step": 8870
},
{
"ce_loss_13": 3.340101194381714,
"ce_loss_17": 3.302070701122284,
"ce_loss_2": 4.106904423236847,
"ce_loss_4": 3.7093899130821226,
"ce_loss_9": 3.431441366672516,
"epoch": 0.888,
"grad_norm": 628.0,
"kl_loss_13": 70.40896244049073,
"kl_loss_2": 1772.5249816894532,
"kl_loss_4": 935.5421752929688,
"kl_loss_9": 267.8436454772949,
"learning_rate": 3.124848637472688e-05,
"loss": 746.5057,
"step": 8880
},
{
"ce_loss_13": 3.160101127624512,
"ce_loss_17": 3.1225696921348574,
"ce_loss_2": 3.940399968624115,
"ce_loss_4": 3.5493147373199463,
"ce_loss_9": 3.2547220468521116,
"epoch": 0.889,
"grad_norm": 1008.0,
"kl_loss_13": 67.81524658203125,
"kl_loss_2": 1787.1769104003906,
"kl_loss_4": 962.53515625,
"kl_loss_9": 267.5764961242676,
"learning_rate": 3.069872506157212e-05,
"loss": 762.1292,
"step": 8890
},
{
"ce_loss_13": 3.263330328464508,
"ce_loss_17": 3.2273592352867126,
"ce_loss_2": 4.024621450901032,
"ce_loss_4": 3.642102038860321,
"ce_loss_9": 3.3568263411521913,
"epoch": 0.89,
"grad_norm": 804.0,
"kl_loss_13": 69.50649681091309,
"kl_loss_2": 1758.631689453125,
"kl_loss_4": 941.1656555175781,
"kl_loss_9": 271.157022857666,
"learning_rate": 3.0153689607045842e-05,
"loss": 756.4652,
"step": 8900
},
{
"ce_loss_13": 3.166585099697113,
"ce_loss_17": 3.127001166343689,
"ce_loss_2": 3.9992379188537597,
"ce_loss_4": 3.573567008972168,
"ce_loss_9": 3.266878294944763,
"epoch": 0.891,
"grad_norm": 996.0,
"kl_loss_13": 71.30703067779541,
"kl_loss_2": 1902.9365478515624,
"kl_loss_4": 1013.771694946289,
"kl_loss_9": 282.7529136657715,
"learning_rate": 2.9613385499648926e-05,
"loss": 777.2954,
"step": 8910
},
{
"ce_loss_13": 3.21743483543396,
"ce_loss_17": 3.1798877000808714,
"ce_loss_2": 3.980141830444336,
"ce_loss_4": 3.600106048583984,
"ce_loss_9": 3.3116451501846313,
"epoch": 0.892,
"grad_norm": 864.0,
"kl_loss_13": 68.77742424011231,
"kl_loss_2": 1738.6429748535156,
"kl_loss_4": 944.1782318115235,
"kl_loss_9": 268.3657096862793,
"learning_rate": 2.9077818180237692e-05,
"loss": 761.2621,
"step": 8920
},
{
"ce_loss_13": 3.260299324989319,
"ce_loss_17": 3.2211027264595034,
"ce_loss_2": 4.056830155849457,
"ce_loss_4": 3.656483030319214,
"ce_loss_9": 3.357074999809265,
"epoch": 0.893,
"grad_norm": 1760.0,
"kl_loss_13": 69.7451015472412,
"kl_loss_2": 1785.2978942871093,
"kl_loss_4": 953.3751007080078,
"kl_loss_9": 269.3721778869629,
"learning_rate": 2.8546993041969172e-05,
"loss": 762.2178,
"step": 8930
},
{
"ce_loss_13": 3.2998342752456664,
"ce_loss_17": 3.2634523510932922,
"ce_loss_2": 4.0385368943214415,
"ce_loss_4": 3.671085524559021,
"ce_loss_9": 3.3905824542045595,
"epoch": 0.894,
"grad_norm": 796.0,
"kl_loss_13": 68.27731208801269,
"kl_loss_2": 1728.0355590820313,
"kl_loss_4": 936.2373199462891,
"kl_loss_9": 267.1129722595215,
"learning_rate": 2.802091543024671e-05,
"loss": 759.3199,
"step": 8940
},
{
"ce_loss_13": 3.2947364926338194,
"ce_loss_17": 3.2571501612663267,
"ce_loss_2": 4.081899428367615,
"ce_loss_4": 3.679962158203125,
"ce_loss_9": 3.3866132140159606,
"epoch": 0.895,
"grad_norm": 840.0,
"kl_loss_13": 70.01423645019531,
"kl_loss_2": 1806.8572692871094,
"kl_loss_4": 965.6742645263672,
"kl_loss_9": 273.54743881225585,
"learning_rate": 2.7499590642665774e-05,
"loss": 785.5458,
"step": 8950
},
{
"ce_loss_13": 3.3065556645393372,
"ce_loss_17": 3.2688744187355043,
"ce_loss_2": 4.072537469863891,
"ce_loss_4": 3.6804751992225646,
"ce_loss_9": 3.406609535217285,
"epoch": 0.896,
"grad_norm": 740.0,
"kl_loss_13": 74.98095989227295,
"kl_loss_2": 1751.6958984375,
"kl_loss_4": 938.6008728027343,
"kl_loss_9": 289.8558975219727,
"learning_rate": 2.6983023928961405e-05,
"loss": 755.7607,
"step": 8960
},
{
"ce_loss_13": 3.2771018624305723,
"ce_loss_17": 3.2386683106422423,
"ce_loss_2": 4.048685503005982,
"ce_loss_4": 3.660785710811615,
"ce_loss_9": 3.3700268983840944,
"epoch": 0.897,
"grad_norm": 780.0,
"kl_loss_13": 70.14976196289062,
"kl_loss_2": 1756.1913452148438,
"kl_loss_4": 947.1248443603515,
"kl_loss_9": 269.67872009277346,
"learning_rate": 2.6471220490954628e-05,
"loss": 770.3491,
"step": 8970
},
{
"ce_loss_13": 3.2675058484077453,
"ce_loss_17": 3.2324760437011717,
"ce_loss_2": 4.026072096824646,
"ce_loss_4": 3.62881281375885,
"ce_loss_9": 3.354077172279358,
"epoch": 0.898,
"grad_norm": 840.0,
"kl_loss_13": 69.91112670898437,
"kl_loss_2": 1751.6456481933594,
"kl_loss_4": 933.82841796875,
"kl_loss_9": 269.164444732666,
"learning_rate": 2.596418548250029e-05,
"loss": 763.478,
"step": 8980
},
{
"ce_loss_13": 3.3039443254470826,
"ce_loss_17": 3.265969121456146,
"ce_loss_2": 4.057813549041748,
"ce_loss_4": 3.682532238960266,
"ce_loss_9": 3.398101258277893,
"epoch": 0.899,
"grad_norm": 828.0,
"kl_loss_13": 71.41869773864747,
"kl_loss_2": 1756.7818115234375,
"kl_loss_4": 951.8295043945312,
"kl_loss_9": 274.06252059936526,
"learning_rate": 2.5461924009435368e-05,
"loss": 753.8367,
"step": 8990
},
{
"ce_loss_13": 3.298236906528473,
"ce_loss_17": 3.2587198138237,
"ce_loss_2": 4.062357556819916,
"ce_loss_4": 3.6736649870872498,
"ce_loss_9": 3.390058147907257,
"epoch": 0.9,
"grad_norm": 880.0,
"kl_loss_13": 71.20307960510254,
"kl_loss_2": 1756.3238525390625,
"kl_loss_4": 942.694546508789,
"kl_loss_9": 271.4934455871582,
"learning_rate": 2.4964441129527336e-05,
"loss": 775.5805,
"step": 9000
},
{
"ce_loss_13": 3.3012996196746824,
"ce_loss_17": 3.2620099425315856,
"ce_loss_2": 4.042165410518646,
"ce_loss_4": 3.6681466937065124,
"ce_loss_9": 3.3898880243301392,
"epoch": 0.901,
"grad_norm": 1160.0,
"kl_loss_13": 69.23692741394044,
"kl_loss_2": 1715.5816650390625,
"kl_loss_4": 922.6554473876953,
"kl_loss_9": 265.27770233154297,
"learning_rate": 2.4471741852423235e-05,
"loss": 747.8052,
"step": 9010
},
{
"ce_loss_13": 3.346455466747284,
"ce_loss_17": 3.3067072510719298,
"ce_loss_2": 4.1050170183181764,
"ce_loss_4": 3.7224635004997255,
"ce_loss_9": 3.4403756499290465,
"epoch": 0.902,
"grad_norm": 736.0,
"kl_loss_13": 70.2386646270752,
"kl_loss_2": 1725.9721374511719,
"kl_loss_4": 930.9395690917969,
"kl_loss_9": 270.36619567871094,
"learning_rate": 2.3983831139599287e-05,
"loss": 754.2608,
"step": 9020
},
{
"ce_loss_13": 3.2672984123229982,
"ce_loss_17": 3.229282486438751,
"ce_loss_2": 4.023867189884186,
"ce_loss_4": 3.6345030784606935,
"ce_loss_9": 3.3552754759788512,
"epoch": 0.903,
"grad_norm": 648.0,
"kl_loss_13": 68.81279144287109,
"kl_loss_2": 1725.340350341797,
"kl_loss_4": 918.9158569335938,
"kl_loss_9": 264.1955795288086,
"learning_rate": 2.3500713904311022e-05,
"loss": 737.7361,
"step": 9030
},
{
"ce_loss_13": 3.3072561860084533,
"ce_loss_17": 3.269752490520477,
"ce_loss_2": 4.04286288022995,
"ce_loss_4": 3.6655190706253054,
"ce_loss_9": 3.3957794904708862,
"epoch": 0.904,
"grad_norm": 984.0,
"kl_loss_13": 69.29099216461182,
"kl_loss_2": 1694.4139587402344,
"kl_loss_4": 911.9616882324219,
"kl_loss_9": 261.82068862915037,
"learning_rate": 2.3022395011543685e-05,
"loss": 739.5109,
"step": 9040
},
{
"ce_loss_13": 3.3339088559150696,
"ce_loss_17": 3.2950241684913637,
"ce_loss_2": 4.0952001214027405,
"ce_loss_4": 3.714173400402069,
"ce_loss_9": 3.4319105625152586,
"epoch": 0.905,
"grad_norm": 912.0,
"kl_loss_13": 71.4954969406128,
"kl_loss_2": 1753.8956970214845,
"kl_loss_4": 951.8698364257813,
"kl_loss_9": 276.7628715515137,
"learning_rate": 2.2548879277963063e-05,
"loss": 773.6446,
"step": 9050
},
{
"ce_loss_13": 3.253114938735962,
"ce_loss_17": 3.215239441394806,
"ce_loss_2": 4.0072418570518495,
"ce_loss_4": 3.6237417459487915,
"ce_loss_9": 3.342426073551178,
"epoch": 0.906,
"grad_norm": 828.0,
"kl_loss_13": 69.18032398223878,
"kl_loss_2": 1732.9671569824218,
"kl_loss_4": 933.28525390625,
"kl_loss_9": 266.85338287353517,
"learning_rate": 2.208017147186736e-05,
"loss": 737.7998,
"step": 9060
},
{
"ce_loss_13": 3.2505879163742066,
"ce_loss_17": 3.211529290676117,
"ce_loss_2": 4.008640742301941,
"ce_loss_4": 3.6223735451698302,
"ce_loss_9": 3.342756152153015,
"epoch": 0.907,
"grad_norm": 936.0,
"kl_loss_13": 69.45951023101807,
"kl_loss_2": 1758.497900390625,
"kl_loss_4": 942.9026062011719,
"kl_loss_9": 268.5462837219238,
"learning_rate": 2.1616276313139227e-05,
"loss": 751.1222,
"step": 9070
},
{
"ce_loss_13": 3.28677442073822,
"ce_loss_17": 3.248307991027832,
"ce_loss_2": 4.057864952087402,
"ce_loss_4": 3.6663498282432556,
"ce_loss_9": 3.3798938751220704,
"epoch": 0.908,
"grad_norm": 844.0,
"kl_loss_13": 69.54161643981934,
"kl_loss_2": 1760.1901550292969,
"kl_loss_4": 946.1911895751953,
"kl_loss_9": 270.7872001647949,
"learning_rate": 2.1157198473197415e-05,
"loss": 767.2781,
"step": 9080
},
{
"ce_loss_13": 3.347646725177765,
"ce_loss_17": 3.3092511177062987,
"ce_loss_2": 4.1128313660621645,
"ce_loss_4": 3.7317168712615967,
"ce_loss_9": 3.4412038564682006,
"epoch": 0.909,
"grad_norm": 1024.0,
"kl_loss_13": 71.05716209411621,
"kl_loss_2": 1744.008056640625,
"kl_loss_4": 947.9106719970703,
"kl_loss_9": 274.9560401916504,
"learning_rate": 2.0702942574950812e-05,
"loss": 759.0129,
"step": 9090
},
{
"ce_loss_13": 3.2757676362991335,
"ce_loss_17": 3.2370603919029235,
"ce_loss_2": 4.05085917711258,
"ce_loss_4": 3.658015692234039,
"ce_loss_9": 3.3714335680007936,
"epoch": 0.91,
"grad_norm": 828.0,
"kl_loss_13": 70.6139980316162,
"kl_loss_2": 1773.1221130371093,
"kl_loss_4": 956.3177947998047,
"kl_loss_9": 274.6713508605957,
"learning_rate": 2.025351319275137e-05,
"loss": 763.2487,
"step": 9100
},
{
"ce_loss_13": 3.3982797265052795,
"ce_loss_17": 3.359361159801483,
"ce_loss_2": 4.155125546455383,
"ce_loss_4": 3.780303680896759,
"ce_loss_9": 3.4923812985420226,
"epoch": 0.911,
"grad_norm": 772.0,
"kl_loss_13": 73.26307582855225,
"kl_loss_2": 1774.17333984375,
"kl_loss_4": 974.4539184570312,
"kl_loss_9": 282.6258262634277,
"learning_rate": 1.9808914852347816e-05,
"loss": 784.291,
"step": 9110
},
{
"ce_loss_13": 3.2470030188560486,
"ce_loss_17": 3.208009135723114,
"ce_loss_2": 4.013169312477112,
"ce_loss_4": 3.6329050064086914,
"ce_loss_9": 3.342044270038605,
"epoch": 0.912,
"grad_norm": 940.0,
"kl_loss_13": 69.41524143218994,
"kl_loss_2": 1744.8079162597655,
"kl_loss_4": 952.8638854980469,
"kl_loss_9": 271.1719367980957,
"learning_rate": 1.9369152030840554e-05,
"loss": 755.5211,
"step": 9120
},
{
"ce_loss_13": 3.328555727005005,
"ce_loss_17": 3.291154706478119,
"ce_loss_2": 4.090105664730072,
"ce_loss_4": 3.709492301940918,
"ce_loss_9": 3.4200507760047913,
"epoch": 0.913,
"grad_norm": 792.0,
"kl_loss_13": 71.5401647567749,
"kl_loss_2": 1769.6208740234374,
"kl_loss_4": 956.116665649414,
"kl_loss_9": 270.7011413574219,
"learning_rate": 1.893422915663645e-05,
"loss": 761.0888,
"step": 9130
},
{
"ce_loss_13": 3.197735035419464,
"ce_loss_17": 3.158833086490631,
"ce_loss_2": 4.001493084430694,
"ce_loss_4": 3.5969720602035524,
"ce_loss_9": 3.2945626020431518,
"epoch": 0.914,
"grad_norm": 880.0,
"kl_loss_13": 69.98915824890136,
"kl_loss_2": 1819.2923645019532,
"kl_loss_4": 977.3836547851563,
"kl_loss_9": 276.6566581726074,
"learning_rate": 1.850415060940386e-05,
"loss": 776.1738,
"step": 9140
},
{
"ce_loss_13": 3.3226425528526304,
"ce_loss_17": 3.2849931478500367,
"ce_loss_2": 4.062166357040406,
"ce_loss_4": 3.6929720759391786,
"ce_loss_9": 3.4111554622650146,
"epoch": 0.915,
"grad_norm": 928.0,
"kl_loss_13": 70.22557373046875,
"kl_loss_2": 1711.80859375,
"kl_loss_4": 938.1763580322265,
"kl_loss_9": 269.89337692260744,
"learning_rate": 1.8078920720028978e-05,
"loss": 756.1047,
"step": 9150
},
{
"ce_loss_13": 3.2460474491119387,
"ce_loss_17": 3.210865044593811,
"ce_loss_2": 3.9874136805534364,
"ce_loss_4": 3.6203092336654663,
"ce_loss_9": 3.337096083164215,
"epoch": 0.916,
"grad_norm": 944.0,
"kl_loss_13": 68.0163013458252,
"kl_loss_2": 1707.0913818359375,
"kl_loss_4": 931.6345520019531,
"kl_loss_9": 263.9716209411621,
"learning_rate": 1.765854377057219e-05,
"loss": 761.1198,
"step": 9160
},
{
"ce_loss_13": 3.2273457169532778,
"ce_loss_17": 3.1899821162223816,
"ce_loss_2": 3.9820481896400453,
"ce_loss_4": 3.595924234390259,
"ce_loss_9": 3.3174880385398864,
"epoch": 0.917,
"grad_norm": 764.0,
"kl_loss_13": 68.14966316223145,
"kl_loss_2": 1748.0338256835937,
"kl_loss_4": 931.5319671630859,
"kl_loss_9": 263.98573303222656,
"learning_rate": 1.724302399422456e-05,
"loss": 756.9239,
"step": 9170
},
{
"ce_loss_13": 3.188059365749359,
"ce_loss_17": 3.150203537940979,
"ce_loss_2": 3.963750922679901,
"ce_loss_4": 3.5690385460853578,
"ce_loss_9": 3.2794461131095884,
"epoch": 0.918,
"grad_norm": 944.0,
"kl_loss_13": 70.40487995147706,
"kl_loss_2": 1777.5741821289062,
"kl_loss_4": 961.4688934326172,
"kl_loss_9": 273.7363990783691,
"learning_rate": 1.683236557526574e-05,
"loss": 767.8744,
"step": 9180
},
{
"ce_loss_13": 3.3032285809516906,
"ce_loss_17": 3.2665912747383117,
"ce_loss_2": 4.032867658138275,
"ce_loss_4": 3.657480251789093,
"ce_loss_9": 3.390727710723877,
"epoch": 0.919,
"grad_norm": 740.0,
"kl_loss_13": 68.06188373565674,
"kl_loss_2": 1685.8097534179688,
"kl_loss_4": 901.4606018066406,
"kl_loss_9": 260.56498565673826,
"learning_rate": 1.6426572649021475e-05,
"loss": 749.3797,
"step": 9190
},
{
"ce_loss_13": 3.3341614603996277,
"ce_loss_17": 3.296714127063751,
"ce_loss_2": 4.0532737374305725,
"ce_loss_4": 3.684324491024017,
"ce_loss_9": 3.423136281967163,
"epoch": 0.92,
"grad_norm": 1004.0,
"kl_loss_13": 71.2539436340332,
"kl_loss_2": 1687.29228515625,
"kl_loss_4": 912.562939453125,
"kl_loss_9": 266.74498748779297,
"learning_rate": 1.6025649301821876e-05,
"loss": 744.0132,
"step": 9200
},
{
"ce_loss_13": 3.322655665874481,
"ce_loss_17": 3.2867024898529054,
"ce_loss_2": 4.051113891601562,
"ce_loss_4": 3.688315272331238,
"ce_loss_9": 3.4139885306358337,
"epoch": 0.921,
"grad_norm": 1072.0,
"kl_loss_13": 69.86746578216552,
"kl_loss_2": 1715.9775390625,
"kl_loss_4": 933.6245697021484,
"kl_loss_9": 271.504940032959,
"learning_rate": 1.5629599570960716e-05,
"loss": 748.0333,
"step": 9210
},
{
"ce_loss_13": 3.2324368953704834,
"ce_loss_17": 3.1948946714401245,
"ce_loss_2": 3.9962414503097534,
"ce_loss_4": 3.605444300174713,
"ce_loss_9": 3.3203277349472047,
"epoch": 0.922,
"grad_norm": 888.0,
"kl_loss_13": 68.84123592376709,
"kl_loss_2": 1762.1533569335938,
"kl_loss_4": 940.2346588134766,
"kl_loss_9": 268.61144332885743,
"learning_rate": 1.5238427444654367e-05,
"loss": 755.0762,
"step": 9220
},
{
"ce_loss_13": 3.2893491268157957,
"ce_loss_17": 3.2509705901145933,
"ce_loss_2": 4.038589036464691,
"ce_loss_4": 3.6586158990859987,
"ce_loss_9": 3.3821335554122927,
"epoch": 0.923,
"grad_norm": 1056.0,
"kl_loss_13": 69.26623306274413,
"kl_loss_2": 1720.7350646972657,
"kl_loss_4": 922.943701171875,
"kl_loss_9": 264.8475883483887,
"learning_rate": 1.4852136862001764e-05,
"loss": 747.6942,
"step": 9230
},
{
"ce_loss_13": 3.252811241149902,
"ce_loss_17": 3.216975140571594,
"ce_loss_2": 3.999893867969513,
"ce_loss_4": 3.624435234069824,
"ce_loss_9": 3.346466529369354,
"epoch": 0.924,
"grad_norm": 724.0,
"kl_loss_13": 67.6477201461792,
"kl_loss_2": 1716.8084045410155,
"kl_loss_4": 932.0351043701172,
"kl_loss_9": 264.6703620910645,
"learning_rate": 1.4470731712944884e-05,
"loss": 758.1266,
"step": 9240
},
{
"ce_loss_13": 3.2765080451965334,
"ce_loss_17": 3.239612317085266,
"ce_loss_2": 4.037005198001862,
"ce_loss_4": 3.6546990990638735,
"ce_loss_9": 3.3715944051742555,
"epoch": 0.925,
"grad_norm": 772.0,
"kl_loss_13": 69.63426055908204,
"kl_loss_2": 1746.720361328125,
"kl_loss_4": 935.8729522705078,
"kl_loss_9": 272.0946846008301,
"learning_rate": 1.4094215838229174e-05,
"loss": 772.5833,
"step": 9250
},
{
"ce_loss_13": 3.247589886188507,
"ce_loss_17": 3.2106730580329894,
"ce_loss_2": 4.026830673217773,
"ce_loss_4": 3.631477081775665,
"ce_loss_9": 3.3435757398605346,
"epoch": 0.926,
"grad_norm": 1000.0,
"kl_loss_13": 69.61430225372314,
"kl_loss_2": 1786.5716674804687,
"kl_loss_4": 955.6620147705078,
"kl_loss_9": 273.12003631591796,
"learning_rate": 1.372259302936546e-05,
"loss": 789.6091,
"step": 9260
},
{
"ce_loss_13": 3.354861545562744,
"ce_loss_17": 3.3138466238975526,
"ce_loss_2": 4.110436654090881,
"ce_loss_4": 3.7305679082870484,
"ce_loss_9": 3.4497798085212708,
"epoch": 0.927,
"grad_norm": 776.0,
"kl_loss_13": 72.69700546264649,
"kl_loss_2": 1737.003369140625,
"kl_loss_4": 939.8022277832031,
"kl_loss_9": 276.0119041442871,
"learning_rate": 1.3355867028591206e-05,
"loss": 750.552,
"step": 9270
},
{
"ce_loss_13": 3.26385555267334,
"ce_loss_17": 3.2263644456863405,
"ce_loss_2": 3.9958433508872986,
"ce_loss_4": 3.6246315956115724,
"ce_loss_9": 3.3551544904708863,
"epoch": 0.928,
"grad_norm": 776.0,
"kl_loss_13": 68.76478576660156,
"kl_loss_2": 1707.2232849121094,
"kl_loss_4": 926.3538391113282,
"kl_loss_9": 266.91420669555663,
"learning_rate": 1.2994041528833267e-05,
"loss": 746.6288,
"step": 9280
},
{
"ce_loss_13": 3.2609859108924866,
"ce_loss_17": 3.222886061668396,
"ce_loss_2": 4.019745421409607,
"ce_loss_4": 3.6354535818099976,
"ce_loss_9": 3.3499014139175416,
"epoch": 0.929,
"grad_norm": 780.0,
"kl_loss_13": 68.63880157470703,
"kl_loss_2": 1757.2555541992188,
"kl_loss_4": 940.3133087158203,
"kl_loss_9": 266.6288612365723,
"learning_rate": 1.2637120173670358e-05,
"loss": 751.512,
"step": 9290
},
{
"ce_loss_13": 3.2835487365722655,
"ce_loss_17": 3.244655930995941,
"ce_loss_2": 4.051596641540527,
"ce_loss_4": 3.6634366631507875,
"ce_loss_9": 3.378000283241272,
"epoch": 0.93,
"grad_norm": 1296.0,
"kl_loss_13": 70.42365875244141,
"kl_loss_2": 1761.8828002929688,
"kl_loss_4": 952.9559265136719,
"kl_loss_9": 273.18845825195314,
"learning_rate": 1.2285106557296478e-05,
"loss": 757.7055,
"step": 9300
},
{
"ce_loss_13": 3.1630964636802674,
"ce_loss_17": 3.1254794001579285,
"ce_loss_2": 3.98508734703064,
"ce_loss_4": 3.562894332408905,
"ce_loss_9": 3.257805061340332,
"epoch": 0.931,
"grad_norm": 780.0,
"kl_loss_13": 69.35065059661865,
"kl_loss_2": 1842.6054992675781,
"kl_loss_4": 977.738735961914,
"kl_loss_9": 272.13949966430664,
"learning_rate": 1.1938004224484989e-05,
"loss": 771.8602,
"step": 9310
},
{
"ce_loss_13": 3.3944175481796264,
"ce_loss_17": 3.353794741630554,
"ce_loss_2": 4.1406211972236635,
"ce_loss_4": 3.7649981141090394,
"ce_loss_9": 3.4874671697616577,
"epoch": 0.932,
"grad_norm": 1104.0,
"kl_loss_13": 72.44851455688476,
"kl_loss_2": 1742.682635498047,
"kl_loss_4": 943.5166168212891,
"kl_loss_9": 273.04381866455077,
"learning_rate": 1.1595816670552429e-05,
"loss": 771.1343,
"step": 9320
},
{
"ce_loss_13": 3.323309564590454,
"ce_loss_17": 3.2835536003112793,
"ce_loss_2": 4.059554195404052,
"ce_loss_4": 3.683386528491974,
"ce_loss_9": 3.412449359893799,
"epoch": 0.933,
"grad_norm": 1000.0,
"kl_loss_13": 71.7368278503418,
"kl_loss_2": 1710.4193237304687,
"kl_loss_4": 919.0308288574219,
"kl_loss_9": 266.2962310791016,
"learning_rate": 1.1258547341323699e-05,
"loss": 743.8206,
"step": 9330
},
{
"ce_loss_13": 3.3531580924987794,
"ce_loss_17": 3.3139570116996766,
"ce_loss_2": 4.0956674933433534,
"ce_loss_4": 3.722786843776703,
"ce_loss_9": 3.443931555747986,
"epoch": 0.934,
"grad_norm": 680.0,
"kl_loss_13": 70.99226303100586,
"kl_loss_2": 1743.4335693359376,
"kl_loss_4": 943.1558502197265,
"kl_loss_9": 272.2658096313477,
"learning_rate": 1.0926199633097156e-05,
"loss": 753.9291,
"step": 9340
},
{
"ce_loss_13": 3.3619842648506166,
"ce_loss_17": 3.3256566524505615,
"ce_loss_2": 4.070989274978638,
"ce_loss_4": 3.71152184009552,
"ce_loss_9": 3.446810233592987,
"epoch": 0.935,
"grad_norm": 748.0,
"kl_loss_13": 68.85320167541504,
"kl_loss_2": 1681.0407958984374,
"kl_loss_4": 915.6587646484375,
"kl_loss_9": 264.3614471435547,
"learning_rate": 1.0598776892610684e-05,
"loss": 758.3277,
"step": 9350
},
{
"ce_loss_13": 3.174557101726532,
"ce_loss_17": 3.1382050037384035,
"ce_loss_2": 3.94193377494812,
"ce_loss_4": 3.554804193973541,
"ce_loss_9": 3.2670865774154665,
"epoch": 0.936,
"grad_norm": 1096.0,
"kl_loss_13": 68.04401416778565,
"kl_loss_2": 1760.4666748046875,
"kl_loss_4": 945.3590911865234,
"kl_loss_9": 266.08999099731443,
"learning_rate": 1.0276282417007399e-05,
"loss": 749.2618,
"step": 9360
},
{
"ce_loss_13": 3.3314119935035706,
"ce_loss_17": 3.2932984948158266,
"ce_loss_2": 4.055443024635315,
"ce_loss_4": 3.685200798511505,
"ce_loss_9": 3.4210570573806764,
"epoch": 0.937,
"grad_norm": 800.0,
"kl_loss_13": 69.36395244598388,
"kl_loss_2": 1693.5143676757812,
"kl_loss_4": 913.9628021240235,
"kl_loss_9": 265.6849952697754,
"learning_rate": 9.958719453803277e-06,
"loss": 748.5033,
"step": 9370
},
{
"ce_loss_13": 3.3231441020965575,
"ce_loss_17": 3.283005452156067,
"ce_loss_2": 4.085646188259124,
"ce_loss_4": 3.704668116569519,
"ce_loss_9": 3.4138529539108275,
"epoch": 0.938,
"grad_norm": 640.0,
"kl_loss_13": 70.55078353881837,
"kl_loss_2": 1759.910662841797,
"kl_loss_4": 956.954931640625,
"kl_loss_9": 271.58061294555665,
"learning_rate": 9.646091200853802e-06,
"loss": 754.8813,
"step": 9380
},
{
"ce_loss_13": 3.2818185925483703,
"ce_loss_17": 3.245718610286713,
"ce_loss_2": 4.025053870677948,
"ce_loss_4": 3.652445447444916,
"ce_loss_9": 3.370171332359314,
"epoch": 0.939,
"grad_norm": 1312.0,
"kl_loss_13": 67.64159717559815,
"kl_loss_2": 1702.7493835449218,
"kl_loss_4": 921.6332214355468,
"kl_loss_9": 264.55640106201173,
"learning_rate": 9.338400806321978e-06,
"loss": 728.2665,
"step": 9390
},
{
"ce_loss_13": 3.3152188658714294,
"ce_loss_17": 3.275552845001221,
"ce_loss_2": 4.059438633918762,
"ce_loss_4": 3.6847678899765013,
"ce_loss_9": 3.410316598415375,
"epoch": 0.94,
"grad_norm": 692.0,
"kl_loss_13": 71.03326644897462,
"kl_loss_2": 1715.1363952636718,
"kl_loss_4": 928.6465850830078,
"kl_loss_9": 271.49635848999026,
"learning_rate": 9.035651368646646e-06,
"loss": 743.4288,
"step": 9400
},
{
"ce_loss_13": 3.3191699028015136,
"ce_loss_17": 3.2819036841392517,
"ce_loss_2": 4.0510072112083435,
"ce_loss_4": 3.6800065636634827,
"ce_loss_9": 3.4086018681526182,
"epoch": 0.941,
"grad_norm": 768.0,
"kl_loss_13": 69.9085536956787,
"kl_loss_2": 1709.1678588867187,
"kl_loss_4": 927.9352905273438,
"kl_loss_9": 265.0817222595215,
"learning_rate": 8.737845936511335e-06,
"loss": 749.88,
"step": 9410
},
{
"ce_loss_13": 3.2696847438812258,
"ce_loss_17": 3.2313279032707216,
"ce_loss_2": 4.0388831973075865,
"ce_loss_4": 3.6455859422683714,
"ce_loss_9": 3.3634910702705385,
"epoch": 0.942,
"grad_norm": 740.0,
"kl_loss_13": 70.54738807678223,
"kl_loss_2": 1764.8729553222656,
"kl_loss_4": 944.1729400634765,
"kl_loss_9": 272.425553894043,
"learning_rate": 8.444987508813451e-06,
"loss": 753.2954,
"step": 9420
},
{
"ce_loss_13": 3.2250452876091003,
"ce_loss_17": 3.187439298629761,
"ce_loss_2": 4.013418805599213,
"ce_loss_4": 3.613393557071686,
"ce_loss_9": 3.32097065448761,
"epoch": 0.943,
"grad_norm": 932.0,
"kl_loss_13": 71.29195957183838,
"kl_loss_2": 1826.9091674804688,
"kl_loss_4": 979.7543212890625,
"kl_loss_9": 276.6710952758789,
"learning_rate": 8.157079034633974e-06,
"loss": 775.8956,
"step": 9430
},
{
"ce_loss_13": 3.2237427115440367,
"ce_loss_17": 3.1853844165802,
"ce_loss_2": 3.98459529876709,
"ce_loss_4": 3.5957419633865357,
"ce_loss_9": 3.3150734543800353,
"epoch": 0.944,
"grad_norm": 1120.0,
"kl_loss_13": 69.64744243621826,
"kl_loss_2": 1783.77939453125,
"kl_loss_4": 956.6733428955079,
"kl_loss_9": 270.16100692749023,
"learning_rate": 7.874123413208145e-06,
"loss": 758.3695,
"step": 9440
},
{
"ce_loss_13": 3.193096709251404,
"ce_loss_17": 3.155752348899841,
"ce_loss_2": 3.9789442300796507,
"ce_loss_4": 3.581227695941925,
"ce_loss_9": 3.2874748349189757,
"epoch": 0.945,
"grad_norm": 876.0,
"kl_loss_13": 69.05770931243896,
"kl_loss_2": 1784.2775451660157,
"kl_loss_4": 953.4078979492188,
"kl_loss_9": 271.03979644775393,
"learning_rate": 7.59612349389599e-06,
"loss": 766.0222,
"step": 9450
},
{
"ce_loss_13": 3.2857839107513427,
"ce_loss_17": 3.248835825920105,
"ce_loss_2": 4.009725487232208,
"ce_loss_4": 3.6440842509269715,
"ce_loss_9": 3.375727343559265,
"epoch": 0.946,
"grad_norm": 896.0,
"kl_loss_13": 67.77833862304688,
"kl_loss_2": 1678.2259338378906,
"kl_loss_4": 907.5334350585938,
"kl_loss_9": 261.4477348327637,
"learning_rate": 7.323082076153509e-06,
"loss": 742.3237,
"step": 9460
},
{
"ce_loss_13": 3.328112506866455,
"ce_loss_17": 3.290648400783539,
"ce_loss_2": 4.060686004161835,
"ce_loss_4": 3.688987469673157,
"ce_loss_9": 3.4178372263908385,
"epoch": 0.947,
"grad_norm": 852.0,
"kl_loss_13": 70.47306404113769,
"kl_loss_2": 1702.2720703125,
"kl_loss_4": 920.1532928466797,
"kl_loss_9": 269.27613067626953,
"learning_rate": 7.055001909504755e-06,
"loss": 757.1909,
"step": 9470
},
{
"ce_loss_13": 3.3599886775016783,
"ce_loss_17": 3.3227712154388427,
"ce_loss_2": 4.101234364509582,
"ce_loss_4": 3.7277470231056213,
"ce_loss_9": 3.4521798849105836,
"epoch": 0.948,
"grad_norm": 688.0,
"kl_loss_13": 69.91639251708985,
"kl_loss_2": 1721.4495849609375,
"kl_loss_4": 929.4572998046875,
"kl_loss_9": 271.32519454956054,
"learning_rate": 6.791885693514133e-06,
"loss": 753.6105,
"step": 9480
},
{
"ce_loss_13": 3.2632110476493836,
"ce_loss_17": 3.225801682472229,
"ce_loss_2": 4.036578476428986,
"ce_loss_4": 3.6461342096328737,
"ce_loss_9": 3.3560299634933473,
"epoch": 0.949,
"grad_norm": 852.0,
"kl_loss_13": 70.22738361358643,
"kl_loss_2": 1787.6828125,
"kl_loss_4": 959.2408905029297,
"kl_loss_9": 271.782169342041,
"learning_rate": 6.533736077758867e-06,
"loss": 769.4135,
"step": 9490
},
{
"ce_loss_13": 3.228712463378906,
"ce_loss_17": 3.1906594753265383,
"ce_loss_2": 4.0256568670272825,
"ce_loss_4": 3.6140143275260925,
"ce_loss_9": 3.322402632236481,
"epoch": 0.95,
"grad_norm": 1048.0,
"kl_loss_13": 70.73279113769532,
"kl_loss_2": 1830.7975463867188,
"kl_loss_4": 965.3727020263672,
"kl_loss_9": 273.8419059753418,
"learning_rate": 6.2805556618028556e-06,
"loss": 767.9269,
"step": 9500
},
{
"ce_loss_13": 3.3222765564918517,
"ce_loss_17": 3.2846568703651426,
"ce_loss_2": 4.04825781583786,
"ce_loss_4": 3.6733174681663514,
"ce_loss_9": 3.4051283955574037,
"epoch": 0.951,
"grad_norm": 864.0,
"kl_loss_13": 68.7835521697998,
"kl_loss_2": 1683.3633361816405,
"kl_loss_4": 898.0291534423828,
"kl_loss_9": 259.48881378173826,
"learning_rate": 6.032346995169968e-06,
"loss": 721.3496,
"step": 9510
},
{
"ce_loss_13": 3.3196557998657226,
"ce_loss_17": 3.282727527618408,
"ce_loss_2": 4.06610267162323,
"ce_loss_4": 3.6882732272148133,
"ce_loss_9": 3.4062796950340273,
"epoch": 0.952,
"grad_norm": 864.0,
"kl_loss_13": 69.86494426727295,
"kl_loss_2": 1729.0597351074218,
"kl_loss_4": 936.212890625,
"kl_loss_9": 267.50542373657225,
"learning_rate": 5.789112577318789e-06,
"loss": 746.7356,
"step": 9520
},
{
"ce_loss_13": 3.3010855078697205,
"ce_loss_17": 3.2626437067985536,
"ce_loss_2": 4.069916033744812,
"ce_loss_4": 3.6735063433647155,
"ce_loss_9": 3.3919169664382935,
"epoch": 0.953,
"grad_norm": 772.0,
"kl_loss_13": 70.53233699798584,
"kl_loss_2": 1780.1747192382813,
"kl_loss_4": 953.9167205810547,
"kl_loss_9": 270.62007904052734,
"learning_rate": 5.550854857617194e-06,
"loss": 751.1559,
"step": 9530
},
{
"ce_loss_13": 3.2827063322067263,
"ce_loss_17": 3.2447108030319214,
"ce_loss_2": 4.069734442234039,
"ce_loss_4": 3.6681645154953,
"ce_loss_9": 3.3760493993759155,
"epoch": 0.954,
"grad_norm": 932.0,
"kl_loss_13": 71.62755641937255,
"kl_loss_2": 1796.897705078125,
"kl_loss_4": 958.7881378173828,
"kl_loss_9": 275.33373107910154,
"learning_rate": 5.317576235317756e-06,
"loss": 766.1111,
"step": 9540
},
{
"ce_loss_13": 3.3136985898017883,
"ce_loss_17": 3.2769460320472716,
"ce_loss_2": 4.0408616065979,
"ce_loss_4": 3.669191324710846,
"ce_loss_9": 3.402140426635742,
"epoch": 0.955,
"grad_norm": 972.0,
"kl_loss_13": 69.35324230194092,
"kl_loss_2": 1673.2431579589843,
"kl_loss_4": 898.83974609375,
"kl_loss_9": 260.54905395507814,
"learning_rate": 5.089279059533658e-06,
"loss": 750.5405,
"step": 9550
},
{
"ce_loss_13": 3.3648689150810243,
"ce_loss_17": 3.3249842524528503,
"ce_loss_2": 4.105914556980133,
"ce_loss_4": 3.7360412955284117,
"ce_loss_9": 3.4594300985336304,
"epoch": 0.956,
"grad_norm": 900.0,
"kl_loss_13": 71.78082599639893,
"kl_loss_2": 1721.1691772460938,
"kl_loss_4": 939.7376190185547,
"kl_loss_9": 275.466544342041,
"learning_rate": 4.865965629214819e-06,
"loss": 750.1981,
"step": 9560
},
{
"ce_loss_13": 3.3124891042709352,
"ce_loss_17": 3.2736460328102113,
"ce_loss_2": 4.072525918483734,
"ce_loss_4": 3.6866251111030577,
"ce_loss_9": 3.4061823725700378,
"epoch": 0.957,
"grad_norm": 1264.0,
"kl_loss_13": 71.31147899627686,
"kl_loss_2": 1768.5251342773438,
"kl_loss_4": 946.2674346923828,
"kl_loss_9": 274.1868476867676,
"learning_rate": 4.6476381931251366e-06,
"loss": 746.5137,
"step": 9570
},
{
"ce_loss_13": 3.298474097251892,
"ce_loss_17": 3.260497677326202,
"ce_loss_2": 4.046734261512756,
"ce_loss_4": 3.6683933973312377,
"ce_loss_9": 3.390977942943573,
"epoch": 0.958,
"grad_norm": 1112.0,
"kl_loss_13": 69.28888702392578,
"kl_loss_2": 1714.635205078125,
"kl_loss_4": 926.0194061279296,
"kl_loss_9": 266.40323333740236,
"learning_rate": 4.434298949819449e-06,
"loss": 749.155,
"step": 9580
},
{
"ce_loss_13": 3.2587222695350646,
"ce_loss_17": 3.219885218143463,
"ce_loss_2": 4.048289930820465,
"ce_loss_4": 3.6507463574409487,
"ce_loss_9": 3.35064240694046,
"epoch": 0.959,
"grad_norm": 788.0,
"kl_loss_13": 72.76544532775878,
"kl_loss_2": 1839.6946411132812,
"kl_loss_4": 989.5889343261719,
"kl_loss_9": 280.65069885253905,
"learning_rate": 4.2259500476214406e-06,
"loss": 773.379,
"step": 9590
},
{
"ce_loss_13": 3.240147149562836,
"ce_loss_17": 3.201702296733856,
"ce_loss_2": 4.003677499294281,
"ce_loss_4": 3.6168144822120665,
"ce_loss_9": 3.332273817062378,
"epoch": 0.96,
"grad_norm": 752.0,
"kl_loss_13": 69.94089698791504,
"kl_loss_2": 1777.3868408203125,
"kl_loss_4": 952.8343505859375,
"kl_loss_9": 270.85561447143556,
"learning_rate": 4.02259358460233e-06,
"loss": 754.1871,
"step": 9600
},
{
"ce_loss_13": 3.305615413188934,
"ce_loss_17": 3.2665465831756593,
"ce_loss_2": 4.051685702800751,
"ce_loss_4": 3.675060486793518,
"ce_loss_9": 3.398111867904663,
"epoch": 0.961,
"grad_norm": 960.0,
"kl_loss_13": 70.52569160461425,
"kl_loss_2": 1709.8893249511718,
"kl_loss_4": 921.5408935546875,
"kl_loss_9": 268.09111938476565,
"learning_rate": 3.8242316085594916e-06,
"loss": 742.2291,
"step": 9610
},
{
"ce_loss_13": 3.190785253047943,
"ce_loss_17": 3.1509253859519957,
"ce_loss_2": 4.003580093383789,
"ce_loss_4": 3.5900234818458556,
"ce_loss_9": 3.2888937115669252,
"epoch": 0.962,
"grad_norm": 692.0,
"kl_loss_13": 70.176953125,
"kl_loss_2": 1853.9943420410157,
"kl_loss_4": 980.0655944824218,
"kl_loss_9": 275.7702995300293,
"learning_rate": 3.630866116995757e-06,
"loss": 782.2559,
"step": 9620
},
{
"ce_loss_13": 3.3454036831855776,
"ce_loss_17": 3.3080078125,
"ce_loss_2": 4.0747090697288515,
"ce_loss_4": 3.700935626029968,
"ce_loss_9": 3.433185613155365,
"epoch": 0.963,
"grad_norm": 712.0,
"kl_loss_13": 69.57482261657715,
"kl_loss_2": 1695.5358520507812,
"kl_loss_4": 906.6251342773437,
"kl_loss_9": 265.3205223083496,
"learning_rate": 3.4424990570994797e-06,
"loss": 757.5875,
"step": 9630
},
{
"ce_loss_13": 3.3344528794288637,
"ce_loss_17": 3.296273350715637,
"ce_loss_2": 4.070157301425934,
"ce_loss_4": 3.700620484352112,
"ce_loss_9": 3.425011682510376,
"epoch": 0.964,
"grad_norm": 752.0,
"kl_loss_13": 69.56573696136475,
"kl_loss_2": 1715.6682678222655,
"kl_loss_4": 931.6341369628906,
"kl_loss_9": 266.6449722290039,
"learning_rate": 3.2591323257248896e-06,
"loss": 749.5438,
"step": 9640
},
{
"ce_loss_13": 3.1882264971733094,
"ce_loss_17": 3.1511335372924805,
"ce_loss_2": 3.953037369251251,
"ce_loss_4": 3.562658357620239,
"ce_loss_9": 3.276075565814972,
"epoch": 0.965,
"grad_norm": 872.0,
"kl_loss_13": 68.23988037109375,
"kl_loss_2": 1771.6603393554688,
"kl_loss_4": 947.5275238037109,
"kl_loss_9": 267.8297752380371,
"learning_rate": 3.0807677693729385e-06,
"loss": 764.7062,
"step": 9650
},
{
"ce_loss_13": 3.3730565428733827,
"ce_loss_17": 3.336056077480316,
"ce_loss_2": 4.112894988059997,
"ce_loss_4": 3.7414841294288634,
"ce_loss_9": 3.4624873042106628,
"epoch": 0.966,
"grad_norm": 752.0,
"kl_loss_13": 70.90753707885742,
"kl_loss_2": 1713.3698181152345,
"kl_loss_4": 927.716064453125,
"kl_loss_9": 266.6476768493652,
"learning_rate": 2.9074071841727055e-06,
"loss": 739.7657,
"step": 9660
},
{
"ce_loss_13": 3.296717894077301,
"ce_loss_17": 3.2591960549354555,
"ce_loss_2": 4.048906767368317,
"ce_loss_4": 3.6761029839515684,
"ce_loss_9": 3.3900691509246825,
"epoch": 0.967,
"grad_norm": 652.0,
"kl_loss_13": 69.59696311950684,
"kl_loss_2": 1729.5003051757812,
"kl_loss_4": 946.6276977539062,
"kl_loss_9": 269.5311737060547,
"learning_rate": 2.739052315863355e-06,
"loss": 738.4226,
"step": 9670
},
{
"ce_loss_13": 3.2766026735305784,
"ce_loss_17": 3.2395278930664064,
"ce_loss_2": 4.038357901573181,
"ce_loss_4": 3.648110294342041,
"ce_loss_9": 3.3652175784111025,
"epoch": 0.968,
"grad_norm": 1080.0,
"kl_loss_13": 70.5869369506836,
"kl_loss_2": 1758.3627502441407,
"kl_loss_4": 933.8951812744141,
"kl_loss_9": 264.542862701416,
"learning_rate": 2.5757048597765396e-06,
"loss": 750.0461,
"step": 9680
},
{
"ce_loss_13": 3.2924631953239443,
"ce_loss_17": 3.2535227656364443,
"ce_loss_2": 4.052191209793091,
"ce_loss_4": 3.664053797721863,
"ce_loss_9": 3.384879839420319,
"epoch": 0.969,
"grad_norm": 1072.0,
"kl_loss_13": 69.67913131713867,
"kl_loss_2": 1746.7089233398438,
"kl_loss_4": 945.0013824462891,
"kl_loss_9": 270.4563980102539,
"learning_rate": 2.417366460819359e-06,
"loss": 753.9213,
"step": 9690
},
{
"ce_loss_13": 3.300505042076111,
"ce_loss_17": 3.260878264904022,
"ce_loss_2": 4.087180781364441,
"ce_loss_4": 3.6878235220909117,
"ce_loss_9": 3.3978606462478638,
"epoch": 0.97,
"grad_norm": 2768.0,
"kl_loss_13": 71.02720127105712,
"kl_loss_2": 1801.905743408203,
"kl_loss_4": 961.648291015625,
"kl_loss_9": 274.4756118774414,
"learning_rate": 2.2640387134577057e-06,
"loss": 754.1334,
"step": 9700
},
{
"ce_loss_13": 3.233382725715637,
"ce_loss_17": 3.196085739135742,
"ce_loss_2": 3.9578961730003357,
"ce_loss_4": 3.587084412574768,
"ce_loss_9": 3.320077121257782,
"epoch": 0.971,
"grad_norm": 816.0,
"kl_loss_13": 66.40662364959717,
"kl_loss_2": 1661.5213989257813,
"kl_loss_4": 899.0106079101563,
"kl_loss_9": 257.6714744567871,
"learning_rate": 2.115723161700278e-06,
"loss": 735.0702,
"step": 9710
},
{
"ce_loss_13": 3.2087143778800966,
"ce_loss_17": 3.169182777404785,
"ce_loss_2": 3.998633527755737,
"ce_loss_4": 3.590976357460022,
"ce_loss_9": 3.302764666080475,
"epoch": 0.972,
"grad_norm": 852.0,
"kl_loss_13": 70.9349552154541,
"kl_loss_2": 1806.8160339355468,
"kl_loss_4": 960.1527740478516,
"kl_loss_9": 273.3882225036621,
"learning_rate": 1.9724212990830937e-06,
"loss": 772.3686,
"step": 9720
},
{
"ce_loss_13": 3.347757840156555,
"ce_loss_17": 3.310051369667053,
"ce_loss_2": 4.111681115627289,
"ce_loss_4": 3.7260345101356505,
"ce_loss_9": 3.4391422271728516,
"epoch": 0.973,
"grad_norm": 668.0,
"kl_loss_13": 70.93805503845215,
"kl_loss_2": 1759.4647155761718,
"kl_loss_4": 944.5461242675781,
"kl_loss_9": 271.56855697631835,
"learning_rate": 1.8341345686543331e-06,
"loss": 757.8961,
"step": 9730
},
{
"ce_loss_13": 3.3323220372200013,
"ce_loss_17": 3.2967097997665404,
"ce_loss_2": 4.05302050113678,
"ce_loss_4": 3.6953728199005127,
"ce_loss_9": 3.4248616099357605,
"epoch": 0.974,
"grad_norm": 1040.0,
"kl_loss_13": 69.34260234832763,
"kl_loss_2": 1672.5962463378905,
"kl_loss_4": 911.3729583740235,
"kl_loss_9": 265.0969268798828,
"learning_rate": 1.7008643629596864e-06,
"loss": 755.02,
"step": 9740
},
{
"ce_loss_13": 3.31864572763443,
"ce_loss_17": 3.2793949127197264,
"ce_loss_2": 4.070427584648132,
"ce_loss_4": 3.683055078983307,
"ce_loss_9": 3.410471832752228,
"epoch": 0.975,
"grad_norm": 1012.0,
"kl_loss_13": 69.98407096862793,
"kl_loss_2": 1750.4641174316407,
"kl_loss_4": 933.7738891601563,
"kl_loss_9": 268.15416793823243,
"learning_rate": 1.5726120240288633e-06,
"loss": 762.7929,
"step": 9750
},
{
"ce_loss_13": 3.218663954734802,
"ce_loss_17": 3.181781232357025,
"ce_loss_2": 3.9690282464027407,
"ce_loss_4": 3.5920695543289183,
"ce_loss_9": 3.3074343681335447,
"epoch": 0.976,
"grad_norm": 712.0,
"kl_loss_13": 68.4236557006836,
"kl_loss_2": 1734.748388671875,
"kl_loss_4": 934.9843048095703,
"kl_loss_9": 265.47901916503906,
"learning_rate": 1.4493788433612708e-06,
"loss": 748.722,
"step": 9760
},
{
"ce_loss_13": 3.335367035865784,
"ce_loss_17": 3.297475850582123,
"ce_loss_2": 4.103602564334869,
"ce_loss_4": 3.7194851636886597,
"ce_loss_9": 3.4302751779556275,
"epoch": 0.977,
"grad_norm": 620.0,
"kl_loss_13": 70.56827011108399,
"kl_loss_2": 1770.5423828125,
"kl_loss_4": 957.4742462158204,
"kl_loss_9": 272.5340232849121,
"learning_rate": 1.3311660619138578e-06,
"loss": 765.3009,
"step": 9770
},
{
"ce_loss_13": 3.336168646812439,
"ce_loss_17": 3.2970511078834535,
"ce_loss_2": 4.043975496292115,
"ce_loss_4": 3.690586745738983,
"ce_loss_9": 3.4224077343940733,
"epoch": 0.978,
"grad_norm": 904.0,
"kl_loss_13": 69.4926971435547,
"kl_loss_2": 1652.4708862304688,
"kl_loss_4": 911.5842163085938,
"kl_loss_9": 264.75740280151365,
"learning_rate": 1.2179748700879012e-06,
"loss": 746.8001,
"step": 9780
},
{
"ce_loss_13": 3.263418173789978,
"ce_loss_17": 3.2262269020080567,
"ce_loss_2": 4.012387645244599,
"ce_loss_4": 3.6370876669883727,
"ce_loss_9": 3.3552389621734617,
"epoch": 0.979,
"grad_norm": 1248.0,
"kl_loss_13": 69.25509204864503,
"kl_loss_2": 1723.763494873047,
"kl_loss_4": 927.4136138916016,
"kl_loss_9": 265.8643524169922,
"learning_rate": 1.1098064077174619e-06,
"loss": 753.7449,
"step": 9790
},
{
"ce_loss_13": 3.292123317718506,
"ce_loss_17": 3.253508412837982,
"ce_loss_2": 4.069903910160065,
"ce_loss_4": 3.6727489948272707,
"ce_loss_9": 3.3865288853645326,
"epoch": 0.98,
"grad_norm": 900.0,
"kl_loss_13": 69.55743618011475,
"kl_loss_2": 1785.3747497558593,
"kl_loss_4": 953.881314086914,
"kl_loss_9": 269.7953392028809,
"learning_rate": 1.006661764057837e-06,
"loss": 757.5746,
"step": 9800
},
{
"ce_loss_13": 3.29565806388855,
"ce_loss_17": 3.259297585487366,
"ce_loss_2": 4.048216080665588,
"ce_loss_4": 3.6683563709259035,
"ce_loss_9": 3.386642026901245,
"epoch": 0.981,
"grad_norm": 968.0,
"kl_loss_13": 69.07173500061035,
"kl_loss_2": 1738.47861328125,
"kl_loss_4": 937.5677337646484,
"kl_loss_9": 266.25218734741213,
"learning_rate": 9.085419777743465e-07,
"loss": 748.7905,
"step": 9810
},
{
"ce_loss_13": 3.2435696125030518,
"ce_loss_17": 3.207517647743225,
"ce_loss_2": 4.002567946910858,
"ce_loss_4": 3.617623841762543,
"ce_loss_9": 3.333637535572052,
"epoch": 0.982,
"grad_norm": 800.0,
"kl_loss_13": 67.94173545837403,
"kl_loss_2": 1745.14423828125,
"kl_loss_4": 936.8636291503906,
"kl_loss_9": 262.454483795166,
"learning_rate": 8.15448036932176e-07,
"loss": 739.0622,
"step": 9820
},
{
"ce_loss_13": 3.291981852054596,
"ce_loss_17": 3.2551918506622313,
"ce_loss_2": 4.038154816627502,
"ce_loss_4": 3.6620378851890565,
"ce_loss_9": 3.3822624683380127,
"epoch": 0.983,
"grad_norm": 648.0,
"kl_loss_13": 69.3007080078125,
"kl_loss_2": 1744.5392822265626,
"kl_loss_4": 948.7087493896485,
"kl_loss_9": 269.4570854187012,
"learning_rate": 7.273808789862724e-07,
"loss": 763.5148,
"step": 9830
},
{
"ce_loss_13": 3.369704580307007,
"ce_loss_17": 3.3324461221694945,
"ce_loss_2": 4.108627426624298,
"ce_loss_4": 3.735966980457306,
"ce_loss_9": 3.461838722229004,
"epoch": 0.984,
"grad_norm": 732.0,
"kl_loss_13": 71.17484645843506,
"kl_loss_2": 1731.5099853515626,
"kl_loss_4": 940.5303466796875,
"kl_loss_9": 271.71323165893557,
"learning_rate": 6.443413907720186e-07,
"loss": 750.0659,
"step": 9840
},
{
"ce_loss_13": 3.3022388219833374,
"ce_loss_17": 3.2654573798179625,
"ce_loss_2": 4.047054243087769,
"ce_loss_4": 3.6680091381073,
"ce_loss_9": 3.3926560521125793,
"epoch": 0.985,
"grad_norm": 804.0,
"kl_loss_13": 69.3776517868042,
"kl_loss_2": 1703.3996826171874,
"kl_loss_4": 919.8436981201172,
"kl_loss_9": 267.2551780700684,
"learning_rate": 5.663304084960185e-07,
"loss": 741.2038,
"step": 9850
},
{
"ce_loss_13": 3.228341591358185,
"ce_loss_17": 3.1917863130569457,
"ce_loss_2": 4.006502139568329,
"ce_loss_4": 3.6165035367012024,
"ce_loss_9": 3.321926999092102,
"epoch": 0.986,
"grad_norm": 628.0,
"kl_loss_13": 69.7114330291748,
"kl_loss_2": 1775.5684326171875,
"kl_loss_4": 950.0264434814453,
"kl_loss_9": 269.45206451416016,
"learning_rate": 4.933487177280482e-07,
"loss": 746.5763,
"step": 9860
},
{
"ce_loss_13": 3.3282948970794677,
"ce_loss_17": 3.2902629494667055,
"ce_loss_2": 4.073453938961029,
"ce_loss_4": 3.6912740230560304,
"ce_loss_9": 3.418955981731415,
"epoch": 0.987,
"grad_norm": 704.0,
"kl_loss_13": 68.88297080993652,
"kl_loss_2": 1733.244970703125,
"kl_loss_4": 929.6509796142578,
"kl_loss_9": 264.36970443725585,
"learning_rate": 4.2539705339295075e-07,
"loss": 743.9317,
"step": 9870
},
{
"ce_loss_13": 3.184259068965912,
"ce_loss_17": 3.146634590625763,
"ce_loss_2": 3.948678719997406,
"ce_loss_4": 3.568000066280365,
"ce_loss_9": 3.279090178012848,
"epoch": 0.988,
"grad_norm": 756.0,
"kl_loss_13": 69.63558444976806,
"kl_loss_2": 1756.38798828125,
"kl_loss_4": 957.2883392333985,
"kl_loss_9": 278.4889938354492,
"learning_rate": 3.6247609976319816e-07,
"loss": 749.9763,
"step": 9880
},
{
"ce_loss_13": 3.2724010348320007,
"ce_loss_17": 3.233578050136566,
"ce_loss_2": 4.047141480445862,
"ce_loss_4": 3.6549957036972045,
"ce_loss_9": 3.3679153680801392,
"epoch": 0.989,
"grad_norm": 1024.0,
"kl_loss_13": 70.6495204925537,
"kl_loss_2": 1769.112371826172,
"kl_loss_4": 951.6210327148438,
"kl_loss_9": 272.63334426879885,
"learning_rate": 3.0458649045211895e-07,
"loss": 773.768,
"step": 9890
},
{
"ce_loss_13": 3.2451439619064333,
"ce_loss_17": 3.205696094036102,
"ce_loss_2": 4.007219898700714,
"ce_loss_4": 3.629718315601349,
"ce_loss_9": 3.3400197505950926,
"epoch": 0.99,
"grad_norm": 800.0,
"kl_loss_13": 70.50988845825195,
"kl_loss_2": 1742.4984558105468,
"kl_loss_4": 959.8676452636719,
"kl_loss_9": 274.8178520202637,
"learning_rate": 2.517288084074587e-07,
"loss": 772.7277,
"step": 9900
},
{
"ce_loss_13": 3.2862741708755494,
"ce_loss_17": 3.246384072303772,
"ce_loss_2": 4.084227788448334,
"ce_loss_4": 3.6856178522109984,
"ce_loss_9": 3.386175799369812,
"epoch": 0.991,
"grad_norm": 736.0,
"kl_loss_13": 71.46778984069825,
"kl_loss_2": 1809.738525390625,
"kl_loss_4": 976.1868469238282,
"kl_loss_9": 278.51739807128905,
"learning_rate": 2.0390358590538505e-07,
"loss": 768.0669,
"step": 9910
},
{
"ce_loss_13": 3.296647012233734,
"ce_loss_17": 3.2595779418945314,
"ce_loss_2": 4.049106597900391,
"ce_loss_4": 3.6708460211753846,
"ce_loss_9": 3.3929797291755674,
"epoch": 0.992,
"grad_norm": 816.0,
"kl_loss_13": 70.1730453491211,
"kl_loss_2": 1743.3746154785156,
"kl_loss_4": 954.984341430664,
"kl_loss_9": 275.02985229492185,
"learning_rate": 1.61111304545436e-07,
"loss": 753.1512,
"step": 9920
},
{
"ce_loss_13": 3.259925878047943,
"ce_loss_17": 3.2214688301086425,
"ce_loss_2": 4.006985282897949,
"ce_loss_4": 3.6319169282913206,
"ce_loss_9": 3.3517908215522767,
"epoch": 0.993,
"grad_norm": 704.0,
"kl_loss_13": 69.38464469909668,
"kl_loss_2": 1734.0220458984375,
"kl_loss_4": 941.4155578613281,
"kl_loss_9": 268.93776626586913,
"learning_rate": 1.2335239524541298e-07,
"loss": 742.7448,
"step": 9930
},
{
"ce_loss_13": 3.232028913497925,
"ce_loss_17": 3.1935453176498414,
"ce_loss_2": 3.9849515080451967,
"ce_loss_4": 3.60744651556015,
"ce_loss_9": 3.3257043600082397,
"epoch": 0.994,
"grad_norm": 836.0,
"kl_loss_13": 69.2954231262207,
"kl_loss_2": 1727.1304809570313,
"kl_loss_4": 928.5043914794921,
"kl_loss_9": 265.77542266845705,
"learning_rate": 9.06272382371065e-08,
"loss": 752.2372,
"step": 9940
},
{
"ce_loss_13": 3.297999620437622,
"ce_loss_17": 3.2620956897735596,
"ce_loss_2": 4.070128989219666,
"ce_loss_4": 3.679764378070831,
"ce_loss_9": 3.3932497262954713,
"epoch": 0.995,
"grad_norm": 696.0,
"kl_loss_13": 71.25127658843994,
"kl_loss_2": 1781.1347351074219,
"kl_loss_4": 961.7238128662109,
"kl_loss_9": 274.8592224121094,
"learning_rate": 6.293616306246586e-08,
"loss": 759.6051,
"step": 9950
},
{
"ce_loss_13": 3.2957133054733276,
"ce_loss_17": 3.2595414757728576,
"ce_loss_2": 4.026886129379273,
"ce_loss_4": 3.658927488327026,
"ce_loss_9": 3.383803868293762,
"epoch": 0.996,
"grad_norm": 800.0,
"kl_loss_13": 68.08552322387695,
"kl_loss_2": 1692.548760986328,
"kl_loss_4": 916.358920288086,
"kl_loss_9": 262.1323112487793,
"learning_rate": 4.027944857032395e-08,
"loss": 726.9326,
"step": 9960
},
{
"ce_loss_13": 3.29274468421936,
"ce_loss_17": 3.257336509227753,
"ce_loss_2": 4.0016671299934385,
"ce_loss_4": 3.640883004665375,
"ce_loss_9": 3.3762596607208253,
"epoch": 0.997,
"grad_norm": 676.0,
"kl_loss_13": 67.64072341918946,
"kl_loss_2": 1639.7742065429688,
"kl_loss_4": 890.015170288086,
"kl_loss_9": 255.31717681884766,
"learning_rate": 2.265732291356626e-08,
"loss": 724.3973,
"step": 9970
},
{
"ce_loss_13": 3.3301588773727415,
"ce_loss_17": 3.292578196525574,
"ce_loss_2": 4.057374143600464,
"ce_loss_4": 3.6901230216026306,
"ce_loss_9": 3.4179542779922487,
"epoch": 0.998,
"grad_norm": 680.0,
"kl_loss_13": 69.15830268859864,
"kl_loss_2": 1689.9597412109374,
"kl_loss_4": 915.1488616943359,
"kl_loss_9": 266.0099395751953,
"learning_rate": 1.0069963546743833e-08,
"loss": 755.9442,
"step": 9980
},
{
"ce_loss_13": 3.3151816725730896,
"ce_loss_17": 3.275153863430023,
"ce_loss_2": 4.068166565895081,
"ce_loss_4": 3.6815107583999636,
"ce_loss_9": 3.405549705028534,
"epoch": 0.999,
"grad_norm": 792.0,
"kl_loss_13": 69.87775192260742,
"kl_loss_2": 1741.6067260742188,
"kl_loss_4": 939.5706390380859,
"kl_loss_9": 270.8768714904785,
"learning_rate": 2.517497224463483e-09,
"loss": 750.72,
"step": 9990
},
{
"ce_loss_13": 3.264886772632599,
"ce_loss_17": 3.225277531147003,
"ce_loss_2": 4.066579639911652,
"ce_loss_4": 3.6539052367210387,
"ce_loss_9": 3.362174320220947,
"epoch": 1.0,
"grad_norm": 776.0,
"kl_loss_13": 71.4085069656372,
"kl_loss_2": 1836.1367431640624,
"kl_loss_4": 969.44462890625,
"kl_loss_9": 277.70877304077146,
"learning_rate": 0.0,
"loss": 777.38,
"step": 10000
}
],
"logging_steps": 10,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.447557417823109e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}