{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "ce_loss_13": 6.402231216430664, "ce_loss_17": 3.2689143419265747, "ce_loss_2": 9.390046119689941, "ce_loss_4": 9.192162036895752, "ce_loss_9": 6.634675979614258, "epoch": 0.0001, "grad_norm": 37632.0, "kl_loss_13": 7664.65576171875, "kl_loss_2": 12785.50146484375, "kl_loss_4": 12609.859375, "kl_loss_9": 7861.16552734375, "learning_rate": 1e-05, "loss": 10554.3262, "step": 1 }, { "ce_loss_13": 5.711873133977254, "ce_loss_17": 3.3221818341149225, "ce_loss_2": 8.177524778578016, "ce_loss_4": 7.773941384421454, "ce_loss_9": 5.815383354822795, "epoch": 0.001, "grad_norm": 16640.0, "kl_loss_13": 5945.98235405816, "kl_loss_2": 9901.66468641493, "kl_loss_4": 9238.261610243055, "kl_loss_9": 5542.482516818576, "learning_rate": 0.0001, "loss": 7684.5868, "step": 10 }, { "ce_loss_13": 4.2000329971313475, "ce_loss_17": 3.3278347134590147, "ce_loss_2": 6.55702052116394, "ce_loss_4": 6.119993329048157, "ce_loss_9": 4.6535967826843265, "epoch": 0.002, "grad_norm": 1968.0, "kl_loss_13": 1805.488510131836, "kl_loss_2": 6092.596118164062, "kl_loss_4": 5236.483740234375, "kl_loss_9": 2427.8350463867187, "learning_rate": 0.0002, "loss": 3912.6133, "step": 20 }, { "ce_loss_13": 3.526861870288849, "ce_loss_17": 3.1244616389274595, "ce_loss_2": 5.735381245613098, "ce_loss_4": 5.213837122917175, "ce_loss_9": 3.934248983860016, "epoch": 0.003, "grad_norm": 1336.0, "kl_loss_13": 756.9810455322265, "kl_loss_2": 4979.42373046875, "kl_loss_4": 3980.5005615234377, "kl_loss_9": 1485.3927185058594, "learning_rate": 0.0003, "loss": 2754.8352, "step": 30 }, { "ce_loss_13": 3.5736655235290526, "ce_loss_17": 3.290929138660431, "ce_loss_2": 5.478212141990662, "ce_loss_4": 4.976801490783691, "ce_loss_9": 3.9243278384208677, "epoch": 0.004, "grad_norm": 1720.0, "kl_loss_13": 516.0072662353516, "kl_loss_2": 4206.617651367187, "kl_loss_4": 3240.3652221679686, "kl_loss_9": 1167.3858154296875, "learning_rate": 0.0004, "loss": 2292.9277, "step": 40 }, { "ce_loss_13": 3.492569625377655, "ce_loss_17": 3.263450765609741, "ce_loss_2": 5.301641321182251, "ce_loss_4": 4.762786316871643, "ce_loss_9": 3.813670790195465, "epoch": 0.005, "grad_norm": 1336.0, "kl_loss_13": 408.9797164916992, "kl_loss_2": 3949.739465332031, "kl_loss_4": 2919.3654174804688, "kl_loss_9": 1008.5517547607421, "learning_rate": 0.0005, "loss": 2071.7504, "step": 50 }, { "ce_loss_13": 3.468132793903351, "ce_loss_17": 3.281778705120087, "ce_loss_2": 5.174832701683044, "ce_loss_4": 4.657959699630737, "ce_loss_9": 3.7767446517944334, "epoch": 0.006, "grad_norm": 1264.0, "kl_loss_13": 346.4393112182617, "kl_loss_2": 3695.2262329101563, "kl_loss_4": 2685.748815917969, "kl_loss_9": 924.1326385498047, "learning_rate": 0.0006, "loss": 1916.6861, "step": 60 }, { "ce_loss_13": 3.3564661741256714, "ce_loss_17": 3.1969870805740355, "ce_loss_2": 5.081459856033325, "ce_loss_4": 4.552425503730774, "ce_loss_9": 3.6519309878349304, "epoch": 0.007, "grad_norm": 1640.0, "kl_loss_13": 296.28175048828126, "kl_loss_2": 3685.4558715820312, "kl_loss_4": 2671.9430419921873, "kl_loss_9": 849.3832427978516, "learning_rate": 0.0007, "loss": 1864.6594, "step": 70 }, { "ce_loss_13": 3.3460593104362486, "ce_loss_17": 3.1964768290519716, "ce_loss_2": 5.04540433883667, "ce_loss_4": 4.464562845230103, "ce_loss_9": 3.6478141069412233, "epoch": 0.008, "grad_norm": 2192.0, "kl_loss_13": 271.24009094238284, "kl_loss_2": 3629.17392578125, "kl_loss_4": 2484.8527954101564, "kl_loss_9": 836.3974060058594, "learning_rate": 0.0008, "loss": 1818.5662, "step": 80 }, { "ce_loss_13": 3.345805835723877, "ce_loss_17": 3.1532609820365907, "ce_loss_2": 5.009881067276001, "ce_loss_4": 4.3814632296562195, "ce_loss_9": 3.5915980696678163, "epoch": 0.009, "grad_norm": 1680.0, "kl_loss_13": 358.3774284362793, "kl_loss_2": 3638.965710449219, "kl_loss_4": 2418.6386596679686, "kl_loss_9": 819.6164855957031, "learning_rate": 0.0009000000000000001, "loss": 1801.5828, "step": 90 }, { "ce_loss_13": 3.487445020675659, "ce_loss_17": 3.269311249256134, "ce_loss_2": 5.072077345848084, "ce_loss_4": 4.4792849779129025, "ce_loss_9": 3.7141328930854796, "epoch": 0.01, "grad_norm": 2016.0, "kl_loss_13": 425.99823303222655, "kl_loss_2": 3543.0379638671875, "kl_loss_4": 2380.678662109375, "kl_loss_9": 826.6310424804688, "learning_rate": 0.001, "loss": 1796.9578, "step": 100 }, { "ce_loss_13": 3.42359539270401, "ce_loss_17": 3.226667749881744, "ce_loss_2": 5.0204795598983765, "ce_loss_4": 4.459975528717041, "ce_loss_9": 3.6708388924598694, "epoch": 0.011, "grad_norm": 1488.0, "kl_loss_13": 365.5156448364258, "kl_loss_2": 3526.2210327148437, "kl_loss_4": 2427.8171630859374, "kl_loss_9": 829.8687622070313, "learning_rate": 0.0009999974825027757, "loss": 1782.9496, "step": 110 }, { "ce_loss_13": 3.4499420881271363, "ce_loss_17": 3.289404010772705, "ce_loss_2": 5.006643867492675, "ce_loss_4": 4.4474083423614506, "ce_loss_9": 3.717779290676117, "epoch": 0.012, "grad_norm": 1424.0, "kl_loss_13": 293.02129516601565, "kl_loss_2": 3400.0647583007812, "kl_loss_4": 2294.327313232422, "kl_loss_9": 819.5202423095703, "learning_rate": 0.0009999899300364532, "loss": 1681.502, "step": 120 }, { "ce_loss_13": 3.4073472142219545, "ce_loss_17": 3.2614750146865843, "ce_loss_2": 4.989273881912231, "ce_loss_4": 4.405853271484375, "ce_loss_9": 3.69451265335083, "epoch": 0.013, "grad_norm": 1072.0, "kl_loss_13": 266.0638145446777, "kl_loss_2": 3416.4965698242186, "kl_loss_4": 2282.6978149414062, "kl_loss_9": 795.0189605712891, "learning_rate": 0.0009999773426770863, "loss": 1705.7545, "step": 130 }, { "ce_loss_13": 3.4283769369125365, "ce_loss_17": 3.297203207015991, "ce_loss_2": 4.972061061859131, "ce_loss_4": 4.393543720245361, "ce_loss_9": 3.7087190985679626, "epoch": 0.014, "grad_norm": 1192.0, "kl_loss_13": 239.3195457458496, "kl_loss_2": 3318.6891845703126, "kl_loss_4": 2176.9557983398436, "kl_loss_9": 783.5355072021484, "learning_rate": 0.0009999597205514296, "loss": 1645.6031, "step": 140 }, { "ce_loss_13": 3.377828299999237, "ce_loss_17": 3.253422975540161, "ce_loss_2": 4.94508650302887, "ce_loss_4": 4.358719778060913, "ce_loss_9": 3.6640469789505006, "epoch": 0.015, "grad_norm": 1056.0, "kl_loss_13": 223.69755477905272, "kl_loss_2": 3351.4960815429686, "kl_loss_4": 2190.3794067382814, "kl_loss_9": 776.6426544189453, "learning_rate": 0.0009999370638369377, "loss": 1640.4398, "step": 150 }, { "ce_loss_13": 3.405010259151459, "ce_loss_17": 3.291413652896881, "ce_loss_2": 4.949583244323731, "ce_loss_4": 4.344371438026428, "ce_loss_9": 3.69311705827713, "epoch": 0.016, "grad_norm": 1024.0, "kl_loss_13": 205.55703506469726, "kl_loss_2": 3293.4473510742187, "kl_loss_4": 2095.886193847656, "kl_loss_9": 763.1888641357422, "learning_rate": 0.000999909372761763, "loss": 1593.7709, "step": 160 }, { "ce_loss_13": 3.3376732587814333, "ce_loss_17": 3.2281859874725343, "ce_loss_2": 4.898326992988586, "ce_loss_4": 4.281079268455505, "ce_loss_9": 3.615650844573975, "epoch": 0.017, "grad_norm": 960.0, "kl_loss_13": 205.50527114868163, "kl_loss_2": 3335.3987670898437, "kl_loss_4": 2112.4116760253905, "kl_loss_9": 738.5396911621094, "learning_rate": 0.0009998766476047546, "loss": 1608.7034, "step": 170 }, { "ce_loss_13": 3.385698843002319, "ce_loss_17": 3.2674369692802427, "ce_loss_2": 4.935103106498718, "ce_loss_4": 4.331924676895142, "ce_loss_9": 3.6581133723258974, "epoch": 0.018, "grad_norm": 984.0, "kl_loss_13": 214.01333236694336, "kl_loss_2": 3320.2940185546877, "kl_loss_4": 2132.7986267089846, "kl_loss_9": 732.3475860595703, "learning_rate": 0.0009998388886954545, "loss": 1609.165, "step": 180 }, { "ce_loss_13": 3.344286847114563, "ce_loss_17": 3.2329752802848817, "ce_loss_2": 4.9058679103851315, "ce_loss_4": 4.2970584511756895, "ce_loss_9": 3.6102590441703795, "epoch": 0.019, "grad_norm": 1416.0, "kl_loss_13": 207.53194122314454, "kl_loss_2": 3344.6282348632812, "kl_loss_4": 2104.0942565917967, "kl_loss_9": 712.7711883544922, "learning_rate": 0.0009997960964140947, "loss": 1580.0558, "step": 190 }, { "ce_loss_13": 3.3503451228141783, "ce_loss_17": 3.230080413818359, "ce_loss_2": 4.908375334739685, "ce_loss_4": 4.296644294261933, "ce_loss_9": 3.5918402433395387, "epoch": 0.02, "grad_norm": 912.0, "kl_loss_13": 222.13727722167968, "kl_loss_2": 3336.8043701171873, "kl_loss_4": 2133.3271484375, "kl_loss_9": 681.4191864013671, "learning_rate": 0.0009997482711915926, "loss": 1573.9847, "step": 200 }, { "ce_loss_13": 3.3222963929176332, "ce_loss_17": 3.204137551784515, "ce_loss_2": 4.812633442878723, "ce_loss_4": 4.211560595035553, "ce_loss_9": 3.5349189162254335, "epoch": 0.021, "grad_norm": 816.0, "kl_loss_13": 220.91544876098632, "kl_loss_2": 3218.9809814453124, "kl_loss_4": 2025.6634887695313, "kl_loss_9": 634.4633239746094, "learning_rate": 0.0009996954135095479, "loss": 1521.7721, "step": 210 }, { "ce_loss_13": 3.4005984902381896, "ce_loss_17": 3.280700922012329, "ce_loss_2": 4.841261887550354, "ce_loss_4": 4.240457427501679, "ce_loss_9": 3.607716429233551, "epoch": 0.022, "grad_norm": 836.0, "kl_loss_13": 223.7593849182129, "kl_loss_2": 3111.2940673828125, "kl_loss_4": 1915.070245361328, "kl_loss_9": 620.2754699707032, "learning_rate": 0.0009996375239002368, "loss": 1465.0939, "step": 220 }, { "ce_loss_13": 3.463943064212799, "ce_loss_17": 3.35058434009552, "ce_loss_2": 4.868658900260925, "ce_loss_4": 4.278674197196961, "ce_loss_9": 3.6834152340888977, "epoch": 0.023, "grad_norm": 1008.0, "kl_loss_13": 208.8836555480957, "kl_loss_2": 3045.0566284179686, "kl_loss_4": 1886.1902648925782, "kl_loss_9": 636.5732116699219, "learning_rate": 0.0009995746029466072, "loss": 1450.0878, "step": 230 }, { "ce_loss_13": 3.252587044239044, "ce_loss_17": 3.1401400566101074, "ce_loss_2": 4.778895163536072, "ce_loss_4": 4.140607786178589, "ce_loss_9": 3.4961634039878846, "epoch": 0.024, "grad_norm": 1072.0, "kl_loss_13": 202.95092086791993, "kl_loss_2": 3285.569482421875, "kl_loss_4": 2015.8346618652345, "kl_loss_9": 684.3923858642578, "learning_rate": 0.0009995066512822719, "loss": 1495.9869, "step": 240 }, { "ce_loss_13": 3.339576780796051, "ce_loss_17": 3.2416956186294557, "ce_loss_2": 4.896433734893799, "ce_loss_4": 4.256370925903321, "ce_loss_9": 3.572561264038086, "epoch": 0.025, "grad_norm": 880.0, "kl_loss_13": 188.93093719482422, "kl_loss_2": 3313.684326171875, "kl_loss_4": 2032.2442749023437, "kl_loss_9": 634.844645690918, "learning_rate": 0.000999433669591504, "loss": 1474.7885, "step": 250 }, { "ce_loss_13": 3.2495914578437803, "ce_loss_17": 3.148615872859955, "ce_loss_2": 4.763438987731933, "ce_loss_4": 4.118508851528167, "ce_loss_9": 3.4872807264328003, "epoch": 0.026, "grad_norm": 932.0, "kl_loss_13": 180.5644905090332, "kl_loss_2": 3247.513635253906, "kl_loss_4": 1957.0520812988282, "kl_loss_9": 634.1553527832032, "learning_rate": 0.000999355658609228, "loss": 1475.6226, "step": 260 }, { "ce_loss_13": 3.278519403934479, "ce_loss_17": 3.1758210182189943, "ce_loss_2": 4.820499777793884, "ce_loss_4": 4.18280280828476, "ce_loss_9": 3.5275792717933654, "epoch": 0.027, "grad_norm": 740.0, "kl_loss_13": 180.34210205078125, "kl_loss_2": 3261.2527465820312, "kl_loss_4": 1987.8143920898438, "kl_loss_9": 652.8159881591797, "learning_rate": 0.0009992726191210138, "loss": 1498.7066, "step": 270 }, { "ce_loss_13": 3.3170214653015138, "ce_loss_17": 3.2177631139755247, "ce_loss_2": 4.788047671318054, "ce_loss_4": 4.174385607242584, "ce_loss_9": 3.556040048599243, "epoch": 0.028, "grad_norm": 716.0, "kl_loss_13": 179.22331695556642, "kl_loss_2": 3124.784069824219, "kl_loss_4": 1924.5135559082032, "kl_loss_9": 634.743994140625, "learning_rate": 0.0009991845519630679, "loss": 1450.6994, "step": 280 }, { "ce_loss_13": 3.197556221485138, "ce_loss_17": 3.1011061906814574, "ce_loss_2": 4.678141450881958, "ce_loss_4": 4.070699083805084, "ce_loss_9": 3.4350595355033873, "epoch": 0.029, "grad_norm": 752.0, "kl_loss_13": 174.9555892944336, "kl_loss_2": 3147.2247314453125, "kl_loss_4": 1939.0474731445313, "kl_loss_9": 625.6185363769531, "learning_rate": 0.0009990914580222257, "loss": 1464.1248, "step": 290 }, { "ce_loss_13": 3.335582661628723, "ce_loss_17": 3.245238924026489, "ce_loss_2": 4.727395129203797, "ce_loss_4": 4.135890567302704, "ce_loss_9": 3.5657243847846987, "epoch": 0.03, "grad_norm": 856.0, "kl_loss_13": 169.86073608398436, "kl_loss_2": 3007.0453857421876, "kl_loss_4": 1823.50703125, "kl_loss_9": 620.2317199707031, "learning_rate": 0.0009989933382359422, "loss": 1438.8438, "step": 300 }, { "ce_loss_13": 3.344461226463318, "ce_loss_17": 3.253080868721008, "ce_loss_2": 4.734828901290894, "ce_loss_4": 4.140229022502899, "ce_loss_9": 3.570216417312622, "epoch": 0.031, "grad_norm": 868.0, "kl_loss_13": 170.47278060913087, "kl_loss_2": 2986.3127563476564, "kl_loss_4": 1806.9783508300782, "kl_loss_9": 611.4689514160157, "learning_rate": 0.0009988901935922825, "loss": 1407.2166, "step": 310 }, { "ce_loss_13": 3.1942640423774717, "ce_loss_17": 3.098420965671539, "ce_loss_2": 4.6632726192474365, "ce_loss_4": 4.045630240440369, "ce_loss_9": 3.4386914730072022, "epoch": 0.032, "grad_norm": 712.0, "kl_loss_13": 176.81885147094727, "kl_loss_2": 3156.0201416015625, "kl_loss_4": 1910.4445190429688, "kl_loss_9": 635.6860961914062, "learning_rate": 0.0009987820251299122, "loss": 1429.7891, "step": 320 }, { "ce_loss_13": 3.323316276073456, "ce_loss_17": 3.2272282600402833, "ce_loss_2": 4.688269329071045, "ce_loss_4": 4.095155251026154, "ce_loss_9": 3.5452579855918884, "epoch": 0.033, "grad_norm": 716.0, "kl_loss_13": 168.06323318481446, "kl_loss_2": 2952.4156982421873, "kl_loss_4": 1773.4942504882813, "kl_loss_9": 594.2050231933594, "learning_rate": 0.0009986688339380862, "loss": 1369.0148, "step": 330 }, { "ce_loss_13": 3.263345181941986, "ce_loss_17": 3.1782546758651735, "ce_loss_2": 4.621670579910278, "ce_loss_4": 4.041196537017822, "ce_loss_9": 3.4756152391433717, "epoch": 0.034, "grad_norm": 672.0, "kl_loss_13": 158.30765762329102, "kl_loss_2": 2897.716516113281, "kl_loss_4": 1739.8407348632813, "kl_loss_9": 566.780419921875, "learning_rate": 0.0009985506211566387, "loss": 1351.8492, "step": 340 }, { "ce_loss_13": 3.297460699081421, "ce_loss_17": 3.2095221042633058, "ce_loss_2": 4.629465413093567, "ce_loss_4": 4.048249614238739, "ce_loss_9": 3.4922284483909607, "epoch": 0.035, "grad_norm": 708.0, "kl_loss_13": 154.92753982543945, "kl_loss_2": 2867.41044921875, "kl_loss_4": 1712.3839904785157, "kl_loss_9": 556.999772644043, "learning_rate": 0.0009984273879759713, "loss": 1332.5782, "step": 350 }, { "ce_loss_13": 3.3271041035652162, "ce_loss_17": 3.237209177017212, "ce_loss_2": 4.693679356575013, "ce_loss_4": 4.1152663230896, "ce_loss_9": 3.5451638102531433, "epoch": 0.036, "grad_norm": 688.0, "kl_loss_13": 161.71177139282227, "kl_loss_2": 2938.1607299804687, "kl_loss_4": 1775.6144470214845, "kl_loss_9": 578.8257720947265, "learning_rate": 0.0009982991356370402, "loss": 1381.892, "step": 360 }, { "ce_loss_13": 3.3030256986618043, "ce_loss_17": 3.216581439971924, "ce_loss_2": 4.672011542320251, "ce_loss_4": 4.09724098443985, "ce_loss_9": 3.513698399066925, "epoch": 0.037, "grad_norm": 800.0, "kl_loss_13": 158.75547256469727, "kl_loss_2": 2935.6953857421877, "kl_loss_4": 1770.2281066894532, "kl_loss_9": 569.4476699829102, "learning_rate": 0.0009981658654313456, "loss": 1366.0152, "step": 370 }, { "ce_loss_13": 3.377531623840332, "ce_loss_17": 3.2923927903175354, "ce_loss_2": 4.704947686195373, "ce_loss_4": 4.1200734853744505, "ce_loss_9": 3.582323205471039, "epoch": 0.038, "grad_norm": 648.0, "kl_loss_13": 156.85157165527343, "kl_loss_2": 2856.6430053710938, "kl_loss_4": 1691.1573059082032, "kl_loss_9": 549.3141067504882, "learning_rate": 0.000998027578700917, "loss": 1336.6225, "step": 380 }, { "ce_loss_13": 3.3232869505882263, "ce_loss_17": 3.2330244302749636, "ce_loss_2": 4.671757459640503, "ce_loss_4": 4.081779193878174, "ce_loss_9": 3.530698537826538, "epoch": 0.039, "grad_norm": 716.0, "kl_loss_13": 158.69569473266603, "kl_loss_2": 2900.0949096679688, "kl_loss_4": 1729.8722045898437, "kl_loss_9": 555.8867797851562, "learning_rate": 0.0009978842768382998, "loss": 1345.1803, "step": 390 }, { "ce_loss_13": 3.3343419432640076, "ce_loss_17": 3.2496871948242188, "ce_loss_2": 4.6395210981369015, "ce_loss_4": 4.081280362606049, "ce_loss_9": 3.536642611026764, "epoch": 0.04, "grad_norm": 804.0, "kl_loss_13": 152.12560119628907, "kl_loss_2": 2803.1874267578123, "kl_loss_4": 1715.3488342285157, "kl_loss_9": 557.2631088256836, "learning_rate": 0.0009977359612865424, "loss": 1322.9314, "step": 400 }, { "ce_loss_13": 3.3324403047561644, "ce_loss_17": 3.2530759453773497, "ce_loss_2": 4.658740139007568, "ce_loss_4": 4.140996336936951, "ce_loss_9": 3.5597803592681885, "epoch": 0.041, "grad_norm": 628.0, "kl_loss_13": 150.96018981933594, "kl_loss_2": 2836.8416381835937, "kl_loss_4": 1791.5275756835938, "kl_loss_9": 580.1767044067383, "learning_rate": 0.0009975826335391806, "loss": 1326.1738, "step": 410 }, { "ce_loss_13": 3.3575534582138062, "ce_loss_17": 3.2765774965286254, "ce_loss_2": 4.641478061676025, "ce_loss_4": 4.087486684322357, "ce_loss_9": 3.5588002443313598, "epoch": 0.042, "grad_norm": 648.0, "kl_loss_13": 153.96022338867186, "kl_loss_2": 2786.4625732421873, "kl_loss_4": 1680.8336120605468, "kl_loss_9": 544.7720199584961, "learning_rate": 0.0009974242951402235, "loss": 1299.9223, "step": 420 }, { "ce_loss_13": 3.369288170337677, "ce_loss_17": 3.2808522462844847, "ce_loss_2": 4.661474442481994, "ce_loss_4": 4.091749536991119, "ce_loss_9": 3.5591038703918456, "epoch": 0.043, "grad_norm": 648.0, "kl_loss_13": 159.96430053710938, "kl_loss_2": 2818.1042724609374, "kl_loss_4": 1665.9103454589845, "kl_loss_9": 548.6190933227539, "learning_rate": 0.0009972609476841367, "loss": 1282.6076, "step": 430 }, { "ce_loss_13": 3.2764453291893005, "ce_loss_17": 3.1939987540245056, "ce_loss_2": 4.619550776481629, "ce_loss_4": 4.023915350437164, "ce_loss_9": 3.48334219455719, "epoch": 0.044, "grad_norm": 592.0, "kl_loss_13": 153.44165267944337, "kl_loss_2": 2867.0172119140625, "kl_loss_4": 1682.9174743652343, "kl_loss_9": 539.6375167846679, "learning_rate": 0.0009970925928158272, "loss": 1309.0494, "step": 440 }, { "ce_loss_13": 3.228921616077423, "ce_loss_17": 3.13667711019516, "ce_loss_2": 4.5878925085067745, "ce_loss_4": 3.9889357686042786, "ce_loss_9": 3.4261950492858886, "epoch": 0.045, "grad_norm": 664.0, "kl_loss_13": 168.95373001098633, "kl_loss_2": 2954.3875366210937, "kl_loss_4": 1740.5748657226563, "kl_loss_9": 553.5774597167969, "learning_rate": 0.000996919232230627, "loss": 1337.5051, "step": 450 }, { "ce_loss_13": 3.3128785133361816, "ce_loss_17": 3.225386643409729, "ce_loss_2": 4.60572247505188, "ce_loss_4": 4.035604953765869, "ce_loss_9": 3.499460959434509, "epoch": 0.046, "grad_norm": 668.0, "kl_loss_13": 157.04938583374025, "kl_loss_2": 2803.0527587890624, "kl_loss_4": 1673.0932739257812, "kl_loss_9": 525.0963729858398, "learning_rate": 0.0009967408676742752, "loss": 1261.277, "step": 460 }, { "ce_loss_13": 3.445576179027557, "ce_loss_17": 3.3635016322135924, "ce_loss_2": 4.702920699119568, "ce_loss_4": 4.1346688032150265, "ce_loss_9": 3.6396512627601623, "epoch": 0.047, "grad_norm": 692.0, "kl_loss_13": 152.30962295532225, "kl_loss_2": 2728.0104858398436, "kl_loss_4": 1598.9078186035156, "kl_loss_9": 538.0588653564453, "learning_rate": 0.0009965575009429006, "loss": 1291.6952, "step": 470 }, { "ce_loss_13": 3.2310086250305177, "ce_loss_17": 3.14866179227829, "ce_loss_2": 4.563737893104554, "ce_loss_4": 3.956212747097015, "ce_loss_9": 3.436033844947815, "epoch": 0.048, "grad_norm": 632.0, "kl_loss_13": 146.651318359375, "kl_loss_2": 2878.5802001953125, "kl_loss_4": 1666.7166137695312, "kl_loss_9": 539.7155288696289, "learning_rate": 0.0009963691338830043, "loss": 1287.7481, "step": 480 }, { "ce_loss_13": 3.3250165462493895, "ce_loss_17": 3.2457290410995485, "ce_loss_2": 4.615313935279846, "ce_loss_4": 4.033603394031525, "ce_loss_9": 3.5086933016777038, "epoch": 0.049, "grad_norm": 720.0, "kl_loss_13": 143.4732437133789, "kl_loss_2": 2802.4427124023437, "kl_loss_4": 1628.5113647460937, "kl_loss_9": 516.1418380737305, "learning_rate": 0.0009961757683914405, "loss": 1255.4686, "step": 490 }, { "ce_loss_13": 3.3277801632881165, "ce_loss_17": 3.235699439048767, "ce_loss_2": 4.563870191574097, "ce_loss_4": 4.019959104061127, "ce_loss_9": 3.5216776847839357, "epoch": 0.05, "grad_norm": 708.0, "kl_loss_13": 167.92219467163085, "kl_loss_2": 2734.481298828125, "kl_loss_4": 1638.4034912109375, "kl_loss_9": 555.2191772460938, "learning_rate": 0.0009959774064153978, "loss": 1278.8344, "step": 500 }, { "ce_loss_13": 3.3241262197494508, "ce_loss_17": 3.2492061614990235, "ce_loss_2": 4.542808580398559, "ce_loss_4": 3.9918776512145997, "ce_loss_9": 3.5068222641944886, "epoch": 0.051, "grad_norm": 624.0, "kl_loss_13": 140.5283073425293, "kl_loss_2": 2661.6624389648437, "kl_loss_4": 1559.182373046875, "kl_loss_9": 509.0259124755859, "learning_rate": 0.0009957740499523787, "loss": 1242.5957, "step": 510 }, { "ce_loss_13": 3.347599446773529, "ce_loss_17": 3.2621068120002747, "ce_loss_2": 4.5844261169433596, "ce_loss_4": 4.026308608055115, "ce_loss_9": 3.5309693813323975, "epoch": 0.052, "grad_norm": 656.0, "kl_loss_13": 155.69999313354492, "kl_loss_2": 2678.771203613281, "kl_loss_4": 1568.6827453613282, "kl_loss_9": 516.6230758666992, "learning_rate": 0.0009955657010501807, "loss": 1238.2941, "step": 520 }, { "ce_loss_13": 3.31034996509552, "ce_loss_17": 3.2209251523017883, "ce_loss_2": 4.5603924751281735, "ce_loss_4": 4.001909756660462, "ce_loss_9": 3.494111442565918, "epoch": 0.053, "grad_norm": 828.0, "kl_loss_13": 159.72130584716797, "kl_loss_2": 2732.4492431640624, "kl_loss_4": 1603.6663513183594, "kl_loss_9": 522.7829132080078, "learning_rate": 0.000995352361806875, "loss": 1243.36, "step": 530 }, { "ce_loss_13": 3.349541389942169, "ce_loss_17": 3.259581971168518, "ce_loss_2": 4.599472379684448, "ce_loss_4": 4.029084694385529, "ce_loss_9": 3.5391013860702514, "epoch": 0.054, "grad_norm": 660.0, "kl_loss_13": 157.69116287231446, "kl_loss_2": 2756.20224609375, "kl_loss_4": 1600.1287353515625, "kl_loss_9": 529.4489334106445, "learning_rate": 0.0009951340343707852, "loss": 1271.6059, "step": 540 }, { "ce_loss_13": 3.390316832065582, "ce_loss_17": 3.311418342590332, "ce_loss_2": 4.65280933380127, "ce_loss_4": 4.078594005107879, "ce_loss_9": 3.573176646232605, "epoch": 0.055, "grad_norm": 628.0, "kl_loss_13": 147.9464485168457, "kl_loss_2": 2711.2746459960936, "kl_loss_4": 1571.1894104003907, "kl_loss_9": 499.5439422607422, "learning_rate": 0.0009949107209404665, "loss": 1244.4903, "step": 550 }, { "ce_loss_13": 3.3250513792037966, "ce_loss_17": 3.2271708369255068, "ce_loss_2": 4.5295474290847775, "ce_loss_4": 3.989085817337036, "ce_loss_9": 3.4934698700904847, "epoch": 0.056, "grad_norm": 664.0, "kl_loss_13": 166.64837646484375, "kl_loss_2": 2681.1214965820313, "kl_loss_4": 1583.4888671875, "kl_loss_9": 496.79695892333984, "learning_rate": 0.0009946824237646824, "loss": 1236.8229, "step": 560 }, { "ce_loss_13": 3.267991077899933, "ce_loss_17": 3.179435741901398, "ce_loss_2": 4.517114925384521, "ce_loss_4": 3.9495364665985107, "ce_loss_9": 3.4468095779418944, "epoch": 0.057, "grad_norm": 596.0, "kl_loss_13": 168.04478225708007, "kl_loss_2": 2752.01181640625, "kl_loss_4": 1609.467059326172, "kl_loss_9": 518.3810485839844, "learning_rate": 0.0009944491451423828, "loss": 1277.3712, "step": 570 }, { "ce_loss_13": 3.2650999069213866, "ce_loss_17": 3.1744595170021057, "ce_loss_2": 4.5301364183425905, "ce_loss_4": 3.957633888721466, "ce_loss_9": 3.445773887634277, "epoch": 0.058, "grad_norm": 640.0, "kl_loss_13": 162.0532470703125, "kl_loss_2": 2773.3030639648437, "kl_loss_4": 1614.8263305664063, "kl_loss_9": 521.4058013916016, "learning_rate": 0.0009942108874226813, "loss": 1244.9752, "step": 580 }, { "ce_loss_13": 3.3783419728279114, "ce_loss_17": 3.290017545223236, "ce_loss_2": 4.564514803886413, "ce_loss_4": 4.015147042274475, "ce_loss_9": 3.547626256942749, "epoch": 0.059, "grad_norm": 676.0, "kl_loss_13": 158.40567779541016, "kl_loss_2": 2587.4620849609373, "kl_loss_4": 1504.674383544922, "kl_loss_9": 500.83845977783204, "learning_rate": 0.00099396765300483, "loss": 1187.66, "step": 590 }, { "ce_loss_13": 3.35656076669693, "ce_loss_17": 3.2701781749725343, "ce_loss_2": 4.545877623558044, "ce_loss_4": 4.003049039840699, "ce_loss_9": 3.5325764417648315, "epoch": 0.06, "grad_norm": 712.0, "kl_loss_13": 157.77887191772462, "kl_loss_2": 2607.5798461914064, "kl_loss_4": 1525.541796875, "kl_loss_9": 505.99817657470703, "learning_rate": 0.0009937194443381972, "loss": 1206.6717, "step": 600 }, { "ce_loss_13": 3.3773241877555846, "ce_loss_17": 3.2941529393196105, "ce_loss_2": 4.549521040916443, "ce_loss_4": 4.002827751636505, "ce_loss_9": 3.5425670981407165, "epoch": 0.061, "grad_norm": 600.0, "kl_loss_13": 156.81272048950194, "kl_loss_2": 2586.9384765625, "kl_loss_4": 1490.7980834960938, "kl_loss_9": 492.3127700805664, "learning_rate": 0.0009934662639222412, "loss": 1210.4365, "step": 610 }, { "ce_loss_13": 3.325599718093872, "ce_loss_17": 3.246423053741455, "ce_loss_2": 4.564360666275024, "ce_loss_4": 3.9915852546691895, "ce_loss_9": 3.5050143718719484, "epoch": 0.062, "grad_norm": 684.0, "kl_loss_13": 145.37355117797853, "kl_loss_2": 2708.588037109375, "kl_loss_4": 1551.0524780273438, "kl_loss_9": 509.78137054443357, "learning_rate": 0.000993208114306486, "loss": 1216.7808, "step": 620 }, { "ce_loss_13": 3.248555541038513, "ce_loss_17": 3.1725520133972167, "ce_loss_2": 4.505973243713379, "ce_loss_4": 3.927742075920105, "ce_loss_9": 3.437049090862274, "epoch": 0.063, "grad_norm": 664.0, "kl_loss_13": 141.67784690856934, "kl_loss_2": 2725.5945434570312, "kl_loss_4": 1559.2127990722656, "kl_loss_9": 512.6172515869141, "learning_rate": 0.0009929449980904952, "loss": 1202.8824, "step": 630 }, { "ce_loss_13": 3.3062828302383425, "ce_loss_17": 3.232676351070404, "ce_loss_2": 4.524214553833008, "ce_loss_4": 3.968895471096039, "ce_loss_9": 3.490144872665405, "epoch": 0.064, "grad_norm": 568.0, "kl_loss_13": 135.26087493896483, "kl_loss_2": 2661.7224609375, "kl_loss_4": 1541.601055908203, "kl_loss_9": 506.95104217529297, "learning_rate": 0.0009926769179238466, "loss": 1202.9591, "step": 640 }, { "ce_loss_13": 3.346583092212677, "ce_loss_17": 3.2691202402114867, "ce_loss_2": 4.551297330856324, "ce_loss_4": 4.013254928588867, "ce_loss_9": 3.567150342464447, "epoch": 0.065, "grad_norm": 884.0, "kl_loss_13": 140.0088653564453, "kl_loss_2": 2635.9703979492188, "kl_loss_4": 1548.76943359375, "kl_loss_9": 584.322395324707, "learning_rate": 0.000992403876506104, "loss": 1221.5454, "step": 650 }, { "ce_loss_13": 3.283110725879669, "ce_loss_17": 3.20692378282547, "ce_loss_2": 4.5054912805557255, "ce_loss_4": 3.93732008934021, "ce_loss_9": 3.500643289089203, "epoch": 0.066, "grad_norm": 612.0, "kl_loss_13": 133.65259857177733, "kl_loss_2": 2645.805798339844, "kl_loss_4": 1509.3870361328125, "kl_loss_9": 559.6844909667968, "learning_rate": 0.0009921258765867918, "loss": 1215.4954, "step": 660 }, { "ce_loss_13": 3.252826678752899, "ce_loss_17": 3.1817604064941407, "ce_loss_2": 4.496033573150635, "ce_loss_4": 3.9209199070930483, "ce_loss_9": 3.449157202243805, "epoch": 0.067, "grad_norm": 636.0, "kl_loss_13": 128.94365310668945, "kl_loss_2": 2730.5625854492187, "kl_loss_4": 1557.8232788085938, "kl_loss_9": 518.3787063598633, "learning_rate": 0.0009918429209653662, "loss": 1209.2983, "step": 670 }, { "ce_loss_13": 3.3021888971328734, "ce_loss_17": 3.233319938182831, "ce_loss_2": 4.521537590026855, "ce_loss_4": 3.9559776186943054, "ce_loss_9": 3.4880390763282776, "epoch": 0.068, "grad_norm": 628.0, "kl_loss_13": 127.2779327392578, "kl_loss_2": 2667.96015625, "kl_loss_4": 1519.8798095703125, "kl_loss_9": 498.82932434082034, "learning_rate": 0.0009915550124911866, "loss": 1176.0943, "step": 680 }, { "ce_loss_13": 3.315458345413208, "ce_loss_17": 3.2440866470336913, "ce_loss_2": 4.5074906826019285, "ce_loss_4": 3.9628826498985292, "ce_loss_9": 3.500523793697357, "epoch": 0.069, "grad_norm": 608.0, "kl_loss_13": 127.5838581085205, "kl_loss_2": 2602.7898559570312, "kl_loss_4": 1498.2195983886718, "kl_loss_9": 492.14330291748047, "learning_rate": 0.0009912621540634887, "loss": 1182.3338, "step": 690 }, { "ce_loss_13": 3.3509831070899962, "ce_loss_17": 3.280188775062561, "ce_loss_2": 4.51383101940155, "ce_loss_4": 3.958842468261719, "ce_loss_9": 3.520415389537811, "epoch": 0.07, "grad_norm": 644.0, "kl_loss_13": 123.37131958007812, "kl_loss_2": 2550.901086425781, "kl_loss_4": 1436.4951538085938, "kl_loss_9": 472.02699279785156, "learning_rate": 0.0009909643486313534, "loss": 1159.7512, "step": 700 }, { "ce_loss_13": 3.2315321445465086, "ce_loss_17": 3.160666084289551, "ce_loss_2": 4.456056094169616, "ce_loss_4": 3.886583685874939, "ce_loss_9": 3.4124111890792848, "epoch": 0.071, "grad_norm": 616.0, "kl_loss_13": 127.64315071105958, "kl_loss_2": 2668.676965332031, "kl_loss_4": 1520.346856689453, "kl_loss_9": 484.3673568725586, "learning_rate": 0.000990661599193678, "loss": 1218.8443, "step": 710 }, { "ce_loss_13": 3.353931748867035, "ce_loss_17": 3.2803229689598083, "ce_loss_2": 4.531432890892029, "ce_loss_4": 4.003308618068695, "ce_loss_9": 3.526760494709015, "epoch": 0.072, "grad_norm": 676.0, "kl_loss_13": 131.8408176422119, "kl_loss_2": 2572.8404541015625, "kl_loss_4": 1497.0823486328125, "kl_loss_9": 465.3206527709961, "learning_rate": 0.0009903539087991462, "loss": 1167.6391, "step": 720 }, { "ce_loss_13": 3.332451033592224, "ce_loss_17": 3.2614585876464846, "ce_loss_2": 4.516555666923523, "ce_loss_4": 3.976523768901825, "ce_loss_9": 3.5042252063751222, "epoch": 0.073, "grad_norm": 612.0, "kl_loss_13": 127.52910423278809, "kl_loss_2": 2588.143310546875, "kl_loss_4": 1491.805487060547, "kl_loss_9": 474.08206634521486, "learning_rate": 0.0009900412805461966, "loss": 1178.3563, "step": 730 }, { "ce_loss_13": 3.4028484225273132, "ce_loss_17": 3.3355210065841674, "ce_loss_2": 4.5544538497924805, "ce_loss_4": 4.019611120223999, "ce_loss_9": 3.577867567539215, "epoch": 0.074, "grad_norm": 712.0, "kl_loss_13": 126.20948028564453, "kl_loss_2": 2530.4266357421875, "kl_loss_4": 1446.06923828125, "kl_loss_9": 465.1686264038086, "learning_rate": 0.0009897237175829927, "loss": 1162.8408, "step": 740 }, { "ce_loss_13": 3.3002208709716796, "ce_loss_17": 3.2221954345703123, "ce_loss_2": 4.4848315715789795, "ce_loss_4": 3.9426618337631227, "ce_loss_9": 3.4645259857177733, "epoch": 0.075, "grad_norm": 636.0, "kl_loss_13": 136.19006538391113, "kl_loss_2": 2616.3106323242187, "kl_loss_4": 1516.6720825195312, "kl_loss_9": 474.8717376708984, "learning_rate": 0.0009894012231073895, "loss": 1176.7424, "step": 750 }, { "ce_loss_13": 3.3391860246658327, "ce_loss_17": 3.2671860218048097, "ce_loss_2": 4.515418291091919, "ce_loss_4": 3.9612865805625916, "ce_loss_9": 3.5085081100463866, "epoch": 0.076, "grad_norm": 604.0, "kl_loss_13": 137.293404006958, "kl_loss_2": 2573.784558105469, "kl_loss_4": 1466.8589294433593, "kl_loss_9": 468.3266311645508, "learning_rate": 0.0009890738003669028, "loss": 1172.3907, "step": 760 }, { "ce_loss_13": 3.312580978870392, "ce_loss_17": 3.2395971775054933, "ce_loss_2": 4.523102068901062, "ce_loss_4": 3.9617782711982725, "ce_loss_9": 3.4849427342414856, "epoch": 0.077, "grad_norm": 600.0, "kl_loss_13": 135.65264129638672, "kl_loss_2": 2657.336071777344, "kl_loss_4": 1531.0789855957032, "kl_loss_9": 475.88781585693357, "learning_rate": 0.0009887414526586764, "loss": 1164.7951, "step": 770 }, { "ce_loss_13": 3.363868010044098, "ce_loss_17": 3.2938335061073305, "ce_loss_2": 4.5332248449325565, "ce_loss_4": 3.9797385334968567, "ce_loss_9": 3.52528201341629, "epoch": 0.078, "grad_norm": 624.0, "kl_loss_13": 123.87691078186035, "kl_loss_2": 2563.1283447265623, "kl_loss_4": 1439.5722351074219, "kl_loss_9": 451.4059295654297, "learning_rate": 0.0009884041833294476, "loss": 1131.074, "step": 780 }, { "ce_loss_13": 3.366063630580902, "ce_loss_17": 3.2975459814071657, "ce_loss_2": 4.51762444972992, "ce_loss_4": 3.9779760360717775, "ce_loss_9": 3.530991554260254, "epoch": 0.079, "grad_norm": 672.0, "kl_loss_13": 121.06926002502442, "kl_loss_2": 2539.3625732421874, "kl_loss_4": 1439.5869018554688, "kl_loss_9": 458.55997467041016, "learning_rate": 0.000988061995775515, "loss": 1162.3846, "step": 790 }, { "ce_loss_13": 3.29613002538681, "ce_loss_17": 3.2262673020362853, "ce_loss_2": 4.4409942626953125, "ce_loss_4": 3.9230281710624695, "ce_loss_9": 3.4621812462806703, "epoch": 0.08, "grad_norm": 596.0, "kl_loss_13": 126.79234085083007, "kl_loss_2": 2531.2521362304688, "kl_loss_4": 1482.3507019042968, "kl_loss_9": 465.85094757080077, "learning_rate": 0.0009877148934427035, "loss": 1146.9053, "step": 800 }, { "ce_loss_13": 3.3456916570663453, "ce_loss_17": 3.2675469279289246, "ce_loss_2": 4.4991215467453, "ce_loss_4": 3.941481518745422, "ce_loss_9": 3.500859725475311, "epoch": 0.081, "grad_norm": 644.0, "kl_loss_13": 133.1513931274414, "kl_loss_2": 2553.1867919921874, "kl_loss_4": 1434.2838317871094, "kl_loss_9": 449.13096771240237, "learning_rate": 0.0009873628798263297, "loss": 1131.1455, "step": 810 }, { "ce_loss_13": 3.294445109367371, "ce_loss_17": 3.223985254764557, "ce_loss_2": 4.428789710998535, "ce_loss_4": 3.88575097322464, "ce_loss_9": 3.4480647444725037, "epoch": 0.082, "grad_norm": 576.0, "kl_loss_13": 131.85743446350097, "kl_loss_2": 2488.2187744140624, "kl_loss_4": 1409.9474609375, "kl_loss_9": 438.8127838134766, "learning_rate": 0.0009870059584711668, "loss": 1150.1537, "step": 820 }, { "ce_loss_13": 3.3084290266036986, "ce_loss_17": 3.2389563798904417, "ce_loss_2": 4.449469542503357, "ce_loss_4": 3.916575002670288, "ce_loss_9": 3.4716209053993223, "epoch": 0.083, "grad_norm": 628.0, "kl_loss_13": 122.97907829284668, "kl_loss_2": 2519.50517578125, "kl_loss_4": 1433.108642578125, "kl_loss_9": 449.61902923583983, "learning_rate": 0.000986644132971409, "loss": 1128.2717, "step": 830 }, { "ce_loss_13": 3.295887219905853, "ce_loss_17": 3.2251924514770507, "ce_loss_2": 4.464026820659638, "ce_loss_4": 3.9351455330848695, "ce_loss_9": 3.4617645859718325, "epoch": 0.084, "grad_norm": 616.0, "kl_loss_13": 126.87683448791503, "kl_loss_2": 2557.2621459960938, "kl_loss_4": 1477.8490234375, "kl_loss_9": 461.0807281494141, "learning_rate": 0.0009862774069706345, "loss": 1142.0318, "step": 840 }, { "ce_loss_13": 3.4207282423973084, "ce_loss_17": 3.355008363723755, "ce_loss_2": 4.518818354606628, "ce_loss_4": 4.004429590702057, "ce_loss_9": 3.5739036083221434, "epoch": 0.085, "grad_norm": 704.0, "kl_loss_13": 124.23913230895997, "kl_loss_2": 2454.3523193359374, "kl_loss_4": 1417.0260314941406, "kl_loss_9": 446.0335723876953, "learning_rate": 0.000985905784161771, "loss": 1124.835, "step": 850 }, { "ce_loss_13": 3.347672176361084, "ce_loss_17": 3.281836974620819, "ce_loss_2": 4.472033071517944, "ce_loss_4": 3.9360882163047792, "ce_loss_9": 3.499380612373352, "epoch": 0.086, "grad_norm": 644.0, "kl_loss_13": 118.70066299438477, "kl_loss_2": 2478.5536499023438, "kl_loss_4": 1409.6584350585938, "kl_loss_9": 433.6291168212891, "learning_rate": 0.000985529268287055, "loss": 1107.5163, "step": 860 }, { "ce_loss_13": 3.276761054992676, "ce_loss_17": 3.2099615335464478, "ce_loss_2": 4.449923253059387, "ce_loss_4": 3.8927905440330504, "ce_loss_9": 3.438116121292114, "epoch": 0.087, "grad_norm": 672.0, "kl_loss_13": 119.32656173706054, "kl_loss_2": 2563.2444702148437, "kl_loss_4": 1447.4735412597656, "kl_loss_9": 440.7087692260742, "learning_rate": 0.0009851478631379982, "loss": 1138.2113, "step": 870 }, { "ce_loss_13": 3.338916289806366, "ce_loss_17": 3.2712655782699587, "ce_loss_2": 4.4808837890625, "ce_loss_4": 3.9395508766174316, "ce_loss_9": 3.500311553478241, "epoch": 0.088, "grad_norm": 588.0, "kl_loss_13": 117.52669868469238, "kl_loss_2": 2510.5021240234373, "kl_loss_4": 1403.144793701172, "kl_loss_9": 436.85083923339846, "learning_rate": 0.0009847615725553456, "loss": 1117.7988, "step": 880 }, { "ce_loss_13": 3.391915798187256, "ce_loss_17": 3.326561784744263, "ce_loss_2": 4.47606725692749, "ce_loss_4": 3.9601453065872194, "ce_loss_9": 3.5397919535636904, "epoch": 0.089, "grad_norm": 620.0, "kl_loss_13": 113.68915100097657, "kl_loss_2": 2387.3920288085938, "kl_loss_4": 1350.6962036132813, "kl_loss_9": 414.06656951904296, "learning_rate": 0.0009843704004290394, "loss": 1110.0278, "step": 890 }, { "ce_loss_13": 3.298319697380066, "ce_loss_17": 3.2356975436210633, "ce_loss_2": 4.430491948127747, "ce_loss_4": 3.9021120190620424, "ce_loss_9": 3.455435812473297, "epoch": 0.09, "grad_norm": 652.0, "kl_loss_13": 119.12698974609376, "kl_loss_2": 2516.066857910156, "kl_loss_4": 1438.2920654296875, "kl_loss_9": 444.22851409912107, "learning_rate": 0.0009839743506981783, "loss": 1126.7336, "step": 900 }, { "ce_loss_13": 3.2220940709114076, "ce_loss_17": 3.1536363482475283, "ce_loss_2": 4.412917733192444, "ce_loss_4": 3.8601535081863405, "ce_loss_9": 3.38604793548584, "epoch": 0.091, "grad_norm": 644.0, "kl_loss_13": 120.22522850036621, "kl_loss_2": 2636.6438842773437, "kl_loss_4": 1504.226806640625, "kl_loss_9": 457.072492980957, "learning_rate": 0.0009835734273509786, "loss": 1149.7051, "step": 910 }, { "ce_loss_13": 3.3127182841300966, "ce_loss_17": 3.246254229545593, "ce_loss_2": 4.4611786842346195, "ce_loss_4": 3.927472507953644, "ce_loss_9": 3.4767918229103087, "epoch": 0.092, "grad_norm": 616.0, "kl_loss_13": 116.29349250793457, "kl_loss_2": 2488.740368652344, "kl_loss_4": 1402.4096435546876, "kl_loss_9": 434.8053451538086, "learning_rate": 0.0009831676344247342, "loss": 1119.2193, "step": 920 }, { "ce_loss_13": 3.3306243419647217, "ce_loss_17": 3.269066500663757, "ce_loss_2": 4.432948970794678, "ce_loss_4": 3.9210447788238527, "ce_loss_9": 3.4840231418609617, "epoch": 0.093, "grad_norm": 616.0, "kl_loss_13": 113.15401573181153, "kl_loss_2": 2442.289599609375, "kl_loss_4": 1401.3232055664062, "kl_loss_9": 428.3973327636719, "learning_rate": 0.0009827569760057755, "loss": 1116.0459, "step": 930 }, { "ce_loss_13": 3.252093029022217, "ce_loss_17": 3.18474862575531, "ce_loss_2": 4.465179300308227, "ce_loss_4": 3.8863181352615355, "ce_loss_9": 3.418609654903412, "epoch": 0.094, "grad_norm": 740.0, "kl_loss_13": 118.65354614257812, "kl_loss_2": 2651.7362548828123, "kl_loss_4": 1487.0295043945312, "kl_loss_9": 452.91857757568357, "learning_rate": 0.000982341456229428, "loss": 1138.2396, "step": 940 }, { "ce_loss_13": 3.34140704870224, "ce_loss_17": 3.277027201652527, "ce_loss_2": 4.492311930656433, "ce_loss_4": 3.9498108267784118, "ce_loss_9": 3.5043185353279114, "epoch": 0.095, "grad_norm": 588.0, "kl_loss_13": 114.85740127563477, "kl_loss_2": 2544.40927734375, "kl_loss_4": 1438.683056640625, "kl_loss_9": 444.57925872802736, "learning_rate": 0.000981921079279971, "loss": 1106.5577, "step": 950 }, { "ce_loss_13": 3.3622724056243896, "ce_loss_17": 3.300556206703186, "ce_loss_2": 4.42045681476593, "ce_loss_4": 3.9144128561019897, "ce_loss_9": 3.5092538595199585, "epoch": 0.096, "grad_norm": 608.0, "kl_loss_13": 112.38622970581055, "kl_loss_2": 2378.4241333007812, "kl_loss_4": 1349.9152770996093, "kl_loss_9": 425.52757568359374, "learning_rate": 0.0009814958493905962, "loss": 1082.7883, "step": 960 }, { "ce_loss_13": 3.314443755149841, "ce_loss_17": 3.249697279930115, "ce_loss_2": 4.458523607254028, "ce_loss_4": 3.919697678089142, "ce_loss_9": 3.47161465883255, "epoch": 0.097, "grad_norm": 640.0, "kl_loss_13": 115.82365684509277, "kl_loss_2": 2514.8928100585936, "kl_loss_4": 1421.7318969726562, "kl_loss_9": 442.53113708496096, "learning_rate": 0.0009810657708433637, "loss": 1140.2196, "step": 970 }, { "ce_loss_13": 3.394575309753418, "ce_loss_17": 3.3281876921653746, "ce_loss_2": 4.4481121301651, "ce_loss_4": 3.9479944825172426, "ce_loss_9": 3.539068377017975, "epoch": 0.098, "grad_norm": 600.0, "kl_loss_13": 122.24889755249023, "kl_loss_2": 2346.068981933594, "kl_loss_4": 1342.0567199707032, "kl_loss_9": 424.09859161376954, "learning_rate": 0.0009806308479691594, "loss": 1072.8215, "step": 980 }, { "ce_loss_13": 3.402330148220062, "ce_loss_17": 3.331631302833557, "ce_loss_2": 4.497003102302552, "ce_loss_4": 3.9845388293266297, "ce_loss_9": 3.5640967011451723, "epoch": 0.099, "grad_norm": 668.0, "kl_loss_13": 126.92496185302734, "kl_loss_2": 2429.3766479492188, "kl_loss_4": 1386.66455078125, "kl_loss_9": 448.76985321044924, "learning_rate": 0.0009801910851476522, "loss": 1093.9625, "step": 990 }, { "ce_loss_13": 3.318433976173401, "ce_loss_17": 3.2502185463905335, "ce_loss_2": 4.4634592771530155, "ce_loss_4": 3.917101538181305, "ce_loss_9": 3.477748930454254, "epoch": 0.1, "grad_norm": 628.0, "kl_loss_13": 123.46217765808106, "kl_loss_2": 2552.455029296875, "kl_loss_4": 1429.8879516601562, "kl_loss_9": 456.84090118408204, "learning_rate": 0.0009797464868072487, "loss": 1110.9061, "step": 1000 }, { "ce_loss_13": 3.3047823667526246, "ce_loss_17": 3.238838028907776, "ce_loss_2": 4.421839463710785, "ce_loss_4": 3.9042311549186706, "ce_loss_9": 3.4642874002456665, "epoch": 0.101, "grad_norm": 600.0, "kl_loss_13": 120.86031990051269, "kl_loss_2": 2483.5817749023436, "kl_loss_4": 1423.0612670898438, "kl_loss_9": 453.3130477905273, "learning_rate": 0.0009792970574250492, "loss": 1113.9136, "step": 1010 }, { "ce_loss_13": 3.3296589612960816, "ce_loss_17": 3.264625906944275, "ce_loss_2": 4.431443309783935, "ce_loss_4": 3.9199029326438906, "ce_loss_9": 3.49114305973053, "epoch": 0.102, "grad_norm": 628.0, "kl_loss_13": 115.92987747192383, "kl_loss_2": 2445.0037353515627, "kl_loss_4": 1405.2209777832031, "kl_loss_9": 437.9608520507812, "learning_rate": 0.0009788428015268028, "loss": 1084.3144, "step": 1020 }, { "ce_loss_13": 3.330202805995941, "ce_loss_17": 3.264923906326294, "ce_loss_2": 4.413852691650391, "ce_loss_4": 3.8963095426559446, "ce_loss_9": 3.476387345790863, "epoch": 0.103, "grad_norm": 668.0, "kl_loss_13": 121.63944854736329, "kl_loss_2": 2417.3347045898436, "kl_loss_4": 1368.259210205078, "kl_loss_9": 420.54846343994143, "learning_rate": 0.0009783837236868609, "loss": 1081.6119, "step": 1030 }, { "ce_loss_13": 3.2946935057640077, "ce_loss_17": 3.2287474393844606, "ce_loss_2": 4.396254158020019, "ce_loss_4": 3.8797624588012694, "ce_loss_9": 3.4512138962745667, "epoch": 0.104, "grad_norm": 616.0, "kl_loss_13": 116.54061508178711, "kl_loss_2": 2410.5272583007813, "kl_loss_4": 1376.435107421875, "kl_loss_9": 430.11146240234376, "learning_rate": 0.0009779198285281327, "loss": 1080.1969, "step": 1040 }, { "ce_loss_13": 3.285519337654114, "ce_loss_17": 3.2236326575279235, "ce_loss_2": 4.41151978969574, "ce_loss_4": 3.881605124473572, "ce_loss_9": 3.4398640751838685, "epoch": 0.105, "grad_norm": 608.0, "kl_loss_13": 111.27123718261718, "kl_loss_2": 2481.2762451171875, "kl_loss_4": 1399.8256469726562, "kl_loss_9": 421.35252685546874, "learning_rate": 0.0009774511207220368, "loss": 1095.8346, "step": 1050 }, { "ce_loss_13": 3.3345521807670595, "ce_loss_17": 3.2681204080581665, "ce_loss_2": 4.447642612457275, "ce_loss_4": 3.9166941165924074, "ce_loss_9": 3.4915797114372253, "epoch": 0.106, "grad_norm": 640.0, "kl_loss_13": 113.97708396911621, "kl_loss_2": 2457.1487548828127, "kl_loss_4": 1378.8427001953125, "kl_loss_9": 430.92195892333984, "learning_rate": 0.0009769776049884564, "loss": 1091.3941, "step": 1060 }, { "ce_loss_13": 3.2423794984817507, "ce_loss_17": 3.1796736478805543, "ce_loss_2": 4.388264894485474, "ce_loss_4": 3.845630931854248, "ce_loss_9": 3.4036087512969972, "epoch": 0.107, "grad_norm": 700.0, "kl_loss_13": 112.91108932495118, "kl_loss_2": 2523.034704589844, "kl_loss_4": 1431.7101135253906, "kl_loss_9": 439.0070556640625, "learning_rate": 0.0009764992860956889, "loss": 1128.451, "step": 1070 }, { "ce_loss_13": 3.393851613998413, "ce_loss_17": 3.3325206995010377, "ce_loss_2": 4.425472116470337, "ce_loss_4": 3.9404685735702514, "ce_loss_9": 3.532415735721588, "epoch": 0.108, "grad_norm": 592.0, "kl_loss_13": 109.46063995361328, "kl_loss_2": 2309.0443603515623, "kl_loss_4": 1306.1042724609374, "kl_loss_9": 404.70620880126955, "learning_rate": 0.0009760161688604008, "loss": 1054.0623, "step": 1080 }, { "ce_loss_13": 3.391258454322815, "ce_loss_17": 3.329734480381012, "ce_loss_2": 4.481240391731262, "ce_loss_4": 3.973934280872345, "ce_loss_9": 3.543959391117096, "epoch": 0.109, "grad_norm": 712.0, "kl_loss_13": 110.02838973999023, "kl_loss_2": 2395.6950866699217, "kl_loss_4": 1360.7293579101563, "kl_loss_9": 417.4671920776367, "learning_rate": 0.0009755282581475768, "loss": 1081.7303, "step": 1090 }, { "ce_loss_13": 3.4368482708930967, "ce_loss_17": 3.3751877665519716, "ce_loss_2": 4.501154685020447, "ce_loss_4": 3.994403803348541, "ce_loss_9": 3.5841227054595945, "epoch": 0.11, "grad_norm": 644.0, "kl_loss_13": 116.24259529113769, "kl_loss_2": 2357.111456298828, "kl_loss_4": 1332.5497741699219, "kl_loss_9": 419.2577301025391, "learning_rate": 0.0009750355588704727, "loss": 1050.8812, "step": 1100 }, { "ce_loss_13": 3.2831246733665465, "ce_loss_17": 3.2204254388809206, "ce_loss_2": 4.386692416667938, "ce_loss_4": 3.8651437640190123, "ce_loss_9": 3.4342461109161375, "epoch": 0.111, "grad_norm": 636.0, "kl_loss_13": 112.50919799804687, "kl_loss_2": 2414.4412109375, "kl_loss_4": 1366.421612548828, "kl_loss_9": 416.9934906005859, "learning_rate": 0.0009745380759905647, "loss": 1097.4557, "step": 1110 }, { "ce_loss_13": 3.2317030668258666, "ce_loss_17": 3.1717841029167175, "ce_loss_2": 4.353304851055145, "ce_loss_4": 3.8201396465301514, "ce_loss_9": 3.3812498927116392, "epoch": 0.112, "grad_norm": 668.0, "kl_loss_13": 109.91761131286621, "kl_loss_2": 2468.8907836914063, "kl_loss_4": 1382.1518798828124, "kl_loss_9": 416.75934143066405, "learning_rate": 0.0009740358145174998, "loss": 1106.3107, "step": 1120 }, { "ce_loss_13": 3.3809526681900026, "ce_loss_17": 3.3203678369522094, "ce_loss_2": 4.416502618789673, "ce_loss_4": 3.9220649361610413, "ce_loss_9": 3.5257248759269713, "epoch": 0.113, "grad_norm": 588.0, "kl_loss_13": 108.38679389953613, "kl_loss_2": 2327.7776733398437, "kl_loss_4": 1309.3333618164063, "kl_loss_9": 405.4641632080078, "learning_rate": 0.0009735287795090455, "loss": 1055.8825, "step": 1130 }, { "ce_loss_13": 3.275561845302582, "ce_loss_17": 3.2169222950935366, "ce_loss_2": 4.37807993888855, "ce_loss_4": 3.8567439675331117, "ce_loss_9": 3.4257020950317383, "epoch": 0.114, "grad_norm": 768.0, "kl_loss_13": 110.35262985229492, "kl_loss_2": 2425.5908203125, "kl_loss_4": 1374.9910888671875, "kl_loss_9": 407.8123291015625, "learning_rate": 0.0009730169760710386, "loss": 1073.9307, "step": 1140 }, { "ce_loss_13": 3.359781765937805, "ce_loss_17": 3.2924877524375917, "ce_loss_2": 4.439531350135804, "ce_loss_4": 3.921218383312225, "ce_loss_9": 3.499290108680725, "epoch": 0.115, "grad_norm": 676.0, "kl_loss_13": 120.64373970031738, "kl_loss_2": 2390.1439697265623, "kl_loss_4": 1339.1471130371094, "kl_loss_9": 405.00170745849607, "learning_rate": 0.0009725004093573342, "loss": 1071.8246, "step": 1150 }, { "ce_loss_13": 3.3038479328155517, "ce_loss_17": 3.2363712787628174, "ce_loss_2": 4.378937613964081, "ce_loss_4": 3.8651722073554993, "ce_loss_9": 3.457583713531494, "epoch": 0.116, "grad_norm": 792.0, "kl_loss_13": 119.20105934143066, "kl_loss_2": 2358.399365234375, "kl_loss_4": 1332.2092834472655, "kl_loss_9": 415.86485595703124, "learning_rate": 0.0009719790845697534, "loss": 1053.0682, "step": 1160 }, { "ce_loss_13": 3.25511212348938, "ce_loss_17": 3.195575773715973, "ce_loss_2": 4.293286681175232, "ce_loss_4": 3.800669753551483, "ce_loss_9": 3.396199142932892, "epoch": 0.117, "grad_norm": 748.0, "kl_loss_13": 112.51908836364746, "kl_loss_2": 2321.992822265625, "kl_loss_4": 1310.780859375, "kl_loss_9": 399.5624725341797, "learning_rate": 0.0009714530069580309, "loss": 1042.5032, "step": 1170 }, { "ce_loss_13": 3.349726128578186, "ce_loss_17": 3.282489287853241, "ce_loss_2": 4.418605291843415, "ce_loss_4": 3.918584477901459, "ce_loss_9": 3.4994783282279966, "epoch": 0.118, "grad_norm": 792.0, "kl_loss_13": 124.80969619750977, "kl_loss_2": 2370.46015625, "kl_loss_4": 1353.2211547851562, "kl_loss_9": 426.6817199707031, "learning_rate": 0.0009709221818197624, "loss": 1064.6255, "step": 1180 }, { "ce_loss_13": 3.392550361156464, "ce_loss_17": 3.3208803057670595, "ce_loss_2": 4.470854425430298, "ce_loss_4": 3.958542823791504, "ce_loss_9": 3.533000814914703, "epoch": 0.119, "grad_norm": 796.0, "kl_loss_13": 131.01845664978026, "kl_loss_2": 2407.263720703125, "kl_loss_4": 1353.18388671875, "kl_loss_9": 423.02639923095705, "learning_rate": 0.0009703866145003512, "loss": 1079.4176, "step": 1190 }, { "ce_loss_13": 3.3545722484588625, "ce_loss_17": 3.2938650846481323, "ce_loss_2": 4.412098550796509, "ce_loss_4": 3.91774640083313, "ce_loss_9": 3.509088408946991, "epoch": 0.12, "grad_norm": 780.0, "kl_loss_13": 115.15438537597656, "kl_loss_2": 2365.2002075195314, "kl_loss_4": 1348.0827087402345, "kl_loss_9": 422.36648712158205, "learning_rate": 0.0009698463103929542, "loss": 1080.3512, "step": 1200 }, { "ce_loss_13": 3.3183128476142882, "ce_loss_17": 3.2566781520843504, "ce_loss_2": 4.403877472877502, "ce_loss_4": 3.8917479276657105, "ce_loss_9": 3.478835713863373, "epoch": 0.121, "grad_norm": 580.0, "kl_loss_13": 121.92639770507813, "kl_loss_2": 2379.6752685546876, "kl_loss_4": 1344.7351135253907, "kl_loss_9": 425.500993347168, "learning_rate": 0.0009693012749384279, "loss": 1078.7273, "step": 1210 }, { "ce_loss_13": 3.3381242394447326, "ce_loss_17": 3.2690067172050474, "ce_loss_2": 4.387892174720764, "ce_loss_4": 3.8845514178276064, "ce_loss_9": 3.4794100522994995, "epoch": 0.122, "grad_norm": 712.0, "kl_loss_13": 125.97309150695801, "kl_loss_2": 2354.9815185546877, "kl_loss_4": 1331.7310668945313, "kl_loss_9": 414.407373046875, "learning_rate": 0.0009687515136252732, "loss": 1053.1852, "step": 1220 }, { "ce_loss_13": 3.291333258152008, "ce_loss_17": 3.2255096673965453, "ce_loss_2": 4.409121894836426, "ce_loss_4": 3.860208344459534, "ce_loss_9": 3.434011220932007, "epoch": 0.123, "grad_norm": 752.0, "kl_loss_13": 118.75255470275879, "kl_loss_2": 2480.691931152344, "kl_loss_4": 1372.1910034179687, "kl_loss_9": 420.0564437866211, "learning_rate": 0.0009681970319895803, "loss": 1096.9516, "step": 1230 }, { "ce_loss_13": 3.367513132095337, "ce_loss_17": 3.3113282799720762, "ce_loss_2": 4.422825336456299, "ce_loss_4": 3.913813602924347, "ce_loss_9": 3.5158339619636534, "epoch": 0.124, "grad_norm": 636.0, "kl_loss_13": 111.10784111022949, "kl_loss_2": 2337.798858642578, "kl_loss_4": 1311.4879638671875, "kl_loss_9": 407.9518844604492, "learning_rate": 0.0009676378356149733, "loss": 1043.2992, "step": 1240 }, { "ce_loss_13": 3.3376001119613647, "ce_loss_17": 3.2791189193725585, "ce_loss_2": 4.370033013820648, "ce_loss_4": 3.8703083753585816, "ce_loss_9": 3.4755295276641847, "epoch": 0.125, "grad_norm": 688.0, "kl_loss_13": 106.5986442565918, "kl_loss_2": 2303.264697265625, "kl_loss_4": 1290.0798889160155, "kl_loss_9": 397.0188186645508, "learning_rate": 0.0009670739301325534, "loss": 1032.0988, "step": 1250 }, { "ce_loss_13": 3.3012579560279844, "ce_loss_17": 3.2405815005302427, "ce_loss_2": 4.3439412117004395, "ce_loss_4": 3.8545931935310365, "ce_loss_9": 3.4449319005012513, "epoch": 0.126, "grad_norm": 648.0, "kl_loss_13": 107.16764678955079, "kl_loss_2": 2308.397021484375, "kl_loss_4": 1315.3893249511718, "kl_loss_9": 401.0367233276367, "learning_rate": 0.0009665053212208426, "loss": 1047.9859, "step": 1260 }, { "ce_loss_13": 3.3390562295913697, "ce_loss_17": 3.2840550541877747, "ce_loss_2": 4.403453612327576, "ce_loss_4": 3.890084075927734, "ce_loss_9": 3.48423947095871, "epoch": 0.127, "grad_norm": 692.0, "kl_loss_13": 106.97278785705566, "kl_loss_2": 2372.2412658691405, "kl_loss_4": 1327.5496520996094, "kl_loss_9": 406.0030044555664, "learning_rate": 0.0009659320146057262, "loss": 1051.2734, "step": 1270 }, { "ce_loss_13": 3.3499784231185914, "ce_loss_17": 3.2922070741653444, "ce_loss_2": 4.3855064630508425, "ce_loss_4": 3.894232487678528, "ce_loss_9": 3.4922064304351808, "epoch": 0.128, "grad_norm": 612.0, "kl_loss_13": 103.24569854736328, "kl_loss_2": 2309.586083984375, "kl_loss_4": 1304.8849609375, "kl_loss_9": 395.5906951904297, "learning_rate": 0.0009653540160603955, "loss": 1033.1272, "step": 1280 }, { "ce_loss_13": 3.3521631360054016, "ce_loss_17": 3.2958574414253237, "ce_loss_2": 4.3798192024230955, "ce_loss_4": 3.8924019932746887, "ce_loss_9": 3.48896187543869, "epoch": 0.129, "grad_norm": 724.0, "kl_loss_13": 103.10880165100097, "kl_loss_2": 2327.2175170898436, "kl_loss_4": 1318.9538818359374, "kl_loss_9": 393.9052215576172, "learning_rate": 0.0009647713314052896, "loss": 1023.6325, "step": 1290 }, { "ce_loss_13": 3.300455665588379, "ce_loss_17": 3.238513100147247, "ce_loss_2": 4.398746824264526, "ce_loss_4": 3.8899983763694763, "ce_loss_9": 3.4518206238746645, "epoch": 0.13, "grad_norm": 592.0, "kl_loss_13": 105.94969062805175, "kl_loss_2": 2423.0098388671877, "kl_loss_4": 1370.7003967285157, "kl_loss_9": 407.3703353881836, "learning_rate": 0.0009641839665080363, "loss": 1067.2344, "step": 1300 }, { "ce_loss_13": 3.263159728050232, "ce_loss_17": 3.2069602012634277, "ce_loss_2": 4.329943525791168, "ce_loss_4": 3.8214871406555178, "ce_loss_9": 3.410001218318939, "epoch": 0.131, "grad_norm": 732.0, "kl_loss_13": 102.1422721862793, "kl_loss_2": 2348.4533203125, "kl_loss_4": 1318.508233642578, "kl_loss_9": 402.711003112793, "learning_rate": 0.0009635919272833937, "loss": 1031.7428, "step": 1310 }, { "ce_loss_13": 3.2946990489959718, "ce_loss_17": 3.2358811616897585, "ce_loss_2": 4.366981887817383, "ce_loss_4": 3.8651524066925047, "ce_loss_9": 3.4487127661705017, "epoch": 0.132, "grad_norm": 704.0, "kl_loss_13": 104.14997596740723, "kl_loss_2": 2343.723876953125, "kl_loss_4": 1323.0115295410155, "kl_loss_9": 405.50807189941406, "learning_rate": 0.0009629952196931902, "loss": 1025.0818, "step": 1320 }, { "ce_loss_13": 3.286488246917725, "ce_loss_17": 3.2299813985824586, "ce_loss_2": 4.354435992240906, "ce_loss_4": 3.831107270717621, "ce_loss_9": 3.4259880065917967, "epoch": 0.133, "grad_norm": 620.0, "kl_loss_13": 102.17385025024414, "kl_loss_2": 2366.3618713378905, "kl_loss_4": 1307.6370422363282, "kl_loss_9": 393.3925552368164, "learning_rate": 0.0009623938497462645, "loss": 1037.0735, "step": 1330 }, { "ce_loss_13": 3.2744598984718323, "ce_loss_17": 3.2186703085899353, "ce_loss_2": 4.334010004997253, "ce_loss_4": 3.831056308746338, "ce_loss_9": 3.4211880207061767, "epoch": 0.134, "grad_norm": 604.0, "kl_loss_13": 102.095463180542, "kl_loss_2": 2336.8879577636717, "kl_loss_4": 1313.4937805175782, "kl_loss_9": 402.02156829833984, "learning_rate": 0.0009617878234984055, "loss": 1048.8602, "step": 1340 }, { "ce_loss_13": 3.3661810636520384, "ce_loss_17": 3.3122437596321106, "ce_loss_2": 4.380954694747925, "ce_loss_4": 3.882122778892517, "ce_loss_9": 3.5056236147880555, "epoch": 0.135, "grad_norm": 716.0, "kl_loss_13": 99.42040252685547, "kl_loss_2": 2261.796173095703, "kl_loss_4": 1254.1968139648438, "kl_loss_9": 388.378955078125, "learning_rate": 0.0009611771470522907, "loss": 1019.8208, "step": 1350 }, { "ce_loss_13": 3.2976178526878357, "ce_loss_17": 3.2403480648994445, "ce_loss_2": 4.354464423656464, "ce_loss_4": 3.8516308665275574, "ce_loss_9": 3.4469806432723997, "epoch": 0.136, "grad_norm": 664.0, "kl_loss_13": 100.53326721191407, "kl_loss_2": 2299.1686767578126, "kl_loss_4": 1291.9524169921874, "kl_loss_9": 397.22457122802734, "learning_rate": 0.0009605618265574251, "loss": 1014.9305, "step": 1360 }, { "ce_loss_13": 3.268050992488861, "ce_loss_17": 3.2077465653419495, "ce_loss_2": 4.356378340721131, "ce_loss_4": 3.8343679308891296, "ce_loss_9": 3.4144388794898988, "epoch": 0.137, "grad_norm": 696.0, "kl_loss_13": 110.57073059082032, "kl_loss_2": 2421.8905639648438, "kl_loss_4": 1353.3382202148437, "kl_loss_9": 410.3678497314453, "learning_rate": 0.0009599418682100792, "loss": 1051.8801, "step": 1370 }, { "ce_loss_13": 3.299811065196991, "ce_loss_17": 3.246116352081299, "ce_loss_2": 4.357406163215638, "ce_loss_4": 3.846174156665802, "ce_loss_9": 3.4445716023445128, "epoch": 0.138, "grad_norm": 768.0, "kl_loss_13": 101.31869430541992, "kl_loss_2": 2340.7691650390625, "kl_loss_4": 1298.7441711425781, "kl_loss_9": 394.77845764160156, "learning_rate": 0.0009593172782532268, "loss": 1037.185, "step": 1380 }, { "ce_loss_13": 3.3381003975868224, "ce_loss_17": 3.2823010683059692, "ce_loss_2": 4.379236328601837, "ce_loss_4": 3.878574550151825, "ce_loss_9": 3.4841482758522035, "epoch": 0.139, "grad_norm": 668.0, "kl_loss_13": 102.40975189208984, "kl_loss_2": 2298.5057006835937, "kl_loss_4": 1290.5589904785156, "kl_loss_9": 396.7320785522461, "learning_rate": 0.0009586880629764817, "loss": 1020.4529, "step": 1390 }, { "ce_loss_13": 3.2770840644836428, "ce_loss_17": 3.219211196899414, "ce_loss_2": 4.331016874313354, "ce_loss_4": 3.833348023891449, "ce_loss_9": 3.4286335825920107, "epoch": 0.14, "grad_norm": 784.0, "kl_loss_13": 107.02684516906739, "kl_loss_2": 2320.654132080078, "kl_loss_4": 1304.0484802246094, "kl_loss_9": 410.73537292480466, "learning_rate": 0.0009580542287160348, "loss": 1020.2037, "step": 1400 }, { "ce_loss_13": 3.23485426902771, "ce_loss_17": 3.1792452573776244, "ce_loss_2": 4.287278437614441, "ce_loss_4": 3.780201089382172, "ce_loss_9": 3.387134313583374, "epoch": 0.141, "grad_norm": 700.0, "kl_loss_13": 102.53819580078125, "kl_loss_2": 2329.6465576171877, "kl_loss_4": 1294.6370422363282, "kl_loss_9": 414.023176574707, "learning_rate": 0.0009574157818545901, "loss": 1019.7475, "step": 1410 }, { "ce_loss_13": 3.307619261741638, "ce_loss_17": 3.252472710609436, "ce_loss_2": 4.327524995803833, "ce_loss_4": 3.8272714972496034, "ce_loss_9": 3.4502039551734924, "epoch": 0.142, "grad_norm": 660.0, "kl_loss_13": 99.3475284576416, "kl_loss_2": 2264.283074951172, "kl_loss_4": 1250.3487060546875, "kl_loss_9": 398.88404846191406, "learning_rate": 0.0009567727288213005, "loss": 1023.8275, "step": 1420 }, { "ce_loss_13": 3.2860882759094237, "ce_loss_17": 3.229217326641083, "ce_loss_2": 4.3413330078125, "ce_loss_4": 3.826633703708649, "ce_loss_9": 3.4349179744720457, "epoch": 0.143, "grad_norm": 700.0, "kl_loss_13": 104.36613082885742, "kl_loss_2": 2341.603790283203, "kl_loss_4": 1282.8856384277344, "kl_loss_9": 409.2802230834961, "learning_rate": 0.0009561250760917027, "loss": 1023.3168, "step": 1430 }, { "ce_loss_13": 3.307588982582092, "ce_loss_17": 3.2452077507972716, "ce_loss_2": 4.344717502593994, "ce_loss_4": 3.8385209679603576, "ce_loss_9": 3.448259747028351, "epoch": 0.144, "grad_norm": 1048.0, "kl_loss_13": 114.14393768310546, "kl_loss_2": 2334.0638671875, "kl_loss_4": 1302.6134704589845, "kl_loss_9": 412.03075408935547, "learning_rate": 0.0009554728301876525, "loss": 1013.7202, "step": 1440 }, { "ce_loss_13": 3.35393488407135, "ce_loss_17": 3.290667915344238, "ce_loss_2": 4.3726026058197025, "ce_loss_4": 3.886983585357666, "ce_loss_9": 3.49219263792038, "epoch": 0.145, "grad_norm": 800.0, "kl_loss_13": 114.66305503845214, "kl_loss_2": 2275.094317626953, "kl_loss_4": 1288.3170532226563, "kl_loss_9": 399.14944915771486, "learning_rate": 0.0009548159976772592, "loss": 1047.163, "step": 1450 }, { "ce_loss_13": 3.3060134768486025, "ce_loss_17": 3.2398659944534303, "ce_loss_2": 4.361707043647766, "ce_loss_4": 3.8484949827194215, "ce_loss_9": 3.446886456012726, "epoch": 0.146, "grad_norm": 712.0, "kl_loss_13": 117.4349208831787, "kl_loss_2": 2349.9953674316407, "kl_loss_4": 1312.7109619140624, "kl_loss_9": 406.10399169921874, "learning_rate": 0.0009541545851748186, "loss": 1035.0002, "step": 1460 }, { "ce_loss_13": 3.1733034372329714, "ce_loss_17": 3.11181960105896, "ce_loss_2": 4.268103861808777, "ce_loss_4": 3.7309717416763304, "ce_loss_9": 3.3166835069656373, "epoch": 0.147, "grad_norm": 764.0, "kl_loss_13": 117.60221443176269, "kl_loss_2": 2404.8810546875, "kl_loss_4": 1318.489569091797, "kl_loss_9": 399.5591079711914, "learning_rate": 0.0009534885993407473, "loss": 1042.8139, "step": 1470 }, { "ce_loss_13": 3.353926646709442, "ce_loss_17": 3.2724978923797607, "ce_loss_2": 4.392566514015198, "ce_loss_4": 3.892224359512329, "ce_loss_9": 3.4726366877555845, "epoch": 0.148, "grad_norm": 724.0, "kl_loss_13": 139.65031089782715, "kl_loss_2": 2350.8017578125, "kl_loss_4": 1321.2150573730469, "kl_loss_9": 393.81555328369143, "learning_rate": 0.0009528180468815154, "loss": 1047.985, "step": 1480 }, { "ce_loss_13": 3.3915168404579163, "ce_loss_17": 3.3253281235694887, "ce_loss_2": 4.396991300582886, "ce_loss_4": 3.911241555213928, "ce_loss_9": 3.520040047168732, "epoch": 0.149, "grad_norm": 636.0, "kl_loss_13": 137.39116935729982, "kl_loss_2": 2266.0220336914062, "kl_loss_4": 1272.7680358886719, "kl_loss_9": 399.7412567138672, "learning_rate": 0.0009521429345495787, "loss": 1021.5193, "step": 1490 }, { "ce_loss_13": 3.377981424331665, "ce_loss_17": 3.3089918971061705, "ce_loss_2": 4.36256160736084, "ce_loss_4": 3.874511981010437, "ce_loss_9": 3.499216449260712, "epoch": 0.15, "grad_norm": 680.0, "kl_loss_13": 119.94844093322754, "kl_loss_2": 2246.3206298828127, "kl_loss_4": 1248.2180053710938, "kl_loss_9": 388.26966400146483, "learning_rate": 0.0009514632691433108, "loss": 1014.2158, "step": 1500 }, { "ce_loss_13": 3.329502213001251, "ce_loss_17": 3.2669502973556517, "ce_loss_2": 4.347873878479004, "ce_loss_4": 3.857915127277374, "ce_loss_9": 3.464929723739624, "epoch": 0.151, "grad_norm": 688.0, "kl_loss_13": 113.11929283142089, "kl_loss_2": 2280.291882324219, "kl_loss_4": 1273.6213623046874, "kl_loss_9": 394.5995330810547, "learning_rate": 0.0009507790575069346, "loss": 1013.6383, "step": 1510 }, { "ce_loss_13": 3.297570192813873, "ce_loss_17": 3.232813537120819, "ce_loss_2": 4.349590492248535, "ce_loss_4": 3.8426506996154783, "ce_loss_9": 3.443753886222839, "epoch": 0.152, "grad_norm": 712.0, "kl_loss_13": 110.85068626403809, "kl_loss_2": 2324.2668701171874, "kl_loss_4": 1295.7248596191407, "kl_loss_9": 401.2438629150391, "learning_rate": 0.0009500903065304539, "loss": 1043.6035, "step": 1520 }, { "ce_loss_13": 3.3399811387062073, "ce_loss_17": 3.2841987729072573, "ce_loss_2": 4.330187225341797, "ce_loss_4": 3.8528904795646666, "ce_loss_9": 3.471581506729126, "epoch": 0.153, "grad_norm": 744.0, "kl_loss_13": 102.34607124328613, "kl_loss_2": 2202.66767578125, "kl_loss_4": 1236.2300415039062, "kl_loss_9": 375.545426940918, "learning_rate": 0.0009493970231495835, "loss": 1002.7671, "step": 1530 }, { "ce_loss_13": 3.2758294701576234, "ce_loss_17": 3.222549247741699, "ce_loss_2": 4.2692118883132935, "ce_loss_4": 3.7932890176773073, "ce_loss_9": 3.4076798796653747, "epoch": 0.154, "grad_norm": 740.0, "kl_loss_13": 97.60850028991699, "kl_loss_2": 2240.9700256347655, "kl_loss_4": 1261.6067199707031, "kl_loss_9": 374.13759460449216, "learning_rate": 0.0009486992143456792, "loss": 992.1441, "step": 1540 }, { "ce_loss_13": 3.2965641260147094, "ce_loss_17": 3.2360586524009705, "ce_loss_2": 4.394272017478943, "ce_loss_4": 3.863716244697571, "ce_loss_9": 3.440040957927704, "epoch": 0.155, "grad_norm": 636.0, "kl_loss_13": 105.34446182250977, "kl_loss_2": 2420.0051208496093, "kl_loss_4": 1341.6831604003905, "kl_loss_9": 404.8099853515625, "learning_rate": 0.0009479968871456679, "loss": 1036.6555, "step": 1550 }, { "ce_loss_13": 3.2709760665893555, "ce_loss_17": 3.215757656097412, "ce_loss_2": 4.324735450744629, "ce_loss_4": 3.8133442997932434, "ce_loss_9": 3.4105438113212587, "epoch": 0.156, "grad_norm": 736.0, "kl_loss_13": 99.87029724121093, "kl_loss_2": 2349.607080078125, "kl_loss_4": 1300.8240966796875, "kl_loss_9": 390.3031494140625, "learning_rate": 0.0009472900486219768, "loss": 1013.3098, "step": 1560 }, { "ce_loss_13": 3.2618454694747925, "ce_loss_17": 3.205492925643921, "ce_loss_2": 4.283135652542114, "ce_loss_4": 3.7982996821403505, "ce_loss_9": 3.400241732597351, "epoch": 0.157, "grad_norm": 728.0, "kl_loss_13": 99.15305404663086, "kl_loss_2": 2290.1863708496094, "kl_loss_4": 1283.8908752441407, "kl_loss_9": 386.46409759521487, "learning_rate": 0.000946578705892462, "loss": 1012.301, "step": 1570 }, { "ce_loss_13": 3.300940454006195, "ce_loss_17": 3.245930051803589, "ce_loss_2": 4.308735108375549, "ce_loss_4": 3.815536880493164, "ce_loss_9": 3.439196026325226, "epoch": 0.158, "grad_norm": 628.0, "kl_loss_13": 97.29884643554688, "kl_loss_2": 2229.8876831054686, "kl_loss_4": 1225.6342895507812, "kl_loss_9": 376.42786712646483, "learning_rate": 0.0009458628661203367, "loss": 1001.9973, "step": 1580 }, { "ce_loss_13": 3.3031871557235717, "ce_loss_17": 3.248406708240509, "ce_loss_2": 4.366739439964294, "ce_loss_4": 3.854965567588806, "ce_loss_9": 3.4469083189964294, "epoch": 0.159, "grad_norm": 780.0, "kl_loss_13": 98.17051391601562, "kl_loss_2": 2354.595355224609, "kl_loss_4": 1316.9095825195313, "kl_loss_9": 394.9068069458008, "learning_rate": 0.0009451425365140996, "loss": 1001.7581, "step": 1590 }, { "ce_loss_13": 3.3753222942352297, "ce_loss_17": 3.3201666951179503, "ce_loss_2": 4.361679863929749, "ce_loss_4": 3.8934988975524902, "ce_loss_9": 3.5151140213012697, "epoch": 0.16, "grad_norm": 668.0, "kl_loss_13": 98.52090950012207, "kl_loss_2": 2197.9241455078127, "kl_loss_4": 1234.2187805175781, "kl_loss_9": 380.74073638916013, "learning_rate": 0.0009444177243274617, "loss": 975.1326, "step": 1600 }, { "ce_loss_13": 3.2342332720756533, "ce_loss_17": 3.175268256664276, "ce_loss_2": 4.2756345748901365, "ce_loss_4": 3.775297236442566, "ce_loss_9": 3.3749369978904724, "epoch": 0.161, "grad_norm": 776.0, "kl_loss_13": 105.07829132080079, "kl_loss_2": 2310.253662109375, "kl_loss_4": 1293.7348571777343, "kl_loss_9": 394.623030090332, "learning_rate": 0.0009436884368592739, "loss": 1016.6045, "step": 1610 }, { "ce_loss_13": 3.282170295715332, "ce_loss_17": 3.227897846698761, "ce_loss_2": 4.281191802024841, "ce_loss_4": 3.8088948130607605, "ce_loss_9": 3.4205331802368164, "epoch": 0.162, "grad_norm": 684.0, "kl_loss_13": 101.84013671875, "kl_loss_2": 2230.205743408203, "kl_loss_4": 1252.5165954589843, "kl_loss_9": 380.3778244018555, "learning_rate": 0.0009429546814534529, "loss": 1012.8927, "step": 1620 }, { "ce_loss_13": 3.2896040558815, "ce_loss_17": 3.2361085057258605, "ce_loss_2": 4.290119099617004, "ce_loss_4": 3.80860458612442, "ce_loss_9": 3.422525429725647, "epoch": 0.163, "grad_norm": 596.0, "kl_loss_13": 101.63015174865723, "kl_loss_2": 2214.722314453125, "kl_loss_4": 1243.6845581054688, "kl_loss_9": 376.9969970703125, "learning_rate": 0.0009422164654989072, "loss": 974.724, "step": 1630 }, { "ce_loss_13": 3.408917820453644, "ce_loss_17": 3.3504751324653625, "ce_loss_2": 4.3929561376571655, "ce_loss_4": 3.9180466413497923, "ce_loss_9": 3.539123165607452, "epoch": 0.164, "grad_norm": 672.0, "kl_loss_13": 102.51958618164062, "kl_loss_2": 2216.1884216308595, "kl_loss_4": 1243.4335388183595, "kl_loss_9": 376.5258193969727, "learning_rate": 0.0009414737964294635, "loss": 992.4483, "step": 1640 }, { "ce_loss_13": 3.336865794658661, "ce_loss_17": 3.2845290422439577, "ce_loss_2": 4.295578408241272, "ce_loss_4": 3.8279440999031067, "ce_loss_9": 3.464097237586975, "epoch": 0.165, "grad_norm": 724.0, "kl_loss_13": 95.76305503845215, "kl_loss_2": 2155.4929138183593, "kl_loss_4": 1196.320068359375, "kl_loss_9": 359.5951324462891, "learning_rate": 0.000940726681723791, "loss": 982.7427, "step": 1650 }, { "ce_loss_13": 3.1780314207077027, "ce_loss_17": 3.1239538311958315, "ce_loss_2": 4.237276661396026, "ce_loss_4": 3.7238500833511354, "ce_loss_9": 3.3144403100013733, "epoch": 0.166, "grad_norm": 724.0, "kl_loss_13": 99.09452362060547, "kl_loss_2": 2357.867547607422, "kl_loss_4": 1307.1837219238282, "kl_loss_9": 384.91309967041013, "learning_rate": 0.0009399751289053266, "loss": 991.817, "step": 1660 }, { "ce_loss_13": 3.3884536027908325, "ce_loss_17": 3.3351887702941894, "ce_loss_2": 4.372224187850952, "ce_loss_4": 3.897164463996887, "ce_loss_9": 3.519719648361206, "epoch": 0.167, "grad_norm": 744.0, "kl_loss_13": 97.94064750671387, "kl_loss_2": 2201.8851684570313, "kl_loss_4": 1226.9512145996093, "kl_loss_9": 369.67835388183596, "learning_rate": 0.0009392191455421988, "loss": 992.1326, "step": 1670 }, { "ce_loss_13": 3.361027550697327, "ce_loss_17": 3.30734326839447, "ce_loss_2": 4.355885291099549, "ce_loss_4": 3.8685357093811037, "ce_loss_9": 3.4965213537216187, "epoch": 0.168, "grad_norm": 712.0, "kl_loss_13": 98.55237617492676, "kl_loss_2": 2243.932727050781, "kl_loss_4": 1255.4773864746094, "kl_loss_9": 382.85614013671875, "learning_rate": 0.0009384587392471515, "loss": 974.9587, "step": 1680 }, { "ce_loss_13": 3.3498239159584045, "ce_loss_17": 3.299537754058838, "ce_loss_2": 4.313471567630768, "ce_loss_4": 3.8637603282928468, "ce_loss_9": 3.483183753490448, "epoch": 0.169, "grad_norm": 764.0, "kl_loss_13": 94.6076271057129, "kl_loss_2": 2159.169714355469, "kl_loss_4": 1226.2245056152344, "kl_loss_9": 369.7983459472656, "learning_rate": 0.0009376939176774678, "loss": 962.5213, "step": 1690 }, { "ce_loss_13": 3.3299458265304565, "ce_loss_17": 3.274555242061615, "ce_loss_2": 4.31501624584198, "ce_loss_4": 3.8454368233680727, "ce_loss_9": 3.461405873298645, "epoch": 0.17, "grad_norm": 684.0, "kl_loss_13": 94.80207786560058, "kl_loss_2": 2201.3943298339846, "kl_loss_4": 1233.8527954101562, "kl_loss_9": 372.47161865234375, "learning_rate": 0.0009369246885348925, "loss": 994.4191, "step": 1700 }, { "ce_loss_13": 3.316060709953308, "ce_loss_17": 3.263388156890869, "ce_loss_2": 4.352077054977417, "ce_loss_4": 3.863167035579681, "ce_loss_9": 3.456589198112488, "epoch": 0.171, "grad_norm": 708.0, "kl_loss_13": 96.03619194030762, "kl_loss_2": 2295.2953247070313, "kl_loss_4": 1286.23359375, "kl_loss_9": 378.4852294921875, "learning_rate": 0.0009361510595655545, "loss": 1000.7859, "step": 1710 }, { "ce_loss_13": 3.2737643480300904, "ce_loss_17": 3.217836821079254, "ce_loss_2": 4.278836250305176, "ce_loss_4": 3.8066452503204347, "ce_loss_9": 3.4152570962905884, "epoch": 0.172, "grad_norm": 740.0, "kl_loss_13": 97.65170364379883, "kl_loss_2": 2254.1198181152345, "kl_loss_4": 1270.6001098632812, "kl_loss_9": 385.4350952148437, "learning_rate": 0.0009353730385598887, "loss": 993.403, "step": 1720 }, { "ce_loss_13": 3.2072998046875, "ce_loss_17": 3.155010712146759, "ce_loss_2": 4.253859066963196, "ce_loss_4": 3.7527849435806275, "ce_loss_9": 3.343551850318909, "epoch": 0.173, "grad_norm": 636.0, "kl_loss_13": 94.61254920959473, "kl_loss_2": 2303.8027160644533, "kl_loss_4": 1281.1050354003905, "kl_loss_9": 379.4188980102539, "learning_rate": 0.0009345906333525581, "loss": 1012.3521, "step": 1730 }, { "ce_loss_13": 3.2443289399147033, "ce_loss_17": 3.191619837284088, "ce_loss_2": 4.257315444946289, "ce_loss_4": 3.7754439234733583, "ce_loss_9": 3.386739265918732, "epoch": 0.174, "grad_norm": 648.0, "kl_loss_13": 95.79471549987792, "kl_loss_2": 2262.873565673828, "kl_loss_4": 1266.0321838378907, "kl_loss_9": 385.38279876708987, "learning_rate": 0.0009338038518223745, "loss": 986.0615, "step": 1740 }, { "ce_loss_13": 3.3160131573677063, "ce_loss_17": 3.258601987361908, "ce_loss_2": 4.321772587299347, "ce_loss_4": 3.8466910123825073, "ce_loss_9": 3.4559434175491335, "epoch": 0.175, "grad_norm": 676.0, "kl_loss_13": 98.67384262084961, "kl_loss_2": 2265.1025451660157, "kl_loss_4": 1277.967919921875, "kl_loss_9": 389.96258544921875, "learning_rate": 0.0009330127018922195, "loss": 1019.1023, "step": 1750 }, { "ce_loss_13": 3.2658791184425353, "ce_loss_17": 3.211934673786163, "ce_loss_2": 4.287610340118408, "ce_loss_4": 3.787616419792175, "ce_loss_9": 3.4017861127853393, "epoch": 0.176, "grad_norm": 732.0, "kl_loss_13": 98.68330879211426, "kl_loss_2": 2283.4465515136717, "kl_loss_4": 1258.4863525390624, "kl_loss_9": 387.55812377929686, "learning_rate": 0.0009322171915289634, "loss": 1005.191, "step": 1760 }, { "ce_loss_13": 3.297553205490112, "ce_loss_17": 3.2483973145484923, "ce_loss_2": 4.274235343933105, "ce_loss_4": 3.803940236568451, "ce_loss_9": 3.440148711204529, "epoch": 0.177, "grad_norm": 676.0, "kl_loss_13": 93.90616378784179, "kl_loss_2": 2205.721160888672, "kl_loss_4": 1229.010723876953, "kl_loss_9": 393.6399291992187, "learning_rate": 0.0009314173287433873, "loss": 975.3521, "step": 1770 }, { "ce_loss_13": 3.287331283092499, "ce_loss_17": 3.234244930744171, "ce_loss_2": 4.282831525802612, "ce_loss_4": 3.8102983593940736, "ce_loss_9": 3.4339799761772154, "epoch": 0.178, "grad_norm": 812.0, "kl_loss_13": 96.42430877685547, "kl_loss_2": 2225.7175170898436, "kl_loss_4": 1256.8188354492188, "kl_loss_9": 393.8082046508789, "learning_rate": 0.0009306131215901003, "loss": 974.4363, "step": 1780 }, { "ce_loss_13": 3.3164478540420532, "ce_loss_17": 3.2623019337654116, "ce_loss_2": 4.310870909690857, "ce_loss_4": 3.826959025859833, "ce_loss_9": 3.4496538281440734, "epoch": 0.179, "grad_norm": 716.0, "kl_loss_13": 95.39816360473633, "kl_loss_2": 2206.229522705078, "kl_loss_4": 1226.5503540039062, "kl_loss_9": 378.513117980957, "learning_rate": 0.0009298045781674596, "loss": 959.6615, "step": 1790 }, { "ce_loss_13": 3.3004026770591737, "ce_loss_17": 3.2483393430709837, "ce_loss_2": 4.272047340869904, "ce_loss_4": 3.8030646204948426, "ce_loss_9": 3.4380339860916136, "epoch": 0.18, "grad_norm": 672.0, "kl_loss_13": 96.91108589172363, "kl_loss_2": 2171.518029785156, "kl_loss_4": 1204.9924926757812, "kl_loss_9": 371.96835174560545, "learning_rate": 0.0009289917066174886, "loss": 977.4246, "step": 1800 }, { "ce_loss_13": 3.295607936382294, "ce_loss_17": 3.246348476409912, "ce_loss_2": 4.23402863740921, "ce_loss_4": 3.781065320968628, "ce_loss_9": 3.4213656067848204, "epoch": 0.181, "grad_norm": 616.0, "kl_loss_13": 92.53666381835937, "kl_loss_2": 2120.001788330078, "kl_loss_4": 1176.5944580078126, "kl_loss_9": 360.6382293701172, "learning_rate": 0.0009281745151257945, "loss": 949.8141, "step": 1810 }, { "ce_loss_13": 3.318648707866669, "ce_loss_17": 3.2640655994415284, "ce_loss_2": 4.306107974052429, "ce_loss_4": 3.822345507144928, "ce_loss_9": 3.4511567831039427, "epoch": 0.182, "grad_norm": 688.0, "kl_loss_13": 96.26609077453614, "kl_loss_2": 2194.0708435058596, "kl_loss_4": 1218.6878967285156, "kl_loss_9": 369.2196868896484, "learning_rate": 0.0009273530119214868, "loss": 976.1288, "step": 1820 }, { "ce_loss_13": 3.4107258915901184, "ce_loss_17": 3.355591583251953, "ce_loss_2": 4.380393171310425, "ce_loss_4": 3.9092475295066835, "ce_loss_9": 3.5371835827827454, "epoch": 0.183, "grad_norm": 680.0, "kl_loss_13": 100.99568901062011, "kl_loss_2": 2185.2503173828127, "kl_loss_4": 1211.135675048828, "kl_loss_9": 362.42479248046874, "learning_rate": 0.0009265272052770935, "loss": 951.5828, "step": 1830 }, { "ce_loss_13": 3.2406865477561952, "ce_loss_17": 3.1844029545784, "ce_loss_2": 4.2559812545776365, "ce_loss_4": 3.7556538224220275, "ce_loss_9": 3.374473440647125, "epoch": 0.184, "grad_norm": 840.0, "kl_loss_13": 100.13202857971191, "kl_loss_2": 2238.2178466796877, "kl_loss_4": 1228.1046081542968, "kl_loss_9": 364.12084045410154, "learning_rate": 0.0009256971035084784, "loss": 979.8647, "step": 1840 }, { "ce_loss_13": 3.1806642532348635, "ce_loss_17": 3.124430739879608, "ce_loss_2": 4.21564245223999, "ce_loss_4": 3.7193616032600403, "ce_loss_9": 3.3188551664352417, "epoch": 0.185, "grad_norm": 692.0, "kl_loss_13": 98.3440185546875, "kl_loss_2": 2289.003448486328, "kl_loss_4": 1277.9115295410156, "kl_loss_9": 379.96221313476565, "learning_rate": 0.0009248627149747573, "loss": 990.2439, "step": 1850 }, { "ce_loss_13": 3.377814221382141, "ce_loss_17": 3.322389805316925, "ce_loss_2": 4.341128206253051, "ce_loss_4": 3.8737547159194947, "ce_loss_9": 3.507482278347015, "epoch": 0.186, "grad_norm": 624.0, "kl_loss_13": 97.9429141998291, "kl_loss_2": 2181.674755859375, "kl_loss_4": 1211.4905883789063, "kl_loss_9": 364.52012329101564, "learning_rate": 0.0009240240480782129, "loss": 964.9949, "step": 1860 }, { "ce_loss_13": 3.284755754470825, "ce_loss_17": 3.228792119026184, "ce_loss_2": 4.2813700318336485, "ce_loss_4": 3.8053017973899843, "ce_loss_9": 3.413658332824707, "epoch": 0.187, "grad_norm": 724.0, "kl_loss_13": 101.41178092956542, "kl_loss_2": 2236.3501953125, "kl_loss_4": 1250.810723876953, "kl_loss_9": 368.65606689453125, "learning_rate": 0.0009231811112642122, "loss": 970.5515, "step": 1870 }, { "ce_loss_13": 3.323897731304169, "ce_loss_17": 3.271409976482391, "ce_loss_2": 4.2732173204422, "ce_loss_4": 3.8129101991653442, "ce_loss_9": 3.452033507823944, "epoch": 0.188, "grad_norm": 680.0, "kl_loss_13": 101.72588729858398, "kl_loss_2": 2143.6392456054687, "kl_loss_4": 1202.8489196777343, "kl_loss_9": 365.70509033203126, "learning_rate": 0.0009223339130211192, "loss": 955.3392, "step": 1880 }, { "ce_loss_13": 3.1829244017601015, "ce_loss_17": 3.131213593482971, "ce_loss_2": 4.204755032062531, "ce_loss_4": 3.696164774894714, "ce_loss_9": 3.3124831914901733, "epoch": 0.189, "grad_norm": 648.0, "kl_loss_13": 97.00320014953613, "kl_loss_2": 2264.2671142578124, "kl_loss_4": 1233.0195007324219, "kl_loss_9": 358.06385650634763, "learning_rate": 0.0009214824618802108, "loss": 981.127, "step": 1890 }, { "ce_loss_13": 3.36339693069458, "ce_loss_17": 3.3088223576545714, "ce_loss_2": 4.347205972671508, "ce_loss_4": 3.864045512676239, "ce_loss_9": 3.4887256264686584, "epoch": 0.19, "grad_norm": 648.0, "kl_loss_13": 98.71356239318848, "kl_loss_2": 2180.965411376953, "kl_loss_4": 1211.7989624023437, "kl_loss_9": 367.0889587402344, "learning_rate": 0.0009206267664155906, "loss": 988.14, "step": 1900 }, { "ce_loss_13": 3.287778210639954, "ce_loss_17": 3.2305564880371094, "ce_loss_2": 4.282608389854431, "ce_loss_4": 3.797332525253296, "ce_loss_9": 3.418061101436615, "epoch": 0.191, "grad_norm": 712.0, "kl_loss_13": 97.9106990814209, "kl_loss_2": 2215.468609619141, "kl_loss_4": 1221.6683959960938, "kl_loss_9": 366.5027740478516, "learning_rate": 0.0009197668352441024, "loss": 975.1162, "step": 1910 }, { "ce_loss_13": 3.3348501682281495, "ce_loss_17": 3.2833409309387207, "ce_loss_2": 4.30408536195755, "ce_loss_4": 3.832443726062775, "ce_loss_9": 3.462253785133362, "epoch": 0.192, "grad_norm": 764.0, "kl_loss_13": 96.12831916809083, "kl_loss_2": 2158.1401611328124, "kl_loss_4": 1201.4192199707031, "kl_loss_9": 362.71641845703124, "learning_rate": 0.0009189026770252437, "loss": 962.7703, "step": 1920 }, { "ce_loss_13": 3.3621666431427, "ce_loss_17": 3.309820866584778, "ce_loss_2": 4.322707390785217, "ce_loss_4": 3.8623532176017763, "ce_loss_9": 3.489624488353729, "epoch": 0.193, "grad_norm": 732.0, "kl_loss_13": 94.84638748168945, "kl_loss_2": 2159.771588134766, "kl_loss_4": 1210.8668151855468, "kl_loss_9": 364.78248138427733, "learning_rate": 0.000918034300461078, "loss": 993.4104, "step": 1930 }, { "ce_loss_13": 3.3837892532348635, "ce_loss_17": 3.3325113892555236, "ce_loss_2": 4.336338758468628, "ce_loss_4": 3.8793725252151487, "ce_loss_9": 3.5151267051696777, "epoch": 0.194, "grad_norm": 744.0, "kl_loss_13": 94.3999252319336, "kl_loss_2": 2147.145361328125, "kl_loss_4": 1198.3227783203124, "kl_loss_9": 364.49977416992186, "learning_rate": 0.0009171617142961477, "loss": 954.6803, "step": 1940 }, { "ce_loss_13": 3.3508265733718874, "ce_loss_17": 3.2987019777297975, "ce_loss_2": 4.311159837245941, "ce_loss_4": 3.844617450237274, "ce_loss_9": 3.4781469225883486, "epoch": 0.195, "grad_norm": 712.0, "kl_loss_13": 91.6006866455078, "kl_loss_2": 2154.9497253417967, "kl_loss_4": 1195.2054809570313, "kl_loss_9": 360.1728317260742, "learning_rate": 0.0009162849273173857, "loss": 956.5812, "step": 1950 }, { "ce_loss_13": 3.2878409743309023, "ce_loss_17": 3.23888703584671, "ce_loss_2": 4.251902055740357, "ce_loss_4": 3.778362774848938, "ce_loss_9": 3.416101062297821, "epoch": 0.196, "grad_norm": 732.0, "kl_loss_13": 89.0274787902832, "kl_loss_2": 2143.692755126953, "kl_loss_4": 1174.520458984375, "kl_loss_9": 357.48160552978516, "learning_rate": 0.0009154039483540273, "loss": 954.3681, "step": 1960 }, { "ce_loss_13": 3.268400990962982, "ce_loss_17": 3.217691791057587, "ce_loss_2": 4.245136177539825, "ce_loss_4": 3.7626100063323973, "ce_loss_9": 3.4000661253929136, "epoch": 0.197, "grad_norm": 640.0, "kl_loss_13": 90.23016510009765, "kl_loss_2": 2190.5732482910157, "kl_loss_4": 1200.616796875, "kl_loss_9": 359.080143737793, "learning_rate": 0.0009145187862775209, "loss": 958.1281, "step": 1970 }, { "ce_loss_13": 3.3009396314620973, "ce_loss_17": 3.251687526702881, "ce_loss_2": 4.2589087128639225, "ce_loss_4": 3.8026643872261046, "ce_loss_9": 3.4303215622901915, "epoch": 0.198, "grad_norm": 740.0, "kl_loss_13": 90.92903099060058, "kl_loss_2": 2159.524542236328, "kl_loss_4": 1213.1752502441407, "kl_loss_9": 358.58487396240236, "learning_rate": 0.0009136294500014386, "loss": 949.0615, "step": 1980 }, { "ce_loss_13": 3.2511113047599793, "ce_loss_17": 3.2019614815711974, "ce_loss_2": 4.281300258636475, "ce_loss_4": 3.773765230178833, "ce_loss_9": 3.385185647010803, "epoch": 0.199, "grad_norm": 824.0, "kl_loss_13": 91.62615509033203, "kl_loss_2": 2262.9648010253904, "kl_loss_4": 1240.5688110351562, "kl_loss_9": 362.73115997314454, "learning_rate": 0.000912735948481387, "loss": 979.9354, "step": 1990 }, { "ce_loss_13": 3.283021128177643, "ce_loss_17": 3.232093572616577, "ce_loss_2": 4.249727368354797, "ce_loss_4": 3.7801328182220457, "ce_loss_9": 3.4098451495170594, "epoch": 0.2, "grad_norm": 756.0, "kl_loss_13": 92.56528701782227, "kl_loss_2": 2181.098516845703, "kl_loss_4": 1216.8801025390626, "kl_loss_9": 364.4568374633789, "learning_rate": 0.0009118382907149164, "loss": 946.6853, "step": 2000 }, { "ce_loss_13": 3.3126049399375916, "ce_loss_17": 3.261287009716034, "ce_loss_2": 4.2733923435211185, "ce_loss_4": 3.816398227214813, "ce_loss_9": 3.441442632675171, "epoch": 0.201, "grad_norm": 676.0, "kl_loss_13": 91.95423011779785, "kl_loss_2": 2152.3748168945312, "kl_loss_4": 1209.0665100097656, "kl_loss_9": 361.6376892089844, "learning_rate": 0.0009109364857414306, "loss": 946.5832, "step": 2010 }, { "ce_loss_13": 3.2770270705223083, "ce_loss_17": 3.228403162956238, "ce_loss_2": 4.242778646945953, "ce_loss_4": 3.7632916808128356, "ce_loss_9": 3.402626168727875, "epoch": 0.202, "grad_norm": 724.0, "kl_loss_13": 89.61309394836425, "kl_loss_2": 2187.4436950683594, "kl_loss_4": 1202.5332946777344, "kl_loss_9": 360.0750030517578, "learning_rate": 0.0009100305426420956, "loss": 978.4272, "step": 2020 }, { "ce_loss_13": 3.239075553417206, "ce_loss_17": 3.1912439107894897, "ce_loss_2": 4.264526665210724, "ce_loss_4": 3.7580632090568544, "ce_loss_9": 3.373551666736603, "epoch": 0.203, "grad_norm": 900.0, "kl_loss_13": 89.87334213256835, "kl_loss_2": 2277.4216430664064, "kl_loss_4": 1234.525457763672, "kl_loss_9": 359.5686401367187, "learning_rate": 0.0009091204705397484, "loss": 967.3219, "step": 2030 }, { "ce_loss_13": 3.2265689611434936, "ce_loss_17": 3.175269639492035, "ce_loss_2": 4.258231091499328, "ce_loss_4": 3.74735563993454, "ce_loss_9": 3.358277690410614, "epoch": 0.204, "grad_norm": 720.0, "kl_loss_13": 93.58229713439941, "kl_loss_2": 2282.3120971679687, "kl_loss_4": 1241.5177368164063, "kl_loss_9": 361.5880889892578, "learning_rate": 0.0009082062785988049, "loss": 977.5709, "step": 2040 }, { "ce_loss_13": 3.3685425758361816, "ce_loss_17": 3.3171595454216005, "ce_loss_2": 4.292959356307984, "ce_loss_4": 3.8451284885406496, "ce_loss_9": 3.4920889496803285, "epoch": 0.205, "grad_norm": 800.0, "kl_loss_13": 91.8431282043457, "kl_loss_2": 2107.161785888672, "kl_loss_4": 1178.7062255859375, "kl_loss_9": 356.6432815551758, "learning_rate": 0.0009072879760251679, "loss": 950.2936, "step": 2050 }, { "ce_loss_13": 3.3089524507522583, "ce_loss_17": 3.259350800514221, "ce_loss_2": 4.307748210430145, "ce_loss_4": 3.8217745780944825, "ce_loss_9": 3.4480879664421082, "epoch": 0.206, "grad_norm": 680.0, "kl_loss_13": 93.10884094238281, "kl_loss_2": 2242.360632324219, "kl_loss_4": 1230.9670166015626, "kl_loss_9": 371.57481536865237, "learning_rate": 0.0009063655720661341, "loss": 966.1661, "step": 2060 }, { "ce_loss_13": 3.3583059072494508, "ce_loss_17": 3.304564726352692, "ce_loss_2": 4.292792272567749, "ce_loss_4": 3.845988953113556, "ce_loss_9": 3.491236174106598, "epoch": 0.207, "grad_norm": 784.0, "kl_loss_13": 95.84091911315917, "kl_loss_2": 2116.5053833007814, "kl_loss_4": 1191.6633178710938, "kl_loss_9": 373.36004333496095, "learning_rate": 0.000905439076010301, "loss": 950.7002, "step": 2070 }, { "ce_loss_13": 3.310245227813721, "ce_loss_17": 3.258601188659668, "ce_loss_2": 4.283760368824005, "ce_loss_4": 3.8175146102905275, "ce_loss_9": 3.4438483238220217, "epoch": 0.208, "grad_norm": 676.0, "kl_loss_13": 94.41566009521485, "kl_loss_2": 2157.372424316406, "kl_loss_4": 1198.5336791992188, "kl_loss_9": 370.09442749023435, "learning_rate": 0.0009045084971874737, "loss": 939.0064, "step": 2080 }, { "ce_loss_13": 3.2930172204971315, "ce_loss_17": 3.240478348731995, "ce_loss_2": 4.2477871656417845, "ce_loss_4": 3.788450598716736, "ce_loss_9": 3.4249788761138915, "epoch": 0.209, "grad_norm": 812.0, "kl_loss_13": 94.39330787658692, "kl_loss_2": 2149.9981384277344, "kl_loss_4": 1204.7449340820312, "kl_loss_9": 367.69925994873046, "learning_rate": 0.0009035738449685707, "loss": 967.0873, "step": 2090 }, { "ce_loss_13": 3.233154094219208, "ce_loss_17": 3.178975594043732, "ce_loss_2": 4.249279487133026, "ce_loss_4": 3.756052219867706, "ce_loss_9": 3.373828673362732, "epoch": 0.21, "grad_norm": 732.0, "kl_loss_13": 93.99942321777344, "kl_loss_2": 2255.170574951172, "kl_loss_4": 1242.3643951416016, "kl_loss_9": 374.80386505126955, "learning_rate": 0.0009026351287655293, "loss": 962.5345, "step": 2100 }, { "ce_loss_13": 3.4293829798698425, "ce_loss_17": 3.3792282700538636, "ce_loss_2": 4.324404907226563, "ce_loss_4": 3.8898934006690977, "ce_loss_9": 3.5465378046035765, "epoch": 0.211, "grad_norm": 720.0, "kl_loss_13": 90.24806594848633, "kl_loss_2": 2025.4352600097657, "kl_loss_4": 1124.5046112060547, "kl_loss_9": 350.2180938720703, "learning_rate": 0.0009016923580312113, "loss": 906.7486, "step": 2110 }, { "ce_loss_13": 3.289664113521576, "ce_loss_17": 3.2374181866645815, "ce_loss_2": 4.234306645393372, "ce_loss_4": 3.777394378185272, "ce_loss_9": 3.4207195043563843, "epoch": 0.212, "grad_norm": 828.0, "kl_loss_13": 103.11422386169434, "kl_loss_2": 2126.588299560547, "kl_loss_4": 1178.11142578125, "kl_loss_9": 365.55635528564454, "learning_rate": 0.0009007455422593077, "loss": 961.6285, "step": 2120 }, { "ce_loss_13": 3.303319585323334, "ce_loss_17": 3.2473248481750487, "ce_loss_2": 4.286237025260926, "ce_loss_4": 3.8030712366104127, "ce_loss_9": 3.4316128849983216, "epoch": 0.213, "grad_norm": 664.0, "kl_loss_13": 103.71499290466309, "kl_loss_2": 2218.969274902344, "kl_loss_4": 1228.2110961914063, "kl_loss_9": 379.88182525634767, "learning_rate": 0.0008997946909842425, "loss": 973.0553, "step": 2130 }, { "ce_loss_13": 3.3144681215286256, "ce_loss_17": 3.2573557019233705, "ce_loss_2": 4.3399817943573, "ce_loss_4": 3.8451735019683837, "ce_loss_9": 3.447383201122284, "epoch": 0.214, "grad_norm": 860.0, "kl_loss_13": 103.66337776184082, "kl_loss_2": 2279.8397216796875, "kl_loss_4": 1256.1508544921876, "kl_loss_9": 377.6374938964844, "learning_rate": 0.0008988398137810777, "loss": 966.7542, "step": 2140 }, { "ce_loss_13": 3.352087438106537, "ce_loss_17": 3.3003304481506346, "ce_loss_2": 4.290535891056061, "ce_loss_4": 3.8377557158470155, "ce_loss_9": 3.4746557235717774, "epoch": 0.215, "grad_norm": 804.0, "kl_loss_13": 95.54061927795411, "kl_loss_2": 2124.250836181641, "kl_loss_4": 1186.1236877441406, "kl_loss_9": 357.02324829101565, "learning_rate": 0.0008978809202654162, "loss": 934.1131, "step": 2150 }, { "ce_loss_13": 3.3268388390541075, "ce_loss_17": 3.2740718841552736, "ce_loss_2": 4.268651235103607, "ce_loss_4": 3.8112971067428587, "ce_loss_9": 3.4508150935173036, "epoch": 0.216, "grad_norm": 952.0, "kl_loss_13": 94.60899848937989, "kl_loss_2": 2104.24765625, "kl_loss_4": 1169.8356872558593, "kl_loss_9": 357.7300857543945, "learning_rate": 0.0008969180200933046, "loss": 948.2873, "step": 2160 }, { "ce_loss_13": 3.288419497013092, "ce_loss_17": 3.2334158539772035, "ce_loss_2": 4.276037967205047, "ce_loss_4": 3.8012101531028746, "ce_loss_9": 3.4172295808792112, "epoch": 0.217, "grad_norm": 768.0, "kl_loss_13": 100.75381240844726, "kl_loss_2": 2174.3433227539062, "kl_loss_4": 1217.892315673828, "kl_loss_9": 363.94210357666014, "learning_rate": 0.0008959511229611376, "loss": 967.2604, "step": 2170 }, { "ce_loss_13": 3.3669892072677614, "ce_loss_17": 3.3125024795532227, "ce_loss_2": 4.324894380569458, "ce_loss_4": 3.858024787902832, "ce_loss_9": 3.4884867787361147, "epoch": 0.218, "grad_norm": 876.0, "kl_loss_13": 102.30676040649413, "kl_loss_2": 2149.9006591796874, "kl_loss_4": 1193.063946533203, "kl_loss_9": 354.54992218017577, "learning_rate": 0.0008949802386055581, "loss": 947.3659, "step": 2180 }, { "ce_loss_13": 3.233458602428436, "ce_loss_17": 3.176294243335724, "ce_loss_2": 4.1978265881538395, "ce_loss_4": 3.7252599954605103, "ce_loss_9": 3.3537106990814207, "epoch": 0.219, "grad_norm": 792.0, "kl_loss_13": 107.00150070190429, "kl_loss_2": 2126.529364013672, "kl_loss_4": 1180.3070739746095, "kl_loss_9": 348.7339752197266, "learning_rate": 0.0008940053768033609, "loss": 965.7881, "step": 2190 }, { "ce_loss_13": 3.3277897477149962, "ce_loss_17": 3.2635307192802427, "ce_loss_2": 4.257868647575378, "ce_loss_4": 3.7924643039703367, "ce_loss_9": 3.431203293800354, "epoch": 0.22, "grad_norm": 724.0, "kl_loss_13": 116.66446228027344, "kl_loss_2": 2138.7681518554687, "kl_loss_4": 1179.5128143310546, "kl_loss_9": 348.44380340576174, "learning_rate": 0.0008930265473713938, "loss": 944.1561, "step": 2200 }, { "ce_loss_13": 3.287194538116455, "ce_loss_17": 3.223456788063049, "ce_loss_2": 4.236075389385223, "ce_loss_4": 3.766448366641998, "ce_loss_9": 3.403588020801544, "epoch": 0.221, "grad_norm": 744.0, "kl_loss_13": 112.54497184753419, "kl_loss_2": 2148.6867065429688, "kl_loss_4": 1180.5968322753906, "kl_loss_9": 352.51245880126953, "learning_rate": 0.0008920437601664579, "loss": 932.1693, "step": 2210 }, { "ce_loss_13": 3.2727954030036925, "ce_loss_17": 3.2170267343521117, "ce_loss_2": 4.227043080329895, "ce_loss_4": 3.7663298726081846, "ce_loss_9": 3.396084713935852, "epoch": 0.222, "grad_norm": 684.0, "kl_loss_13": 102.1230541229248, "kl_loss_2": 2138.1913513183595, "kl_loss_4": 1192.1333435058593, "kl_loss_9": 356.1757873535156, "learning_rate": 0.0008910570250852097, "loss": 928.835, "step": 2220 }, { "ce_loss_13": 3.3732096195220946, "ce_loss_17": 3.3188398361206053, "ce_loss_2": 4.279922914505005, "ce_loss_4": 3.832516062259674, "ce_loss_9": 3.4941823482513428, "epoch": 0.223, "grad_norm": 752.0, "kl_loss_13": 96.15999031066895, "kl_loss_2": 2059.2168518066405, "kl_loss_4": 1137.6443603515625, "kl_loss_9": 346.1767837524414, "learning_rate": 0.0008900663520640604, "loss": 915.2586, "step": 2230 }, { "ce_loss_13": 3.323720395565033, "ce_loss_17": 3.2688003540039063, "ce_loss_2": 4.268552005290985, "ce_loss_4": 3.8018130540847777, "ce_loss_9": 3.4431727170944213, "epoch": 0.224, "grad_norm": 760.0, "kl_loss_13": 95.7872543334961, "kl_loss_2": 2138.9270263671874, "kl_loss_4": 1177.120379638672, "kl_loss_9": 353.89476165771487, "learning_rate": 0.0008890717510790764, "loss": 941.8158, "step": 2240 }, { "ce_loss_13": 3.279287552833557, "ce_loss_17": 3.228074884414673, "ce_loss_2": 4.244151520729065, "ce_loss_4": 3.76785945892334, "ce_loss_9": 3.4018285393714907, "epoch": 0.225, "grad_norm": 712.0, "kl_loss_13": 94.25489349365235, "kl_loss_2": 2163.629754638672, "kl_loss_4": 1185.219903564453, "kl_loss_9": 352.8628692626953, "learning_rate": 0.0008880732321458784, "loss": 950.535, "step": 2250 }, { "ce_loss_13": 3.31189683675766, "ce_loss_17": 3.261502683162689, "ce_loss_2": 4.251094925403595, "ce_loss_4": 3.7883784532547, "ce_loss_9": 3.4407161712646483, "epoch": 0.226, "grad_norm": 852.0, "kl_loss_13": 94.58582611083985, "kl_loss_2": 2109.0529968261717, "kl_loss_4": 1163.501885986328, "kl_loss_9": 358.63934631347655, "learning_rate": 0.0008870708053195413, "loss": 947.5316, "step": 2260 }, { "ce_loss_13": 3.3335476756095885, "ce_loss_17": 3.285297894477844, "ce_loss_2": 4.256447744369507, "ce_loss_4": 3.8071651816368104, "ce_loss_9": 3.4511924386024475, "epoch": 0.227, "grad_norm": 780.0, "kl_loss_13": 89.15117149353027, "kl_loss_2": 2087.6072204589846, "kl_loss_4": 1158.7156860351563, "kl_loss_9": 345.3010711669922, "learning_rate": 0.0008860644806944918, "loss": 925.3416, "step": 2270 }, { "ce_loss_13": 3.2728632211685182, "ce_loss_17": 3.2228413224220276, "ce_loss_2": 4.235958611965179, "ce_loss_4": 3.7654478549957275, "ce_loss_9": 3.402116930484772, "epoch": 0.228, "grad_norm": 736.0, "kl_loss_13": 90.10444946289063, "kl_loss_2": 2145.073895263672, "kl_loss_4": 1184.8575927734375, "kl_loss_9": 358.54766235351565, "learning_rate": 0.0008850542684044079, "loss": 921.0351, "step": 2280 }, { "ce_loss_13": 3.232874131202698, "ce_loss_17": 3.182513749599457, "ce_loss_2": 4.245934855937958, "ce_loss_4": 3.7508826732635496, "ce_loss_9": 3.3689802169799803, "epoch": 0.229, "grad_norm": 764.0, "kl_loss_13": 92.43191909790039, "kl_loss_2": 2234.869909667969, "kl_loss_4": 1230.782974243164, "kl_loss_9": 370.2389450073242, "learning_rate": 0.0008840401786221159, "loss": 952.509, "step": 2290 }, { "ce_loss_13": 3.3808755040168763, "ce_loss_17": 3.334891474246979, "ce_loss_2": 4.299492859840393, "ce_loss_4": 3.846787965297699, "ce_loss_9": 3.5042584419250487, "epoch": 0.23, "grad_norm": 720.0, "kl_loss_13": 86.99340934753418, "kl_loss_2": 2062.409875488281, "kl_loss_4": 1137.3367095947265, "kl_loss_9": 343.86165924072264, "learning_rate": 0.000883022221559489, "loss": 908.996, "step": 2300 }, { "ce_loss_13": 3.338899517059326, "ce_loss_17": 3.289691352844238, "ce_loss_2": 4.295863151550293, "ce_loss_4": 3.8208630442619325, "ce_loss_9": 3.463158071041107, "epoch": 0.231, "grad_norm": 792.0, "kl_loss_13": 89.7489730834961, "kl_loss_2": 2138.119091796875, "kl_loss_4": 1177.0338073730468, "kl_loss_9": 349.2715377807617, "learning_rate": 0.0008820004074673434, "loss": 959.0157, "step": 2310 }, { "ce_loss_13": 3.2497766852378844, "ce_loss_17": 3.2020439863204957, "ce_loss_2": 4.193786537647247, "ce_loss_4": 3.731072425842285, "ce_loss_9": 3.3763773918151854, "epoch": 0.232, "grad_norm": 768.0, "kl_loss_13": 86.36487617492676, "kl_loss_2": 2146.440118408203, "kl_loss_4": 1183.485543823242, "kl_loss_9": 348.9352600097656, "learning_rate": 0.0008809747466353355, "loss": 926.4646, "step": 2320 }, { "ce_loss_13": 3.2526813745498657, "ce_loss_17": 3.202362859249115, "ce_loss_2": 4.212542414665222, "ce_loss_4": 3.7356587052345276, "ce_loss_9": 3.3759160161018373, "epoch": 0.233, "grad_norm": 812.0, "kl_loss_13": 88.14268188476562, "kl_loss_2": 2149.218145751953, "kl_loss_4": 1156.888134765625, "kl_loss_9": 342.9568435668945, "learning_rate": 0.0008799452493918585, "loss": 937.2152, "step": 2330 }, { "ce_loss_13": 3.3354029417037965, "ce_loss_17": 3.2855377793312073, "ce_loss_2": 4.26744898557663, "ce_loss_4": 3.8138632655143736, "ce_loss_9": 3.4569711685180664, "epoch": 0.234, "grad_norm": 844.0, "kl_loss_13": 88.94196243286133, "kl_loss_2": 2110.511163330078, "kl_loss_4": 1166.7712677001953, "kl_loss_9": 346.45865325927736, "learning_rate": 0.0008789119261039385, "loss": 954.3018, "step": 2340 }, { "ce_loss_13": 3.2477207660675047, "ce_loss_17": 3.198263096809387, "ce_loss_2": 4.197646915912628, "ce_loss_4": 3.740631628036499, "ce_loss_9": 3.37202308177948, "epoch": 0.235, "grad_norm": 644.0, "kl_loss_13": 88.45519828796387, "kl_loss_2": 2121.0713439941405, "kl_loss_4": 1174.4280090332031, "kl_loss_9": 346.70821990966795, "learning_rate": 0.0008778747871771292, "loss": 921.8231, "step": 2350 }, { "ce_loss_13": 3.2942930817604066, "ce_loss_17": 3.2476529479026794, "ce_loss_2": 4.214564323425293, "ce_loss_4": 3.7642132163047792, "ce_loss_9": 3.419190752506256, "epoch": 0.236, "grad_norm": 760.0, "kl_loss_13": 85.13226013183593, "kl_loss_2": 2072.194140625, "kl_loss_4": 1148.5518371582032, "kl_loss_9": 336.86150512695315, "learning_rate": 0.0008768338430554083, "loss": 909.3094, "step": 2360 }, { "ce_loss_13": 3.3023596882820128, "ce_loss_17": 3.2569980144500734, "ce_loss_2": 4.23673312664032, "ce_loss_4": 3.785304081439972, "ce_loss_9": 3.4277263641357423, "epoch": 0.237, "grad_norm": 768.0, "kl_loss_13": 88.49767379760742, "kl_loss_2": 2089.8222717285157, "kl_loss_4": 1157.4951171875, "kl_loss_9": 347.0603393554687, "learning_rate": 0.0008757891042210713, "loss": 927.942, "step": 2370 }, { "ce_loss_13": 3.3248913526535033, "ce_loss_17": 3.277131223678589, "ce_loss_2": 4.254247784614563, "ce_loss_4": 3.8001523494720457, "ce_loss_9": 3.4470866441726686, "epoch": 0.238, "grad_norm": 752.0, "kl_loss_13": 87.96815452575683, "kl_loss_2": 2071.3140380859377, "kl_loss_4": 1145.922280883789, "kl_loss_9": 339.0485443115234, "learning_rate": 0.0008747405811946271, "loss": 920.3144, "step": 2380 }, { "ce_loss_13": 3.224561131000519, "ce_loss_17": 3.1759562492370605, "ce_loss_2": 4.214334380626679, "ce_loss_4": 3.7256261229515077, "ce_loss_9": 3.353095328807831, "epoch": 0.239, "grad_norm": 676.0, "kl_loss_13": 89.64075889587403, "kl_loss_2": 2206.202288818359, "kl_loss_4": 1206.3945404052733, "kl_loss_9": 354.2266357421875, "learning_rate": 0.0008736882845346905, "loss": 926.8486, "step": 2390 }, { "ce_loss_13": 3.3134991407394407, "ce_loss_17": 3.262081706523895, "ce_loss_2": 4.258616781234741, "ce_loss_4": 3.791792631149292, "ce_loss_9": 3.4383889198303224, "epoch": 0.24, "grad_norm": 692.0, "kl_loss_13": 91.79788208007812, "kl_loss_2": 2099.705029296875, "kl_loss_4": 1137.2147552490235, "kl_loss_9": 351.1961166381836, "learning_rate": 0.0008726322248378774, "loss": 915.9145, "step": 2400 }, { "ce_loss_13": 3.3168442487716674, "ce_loss_17": 3.2658654928207396, "ce_loss_2": 4.291086602210998, "ce_loss_4": 3.798007917404175, "ce_loss_9": 3.4393696665763853, "epoch": 0.241, "grad_norm": 724.0, "kl_loss_13": 91.18320655822754, "kl_loss_2": 2183.9902954101562, "kl_loss_4": 1179.52744140625, "kl_loss_9": 348.21985015869143, "learning_rate": 0.0008715724127386971, "loss": 954.3727, "step": 2410 }, { "ce_loss_13": 3.388163149356842, "ce_loss_17": 3.3374621152877806, "ce_loss_2": 4.30515775680542, "ce_loss_4": 3.8508872628211974, "ce_loss_9": 3.503561234474182, "epoch": 0.242, "grad_norm": 632.0, "kl_loss_13": 93.10037879943847, "kl_loss_2": 2093.0827880859374, "kl_loss_4": 1143.714126586914, "kl_loss_9": 340.34727783203124, "learning_rate": 0.0008705088589094458, "loss": 926.0434, "step": 2420 }, { "ce_loss_13": 3.397733283042908, "ce_loss_17": 3.3486640214920045, "ce_loss_2": 4.326961147785187, "ce_loss_4": 3.8702669978141784, "ce_loss_9": 3.5161495447158813, "epoch": 0.243, "grad_norm": 692.0, "kl_loss_13": 94.67743148803712, "kl_loss_2": 2098.773126220703, "kl_loss_4": 1147.025424194336, "kl_loss_9": 343.21431121826174, "learning_rate": 0.0008694415740600988, "loss": 930.1328, "step": 2430 }, { "ce_loss_13": 3.251414442062378, "ce_loss_17": 3.204081356525421, "ce_loss_2": 4.2343505144119264, "ce_loss_4": 3.746074843406677, "ce_loss_9": 3.3755380511283875, "epoch": 0.244, "grad_norm": 832.0, "kl_loss_13": 91.5265941619873, "kl_loss_2": 2183.54365234375, "kl_loss_4": 1185.4872619628907, "kl_loss_9": 347.7381362915039, "learning_rate": 0.0008683705689382025, "loss": 934.1233, "step": 2440 }, { "ce_loss_13": 3.3352036356925963, "ce_loss_17": 3.2870277643203734, "ce_loss_2": 4.254025983810425, "ce_loss_4": 3.8016448616981506, "ce_loss_9": 3.452436113357544, "epoch": 0.245, "grad_norm": 672.0, "kl_loss_13": 88.54696655273438, "kl_loss_2": 2080.4288696289063, "kl_loss_4": 1150.784130859375, "kl_loss_9": 338.89248809814455, "learning_rate": 0.0008672958543287666, "loss": 933.8865, "step": 2450 }, { "ce_loss_13": 3.3462072253227233, "ce_loss_17": 3.2961216807365417, "ce_loss_2": 4.249486815929413, "ce_loss_4": 3.8145977020263673, "ce_loss_9": 3.4672706604003904, "epoch": 0.246, "grad_norm": 736.0, "kl_loss_13": 87.74930000305176, "kl_loss_2": 2045.5015930175782, "kl_loss_4": 1146.6499877929687, "kl_loss_9": 341.08093719482423, "learning_rate": 0.0008662174410541554, "loss": 908.4095, "step": 2460 }, { "ce_loss_13": 3.3104076981544495, "ce_loss_17": 3.2633822441101072, "ce_loss_2": 4.217393136024475, "ce_loss_4": 3.769331526756287, "ce_loss_9": 3.432016408443451, "epoch": 0.247, "grad_norm": 932.0, "kl_loss_13": 85.89468612670899, "kl_loss_2": 2047.9596923828126, "kl_loss_4": 1122.4431182861329, "kl_loss_9": 336.0746810913086, "learning_rate": 0.0008651353399739787, "loss": 926.9571, "step": 2470 }, { "ce_loss_13": 3.337866926193237, "ce_loss_17": 3.2926376819610597, "ce_loss_2": 4.262590432167054, "ce_loss_4": 3.8054956436157226, "ce_loss_9": 3.4577380299568174, "epoch": 0.248, "grad_norm": 880.0, "kl_loss_13": 86.62553329467774, "kl_loss_2": 2073.530798339844, "kl_loss_4": 1142.5131256103516, "kl_loss_9": 337.92457427978513, "learning_rate": 0.0008640495619849821, "loss": 917.6547, "step": 2480 }, { "ce_loss_13": 3.2964303255081178, "ce_loss_17": 3.2484665513038635, "ce_loss_2": 4.203482365608215, "ce_loss_4": 3.7624764442443848, "ce_loss_9": 3.4128145456314085, "epoch": 0.249, "grad_norm": 996.0, "kl_loss_13": 86.48510246276855, "kl_loss_2": 2064.852166748047, "kl_loss_4": 1136.9110778808595, "kl_loss_9": 336.74344635009766, "learning_rate": 0.0008629601180209381, "loss": 909.9742, "step": 2490 }, { "ce_loss_13": 3.29126091003418, "ce_loss_17": 3.2420788168907166, "ce_loss_2": 4.217225980758667, "ce_loss_4": 3.7575623631477355, "ce_loss_9": 3.417812240123749, "epoch": 0.25, "grad_norm": 784.0, "kl_loss_13": 88.03675422668456, "kl_loss_2": 2057.998211669922, "kl_loss_4": 1128.8810241699218, "kl_loss_9": 349.3929382324219, "learning_rate": 0.000861867019052535, "loss": 927.5346, "step": 2500 }, { "ce_loss_13": 3.215069842338562, "ce_loss_17": 3.1639512538909913, "ce_loss_2": 4.189428377151489, "ce_loss_4": 3.7026462078094484, "ce_loss_9": 3.348872888088226, "epoch": 0.251, "grad_norm": 732.0, "kl_loss_13": 88.78349227905274, "kl_loss_2": 2161.862829589844, "kl_loss_4": 1179.3965728759765, "kl_loss_9": 363.96420135498045, "learning_rate": 0.0008607702760872678, "loss": 946.0758, "step": 2510 }, { "ce_loss_13": 3.3202267050743104, "ce_loss_17": 3.2739442229270934, "ce_loss_2": 4.225949347019196, "ce_loss_4": 3.7909236431121824, "ce_loss_9": 3.441824221611023, "epoch": 0.252, "grad_norm": 828.0, "kl_loss_13": 87.51590347290039, "kl_loss_2": 2030.7117736816406, "kl_loss_4": 1132.7180267333983, "kl_loss_9": 340.58583374023436, "learning_rate": 0.0008596699001693256, "loss": 924.2797, "step": 2520 }, { "ce_loss_13": 3.337341475486755, "ce_loss_17": 3.2882779955863954, "ce_loss_2": 4.240027713775635, "ce_loss_4": 3.7827760577201843, "ce_loss_9": 3.448138189315796, "epoch": 0.253, "grad_norm": 788.0, "kl_loss_13": 88.29233741760254, "kl_loss_2": 2056.710101318359, "kl_loss_4": 1113.5805114746095, "kl_loss_9": 336.41419525146483, "learning_rate": 0.0008585659023794818, "loss": 924.9172, "step": 2530 }, { "ce_loss_13": 3.2948466777801513, "ce_loss_17": 3.247093677520752, "ce_loss_2": 4.2667999267578125, "ce_loss_4": 3.7884334683418275, "ce_loss_9": 3.4190890908241274, "epoch": 0.254, "grad_norm": 836.0, "kl_loss_13": 88.6147274017334, "kl_loss_2": 2149.3486206054686, "kl_loss_4": 1175.339892578125, "kl_loss_9": 347.41344451904297, "learning_rate": 0.0008574582938349817, "loss": 931.8402, "step": 2540 }, { "ce_loss_13": 3.27849794626236, "ce_loss_17": 3.228435230255127, "ce_loss_2": 4.23468028306961, "ce_loss_4": 3.780582332611084, "ce_loss_9": 3.4099076628684997, "epoch": 0.255, "grad_norm": 684.0, "kl_loss_13": 90.46429252624512, "kl_loss_2": 2132.9173889160156, "kl_loss_4": 1190.2234680175782, "kl_loss_9": 354.2894012451172, "learning_rate": 0.0008563470856894315, "loss": 912.2641, "step": 2550 }, { "ce_loss_13": 3.276999664306641, "ce_loss_17": 3.231548249721527, "ce_loss_2": 4.2210803627967834, "ce_loss_4": 3.7623754858970644, "ce_loss_9": 3.401135301589966, "epoch": 0.256, "grad_norm": 880.0, "kl_loss_13": 85.1612335205078, "kl_loss_2": 2104.3537841796874, "kl_loss_4": 1163.8737365722657, "kl_loss_9": 342.23360748291014, "learning_rate": 0.0008552322891326845, "loss": 921.068, "step": 2560 }, { "ce_loss_13": 3.2485652446746824, "ce_loss_17": 3.199524128437042, "ce_loss_2": 4.210166561603546, "ce_loss_4": 3.7297805309295655, "ce_loss_9": 3.370358681678772, "epoch": 0.257, "grad_norm": 728.0, "kl_loss_13": 85.70927581787109, "kl_loss_2": 2145.8478149414063, "kl_loss_4": 1164.0143463134766, "kl_loss_9": 339.1722915649414, "learning_rate": 0.0008541139153907296, "loss": 913.6647, "step": 2570 }, { "ce_loss_13": 3.207658565044403, "ce_loss_17": 3.1607483625411987, "ce_loss_2": 4.142305660247803, "ce_loss_4": 3.683729314804077, "ce_loss_9": 3.3294119954109194, "epoch": 0.258, "grad_norm": 856.0, "kl_loss_13": 83.65748710632325, "kl_loss_2": 2100.0630798339844, "kl_loss_4": 1151.6668060302734, "kl_loss_9": 331.9220596313477, "learning_rate": 0.0008529919757255782, "loss": 923.8511, "step": 2580 }, { "ce_loss_13": 3.2459914326667785, "ce_loss_17": 3.2003490686416627, "ce_loss_2": 4.13244149684906, "ce_loss_4": 3.6941211462020873, "ce_loss_9": 3.3592882633209227, "epoch": 0.259, "grad_norm": 684.0, "kl_loss_13": 83.24060859680176, "kl_loss_2": 2014.1521484375, "kl_loss_4": 1104.7504852294921, "kl_loss_9": 324.8483596801758, "learning_rate": 0.0008518664814351503, "loss": 889.9527, "step": 2590 }, { "ce_loss_13": 3.2060733318328856, "ce_loss_17": 3.159405970573425, "ce_loss_2": 4.157498776912689, "ce_loss_4": 3.696187674999237, "ce_loss_9": 3.331186580657959, "epoch": 0.26, "grad_norm": 780.0, "kl_loss_13": 86.2954875946045, "kl_loss_2": 2130.951513671875, "kl_loss_4": 1179.1319793701173, "kl_loss_9": 346.7232162475586, "learning_rate": 0.0008507374438531607, "loss": 958.9413, "step": 2600 }, { "ce_loss_13": 3.1857144951820375, "ce_loss_17": 3.1399152994155886, "ce_loss_2": 4.121321547031402, "ce_loss_4": 3.6632447361946108, "ce_loss_9": 3.3082396030426025, "epoch": 0.261, "grad_norm": 592.0, "kl_loss_13": 84.55880928039551, "kl_loss_2": 2090.511358642578, "kl_loss_4": 1147.8235229492188, "kl_loss_9": 337.9015914916992, "learning_rate": 0.0008496048743490053, "loss": 911.0207, "step": 2610 }, { "ce_loss_13": 3.3373757362365724, "ce_loss_17": 3.2904435873031614, "ce_loss_2": 4.235588300228119, "ce_loss_4": 3.7929789900779722, "ce_loss_9": 3.458786392211914, "epoch": 0.262, "grad_norm": 788.0, "kl_loss_13": 86.32050971984863, "kl_loss_2": 2027.3483154296875, "kl_loss_4": 1120.2023498535157, "kl_loss_9": 338.79857177734374, "learning_rate": 0.0008484687843276469, "loss": 902.0202, "step": 2620 }, { "ce_loss_13": 3.26626056432724, "ce_loss_17": 3.219456911087036, "ce_loss_2": 4.196949517726898, "ce_loss_4": 3.740860116481781, "ce_loss_9": 3.390087342262268, "epoch": 0.263, "grad_norm": 776.0, "kl_loss_13": 86.06944999694824, "kl_loss_2": 2084.330389404297, "kl_loss_4": 1144.0339233398438, "kl_loss_9": 347.79761810302733, "learning_rate": 0.0008473291852294987, "loss": 932.2067, "step": 2630 }, { "ce_loss_13": 3.275907301902771, "ce_loss_17": 3.2278727412223818, "ce_loss_2": 4.200026178359986, "ce_loss_4": 3.751169514656067, "ce_loss_9": 3.400949764251709, "epoch": 0.264, "grad_norm": 800.0, "kl_loss_13": 86.0791404724121, "kl_loss_2": 2092.987713623047, "kl_loss_4": 1152.6259460449219, "kl_loss_9": 343.6297607421875, "learning_rate": 0.0008461860885303114, "loss": 909.5027, "step": 2640 }, { "ce_loss_13": 3.3062209129333495, "ce_loss_17": 3.2608874440193176, "ce_loss_2": 4.206380689144135, "ce_loss_4": 3.7589853048324584, "ce_loss_9": 3.4244667410850527, "epoch": 0.265, "grad_norm": 732.0, "kl_loss_13": 85.16450424194336, "kl_loss_2": 2021.2992797851562, "kl_loss_4": 1111.3934143066406, "kl_loss_9": 331.31748352050784, "learning_rate": 0.000845039505741056, "loss": 905.8066, "step": 2650 }, { "ce_loss_13": 3.285935068130493, "ce_loss_17": 3.2384222030639647, "ce_loss_2": 4.207594358921051, "ce_loss_4": 3.7571651339530945, "ce_loss_9": 3.406543481349945, "epoch": 0.266, "grad_norm": 836.0, "kl_loss_13": 88.26923942565918, "kl_loss_2": 2102.5131591796876, "kl_loss_4": 1163.2055114746095, "kl_loss_9": 345.05494384765626, "learning_rate": 0.0008438894484078086, "loss": 944.8143, "step": 2660 }, { "ce_loss_13": 3.2971001744270323, "ce_loss_17": 3.248241627216339, "ce_loss_2": 4.204394841194153, "ce_loss_4": 3.7537492752075194, "ce_loss_9": 3.414425790309906, "epoch": 0.267, "grad_norm": 748.0, "kl_loss_13": 87.35046119689942, "kl_loss_2": 2058.653430175781, "kl_loss_4": 1127.5207611083983, "kl_loss_9": 336.9515045166016, "learning_rate": 0.0008427359281116334, "loss": 906.4635, "step": 2670 }, { "ce_loss_13": 3.1938817858695985, "ce_loss_17": 3.1482104778289797, "ce_loss_2": 4.149834764003754, "ce_loss_4": 3.681869113445282, "ce_loss_9": 3.321103000640869, "epoch": 0.268, "grad_norm": 652.0, "kl_loss_13": 87.67700462341308, "kl_loss_2": 2123.220263671875, "kl_loss_4": 1154.3934997558595, "kl_loss_9": 337.71882781982424, "learning_rate": 0.0008415789564684673, "loss": 921.3191, "step": 2680 }, { "ce_loss_13": 3.4389868855476378, "ce_loss_17": 3.3913365483283995, "ce_loss_2": 4.32799916267395, "ce_loss_4": 3.896670234203339, "ce_loss_9": 3.5553231596946717, "epoch": 0.269, "grad_norm": 692.0, "kl_loss_13": 90.58032188415527, "kl_loss_2": 1989.9671325683594, "kl_loss_4": 1106.718344116211, "kl_loss_9": 339.4188598632812, "learning_rate": 0.0008404185451290017, "loss": 887.4835, "step": 2690 }, { "ce_loss_13": 3.3077424645423887, "ce_loss_17": 3.262969934940338, "ce_loss_2": 4.220689129829407, "ce_loss_4": 3.7679991483688355, "ce_loss_9": 3.426233208179474, "epoch": 0.27, "grad_norm": 824.0, "kl_loss_13": 86.1768726348877, "kl_loss_2": 2058.965264892578, "kl_loss_4": 1120.5839050292968, "kl_loss_9": 334.05356750488284, "learning_rate": 0.0008392547057785661, "loss": 902.1295, "step": 2700 }, { "ce_loss_13": 3.239164745807648, "ce_loss_17": 3.1922030210494996, "ce_loss_2": 4.191896677017212, "ce_loss_4": 3.7292771220207213, "ce_loss_9": 3.364377760887146, "epoch": 0.271, "grad_norm": 828.0, "kl_loss_13": 87.49733276367188, "kl_loss_2": 2164.0575744628904, "kl_loss_4": 1183.8456909179688, "kl_loss_9": 345.6566589355469, "learning_rate": 0.0008380874501370098, "loss": 906.6885, "step": 2710 }, { "ce_loss_13": 3.229522907733917, "ce_loss_17": 3.1830254077911375, "ce_loss_2": 4.188966703414917, "ce_loss_4": 3.710645842552185, "ce_loss_9": 3.3521727085113526, "epoch": 0.272, "grad_norm": 852.0, "kl_loss_13": 87.24810829162598, "kl_loss_2": 2145.7540588378906, "kl_loss_4": 1171.6363494873046, "kl_loss_9": 345.628955078125, "learning_rate": 0.0008369167899585841, "loss": 922.7014, "step": 2720 }, { "ce_loss_13": 3.3517627239227297, "ce_loss_17": 3.307138133049011, "ce_loss_2": 4.216698122024536, "ce_loss_4": 3.801157605648041, "ce_loss_9": 3.4630095601081847, "epoch": 0.273, "grad_norm": 732.0, "kl_loss_13": 85.38688659667969, "kl_loss_2": 1976.8460754394532, "kl_loss_4": 1105.086737060547, "kl_loss_9": 329.17638092041017, "learning_rate": 0.0008357427370318238, "loss": 909.2509, "step": 2730 }, { "ce_loss_13": 3.30746066570282, "ce_loss_17": 3.2615819692611696, "ce_loss_2": 4.235583579540252, "ce_loss_4": 3.773705613613129, "ce_loss_9": 3.4275959730148315, "epoch": 0.274, "grad_norm": 760.0, "kl_loss_13": 86.52706832885742, "kl_loss_2": 2090.7098693847656, "kl_loss_4": 1139.3353454589844, "kl_loss_9": 335.4715240478516, "learning_rate": 0.0008345653031794292, "loss": 915.5144, "step": 2740 }, { "ce_loss_13": 3.301362681388855, "ce_loss_17": 3.254427659511566, "ce_loss_2": 4.222723615169525, "ce_loss_4": 3.7712342381477355, "ce_loss_9": 3.42340407371521, "epoch": 0.275, "grad_norm": 824.0, "kl_loss_13": 86.17152290344238, "kl_loss_2": 2044.7248291015626, "kl_loss_4": 1119.8676971435548, "kl_loss_9": 336.4943618774414, "learning_rate": 0.0008333845002581458, "loss": 901.5023, "step": 2750 }, { "ce_loss_13": 3.233277463912964, "ce_loss_17": 3.186070966720581, "ce_loss_2": 4.180034554004669, "ce_loss_4": 3.7184282660484316, "ce_loss_9": 3.3558387875556948, "epoch": 0.276, "grad_norm": 800.0, "kl_loss_13": 87.02991333007813, "kl_loss_2": 2143.9214416503905, "kl_loss_4": 1183.9332061767577, "kl_loss_9": 343.59161682128905, "learning_rate": 0.0008322003401586462, "loss": 932.0782, "step": 2760 }, { "ce_loss_13": 3.2686554908752443, "ce_loss_17": 3.223686730861664, "ce_loss_2": 4.163093483448028, "ce_loss_4": 3.717913568019867, "ce_loss_9": 3.3815023064613343, "epoch": 0.277, "grad_norm": 744.0, "kl_loss_13": 84.46449165344238, "kl_loss_2": 2016.767022705078, "kl_loss_4": 1100.048599243164, "kl_loss_9": 325.1136108398438, "learning_rate": 0.0008310128348054094, "loss": 873.1914, "step": 2770 }, { "ce_loss_13": 3.2334190130233766, "ce_loss_17": 3.1884844064712525, "ce_loss_2": 4.153000319004059, "ce_loss_4": 3.696218252182007, "ce_loss_9": 3.3487244963645937, "epoch": 0.278, "grad_norm": 896.0, "kl_loss_13": 84.94119606018066, "kl_loss_2": 2061.5789306640627, "kl_loss_4": 1125.7439544677734, "kl_loss_9": 335.49708251953126, "learning_rate": 0.0008298219961566008, "loss": 900.3799, "step": 2780 }, { "ce_loss_13": 3.2026249885559084, "ce_loss_17": 3.1564487338066103, "ce_loss_2": 4.1475555300712585, "ce_loss_4": 3.6848009705543516, "ce_loss_9": 3.322298550605774, "epoch": 0.279, "grad_norm": 736.0, "kl_loss_13": 85.35611419677734, "kl_loss_2": 2141.4679565429688, "kl_loss_4": 1182.722882080078, "kl_loss_9": 340.7112503051758, "learning_rate": 0.0008286278362039527, "loss": 908.9242, "step": 2790 }, { "ce_loss_13": 3.2312260270118713, "ce_loss_17": 3.185028612613678, "ce_loss_2": 4.187989091873169, "ce_loss_4": 3.719260597229004, "ce_loss_9": 3.346155607700348, "epoch": 0.28, "grad_norm": 776.0, "kl_loss_13": 86.70144271850586, "kl_loss_2": 2161.746417236328, "kl_loss_4": 1184.7936767578126, "kl_loss_9": 335.33875274658203, "learning_rate": 0.0008274303669726426, "loss": 907.5166, "step": 2800 }, { "ce_loss_13": 3.139736700057983, "ce_loss_17": 3.08803471326828, "ce_loss_2": 4.11551411151886, "ce_loss_4": 3.6313345670700072, "ce_loss_9": 3.25755854845047, "epoch": 0.281, "grad_norm": 864.0, "kl_loss_13": 90.18624916076661, "kl_loss_2": 2182.975939941406, "kl_loss_4": 1170.9007141113282, "kl_loss_9": 333.8880386352539, "learning_rate": 0.0008262296005211721, "loss": 907.3379, "step": 2810 }, { "ce_loss_13": 3.263656198978424, "ce_loss_17": 3.214433252811432, "ce_loss_2": 4.202375113964081, "ce_loss_4": 3.741171956062317, "ce_loss_9": 3.38106654882431, "epoch": 0.282, "grad_norm": 752.0, "kl_loss_13": 89.46879920959472, "kl_loss_2": 2107.242321777344, "kl_loss_4": 1156.6708587646485, "kl_loss_9": 336.96667022705077, "learning_rate": 0.0008250255489412463, "loss": 906.6707, "step": 2820 }, { "ce_loss_13": 3.363376832008362, "ce_loss_17": 3.3147770762443542, "ce_loss_2": 4.277840995788575, "ce_loss_4": 3.8263082027435305, "ce_loss_9": 3.477301073074341, "epoch": 0.283, "grad_norm": 920.0, "kl_loss_13": 89.64902610778809, "kl_loss_2": 2062.3288635253907, "kl_loss_4": 1124.7566528320312, "kl_loss_9": 328.69190521240233, "learning_rate": 0.0008238182243576511, "loss": 903.5663, "step": 2830 }, { "ce_loss_13": 3.3324272274971007, "ce_loss_17": 3.2846037268638613, "ce_loss_2": 4.176836204528809, "ce_loss_4": 3.761064279079437, "ce_loss_9": 3.4376333355903625, "epoch": 0.284, "grad_norm": 728.0, "kl_loss_13": 88.71090469360351, "kl_loss_2": 1930.1790100097655, "kl_loss_4": 1080.3647155761719, "kl_loss_9": 321.2514114379883, "learning_rate": 0.0008226076389281315, "loss": 872.3219, "step": 2840 }, { "ce_loss_13": 3.376649188995361, "ce_loss_17": 3.328803825378418, "ce_loss_2": 4.255143117904663, "ce_loss_4": 3.813234579563141, "ce_loss_9": 3.4860986709594726, "epoch": 0.285, "grad_norm": 796.0, "kl_loss_13": 91.1908805847168, "kl_loss_2": 2033.793310546875, "kl_loss_4": 1108.385009765625, "kl_loss_9": 328.55289459228516, "learning_rate": 0.0008213938048432696, "loss": 877.8479, "step": 2850 }, { "ce_loss_13": 3.2986589074134827, "ce_loss_17": 3.248919093608856, "ce_loss_2": 4.190428733825684, "ce_loss_4": 3.7510254859924315, "ce_loss_9": 3.4153574228286745, "epoch": 0.286, "grad_norm": 928.0, "kl_loss_13": 90.48378639221191, "kl_loss_2": 2016.7945983886718, "kl_loss_4": 1100.9656066894531, "kl_loss_9": 332.1938949584961, "learning_rate": 0.0008201767343263612, "loss": 895.5659, "step": 2860 }, { "ce_loss_13": 3.240786147117615, "ce_loss_17": 3.1924198985099794, "ce_loss_2": 4.166657328605652, "ce_loss_4": 3.718879294395447, "ce_loss_9": 3.3594149589538573, "epoch": 0.287, "grad_norm": 940.0, "kl_loss_13": 87.9628921508789, "kl_loss_2": 2091.728826904297, "kl_loss_4": 1153.0123413085937, "kl_loss_9": 334.0530242919922, "learning_rate": 0.0008189564396332927, "loss": 880.5472, "step": 2870 }, { "ce_loss_13": 3.223262095451355, "ce_loss_17": 3.17556711435318, "ce_loss_2": 4.1609018564224245, "ce_loss_4": 3.69830641746521, "ce_loss_9": 3.3397215843200683, "epoch": 0.288, "grad_norm": 692.0, "kl_loss_13": 86.13037757873535, "kl_loss_2": 2080.58564453125, "kl_loss_4": 1139.816616821289, "kl_loss_9": 329.157487487793, "learning_rate": 0.0008177329330524181, "loss": 909.0221, "step": 2880 }, { "ce_loss_13": 3.283345627784729, "ce_loss_17": 3.2363038182258608, "ce_loss_2": 4.173475062847137, "ce_loss_4": 3.7359678983688354, "ce_loss_9": 3.396084928512573, "epoch": 0.289, "grad_norm": 772.0, "kl_loss_13": 84.94896850585937, "kl_loss_2": 2006.2187255859376, "kl_loss_4": 1102.736669921875, "kl_loss_9": 324.60801391601564, "learning_rate": 0.0008165062269044352, "loss": 888.0768, "step": 2890 }, { "ce_loss_13": 3.231962251663208, "ce_loss_17": 3.1847190499305724, "ce_loss_2": 4.167845821380615, "ce_loss_4": 3.69975209236145, "ce_loss_9": 3.3504262447357176, "epoch": 0.29, "grad_norm": 828.0, "kl_loss_13": 85.72283172607422, "kl_loss_2": 2110.886083984375, "kl_loss_4": 1136.065966796875, "kl_loss_9": 334.0262481689453, "learning_rate": 0.0008152763335422613, "loss": 916.1252, "step": 2900 }, { "ce_loss_13": 3.222399044036865, "ce_loss_17": 3.1736102938652038, "ce_loss_2": 4.142154896259308, "ce_loss_4": 3.690603697299957, "ce_loss_9": 3.3418049931526186, "epoch": 0.291, "grad_norm": 888.0, "kl_loss_13": 86.23341636657715, "kl_loss_2": 2069.0516357421875, "kl_loss_4": 1140.7152221679687, "kl_loss_9": 336.0206954956055, "learning_rate": 0.0008140432653509088, "loss": 895.5287, "step": 2910 }, { "ce_loss_13": 3.27564138174057, "ce_loss_17": 3.228243827819824, "ce_loss_2": 4.174822735786438, "ce_loss_4": 3.733404290676117, "ce_loss_9": 3.393027651309967, "epoch": 0.292, "grad_norm": 752.0, "kl_loss_13": 86.92860412597656, "kl_loss_2": 2049.1183898925783, "kl_loss_4": 1127.7726440429688, "kl_loss_9": 341.3429443359375, "learning_rate": 0.0008128070347473608, "loss": 889.8033, "step": 2920 }, { "ce_loss_13": 3.2838281273841856, "ce_loss_17": 3.238075816631317, "ce_loss_2": 4.215974903106689, "ce_loss_4": 3.7511423945426943, "ce_loss_9": 3.4039663076400757, "epoch": 0.293, "grad_norm": 724.0, "kl_loss_13": 87.57083511352539, "kl_loss_2": 2116.1800048828127, "kl_loss_4": 1153.1954772949218, "kl_loss_9": 348.8721435546875, "learning_rate": 0.0008115676541804455, "loss": 910.3477, "step": 2930 }, { "ce_loss_13": 3.2814828038215635, "ce_loss_17": 3.235085892677307, "ce_loss_2": 4.168118190765381, "ce_loss_4": 3.734774100780487, "ce_loss_9": 3.3973249793052673, "epoch": 0.294, "grad_norm": 756.0, "kl_loss_13": 86.96043701171875, "kl_loss_2": 2015.842791748047, "kl_loss_4": 1110.1989379882812, "kl_loss_9": 337.41949768066405, "learning_rate": 0.0008103251361307119, "loss": 902.8018, "step": 2940 }, { "ce_loss_13": 3.3166361451148987, "ce_loss_17": 3.2672725558280944, "ce_loss_2": 4.226753163337707, "ce_loss_4": 3.778488886356354, "ce_loss_9": 3.4341904520988464, "epoch": 0.295, "grad_norm": 792.0, "kl_loss_13": 88.19808197021484, "kl_loss_2": 2055.312109375, "kl_loss_4": 1131.3970184326172, "kl_loss_9": 334.36364898681643, "learning_rate": 0.0008090794931103026, "loss": 892.6072, "step": 2950 }, { "ce_loss_13": 3.296609079837799, "ce_loss_17": 3.2501901030540465, "ce_loss_2": 4.189459836483001, "ce_loss_4": 3.748788833618164, "ce_loss_9": 3.412396025657654, "epoch": 0.296, "grad_norm": 688.0, "kl_loss_13": 84.80472068786621, "kl_loss_2": 1989.0398803710937, "kl_loss_4": 1095.225326538086, "kl_loss_9": 323.63614959716796, "learning_rate": 0.0008078307376628291, "loss": 883.5747, "step": 2960 }, { "ce_loss_13": 3.3625754952430724, "ce_loss_17": 3.3169750809669494, "ce_loss_2": 4.21538405418396, "ce_loss_4": 3.798343324661255, "ce_loss_9": 3.4755647778511047, "epoch": 0.297, "grad_norm": 912.0, "kl_loss_13": 85.28404312133789, "kl_loss_2": 1934.4125427246095, "kl_loss_4": 1069.026303100586, "kl_loss_9": 319.52669219970704, "learning_rate": 0.000806578882363245, "loss": 857.3016, "step": 2970 }, { "ce_loss_13": 3.2705488443374633, "ce_loss_17": 3.2260661005973814, "ce_loss_2": 4.157830369472504, "ce_loss_4": 3.72437344789505, "ce_loss_9": 3.38711701631546, "epoch": 0.298, "grad_norm": 952.0, "kl_loss_13": 84.83495826721192, "kl_loss_2": 2000.8730346679688, "kl_loss_4": 1108.6398498535157, "kl_loss_9": 326.1505355834961, "learning_rate": 0.0008053239398177191, "loss": 903.929, "step": 2980 }, { "ce_loss_13": 3.263384389877319, "ce_loss_17": 3.2167693614959716, "ce_loss_2": 4.172792506217957, "ce_loss_4": 3.720265972614288, "ce_loss_9": 3.373223125934601, "epoch": 0.299, "grad_norm": 860.0, "kl_loss_13": 86.03630332946777, "kl_loss_2": 2041.31611328125, "kl_loss_4": 1108.6524291992187, "kl_loss_9": 325.85936584472654, "learning_rate": 0.0008040659226635089, "loss": 910.0854, "step": 2990 }, { "ce_loss_13": 3.385720467567444, "ce_loss_17": 3.3382191181182863, "ce_loss_2": 4.266795706748963, "ce_loss_4": 3.8300618648529055, "ce_loss_9": 3.507095968723297, "epoch": 0.3, "grad_norm": 820.0, "kl_loss_13": 89.02688827514649, "kl_loss_2": 2004.2498901367187, "kl_loss_4": 1094.0719848632812, "kl_loss_9": 335.1713592529297, "learning_rate": 0.0008028048435688333, "loss": 878.5109, "step": 3000 }, { "ce_loss_13": 3.257288599014282, "ce_loss_17": 3.2109145641326906, "ce_loss_2": 4.18526531457901, "ce_loss_4": 3.730650246143341, "ce_loss_9": 3.3745195031166078, "epoch": 0.301, "grad_norm": 864.0, "kl_loss_13": 84.54983787536621, "kl_loss_2": 2080.8784423828124, "kl_loss_4": 1137.5463317871095, "kl_loss_9": 330.99841461181643, "learning_rate": 0.0008015407152327448, "loss": 897.2885, "step": 3010 }, { "ce_loss_13": 3.30250426530838, "ce_loss_17": 3.255344307422638, "ce_loss_2": 4.208631324768066, "ce_loss_4": 3.7579152584075928, "ce_loss_9": 3.418510675430298, "epoch": 0.302, "grad_norm": 852.0, "kl_loss_13": 85.88929748535156, "kl_loss_2": 2057.7919921875, "kl_loss_4": 1116.4963287353517, "kl_loss_9": 330.4980941772461, "learning_rate": 0.0008002735503850016, "loss": 898.668, "step": 3020 }, { "ce_loss_13": 3.2031028985977175, "ce_loss_17": 3.1481196999549867, "ce_loss_2": 4.1365126967430115, "ce_loss_4": 3.6649728655815124, "ce_loss_9": 3.31855708360672, "epoch": 0.303, "grad_norm": 680.0, "kl_loss_13": 95.30129356384278, "kl_loss_2": 2118.992889404297, "kl_loss_4": 1142.1922302246094, "kl_loss_9": 337.14517364501955, "learning_rate": 0.0007990033617859396, "loss": 915.553, "step": 3030 }, { "ce_loss_13": 3.2547006607055664, "ce_loss_17": 3.2032246708869936, "ce_loss_2": 4.142168891429901, "ce_loss_4": 3.6976324796676634, "ce_loss_9": 3.3654952168464662, "epoch": 0.304, "grad_norm": 720.0, "kl_loss_13": 95.32271614074708, "kl_loss_2": 2014.362725830078, "kl_loss_4": 1092.9575225830079, "kl_loss_9": 326.83533630371096, "learning_rate": 0.000797730162226344, "loss": 867.2573, "step": 3040 }, { "ce_loss_13": 3.2801309704780577, "ce_loss_17": 3.227008855342865, "ce_loss_2": 4.175163698196411, "ce_loss_4": 3.7368531465530395, "ce_loss_9": 3.3903521060943604, "epoch": 0.305, "grad_norm": 824.0, "kl_loss_13": 94.85666313171387, "kl_loss_2": 2027.4653747558593, "kl_loss_4": 1120.939682006836, "kl_loss_9": 327.00482788085935, "learning_rate": 0.0007964539645273203, "loss": 881.1213, "step": 3050 }, { "ce_loss_13": 3.2920580983161924, "ce_loss_17": 3.241214370727539, "ce_loss_2": 4.164921832084656, "ce_loss_4": 3.726666307449341, "ce_loss_9": 3.396126222610474, "epoch": 0.306, "grad_norm": 748.0, "kl_loss_13": 87.54969520568848, "kl_loss_2": 1971.8020629882812, "kl_loss_4": 1079.1253387451172, "kl_loss_9": 318.69213104248047, "learning_rate": 0.000795174781540165, "loss": 880.1559, "step": 3060 }, { "ce_loss_13": 3.363193917274475, "ce_loss_17": 3.311252546310425, "ce_loss_2": 4.219256675243377, "ce_loss_4": 3.792584311962128, "ce_loss_9": 3.466492402553558, "epoch": 0.307, "grad_norm": 668.0, "kl_loss_13": 94.67599182128906, "kl_loss_2": 1946.3895568847656, "kl_loss_4": 1068.7306396484375, "kl_loss_9": 316.1515609741211, "learning_rate": 0.0007938926261462366, "loss": 880.2222, "step": 3070 }, { "ce_loss_13": 3.311126208305359, "ce_loss_17": 3.261291027069092, "ce_loss_2": 4.170499920845032, "ce_loss_4": 3.73604336977005, "ce_loss_9": 3.4188188672065736, "epoch": 0.308, "grad_norm": 972.0, "kl_loss_13": 91.15548439025879, "kl_loss_2": 1990.57578125, "kl_loss_4": 1087.4571441650392, "kl_loss_9": 321.47308959960935, "learning_rate": 0.0007926075112568258, "loss": 890.7071, "step": 3080 }, { "ce_loss_13": 3.3046496868133546, "ce_loss_17": 3.2562206268310545, "ce_loss_2": 4.18972373008728, "ce_loss_4": 3.7526055812835692, "ce_loss_9": 3.417297029495239, "epoch": 0.309, "grad_norm": 776.0, "kl_loss_13": 90.14528007507325, "kl_loss_2": 2015.0728637695313, "kl_loss_4": 1102.9831237792969, "kl_loss_9": 323.41277465820315, "learning_rate": 0.0007913194498130252, "loss": 870.3509, "step": 3090 }, { "ce_loss_13": 3.2310577392578126, "ce_loss_17": 3.1834172368049622, "ce_loss_2": 4.143940055370331, "ce_loss_4": 3.6971134185791015, "ce_loss_9": 3.347603809833527, "epoch": 0.31, "grad_norm": 736.0, "kl_loss_13": 88.47995147705078, "kl_loss_2": 2047.0616760253906, "kl_loss_4": 1117.8453430175782, "kl_loss_9": 326.5718460083008, "learning_rate": 0.0007900284547855992, "loss": 898.5285, "step": 3100 }, { "ce_loss_13": 3.247792375087738, "ce_loss_17": 3.2005126953125, "ce_loss_2": 4.1182458877563475, "ce_loss_4": 3.6890514135360717, "ce_loss_9": 3.359893488883972, "epoch": 0.311, "grad_norm": 688.0, "kl_loss_13": 87.62182350158692, "kl_loss_2": 1989.3281494140624, "kl_loss_4": 1094.8817047119142, "kl_loss_9": 322.1224609375, "learning_rate": 0.0007887345391748532, "loss": 891.0457, "step": 3110 }, { "ce_loss_13": 3.3665268778800965, "ce_loss_17": 3.3208903670310974, "ce_loss_2": 4.215417850017547, "ce_loss_4": 3.7931267857551574, "ce_loss_9": 3.4738640189170837, "epoch": 0.312, "grad_norm": 816.0, "kl_loss_13": 87.93111915588379, "kl_loss_2": 1944.576806640625, "kl_loss_4": 1068.2371887207032, "kl_loss_9": 314.94420928955077, "learning_rate": 0.0007874377160105036, "loss": 852.7338, "step": 3120 }, { "ce_loss_13": 3.266544818878174, "ce_loss_17": 3.2198745250701903, "ce_loss_2": 4.183292400836945, "ce_loss_4": 3.7293641209602355, "ce_loss_9": 3.3768165946006774, "epoch": 0.313, "grad_norm": 812.0, "kl_loss_13": 92.22348403930664, "kl_loss_2": 2072.977648925781, "kl_loss_4": 1130.5628204345703, "kl_loss_9": 317.1712127685547, "learning_rate": 0.0007861379983515449, "loss": 918.9357, "step": 3130 }, { "ce_loss_13": 3.346944522857666, "ce_loss_17": 3.3014272809028626, "ce_loss_2": 4.223881769180298, "ce_loss_4": 3.801562249660492, "ce_loss_9": 3.460020422935486, "epoch": 0.314, "grad_norm": 1004.0, "kl_loss_13": 87.69670944213867, "kl_loss_2": 2007.4992248535157, "kl_loss_4": 1115.0575561523438, "kl_loss_9": 323.3450332641602, "learning_rate": 0.0007848353992861195, "loss": 879.2082, "step": 3140 }, { "ce_loss_13": 3.426660692691803, "ce_loss_17": 3.37130469083786, "ce_loss_2": 4.312441456317901, "ce_loss_4": 3.8890052437782288, "ce_loss_9": 3.546935868263245, "epoch": 0.315, "grad_norm": 824.0, "kl_loss_13": 94.69918479919434, "kl_loss_2": 1997.0293395996093, "kl_loss_4": 1118.933282470703, "kl_loss_9": 335.3253112792969, "learning_rate": 0.0007835299319313853, "loss": 891.7405, "step": 3150 }, { "ce_loss_13": 3.3141162276268004, "ce_loss_17": 3.266682839393616, "ce_loss_2": 4.179789280891418, "ce_loss_4": 3.750796389579773, "ce_loss_9": 3.4229891777038572, "epoch": 0.316, "grad_norm": 852.0, "kl_loss_13": 88.69958419799805, "kl_loss_2": 1990.604473876953, "kl_loss_4": 1091.772930908203, "kl_loss_9": 319.4409896850586, "learning_rate": 0.0007822216094333848, "loss": 903.952, "step": 3160 }, { "ce_loss_13": 3.3159342288970945, "ce_loss_17": 3.266482150554657, "ce_loss_2": 4.214651012420655, "ce_loss_4": 3.7684731245040894, "ce_loss_9": 3.425386905670166, "epoch": 0.317, "grad_norm": 828.0, "kl_loss_13": 89.16406593322753, "kl_loss_2": 2032.8277465820313, "kl_loss_4": 1108.2530944824218, "kl_loss_9": 323.86759490966796, "learning_rate": 0.0007809104449669101, "loss": 878.7519, "step": 3170 }, { "ce_loss_13": 3.270819342136383, "ce_loss_17": 3.223061752319336, "ce_loss_2": 4.13521009683609, "ce_loss_4": 3.7123499155044555, "ce_loss_9": 3.377288591861725, "epoch": 0.318, "grad_norm": 864.0, "kl_loss_13": 85.74185523986816, "kl_loss_2": 1959.4578186035155, "kl_loss_4": 1078.3698852539062, "kl_loss_9": 318.6862518310547, "learning_rate": 0.0007795964517353734, "loss": 862.148, "step": 3180 }, { "ce_loss_13": 3.2628143787384034, "ce_loss_17": 3.218585216999054, "ce_loss_2": 4.16016184091568, "ce_loss_4": 3.711937928199768, "ce_loss_9": 3.3730796337127686, "epoch": 0.319, "grad_norm": 680.0, "kl_loss_13": 86.5244125366211, "kl_loss_2": 2051.4399475097657, "kl_loss_4": 1119.6945068359375, "kl_loss_9": 325.8485305786133, "learning_rate": 0.000778279642970672, "loss": 867.9717, "step": 3190 }, { "ce_loss_13": 3.2614962220191956, "ce_loss_17": 3.216829836368561, "ce_loss_2": 4.138477158546448, "ce_loss_4": 3.704724645614624, "ce_loss_9": 3.3741748690605164, "epoch": 0.32, "grad_norm": 1112.0, "kl_loss_13": 84.40923233032227, "kl_loss_2": 1991.6298278808595, "kl_loss_4": 1090.3296936035156, "kl_loss_9": 322.5443389892578, "learning_rate": 0.0007769600319330552, "loss": 859.8841, "step": 3200 }, { "ce_loss_13": 3.288937306404114, "ce_loss_17": 3.2442583322525023, "ce_loss_2": 4.222093415260315, "ce_loss_4": 3.7567214012145995, "ce_loss_9": 3.4035842776298524, "epoch": 0.321, "grad_norm": 1088.0, "kl_loss_13": 83.93023834228515, "kl_loss_2": 2080.5573974609374, "kl_loss_4": 1127.0830047607421, "kl_loss_9": 325.4534255981445, "learning_rate": 0.0007756376319109917, "loss": 887.2564, "step": 3210 }, { "ce_loss_13": 3.339513421058655, "ce_loss_17": 3.2954869866371155, "ce_loss_2": 4.208236837387085, "ce_loss_4": 3.7821726322174074, "ce_loss_9": 3.451899802684784, "epoch": 0.322, "grad_norm": 936.0, "kl_loss_13": 82.67616577148438, "kl_loss_2": 1983.3537475585938, "kl_loss_4": 1092.1045135498048, "kl_loss_9": 322.47215576171874, "learning_rate": 0.0007743124562210351, "loss": 852.4111, "step": 3220 }, { "ce_loss_13": 3.350459098815918, "ce_loss_17": 3.3049745678901674, "ce_loss_2": 4.223370480537414, "ce_loss_4": 3.792885971069336, "ce_loss_9": 3.4620816826820375, "epoch": 0.323, "grad_norm": 952.0, "kl_loss_13": 84.83108139038086, "kl_loss_2": 1979.5841796875, "kl_loss_4": 1089.1873504638672, "kl_loss_9": 320.7507629394531, "learning_rate": 0.0007729845182076895, "loss": 872.6604, "step": 3230 }, { "ce_loss_13": 3.280298352241516, "ce_loss_17": 3.239156460762024, "ce_loss_2": 4.138239550590515, "ce_loss_4": 3.7194194436073302, "ce_loss_9": 3.3929233312606812, "epoch": 0.324, "grad_norm": 848.0, "kl_loss_13": 82.10867538452149, "kl_loss_2": 1946.6383728027345, "kl_loss_4": 1077.661312866211, "kl_loss_9": 320.58251495361327, "learning_rate": 0.0007716538312432765, "loss": 880.8723, "step": 3240 }, { "ce_loss_13": 3.242268109321594, "ce_loss_17": 3.1970906615257264, "ce_loss_2": 4.153493225574493, "ce_loss_4": 3.6985382556915285, "ce_loss_9": 3.3589555382728578, "epoch": 0.325, "grad_norm": 744.0, "kl_loss_13": 83.0129451751709, "kl_loss_2": 2044.7987487792968, "kl_loss_4": 1114.1810821533204, "kl_loss_9": 324.7056488037109, "learning_rate": 0.0007703204087277988, "loss": 886.2568, "step": 3250 }, { "ce_loss_13": 3.340661096572876, "ce_loss_17": 3.2950722336769105, "ce_loss_2": 4.182095468044281, "ce_loss_4": 3.757711577415466, "ce_loss_9": 3.44431414604187, "epoch": 0.326, "grad_norm": 740.0, "kl_loss_13": 81.64471549987793, "kl_loss_2": 1907.3132751464843, "kl_loss_4": 1040.2453918457031, "kl_loss_9": 309.3230773925781, "learning_rate": 0.0007689842640888063, "loss": 847.0235, "step": 3260 }, { "ce_loss_13": 3.3351841807365417, "ce_loss_17": 3.2892799735069276, "ce_loss_2": 4.196241044998169, "ce_loss_4": 3.774498987197876, "ce_loss_9": 3.448796498775482, "epoch": 0.327, "grad_norm": 768.0, "kl_loss_13": 83.10033264160157, "kl_loss_2": 1935.7306579589845, "kl_loss_4": 1063.3290313720704, "kl_loss_9": 319.72888641357423, "learning_rate": 0.0007676454107812607, "loss": 860.6641, "step": 3270 }, { "ce_loss_13": 3.27805734872818, "ce_loss_17": 3.2325761914253235, "ce_loss_2": 4.163552284240723, "ce_loss_4": 3.7228221774101256, "ce_loss_9": 3.3905189394950868, "epoch": 0.328, "grad_norm": 1200.0, "kl_loss_13": 83.2603816986084, "kl_loss_2": 2022.139111328125, "kl_loss_4": 1090.8947479248047, "kl_loss_9": 321.4178863525391, "learning_rate": 0.0007663038622873999, "loss": 865.2281, "step": 3280 }, { "ce_loss_13": 3.31582168340683, "ce_loss_17": 3.2736997604370117, "ce_loss_2": 4.193931818008423, "ce_loss_4": 3.752781319618225, "ce_loss_9": 3.42306672334671, "epoch": 0.329, "grad_norm": 772.0, "kl_loss_13": 82.19000549316407, "kl_loss_2": 1988.5528625488282, "kl_loss_4": 1076.8862548828124, "kl_loss_9": 316.0207717895508, "learning_rate": 0.0007649596321166025, "loss": 855.2568, "step": 3290 }, { "ce_loss_13": 3.2226877927780153, "ce_loss_17": 3.178584325313568, "ce_loss_2": 4.08319239616394, "ce_loss_4": 3.6656685709953307, "ce_loss_9": 3.3301095366477966, "epoch": 0.33, "grad_norm": 724.0, "kl_loss_13": 78.87243385314942, "kl_loss_2": 1926.275946044922, "kl_loss_4": 1067.525653076172, "kl_loss_9": 306.3053573608398, "learning_rate": 0.0007636127338052513, "loss": 860.6953, "step": 3300 }, { "ce_loss_13": 3.320180690288544, "ce_loss_17": 3.276493513584137, "ce_loss_2": 4.227478420734405, "ce_loss_4": 3.7749785423278808, "ce_loss_9": 3.435419762134552, "epoch": 0.331, "grad_norm": 632.0, "kl_loss_13": 84.00535354614257, "kl_loss_2": 2043.9775085449219, "kl_loss_4": 1105.5403198242188, "kl_loss_9": 321.6100845336914, "learning_rate": 0.0007622631809165971, "loss": 869.9645, "step": 3310 }, { "ce_loss_13": 3.3199275851249697, "ce_loss_17": 3.2771979689598085, "ce_loss_2": 4.15064367055893, "ce_loss_4": 3.7406180262565614, "ce_loss_9": 3.4273388624191283, "epoch": 0.332, "grad_norm": 900.0, "kl_loss_13": 78.16543731689453, "kl_loss_2": 1873.4186096191406, "kl_loss_4": 1031.5828399658203, "kl_loss_9": 303.8035720825195, "learning_rate": 0.000760910987040623, "loss": 841.3832, "step": 3320 }, { "ce_loss_13": 3.301027572154999, "ce_loss_17": 3.2573514699935915, "ce_loss_2": 4.202074742317199, "ce_loss_4": 3.752170813083649, "ce_loss_9": 3.4156262040138246, "epoch": 0.333, "grad_norm": 820.0, "kl_loss_13": 82.72586593627929, "kl_loss_2": 2041.1314575195313, "kl_loss_4": 1107.1857055664063, "kl_loss_9": 323.77696228027344, "learning_rate": 0.000759556165793906, "loss": 863.6803, "step": 3330 }, { "ce_loss_13": 3.3216323256492615, "ce_loss_17": 3.2765379667282106, "ce_loss_2": 4.196370267868042, "ce_loss_4": 3.769469165802002, "ce_loss_9": 3.4320739030838014, "epoch": 0.334, "grad_norm": 748.0, "kl_loss_13": 82.56733436584473, "kl_loss_2": 1987.8205200195312, "kl_loss_4": 1094.0633605957032, "kl_loss_9": 319.1539001464844, "learning_rate": 0.000758198730819481, "loss": 876.0232, "step": 3340 }, { "ce_loss_13": 3.2749180793762207, "ce_loss_17": 3.2325748682022093, "ce_loss_2": 4.154002678394318, "ce_loss_4": 3.724340319633484, "ce_loss_9": 3.386329698562622, "epoch": 0.335, "grad_norm": 880.0, "kl_loss_13": 80.55043601989746, "kl_loss_2": 2001.4410034179687, "kl_loss_4": 1099.623828125, "kl_loss_9": 314.9535186767578, "learning_rate": 0.0007568386957867032, "loss": 869.0264, "step": 3350 }, { "ce_loss_13": 3.3380845189094543, "ce_loss_17": 3.2921542525291443, "ce_loss_2": 4.198313045501709, "ce_loss_4": 3.776810646057129, "ce_loss_9": 3.4500930070877076, "epoch": 0.336, "grad_norm": 836.0, "kl_loss_13": 81.9162425994873, "kl_loss_2": 1956.8607299804687, "kl_loss_4": 1075.7296630859375, "kl_loss_9": 317.184814453125, "learning_rate": 0.0007554760743911103, "loss": 869.276, "step": 3360 }, { "ce_loss_13": 3.249480664730072, "ce_loss_17": 3.207807409763336, "ce_loss_2": 4.117359912395477, "ce_loss_4": 3.6858110547065737, "ce_loss_9": 3.3595375537872316, "epoch": 0.337, "grad_norm": 752.0, "kl_loss_13": 78.22098541259766, "kl_loss_2": 1978.9128662109374, "kl_loss_4": 1069.7276184082032, "kl_loss_9": 306.45787353515624, "learning_rate": 0.0007541108803542846, "loss": 883.0004, "step": 3370 }, { "ce_loss_13": 3.292482590675354, "ce_loss_17": 3.24836003780365, "ce_loss_2": 4.1632227301597595, "ce_loss_4": 3.725235164165497, "ce_loss_9": 3.399733805656433, "epoch": 0.338, "grad_norm": 900.0, "kl_loss_13": 81.92625350952149, "kl_loss_2": 1998.8309326171875, "kl_loss_4": 1077.7982391357423, "kl_loss_9": 313.54881439208987, "learning_rate": 0.0007527431274237149, "loss": 907.5791, "step": 3380 }, { "ce_loss_13": 3.2658942699432374, "ce_loss_17": 3.224719560146332, "ce_loss_2": 4.125988805294037, "ce_loss_4": 3.6935991287231444, "ce_loss_9": 3.3738891005516054, "epoch": 0.339, "grad_norm": 776.0, "kl_loss_13": 80.2432315826416, "kl_loss_2": 1971.8451171875, "kl_loss_4": 1068.139666748047, "kl_loss_9": 310.88482971191405, "learning_rate": 0.0007513728293726579, "loss": 861.7771, "step": 3390 }, { "ce_loss_13": 3.378425621986389, "ce_loss_17": 3.3328167319297792, "ce_loss_2": 4.222032153606415, "ce_loss_4": 3.809156823158264, "ce_loss_9": 3.4891692996025085, "epoch": 0.34, "grad_norm": 768.0, "kl_loss_13": 81.9792694091797, "kl_loss_2": 1940.3130737304687, "kl_loss_4": 1070.0313262939453, "kl_loss_9": 317.02344970703126, "learning_rate": 0.00075, "loss": 849.7743, "step": 3400 }, { "ce_loss_13": 3.3663819909095762, "ce_loss_17": 3.3200625658035277, "ce_loss_2": 4.24148062467575, "ce_loss_4": 3.816836142539978, "ce_loss_9": 3.4848857522010803, "epoch": 0.341, "grad_norm": 912.0, "kl_loss_13": 83.7885841369629, "kl_loss_2": 1977.0342834472656, "kl_loss_4": 1090.5568939208983, "kl_loss_9": 327.1791015625, "learning_rate": 0.0007486246531301177, "loss": 858.6299, "step": 3410 }, { "ce_loss_13": 3.1796913862228395, "ce_loss_17": 3.1363569140434264, "ce_loss_2": 4.062297952175141, "ce_loss_4": 3.633475697040558, "ce_loss_9": 3.2941571712493896, "epoch": 0.342, "grad_norm": 788.0, "kl_loss_13": 79.56804389953614, "kl_loss_2": 1977.1298095703125, "kl_loss_4": 1089.7484497070313, "kl_loss_9": 323.80016326904297, "learning_rate": 0.0007472468026127384, "loss": 850.588, "step": 3420 }, { "ce_loss_13": 3.315054738521576, "ce_loss_17": 3.2690629601478576, "ce_loss_2": 4.222481107711792, "ce_loss_4": 3.777264642715454, "ce_loss_9": 3.4370264291763304, "epoch": 0.343, "grad_norm": 836.0, "kl_loss_13": 84.76142272949218, "kl_loss_2": 2066.071649169922, "kl_loss_4": 1127.61123046875, "kl_loss_9": 338.3886428833008, "learning_rate": 0.000745866462322802, "loss": 888.6806, "step": 3430 }, { "ce_loss_13": 3.3010619044303895, "ce_loss_17": 3.2584898352622984, "ce_loss_2": 4.155515027046204, "ce_loss_4": 3.732189679145813, "ce_loss_9": 3.4116573691368104, "epoch": 0.344, "grad_norm": 792.0, "kl_loss_13": 80.68924560546876, "kl_loss_2": 1923.6429443359375, "kl_loss_4": 1055.2689880371095, "kl_loss_9": 314.93599700927734, "learning_rate": 0.0007444836461603195, "loss": 852.7298, "step": 3440 }, { "ce_loss_13": 3.3628268957138063, "ce_loss_17": 3.3158179044723513, "ce_loss_2": 4.236175012588501, "ce_loss_4": 3.80575430393219, "ce_loss_9": 3.4840613842010497, "epoch": 0.345, "grad_norm": 768.0, "kl_loss_13": 86.91317672729492, "kl_loss_2": 1995.2550659179688, "kl_loss_4": 1098.6545684814453, "kl_loss_9": 338.66367797851564, "learning_rate": 0.0007430983680502344, "loss": 884.6416, "step": 3450 }, { "ce_loss_13": 3.204190742969513, "ce_loss_17": 3.160679280757904, "ce_loss_2": 4.097968113422394, "ce_loss_4": 3.6541949272155763, "ce_loss_9": 3.3195021510124207, "epoch": 0.346, "grad_norm": 828.0, "kl_loss_13": 82.72286491394043, "kl_loss_2": 2016.9924682617188, "kl_loss_4": 1099.164813232422, "kl_loss_9": 331.8776428222656, "learning_rate": 0.0007417106419422819, "loss": 877.3746, "step": 3460 }, { "ce_loss_13": 3.3090489149093627, "ce_loss_17": 3.262884783744812, "ce_loss_2": 4.165620517730713, "ce_loss_4": 3.7416993379592896, "ce_loss_9": 3.4226615190505982, "epoch": 0.347, "grad_norm": 804.0, "kl_loss_13": 82.66714973449707, "kl_loss_2": 1941.4239135742187, "kl_loss_4": 1066.0562683105468, "kl_loss_9": 318.9803466796875, "learning_rate": 0.0007403204818108486, "loss": 867.3693, "step": 3470 }, { "ce_loss_13": 3.2859779000282288, "ce_loss_17": 3.2418403744697573, "ce_loss_2": 4.1588115215301515, "ce_loss_4": 3.7164050340652466, "ce_loss_9": 3.3979223132133485, "epoch": 0.348, "grad_norm": 660.0, "kl_loss_13": 82.70349998474121, "kl_loss_2": 2008.7912902832031, "kl_loss_4": 1082.7954498291015, "kl_loss_9": 321.50720062255857, "learning_rate": 0.0007389279016548316, "loss": 845.0812, "step": 3480 }, { "ce_loss_13": 3.2886459708213804, "ce_loss_17": 3.2422160506248474, "ce_loss_2": 4.201461863517761, "ce_loss_4": 3.7479749441146852, "ce_loss_9": 3.4051220774650575, "epoch": 0.349, "grad_norm": 812.0, "kl_loss_13": 82.32789001464843, "kl_loss_2": 2051.374365234375, "kl_loss_4": 1109.6338958740234, "kl_loss_9": 325.52098541259767, "learning_rate": 0.0007375329154974975, "loss": 881.8524, "step": 3490 }, { "ce_loss_13": 3.2568347334861754, "ce_loss_17": 3.213172948360443, "ce_loss_2": 4.103061079978943, "ce_loss_4": 3.6816585183143617, "ce_loss_9": 3.364275133609772, "epoch": 0.35, "grad_norm": 700.0, "kl_loss_13": 83.60759010314942, "kl_loss_2": 1942.3072021484375, "kl_loss_4": 1067.110647583008, "kl_loss_9": 311.56414184570315, "learning_rate": 0.0007361355373863414, "loss": 870.9309, "step": 3500 }, { "ce_loss_13": 3.2974486470222475, "ce_loss_17": 3.256460976600647, "ce_loss_2": 4.147087705135346, "ce_loss_4": 3.7272114038467405, "ce_loss_9": 3.406346929073334, "epoch": 0.351, "grad_norm": 988.0, "kl_loss_13": 78.86546592712402, "kl_loss_2": 1917.3578735351562, "kl_loss_4": 1045.9387237548829, "kl_loss_9": 310.67402801513674, "learning_rate": 0.0007347357813929454, "loss": 867.2453, "step": 3510 }, { "ce_loss_13": 3.2474450349807737, "ce_loss_17": 3.203974211215973, "ce_loss_2": 4.1023600697517395, "ce_loss_4": 3.6744820713996886, "ce_loss_9": 3.3537165760993957, "epoch": 0.352, "grad_norm": 924.0, "kl_loss_13": 79.74258193969726, "kl_loss_2": 1930.6052978515625, "kl_loss_4": 1048.7113800048828, "kl_loss_9": 309.16983489990236, "learning_rate": 0.0007333336616128369, "loss": 867.3182, "step": 3520 }, { "ce_loss_13": 3.224319100379944, "ce_loss_17": 3.177726924419403, "ce_loss_2": 4.1144388794898985, "ce_loss_4": 3.667049491405487, "ce_loss_9": 3.3337238669395446, "epoch": 0.353, "grad_norm": 852.0, "kl_loss_13": 81.09270210266114, "kl_loss_2": 1998.5930908203125, "kl_loss_4": 1079.8505889892579, "kl_loss_9": 319.79501190185545, "learning_rate": 0.0007319291921653463, "loss": 869.1441, "step": 3530 }, { "ce_loss_13": 3.306344509124756, "ce_loss_17": 3.260231626033783, "ce_loss_2": 4.194766342639923, "ce_loss_4": 3.753794813156128, "ce_loss_9": 3.4233739256858824, "epoch": 0.354, "grad_norm": 884.0, "kl_loss_13": 83.32369499206543, "kl_loss_2": 1991.9574462890625, "kl_loss_4": 1089.6318115234376, "kl_loss_9": 322.709977722168, "learning_rate": 0.0007305223871934656, "loss": 855.8797, "step": 3540 }, { "ce_loss_13": 3.275039529800415, "ce_loss_17": 3.228920245170593, "ce_loss_2": 4.137136328220367, "ce_loss_4": 3.708537828922272, "ce_loss_9": 3.3849629878997805, "epoch": 0.355, "grad_norm": 948.0, "kl_loss_13": 81.76578102111816, "kl_loss_2": 1950.9606018066406, "kl_loss_4": 1065.2773895263672, "kl_loss_9": 313.07694854736326, "learning_rate": 0.0007291132608637052, "loss": 857.2044, "step": 3550 }, { "ce_loss_13": 3.2366208672523498, "ce_loss_17": 3.193993294239044, "ce_loss_2": 4.182784128189087, "ce_loss_4": 3.6868955850601197, "ce_loss_9": 3.3444697976112367, "epoch": 0.356, "grad_norm": 812.0, "kl_loss_13": 78.44016952514649, "kl_loss_2": 2117.178143310547, "kl_loss_4": 1090.2738372802735, "kl_loss_9": 307.27020263671875, "learning_rate": 0.0007277018273659516, "loss": 891.6365, "step": 3560 }, { "ce_loss_13": 3.361915683746338, "ce_loss_17": 3.3157687306404116, "ce_loss_2": 4.230954706668854, "ce_loss_4": 3.804699718952179, "ce_loss_9": 3.4764284491539, "epoch": 0.357, "grad_norm": 772.0, "kl_loss_13": 82.67460632324219, "kl_loss_2": 1993.5298583984375, "kl_loss_4": 1096.8324890136719, "kl_loss_9": 325.3048751831055, "learning_rate": 0.0007262881009133242, "loss": 869.1084, "step": 3570 }, { "ce_loss_13": 3.2845988869667053, "ce_loss_17": 3.2426820397377014, "ce_loss_2": 4.143541800975799, "ce_loss_4": 3.7120046854019164, "ce_loss_9": 3.3966021656990053, "epoch": 0.358, "grad_norm": 1064.0, "kl_loss_13": 78.48812313079834, "kl_loss_2": 1937.2622436523438, "kl_loss_4": 1051.5308563232422, "kl_loss_9": 308.0796234130859, "learning_rate": 0.0007248720957420329, "loss": 843.0527, "step": 3580 }, { "ce_loss_13": 3.2862144470214845, "ce_loss_17": 3.244723927974701, "ce_loss_2": 4.132400393486023, "ce_loss_4": 3.7120439767837525, "ce_loss_9": 3.390040838718414, "epoch": 0.359, "grad_norm": 884.0, "kl_loss_13": 79.27053871154786, "kl_loss_2": 1919.6581420898438, "kl_loss_4": 1045.3176635742188, "kl_loss_9": 308.4083541870117, "learning_rate": 0.0007234538261112341, "loss": 876.4783, "step": 3590 }, { "ce_loss_13": 3.320645642280579, "ce_loss_17": 3.2759631395339968, "ce_loss_2": 4.197598016262054, "ce_loss_4": 3.76350314617157, "ce_loss_9": 3.4308587551116942, "epoch": 0.36, "grad_norm": 904.0, "kl_loss_13": 79.74393005371094, "kl_loss_2": 1974.381982421875, "kl_loss_4": 1078.2549438476562, "kl_loss_9": 316.0038696289063, "learning_rate": 0.0007220333063028871, "loss": 854.5963, "step": 3600 }, { "ce_loss_13": 3.3596386075019837, "ce_loss_17": 3.3134003400802614, "ce_loss_2": 4.298530578613281, "ce_loss_4": 3.835520887374878, "ce_loss_9": 3.474874699115753, "epoch": 0.361, "grad_norm": 896.0, "kl_loss_13": 82.07156906127929, "kl_loss_2": 2145.89267578125, "kl_loss_4": 1164.2722106933593, "kl_loss_9": 322.6014999389648, "learning_rate": 0.0007206105506216106, "loss": 904.726, "step": 3610 }, { "ce_loss_13": 3.2352831959724426, "ce_loss_17": 3.192703652381897, "ce_loss_2": 4.088595056533814, "ce_loss_4": 3.6704824805259704, "ce_loss_9": 3.344395172595978, "epoch": 0.362, "grad_norm": 792.0, "kl_loss_13": 79.15930404663087, "kl_loss_2": 1925.1086791992188, "kl_loss_4": 1052.2397583007812, "kl_loss_9": 310.4357452392578, "learning_rate": 0.0007191855733945387, "loss": 838.3771, "step": 3620 }, { "ce_loss_13": 3.3250008821487427, "ce_loss_17": 3.282253110408783, "ce_loss_2": 4.182705891132355, "ce_loss_4": 3.760298418998718, "ce_loss_9": 3.4336714267730715, "epoch": 0.363, "grad_norm": 1032.0, "kl_loss_13": 79.23486404418945, "kl_loss_2": 1932.1587280273438, "kl_loss_4": 1057.2986968994142, "kl_loss_9": 309.3342681884766, "learning_rate": 0.0007177583889711762, "loss": 845.6563, "step": 3630 }, { "ce_loss_13": 3.245380866527557, "ce_loss_17": 3.202890765666962, "ce_loss_2": 4.11917644739151, "ce_loss_4": 3.6847108244895934, "ce_loss_9": 3.356135535240173, "epoch": 0.364, "grad_norm": 716.0, "kl_loss_13": 80.34892082214355, "kl_loss_2": 1982.9852783203125, "kl_loss_4": 1083.8937103271485, "kl_loss_9": 318.4195266723633, "learning_rate": 0.0007163290117232541, "loss": 863.3904, "step": 3640 }, { "ce_loss_13": 3.359238362312317, "ce_loss_17": 3.3169869422912597, "ce_loss_2": 4.1737536787986755, "ce_loss_4": 3.7707980632781983, "ce_loss_9": 3.464288854598999, "epoch": 0.365, "grad_norm": 832.0, "kl_loss_13": 78.68237228393555, "kl_loss_2": 1882.076104736328, "kl_loss_4": 1027.501986694336, "kl_loss_9": 305.38031616210935, "learning_rate": 0.0007148974560445859, "loss": 837.6867, "step": 3650 }, { "ce_loss_13": 3.2792360305786135, "ce_loss_17": 3.238192355632782, "ce_loss_2": 4.114881753921509, "ce_loss_4": 3.706641066074371, "ce_loss_9": 3.3858341693878176, "epoch": 0.366, "grad_norm": 792.0, "kl_loss_13": 77.95560035705566, "kl_loss_2": 1890.5672485351563, "kl_loss_4": 1040.546405029297, "kl_loss_9": 307.1755569458008, "learning_rate": 0.0007134637363509209, "loss": 831.2457, "step": 3660 }, { "ce_loss_13": 3.3924476146697997, "ce_loss_17": 3.3501150131225588, "ce_loss_2": 4.223927116394043, "ce_loss_4": 3.811612105369568, "ce_loss_9": 3.4943058609962465, "epoch": 0.367, "grad_norm": 736.0, "kl_loss_13": 78.58778686523438, "kl_loss_2": 1878.4443969726562, "kl_loss_4": 1033.2739837646484, "kl_loss_9": 305.39368057250977, "learning_rate": 0.0007120278670798009, "loss": 845.4932, "step": 3670 }, { "ce_loss_13": 3.1926652908325197, "ce_loss_17": 3.147311043739319, "ce_loss_2": 4.120092225074768, "ce_loss_4": 3.6587387681007386, "ce_loss_9": 3.308837962150574, "epoch": 0.368, "grad_norm": 1000.0, "kl_loss_13": 80.63431091308594, "kl_loss_2": 2088.163299560547, "kl_loss_4": 1126.5058013916016, "kl_loss_9": 326.61964416503906, "learning_rate": 0.0007105898626904133, "loss": 900.6866, "step": 3680 }, { "ce_loss_13": 3.2909374833106995, "ce_loss_17": 3.2479761719703673, "ce_loss_2": 4.165724217891693, "ce_loss_4": 3.7259296655654905, "ce_loss_9": 3.401779890060425, "epoch": 0.369, "grad_norm": 708.0, "kl_loss_13": 79.0161735534668, "kl_loss_2": 1963.41201171875, "kl_loss_4": 1061.9493225097656, "kl_loss_9": 310.74631652832034, "learning_rate": 0.0007091497376634463, "loss": 845.885, "step": 3690 }, { "ce_loss_13": 3.242671525478363, "ce_loss_17": 3.2003350973129274, "ce_loss_2": 4.090977871418, "ce_loss_4": 3.670491564273834, "ce_loss_9": 3.348677897453308, "epoch": 0.37, "grad_norm": 636.0, "kl_loss_13": 79.4022216796875, "kl_loss_2": 1923.3630676269531, "kl_loss_4": 1051.1070129394532, "kl_loss_9": 309.0306625366211, "learning_rate": 0.0007077075065009433, "loss": 857.3949, "step": 3700 }, { "ce_loss_13": 3.339042866230011, "ce_loss_17": 3.2937220692634583, "ce_loss_2": 4.216702568531036, "ce_loss_4": 3.786496937274933, "ce_loss_9": 3.452583134174347, "epoch": 0.371, "grad_norm": 808.0, "kl_loss_13": 83.84554100036621, "kl_loss_2": 1977.7676391601562, "kl_loss_4": 1081.7481719970704, "kl_loss_9": 318.5555221557617, "learning_rate": 0.0007062631837261557, "loss": 858.5238, "step": 3710 }, { "ce_loss_13": 3.214540886878967, "ce_loss_17": 3.1731338500976562, "ce_loss_2": 4.084117066860199, "ce_loss_4": 3.6512245535850525, "ce_loss_9": 3.3241248726844788, "epoch": 0.372, "grad_norm": 836.0, "kl_loss_13": 78.77836952209472, "kl_loss_2": 1959.5369201660155, "kl_loss_4": 1061.3888275146485, "kl_loss_9": 308.37495574951174, "learning_rate": 0.0007048167838833977, "loss": 871.3439, "step": 3720 }, { "ce_loss_13": 3.305317234992981, "ce_loss_17": 3.2632120013237, "ce_loss_2": 4.142421758174896, "ce_loss_4": 3.7283589124679564, "ce_loss_9": 3.4138307213783263, "epoch": 0.373, "grad_norm": 836.0, "kl_loss_13": 79.6880443572998, "kl_loss_2": 1905.605615234375, "kl_loss_4": 1035.902392578125, "kl_loss_9": 308.8265640258789, "learning_rate": 0.0007033683215379002, "loss": 839.7719, "step": 3730 }, { "ce_loss_13": 3.2911507964134215, "ce_loss_17": 3.2498687624931337, "ce_loss_2": 4.154629027843475, "ce_loss_4": 3.7290974378585817, "ce_loss_9": 3.404646301269531, "epoch": 0.374, "grad_norm": 840.0, "kl_loss_13": 79.13087692260743, "kl_loss_2": 1930.552557373047, "kl_loss_4": 1047.307080078125, "kl_loss_9": 307.69998626708986, "learning_rate": 0.0007019178112756625, "loss": 854.4352, "step": 3740 }, { "ce_loss_13": 3.257925236225128, "ce_loss_17": 3.217692804336548, "ce_loss_2": 4.112984991073608, "ce_loss_4": 3.68343665599823, "ce_loss_9": 3.3633124232292175, "epoch": 0.375, "grad_norm": 884.0, "kl_loss_13": 77.43262634277343, "kl_loss_2": 1928.0837707519531, "kl_loss_4": 1048.2009521484374, "kl_loss_9": 304.83057403564453, "learning_rate": 0.0007004652677033068, "loss": 851.567, "step": 3750 }, { "ce_loss_13": 3.3331055283546447, "ce_loss_17": 3.2939743518829347, "ce_loss_2": 4.160905277729034, "ce_loss_4": 3.75036461353302, "ce_loss_9": 3.4350261807441713, "epoch": 0.376, "grad_norm": 820.0, "kl_loss_13": 76.70789985656738, "kl_loss_2": 1890.9389343261719, "kl_loss_4": 1032.068994140625, "kl_loss_9": 300.15679931640625, "learning_rate": 0.0006990107054479312, "loss": 838.1568, "step": 3760 }, { "ce_loss_13": 3.3143543720245363, "ce_loss_17": 3.270555055141449, "ce_loss_2": 4.165234935283661, "ce_loss_4": 3.746877431869507, "ce_loss_9": 3.4235692620277405, "epoch": 0.377, "grad_norm": 876.0, "kl_loss_13": 80.17977790832519, "kl_loss_2": 1924.0489685058594, "kl_loss_4": 1052.3332916259765, "kl_loss_9": 309.0487655639648, "learning_rate": 0.000697554139156961, "loss": 849.1367, "step": 3770 }, { "ce_loss_13": 3.3027577638626098, "ce_loss_17": 3.2585461258888246, "ce_loss_2": 4.1655859589576725, "ce_loss_4": 3.736509621143341, "ce_loss_9": 3.4114188671112062, "epoch": 0.378, "grad_norm": 824.0, "kl_loss_13": 79.86643676757812, "kl_loss_2": 1961.685479736328, "kl_loss_4": 1064.6449981689452, "kl_loss_9": 314.34642333984374, "learning_rate": 0.0006960955834980027, "loss": 838.839, "step": 3780 }, { "ce_loss_13": 3.2778510212898255, "ce_loss_17": 3.231376898288727, "ce_loss_2": 4.127654790878296, "ce_loss_4": 3.7043236970901487, "ce_loss_9": 3.3832536220550535, "epoch": 0.379, "grad_norm": 788.0, "kl_loss_13": 79.7079948425293, "kl_loss_2": 1923.5986022949219, "kl_loss_4": 1057.3545501708984, "kl_loss_9": 307.5535095214844, "learning_rate": 0.0006946350531586958, "loss": 846.7726, "step": 3790 }, { "ce_loss_13": 3.3021407604217528, "ce_loss_17": 3.2583566904067993, "ce_loss_2": 4.157404685020447, "ce_loss_4": 3.733813750743866, "ce_loss_9": 3.4081004738807676, "epoch": 0.38, "grad_norm": 884.0, "kl_loss_13": 79.01305770874023, "kl_loss_2": 1941.484326171875, "kl_loss_4": 1055.3097930908202, "kl_loss_9": 307.8282974243164, "learning_rate": 0.0006931725628465643, "loss": 862.8076, "step": 3800 }, { "ce_loss_13": 3.315520977973938, "ce_loss_17": 3.273395228385925, "ce_loss_2": 4.174019360542298, "ce_loss_4": 3.7471574902534486, "ce_loss_9": 3.42430157661438, "epoch": 0.381, "grad_norm": 792.0, "kl_loss_13": 78.43453598022461, "kl_loss_2": 1927.9592224121093, "kl_loss_4": 1050.5472503662108, "kl_loss_9": 307.24270095825193, "learning_rate": 0.0006917081272888696, "loss": 846.8363, "step": 3810 }, { "ce_loss_13": 3.2242579102516173, "ce_loss_17": 3.181358051300049, "ce_loss_2": 4.110445141792297, "ce_loss_4": 3.653716266155243, "ce_loss_9": 3.3308806896209715, "epoch": 0.382, "grad_norm": 960.0, "kl_loss_13": 79.20247192382813, "kl_loss_2": 2012.6092102050782, "kl_loss_4": 1057.345932006836, "kl_loss_9": 308.32613067626954, "learning_rate": 0.0006902417612324615, "loss": 848.2767, "step": 3820 }, { "ce_loss_13": 3.355782425403595, "ce_loss_17": 3.3093623876571656, "ce_loss_2": 4.239986300468445, "ce_loss_4": 3.8019633531570434, "ce_loss_9": 3.467235004901886, "epoch": 0.383, "grad_norm": 908.0, "kl_loss_13": 82.72949295043945, "kl_loss_2": 2004.3609008789062, "kl_loss_4": 1097.67138671875, "kl_loss_9": 321.2957168579102, "learning_rate": 0.00068877347944363, "loss": 866.2252, "step": 3830 }, { "ce_loss_13": 3.3492382287979128, "ce_loss_17": 3.306003439426422, "ce_loss_2": 4.181626462936402, "ce_loss_4": 3.7661064505577087, "ce_loss_9": 3.4544392824172974, "epoch": 0.384, "grad_norm": 952.0, "kl_loss_13": 78.85240173339844, "kl_loss_2": 1910.4473876953125, "kl_loss_4": 1039.4304443359374, "kl_loss_9": 308.61584320068357, "learning_rate": 0.0006873032967079561, "loss": 853.8964, "step": 3840 }, { "ce_loss_13": 3.336069393157959, "ce_loss_17": 3.2946664333343505, "ce_loss_2": 4.151469981670379, "ce_loss_4": 3.745041239261627, "ce_loss_9": 3.437327229976654, "epoch": 0.385, "grad_norm": 768.0, "kl_loss_13": 77.62555274963378, "kl_loss_2": 1888.4370483398438, "kl_loss_4": 1033.6312164306642, "kl_loss_9": 302.03850708007815, "learning_rate": 0.0006858312278301637, "loss": 829.2557, "step": 3850 }, { "ce_loss_13": 3.3677077651023866, "ce_loss_17": 3.3257739305496217, "ce_loss_2": 4.178003787994385, "ce_loss_4": 3.772132909297943, "ce_loss_9": 3.4704753518104554, "epoch": 0.386, "grad_norm": 844.0, "kl_loss_13": 79.3912525177002, "kl_loss_2": 1875.7722900390625, "kl_loss_4": 1026.8799957275392, "kl_loss_9": 306.31392059326174, "learning_rate": 0.0006843572876339704, "loss": 827.2038, "step": 3860 }, { "ce_loss_13": 3.2855607509613036, "ce_loss_17": 3.244386303424835, "ce_loss_2": 4.093652892112732, "ce_loss_4": 3.692332851886749, "ce_loss_9": 3.385841965675354, "epoch": 0.387, "grad_norm": 916.0, "kl_loss_13": 77.1948143005371, "kl_loss_2": 1841.5903930664062, "kl_loss_4": 1009.3366516113281, "kl_loss_9": 300.8722259521484, "learning_rate": 0.0006828814909619373, "loss": 847.6008, "step": 3870 }, { "ce_loss_13": 3.411894679069519, "ce_loss_17": 3.3655677914619444, "ce_loss_2": 4.245130181312561, "ce_loss_4": 3.8279731273651123, "ce_loss_9": 3.5144017219543455, "epoch": 0.388, "grad_norm": 836.0, "kl_loss_13": 81.73871154785157, "kl_loss_2": 1895.018212890625, "kl_loss_4": 1038.8522857666017, "kl_loss_9": 310.09679260253904, "learning_rate": 0.0006814038526753205, "loss": 824.0662, "step": 3880 }, { "ce_loss_13": 3.3071584939956664, "ce_loss_17": 3.2654017090797423, "ce_loss_2": 4.141039967536926, "ce_loss_4": 3.729535710811615, "ce_loss_9": 3.412937140464783, "epoch": 0.389, "grad_norm": 756.0, "kl_loss_13": 78.23615188598633, "kl_loss_2": 1893.238671875, "kl_loss_4": 1039.030029296875, "kl_loss_9": 307.1902275085449, "learning_rate": 0.0006799243876539213, "loss": 834.9811, "step": 3890 }, { "ce_loss_13": 3.2321924924850465, "ce_loss_17": 3.191261577606201, "ce_loss_2": 4.11304601430893, "ce_loss_4": 3.6646535277366636, "ce_loss_9": 3.3411396503448487, "epoch": 0.39, "grad_norm": 1096.0, "kl_loss_13": 80.79813156127929, "kl_loss_2": 1971.1149475097657, "kl_loss_4": 1049.593035888672, "kl_loss_9": 309.8133071899414, "learning_rate": 0.0006784431107959359, "loss": 854.1135, "step": 3900 }, { "ce_loss_13": 3.294963073730469, "ce_loss_17": 3.250784397125244, "ce_loss_2": 4.176942825317383, "ce_loss_4": 3.7329927563667296, "ce_loss_9": 3.4081822991371156, "epoch": 0.391, "grad_norm": 924.0, "kl_loss_13": 79.89174423217773, "kl_loss_2": 1994.7768249511719, "kl_loss_4": 1062.798147583008, "kl_loss_9": 311.36636505126955, "learning_rate": 0.0006769600370178059, "loss": 849.3525, "step": 3910 }, { "ce_loss_13": 3.2559167623519896, "ce_loss_17": 3.2112300753593446, "ce_loss_2": 4.119880974292755, "ce_loss_4": 3.6950501561164857, "ce_loss_9": 3.3670750975608827, "epoch": 0.392, "grad_norm": 724.0, "kl_loss_13": 78.87548294067383, "kl_loss_2": 1928.7585510253907, "kl_loss_4": 1055.2064392089844, "kl_loss_9": 310.10369262695315, "learning_rate": 0.0006754751812540679, "loss": 827.0956, "step": 3920 }, { "ce_loss_13": 3.3033456802368164, "ce_loss_17": 3.2604467153549193, "ce_loss_2": 4.161333394050598, "ce_loss_4": 3.7346084475517274, "ce_loss_9": 3.412954103946686, "epoch": 0.393, "grad_norm": 1004.0, "kl_loss_13": 80.26275672912598, "kl_loss_2": 1960.345928955078, "kl_loss_4": 1062.8235137939453, "kl_loss_9": 310.36848907470704, "learning_rate": 0.0006739885584572025, "loss": 854.8727, "step": 3930 }, { "ce_loss_13": 3.328982615470886, "ce_loss_17": 3.285259997844696, "ce_loss_2": 4.213145470619201, "ce_loss_4": 3.7565238237380982, "ce_loss_9": 3.436812424659729, "epoch": 0.394, "grad_norm": 1272.0, "kl_loss_13": 81.48662643432617, "kl_loss_2": 2019.0970703125, "kl_loss_4": 1064.3614410400392, "kl_loss_9": 310.75197448730466, "learning_rate": 0.0006725001835974853, "loss": 848.3544, "step": 3940 }, { "ce_loss_13": 3.319366729259491, "ce_loss_17": 3.2750706672668457, "ce_loss_2": 4.18541134595871, "ce_loss_4": 3.758718478679657, "ce_loss_9": 3.430189275741577, "epoch": 0.395, "grad_norm": 956.0, "kl_loss_13": 81.0606185913086, "kl_loss_2": 1969.6244079589844, "kl_loss_4": 1066.606381225586, "kl_loss_9": 310.7843353271484, "learning_rate": 0.0006710100716628344, "loss": 835.6175, "step": 3950 }, { "ce_loss_13": 3.3074681639671324, "ce_loss_17": 3.2640940666198732, "ce_loss_2": 4.156752181053162, "ce_loss_4": 3.7395651817321776, "ce_loss_9": 3.412280297279358, "epoch": 0.396, "grad_norm": 744.0, "kl_loss_13": 78.18349380493164, "kl_loss_2": 1931.2714904785157, "kl_loss_4": 1062.4384399414062, "kl_loss_9": 304.9924346923828, "learning_rate": 0.0006695182376586602, "loss": 849.6749, "step": 3960 }, { "ce_loss_13": 3.3401866912841798, "ce_loss_17": 3.297827625274658, "ce_loss_2": 4.143641221523285, "ce_loss_4": 3.744636571407318, "ce_loss_9": 3.4408209681510926, "epoch": 0.397, "grad_norm": 876.0, "kl_loss_13": 77.44681243896484, "kl_loss_2": 1831.1161743164062, "kl_loss_4": 1002.5733123779297, "kl_loss_9": 296.31993103027344, "learning_rate": 0.000668024696607715, "loss": 842.8701, "step": 3970 }, { "ce_loss_13": 3.295796203613281, "ce_loss_17": 3.2550071120262145, "ce_loss_2": 4.131110274791718, "ce_loss_4": 3.7154141306877135, "ce_loss_9": 3.400242578983307, "epoch": 0.398, "grad_norm": 788.0, "kl_loss_13": 77.02307968139648, "kl_loss_2": 1907.8619750976563, "kl_loss_4": 1040.749215698242, "kl_loss_9": 303.7198455810547, "learning_rate": 0.0006665294635499404, "loss": 836.6309, "step": 3980 }, { "ce_loss_13": 3.2978402376174927, "ce_loss_17": 3.2530077695846558, "ce_loss_2": 4.187583088874817, "ce_loss_4": 3.7399232268333433, "ce_loss_9": 3.4086226344108583, "epoch": 0.399, "grad_norm": 832.0, "kl_loss_13": 81.76366004943847, "kl_loss_2": 2014.4826110839845, "kl_loss_4": 1080.946337890625, "kl_loss_9": 316.02596893310545, "learning_rate": 0.0006650325535423167, "loss": 853.916, "step": 3990 }, { "ce_loss_13": 3.330570673942566, "ce_loss_17": 3.288055944442749, "ce_loss_2": 4.126987683773041, "ce_loss_4": 3.733007574081421, "ce_loss_9": 3.4324370622634888, "epoch": 0.4, "grad_norm": 764.0, "kl_loss_13": 75.5315990447998, "kl_loss_2": 1814.0561645507812, "kl_loss_4": 997.540869140625, "kl_loss_9": 293.8647766113281, "learning_rate": 0.0006635339816587109, "loss": 819.9658, "step": 4000 }, { "ce_loss_13": 3.2647815465927126, "ce_loss_17": 3.22164990901947, "ce_loss_2": 4.133316695690155, "ce_loss_4": 3.6849881410598755, "ce_loss_9": 3.368342387676239, "epoch": 0.401, "grad_norm": 928.0, "kl_loss_13": 79.31331672668458, "kl_loss_2": 1972.0010803222656, "kl_loss_4": 1051.2727966308594, "kl_loss_9": 306.35632476806643, "learning_rate": 0.0006620337629897252, "loss": 838.599, "step": 4010 }, { "ce_loss_13": 3.2707459211349486, "ce_loss_17": 3.2277172327041628, "ce_loss_2": 4.11969610452652, "ce_loss_4": 3.6980610370635985, "ce_loss_9": 3.377561020851135, "epoch": 0.402, "grad_norm": 876.0, "kl_loss_13": 77.62689361572265, "kl_loss_2": 1930.257879638672, "kl_loss_4": 1045.306314086914, "kl_loss_9": 304.64718475341795, "learning_rate": 0.0006605319126425454, "loss": 851.9811, "step": 4020 }, { "ce_loss_13": 3.1794585824012755, "ce_loss_17": 3.135476553440094, "ce_loss_2": 4.056850111484527, "ce_loss_4": 3.6122671961784363, "ce_loss_9": 3.2904020071029665, "epoch": 0.403, "grad_norm": 788.0, "kl_loss_13": 79.04550437927246, "kl_loss_2": 1997.5563049316406, "kl_loss_4": 1072.4226806640625, "kl_loss_9": 309.31011962890625, "learning_rate": 0.0006590284457407876, "loss": 856.1703, "step": 4030 }, { "ce_loss_13": 3.271203708648682, "ce_loss_17": 3.228550398349762, "ce_loss_2": 4.125647473335266, "ce_loss_4": 3.696555233001709, "ce_loss_9": 3.3810480356216432, "epoch": 0.404, "grad_norm": 944.0, "kl_loss_13": 79.07201461791992, "kl_loss_2": 1917.312451171875, "kl_loss_4": 1031.7962280273437, "kl_loss_9": 304.8022430419922, "learning_rate": 0.0006575233774243465, "loss": 834.16, "step": 4040 }, { "ce_loss_13": 3.2648462891578673, "ce_loss_17": 3.2220491051673887, "ce_loss_2": 4.121234345436096, "ce_loss_4": 3.693406546115875, "ce_loss_9": 3.374107909202576, "epoch": 0.405, "grad_norm": 944.0, "kl_loss_13": 77.91585922241211, "kl_loss_2": 1947.8253234863282, "kl_loss_4": 1051.3325927734375, "kl_loss_9": 307.1045608520508, "learning_rate": 0.0006560167228492435, "loss": 842.7441, "step": 4050 }, { "ce_loss_13": 3.3149160623550413, "ce_loss_17": 3.274205195903778, "ce_loss_2": 4.132644689083099, "ce_loss_4": 3.7271281838417054, "ce_loss_9": 3.419368267059326, "epoch": 0.406, "grad_norm": 1012.0, "kl_loss_13": 77.59754943847656, "kl_loss_2": 1871.8706176757812, "kl_loss_4": 1024.0221588134766, "kl_loss_9": 299.1046112060547, "learning_rate": 0.0006545084971874737, "loss": 838.234, "step": 4060 }, { "ce_loss_13": 3.2789357542991637, "ce_loss_17": 3.2333378076553343, "ce_loss_2": 4.15515753030777, "ce_loss_4": 3.717312145233154, "ce_loss_9": 3.3853928685188293, "epoch": 0.407, "grad_norm": 768.0, "kl_loss_13": 81.99721145629883, "kl_loss_2": 1989.4007446289063, "kl_loss_4": 1078.338934326172, "kl_loss_9": 316.0284622192383, "learning_rate": 0.0006529987156268526, "loss": 843.0453, "step": 4070 }, { "ce_loss_13": 3.1949838519096376, "ce_loss_17": 3.1512240171432495, "ce_loss_2": 4.062924098968506, "ce_loss_4": 3.629937028884888, "ce_loss_9": 3.305235779285431, "epoch": 0.408, "grad_norm": 1200.0, "kl_loss_13": 80.35381736755372, "kl_loss_2": 1956.441827392578, "kl_loss_4": 1060.228433227539, "kl_loss_9": 309.562190246582, "learning_rate": 0.0006514873933708637, "loss": 865.9684, "step": 4080 }, { "ce_loss_13": 3.307583379745483, "ce_loss_17": 3.2647162079811096, "ce_loss_2": 4.147261226177216, "ce_loss_4": 3.7207722663879395, "ce_loss_9": 3.4137214183807374, "epoch": 0.409, "grad_norm": 904.0, "kl_loss_13": 77.66700706481933, "kl_loss_2": 1919.4348022460938, "kl_loss_4": 1029.5156524658203, "kl_loss_9": 300.5586654663086, "learning_rate": 0.0006499745456385053, "loss": 828.9072, "step": 4090 }, { "ce_loss_13": 3.2745136857032775, "ce_loss_17": 3.230354392528534, "ce_loss_2": 4.120101284980774, "ce_loss_4": 3.7019402742385865, "ce_loss_9": 3.3798617243766786, "epoch": 0.41, "grad_norm": 820.0, "kl_loss_13": 77.67239074707031, "kl_loss_2": 1915.2319641113281, "kl_loss_4": 1042.1595092773437, "kl_loss_9": 302.0523025512695, "learning_rate": 0.0006484601876641375, "loss": 846.3459, "step": 4100 }, { "ce_loss_13": 3.2652450799942017, "ce_loss_17": 3.2237433433532714, "ce_loss_2": 4.077217090129852, "ce_loss_4": 3.6771028876304626, "ce_loss_9": 3.367217707633972, "epoch": 0.411, "grad_norm": 824.0, "kl_loss_13": 76.87406425476074, "kl_loss_2": 1852.5192993164062, "kl_loss_4": 1016.7843078613281, "kl_loss_9": 296.21001586914065, "learning_rate": 0.000646944334697328, "loss": 818.729, "step": 4110 }, { "ce_loss_13": 3.3710275053977967, "ce_loss_17": 3.3288058638572693, "ce_loss_2": 4.173830461502075, "ce_loss_4": 3.7750932216644286, "ce_loss_9": 3.4707067608833313, "epoch": 0.412, "grad_norm": 1112.0, "kl_loss_13": 77.32941474914551, "kl_loss_2": 1825.318096923828, "kl_loss_4": 1005.8795227050781, "kl_loss_9": 300.3182144165039, "learning_rate": 0.0006454270020026995, "loss": 810.5407, "step": 4120 }, { "ce_loss_13": 3.3413079261779783, "ce_loss_17": 3.300805962085724, "ce_loss_2": 4.138811671733857, "ce_loss_4": 3.7457743525505065, "ce_loss_9": 3.4431130647659303, "epoch": 0.413, "grad_norm": 832.0, "kl_loss_13": 74.75608367919922, "kl_loss_2": 1811.4225952148438, "kl_loss_4": 992.4546264648437, "kl_loss_9": 291.7052032470703, "learning_rate": 0.0006439082048597755, "loss": 804.2014, "step": 4130 }, { "ce_loss_13": 3.325836753845215, "ce_loss_17": 3.2841463565826414, "ce_loss_2": 4.166321110725403, "ce_loss_4": 3.746056628227234, "ce_loss_9": 3.4339433550834655, "epoch": 0.414, "grad_norm": 1104.0, "kl_loss_13": 77.54204635620117, "kl_loss_2": 1915.472186279297, "kl_loss_4": 1034.2969024658203, "kl_loss_9": 303.5854751586914, "learning_rate": 0.0006423879585628261, "loss": 836.3888, "step": 4140 }, { "ce_loss_13": 3.2876922488212585, "ce_loss_17": 3.244413447380066, "ce_loss_2": 4.1571802496910095, "ce_loss_4": 3.7206246495246886, "ce_loss_9": 3.397969675064087, "epoch": 0.415, "grad_norm": 844.0, "kl_loss_13": 79.64313621520996, "kl_loss_2": 1972.5984680175782, "kl_loss_4": 1062.806448364258, "kl_loss_9": 311.07977294921875, "learning_rate": 0.0006408662784207149, "loss": 852.0227, "step": 4150 }, { "ce_loss_13": 3.2557998657226563, "ce_loss_17": 3.213127410411835, "ce_loss_2": 4.102911353111267, "ce_loss_4": 3.679526710510254, "ce_loss_9": 3.3603752493858337, "epoch": 0.416, "grad_norm": 1032.0, "kl_loss_13": 75.84726219177246, "kl_loss_2": 1918.3287048339844, "kl_loss_4": 1042.9201538085938, "kl_loss_9": 302.7628631591797, "learning_rate": 0.0006393431797567439, "loss": 837.298, "step": 4160 }, { "ce_loss_13": 3.3369251608848574, "ce_loss_17": 3.2943072080612184, "ce_loss_2": 4.126527976989746, "ce_loss_4": 3.724642038345337, "ce_loss_9": 3.4380351662635804, "epoch": 0.417, "grad_norm": 776.0, "kl_loss_13": 76.58052368164063, "kl_loss_2": 1837.2471862792968, "kl_loss_4": 998.3685485839844, "kl_loss_9": 296.33739318847654, "learning_rate": 0.0006378186779084996, "loss": 796.7759, "step": 4170 }, { "ce_loss_13": 3.1708509802818297, "ce_loss_17": 3.129050099849701, "ce_loss_2": 4.039690005779266, "ce_loss_4": 3.605795121192932, "ce_loss_9": 3.276896905899048, "epoch": 0.418, "grad_norm": 760.0, "kl_loss_13": 76.8579605102539, "kl_loss_2": 1934.0495178222657, "kl_loss_4": 1046.7709106445313, "kl_loss_9": 305.6838653564453, "learning_rate": 0.0006362927882276989, "loss": 846.643, "step": 4180 }, { "ce_loss_13": 3.3557766914367675, "ce_loss_17": 3.3138336539268494, "ce_loss_2": 4.165496897697449, "ce_loss_4": 3.756271946430206, "ce_loss_9": 3.4560358047485353, "epoch": 0.419, "grad_norm": 704.0, "kl_loss_13": 77.31939125061035, "kl_loss_2": 1856.0389343261718, "kl_loss_4": 1000.2576049804687, "kl_loss_9": 295.1786392211914, "learning_rate": 0.000634765526080034, "loss": 801.9405, "step": 4190 }, { "ce_loss_13": 3.3638531088829042, "ce_loss_17": 3.3213589787483215, "ce_loss_2": 4.186532127857208, "ce_loss_4": 3.7772738099098206, "ce_loss_9": 3.466420602798462, "epoch": 0.42, "grad_norm": 768.0, "kl_loss_13": 78.90635223388672, "kl_loss_2": 1869.4422790527344, "kl_loss_4": 1022.9750732421875, "kl_loss_9": 301.8726806640625, "learning_rate": 0.0006332369068450174, "loss": 815.1037, "step": 4200 }, { "ce_loss_13": 3.2963593363761903, "ce_loss_17": 3.2548921585083006, "ce_loss_2": 4.137203359603882, "ce_loss_4": 3.7190677642822267, "ce_loss_9": 3.397758388519287, "epoch": 0.421, "grad_norm": 1512.0, "kl_loss_13": 77.09004592895508, "kl_loss_2": 1897.3307983398438, "kl_loss_4": 1035.5769775390625, "kl_loss_9": 300.7089096069336, "learning_rate": 0.0006317069459158283, "loss": 823.016, "step": 4210 }, { "ce_loss_13": 3.4075854420661926, "ce_loss_17": 3.3641351580619814, "ce_loss_2": 4.199348616600036, "ce_loss_4": 3.7940779328346252, "ce_loss_9": 3.5064744114875794, "epoch": 0.422, "grad_norm": 688.0, "kl_loss_13": 77.74232749938965, "kl_loss_2": 1831.5187866210938, "kl_loss_4": 995.7357452392578, "kl_loss_9": 296.18746490478514, "learning_rate": 0.0006301756586991561, "loss": 812.6053, "step": 4220 }, { "ce_loss_13": 3.1881481528282167, "ce_loss_17": 3.1477632880210877, "ce_loss_2": 4.054125678539276, "ce_loss_4": 3.6175846815109254, "ce_loss_9": 3.2955458760261536, "epoch": 0.423, "grad_norm": 852.0, "kl_loss_13": 78.09756584167481, "kl_loss_2": 1970.5815002441407, "kl_loss_4": 1058.3021545410156, "kl_loss_9": 304.6946334838867, "learning_rate": 0.0006286430606150459, "loss": 843.0814, "step": 4230 }, { "ce_loss_13": 3.3874191522598265, "ce_loss_17": 3.3446141123771667, "ce_loss_2": 4.204588401317596, "ce_loss_4": 3.800839841365814, "ce_loss_9": 3.4927867531776426, "epoch": 0.424, "grad_norm": 704.0, "kl_loss_13": 78.68111152648926, "kl_loss_2": 1860.591729736328, "kl_loss_4": 1016.1077117919922, "kl_loss_9": 299.66543731689455, "learning_rate": 0.0006271091670967436, "loss": 816.7326, "step": 4240 }, { "ce_loss_13": 3.3006239771842956, "ce_loss_17": 3.254270374774933, "ce_loss_2": 4.166652262210846, "ce_loss_4": 3.745281684398651, "ce_loss_9": 3.409282112121582, "epoch": 0.425, "grad_norm": 820.0, "kl_loss_13": 81.14281044006347, "kl_loss_2": 1977.9890625, "kl_loss_4": 1089.311703491211, "kl_loss_9": 315.2414260864258, "learning_rate": 0.0006255739935905395, "loss": 843.2561, "step": 4250 }, { "ce_loss_13": 3.3398287296295166, "ce_loss_17": 3.2982711791992188, "ce_loss_2": 4.1548157095909115, "ce_loss_4": 3.7448269844055178, "ce_loss_9": 3.444038677215576, "epoch": 0.426, "grad_norm": 788.0, "kl_loss_13": 78.06872749328613, "kl_loss_2": 1849.7942260742188, "kl_loss_4": 1005.2631042480468, "kl_loss_9": 299.2171264648438, "learning_rate": 0.0006240375555556145, "loss": 841.797, "step": 4260 }, { "ce_loss_13": 3.3420338153839113, "ce_loss_17": 3.298211193084717, "ce_loss_2": 4.206630194187165, "ce_loss_4": 3.7783311367034913, "ce_loss_9": 3.450661611557007, "epoch": 0.427, "grad_norm": 820.0, "kl_loss_13": 80.10629310607911, "kl_loss_2": 1955.883984375, "kl_loss_4": 1052.7101623535157, "kl_loss_9": 304.01519165039065, "learning_rate": 0.000622499868463882, "loss": 838.1461, "step": 4270 }, { "ce_loss_13": 3.315020728111267, "ce_loss_17": 3.2741780042648316, "ce_loss_2": 4.107786476612091, "ce_loss_4": 3.705011510848999, "ce_loss_9": 3.4146546721458435, "epoch": 0.428, "grad_norm": 872.0, "kl_loss_13": 77.37769355773926, "kl_loss_2": 1848.4179931640624, "kl_loss_4": 997.9491729736328, "kl_loss_9": 291.1655242919922, "learning_rate": 0.0006209609477998338, "loss": 815.0931, "step": 4280 }, { "ce_loss_13": 3.3685535907745363, "ce_loss_17": 3.324531579017639, "ce_loss_2": 4.187193953990937, "ce_loss_4": 3.7842114925384522, "ce_loss_9": 3.468222963809967, "epoch": 0.429, "grad_norm": 832.0, "kl_loss_13": 79.78409614562989, "kl_loss_2": 1874.082763671875, "kl_loss_4": 1024.7485290527343, "kl_loss_9": 300.9065643310547, "learning_rate": 0.0006194208090603844, "loss": 831.918, "step": 4290 }, { "ce_loss_13": 3.2854601979255675, "ce_loss_17": 3.243454360961914, "ce_loss_2": 4.113818681240081, "ce_loss_4": 3.696045386791229, "ce_loss_9": 3.3911391139030456, "epoch": 0.43, "grad_norm": 816.0, "kl_loss_13": 75.71716423034668, "kl_loss_2": 1863.8755798339844, "kl_loss_4": 1003.1294708251953, "kl_loss_9": 294.57244262695315, "learning_rate": 0.0006178794677547138, "loss": 806.2747, "step": 4300 }, { "ce_loss_13": 3.309733045101166, "ce_loss_17": 3.2677249670028687, "ce_loss_2": 4.158813059329987, "ce_loss_4": 3.729399120807648, "ce_loss_9": 3.4140867710113527, "epoch": 0.431, "grad_norm": 812.0, "kl_loss_13": 78.08214874267578, "kl_loss_2": 1935.1852661132812, "kl_loss_4": 1048.1672271728517, "kl_loss_9": 306.0022201538086, "learning_rate": 0.0006163369394041111, "loss": 833.0965, "step": 4310 }, { "ce_loss_13": 3.248022508621216, "ce_loss_17": 3.207003927230835, "ce_loss_2": 4.110338830947876, "ce_loss_4": 3.6842548727989195, "ce_loss_9": 3.357089364528656, "epoch": 0.432, "grad_norm": 1008.0, "kl_loss_13": 75.94662075042724, "kl_loss_2": 1934.8188049316407, "kl_loss_4": 1051.1523498535157, "kl_loss_9": 300.2815399169922, "learning_rate": 0.0006147932395418205, "loss": 852.7858, "step": 4320 }, { "ce_loss_13": 3.28361519575119, "ce_loss_17": 3.241096353530884, "ce_loss_2": 4.102269923686981, "ce_loss_4": 3.6967898845672607, "ce_loss_9": 3.3902220487594605, "epoch": 0.433, "grad_norm": 888.0, "kl_loss_13": 76.6314250946045, "kl_loss_2": 1858.9971618652344, "kl_loss_4": 1016.6501831054687, "kl_loss_9": 300.2827835083008, "learning_rate": 0.0006132483837128823, "loss": 814.2979, "step": 4330 }, { "ce_loss_13": 3.2630454659461976, "ce_loss_17": 3.2222825288772583, "ce_loss_2": 4.10932183265686, "ce_loss_4": 3.676711046695709, "ce_loss_9": 3.36719868183136, "epoch": 0.434, "grad_norm": 732.0, "kl_loss_13": 76.80081672668457, "kl_loss_2": 1923.487744140625, "kl_loss_4": 1023.3612548828125, "kl_loss_9": 300.2582061767578, "learning_rate": 0.0006117023874739772, "loss": 827.8298, "step": 4340 }, { "ce_loss_13": 3.2588926672935488, "ce_loss_17": 3.218635880947113, "ce_loss_2": 4.1072376608848575, "ce_loss_4": 3.680481123924255, "ce_loss_9": 3.3696163058280946, "epoch": 0.435, "grad_norm": 976.0, "kl_loss_13": 76.15673713684082, "kl_loss_2": 1920.554864501953, "kl_loss_4": 1039.4910980224608, "kl_loss_9": 304.0564468383789, "learning_rate": 0.0006101552663932703, "loss": 841.0854, "step": 4350 }, { "ce_loss_13": 3.2911000847816467, "ce_loss_17": 3.249547779560089, "ce_loss_2": 4.114426076412201, "ce_loss_4": 3.7044010758399963, "ce_loss_9": 3.395657777786255, "epoch": 0.436, "grad_norm": 768.0, "kl_loss_13": 77.62348098754883, "kl_loss_2": 1890.3679443359374, "kl_loss_4": 1025.9166900634766, "kl_loss_9": 301.8690780639648, "learning_rate": 0.0006086070360502539, "loss": 825.298, "step": 4360 }, { "ce_loss_13": 3.299925887584686, "ce_loss_17": 3.2557137250900268, "ce_loss_2": 4.13572096824646, "ce_loss_4": 3.7072188019752503, "ce_loss_9": 3.3994361639022825, "epoch": 0.437, "grad_norm": 840.0, "kl_loss_13": 77.6753143310547, "kl_loss_2": 1916.3807983398438, "kl_loss_4": 1023.9032165527344, "kl_loss_9": 297.97623443603516, "learning_rate": 0.0006070577120355903, "loss": 831.7244, "step": 4370 }, { "ce_loss_13": 3.295057225227356, "ce_loss_17": 3.2547211170196535, "ce_loss_2": 4.113643145561218, "ce_loss_4": 3.712255358695984, "ce_loss_9": 3.404051995277405, "epoch": 0.438, "grad_norm": 848.0, "kl_loss_13": 76.0235752105713, "kl_loss_2": 1835.069677734375, "kl_loss_4": 1011.377474975586, "kl_loss_9": 294.67051849365237, "learning_rate": 0.0006055073099509549, "loss": 816.6621, "step": 4380 }, { "ce_loss_13": 3.3560105204582213, "ce_loss_17": 3.3139033555984496, "ce_loss_2": 4.159651386737823, "ce_loss_4": 3.754903721809387, "ce_loss_9": 3.455800485610962, "epoch": 0.439, "grad_norm": 700.0, "kl_loss_13": 77.7121353149414, "kl_loss_2": 1852.9252380371095, "kl_loss_4": 1004.2511535644531, "kl_loss_9": 295.7561569213867, "learning_rate": 0.0006039558454088796, "loss": 824.942, "step": 4390 }, { "ce_loss_13": 3.329780697822571, "ce_loss_17": 3.285024857521057, "ce_loss_2": 4.164557564258575, "ce_loss_4": 3.748790717124939, "ce_loss_9": 3.4335277676582336, "epoch": 0.44, "grad_norm": 804.0, "kl_loss_13": 77.75255661010742, "kl_loss_2": 1905.2563781738281, "kl_loss_4": 1030.5379302978515, "kl_loss_9": 299.5456596374512, "learning_rate": 0.0006024033340325954, "loss": 811.2349, "step": 4400 }, { "ce_loss_13": 3.3937047004699705, "ce_loss_17": 3.3539093375205993, "ce_loss_2": 4.187200272083283, "ce_loss_4": 3.796113383769989, "ce_loss_9": 3.4941094875335694, "epoch": 0.441, "grad_norm": 804.0, "kl_loss_13": 75.44027671813964, "kl_loss_2": 1791.9597412109374, "kl_loss_4": 984.2767913818359, "kl_loss_9": 288.6034698486328, "learning_rate": 0.0006008497914558743, "loss": 799.1658, "step": 4410 }, { "ce_loss_13": 3.3351574301719666, "ce_loss_17": 3.291703939437866, "ce_loss_2": 4.171803283691406, "ce_loss_4": 3.7514198541641237, "ce_loss_9": 3.4398239731788633, "epoch": 0.442, "grad_norm": 848.0, "kl_loss_13": 79.80851287841797, "kl_loss_2": 1910.2213806152345, "kl_loss_4": 1033.5369079589843, "kl_loss_9": 304.91014556884767, "learning_rate": 0.0005992952333228728, "loss": 827.2408, "step": 4420 }, { "ce_loss_13": 3.2761381387710573, "ce_loss_17": 3.234825384616852, "ce_loss_2": 4.106686508655548, "ce_loss_4": 3.6917160749435425, "ce_loss_9": 3.378207635879517, "epoch": 0.443, "grad_norm": 964.0, "kl_loss_13": 75.30015296936035, "kl_loss_2": 1906.8137512207031, "kl_loss_4": 1027.6942047119142, "kl_loss_9": 293.7550811767578, "learning_rate": 0.0005977396752879741, "loss": 824.6945, "step": 4430 }, { "ce_loss_13": 3.2037268400192263, "ce_loss_17": 3.1642729163169863, "ce_loss_2": 4.042637968063355, "ce_loss_4": 3.6258600473403932, "ce_loss_9": 3.312602734565735, "epoch": 0.444, "grad_norm": 828.0, "kl_loss_13": 74.46896743774414, "kl_loss_2": 1914.5224304199219, "kl_loss_4": 1037.9090118408203, "kl_loss_9": 298.5860763549805, "learning_rate": 0.0005961831330156305, "loss": 819.113, "step": 4440 }, { "ce_loss_13": 3.3450072526931764, "ce_loss_17": 3.3041136980056764, "ce_loss_2": 4.193412566184998, "ce_loss_4": 3.768865776062012, "ce_loss_9": 3.4508578300476076, "epoch": 0.445, "grad_norm": 1576.0, "kl_loss_13": 76.84782943725585, "kl_loss_2": 1931.1100830078126, "kl_loss_4": 1040.968194580078, "kl_loss_9": 299.5110198974609, "learning_rate": 0.0005946256221802051, "loss": 841.9498, "step": 4450 }, { "ce_loss_13": 3.322051453590393, "ce_loss_17": 3.2811499357223513, "ce_loss_2": 4.1088451743125916, "ce_loss_4": 3.7157408595085144, "ce_loss_9": 3.4206302642822264, "epoch": 0.446, "grad_norm": 912.0, "kl_loss_13": 75.70633506774902, "kl_loss_2": 1808.1333068847657, "kl_loss_4": 980.8681457519531, "kl_loss_9": 288.6044303894043, "learning_rate": 0.0005930671584658151, "loss": 832.3179, "step": 4460 }, { "ce_loss_13": 3.3276433110237122, "ce_loss_17": 3.2855786204338076, "ce_loss_2": 4.14680814743042, "ce_loss_4": 3.737990176677704, "ce_loss_9": 3.429105854034424, "epoch": 0.447, "grad_norm": 852.0, "kl_loss_13": 76.35856971740722, "kl_loss_2": 1891.0329956054688, "kl_loss_4": 1026.7948364257813, "kl_loss_9": 295.6607940673828, "learning_rate": 0.0005915077575661722, "loss": 830.7908, "step": 4470 }, { "ce_loss_13": 3.3396317839622496, "ce_loss_17": 3.295839321613312, "ce_loss_2": 4.168763411045075, "ce_loss_4": 3.7516340255737304, "ce_loss_9": 3.442499566078186, "epoch": 0.448, "grad_norm": 808.0, "kl_loss_13": 79.23018226623535, "kl_loss_2": 1906.8447692871093, "kl_loss_4": 1035.3365295410156, "kl_loss_9": 304.0178924560547, "learning_rate": 0.000589947435184427, "loss": 816.7979, "step": 4480 }, { "ce_loss_13": 3.4035409450531007, "ce_loss_17": 3.3630342364311216, "ce_loss_2": 4.180931556224823, "ce_loss_4": 3.7933268666267397, "ce_loss_9": 3.5012203454971313, "epoch": 0.449, "grad_norm": 824.0, "kl_loss_13": 78.1723731994629, "kl_loss_2": 1819.143408203125, "kl_loss_4": 1002.5709869384766, "kl_loss_9": 295.7471405029297, "learning_rate": 0.0005883862070330078, "loss": 811.6958, "step": 4490 }, { "ce_loss_13": 3.3434889793395994, "ce_loss_17": 3.300837182998657, "ce_loss_2": 4.154209268093109, "ce_loss_4": 3.7567609906196595, "ce_loss_9": 3.4478923201560976, "epoch": 0.45, "grad_norm": 884.0, "kl_loss_13": 77.33296165466308, "kl_loss_2": 1873.624853515625, "kl_loss_4": 1029.102764892578, "kl_loss_9": 299.1194305419922, "learning_rate": 0.0005868240888334653, "loss": 812.2415, "step": 4500 }, { "ce_loss_13": 3.2234660267829893, "ce_loss_17": 3.1812546253204346, "ce_loss_2": 4.079589867591858, "ce_loss_4": 3.6489335775375364, "ce_loss_9": 3.3286787033081056, "epoch": 0.451, "grad_norm": 1008.0, "kl_loss_13": 76.51410064697265, "kl_loss_2": 1931.3193603515624, "kl_loss_4": 1046.2738800048828, "kl_loss_9": 303.19763259887696, "learning_rate": 0.0005852610963163119, "loss": 831.9842, "step": 4510 }, { "ce_loss_13": 3.2492877006530763, "ce_loss_17": 3.209109592437744, "ce_loss_2": 4.060594689846039, "ce_loss_4": 3.654378688335419, "ce_loss_9": 3.349640953540802, "epoch": 0.452, "grad_norm": 940.0, "kl_loss_13": 74.64283828735351, "kl_loss_2": 1859.8475830078125, "kl_loss_4": 1011.8156707763671, "kl_loss_9": 294.5323791503906, "learning_rate": 0.0005836972452208654, "loss": 804.4784, "step": 4520 }, { "ce_loss_13": 3.2525294959545135, "ce_loss_17": 3.2113110303878782, "ce_loss_2": 4.088192105293274, "ce_loss_4": 3.667824113368988, "ce_loss_9": 3.3553738236427306, "epoch": 0.453, "grad_norm": 864.0, "kl_loss_13": 76.30174942016602, "kl_loss_2": 1897.0127502441405, "kl_loss_4": 1024.7700744628905, "kl_loss_9": 298.32971572875977, "learning_rate": 0.0005821325512950885, "loss": 824.1601, "step": 4530 }, { "ce_loss_13": 3.279394602775574, "ce_loss_17": 3.2389808893203735, "ce_loss_2": 4.09222549200058, "ce_loss_4": 3.687285912036896, "ce_loss_9": 3.3819608807563784, "epoch": 0.454, "grad_norm": 780.0, "kl_loss_13": 74.611399269104, "kl_loss_2": 1832.5005798339844, "kl_loss_4": 993.5156280517579, "kl_loss_9": 288.8341064453125, "learning_rate": 0.0005805670302954321, "loss": 815.0844, "step": 4540 }, { "ce_loss_13": 3.288856554031372, "ce_loss_17": 3.2489148497581484, "ce_loss_2": 4.091669178009033, "ce_loss_4": 3.6877204060554503, "ce_loss_9": 3.3897912740707397, "epoch": 0.455, "grad_norm": 828.0, "kl_loss_13": 73.65193538665771, "kl_loss_2": 1845.4981689453125, "kl_loss_4": 1002.7583282470703, "kl_loss_9": 293.07848052978517, "learning_rate": 0.000579000697986675, "loss": 804.0725, "step": 4550 }, { "ce_loss_13": 3.2434154629707335, "ce_loss_17": 3.199486696720123, "ce_loss_2": 4.098711669445038, "ce_loss_4": 3.681813371181488, "ce_loss_9": 3.3549178719520567, "epoch": 0.456, "grad_norm": 744.0, "kl_loss_13": 77.70943298339844, "kl_loss_2": 1935.5743041992187, "kl_loss_4": 1063.0270935058593, "kl_loss_9": 307.4092712402344, "learning_rate": 0.0005774335701417662, "loss": 828.9596, "step": 4560 }, { "ce_loss_13": 3.2368232369422913, "ce_loss_17": 3.196345341205597, "ce_loss_2": 4.089473974704743, "ce_loss_4": 3.661019968986511, "ce_loss_9": 3.3422934889793394, "epoch": 0.457, "grad_norm": 816.0, "kl_loss_13": 73.5301586151123, "kl_loss_2": 1949.0679992675782, "kl_loss_4": 1040.8331512451173, "kl_loss_9": 298.1072036743164, "learning_rate": 0.0005758656625416658, "loss": 832.1058, "step": 4570 }, { "ce_loss_13": 3.2876862049102784, "ce_loss_17": 3.2457969188690186, "ce_loss_2": 4.112439560890198, "ce_loss_4": 3.695971500873566, "ce_loss_9": 3.3895508646965027, "epoch": 0.458, "grad_norm": 776.0, "kl_loss_13": 75.88962821960449, "kl_loss_2": 1860.5553161621094, "kl_loss_4": 1020.166015625, "kl_loss_9": 297.5071060180664, "learning_rate": 0.0005742969909751859, "loss": 804.1062, "step": 4580 }, { "ce_loss_13": 3.3020657181739805, "ce_loss_17": 3.2607773661613466, "ce_loss_2": 4.131641614437103, "ce_loss_4": 3.709846580028534, "ce_loss_9": 3.405165731906891, "epoch": 0.459, "grad_norm": 1048.0, "kl_loss_13": 75.65441131591797, "kl_loss_2": 1891.6655334472657, "kl_loss_4": 1009.7342956542968, "kl_loss_9": 294.8964126586914, "learning_rate": 0.0005727275712388318, "loss": 823.5099, "step": 4590 }, { "ce_loss_13": 3.3285714864730833, "ce_loss_17": 3.287725341320038, "ce_loss_2": 4.124272167682648, "ce_loss_4": 3.7236576795578005, "ce_loss_9": 3.430411958694458, "epoch": 0.46, "grad_norm": 956.0, "kl_loss_13": 75.18080139160156, "kl_loss_2": 1820.7663513183593, "kl_loss_4": 986.5203704833984, "kl_loss_9": 289.905224609375, "learning_rate": 0.0005711574191366427, "loss": 804.5652, "step": 4600 }, { "ce_loss_13": 3.2801861643791197, "ce_loss_17": 3.2401907563209535, "ce_loss_2": 4.089137363433838, "ce_loss_4": 3.684733271598816, "ce_loss_9": 3.379390871524811, "epoch": 0.461, "grad_norm": 688.0, "kl_loss_13": 75.0859489440918, "kl_loss_2": 1857.538348388672, "kl_loss_4": 999.1473907470703, "kl_loss_9": 290.6371444702148, "learning_rate": 0.0005695865504800327, "loss": 803.9468, "step": 4610 }, { "ce_loss_13": 3.214577782154083, "ce_loss_17": 3.171001160144806, "ce_loss_2": 4.123684239387512, "ce_loss_4": 3.668971860408783, "ce_loss_9": 3.325447380542755, "epoch": 0.462, "grad_norm": 968.0, "kl_loss_13": 77.4129825592041, "kl_loss_2": 2041.3165405273437, "kl_loss_4": 1094.7594757080078, "kl_loss_9": 311.20372467041017, "learning_rate": 0.0005680149810876322, "loss": 845.5946, "step": 4620 }, { "ce_loss_13": 3.2732278943061828, "ce_loss_17": 3.2324661135673525, "ce_loss_2": 4.097072160243988, "ce_loss_4": 3.676388144493103, "ce_loss_9": 3.374810588359833, "epoch": 0.463, "grad_norm": 776.0, "kl_loss_13": 74.99469413757325, "kl_loss_2": 1880.4112670898437, "kl_loss_4": 1009.9831604003906, "kl_loss_9": 291.9598579406738, "learning_rate": 0.0005664427267851271, "loss": 808.8515, "step": 4630 }, { "ce_loss_13": 3.1931955575942994, "ce_loss_17": 3.1502653479576113, "ce_loss_2": 4.024770927429199, "ce_loss_4": 3.610238516330719, "ce_loss_9": 3.297864890098572, "epoch": 0.464, "grad_norm": 1000.0, "kl_loss_13": 74.38558692932129, "kl_loss_2": 1880.835107421875, "kl_loss_4": 1015.0424072265625, "kl_loss_9": 292.6310218811035, "learning_rate": 0.0005648698034051009, "loss": 812.3297, "step": 4640 }, { "ce_loss_13": 3.3028565406799317, "ce_loss_17": 3.259902000427246, "ce_loss_2": 4.152322435379029, "ce_loss_4": 3.719174313545227, "ce_loss_9": 3.406006133556366, "epoch": 0.465, "grad_norm": 760.0, "kl_loss_13": 75.3498405456543, "kl_loss_2": 1921.487548828125, "kl_loss_4": 1028.3840270996093, "kl_loss_9": 291.388444519043, "learning_rate": 0.0005632962267868747, "loss": 807.8761, "step": 4650 }, { "ce_loss_13": 3.2430774450302122, "ce_loss_17": 3.2033018827438355, "ce_loss_2": 4.058165490627289, "ce_loss_4": 3.6541751623153687, "ce_loss_9": 3.3456701397895814, "epoch": 0.466, "grad_norm": 840.0, "kl_loss_13": 73.02407569885254, "kl_loss_2": 1852.6263793945313, "kl_loss_4": 1005.7894439697266, "kl_loss_9": 287.7598388671875, "learning_rate": 0.0005617220127763474, "loss": 814.0826, "step": 4660 }, { "ce_loss_13": 3.3238396286964416, "ce_loss_17": 3.281664049625397, "ce_loss_2": 4.129569494724274, "ce_loss_4": 3.7306875705718996, "ce_loss_9": 3.4238651275634764, "epoch": 0.467, "grad_norm": 924.0, "kl_loss_13": 75.31039695739746, "kl_loss_2": 1845.7906188964844, "kl_loss_4": 1007.4520416259766, "kl_loss_9": 292.87747955322266, "learning_rate": 0.0005601471772258368, "loss": 814.8657, "step": 4670 }, { "ce_loss_13": 3.3064895629882813, "ce_loss_17": 3.2665050864219665, "ce_loss_2": 4.108851706981659, "ce_loss_4": 3.703833544254303, "ce_loss_9": 3.4077421545982363, "epoch": 0.468, "grad_norm": 964.0, "kl_loss_13": 74.9879051208496, "kl_loss_2": 1819.450994873047, "kl_loss_4": 974.3571472167969, "kl_loss_9": 288.794450378418, "learning_rate": 0.0005585717359939192, "loss": 812.0005, "step": 4680 }, { "ce_loss_13": 3.2169051647186278, "ce_loss_17": 3.1767580628395082, "ce_loss_2": 4.030018448829651, "ce_loss_4": 3.6190481901168825, "ce_loss_9": 3.3161608457565306, "epoch": 0.469, "grad_norm": 1032.0, "kl_loss_13": 73.6907470703125, "kl_loss_2": 1837.1917602539063, "kl_loss_4": 995.9050445556641, "kl_loss_9": 288.79329681396484, "learning_rate": 0.0005569957049452703, "loss": 820.1777, "step": 4690 }, { "ce_loss_13": 3.2727556824684143, "ce_loss_17": 3.233135461807251, "ce_loss_2": 4.109477472305298, "ce_loss_4": 3.689884305000305, "ce_loss_9": 3.3748852491378782, "epoch": 0.47, "grad_norm": 856.0, "kl_loss_13": 76.08234329223633, "kl_loss_2": 1908.498553466797, "kl_loss_4": 1032.0423736572266, "kl_loss_9": 300.4012725830078, "learning_rate": 0.0005554190999505056, "loss": 825.159, "step": 4700 }, { "ce_loss_13": 3.3975147128105165, "ce_loss_17": 3.3547534942626953, "ce_loss_2": 4.215018939971924, "ce_loss_4": 3.8030583024024964, "ce_loss_9": 3.4997135996818542, "epoch": 0.471, "grad_norm": 660.0, "kl_loss_13": 77.31126403808594, "kl_loss_2": 1889.2184509277345, "kl_loss_4": 1020.566683959961, "kl_loss_9": 302.34742279052733, "learning_rate": 0.0005538419368860196, "loss": 794.3334, "step": 4710 }, { "ce_loss_13": 3.3169658899307253, "ce_loss_17": 3.277206563949585, "ce_loss_2": 4.13512532711029, "ce_loss_4": 3.7266653776168823, "ce_loss_9": 3.4202173471450807, "epoch": 0.472, "grad_norm": 732.0, "kl_loss_13": 76.7188003540039, "kl_loss_2": 1864.9093627929688, "kl_loss_4": 1011.0238464355468, "kl_loss_9": 296.90992279052733, "learning_rate": 0.0005522642316338268, "loss": 827.9762, "step": 4720 }, { "ce_loss_13": 3.3335940837860107, "ce_loss_17": 3.2930097460746763, "ce_loss_2": 4.1359561562538145, "ce_loss_4": 3.7283730030059816, "ce_loss_9": 3.433068335056305, "epoch": 0.473, "grad_norm": 1032.0, "kl_loss_13": 76.10838012695312, "kl_loss_2": 1853.8134521484376, "kl_loss_4": 1004.4875946044922, "kl_loss_9": 296.6020965576172, "learning_rate": 0.0005506860000814017, "loss": 829.242, "step": 4730 }, { "ce_loss_13": 3.355704641342163, "ce_loss_17": 3.3165703058242797, "ce_loss_2": 4.14022581577301, "ce_loss_4": 3.7492562770843505, "ce_loss_9": 3.4540576934814453, "epoch": 0.474, "grad_norm": 952.0, "kl_loss_13": 74.14773750305176, "kl_loss_2": 1809.160467529297, "kl_loss_4": 994.4486236572266, "kl_loss_9": 289.21658477783205, "learning_rate": 0.0005491072581215186, "loss": 810.8421, "step": 4740 }, { "ce_loss_13": 3.347885513305664, "ce_loss_17": 3.305254805088043, "ce_loss_2": 4.148220801353455, "ce_loss_4": 3.7457194209098814, "ce_loss_9": 3.4504737854003906, "epoch": 0.475, "grad_norm": 956.0, "kl_loss_13": 76.87275047302246, "kl_loss_2": 1862.5593688964843, "kl_loss_4": 1007.2896026611328, "kl_loss_9": 299.8138198852539, "learning_rate": 0.0005475280216520913, "loss": 793.7902, "step": 4750 }, { "ce_loss_13": 3.2711644768714905, "ce_loss_17": 3.2309195160865785, "ce_loss_2": 4.070014572143554, "ce_loss_4": 3.6676210045814512, "ce_loss_9": 3.3719236969947817, "epoch": 0.476, "grad_norm": 752.0, "kl_loss_13": 73.26801719665528, "kl_loss_2": 1814.8681579589843, "kl_loss_4": 981.7904296875, "kl_loss_9": 286.1250335693359, "learning_rate": 0.0005459483065760138, "loss": 816.5074, "step": 4760 }, { "ce_loss_13": 3.2079075932502747, "ce_loss_17": 3.1671441316604616, "ce_loss_2": 4.089412212371826, "ce_loss_4": 3.642701601982117, "ce_loss_9": 3.309967613220215, "epoch": 0.477, "grad_norm": 884.0, "kl_loss_13": 74.20464248657227, "kl_loss_2": 1979.9038391113281, "kl_loss_4": 1055.5522186279297, "kl_loss_9": 293.0203689575195, "learning_rate": 0.0005443681288009991, "loss": 826.5146, "step": 4770 }, { "ce_loss_13": 3.266829860210419, "ce_loss_17": 3.2285204768180846, "ce_loss_2": 4.086682987213135, "ce_loss_4": 3.6759202241897584, "ce_loss_9": 3.36565181016922, "epoch": 0.478, "grad_norm": 672.0, "kl_loss_13": 73.82023429870605, "kl_loss_2": 1879.6912658691406, "kl_loss_4": 1018.4355865478516, "kl_loss_9": 290.7284698486328, "learning_rate": 0.0005427875042394199, "loss": 820.2439, "step": 4780 }, { "ce_loss_13": 3.2985992431640625, "ce_loss_17": 3.2553022503852844, "ce_loss_2": 4.103298151493073, "ce_loss_4": 3.704231595993042, "ce_loss_9": 3.3976196885108947, "epoch": 0.479, "grad_norm": 764.0, "kl_loss_13": 74.87098045349121, "kl_loss_2": 1833.0201904296875, "kl_loss_4": 1016.7945831298828, "kl_loss_9": 290.94402770996095, "learning_rate": 0.0005412064488081482, "loss": 817.3122, "step": 4790 }, { "ce_loss_13": 3.3004804491996764, "ce_loss_17": 3.260096788406372, "ce_loss_2": 4.096733212471008, "ce_loss_4": 3.698592507839203, "ce_loss_9": 3.3992846846580504, "epoch": 0.48, "grad_norm": 1072.0, "kl_loss_13": 73.09108085632325, "kl_loss_2": 1830.7623352050782, "kl_loss_4": 989.9618072509766, "kl_loss_9": 285.9252304077148, "learning_rate": 0.0005396249784283942, "loss": 797.4109, "step": 4800 }, { "ce_loss_13": 3.316614365577698, "ce_loss_17": 3.2748780608177186, "ce_loss_2": 4.164385390281677, "ce_loss_4": 3.7438156366348267, "ce_loss_9": 3.4193318128585815, "epoch": 0.481, "grad_norm": 760.0, "kl_loss_13": 76.38266716003417, "kl_loss_2": 1923.5742980957032, "kl_loss_4": 1041.6750427246093, "kl_loss_9": 298.795263671875, "learning_rate": 0.0005380431090255476, "loss": 824.89, "step": 4810 }, { "ce_loss_13": 3.3142627000808718, "ce_loss_17": 3.275036060810089, "ce_loss_2": 4.110712015628815, "ce_loss_4": 3.7121946454048156, "ce_loss_9": 3.4122665405273436, "epoch": 0.482, "grad_norm": 808.0, "kl_loss_13": 72.91415100097656, "kl_loss_2": 1832.0138732910157, "kl_loss_4": 991.3748962402344, "kl_loss_9": 284.617170715332, "learning_rate": 0.0005364608565290155, "loss": 798.3713, "step": 4820 }, { "ce_loss_13": 3.3208075881004335, "ce_loss_17": 3.2798139452934265, "ce_loss_2": 4.140064144134522, "ce_loss_4": 3.727204477787018, "ce_loss_9": 3.4234581351280213, "epoch": 0.483, "grad_norm": 800.0, "kl_loss_13": 75.80227832794189, "kl_loss_2": 1862.9129943847656, "kl_loss_4": 1001.9588317871094, "kl_loss_9": 291.62636489868163, "learning_rate": 0.0005348782368720626, "loss": 808.1032, "step": 4830 }, { "ce_loss_13": 3.252118909358978, "ce_loss_17": 3.2115795969963075, "ce_loss_2": 4.059725773334503, "ce_loss_4": 3.6502609133720396, "ce_loss_9": 3.353605532646179, "epoch": 0.484, "grad_norm": 784.0, "kl_loss_13": 72.63689727783203, "kl_loss_2": 1815.8565368652344, "kl_loss_4": 977.6638732910156, "kl_loss_9": 285.4698852539062, "learning_rate": 0.000533295265991652, "loss": 804.2033, "step": 4840 }, { "ce_loss_13": 3.3277907848358153, "ce_loss_17": 3.2861191630363464, "ce_loss_2": 4.117743873596192, "ce_loss_4": 3.725143051147461, "ce_loss_9": 3.4288668394088746, "epoch": 0.485, "grad_norm": 836.0, "kl_loss_13": 73.85648956298829, "kl_loss_2": 1807.0551513671876, "kl_loss_4": 990.4627105712891, "kl_loss_9": 289.78437957763674, "learning_rate": 0.0005317119598282822, "loss": 794.5045, "step": 4850 }, { "ce_loss_13": 3.3274188876152038, "ce_loss_17": 3.2868924021720884, "ce_loss_2": 4.134918856620788, "ce_loss_4": 3.7305999279022215, "ce_loss_9": 3.4288418531417846, "epoch": 0.486, "grad_norm": 1352.0, "kl_loss_13": 75.01168785095214, "kl_loss_2": 1836.2504455566407, "kl_loss_4": 1000.2125915527344, "kl_loss_9": 291.90661773681643, "learning_rate": 0.0005301283343258293, "loss": 804.1091, "step": 4860 }, { "ce_loss_13": 3.3871153354644776, "ce_loss_17": 3.3473598957061768, "ce_loss_2": 4.170122230052948, "ce_loss_4": 3.782205104827881, "ce_loss_9": 3.4865453600883485, "epoch": 0.487, "grad_norm": 928.0, "kl_loss_13": 75.0543228149414, "kl_loss_2": 1801.48056640625, "kl_loss_4": 986.4563842773438, "kl_loss_9": 290.7179382324219, "learning_rate": 0.000528544405431384, "loss": 791.7803, "step": 4870 }, { "ce_loss_13": 3.270020830631256, "ce_loss_17": 3.2277884244918824, "ce_loss_2": 4.086709308624267, "ce_loss_4": 3.6876027822494506, "ce_loss_9": 3.3727181911468507, "epoch": 0.488, "grad_norm": 820.0, "kl_loss_13": 75.41398048400879, "kl_loss_2": 1875.3604064941405, "kl_loss_4": 1033.8077758789063, "kl_loss_9": 298.76770172119143, "learning_rate": 0.000526960189095093, "loss": 817.7302, "step": 4880 }, { "ce_loss_13": 3.2510303258895874, "ce_loss_17": 3.211883878707886, "ce_loss_2": 4.061688530445099, "ce_loss_4": 3.65454957485199, "ce_loss_9": 3.3511975765228272, "epoch": 0.489, "grad_norm": 888.0, "kl_loss_13": 72.43558959960937, "kl_loss_2": 1828.6154235839845, "kl_loss_4": 991.6234741210938, "kl_loss_9": 285.3493843078613, "learning_rate": 0.0005253757012699972, "loss": 798.6789, "step": 4890 }, { "ce_loss_13": 3.3291762590408327, "ce_loss_17": 3.290039026737213, "ce_loss_2": 4.127644944190979, "ce_loss_4": 3.724576246738434, "ce_loss_9": 3.4292059659957888, "epoch": 0.49, "grad_norm": 740.0, "kl_loss_13": 73.65130043029785, "kl_loss_2": 1827.6836364746093, "kl_loss_4": 992.8492248535156, "kl_loss_9": 288.521134185791, "learning_rate": 0.0005237909579118712, "loss": 811.9515, "step": 4900 }, { "ce_loss_13": 3.2921934723854065, "ce_loss_17": 3.249121403694153, "ce_loss_2": 4.118033158779144, "ce_loss_4": 3.7062087774276735, "ce_loss_9": 3.3943717956542967, "epoch": 0.491, "grad_norm": 756.0, "kl_loss_13": 75.26352424621582, "kl_loss_2": 1878.7477905273438, "kl_loss_4": 1020.291845703125, "kl_loss_9": 296.14775390625, "learning_rate": 0.0005222059749790631, "loss": 816.4149, "step": 4910 }, { "ce_loss_13": 3.3588847875595094, "ce_loss_17": 3.318212938308716, "ce_loss_2": 4.122260499000549, "ce_loss_4": 3.7380555152893065, "ce_loss_9": 3.456986737251282, "epoch": 0.492, "grad_norm": 720.0, "kl_loss_13": 74.36726608276368, "kl_loss_2": 1770.406268310547, "kl_loss_4": 970.4561462402344, "kl_loss_9": 286.1612060546875, "learning_rate": 0.0005206207684323337, "loss": 776.1451, "step": 4920 }, { "ce_loss_13": 3.3389157891273498, "ce_loss_17": 3.2978055119514464, "ce_loss_2": 4.135409045219421, "ce_loss_4": 3.736425042152405, "ce_loss_9": 3.439645564556122, "epoch": 0.493, "grad_norm": 920.0, "kl_loss_13": 75.09701538085938, "kl_loss_2": 1837.3790893554688, "kl_loss_4": 992.6164123535157, "kl_loss_9": 291.5115341186523, "learning_rate": 0.000519035354234695, "loss": 816.9218, "step": 4930 }, { "ce_loss_13": 3.314695417881012, "ce_loss_17": 3.2720402002334597, "ce_loss_2": 4.1193211555480955, "ce_loss_4": 3.7305904626846313, "ce_loss_9": 3.4197781324386596, "epoch": 0.494, "grad_norm": 800.0, "kl_loss_13": 75.44659748077393, "kl_loss_2": 1821.7885375976562, "kl_loss_4": 1008.4688659667969, "kl_loss_9": 292.37608642578124, "learning_rate": 0.0005174497483512506, "loss": 792.1003, "step": 4940 }, { "ce_loss_13": 3.3612324833869933, "ce_loss_17": 3.3229804396629334, "ce_loss_2": 4.14638044834137, "ce_loss_4": 3.750081944465637, "ce_loss_9": 3.457711327075958, "epoch": 0.495, "grad_norm": 800.0, "kl_loss_13": 74.3962791442871, "kl_loss_2": 1826.6995910644532, "kl_loss_4": 992.9568328857422, "kl_loss_9": 289.2128746032715, "learning_rate": 0.0005158639667490339, "loss": 814.1679, "step": 4950 }, { "ce_loss_13": 3.2649693608284, "ce_loss_17": 3.224947285652161, "ce_loss_2": 4.075822901725769, "ce_loss_4": 3.669500434398651, "ce_loss_9": 3.3687703609466553, "epoch": 0.496, "grad_norm": 1104.0, "kl_loss_13": 74.10564365386963, "kl_loss_2": 1856.5388305664062, "kl_loss_4": 1008.420068359375, "kl_loss_9": 294.5977310180664, "learning_rate": 0.0005142780253968481, "loss": 804.6434, "step": 4960 }, { "ce_loss_13": 3.2142897129058836, "ce_loss_17": 3.174994421005249, "ce_loss_2": 4.017913889884949, "ce_loss_4": 3.6049700498580934, "ce_loss_9": 3.312189483642578, "epoch": 0.497, "grad_norm": 976.0, "kl_loss_13": 71.54103832244873, "kl_loss_2": 1836.7127624511718, "kl_loss_4": 976.3125457763672, "kl_loss_9": 280.7961006164551, "learning_rate": 0.0005126919402651053, "loss": 781.3013, "step": 4970 }, { "ce_loss_13": 3.278981614112854, "ce_loss_17": 3.2370316624641418, "ce_loss_2": 4.111584794521332, "ce_loss_4": 3.703745257854462, "ce_loss_9": 3.381889748573303, "epoch": 0.498, "grad_norm": 844.0, "kl_loss_13": 76.156498336792, "kl_loss_2": 1858.4030456542969, "kl_loss_4": 1016.5751037597656, "kl_loss_9": 293.5657470703125, "learning_rate": 0.0005111057273256647, "loss": 810.4161, "step": 4980 }, { "ce_loss_13": 3.382327103614807, "ce_loss_17": 3.343675124645233, "ce_loss_2": 4.123418486118316, "ce_loss_4": 3.752922296524048, "ce_loss_9": 3.4781156182289124, "epoch": 0.499, "grad_norm": 700.0, "kl_loss_13": 72.45312232971192, "kl_loss_2": 1723.856219482422, "kl_loss_4": 949.0105102539062, "kl_loss_9": 278.47458724975587, "learning_rate": 0.0005095194025516733, "loss": 772.5709, "step": 4990 }, { "ce_loss_13": 3.308869647979736, "ce_loss_17": 3.270881199836731, "ce_loss_2": 4.0973071455955505, "ce_loss_4": 3.697914254665375, "ce_loss_9": 3.404145121574402, "epoch": 0.5, "grad_norm": 844.0, "kl_loss_13": 72.70045700073243, "kl_loss_2": 1794.5357177734375, "kl_loss_4": 970.1500518798828, "kl_loss_9": 283.13077850341796, "learning_rate": 0.000507932981917404, "loss": 812.2307, "step": 5000 }, { "ce_loss_13": 3.262911152839661, "ce_loss_17": 3.2198340773582457, "ce_loss_2": 4.109212005138398, "ce_loss_4": 3.6863863348960875, "ce_loss_9": 3.3666589736938475, "epoch": 0.501, "grad_norm": 1080.0, "kl_loss_13": 77.47729015350342, "kl_loss_2": 1929.3942321777345, "kl_loss_4": 1039.4493103027344, "kl_loss_9": 300.106640625, "learning_rate": 0.0005063464813980949, "loss": 830.4402, "step": 5010 }, { "ce_loss_13": 3.2479494333267214, "ce_loss_17": 3.2078561663627623, "ce_loss_2": 4.062610566616058, "ce_loss_4": 3.6508336186409, "ce_loss_9": 3.343203544616699, "epoch": 0.502, "grad_norm": 672.0, "kl_loss_13": 73.65824165344239, "kl_loss_2": 1878.8285278320313, "kl_loss_4": 1018.11328125, "kl_loss_9": 289.3094146728516, "learning_rate": 0.0005047599169697884, "loss": 805.0937, "step": 5020 }, { "ce_loss_13": 3.185950815677643, "ce_loss_17": 3.14703825712204, "ce_loss_2": 4.00815167427063, "ce_loss_4": 3.5929755568504333, "ce_loss_9": 3.288442540168762, "epoch": 0.503, "grad_norm": 1088.0, "kl_loss_13": 71.29309349060058, "kl_loss_2": 1850.965869140625, "kl_loss_4": 994.7696960449218, "kl_loss_9": 285.5740135192871, "learning_rate": 0.000503173304609171, "loss": 785.2191, "step": 5030 }, { "ce_loss_13": 3.305916726589203, "ce_loss_17": 3.266102612018585, "ce_loss_2": 4.113384175300598, "ce_loss_4": 3.70848388671875, "ce_loss_9": 3.4055826902389525, "epoch": 0.504, "grad_norm": 888.0, "kl_loss_13": 73.97796363830567, "kl_loss_2": 1829.7077392578126, "kl_loss_4": 996.4985748291016, "kl_loss_9": 288.2816551208496, "learning_rate": 0.0005015866602934111, "loss": 787.8277, "step": 5040 }, { "ce_loss_13": 3.2743287205696108, "ce_loss_17": 3.233179819583893, "ce_loss_2": 4.111311686038971, "ce_loss_4": 3.698232448101044, "ce_loss_9": 3.3796739101409914, "epoch": 0.505, "grad_norm": 780.0, "kl_loss_13": 76.13735866546631, "kl_loss_2": 1898.339990234375, "kl_loss_4": 1037.7702270507812, "kl_loss_9": 300.18348693847656, "learning_rate": 0.0005, "loss": 813.3053, "step": 5050 }, { "ce_loss_13": 3.2703390598297117, "ce_loss_17": 3.2306999921798707, "ce_loss_2": 4.07856274843216, "ce_loss_4": 3.668815791606903, "ce_loss_9": 3.3710572361946105, "epoch": 0.506, "grad_norm": 856.0, "kl_loss_13": 75.43934326171875, "kl_loss_2": 1849.766552734375, "kl_loss_4": 1009.7808013916016, "kl_loss_9": 293.8470733642578, "learning_rate": 0.0004984133397065889, "loss": 794.9061, "step": 5060 }, { "ce_loss_13": 3.272858726978302, "ce_loss_17": 3.2313432335853576, "ce_loss_2": 4.103818774223328, "ce_loss_4": 3.7008283495903016, "ce_loss_9": 3.37606999874115, "epoch": 0.507, "grad_norm": 752.0, "kl_loss_13": 74.73295059204102, "kl_loss_2": 1863.9148071289062, "kl_loss_4": 1017.3056182861328, "kl_loss_9": 292.6587875366211, "learning_rate": 0.0004968266953908291, "loss": 795.0145, "step": 5070 }, { "ce_loss_13": 3.3145520091056824, "ce_loss_17": 3.2744415640830993, "ce_loss_2": 4.133978307247162, "ce_loss_4": 3.7164300322532653, "ce_loss_9": 3.4116795897483825, "epoch": 0.508, "grad_norm": 820.0, "kl_loss_13": 74.0131664276123, "kl_loss_2": 1865.4527099609375, "kl_loss_4": 999.7883392333985, "kl_loss_9": 286.20834197998045, "learning_rate": 0.0004952400830302117, "loss": 803.6775, "step": 5080 }, { "ce_loss_13": 3.2411397218704225, "ce_loss_17": 3.200059103965759, "ce_loss_2": 4.0809555649757385, "ce_loss_4": 3.655611753463745, "ce_loss_9": 3.343540573120117, "epoch": 0.509, "grad_norm": 916.0, "kl_loss_13": 75.35558547973633, "kl_loss_2": 1893.0482055664063, "kl_loss_4": 1019.6383666992188, "kl_loss_9": 296.9830749511719, "learning_rate": 0.0004936535186019053, "loss": 804.5086, "step": 5090 }, { "ce_loss_13": 3.3392978429794313, "ce_loss_17": 3.300644409656525, "ce_loss_2": 4.120908486843109, "ce_loss_4": 3.7291369080543517, "ce_loss_9": 3.4352852582931517, "epoch": 0.51, "grad_norm": 676.0, "kl_loss_13": 72.64418449401856, "kl_loss_2": 1780.3622009277344, "kl_loss_4": 962.9150421142579, "kl_loss_9": 280.7718734741211, "learning_rate": 0.000492067018082596, "loss": 785.5723, "step": 5100 }, { "ce_loss_13": 3.2748634576797486, "ce_loss_17": 3.233682465553284, "ce_loss_2": 4.130126976966858, "ce_loss_4": 3.698560190200806, "ce_loss_9": 3.3825439453125, "epoch": 0.511, "grad_norm": 844.0, "kl_loss_13": 75.8413314819336, "kl_loss_2": 1924.5922424316407, "kl_loss_4": 1034.881884765625, "kl_loss_9": 296.8105667114258, "learning_rate": 0.0004904805974483267, "loss": 832.7997, "step": 5110 }, { "ce_loss_13": 3.3867220759391783, "ce_loss_17": 3.3429943084716798, "ce_loss_2": 4.215401363372803, "ce_loss_4": 3.8140215635299684, "ce_loss_9": 3.4924907803535463, "epoch": 0.512, "grad_norm": 836.0, "kl_loss_13": 79.06632575988769, "kl_loss_2": 1902.0935119628907, "kl_loss_4": 1052.306689453125, "kl_loss_9": 305.49594650268557, "learning_rate": 0.0004888942726743353, "loss": 841.9122, "step": 5120 }, { "ce_loss_13": 3.260837697982788, "ce_loss_17": 3.2186120748519897, "ce_loss_2": 4.087520980834961, "ce_loss_4": 3.6748701691627503, "ce_loss_9": 3.3637019872665403, "epoch": 0.513, "grad_norm": 748.0, "kl_loss_13": 74.35917491912842, "kl_loss_2": 1883.8236267089844, "kl_loss_4": 1020.7645202636719, "kl_loss_9": 293.17310638427733, "learning_rate": 0.0004873080597348947, "loss": 815.0167, "step": 5130 }, { "ce_loss_13": 3.1523805379867555, "ce_loss_17": 3.1087000250816343, "ce_loss_2": 4.018757474422455, "ce_loss_4": 3.583875072002411, "ce_loss_9": 3.2562185406684874, "epoch": 0.514, "grad_norm": 780.0, "kl_loss_13": 74.4741828918457, "kl_loss_2": 1964.23408203125, "kl_loss_4": 1050.2632995605468, "kl_loss_9": 292.9966156005859, "learning_rate": 0.0004857219746031519, "loss": 820.3717, "step": 5140 }, { "ce_loss_13": 3.3229424476623537, "ce_loss_17": 3.282235884666443, "ce_loss_2": 4.113417172431946, "ce_loss_4": 3.716164600849152, "ce_loss_9": 3.420932078361511, "epoch": 0.515, "grad_norm": 856.0, "kl_loss_13": 76.73463134765625, "kl_loss_2": 1819.937060546875, "kl_loss_4": 990.5772918701172, "kl_loss_9": 289.09563064575195, "learning_rate": 0.0004841360332509663, "loss": 801.6995, "step": 5150 }, { "ce_loss_13": 3.2782339096069335, "ce_loss_17": 3.236938774585724, "ce_loss_2": 4.07007886171341, "ce_loss_4": 3.6759052515029906, "ce_loss_9": 3.375661253929138, "epoch": 0.516, "grad_norm": 760.0, "kl_loss_13": 72.78237037658691, "kl_loss_2": 1811.8954650878907, "kl_loss_4": 982.9941467285156, "kl_loss_9": 282.3444313049316, "learning_rate": 0.0004825502516487497, "loss": 769.3042, "step": 5160 }, { "ce_loss_13": 3.23654762506485, "ce_loss_17": 3.19771009683609, "ce_loss_2": 4.066962695121765, "ce_loss_4": 3.6531906723976135, "ce_loss_9": 3.339952623844147, "epoch": 0.517, "grad_norm": 1024.0, "kl_loss_13": 74.12141265869141, "kl_loss_2": 1892.0356384277343, "kl_loss_4": 1020.4204345703125, "kl_loss_9": 293.0897346496582, "learning_rate": 0.00048096464576530507, "loss": 815.3729, "step": 5170 }, { "ce_loss_13": 3.341633379459381, "ce_loss_17": 3.301531136035919, "ce_loss_2": 4.103941702842713, "ce_loss_4": 3.724453830718994, "ce_loss_9": 3.438535213470459, "epoch": 0.518, "grad_norm": 716.0, "kl_loss_13": 73.60916862487792, "kl_loss_2": 1762.4796569824218, "kl_loss_4": 962.1791381835938, "kl_loss_9": 282.87510833740237, "learning_rate": 0.00047937923156766646, "loss": 780.2356, "step": 5180 }, { "ce_loss_13": 3.3900951504707337, "ce_loss_17": 3.3490119099617006, "ce_loss_2": 4.146072888374329, "ce_loss_4": 3.7663703680038454, "ce_loss_9": 3.481011140346527, "epoch": 0.519, "grad_norm": 736.0, "kl_loss_13": 73.94559173583984, "kl_loss_2": 1764.4251647949218, "kl_loss_4": 968.4230743408203, "kl_loss_9": 282.1041458129883, "learning_rate": 0.00047779402502093696, "loss": 785.2774, "step": 5190 }, { "ce_loss_13": 3.3521920800209046, "ce_loss_17": 3.311307668685913, "ce_loss_2": 4.136666762828827, "ce_loss_4": 3.745848596096039, "ce_loss_9": 3.4521227836608888, "epoch": 0.52, "grad_norm": 656.0, "kl_loss_13": 73.88220329284668, "kl_loss_2": 1792.12763671875, "kl_loss_4": 979.6051513671875, "kl_loss_9": 286.3393188476563, "learning_rate": 0.0004762090420881289, "loss": 793.2979, "step": 5200 }, { "ce_loss_13": 3.2692319631576536, "ce_loss_17": 3.2304707527160645, "ce_loss_2": 4.053920090198517, "ce_loss_4": 3.664601814746857, "ce_loss_9": 3.366422188282013, "epoch": 0.521, "grad_norm": 728.0, "kl_loss_13": 74.0851318359375, "kl_loss_2": 1796.7323913574219, "kl_loss_4": 979.0724670410157, "kl_loss_9": 283.4450454711914, "learning_rate": 0.00047462429873000296, "loss": 778.6974, "step": 5210 }, { "ce_loss_13": 3.353558099269867, "ce_loss_17": 3.3126341223716738, "ce_loss_2": 4.132767391204834, "ce_loss_4": 3.734746587276459, "ce_loss_9": 3.451547932624817, "epoch": 0.522, "grad_norm": 700.0, "kl_loss_13": 74.0196662902832, "kl_loss_2": 1803.9498779296875, "kl_loss_4": 971.2668426513671, "kl_loss_9": 285.3925193786621, "learning_rate": 0.0004730398109049071, "loss": 784.4695, "step": 5220 }, { "ce_loss_13": 3.2815186381340027, "ce_loss_17": 3.2411299347877502, "ce_loss_2": 4.117280209064484, "ce_loss_4": 3.7015105962753294, "ce_loss_9": 3.3863202929496765, "epoch": 0.523, "grad_norm": 1328.0, "kl_loss_13": 75.13538360595703, "kl_loss_2": 1900.709716796875, "kl_loss_4": 1028.2422943115234, "kl_loss_9": 298.4895965576172, "learning_rate": 0.000471455594568616, "loss": 806.2253, "step": 5230 }, { "ce_loss_13": 3.35224426984787, "ce_loss_17": 3.313410794734955, "ce_loss_2": 4.1167085528373715, "ce_loss_4": 3.735312795639038, "ce_loss_9": 3.450315523147583, "epoch": 0.524, "grad_norm": 992.0, "kl_loss_13": 74.43409156799316, "kl_loss_2": 1761.6297302246094, "kl_loss_4": 960.3011840820312, "kl_loss_9": 284.1883819580078, "learning_rate": 0.00046987166567417086, "loss": 786.022, "step": 5240 }, { "ce_loss_13": 3.2736294507980346, "ce_loss_17": 3.2341216087341307, "ce_loss_2": 4.069810843467712, "ce_loss_4": 3.6714844346046447, "ce_loss_9": 3.3713714718818664, "epoch": 0.525, "grad_norm": 828.0, "kl_loss_13": 72.30632076263427, "kl_loss_2": 1832.9207580566406, "kl_loss_4": 987.8688446044922, "kl_loss_9": 285.9743133544922, "learning_rate": 0.00046828804017171776, "loss": 772.2806, "step": 5250 }, { "ce_loss_13": 3.316581404209137, "ce_loss_17": 3.273679721355438, "ce_loss_2": 4.146453988552094, "ce_loss_4": 3.733655881881714, "ce_loss_9": 3.4181808471679687, "epoch": 0.526, "grad_norm": 936.0, "kl_loss_13": 74.23440246582031, "kl_loss_2": 1845.0445129394532, "kl_loss_4": 1004.0903381347656, "kl_loss_9": 291.0963500976562, "learning_rate": 0.00046670473400834805, "loss": 808.1028, "step": 5260 }, { "ce_loss_13": 3.2513556122779845, "ce_loss_17": 3.2132788777351378, "ce_loss_2": 4.033259403705597, "ce_loss_4": 3.635775065422058, "ce_loss_9": 3.3464043021202086, "epoch": 0.527, "grad_norm": 840.0, "kl_loss_13": 71.38737449645996, "kl_loss_2": 1778.4729431152343, "kl_loss_4": 960.8607696533203, "kl_loss_9": 277.62567749023435, "learning_rate": 0.00046512176312793734, "loss": 809.4104, "step": 5270 }, { "ce_loss_13": 3.2459253072738647, "ce_loss_17": 3.204818546772003, "ce_loss_2": 4.041217648983002, "ce_loss_4": 3.645413875579834, "ce_loss_9": 3.3453406572341917, "epoch": 0.528, "grad_norm": 796.0, "kl_loss_13": 72.22671623229981, "kl_loss_2": 1825.0807006835937, "kl_loss_4": 988.5527099609375, "kl_loss_9": 285.0157127380371, "learning_rate": 0.00046353914347098467, "loss": 801.4178, "step": 5280 }, { "ce_loss_13": 3.345044183731079, "ce_loss_17": 3.306043243408203, "ce_loss_2": 4.1388083577156065, "ce_loss_4": 3.741454613208771, "ce_loss_9": 3.4415024399757383, "epoch": 0.529, "grad_norm": 1040.0, "kl_loss_13": 73.76498985290527, "kl_loss_2": 1808.4031982421875, "kl_loss_4": 979.0365051269531, "kl_loss_9": 281.8998863220215, "learning_rate": 0.0004619568909744524, "loss": 797.9169, "step": 5290 }, { "ce_loss_13": 3.347410809993744, "ce_loss_17": 3.308348262310028, "ce_loss_2": 4.13295624256134, "ce_loss_4": 3.740420389175415, "ce_loss_9": 3.444485080242157, "epoch": 0.53, "grad_norm": 720.0, "kl_loss_13": 73.7395851135254, "kl_loss_2": 1801.3715942382812, "kl_loss_4": 977.6004638671875, "kl_loss_9": 286.12501373291013, "learning_rate": 0.00046037502157160573, "loss": 796.7553, "step": 5300 }, { "ce_loss_13": 3.2193509340286255, "ce_loss_17": 3.1804965138435364, "ce_loss_2": 4.027743196487426, "ce_loss_4": 3.6299861907958983, "ce_loss_9": 3.3204323410987855, "epoch": 0.531, "grad_norm": 860.0, "kl_loss_13": 73.14422378540038, "kl_loss_2": 1833.43486328125, "kl_loss_4": 996.2987762451172, "kl_loss_9": 287.3218566894531, "learning_rate": 0.00045879355119185207, "loss": 798.9685, "step": 5310 }, { "ce_loss_13": 3.301614260673523, "ce_loss_17": 3.2612352848052977, "ce_loss_2": 4.112736761569977, "ce_loss_4": 3.7088679313659667, "ce_loss_9": 3.403615081310272, "epoch": 0.532, "grad_norm": 796.0, "kl_loss_13": 73.1871494293213, "kl_loss_2": 1856.7558044433595, "kl_loss_4": 1009.262451171875, "kl_loss_9": 291.91101989746096, "learning_rate": 0.0004572124957605803, "loss": 814.7009, "step": 5320 }, { "ce_loss_13": 3.321007227897644, "ce_loss_17": 3.2809830784797667, "ce_loss_2": 4.108720934391021, "ce_loss_4": 3.7170739650726317, "ce_loss_9": 3.4239102602005005, "epoch": 0.533, "grad_norm": 732.0, "kl_loss_13": 73.42384452819825, "kl_loss_2": 1818.5907470703125, "kl_loss_4": 989.3438262939453, "kl_loss_9": 290.179621887207, "learning_rate": 0.00045563187119900103, "loss": 784.1296, "step": 5330 }, { "ce_loss_13": 3.1643845319747923, "ce_loss_17": 3.1251938700675965, "ce_loss_2": 3.995231831073761, "ce_loss_4": 3.5754489064216615, "ce_loss_9": 3.2659531235694885, "epoch": 0.534, "grad_norm": 1000.0, "kl_loss_13": 72.27840385437011, "kl_loss_2": 1878.682550048828, "kl_loss_4": 1004.5179229736328, "kl_loss_9": 287.7483283996582, "learning_rate": 0.00045405169342398633, "loss": 807.4451, "step": 5340 }, { "ce_loss_13": 3.2520262837409972, "ce_loss_17": 3.2115379691123964, "ce_loss_2": 4.080039012432098, "ce_loss_4": 3.657986414432526, "ce_loss_9": 3.3535806059837343, "epoch": 0.535, "grad_norm": 868.0, "kl_loss_13": 74.16668548583985, "kl_loss_2": 1865.976123046875, "kl_loss_4": 997.759033203125, "kl_loss_9": 288.57018051147463, "learning_rate": 0.0004524719783479088, "loss": 789.2814, "step": 5350 }, { "ce_loss_13": 3.206182086467743, "ce_loss_17": 3.1656386494636535, "ce_loss_2": 4.046193289756775, "ce_loss_4": 3.624490666389465, "ce_loss_9": 3.309402322769165, "epoch": 0.536, "grad_norm": 808.0, "kl_loss_13": 74.21720085144042, "kl_loss_2": 1901.2041381835938, "kl_loss_4": 1022.456185913086, "kl_loss_9": 292.5477523803711, "learning_rate": 0.00045089274187848144, "loss": 797.1016, "step": 5360 }, { "ce_loss_13": 3.330158460140228, "ce_loss_17": 3.2923543214797975, "ce_loss_2": 4.103685748577118, "ce_loss_4": 3.7177846431732178, "ce_loss_9": 3.4239140868186952, "epoch": 0.537, "grad_norm": 1144.0, "kl_loss_13": 73.13119049072266, "kl_loss_2": 1799.9715087890625, "kl_loss_4": 975.4336334228516, "kl_loss_9": 285.69232788085935, "learning_rate": 0.00044931399991859835, "loss": 781.6728, "step": 5370 }, { "ce_loss_13": 3.1870726346969604, "ce_loss_17": 3.1471142411231994, "ce_loss_2": 3.9945030927658083, "ce_loss_4": 3.5893633484840395, "ce_loss_9": 3.2885418176651, "epoch": 0.538, "grad_norm": 820.0, "kl_loss_13": 72.292848777771, "kl_loss_2": 1837.003564453125, "kl_loss_4": 990.8762390136719, "kl_loss_9": 285.67307739257814, "learning_rate": 0.00044773576836617336, "loss": 785.0116, "step": 5380 }, { "ce_loss_13": 3.277537798881531, "ce_loss_17": 3.2368523240089417, "ce_loss_2": 4.089139556884765, "ce_loss_4": 3.6902815222740175, "ce_loss_9": 3.380195736885071, "epoch": 0.539, "grad_norm": 820.0, "kl_loss_13": 73.99409484863281, "kl_loss_2": 1863.6287780761718, "kl_loss_4": 1013.0081237792969, "kl_loss_9": 291.7620361328125, "learning_rate": 0.00044615806311398056, "loss": 817.4792, "step": 5390 }, { "ce_loss_13": 3.3569162130355834, "ce_loss_17": 3.3178048491477967, "ce_loss_2": 4.094481468200684, "ce_loss_4": 3.7208905458450316, "ce_loss_9": 3.448286509513855, "epoch": 0.54, "grad_norm": 848.0, "kl_loss_13": 72.22508735656739, "kl_loss_2": 1719.27236328125, "kl_loss_4": 941.8825927734375, "kl_loss_9": 278.6324577331543, "learning_rate": 0.00044458090004949454, "loss": 786.7616, "step": 5400 }, { "ce_loss_13": 3.2138561010360718, "ce_loss_17": 3.172481417655945, "ce_loss_2": 4.065579998493194, "ce_loss_4": 3.6457912802696226, "ce_loss_9": 3.3184029817581178, "epoch": 0.541, "grad_norm": 760.0, "kl_loss_13": 75.91326332092285, "kl_loss_2": 1954.336328125, "kl_loss_4": 1064.4479553222657, "kl_loss_9": 301.6357475280762, "learning_rate": 0.0004430042950547297, "loss": 810.616, "step": 5410 }, { "ce_loss_13": 3.306875801086426, "ce_loss_17": 3.2644275188446046, "ce_loss_2": 4.119654250144959, "ce_loss_4": 3.7173089504241945, "ce_loss_9": 3.411296534538269, "epoch": 0.542, "grad_norm": 1072.0, "kl_loss_13": 76.02639694213867, "kl_loss_2": 1850.6163391113282, "kl_loss_4": 1003.1622009277344, "kl_loss_9": 294.12415466308596, "learning_rate": 0.0004414282640060809, "loss": 800.0967, "step": 5420 }, { "ce_loss_13": 3.4018345355987547, "ce_loss_17": 3.357880413532257, "ce_loss_2": 4.178731369972229, "ce_loss_4": 3.801996040344238, "ce_loss_9": 3.5053011655807493, "epoch": 0.543, "grad_norm": 844.0, "kl_loss_13": 77.79082794189453, "kl_loss_2": 1773.267041015625, "kl_loss_4": 982.6691650390625, "kl_loss_9": 305.6775268554687, "learning_rate": 0.0004398528227741633, "loss": 816.218, "step": 5430 }, { "ce_loss_13": 3.2647178173065186, "ce_loss_17": 3.222849798202515, "ce_loss_2": 4.06882221698761, "ce_loss_4": 3.671201431751251, "ce_loss_9": 3.3631152153015136, "epoch": 0.544, "grad_norm": 1128.0, "kl_loss_13": 76.07216796875, "kl_loss_2": 1808.1482543945312, "kl_loss_4": 997.9567108154297, "kl_loss_9": 296.2308380126953, "learning_rate": 0.00043827798722365264, "loss": 805.9838, "step": 5440 }, { "ce_loss_13": 3.387359392642975, "ce_loss_17": 3.3467159628868104, "ce_loss_2": 4.153163635730744, "ce_loss_4": 3.7647279858589173, "ce_loss_9": 3.48160742521286, "epoch": 0.545, "grad_norm": 988.0, "kl_loss_13": 76.58334579467774, "kl_loss_2": 1779.9947448730468, "kl_loss_4": 964.375357055664, "kl_loss_9": 291.64692840576174, "learning_rate": 0.00043670377321312535, "loss": 777.6915, "step": 5450 }, { "ce_loss_13": 3.3918582439422607, "ce_loss_17": 3.3527199625968933, "ce_loss_2": 4.149947786331177, "ce_loss_4": 3.7716354727745056, "ce_loss_9": 3.481933128833771, "epoch": 0.546, "grad_norm": 788.0, "kl_loss_13": 74.72479209899902, "kl_loss_2": 1766.5358337402345, "kl_loss_4": 963.9497955322265, "kl_loss_9": 286.72948455810547, "learning_rate": 0.0004351301965948991, "loss": 788.3238, "step": 5460 }, { "ce_loss_13": 3.29829740524292, "ce_loss_17": 3.257559609413147, "ce_loss_2": 4.066121160984039, "ce_loss_4": 3.677996289730072, "ce_loss_9": 3.3944310784339904, "epoch": 0.547, "grad_norm": 804.0, "kl_loss_13": 73.52764205932617, "kl_loss_2": 1759.8200561523438, "kl_loss_4": 955.4488830566406, "kl_loss_9": 282.16875, "learning_rate": 0.000433557273214873, "loss": 784.6819, "step": 5470 }, { "ce_loss_13": 3.289206564426422, "ce_loss_17": 3.250258719921112, "ce_loss_2": 4.070839929580688, "ce_loss_4": 3.6803099632263185, "ce_loss_9": 3.38953412771225, "epoch": 0.548, "grad_norm": 784.0, "kl_loss_13": 73.20038318634033, "kl_loss_2": 1785.5767761230468, "kl_loss_4": 964.192514038086, "kl_loss_9": 285.82276916503906, "learning_rate": 0.000431985018912368, "loss": 773.3384, "step": 5480 }, { "ce_loss_13": 3.2603097796440124, "ce_loss_17": 3.2189146876335144, "ce_loss_2": 4.083324456214905, "ce_loss_4": 3.6726097226142884, "ce_loss_9": 3.3604193806648253, "epoch": 0.549, "grad_norm": 928.0, "kl_loss_13": 74.56349067687988, "kl_loss_2": 1874.79150390625, "kl_loss_4": 1015.2865112304687, "kl_loss_9": 294.37069549560545, "learning_rate": 0.0004304134495199674, "loss": 787.2493, "step": 5490 }, { "ce_loss_13": 3.285221612453461, "ce_loss_17": 3.244589960575104, "ce_loss_2": 4.08959436416626, "ce_loss_4": 3.694540321826935, "ce_loss_9": 3.3886062860488892, "epoch": 0.55, "grad_norm": 800.0, "kl_loss_13": 75.10189781188964, "kl_loss_2": 1862.0597717285157, "kl_loss_4": 1024.3334533691407, "kl_loss_9": 298.7258102416992, "learning_rate": 0.0004288425808633575, "loss": 800.5307, "step": 5500 }, { "ce_loss_13": 3.2647858023643495, "ce_loss_17": 3.2276356697082518, "ce_loss_2": 4.072724485397339, "ce_loss_4": 3.6583107709884644, "ce_loss_9": 3.3632304668426514, "epoch": 0.551, "grad_norm": 1304.0, "kl_loss_13": 72.74190845489503, "kl_loss_2": 1834.6743041992188, "kl_loss_4": 987.31357421875, "kl_loss_9": 286.421395111084, "learning_rate": 0.0004272724287611684, "loss": 796.3049, "step": 5510 }, { "ce_loss_13": 3.238948404788971, "ce_loss_17": 3.1981234192848205, "ce_loss_2": 4.060285580158234, "ce_loss_4": 3.635609674453735, "ce_loss_9": 3.3366718769073485, "epoch": 0.552, "grad_norm": 848.0, "kl_loss_13": 73.77195167541504, "kl_loss_2": 1879.8147033691407, "kl_loss_4": 999.0303314208984, "kl_loss_9": 289.79567489624026, "learning_rate": 0.00042570300902481425, "loss": 798.8034, "step": 5520 }, { "ce_loss_13": 3.272075152397156, "ce_loss_17": 3.234650266170502, "ce_loss_2": 4.0568290710449215, "ce_loss_4": 3.6595101237297056, "ce_loss_9": 3.365482270717621, "epoch": 0.553, "grad_norm": 988.0, "kl_loss_13": 72.45253143310546, "kl_loss_2": 1820.305108642578, "kl_loss_4": 980.4863983154297, "kl_loss_9": 285.43726501464846, "learning_rate": 0.00042413433745833423, "loss": 787.6417, "step": 5530 }, { "ce_loss_13": 3.26614705324173, "ce_loss_17": 3.2261630058288575, "ce_loss_2": 4.0755760908126835, "ce_loss_4": 3.669107723236084, "ce_loss_9": 3.364769494533539, "epoch": 0.554, "grad_norm": 700.0, "kl_loss_13": 73.44537525177002, "kl_loss_2": 1826.7767822265625, "kl_loss_4": 984.6950897216797, "kl_loss_9": 285.97044677734374, "learning_rate": 0.0004225664298582339, "loss": 771.5904, "step": 5540 }, { "ce_loss_13": 3.349853444099426, "ce_loss_17": 3.3096522808074953, "ce_loss_2": 4.12833468914032, "ce_loss_4": 3.7342480659484862, "ce_loss_9": 3.444065606594086, "epoch": 0.555, "grad_norm": 896.0, "kl_loss_13": 73.12475662231445, "kl_loss_2": 1772.5195373535157, "kl_loss_4": 959.9620391845704, "kl_loss_9": 281.3079231262207, "learning_rate": 0.000420999302013325, "loss": 776.8577, "step": 5550 }, { "ce_loss_13": 3.253776121139526, "ce_loss_17": 3.212263309955597, "ce_loss_2": 4.09758027791977, "ce_loss_4": 3.661249279975891, "ce_loss_9": 3.355074954032898, "epoch": 0.556, "grad_norm": 868.0, "kl_loss_13": 75.87601051330566, "kl_loss_2": 1901.3112548828126, "kl_loss_4": 1008.3065582275391, "kl_loss_9": 296.084765625, "learning_rate": 0.000419432969704568, "loss": 795.7606, "step": 5560 }, { "ce_loss_13": 3.289377510547638, "ce_loss_17": 3.250764536857605, "ce_loss_2": 4.073163557052612, "ce_loss_4": 3.680362272262573, "ce_loss_9": 3.388530659675598, "epoch": 0.557, "grad_norm": 828.0, "kl_loss_13": 73.03952255249024, "kl_loss_2": 1783.5357543945313, "kl_loss_4": 967.2474578857422, "kl_loss_9": 283.408634185791, "learning_rate": 0.00041786744870491154, "loss": 801.2744, "step": 5570 }, { "ce_loss_13": 3.2308903098106385, "ce_loss_17": 3.1919775366783143, "ce_loss_2": 4.034533071517944, "ce_loss_4": 3.6301452279090882, "ce_loss_9": 3.3295583367347716, "epoch": 0.558, "grad_norm": 980.0, "kl_loss_13": 74.31400451660156, "kl_loss_2": 1842.5608154296874, "kl_loss_4": 1004.8509399414063, "kl_loss_9": 293.26367645263673, "learning_rate": 0.0004163027547791347, "loss": 794.3833, "step": 5580 }, { "ce_loss_13": 3.211347496509552, "ce_loss_17": 3.1711423277854918, "ce_loss_2": 4.050317776203156, "ce_loss_4": 3.624680197238922, "ce_loss_9": 3.310125172138214, "epoch": 0.559, "grad_norm": 880.0, "kl_loss_13": 73.63075923919678, "kl_loss_2": 1894.5034240722657, "kl_loss_4": 1011.8344757080079, "kl_loss_9": 293.5899291992188, "learning_rate": 0.0004147389036836881, "loss": 801.4451, "step": 5590 }, { "ce_loss_13": 3.259829878807068, "ce_loss_17": 3.220303547382355, "ce_loss_2": 4.069304740428924, "ce_loss_4": 3.671776497364044, "ce_loss_9": 3.359443461894989, "epoch": 0.56, "grad_norm": 1296.0, "kl_loss_13": 74.24327507019044, "kl_loss_2": 1844.3182250976563, "kl_loss_4": 1008.930209350586, "kl_loss_9": 290.1443237304687, "learning_rate": 0.00041317591116653486, "loss": 810.9732, "step": 5600 }, { "ce_loss_13": 3.2949826955795287, "ce_loss_17": 3.254460871219635, "ce_loss_2": 4.106885957717895, "ce_loss_4": 3.699885070323944, "ce_loss_9": 3.3967799067497255, "epoch": 0.561, "grad_norm": 968.0, "kl_loss_13": 75.05366821289063, "kl_loss_2": 1856.025897216797, "kl_loss_4": 1003.9119995117187, "kl_loss_9": 293.60422973632814, "learning_rate": 0.0004116137929669921, "loss": 791.4979, "step": 5610 }, { "ce_loss_13": 3.287285017967224, "ce_loss_17": 3.2479687929153442, "ce_loss_2": 4.0837935447692875, "ce_loss_4": 3.680448615550995, "ce_loss_9": 3.386300873756409, "epoch": 0.562, "grad_norm": 1056.0, "kl_loss_13": 72.04925346374512, "kl_loss_2": 1821.5766967773438, "kl_loss_4": 986.7944122314453, "kl_loss_9": 286.0955436706543, "learning_rate": 0.00041005256481557305, "loss": 780.1872, "step": 5620 }, { "ce_loss_13": 3.3894767999649047, "ce_loss_17": 3.3497204542160035, "ce_loss_2": 4.132556700706482, "ce_loss_4": 3.76358847618103, "ce_loss_9": 3.4814849138259887, "epoch": 0.563, "grad_norm": 816.0, "kl_loss_13": 71.50232734680176, "kl_loss_2": 1727.2944641113281, "kl_loss_4": 946.6339752197266, "kl_loss_9": 276.58668060302733, "learning_rate": 0.00040849224243382767, "loss": 766.1647, "step": 5630 }, { "ce_loss_13": 3.242488920688629, "ce_loss_17": 3.202657175064087, "ce_loss_2": 4.0449035406112674, "ce_loss_4": 3.6400243639945984, "ce_loss_9": 3.3369064807891844, "epoch": 0.564, "grad_norm": 1020.0, "kl_loss_13": 72.28278465270996, "kl_loss_2": 1823.1846130371093, "kl_loss_4": 994.4835876464844, "kl_loss_9": 287.82414321899415, "learning_rate": 0.000406932841534185, "loss": 780.4899, "step": 5640 }, { "ce_loss_13": 3.20475310087204, "ce_loss_17": 3.1642747282981873, "ce_loss_2": 4.017904949188233, "ce_loss_4": 3.613080847263336, "ce_loss_9": 3.3053141593933106, "epoch": 0.565, "grad_norm": 1004.0, "kl_loss_13": 72.92307014465332, "kl_loss_2": 1857.2842529296875, "kl_loss_4": 1004.6457885742187, "kl_loss_9": 287.81507263183596, "learning_rate": 0.0004053743778197951, "loss": 813.7131, "step": 5650 }, { "ce_loss_13": 3.3094315767288207, "ce_loss_17": 3.2693715691566467, "ce_loss_2": 4.100631475448608, "ce_loss_4": 3.7085015058517454, "ce_loss_9": 3.40853990316391, "epoch": 0.566, "grad_norm": 916.0, "kl_loss_13": 75.08573303222656, "kl_loss_2": 1802.2984436035156, "kl_loss_4": 985.8848937988281, "kl_loss_9": 289.80066299438477, "learning_rate": 0.0004038168669843697, "loss": 799.6597, "step": 5660 }, { "ce_loss_13": 3.266611671447754, "ce_loss_17": 3.226332187652588, "ce_loss_2": 4.034893548488617, "ce_loss_4": 3.64892041683197, "ce_loss_9": 3.363729107379913, "epoch": 0.567, "grad_norm": 1328.0, "kl_loss_13": 72.34437274932861, "kl_loss_2": 1765.5274291992187, "kl_loss_4": 955.3150238037109, "kl_loss_9": 282.6461380004883, "learning_rate": 0.000402260324712026, "loss": 790.4768, "step": 5670 }, { "ce_loss_13": 3.3097354412078857, "ce_loss_17": 3.2704842329025268, "ce_loss_2": 4.124593257904053, "ce_loss_4": 3.7098461508750917, "ce_loss_9": 3.411205291748047, "epoch": 0.568, "grad_norm": 1224.0, "kl_loss_13": 72.51471862792968, "kl_loss_2": 1852.9243041992188, "kl_loss_4": 988.7502838134766, "kl_loss_9": 285.3108932495117, "learning_rate": 0.00040070476667712743, "loss": 783.2475, "step": 5680 }, { "ce_loss_13": 3.340377914905548, "ce_loss_17": 3.3000412344932557, "ce_loss_2": 4.13344761133194, "ce_loss_4": 3.7344311833381654, "ce_loss_9": 3.4361794233322143, "epoch": 0.569, "grad_norm": 688.0, "kl_loss_13": 74.35391159057617, "kl_loss_2": 1810.0054870605468, "kl_loss_4": 979.7866760253906, "kl_loss_9": 283.6129066467285, "learning_rate": 0.0003991502085441259, "loss": 791.0041, "step": 5690 }, { "ce_loss_13": 3.3778745532035828, "ce_loss_17": 3.33834547996521, "ce_loss_2": 4.130846786499023, "ce_loss_4": 3.750100874900818, "ce_loss_9": 3.4685394763946533, "epoch": 0.57, "grad_norm": 848.0, "kl_loss_13": 72.54917221069336, "kl_loss_2": 1723.6391296386719, "kl_loss_4": 939.2174407958985, "kl_loss_9": 277.1242614746094, "learning_rate": 0.0003975966659674047, "loss": 776.0271, "step": 5700 }, { "ce_loss_13": 3.3454225778579714, "ce_loss_17": 3.3044866919517517, "ce_loss_2": 4.126947188377381, "ce_loss_4": 3.7350040912628173, "ce_loss_9": 3.4427593469619753, "epoch": 0.571, "grad_norm": 848.0, "kl_loss_13": 73.82932510375977, "kl_loss_2": 1794.7004455566407, "kl_loss_4": 971.1542999267579, "kl_loss_9": 282.82764434814453, "learning_rate": 0.0003960441545911204, "loss": 776.3346, "step": 5710 }, { "ce_loss_13": 3.338672161102295, "ce_loss_17": 3.298799526691437, "ce_loss_2": 4.117006981372834, "ce_loss_4": 3.7270028710365297, "ce_loss_9": 3.4329198598861694, "epoch": 0.572, "grad_norm": 840.0, "kl_loss_13": 73.2137435913086, "kl_loss_2": 1802.4476318359375, "kl_loss_4": 983.7936096191406, "kl_loss_9": 285.7454093933105, "learning_rate": 0.0003944926900490452, "loss": 782.9495, "step": 5720 }, { "ce_loss_13": 3.248178768157959, "ce_loss_17": 3.2080833554267882, "ce_loss_2": 4.0701495885849, "ce_loss_4": 3.6604724168777465, "ce_loss_9": 3.3536651849746706, "epoch": 0.573, "grad_norm": 672.0, "kl_loss_13": 73.6166389465332, "kl_loss_2": 1849.4944641113282, "kl_loss_4": 999.0758972167969, "kl_loss_9": 290.0587493896484, "learning_rate": 0.0003929422879644099, "loss": 784.3274, "step": 5730 }, { "ce_loss_13": 3.259362077713013, "ce_loss_17": 3.218967521190643, "ce_loss_2": 4.038165628910065, "ce_loss_4": 3.63946738243103, "ce_loss_9": 3.353592646121979, "epoch": 0.574, "grad_norm": 940.0, "kl_loss_13": 72.40253295898438, "kl_loss_2": 1799.1306030273438, "kl_loss_4": 966.6379364013671, "kl_loss_9": 281.5230438232422, "learning_rate": 0.0003913929639497462, "loss": 765.5293, "step": 5740 }, { "ce_loss_13": 3.2139196157455445, "ce_loss_17": 3.1727853059768676, "ce_loss_2": 4.041042923927307, "ce_loss_4": 3.619505214691162, "ce_loss_9": 3.312306559085846, "epoch": 0.575, "grad_norm": 744.0, "kl_loss_13": 71.97178192138672, "kl_loss_2": 1872.7219848632812, "kl_loss_4": 992.9176544189453, "kl_loss_9": 282.19856185913085, "learning_rate": 0.00038984473360672965, "loss": 783.3154, "step": 5750 }, { "ce_loss_13": 3.219143533706665, "ce_loss_17": 3.1789993166923525, "ce_loss_2": 4.0345776915550235, "ce_loss_4": 3.621943485736847, "ce_loss_9": 3.3177709102630617, "epoch": 0.576, "grad_norm": 792.0, "kl_loss_13": 71.56662025451661, "kl_loss_2": 1847.7013122558594, "kl_loss_4": 994.68759765625, "kl_loss_9": 283.5988624572754, "learning_rate": 0.0003882976125260229, "loss": 778.3832, "step": 5760 }, { "ce_loss_13": 3.286661374568939, "ce_loss_17": 3.2458075642585755, "ce_loss_2": 4.075940239429474, "ce_loss_4": 3.6776213526725767, "ce_loss_9": 3.387130117416382, "epoch": 0.577, "grad_norm": 748.0, "kl_loss_13": 72.54135475158691, "kl_loss_2": 1799.4101135253907, "kl_loss_4": 966.9556396484375, "kl_loss_9": 282.7928466796875, "learning_rate": 0.00038675161628711776, "loss": 783.4904, "step": 5770 }, { "ce_loss_13": 3.3262439131736756, "ce_loss_17": 3.285924530029297, "ce_loss_2": 4.106383979320526, "ce_loss_4": 3.714357054233551, "ce_loss_9": 3.4214155793190004, "epoch": 0.578, "grad_norm": 700.0, "kl_loss_13": 72.68965663909913, "kl_loss_2": 1772.8457275390624, "kl_loss_4": 961.9623809814453, "kl_loss_9": 283.20936126708983, "learning_rate": 0.0003852067604581794, "loss": 798.5637, "step": 5780 }, { "ce_loss_13": 3.2768237948417664, "ce_loss_17": 3.237665665149689, "ce_loss_2": 4.080949985980988, "ce_loss_4": 3.669422745704651, "ce_loss_9": 3.370028007030487, "epoch": 0.579, "grad_norm": 1072.0, "kl_loss_13": 72.39575462341308, "kl_loss_2": 1839.0868286132813, "kl_loss_4": 986.4563354492187, "kl_loss_9": 280.40548477172854, "learning_rate": 0.0003836630605958888, "loss": 783.4337, "step": 5790 }, { "ce_loss_13": 3.3321749329566956, "ce_loss_17": 3.2925341963768004, "ce_loss_2": 4.109150612354279, "ce_loss_4": 3.7234968543052673, "ce_loss_9": 3.428684854507446, "epoch": 0.58, "grad_norm": 1240.0, "kl_loss_13": 74.08155212402343, "kl_loss_2": 1816.310040283203, "kl_loss_4": 986.648226928711, "kl_loss_9": 286.85925216674804, "learning_rate": 0.0003821205322452863, "loss": 815.1135, "step": 5800 }, { "ce_loss_13": 3.304441678524017, "ce_loss_17": 3.2667020320892335, "ce_loss_2": 4.0839027762413025, "ce_loss_4": 3.6912376165390013, "ce_loss_9": 3.399633002281189, "epoch": 0.581, "grad_norm": 888.0, "kl_loss_13": 72.40485496520996, "kl_loss_2": 1802.7967712402344, "kl_loss_4": 969.6277923583984, "kl_loss_9": 282.01318740844727, "learning_rate": 0.0003805791909396155, "loss": 785.8026, "step": 5810 }, { "ce_loss_13": 3.2645418524742125, "ce_loss_17": 3.2266998291015625, "ce_loss_2": 4.0642083287239075, "ce_loss_4": 3.650386297702789, "ce_loss_9": 3.3594146370887756, "epoch": 0.582, "grad_norm": 932.0, "kl_loss_13": 71.84885711669922, "kl_loss_2": 1824.7707580566407, "kl_loss_4": 971.6696868896485, "kl_loss_9": 280.12883377075195, "learning_rate": 0.0003790390522001662, "loss": 793.4603, "step": 5820 }, { "ce_loss_13": 3.199913036823273, "ce_loss_17": 3.1629793882369994, "ce_loss_2": 3.999752473831177, "ce_loss_4": 3.594913971424103, "ce_loss_9": 3.2937525868415833, "epoch": 0.583, "grad_norm": 848.0, "kl_loss_13": 70.37647552490235, "kl_loss_2": 1852.6868469238282, "kl_loss_4": 989.9863128662109, "kl_loss_9": 280.64996337890625, "learning_rate": 0.0003775001315361183, "loss": 780.3628, "step": 5830 }, { "ce_loss_13": 3.3063352584838865, "ce_loss_17": 3.264518618583679, "ce_loss_2": 4.108177971839905, "ce_loss_4": 3.697060787677765, "ce_loss_9": 3.4061919689178466, "epoch": 0.584, "grad_norm": 660.0, "kl_loss_13": 73.38811836242675, "kl_loss_2": 1819.9228637695312, "kl_loss_4": 975.4645874023438, "kl_loss_9": 284.6202987670898, "learning_rate": 0.0003759624444443858, "loss": 786.6172, "step": 5840 }, { "ce_loss_13": 3.34173401594162, "ce_loss_17": 3.3037470102310182, "ce_loss_2": 4.114887666702271, "ce_loss_4": 3.7185524225234987, "ce_loss_9": 3.435133862495422, "epoch": 0.585, "grad_norm": 956.0, "kl_loss_13": 73.05100364685059, "kl_loss_2": 1791.394512939453, "kl_loss_4": 958.487109375, "kl_loss_9": 280.1448257446289, "learning_rate": 0.00037442600640946044, "loss": 771.5148, "step": 5850 }, { "ce_loss_13": 3.2984007954597474, "ce_loss_17": 3.2617080211639404, "ce_loss_2": 4.0688137888908384, "ce_loss_4": 3.67920743227005, "ce_loss_9": 3.3929454565048216, "epoch": 0.586, "grad_norm": 1056.0, "kl_loss_13": 71.55231246948242, "kl_loss_2": 1775.8209228515625, "kl_loss_4": 966.913232421875, "kl_loss_9": 281.4110786437988, "learning_rate": 0.00037289083290325663, "loss": 761.8062, "step": 5860 }, { "ce_loss_13": 3.282070851325989, "ce_loss_17": 3.2435006380081175, "ce_loss_2": 4.057844626903534, "ce_loss_4": 3.6678024888038636, "ce_loss_9": 3.3763722658157347, "epoch": 0.587, "grad_norm": 1200.0, "kl_loss_13": 71.80185241699219, "kl_loss_2": 1767.1731994628906, "kl_loss_4": 954.13974609375, "kl_loss_9": 277.7298324584961, "learning_rate": 0.0003713569393849543, "loss": 768.4436, "step": 5870 }, { "ce_loss_13": 3.3335991501808167, "ce_loss_17": 3.2933664560317992, "ce_loss_2": 4.114494931697846, "ce_loss_4": 3.718398153781891, "ce_loss_9": 3.4264504432678224, "epoch": 0.588, "grad_norm": 1120.0, "kl_loss_13": 73.1373176574707, "kl_loss_2": 1792.4062377929688, "kl_loss_4": 973.3330139160156, "kl_loss_9": 283.1918441772461, "learning_rate": 0.00036982434130084397, "loss": 782.7954, "step": 5880 }, { "ce_loss_13": 3.2465739846229553, "ce_loss_17": 3.2064101934432983, "ce_loss_2": 4.027490735054016, "ce_loss_4": 3.6307048916816713, "ce_loss_9": 3.3453976035118105, "epoch": 0.589, "grad_norm": 820.0, "kl_loss_13": 72.75676975250244, "kl_loss_2": 1803.2782409667968, "kl_loss_4": 972.25625, "kl_loss_9": 285.99529190063475, "learning_rate": 0.00036829305408417166, "loss": 787.7693, "step": 5890 }, { "ce_loss_13": 3.2280359148979185, "ce_loss_17": 3.188660192489624, "ce_loss_2": 4.046233725547791, "ce_loss_4": 3.6307573556900024, "ce_loss_9": 3.327609062194824, "epoch": 0.59, "grad_norm": 824.0, "kl_loss_13": 73.30173187255859, "kl_loss_2": 1848.1465759277344, "kl_loss_4": 992.1645568847656, "kl_loss_9": 286.77287139892576, "learning_rate": 0.0003667630931549826, "loss": 788.664, "step": 5900 }, { "ce_loss_13": 3.2037394762039186, "ce_loss_17": 3.164516258239746, "ce_loss_2": 4.040177667140961, "ce_loss_4": 3.6129900217056274, "ce_loss_9": 3.3020408153533936, "epoch": 0.591, "grad_norm": 1192.0, "kl_loss_13": 71.7927785873413, "kl_loss_2": 1909.3319885253907, "kl_loss_4": 1010.0443572998047, "kl_loss_9": 286.49772720336915, "learning_rate": 0.00036523447391996613, "loss": 803.2408, "step": 5910 }, { "ce_loss_13": 3.286428999900818, "ce_loss_17": 3.250199830532074, "ce_loss_2": 4.064352822303772, "ce_loss_4": 3.67577840089798, "ce_loss_9": 3.3807467460632323, "epoch": 0.592, "grad_norm": 848.0, "kl_loss_13": 71.02851467132568, "kl_loss_2": 1782.7820373535155, "kl_loss_4": 960.0286407470703, "kl_loss_9": 279.03868942260743, "learning_rate": 0.00036370721177230114, "loss": 770.0045, "step": 5920 }, { "ce_loss_13": 3.2896131038665772, "ce_loss_17": 3.2510338187217713, "ce_loss_2": 4.09786479473114, "ce_loss_4": 3.6828097462654115, "ce_loss_9": 3.388933300971985, "epoch": 0.593, "grad_norm": 728.0, "kl_loss_13": 72.76485977172851, "kl_loss_2": 1831.8380004882813, "kl_loss_4": 980.9190948486328, "kl_loss_9": 285.5613159179687, "learning_rate": 0.00036218132209150044, "loss": 786.7089, "step": 5930 }, { "ce_loss_13": 3.239467215538025, "ce_loss_17": 3.1968210339546204, "ce_loss_2": 4.074165380001068, "ce_loss_4": 3.661141836643219, "ce_loss_9": 3.3476688265800476, "epoch": 0.594, "grad_norm": 984.0, "kl_loss_13": 75.23736305236817, "kl_loss_2": 1897.803240966797, "kl_loss_4": 1030.2247222900392, "kl_loss_9": 297.45814361572263, "learning_rate": 0.0003606568202432562, "loss": 802.2447, "step": 5940 }, { "ce_loss_13": 3.316815996170044, "ce_loss_17": 3.277123522758484, "ce_loss_2": 4.12262943983078, "ce_loss_4": 3.7138426423072817, "ce_loss_9": 3.412475216388702, "epoch": 0.595, "grad_norm": 876.0, "kl_loss_13": 73.52718086242676, "kl_loss_2": 1856.3543212890625, "kl_loss_4": 992.5437133789062, "kl_loss_9": 285.9757568359375, "learning_rate": 0.0003591337215792851, "loss": 781.4097, "step": 5950 }, { "ce_loss_13": 3.3531678915023804, "ce_loss_17": 3.314804708957672, "ce_loss_2": 4.109200823307037, "ce_loss_4": 3.7312793254852297, "ce_loss_9": 3.447885584831238, "epoch": 0.596, "grad_norm": 628.0, "kl_loss_13": 71.84714088439941, "kl_loss_2": 1773.0002014160157, "kl_loss_4": 959.8357330322266, "kl_loss_9": 279.19155349731443, "learning_rate": 0.00035761204143717383, "loss": 782.717, "step": 5960 }, { "ce_loss_13": 3.3036818742752074, "ce_loss_17": 3.263713240623474, "ce_loss_2": 4.092971360683441, "ce_loss_4": 3.698699343204498, "ce_loss_9": 3.4006860733032225, "epoch": 0.597, "grad_norm": 908.0, "kl_loss_13": 72.98650817871093, "kl_loss_2": 1813.5230285644532, "kl_loss_4": 986.8791961669922, "kl_loss_9": 283.56762619018554, "learning_rate": 0.0003560917951402245, "loss": 802.3848, "step": 5970 }, { "ce_loss_13": 3.285984826087952, "ce_loss_17": 3.247555208206177, "ce_loss_2": 4.072379291057587, "ce_loss_4": 3.6760414361953737, "ce_loss_9": 3.3791685938835143, "epoch": 0.598, "grad_norm": 900.0, "kl_loss_13": 71.70736351013184, "kl_loss_2": 1795.4314697265625, "kl_loss_4": 970.2411376953125, "kl_loss_9": 280.68557662963866, "learning_rate": 0.00035457299799730046, "loss": 778.7829, "step": 5980 }, { "ce_loss_13": 3.3455660462379457, "ce_loss_17": 3.306633234024048, "ce_loss_2": 4.122979462146759, "ce_loss_4": 3.7328011989593506, "ce_loss_9": 3.4436651349067686, "epoch": 0.599, "grad_norm": 792.0, "kl_loss_13": 71.79905586242675, "kl_loss_2": 1778.4928039550782, "kl_loss_4": 960.0759826660156, "kl_loss_9": 281.6575355529785, "learning_rate": 0.0003530556653026721, "loss": 784.7722, "step": 5990 }, { "ce_loss_13": 3.266416549682617, "ce_loss_17": 3.228486883640289, "ce_loss_2": 4.066090321540832, "ce_loss_4": 3.647917056083679, "ce_loss_9": 3.361822855472565, "epoch": 0.6, "grad_norm": 1344.0, "kl_loss_13": 70.68609580993652, "kl_loss_2": 1819.8283996582031, "kl_loss_4": 955.7968322753907, "kl_loss_9": 277.94916610717775, "learning_rate": 0.00035153981233586274, "loss": 786.5068, "step": 6000 }, { "ce_loss_13": 3.2435776352882386, "ce_loss_17": 3.203437614440918, "ce_loss_2": 4.040588653087616, "ce_loss_4": 3.6361332893371583, "ce_loss_9": 3.338526356220245, "epoch": 0.601, "grad_norm": 1072.0, "kl_loss_13": 70.50964450836182, "kl_loss_2": 1814.0566650390624, "kl_loss_4": 977.1874572753907, "kl_loss_9": 278.71122283935546, "learning_rate": 0.00035002545436149473, "loss": 807.4567, "step": 6010 }, { "ce_loss_13": 3.250414502620697, "ce_loss_17": 3.211272585391998, "ce_loss_2": 4.0653922200202945, "ce_loss_4": 3.655634081363678, "ce_loss_9": 3.3470831632614138, "epoch": 0.602, "grad_norm": 664.0, "kl_loss_13": 74.24386062622071, "kl_loss_2": 1860.7207580566405, "kl_loss_4": 1006.5490264892578, "kl_loss_9": 289.0743743896484, "learning_rate": 0.0003485126066291364, "loss": 781.3338, "step": 6020 }, { "ce_loss_13": 3.294699478149414, "ce_loss_17": 3.253976845741272, "ce_loss_2": 4.0994375348091125, "ce_loss_4": 3.6909204363822936, "ce_loss_9": 3.3906335711479185, "epoch": 0.603, "grad_norm": 916.0, "kl_loss_13": 72.10275173187256, "kl_loss_2": 1822.8707214355468, "kl_loss_4": 977.2399291992188, "kl_loss_9": 279.21849060058594, "learning_rate": 0.0003470012843731476, "loss": 787.5461, "step": 6030 }, { "ce_loss_13": 3.24105304479599, "ce_loss_17": 3.2001903772354128, "ce_loss_2": 4.045105612277984, "ce_loss_4": 3.6382987022399904, "ce_loss_9": 3.3370439410209656, "epoch": 0.604, "grad_norm": 836.0, "kl_loss_13": 72.2528564453125, "kl_loss_2": 1836.884100341797, "kl_loss_4": 988.0165435791016, "kl_loss_9": 281.7273681640625, "learning_rate": 0.00034549150281252633, "loss": 804.6245, "step": 6040 }, { "ce_loss_13": 3.2170220375061036, "ce_loss_17": 3.1779794812202455, "ce_loss_2": 3.995331084728241, "ce_loss_4": 3.6100341081619263, "ce_loss_9": 3.3146474242210386, "epoch": 0.605, "grad_norm": 836.0, "kl_loss_13": 71.00680580139161, "kl_loss_2": 1757.2800537109374, "kl_loss_4": 960.993408203125, "kl_loss_9": 278.24233474731443, "learning_rate": 0.0003439832771507565, "loss": 766.7023, "step": 6050 }, { "ce_loss_13": 3.228524351119995, "ce_loss_17": 3.187285077571869, "ce_loss_2": 4.024105882644653, "ce_loss_4": 3.625711727142334, "ce_loss_9": 3.323964500427246, "epoch": 0.606, "grad_norm": 712.0, "kl_loss_13": 71.69733200073242, "kl_loss_2": 1829.7399353027345, "kl_loss_4": 992.6620208740235, "kl_loss_9": 280.5502082824707, "learning_rate": 0.0003424766225756537, "loss": 780.052, "step": 6060 }, { "ce_loss_13": 3.285083842277527, "ce_loss_17": 3.2451503396034242, "ce_loss_2": 4.077549755573273, "ce_loss_4": 3.676045870780945, "ce_loss_9": 3.380914330482483, "epoch": 0.607, "grad_norm": 772.0, "kl_loss_13": 72.76493911743164, "kl_loss_2": 1803.9171203613282, "kl_loss_4": 968.7603179931641, "kl_loss_9": 281.1850296020508, "learning_rate": 0.00034097155425921255, "loss": 769.2323, "step": 6070 }, { "ce_loss_13": 3.1820239067077636, "ce_loss_17": 3.142336893081665, "ce_loss_2": 3.9852845311164855, "ce_loss_4": 3.57771520614624, "ce_loss_9": 3.278271722793579, "epoch": 0.608, "grad_norm": 840.0, "kl_loss_13": 72.62495079040528, "kl_loss_2": 1856.206787109375, "kl_loss_4": 989.25322265625, "kl_loss_9": 283.5368026733398, "learning_rate": 0.0003394680873574546, "loss": 785.9624, "step": 6080 }, { "ce_loss_13": 3.2822832107543944, "ce_loss_17": 3.239939785003662, "ce_loss_2": 4.101767385005951, "ce_loss_4": 3.685703420639038, "ce_loss_9": 3.382130753993988, "epoch": 0.609, "grad_norm": 1024.0, "kl_loss_13": 73.62976379394532, "kl_loss_2": 1866.8404479980468, "kl_loss_4": 996.1072570800782, "kl_loss_9": 283.988956451416, "learning_rate": 0.0003379662370102747, "loss": 784.3762, "step": 6090 }, { "ce_loss_13": 3.296636772155762, "ce_loss_17": 3.2578766703605653, "ce_loss_2": 4.068477404117584, "ce_loss_4": 3.681435239315033, "ce_loss_9": 3.391046917438507, "epoch": 0.61, "grad_norm": 820.0, "kl_loss_13": 71.44107837677002, "kl_loss_2": 1790.297344970703, "kl_loss_4": 970.0491180419922, "kl_loss_9": 278.9227890014648, "learning_rate": 0.0003364660183412892, "loss": 782.6667, "step": 6100 }, { "ce_loss_13": 3.279218602180481, "ce_loss_17": 3.239975702762604, "ce_loss_2": 4.057258355617523, "ce_loss_4": 3.662726712226868, "ce_loss_9": 3.3751845836639403, "epoch": 0.611, "grad_norm": 940.0, "kl_loss_13": 71.98993682861328, "kl_loss_2": 1803.81904296875, "kl_loss_4": 971.1891784667969, "kl_loss_9": 282.17127532958983, "learning_rate": 0.0003349674464576834, "loss": 789.9217, "step": 6110 }, { "ce_loss_13": 3.228991961479187, "ce_loss_17": 3.1899941682815554, "ce_loss_2": 4.037423419952392, "ce_loss_4": 3.6243603825569153, "ce_loss_9": 3.326341247558594, "epoch": 0.612, "grad_norm": 772.0, "kl_loss_13": 72.28171844482422, "kl_loss_2": 1837.7317016601562, "kl_loss_4": 978.088656616211, "kl_loss_9": 281.76746063232423, "learning_rate": 0.00033347053645005966, "loss": 770.0652, "step": 6120 }, { "ce_loss_13": 3.331780731678009, "ce_loss_17": 3.292486870288849, "ce_loss_2": 4.094799375534057, "ce_loss_4": 3.7155101776123045, "ce_loss_9": 3.4249905228614805, "epoch": 0.613, "grad_norm": 1040.0, "kl_loss_13": 71.58653755187989, "kl_loss_2": 1745.4302124023438, "kl_loss_4": 957.3974456787109, "kl_loss_9": 277.33105697631834, "learning_rate": 0.00033197530339228485, "loss": 779.0169, "step": 6130 }, { "ce_loss_13": 3.2880977034568786, "ce_loss_17": 3.248146677017212, "ce_loss_2": 4.074892008304596, "ce_loss_4": 3.6834922194480897, "ce_loss_9": 3.387561297416687, "epoch": 0.614, "grad_norm": 756.0, "kl_loss_13": 72.65009574890136, "kl_loss_2": 1790.276544189453, "kl_loss_4": 975.4449920654297, "kl_loss_9": 283.23338775634767, "learning_rate": 0.00033048176234133967, "loss": 775.951, "step": 6140 }, { "ce_loss_13": 3.2806970953941343, "ce_loss_17": 3.2423112869262694, "ce_loss_2": 4.056227326393127, "ce_loss_4": 3.6680848240852355, "ce_loss_9": 3.37451308965683, "epoch": 0.615, "grad_norm": 944.0, "kl_loss_13": 72.70654010772705, "kl_loss_2": 1784.218359375, "kl_loss_4": 975.8216552734375, "kl_loss_9": 283.3952438354492, "learning_rate": 0.0003289899283371657, "loss": 784.4755, "step": 6150 }, { "ce_loss_13": 3.2955815315246584, "ce_loss_17": 3.255915582180023, "ce_loss_2": 4.095048213005066, "ce_loss_4": 3.6850364565849305, "ce_loss_9": 3.3905885100364683, "epoch": 0.616, "grad_norm": 1240.0, "kl_loss_13": 72.33249015808106, "kl_loss_2": 1809.0255187988282, "kl_loss_4": 961.4775451660156, "kl_loss_9": 279.44675674438474, "learning_rate": 0.0003274998164025148, "loss": 792.1812, "step": 6160 }, { "ce_loss_13": 3.328199291229248, "ce_loss_17": 3.2899797320365907, "ce_loss_2": 4.103036093711853, "ce_loss_4": 3.7185376644134522, "ce_loss_9": 3.4223578333854676, "epoch": 0.617, "grad_norm": 952.0, "kl_loss_13": 73.33154029846192, "kl_loss_2": 1780.0444702148438, "kl_loss_4": 974.3755706787109, "kl_loss_9": 282.3663688659668, "learning_rate": 0.0003260114415427975, "loss": 793.5954, "step": 6170 }, { "ce_loss_13": 3.252111566066742, "ce_loss_17": 3.212690567970276, "ce_loss_2": 4.058794844150543, "ce_loss_4": 3.6559091925621034, "ce_loss_9": 3.350780153274536, "epoch": 0.618, "grad_norm": 928.0, "kl_loss_13": 72.44158325195312, "kl_loss_2": 1846.1583312988282, "kl_loss_4": 986.2647827148437, "kl_loss_9": 280.55818099975585, "learning_rate": 0.0003245248187459323, "loss": 801.7746, "step": 6180 }, { "ce_loss_13": 3.240708112716675, "ce_loss_17": 3.2041526675224303, "ce_loss_2": 4.004272377490997, "ce_loss_4": 3.614217388629913, "ce_loss_9": 3.3318390011787415, "epoch": 0.619, "grad_norm": 728.0, "kl_loss_13": 68.6929853439331, "kl_loss_2": 1759.1183471679688, "kl_loss_4": 946.2114318847656, "kl_loss_9": 271.18813400268556, "learning_rate": 0.00032303996298219416, "loss": 764.9716, "step": 6190 }, { "ce_loss_13": 3.3225024580955504, "ce_loss_17": 3.2831245183944704, "ce_loss_2": 4.081291854381561, "ce_loss_4": 3.7007321953773498, "ce_loss_9": 3.4160247802734376, "epoch": 0.62, "grad_norm": 836.0, "kl_loss_13": 71.11275367736816, "kl_loss_2": 1734.1385681152344, "kl_loss_4": 945.437710571289, "kl_loss_9": 272.4640754699707, "learning_rate": 0.00032155688920406414, "loss": 760.9354, "step": 6200 }, { "ce_loss_13": 3.236972117424011, "ce_loss_17": 3.1968539357185364, "ce_loss_2": 4.053024530410767, "ce_loss_4": 3.636395263671875, "ce_loss_9": 3.3320537924766542, "epoch": 0.621, "grad_norm": 912.0, "kl_loss_13": 72.77989196777344, "kl_loss_2": 1853.1631958007813, "kl_loss_4": 989.524203491211, "kl_loss_9": 284.0096466064453, "learning_rate": 0.0003200756123460788, "loss": 803.549, "step": 6210 }, { "ce_loss_13": 3.2674758195877076, "ce_loss_17": 3.226785624027252, "ce_loss_2": 4.08230230808258, "ce_loss_4": 3.671753633022308, "ce_loss_9": 3.367294156551361, "epoch": 0.622, "grad_norm": 1064.0, "kl_loss_13": 73.93520374298096, "kl_loss_2": 1856.7218139648437, "kl_loss_4": 1001.2637176513672, "kl_loss_9": 288.70165252685547, "learning_rate": 0.00031859614732467957, "loss": 802.2571, "step": 6220 }, { "ce_loss_13": 3.3192309975624084, "ce_loss_17": 3.2794345736503603, "ce_loss_2": 4.087488758563995, "ce_loss_4": 3.6964677214622497, "ce_loss_9": 3.4120842099189757, "epoch": 0.623, "grad_norm": 992.0, "kl_loss_13": 71.44276008605956, "kl_loss_2": 1754.6969421386718, "kl_loss_4": 947.3564575195312, "kl_loss_9": 274.8693054199219, "learning_rate": 0.00031711850903806275, "loss": 765.0162, "step": 6230 }, { "ce_loss_13": 3.224324369430542, "ce_loss_17": 3.1861729264259337, "ce_loss_2": 4.034242498874664, "ce_loss_4": 3.627521336078644, "ce_loss_9": 3.32495493888855, "epoch": 0.624, "grad_norm": 812.0, "kl_loss_13": 73.42914161682128, "kl_loss_2": 1844.0267211914063, "kl_loss_4": 993.9605987548828, "kl_loss_9": 288.5894271850586, "learning_rate": 0.0003156427123660297, "loss": 782.3298, "step": 6240 }, { "ce_loss_13": 3.308632123470306, "ce_loss_17": 3.269201564788818, "ce_loss_2": 4.075583910942077, "ce_loss_4": 3.693323624134064, "ce_loss_9": 3.4053150177001954, "epoch": 0.625, "grad_norm": 964.0, "kl_loss_13": 71.81601142883301, "kl_loss_2": 1760.6226501464844, "kl_loss_4": 959.6227325439453, "kl_loss_9": 277.5846717834473, "learning_rate": 0.0003141687721698363, "loss": 778.7586, "step": 6250 }, { "ce_loss_13": 3.285798990726471, "ce_loss_17": 3.246725380420685, "ce_loss_2": 4.028945982456207, "ce_loss_4": 3.649042856693268, "ce_loss_9": 3.3741363763809202, "epoch": 0.626, "grad_norm": 932.0, "kl_loss_13": 69.40913047790528, "kl_loss_2": 1709.059881591797, "kl_loss_4": 922.3430023193359, "kl_loss_9": 265.99488525390626, "learning_rate": 0.00031269670329204396, "loss": 764.9511, "step": 6260 }, { "ce_loss_13": 3.318275809288025, "ce_loss_17": 3.2813219666481017, "ce_loss_2": 4.072272872924804, "ce_loss_4": 3.6981289267539976, "ce_loss_9": 3.4113965749740602, "epoch": 0.627, "grad_norm": 904.0, "kl_loss_13": 71.68196277618408, "kl_loss_2": 1742.3362915039063, "kl_loss_4": 950.3534515380859, "kl_loss_9": 277.0380157470703, "learning_rate": 0.00031122652055637015, "loss": 774.6862, "step": 6270 }, { "ce_loss_13": 3.2794430017471314, "ce_loss_17": 3.2412443280220034, "ce_loss_2": 4.075256025791168, "ce_loss_4": 3.671683204174042, "ce_loss_9": 3.3764753937721252, "epoch": 0.628, "grad_norm": 932.0, "kl_loss_13": 72.09937229156495, "kl_loss_2": 1823.369482421875, "kl_loss_4": 986.5869415283203, "kl_loss_9": 282.3317504882813, "learning_rate": 0.0003097582387675385, "loss": 774.2291, "step": 6280 }, { "ce_loss_13": 3.3189268708229065, "ce_loss_17": 3.2824313163757326, "ce_loss_2": 4.103376257419586, "ce_loss_4": 3.7095051646232604, "ce_loss_9": 3.415855610370636, "epoch": 0.629, "grad_norm": 856.0, "kl_loss_13": 71.44106979370117, "kl_loss_2": 1805.623046875, "kl_loss_4": 971.4082427978516, "kl_loss_9": 279.6691734313965, "learning_rate": 0.00030829187271113034, "loss": 774.3185, "step": 6290 }, { "ce_loss_13": 3.313884997367859, "ce_loss_17": 3.2744386076927183, "ce_loss_2": 4.0814232468605045, "ce_loss_4": 3.692212975025177, "ce_loss_9": 3.40540634393692, "epoch": 0.63, "grad_norm": 988.0, "kl_loss_13": 71.26968612670899, "kl_loss_2": 1759.0361206054688, "kl_loss_4": 956.8941284179688, "kl_loss_9": 275.8732261657715, "learning_rate": 0.00030682743715343565, "loss": 782.9761, "step": 6300 }, { "ce_loss_13": 3.2659210562705994, "ce_loss_17": 3.223229229450226, "ce_loss_2": 4.058343923091888, "ce_loss_4": 3.6658867835998534, "ce_loss_9": 3.3643734216690064, "epoch": 0.631, "grad_norm": 660.0, "kl_loss_13": 74.21347732543946, "kl_loss_2": 1803.3361572265626, "kl_loss_4": 977.7171142578125, "kl_loss_9": 284.70903701782225, "learning_rate": 0.0003053649468413043, "loss": 791.4564, "step": 6310 }, { "ce_loss_13": 3.373299503326416, "ce_loss_17": 3.3342126488685606, "ce_loss_2": 4.143603193759918, "ce_loss_4": 3.7604181289672853, "ce_loss_9": 3.4721192598342894, "epoch": 0.632, "grad_norm": 1032.0, "kl_loss_13": 72.66786975860596, "kl_loss_2": 1778.5544677734374, "kl_loss_4": 964.4415985107422, "kl_loss_9": 281.6828071594238, "learning_rate": 0.00030390441650199725, "loss": 774.1748, "step": 6320 }, { "ce_loss_13": 3.277493715286255, "ce_loss_17": 3.2391820192337035, "ce_loss_2": 4.05932000875473, "ce_loss_4": 3.664527249336243, "ce_loss_9": 3.3744643330574036, "epoch": 0.633, "grad_norm": 1016.0, "kl_loss_13": 70.67644805908203, "kl_loss_2": 1778.2772399902344, "kl_loss_4": 962.4036499023438, "kl_loss_9": 278.81672897338865, "learning_rate": 0.00030244586084303903, "loss": 770.6784, "step": 6330 }, { "ce_loss_13": 3.244217538833618, "ce_loss_17": 3.206530499458313, "ce_loss_2": 4.046750092506409, "ce_loss_4": 3.65035218000412, "ce_loss_9": 3.341222083568573, "epoch": 0.634, "grad_norm": 756.0, "kl_loss_13": 71.81494064331055, "kl_loss_2": 1835.2865966796876, "kl_loss_4": 1000.7807067871094, "kl_loss_9": 283.35120391845703, "learning_rate": 0.00030098929455206903, "loss": 775.5071, "step": 6340 }, { "ce_loss_13": 3.252631652355194, "ce_loss_17": 3.215214204788208, "ce_loss_2": 4.041942119598389, "ce_loss_4": 3.6305495619773867, "ce_loss_9": 3.3447246193885802, "epoch": 0.635, "grad_norm": 784.0, "kl_loss_13": 70.43671970367431, "kl_loss_2": 1820.333349609375, "kl_loss_4": 969.7455963134765, "kl_loss_9": 277.15399780273435, "learning_rate": 0.00029953473229669324, "loss": 797.8032, "step": 6350 }, { "ce_loss_13": 3.281371021270752, "ce_loss_17": 3.2443987011909483, "ce_loss_2": 4.0675465822219845, "ce_loss_4": 3.678115463256836, "ce_loss_9": 3.3796138882637026, "epoch": 0.636, "grad_norm": 1304.0, "kl_loss_13": 71.03854198455811, "kl_loss_2": 1794.914044189453, "kl_loss_4": 974.7052612304688, "kl_loss_9": 282.9618148803711, "learning_rate": 0.00029808218872433767, "loss": 771.7335, "step": 6360 }, { "ce_loss_13": 3.3407238960266112, "ce_loss_17": 3.3002508759498594, "ce_loss_2": 4.118294715881348, "ce_loss_4": 3.7223580479621887, "ce_loss_9": 3.4371691823005674, "epoch": 0.637, "grad_norm": 996.0, "kl_loss_13": 71.89597148895264, "kl_loss_2": 1791.3907897949218, "kl_loss_4": 955.3046051025391, "kl_loss_9": 277.70820846557615, "learning_rate": 0.0002966316784621, "loss": 765.5395, "step": 6370 }, { "ce_loss_13": 3.2532244086265565, "ce_loss_17": 3.213400721549988, "ce_loss_2": 4.059457647800445, "ce_loss_4": 3.6510413765907286, "ce_loss_9": 3.349591112136841, "epoch": 0.638, "grad_norm": 836.0, "kl_loss_13": 72.27867050170899, "kl_loss_2": 1827.22919921875, "kl_loss_4": 989.6232208251953, "kl_loss_9": 285.76849670410155, "learning_rate": 0.0002951832161166024, "loss": 776.5623, "step": 6380 }, { "ce_loss_13": 3.3263792872428892, "ce_loss_17": 3.287059986591339, "ce_loss_2": 4.109494614601135, "ce_loss_4": 3.719343626499176, "ce_loss_9": 3.425124990940094, "epoch": 0.639, "grad_norm": 812.0, "kl_loss_13": 73.35769863128662, "kl_loss_2": 1783.822198486328, "kl_loss_4": 967.2407745361328, "kl_loss_9": 281.5798004150391, "learning_rate": 0.0002937368162738445, "loss": 763.2839, "step": 6390 }, { "ce_loss_13": 3.2767720222473145, "ce_loss_17": 3.238836967945099, "ce_loss_2": 4.0461488485336305, "ce_loss_4": 3.654835057258606, "ce_loss_9": 3.3681108474731447, "epoch": 0.64, "grad_norm": 948.0, "kl_loss_13": 69.1386381149292, "kl_loss_2": 1787.5851501464845, "kl_loss_4": 960.3883850097657, "kl_loss_9": 271.1270004272461, "learning_rate": 0.0002922924934990568, "loss": 779.4924, "step": 6400 }, { "ce_loss_13": 3.21414737701416, "ce_loss_17": 3.173914396762848, "ce_loss_2": 4.030556344985962, "ce_loss_4": 3.611742639541626, "ce_loss_9": 3.309198236465454, "epoch": 0.641, "grad_norm": 804.0, "kl_loss_13": 71.55045490264892, "kl_loss_2": 1865.8428894042968, "kl_loss_4": 997.1090881347657, "kl_loss_9": 281.19031829833983, "learning_rate": 0.0002908502623365536, "loss": 787.647, "step": 6410 }, { "ce_loss_13": 3.142470693588257, "ce_loss_17": 3.10512797832489, "ce_loss_2": 3.9646692752838133, "ce_loss_4": 3.547313940525055, "ce_loss_9": 3.2446085810661316, "epoch": 0.642, "grad_norm": 848.0, "kl_loss_13": 69.80724754333497, "kl_loss_2": 1873.81376953125, "kl_loss_4": 1001.1044128417968, "kl_loss_9": 281.8624671936035, "learning_rate": 0.0002894101373095867, "loss": 791.2895, "step": 6420 }, { "ce_loss_13": 3.355082380771637, "ce_loss_17": 3.3160241842269897, "ce_loss_2": 4.120398831367493, "ce_loss_4": 3.737993931770325, "ce_loss_9": 3.4494681477546694, "epoch": 0.643, "grad_norm": 1040.0, "kl_loss_13": 72.17458419799804, "kl_loss_2": 1769.3141662597657, "kl_loss_4": 963.356201171875, "kl_loss_9": 278.2576614379883, "learning_rate": 0.00028797213292019926, "loss": 770.181, "step": 6430 }, { "ce_loss_13": 3.3366548657417296, "ce_loss_17": 3.295503115653992, "ce_loss_2": 4.1048006296157835, "ce_loss_4": 3.7235260963439942, "ce_loss_9": 3.4315634846687315, "epoch": 0.644, "grad_norm": 940.0, "kl_loss_13": 72.5972915649414, "kl_loss_2": 1776.367205810547, "kl_loss_4": 970.1185089111328, "kl_loss_9": 280.8289596557617, "learning_rate": 0.0002865362636490791, "loss": 788.4355, "step": 6440 }, { "ce_loss_13": 3.3477864146232603, "ce_loss_17": 3.310180294513702, "ce_loss_2": 4.12326066493988, "ce_loss_4": 3.7306790232658384, "ce_loss_9": 3.44403954744339, "epoch": 0.645, "grad_norm": 924.0, "kl_loss_13": 71.45477027893067, "kl_loss_2": 1776.1947631835938, "kl_loss_4": 955.2560546875, "kl_loss_9": 276.38500061035154, "learning_rate": 0.0002851025439554142, "loss": 764.4826, "step": 6450 }, { "ce_loss_13": 3.3339331269264223, "ce_loss_17": 3.2946556568145753, "ce_loss_2": 4.094459521770477, "ce_loss_4": 3.7216048359870912, "ce_loss_9": 3.4312340378761292, "epoch": 0.646, "grad_norm": 904.0, "kl_loss_13": 72.28151435852051, "kl_loss_2": 1738.59453125, "kl_loss_4": 967.1679748535156, "kl_loss_9": 280.8125907897949, "learning_rate": 0.00028367098827674573, "loss": 767.1638, "step": 6460 }, { "ce_loss_13": 3.262351620197296, "ce_loss_17": 3.225614595413208, "ce_loss_2": 4.04227089881897, "ce_loss_4": 3.643923032283783, "ce_loss_9": 3.3568783521652223, "epoch": 0.647, "grad_norm": 824.0, "kl_loss_13": 70.0902738571167, "kl_loss_2": 1774.7165893554688, "kl_loss_4": 950.2580627441406, "kl_loss_9": 275.03301162719725, "learning_rate": 0.00028224161102882397, "loss": 771.2599, "step": 6470 }, { "ce_loss_13": 3.2347336411476135, "ce_loss_17": 3.200877916812897, "ce_loss_2": 4.004123604297638, "ce_loss_4": 3.6204243063926698, "ce_loss_9": 3.328202188014984, "epoch": 0.648, "grad_norm": 1464.0, "kl_loss_13": 69.69904022216797, "kl_loss_2": 1754.0138366699218, "kl_loss_4": 955.4148651123047, "kl_loss_9": 272.5667182922363, "learning_rate": 0.00028081442660546124, "loss": 771.9115, "step": 6480 }, { "ce_loss_13": 3.305151629447937, "ce_loss_17": 3.2654871940612793, "ce_loss_2": 4.065175533294678, "ce_loss_4": 3.6872007489204406, "ce_loss_9": 3.3978879928588865, "epoch": 0.649, "grad_norm": 788.0, "kl_loss_13": 72.08305015563965, "kl_loss_2": 1752.1914611816405, "kl_loss_4": 951.0301177978515, "kl_loss_9": 277.8635765075684, "learning_rate": 0.0002793894493783892, "loss": 769.7433, "step": 6490 }, { "ce_loss_13": 3.322451424598694, "ce_loss_17": 3.284602701663971, "ce_loss_2": 4.086758828163147, "ce_loss_4": 3.695036458969116, "ce_loss_9": 3.4139219641685488, "epoch": 0.65, "grad_norm": 820.0, "kl_loss_13": 70.80225067138672, "kl_loss_2": 1760.5633239746094, "kl_loss_4": 943.2197174072265, "kl_loss_9": 270.98013763427736, "learning_rate": 0.0002779666936971129, "loss": 761.3337, "step": 6500 }, { "ce_loss_13": 3.3305124044418335, "ce_loss_17": 3.2916394114494323, "ce_loss_2": 4.117040920257568, "ce_loss_4": 3.7206697344779966, "ce_loss_9": 3.4260012030601503, "epoch": 0.651, "grad_norm": 912.0, "kl_loss_13": 71.9400619506836, "kl_loss_2": 1809.14013671875, "kl_loss_4": 977.0250427246094, "kl_loss_9": 281.1179428100586, "learning_rate": 0.00027654617388876614, "loss": 785.1118, "step": 6510 }, { "ce_loss_13": 3.3561911463737486, "ce_loss_17": 3.3163033366203307, "ce_loss_2": 4.123410153388977, "ce_loss_4": 3.734295201301575, "ce_loss_9": 3.4477269887924193, "epoch": 0.652, "grad_norm": 792.0, "kl_loss_13": 72.00549278259277, "kl_loss_2": 1767.6321655273437, "kl_loss_4": 956.7952453613282, "kl_loss_9": 276.63285598754885, "learning_rate": 0.0002751279042579672, "loss": 769.8588, "step": 6520 }, { "ce_loss_13": 3.2965205788612364, "ce_loss_17": 3.258060562610626, "ce_loss_2": 4.066187536716461, "ce_loss_4": 3.6735449194908143, "ce_loss_9": 3.3904851913452148, "epoch": 0.653, "grad_norm": 1016.0, "kl_loss_13": 70.93943119049072, "kl_loss_2": 1765.4940246582032, "kl_loss_4": 948.104296875, "kl_loss_9": 273.96680755615233, "learning_rate": 0.00027371189908667604, "loss": 780.4364, "step": 6530 }, { "ce_loss_13": 3.3481835246086122, "ce_loss_17": 3.3083571434020995, "ce_loss_2": 4.160701036453247, "ce_loss_4": 3.751006531715393, "ce_loss_9": 3.4436028599739075, "epoch": 0.654, "grad_norm": 864.0, "kl_loss_13": 73.45802383422851, "kl_loss_2": 1832.6454040527344, "kl_loss_4": 981.995571899414, "kl_loss_9": 284.39140167236326, "learning_rate": 0.00027229817263404863, "loss": 796.8684, "step": 6540 }, { "ce_loss_13": 3.330435812473297, "ce_loss_17": 3.2924392580986024, "ce_loss_2": 4.067032384872436, "ce_loss_4": 3.698940932750702, "ce_loss_9": 3.423515808582306, "epoch": 0.655, "grad_norm": 840.0, "kl_loss_13": 71.07789249420166, "kl_loss_2": 1709.2124084472657, "kl_loss_4": 935.6491851806641, "kl_loss_9": 273.2137001037598, "learning_rate": 0.0002708867391362948, "loss": 762.9658, "step": 6550 }, { "ce_loss_13": 3.3154842495918273, "ce_loss_17": 3.277799355983734, "ce_loss_2": 4.051917147636414, "ce_loss_4": 3.676171064376831, "ce_loss_9": 3.402910828590393, "epoch": 0.656, "grad_norm": 856.0, "kl_loss_13": 69.70827026367188, "kl_loss_2": 1708.5784423828125, "kl_loss_4": 915.2664428710938, "kl_loss_9": 267.1766471862793, "learning_rate": 0.0002694776128065345, "loss": 762.0438, "step": 6560 }, { "ce_loss_13": 3.248313331604004, "ce_loss_17": 3.210926651954651, "ce_loss_2": 4.031670546531677, "ce_loss_4": 3.640976977348328, "ce_loss_9": 3.343873071670532, "epoch": 0.657, "grad_norm": 696.0, "kl_loss_13": 70.72250576019287, "kl_loss_2": 1804.789971923828, "kl_loss_4": 979.3676391601563, "kl_loss_9": 282.7828567504883, "learning_rate": 0.00026807080783465374, "loss": 769.7072, "step": 6570 }, { "ce_loss_13": 3.353252899646759, "ce_loss_17": 3.3145036935806274, "ce_loss_2": 4.141715168952942, "ce_loss_4": 3.7431769251823424, "ce_loss_9": 3.450781464576721, "epoch": 0.658, "grad_norm": 724.0, "kl_loss_13": 72.77521324157715, "kl_loss_2": 1798.7067626953126, "kl_loss_4": 975.6858001708985, "kl_loss_9": 282.0204391479492, "learning_rate": 0.00026666633838716316, "loss": 785.6488, "step": 6580 }, { "ce_loss_13": 3.2512298226356506, "ce_loss_17": 3.209996056556702, "ce_loss_2": 4.05036506652832, "ce_loss_4": 3.6514967322349547, "ce_loss_9": 3.35011887550354, "epoch": 0.659, "grad_norm": 780.0, "kl_loss_13": 72.99883079528809, "kl_loss_2": 1824.5407653808593, "kl_loss_4": 993.5420104980469, "kl_loss_9": 284.9843505859375, "learning_rate": 0.00026526421860705474, "loss": 793.5008, "step": 6590 }, { "ce_loss_13": 3.275307810306549, "ce_loss_17": 3.234655296802521, "ce_loss_2": 4.0622793674469, "ce_loss_4": 3.665205705165863, "ce_loss_9": 3.372453773021698, "epoch": 0.66, "grad_norm": 800.0, "kl_loss_13": 72.58459072113037, "kl_loss_2": 1791.7223022460937, "kl_loss_4": 965.9852325439454, "kl_loss_9": 280.37607803344724, "learning_rate": 0.0002638644626136587, "loss": 769.9592, "step": 6600 }, { "ce_loss_13": 3.2888129353523254, "ce_loss_17": 3.251253354549408, "ce_loss_2": 4.072038042545318, "ce_loss_4": 3.6752144932746886, "ce_loss_9": 3.3817059636116027, "epoch": 0.661, "grad_norm": 816.0, "kl_loss_13": 70.77959651947022, "kl_loss_2": 1782.0645324707032, "kl_loss_4": 968.8410400390625, "kl_loss_9": 276.794002532959, "learning_rate": 0.00026246708450250255, "loss": 774.9196, "step": 6610 }, { "ce_loss_13": 3.2809179782867433, "ce_loss_17": 3.2430922031402587, "ce_loss_2": 4.0411675572395325, "ce_loss_4": 3.660265898704529, "ce_loss_9": 3.372435915470123, "epoch": 0.662, "grad_norm": 972.0, "kl_loss_13": 71.35952968597412, "kl_loss_2": 1753.348388671875, "kl_loss_4": 947.7372283935547, "kl_loss_9": 274.24386978149414, "learning_rate": 0.00026107209834516854, "loss": 769.384, "step": 6620 }, { "ce_loss_13": 3.2341336011886597, "ce_loss_17": 3.195348930358887, "ce_loss_2": 4.057987570762634, "ce_loss_4": 3.636424171924591, "ce_loss_9": 3.3336671113967897, "epoch": 0.663, "grad_norm": 636.0, "kl_loss_13": 72.059228515625, "kl_loss_2": 1867.8700805664062, "kl_loss_4": 991.2097351074219, "kl_loss_9": 283.4357620239258, "learning_rate": 0.0002596795181891514, "loss": 794.6483, "step": 6630 }, { "ce_loss_13": 3.23643182516098, "ce_loss_17": 3.1967314720153808, "ce_loss_2": 4.034952008724213, "ce_loss_4": 3.6355783104896546, "ce_loss_9": 3.336190640926361, "epoch": 0.664, "grad_norm": 1104.0, "kl_loss_13": 72.54963207244873, "kl_loss_2": 1816.6078491210938, "kl_loss_4": 983.9069519042969, "kl_loss_9": 283.86315460205077, "learning_rate": 0.000258289358057718, "loss": 812.8871, "step": 6640 }, { "ce_loss_13": 3.312631404399872, "ce_loss_17": 3.2709787249565125, "ce_loss_2": 4.105361819267273, "ce_loss_4": 3.7066175818443297, "ce_loss_9": 3.4137498497962953, "epoch": 0.665, "grad_norm": 880.0, "kl_loss_13": 73.74160442352294, "kl_loss_2": 1820.5200256347657, "kl_loss_4": 976.478402709961, "kl_loss_9": 286.28767547607424, "learning_rate": 0.0002569016319497657, "loss": 788.9465, "step": 6650 }, { "ce_loss_13": 3.2948471903800964, "ce_loss_17": 3.2543782114982607, "ce_loss_2": 4.078556907176972, "ce_loss_4": 3.686791181564331, "ce_loss_9": 3.3982311129570006, "epoch": 0.666, "grad_norm": 812.0, "kl_loss_13": 73.74290733337402, "kl_loss_2": 1815.1135498046874, "kl_loss_4": 980.0942779541016, "kl_loss_9": 287.9010688781738, "learning_rate": 0.00025551635383968066, "loss": 793.4953, "step": 6660 }, { "ce_loss_13": 3.213699662685394, "ce_loss_17": 3.1740002036094666, "ce_loss_2": 4.008650445938111, "ce_loss_4": 3.6066946148872376, "ce_loss_9": 3.3078731894493103, "epoch": 0.667, "grad_norm": 920.0, "kl_loss_13": 71.81263065338135, "kl_loss_2": 1831.4727355957032, "kl_loss_4": 988.4091125488281, "kl_loss_9": 283.7192153930664, "learning_rate": 0.00025413353767719804, "loss": 787.2924, "step": 6670 }, { "ce_loss_13": 3.2675450086593627, "ce_loss_17": 3.230876660346985, "ce_loss_2": 4.050916016101837, "ce_loss_4": 3.6535902619361877, "ce_loss_9": 3.3607989072799684, "epoch": 0.668, "grad_norm": 876.0, "kl_loss_13": 70.10343036651611, "kl_loss_2": 1803.5049194335938, "kl_loss_4": 975.379360961914, "kl_loss_9": 275.31723861694337, "learning_rate": 0.0002527531973872617, "loss": 783.0383, "step": 6680 }, { "ce_loss_13": 3.2812340021133424, "ce_loss_17": 3.2442690253257753, "ce_loss_2": 4.050098311901093, "ce_loss_4": 3.6664043426513673, "ce_loss_9": 3.3740264654159544, "epoch": 0.669, "grad_norm": 1096.0, "kl_loss_13": 70.2721643447876, "kl_loss_2": 1771.3466003417968, "kl_loss_4": 963.0172088623046, "kl_loss_9": 275.112654876709, "learning_rate": 0.0002513753468698826, "loss": 772.7548, "step": 6690 }, { "ce_loss_13": 3.251974892616272, "ce_loss_17": 3.2128549456596374, "ce_loss_2": 4.043203258514405, "ce_loss_4": 3.643755042552948, "ce_loss_9": 3.345508944988251, "epoch": 0.67, "grad_norm": 880.0, "kl_loss_13": 71.74250259399415, "kl_loss_2": 1831.1538635253905, "kl_loss_4": 983.5442596435547, "kl_loss_9": 282.8565361022949, "learning_rate": 0.0002500000000000001, "loss": 786.5076, "step": 6700 }, { "ce_loss_13": 3.3693856596946716, "ce_loss_17": 3.3300490021705627, "ce_loss_2": 4.101691889762878, "ce_loss_4": 3.727284145355225, "ce_loss_9": 3.4568962335586546, "epoch": 0.671, "grad_norm": 772.0, "kl_loss_13": 70.97701282501221, "kl_loss_2": 1705.0662841796875, "kl_loss_4": 922.4608154296875, "kl_loss_9": 270.672705078125, "learning_rate": 0.0002486271706273421, "loss": 782.0349, "step": 6710 }, { "ce_loss_13": 3.3036414980888367, "ce_loss_17": 3.267821168899536, "ce_loss_2": 4.045556378364563, "ce_loss_4": 3.670479393005371, "ce_loss_9": 3.392921674251556, "epoch": 0.672, "grad_norm": 1536.0, "kl_loss_13": 70.02148818969727, "kl_loss_2": 1703.5993408203126, "kl_loss_4": 918.4036315917969, "kl_loss_9": 267.0767433166504, "learning_rate": 0.0002472568725762853, "loss": 763.772, "step": 6720 }, { "ce_loss_13": 3.2973303318023683, "ce_loss_17": 3.25869699716568, "ce_loss_2": 4.035550963878632, "ce_loss_4": 3.6621091604232787, "ce_loss_9": 3.388404607772827, "epoch": 0.673, "grad_norm": 972.0, "kl_loss_13": 69.29530754089356, "kl_loss_2": 1722.9706298828125, "kl_loss_4": 927.8394775390625, "kl_loss_9": 266.9626518249512, "learning_rate": 0.00024588911964571554, "loss": 758.6145, "step": 6730 }, { "ce_loss_13": 3.3077115178108216, "ce_loss_17": 3.2654620885848997, "ce_loss_2": 4.116430401802063, "ce_loss_4": 3.7116676926612855, "ce_loss_9": 3.4078907489776613, "epoch": 0.674, "grad_norm": 1072.0, "kl_loss_13": 75.20856895446778, "kl_loss_2": 1833.9399047851562, "kl_loss_4": 993.9445495605469, "kl_loss_9": 289.59678802490237, "learning_rate": 0.00024452392560888974, "loss": 779.83, "step": 6740 }, { "ce_loss_13": 3.202228772640228, "ce_loss_17": 3.1627680897712707, "ce_loss_2": 3.9784467577934266, "ce_loss_4": 3.589304792881012, "ce_loss_9": 3.301189970970154, "epoch": 0.675, "grad_norm": 840.0, "kl_loss_13": 69.91371078491211, "kl_loss_2": 1786.9942443847656, "kl_loss_4": 964.2825653076172, "kl_loss_9": 273.65016479492186, "learning_rate": 0.00024316130421329695, "loss": 766.0736, "step": 6750 }, { "ce_loss_13": 3.2838388085365295, "ce_loss_17": 3.246099424362183, "ce_loss_2": 4.045943284034729, "ce_loss_4": 3.6641584515571592, "ce_loss_9": 3.377654695510864, "epoch": 0.676, "grad_norm": 700.0, "kl_loss_13": 70.78494644165039, "kl_loss_2": 1752.5845092773438, "kl_loss_4": 948.2985046386718, "kl_loss_9": 273.8147804260254, "learning_rate": 0.00024180126918051909, "loss": 767.4725, "step": 6760 }, { "ce_loss_13": 3.3248173236846923, "ce_loss_17": 3.2853858709335326, "ce_loss_2": 4.0837649464607235, "ce_loss_4": 3.7026811838150024, "ce_loss_9": 3.419490098953247, "epoch": 0.677, "grad_norm": 1000.0, "kl_loss_13": 71.17267417907715, "kl_loss_2": 1750.3757202148438, "kl_loss_4": 943.9945373535156, "kl_loss_9": 275.70510177612306, "learning_rate": 0.00024044383420609406, "loss": 761.4471, "step": 6770 }, { "ce_loss_13": 3.3338271021842956, "ce_loss_17": 3.2949508309364317, "ce_loss_2": 4.071341669559478, "ce_loss_4": 3.6980969429016115, "ce_loss_9": 3.4230048298835754, "epoch": 0.678, "grad_norm": 1144.0, "kl_loss_13": 70.32768592834472, "kl_loss_2": 1737.1424133300782, "kl_loss_4": 944.9273620605469, "kl_loss_9": 273.1325569152832, "learning_rate": 0.00023908901295937712, "loss": 776.1757, "step": 6780 }, { "ce_loss_13": 3.329776632785797, "ce_loss_17": 3.2886913418769836, "ce_loss_2": 4.091409718990326, "ce_loss_4": 3.708410894870758, "ce_loss_9": 3.4214085578918456, "epoch": 0.679, "grad_norm": 1088.0, "kl_loss_13": 71.47972583770752, "kl_loss_2": 1743.7018920898438, "kl_loss_4": 947.4816619873047, "kl_loss_9": 272.5430030822754, "learning_rate": 0.00023773681908340283, "loss": 779.8013, "step": 6790 }, { "ce_loss_13": 3.304816460609436, "ce_loss_17": 3.2629502773284913, "ce_loss_2": 4.09833025932312, "ce_loss_4": 3.7017361760139464, "ce_loss_9": 3.404166114330292, "epoch": 0.68, "grad_norm": 764.0, "kl_loss_13": 75.02878093719482, "kl_loss_2": 1834.7515686035156, "kl_loss_4": 995.4694244384766, "kl_loss_9": 290.82249908447267, "learning_rate": 0.00023638726619474876, "loss": 799.3655, "step": 6800 }, { "ce_loss_13": 3.286585295200348, "ce_loss_17": 3.2471044182777407, "ce_loss_2": 4.1025919079780575, "ce_loss_4": 3.694387984275818, "ce_loss_9": 3.3894974827766418, "epoch": 0.681, "grad_norm": 936.0, "kl_loss_13": 71.55782508850098, "kl_loss_2": 1831.8867919921875, "kl_loss_4": 990.6810455322266, "kl_loss_9": 282.49219741821287, "learning_rate": 0.0002350403678833976, "loss": 785.4149, "step": 6810 }, { "ce_loss_13": 3.217432200908661, "ce_loss_17": 3.1785250067710877, "ce_loss_2": 4.012353837490082, "ce_loss_4": 3.611852240562439, "ce_loss_9": 3.313712215423584, "epoch": 0.682, "grad_norm": 732.0, "kl_loss_13": 70.16158847808838, "kl_loss_2": 1814.9119995117187, "kl_loss_4": 981.8430053710938, "kl_loss_9": 277.7752067565918, "learning_rate": 0.00023369613771260007, "loss": 774.9444, "step": 6820 }, { "ce_loss_13": 3.336046314239502, "ce_loss_17": 3.295581614971161, "ce_loss_2": 4.117106795310974, "ce_loss_4": 3.7286535143852233, "ce_loss_9": 3.4304684519767763, "epoch": 0.683, "grad_norm": 1056.0, "kl_loss_13": 73.21032981872558, "kl_loss_2": 1802.8844848632812, "kl_loss_4": 978.7585845947266, "kl_loss_9": 281.62718200683594, "learning_rate": 0.00023235458921873925, "loss": 786.7231, "step": 6830 }, { "ce_loss_13": 3.289022660255432, "ce_loss_17": 3.248283493518829, "ce_loss_2": 4.117696058750153, "ce_loss_4": 3.704160213470459, "ce_loss_9": 3.3890608310699464, "epoch": 0.684, "grad_norm": 1032.0, "kl_loss_13": 74.40035495758056, "kl_loss_2": 1897.1766357421875, "kl_loss_4": 1015.8494293212891, "kl_loss_9": 289.44225082397463, "learning_rate": 0.0002310157359111938, "loss": 812.3593, "step": 6840 }, { "ce_loss_13": 3.17997887134552, "ce_loss_17": 3.140820550918579, "ce_loss_2": 4.045794034004212, "ce_loss_4": 3.602759265899658, "ce_loss_9": 3.283980429172516, "epoch": 0.685, "grad_norm": 1872.0, "kl_loss_13": 71.50569801330566, "kl_loss_2": 1944.2283752441406, "kl_loss_4": 1027.8053771972657, "kl_loss_9": 287.1462005615234, "learning_rate": 0.0002296795912722014, "loss": 809.6538, "step": 6850 }, { "ce_loss_13": 3.3196268558502195, "ce_loss_17": 3.2815457463264464, "ce_loss_2": 4.0760578393936155, "ce_loss_4": 3.6949382424354553, "ce_loss_9": 3.417670750617981, "epoch": 0.686, "grad_norm": 684.0, "kl_loss_13": 70.69927768707275, "kl_loss_2": 1754.9938415527345, "kl_loss_4": 945.5229858398437, "kl_loss_9": 274.71204833984376, "learning_rate": 0.0002283461687567236, "loss": 755.9741, "step": 6860 }, { "ce_loss_13": 3.378780448436737, "ce_loss_17": 3.339313018321991, "ce_loss_2": 4.116203796863556, "ce_loss_4": 3.7454458117485045, "ce_loss_9": 3.4698862433433533, "epoch": 0.687, "grad_norm": 764.0, "kl_loss_13": 71.76581039428712, "kl_loss_2": 1704.88330078125, "kl_loss_4": 929.5515380859375, "kl_loss_9": 270.881037902832, "learning_rate": 0.00022701548179231045, "loss": 771.088, "step": 6870 }, { "ce_loss_13": 3.330306875705719, "ce_loss_17": 3.2887945055961607, "ce_loss_2": 4.113952279090881, "ce_loss_4": 3.714215672016144, "ce_loss_9": 3.425825297832489, "epoch": 0.688, "grad_norm": 820.0, "kl_loss_13": 72.66767101287842, "kl_loss_2": 1807.6771850585938, "kl_loss_4": 971.4168090820312, "kl_loss_9": 281.0403091430664, "learning_rate": 0.00022568754377896516, "loss": 766.6914, "step": 6880 }, { "ce_loss_13": 3.321236217021942, "ce_loss_17": 3.2834801197052004, "ce_loss_2": 4.078975677490234, "ce_loss_4": 3.7029571533203125, "ce_loss_9": 3.4183809041976927, "epoch": 0.689, "grad_norm": 1104.0, "kl_loss_13": 71.08349094390869, "kl_loss_2": 1754.2367858886719, "kl_loss_4": 954.1943084716797, "kl_loss_9": 278.3357391357422, "learning_rate": 0.00022436236808900844, "loss": 765.88, "step": 6890 }, { "ce_loss_13": 3.2175270676612855, "ce_loss_17": 3.17860347032547, "ce_loss_2": 4.004483795166015, "ce_loss_4": 3.607356011867523, "ce_loss_9": 3.3122127175331117, "epoch": 0.69, "grad_norm": 816.0, "kl_loss_13": 70.88688526153564, "kl_loss_2": 1818.0381103515624, "kl_loss_4": 978.0747833251953, "kl_loss_9": 279.3812728881836, "learning_rate": 0.00022303996806694487, "loss": 776.254, "step": 6900 }, { "ce_loss_13": 3.2926143050193786, "ce_loss_17": 3.2546900272369386, "ce_loss_2": 4.065900957584381, "ce_loss_4": 3.676529657840729, "ce_loss_9": 3.386691427230835, "epoch": 0.691, "grad_norm": 792.0, "kl_loss_13": 70.55873546600341, "kl_loss_2": 1799.1060607910156, "kl_loss_4": 966.7993988037109, "kl_loss_9": 275.59252166748047, "learning_rate": 0.00022172035702932823, "loss": 775.97, "step": 6910 }, { "ce_loss_13": 3.337917852401733, "ce_loss_17": 3.3001177310943604, "ce_loss_2": 4.088469135761261, "ce_loss_4": 3.7172390818595886, "ce_loss_9": 3.426655948162079, "epoch": 0.692, "grad_norm": 1304.0, "kl_loss_13": 71.22828941345215, "kl_loss_2": 1717.326348876953, "kl_loss_4": 943.609243774414, "kl_loss_9": 274.3278610229492, "learning_rate": 0.00022040354826462666, "loss": 759.6712, "step": 6920 }, { "ce_loss_13": 3.2716445088386537, "ce_loss_17": 3.2332261562347413, "ce_loss_2": 4.051820051670075, "ce_loss_4": 3.653087794780731, "ce_loss_9": 3.3642398953437804, "epoch": 0.693, "grad_norm": 1020.0, "kl_loss_13": 70.62501392364501, "kl_loss_2": 1778.5100158691407, "kl_loss_4": 947.1623840332031, "kl_loss_9": 271.7375068664551, "learning_rate": 0.0002190895550330899, "loss": 776.3381, "step": 6930 }, { "ce_loss_13": 3.2025559544563293, "ce_loss_17": 3.1621774196624757, "ce_loss_2": 4.008305644989013, "ce_loss_4": 3.609113883972168, "ce_loss_9": 3.301098358631134, "epoch": 0.694, "grad_norm": 1208.0, "kl_loss_13": 71.739794921875, "kl_loss_2": 1830.299560546875, "kl_loss_4": 996.0399810791016, "kl_loss_9": 283.3093772888184, "learning_rate": 0.00021777839056661552, "loss": 775.2254, "step": 6940 }, { "ce_loss_13": 3.2846802711486816, "ce_loss_17": 3.2477630376815796, "ce_loss_2": 4.051619839668274, "ce_loss_4": 3.6623515844345094, "ce_loss_9": 3.3748562932014465, "epoch": 0.695, "grad_norm": 712.0, "kl_loss_13": 70.80554389953613, "kl_loss_2": 1764.8639221191406, "kl_loss_4": 951.2530120849609, "kl_loss_9": 274.18594512939455, "learning_rate": 0.0002164700680686147, "loss": 760.6781, "step": 6950 }, { "ce_loss_13": 3.334276628494263, "ce_loss_17": 3.2962963581085205, "ce_loss_2": 4.088710451126099, "ce_loss_4": 3.7073917031288146, "ce_loss_9": 3.4302581310272218, "epoch": 0.696, "grad_norm": 1152.0, "kl_loss_13": 72.1072265625, "kl_loss_2": 1724.0063781738281, "kl_loss_4": 936.3488128662109, "kl_loss_9": 275.84831008911135, "learning_rate": 0.0002151646007138806, "loss": 761.3031, "step": 6960 }, { "ce_loss_13": 3.209850013256073, "ce_loss_17": 3.1725902438163756, "ce_loss_2": 4.002887773513794, "ce_loss_4": 3.606672966480255, "ce_loss_9": 3.3055623888969423, "epoch": 0.697, "grad_norm": 772.0, "kl_loss_13": 72.02248516082764, "kl_loss_2": 1826.7546997070312, "kl_loss_4": 989.4366668701172, "kl_loss_9": 279.758309173584, "learning_rate": 0.00021386200164845526, "loss": 778.6626, "step": 6970 }, { "ce_loss_13": 3.3914801478385925, "ce_loss_17": 3.3524523973464966, "ce_loss_2": 4.114296960830688, "ce_loss_4": 3.755442941188812, "ce_loss_9": 3.4810468912124635, "epoch": 0.698, "grad_norm": 636.0, "kl_loss_13": 71.40635414123535, "kl_loss_2": 1709.2556945800782, "kl_loss_4": 944.0305419921875, "kl_loss_9": 274.5273628234863, "learning_rate": 0.0002125622839894964, "loss": 755.8117, "step": 6980 }, { "ce_loss_13": 3.329510045051575, "ce_loss_17": 3.290641796588898, "ce_loss_2": 4.083862352371216, "ce_loss_4": 3.706824839115143, "ce_loss_9": 3.4205021142959593, "epoch": 0.699, "grad_norm": 780.0, "kl_loss_13": 71.35521602630615, "kl_loss_2": 1725.0163208007812, "kl_loss_4": 938.1391387939453, "kl_loss_9": 272.0928482055664, "learning_rate": 0.00021126546082514663, "loss": 758.0833, "step": 6990 }, { "ce_loss_13": 3.3541366338729857, "ce_loss_17": 3.316024458408356, "ce_loss_2": 4.095506346225738, "ce_loss_4": 3.722522163391113, "ce_loss_9": 3.4431343197822573, "epoch": 0.7, "grad_norm": 760.0, "kl_loss_13": 71.6666015625, "kl_loss_2": 1725.105419921875, "kl_loss_4": 942.9446807861328, "kl_loss_9": 274.0315826416016, "learning_rate": 0.00020997154521440098, "loss": 755.1295, "step": 7000 }, { "ce_loss_13": 3.2969266891479494, "ce_loss_17": 3.2600142121315003, "ce_loss_2": 4.0547042965888975, "ce_loss_4": 3.6686680912971497, "ce_loss_9": 3.386641597747803, "epoch": 0.701, "grad_norm": 1080.0, "kl_loss_13": 70.24392051696778, "kl_loss_2": 1758.719580078125, "kl_loss_4": 950.8714721679687, "kl_loss_9": 272.28409042358396, "learning_rate": 0.0002086805501869749, "loss": 755.9173, "step": 7010 }, { "ce_loss_13": 3.264370834827423, "ce_loss_17": 3.226525294780731, "ce_loss_2": 4.065334379673004, "ce_loss_4": 3.667639875411987, "ce_loss_9": 3.3651340007781982, "epoch": 0.702, "grad_norm": 780.0, "kl_loss_13": 71.73685340881347, "kl_loss_2": 1839.5365051269532, "kl_loss_4": 993.7012023925781, "kl_loss_9": 285.7314987182617, "learning_rate": 0.0002073924887431744, "loss": 780.9709, "step": 7020 }, { "ce_loss_13": 3.2721614122390745, "ce_loss_17": 3.2347609400749207, "ce_loss_2": 4.044773375988006, "ce_loss_4": 3.6571881771087646, "ce_loss_9": 3.369364929199219, "epoch": 0.703, "grad_norm": 728.0, "kl_loss_13": 71.0749008178711, "kl_loss_2": 1778.470965576172, "kl_loss_4": 964.4342071533204, "kl_loss_9": 277.28941345214844, "learning_rate": 0.00020610737385376348, "loss": 786.926, "step": 7030 }, { "ce_loss_13": 3.332412588596344, "ce_loss_17": 3.294099271297455, "ce_loss_2": 4.061883521080017, "ce_loss_4": 3.698824071884155, "ce_loss_9": 3.4222095012664795, "epoch": 0.704, "grad_norm": 876.0, "kl_loss_13": 70.61465110778809, "kl_loss_2": 1688.5162780761718, "kl_loss_4": 928.5938690185546, "kl_loss_9": 269.72947082519534, "learning_rate": 0.00020482521845983521, "loss": 767.581, "step": 7040 }, { "ce_loss_13": 3.3323171854019167, "ce_loss_17": 3.2925487518310548, "ce_loss_2": 4.104314303398132, "ce_loss_4": 3.7108531475067137, "ce_loss_9": 3.427067816257477, "epoch": 0.705, "grad_norm": 1160.0, "kl_loss_13": 72.79951210021973, "kl_loss_2": 1783.0196350097656, "kl_loss_4": 961.49697265625, "kl_loss_9": 278.71324615478517, "learning_rate": 0.00020354603547267987, "loss": 781.7756, "step": 7050 }, { "ce_loss_13": 3.3129538536071776, "ce_loss_17": 3.2723392963409426, "ce_loss_2": 4.110062229633331, "ce_loss_4": 3.7134634494781493, "ce_loss_9": 3.410679376125336, "epoch": 0.706, "grad_norm": 736.0, "kl_loss_13": 72.24404449462891, "kl_loss_2": 1795.7587646484376, "kl_loss_4": 975.1206726074219, "kl_loss_9": 279.9232604980469, "learning_rate": 0.00020226983777365604, "loss": 797.2193, "step": 7060 }, { "ce_loss_13": 3.224683380126953, "ce_loss_17": 3.186890208721161, "ce_loss_2": 4.026706266403198, "ce_loss_4": 3.619293999671936, "ce_loss_9": 3.318616247177124, "epoch": 0.707, "grad_norm": 760.0, "kl_loss_13": 68.36813163757324, "kl_loss_2": 1822.1632385253906, "kl_loss_4": 961.1641967773437, "kl_loss_9": 269.3346450805664, "learning_rate": 0.00020099663821406056, "loss": 771.0888, "step": 7070 }, { "ce_loss_13": 3.3214773178100585, "ce_loss_17": 3.283447802066803, "ce_loss_2": 4.071311795711518, "ce_loss_4": 3.695223093032837, "ce_loss_9": 3.413736712932587, "epoch": 0.708, "grad_norm": 1704.0, "kl_loss_13": 70.34514198303222, "kl_loss_2": 1715.6263061523437, "kl_loss_4": 928.1057037353515, "kl_loss_9": 269.1682067871094, "learning_rate": 0.00019972644961499853, "loss": 767.0122, "step": 7080 }, { "ce_loss_13": 3.2867833733558656, "ce_loss_17": 3.247623884677887, "ce_loss_2": 4.082489454746247, "ce_loss_4": 3.6845450401306152, "ce_loss_9": 3.3835286021232607, "epoch": 0.709, "grad_norm": 720.0, "kl_loss_13": 72.10076866149902, "kl_loss_2": 1813.9811889648438, "kl_loss_4": 983.6025085449219, "kl_loss_9": 281.21164016723634, "learning_rate": 0.00019845928476725522, "loss": 779.535, "step": 7090 }, { "ce_loss_13": 3.3654056072235106, "ce_loss_17": 3.3253042817115785, "ce_loss_2": 4.129805624485016, "ce_loss_4": 3.7489982724189757, "ce_loss_9": 3.4594806432724, "epoch": 0.71, "grad_norm": 872.0, "kl_loss_13": 72.55214748382568, "kl_loss_2": 1756.3334106445313, "kl_loss_4": 960.7546752929687, "kl_loss_9": 277.62411270141604, "learning_rate": 0.00019719515643116677, "loss": 792.3727, "step": 7100 }, { "ce_loss_13": 3.309456527233124, "ce_loss_17": 3.27019317150116, "ce_loss_2": 4.06982558965683, "ce_loss_4": 3.6808470845222474, "ce_loss_9": 3.402991759777069, "epoch": 0.711, "grad_norm": 952.0, "kl_loss_13": 71.02709293365479, "kl_loss_2": 1760.2141357421874, "kl_loss_4": 942.3459259033203, "kl_loss_9": 273.5096015930176, "learning_rate": 0.0001959340773364911, "loss": 773.76, "step": 7110 }, { "ce_loss_13": 3.3214038491249083, "ce_loss_17": 3.2831618547439576, "ce_loss_2": 4.088900756835938, "ce_loss_4": 3.7054635286331177, "ce_loss_9": 3.4160606980323793, "epoch": 0.712, "grad_norm": 744.0, "kl_loss_13": 71.1388599395752, "kl_loss_2": 1771.7801818847656, "kl_loss_4": 960.0322570800781, "kl_loss_9": 276.5096008300781, "learning_rate": 0.0001946760601822809, "loss": 758.2503, "step": 7120 }, { "ce_loss_13": 3.37441748380661, "ce_loss_17": 3.336512637138367, "ce_loss_2": 4.1263908505439755, "ce_loss_4": 3.7414964199066163, "ce_loss_9": 3.4689518332481386, "epoch": 0.713, "grad_norm": 812.0, "kl_loss_13": 70.92561588287353, "kl_loss_2": 1740.7447570800782, "kl_loss_4": 931.0078186035156, "kl_loss_9": 273.3617431640625, "learning_rate": 0.00019342111763675512, "loss": 746.9193, "step": 7130 }, { "ce_loss_13": 3.373600149154663, "ce_loss_17": 3.334926092624664, "ce_loss_2": 4.108856225013733, "ce_loss_4": 3.73887095451355, "ce_loss_9": 3.46598562002182, "epoch": 0.714, "grad_norm": 1240.0, "kl_loss_13": 72.69193038940429, "kl_loss_2": 1716.9721557617188, "kl_loss_4": 944.558950805664, "kl_loss_9": 276.685969543457, "learning_rate": 0.00019216926233717085, "loss": 753.1709, "step": 7140 }, { "ce_loss_13": 3.263760483264923, "ce_loss_17": 3.2253942251205445, "ce_loss_2": 4.082875895500183, "ce_loss_4": 3.6528494596481322, "ce_loss_9": 3.356517326831818, "epoch": 0.715, "grad_norm": 840.0, "kl_loss_13": 69.8310625076294, "kl_loss_2": 1855.0406066894532, "kl_loss_4": 966.3692352294922, "kl_loss_9": 269.43797836303713, "learning_rate": 0.00019092050688969737, "loss": 785.2349, "step": 7150 }, { "ce_loss_13": 3.332518148422241, "ce_loss_17": 3.2948469281196595, "ce_loss_2": 4.081777715682984, "ce_loss_4": 3.7075140953063963, "ce_loss_9": 3.4232553839683533, "epoch": 0.716, "grad_norm": 696.0, "kl_loss_13": 70.54215602874756, "kl_loss_2": 1763.9240600585938, "kl_loss_4": 959.4861602783203, "kl_loss_9": 273.6377548217773, "learning_rate": 0.00018967486386928817, "loss": 763.6613, "step": 7160 }, { "ce_loss_13": 3.205380403995514, "ce_loss_17": 3.165660285949707, "ce_loss_2": 3.997770869731903, "ce_loss_4": 3.5989809036254883, "ce_loss_9": 3.3011061429977415, "epoch": 0.717, "grad_norm": 752.0, "kl_loss_13": 70.15151481628418, "kl_loss_2": 1823.458935546875, "kl_loss_4": 981.4985076904297, "kl_loss_9": 280.47804107666013, "learning_rate": 0.00018843234581955443, "loss": 801.8435, "step": 7170 }, { "ce_loss_13": 3.2218088269233705, "ce_loss_17": 3.182110404968262, "ce_loss_2": 4.012683880329132, "ce_loss_4": 3.6198145747184753, "ce_loss_9": 3.318508338928223, "epoch": 0.718, "grad_norm": 884.0, "kl_loss_13": 71.75006408691407, "kl_loss_2": 1810.262518310547, "kl_loss_4": 978.4985412597656, "kl_loss_9": 281.0864456176758, "learning_rate": 0.00018719296525263924, "loss": 781.7542, "step": 7180 }, { "ce_loss_13": 3.316895771026611, "ce_loss_17": 3.2792091727256776, "ce_loss_2": 4.051657652854919, "ce_loss_4": 3.6806211829185487, "ce_loss_9": 3.407132875919342, "epoch": 0.719, "grad_norm": 780.0, "kl_loss_13": 71.19637126922608, "kl_loss_2": 1715.8442016601562, "kl_loss_4": 928.7437896728516, "kl_loss_9": 271.83887710571287, "learning_rate": 0.0001859567346490913, "loss": 752.6343, "step": 7190 }, { "ce_loss_13": 3.2919270396232605, "ce_loss_17": 3.2515564560890198, "ce_loss_2": 4.07712619304657, "ce_loss_4": 3.681968426704407, "ce_loss_9": 3.386718785762787, "epoch": 0.72, "grad_norm": 1020.0, "kl_loss_13": 72.11697540283203, "kl_loss_2": 1800.6586608886719, "kl_loss_4": 971.411962890625, "kl_loss_9": 280.454451751709, "learning_rate": 0.0001847236664577389, "loss": 766.3369, "step": 7200 }, { "ce_loss_13": 3.3172996997833253, "ce_loss_17": 3.278312313556671, "ce_loss_2": 4.056863987445832, "ce_loss_4": 3.686291182041168, "ce_loss_9": 3.4074368476867676, "epoch": 0.721, "grad_norm": 740.0, "kl_loss_13": 70.0596134185791, "kl_loss_2": 1703.831787109375, "kl_loss_4": 926.7533538818359, "kl_loss_9": 269.8570755004883, "learning_rate": 0.00018349377309556487, "loss": 744.4033, "step": 7210 }, { "ce_loss_13": 3.2637869834899904, "ce_loss_17": 3.2251646637916567, "ce_loss_2": 4.072896242141724, "ce_loss_4": 3.6544003009796144, "ce_loss_9": 3.36046644449234, "epoch": 0.722, "grad_norm": 1168.0, "kl_loss_13": 71.50100860595703, "kl_loss_2": 1873.2872375488282, "kl_loss_4": 992.8349487304688, "kl_loss_9": 280.7294204711914, "learning_rate": 0.00018226706694758193, "loss": 789.8526, "step": 7220 }, { "ce_loss_13": 3.334029030799866, "ce_loss_17": 3.297024190425873, "ce_loss_2": 4.095572996139526, "ce_loss_4": 3.712455523014069, "ce_loss_9": 3.4249449014663695, "epoch": 0.723, "grad_norm": 856.0, "kl_loss_13": 71.00616970062256, "kl_loss_2": 1758.424822998047, "kl_loss_4": 952.4993621826172, "kl_loss_9": 276.48938674926757, "learning_rate": 0.0001810435603667075, "loss": 786.2653, "step": 7230 }, { "ce_loss_13": 3.191468024253845, "ce_loss_17": 3.153413712978363, "ce_loss_2": 3.9690565705299377, "ce_loss_4": 3.5743834137916566, "ce_loss_9": 3.2823387384414673, "epoch": 0.724, "grad_norm": 1008.0, "kl_loss_13": 68.86694049835205, "kl_loss_2": 1788.1756591796875, "kl_loss_4": 959.1340911865234, "kl_loss_9": 270.8941192626953, "learning_rate": 0.0001798232656736389, "loss": 784.757, "step": 7240 }, { "ce_loss_13": 3.3603003859519958, "ce_loss_17": 3.322044885158539, "ce_loss_2": 4.09637211561203, "ce_loss_4": 3.72024085521698, "ce_loss_9": 3.451583540439606, "epoch": 0.725, "grad_norm": 836.0, "kl_loss_13": 70.9534122467041, "kl_loss_2": 1698.4436950683594, "kl_loss_4": 915.228955078125, "kl_loss_9": 270.6935722351074, "learning_rate": 0.0001786061951567303, "loss": 755.1846, "step": 7250 }, { "ce_loss_13": 3.274634397029877, "ce_loss_17": 3.2367483615875243, "ce_loss_2": 4.048952245712281, "ce_loss_4": 3.656607174873352, "ce_loss_9": 3.369658660888672, "epoch": 0.726, "grad_norm": 804.0, "kl_loss_13": 71.53393859863282, "kl_loss_2": 1769.5117797851562, "kl_loss_4": 957.7466979980469, "kl_loss_9": 276.7366523742676, "learning_rate": 0.00017739236107186857, "loss": 776.2261, "step": 7260 }, { "ce_loss_13": 3.3679779410362243, "ce_loss_17": 3.329963207244873, "ce_loss_2": 4.0942219495773315, "ce_loss_4": 3.7239694952964784, "ce_loss_9": 3.4533618807792665, "epoch": 0.727, "grad_norm": 1464.0, "kl_loss_13": 70.42549552917481, "kl_loss_2": 1689.8609008789062, "kl_loss_4": 919.8603881835937, "kl_loss_9": 267.1950225830078, "learning_rate": 0.00017618177564234904, "loss": 749.2871, "step": 7270 }, { "ce_loss_13": 3.3417418003082275, "ce_loss_17": 3.3037155866622925, "ce_loss_2": 4.073198449611664, "ce_loss_4": 3.7028775930404665, "ce_loss_9": 3.4276382446289064, "epoch": 0.728, "grad_norm": 760.0, "kl_loss_13": 69.97520542144775, "kl_loss_2": 1678.5777709960937, "kl_loss_4": 915.3362426757812, "kl_loss_9": 263.50071105957034, "learning_rate": 0.00017497445105875377, "loss": 749.8755, "step": 7280 }, { "ce_loss_13": 3.247866189479828, "ce_loss_17": 3.2090843081474305, "ce_loss_2": 4.045860695838928, "ce_loss_4": 3.6415121078491213, "ce_loss_9": 3.3421841621398927, "epoch": 0.729, "grad_norm": 992.0, "kl_loss_13": 70.66439056396484, "kl_loss_2": 1826.7165100097657, "kl_loss_4": 977.5601196289062, "kl_loss_9": 280.64800415039065, "learning_rate": 0.000173770399478828, "loss": 779.078, "step": 7290 }, { "ce_loss_13": 3.173096179962158, "ce_loss_17": 3.1370855569839478, "ce_loss_2": 3.9375186562538147, "ce_loss_4": 3.5511539220809936, "ce_loss_9": 3.263213336467743, "epoch": 0.73, "grad_norm": 1032.0, "kl_loss_13": 68.80027599334717, "kl_loss_2": 1763.6752075195313, "kl_loss_4": 956.6611968994141, "kl_loss_9": 271.0298553466797, "learning_rate": 0.0001725696330273575, "loss": 787.5494, "step": 7300 }, { "ce_loss_13": 3.3573803544044494, "ce_loss_17": 3.3180949211120607, "ce_loss_2": 4.097061896324158, "ce_loss_4": 3.7262882709503176, "ce_loss_9": 3.448265993595123, "epoch": 0.731, "grad_norm": 836.0, "kl_loss_13": 69.84502964019775, "kl_loss_2": 1707.2565124511718, "kl_loss_4": 931.3904113769531, "kl_loss_9": 270.31018524169923, "learning_rate": 0.00017137216379604724, "loss": 749.7715, "step": 7310 }, { "ce_loss_13": 3.241015446186066, "ce_loss_17": 3.2024821400642396, "ce_loss_2": 4.016211712360382, "ce_loss_4": 3.619802701473236, "ce_loss_9": 3.3317625522613525, "epoch": 0.732, "grad_norm": 984.0, "kl_loss_13": 70.11958351135254, "kl_loss_2": 1761.5307678222657, "kl_loss_4": 948.8353576660156, "kl_loss_9": 269.7608154296875, "learning_rate": 0.00017017800384339925, "loss": 766.8838, "step": 7320 }, { "ce_loss_13": 3.193186175823212, "ce_loss_17": 3.154158186912537, "ce_loss_2": 4.000348472595215, "ce_loss_4": 3.593831789493561, "ce_loss_9": 3.289596879482269, "epoch": 0.733, "grad_norm": 868.0, "kl_loss_13": 69.91508522033692, "kl_loss_2": 1842.14296875, "kl_loss_4": 985.5766326904297, "kl_loss_9": 278.39381408691406, "learning_rate": 0.00016898716519459073, "loss": 763.6745, "step": 7330 }, { "ce_loss_13": 3.313058865070343, "ce_loss_17": 3.2736072659492494, "ce_loss_2": 4.118879449367523, "ce_loss_4": 3.7130431056022646, "ce_loss_9": 3.4101476788520815, "epoch": 0.734, "grad_norm": 844.0, "kl_loss_13": 72.03692722320557, "kl_loss_2": 1814.1436828613282, "kl_loss_4": 976.4756866455078, "kl_loss_9": 282.7153793334961, "learning_rate": 0.00016779965984135375, "loss": 774.9484, "step": 7340 }, { "ce_loss_13": 3.225519323348999, "ce_loss_17": 3.1890533685684206, "ce_loss_2": 4.00371423959732, "ce_loss_4": 3.6043476462364197, "ce_loss_9": 3.3188387513160706, "epoch": 0.735, "grad_norm": 776.0, "kl_loss_13": 68.39549350738525, "kl_loss_2": 1771.8436584472656, "kl_loss_4": 936.6364837646485, "kl_loss_9": 268.1987022399902, "learning_rate": 0.00016661549974185424, "loss": 763.8604, "step": 7350 }, { "ce_loss_13": 3.2641552090644836, "ce_loss_17": 3.226039266586304, "ce_loss_2": 4.0332492113113405, "ce_loss_4": 3.6448875188827516, "ce_loss_9": 3.356839954853058, "epoch": 0.736, "grad_norm": 824.0, "kl_loss_13": 71.28998851776123, "kl_loss_2": 1765.07041015625, "kl_loss_4": 949.3002319335938, "kl_loss_9": 274.99008255004884, "learning_rate": 0.00016543469682057105, "loss": 755.6967, "step": 7360 }, { "ce_loss_13": 3.2904924869537355, "ce_loss_17": 3.2508055090904238, "ce_loss_2": 4.065948736667633, "ce_loss_4": 3.678385245800018, "ce_loss_9": 3.3866549491882325, "epoch": 0.737, "grad_norm": 816.0, "kl_loss_13": 71.95163116455078, "kl_loss_2": 1769.7718627929687, "kl_loss_4": 962.1701629638671, "kl_loss_9": 277.88295516967776, "learning_rate": 0.00016425726296817632, "loss": 766.5517, "step": 7370 }, { "ce_loss_13": 3.3040218114852906, "ce_loss_17": 3.2673555254936217, "ce_loss_2": 4.0545696258544925, "ce_loss_4": 3.682534599304199, "ce_loss_9": 3.3967278480529783, "epoch": 0.738, "grad_norm": 740.0, "kl_loss_13": 69.9533712387085, "kl_loss_2": 1727.5446228027345, "kl_loss_4": 941.7377288818359, "kl_loss_9": 268.64248809814455, "learning_rate": 0.00016308321004141607, "loss": 756.726, "step": 7380 }, { "ce_loss_13": 3.2565553188323975, "ce_loss_17": 3.2167924642562866, "ce_loss_2": 4.043980371952057, "ce_loss_4": 3.6463310599327086, "ce_loss_9": 3.356897795200348, "epoch": 0.739, "grad_norm": 1056.0, "kl_loss_13": 72.12949066162109, "kl_loss_2": 1785.9837768554687, "kl_loss_4": 958.8319763183594, "kl_loss_9": 281.2048934936523, "learning_rate": 0.00016191254986299043, "loss": 763.5404, "step": 7390 }, { "ce_loss_13": 3.305033326148987, "ce_loss_17": 3.267452096939087, "ce_loss_2": 4.053663873672486, "ce_loss_4": 3.6732759952545164, "ce_loss_9": 3.3918822526931764, "epoch": 0.74, "grad_norm": 884.0, "kl_loss_13": 70.24519844055176, "kl_loss_2": 1748.568621826172, "kl_loss_4": 941.6577880859375, "kl_loss_9": 269.2971351623535, "learning_rate": 0.00016074529422143398, "loss": 772.0949, "step": 7400 }, { "ce_loss_13": 3.2546313643455504, "ce_loss_17": 3.216951239109039, "ce_loss_2": 4.049028539657593, "ce_loss_4": 3.6475675821304323, "ce_loss_9": 3.34945809841156, "epoch": 0.741, "grad_norm": 1176.0, "kl_loss_13": 70.28467655181885, "kl_loss_2": 1803.59521484375, "kl_loss_4": 959.3927185058594, "kl_loss_9": 274.4232048034668, "learning_rate": 0.0001595814548709983, "loss": 781.1009, "step": 7410 }, { "ce_loss_13": 3.3178256511688233, "ce_loss_17": 3.2781765699386596, "ce_loss_2": 4.100561285018921, "ce_loss_4": 3.7127739071846007, "ce_loss_9": 3.417123830318451, "epoch": 0.742, "grad_norm": 760.0, "kl_loss_13": 72.4094009399414, "kl_loss_2": 1799.5929931640626, "kl_loss_4": 974.4059661865234, "kl_loss_9": 284.3437103271484, "learning_rate": 0.00015842104353153285, "loss": 776.3952, "step": 7420 }, { "ce_loss_13": 3.3347533464431764, "ce_loss_17": 3.297309637069702, "ce_loss_2": 4.107506430149078, "ce_loss_4": 3.7230252385139466, "ce_loss_9": 3.428642463684082, "epoch": 0.743, "grad_norm": 784.0, "kl_loss_13": 71.99139099121093, "kl_loss_2": 1769.0837463378907, "kl_loss_4": 962.6025939941406, "kl_loss_9": 277.31082305908205, "learning_rate": 0.0001572640718883667, "loss": 785.4607, "step": 7430 }, { "ce_loss_13": 3.2768795490264893, "ce_loss_17": 3.2394585609436035, "ce_loss_2": 4.0312792420387265, "ce_loss_4": 3.6453895688056948, "ce_loss_9": 3.366493320465088, "epoch": 0.744, "grad_norm": 844.0, "kl_loss_13": 69.40192909240723, "kl_loss_2": 1744.5479919433594, "kl_loss_4": 938.4931243896484, "kl_loss_9": 269.1444465637207, "learning_rate": 0.0001561105515921915, "loss": 776.6132, "step": 7440 }, { "ce_loss_13": 3.1278898119926453, "ce_loss_17": 3.090610134601593, "ce_loss_2": 3.942419695854187, "ce_loss_4": 3.5280018210411073, "ce_loss_9": 3.223814272880554, "epoch": 0.745, "grad_norm": 1104.0, "kl_loss_13": 68.1939962387085, "kl_loss_2": 1857.2819396972657, "kl_loss_4": 978.3077850341797, "kl_loss_9": 272.3531295776367, "learning_rate": 0.0001549604942589441, "loss": 772.6961, "step": 7450 }, { "ce_loss_13": 3.313250517845154, "ce_loss_17": 3.276069223880768, "ce_loss_2": 4.036946654319763, "ce_loss_4": 3.6701719045639036, "ce_loss_9": 3.400228428840637, "epoch": 0.746, "grad_norm": 736.0, "kl_loss_13": 69.25663681030274, "kl_loss_2": 1669.5231079101563, "kl_loss_4": 901.7798034667969, "kl_loss_9": 264.0007888793945, "learning_rate": 0.00015381391146968864, "loss": 746.5012, "step": 7460 }, { "ce_loss_13": 3.287136948108673, "ce_loss_17": 3.2516398191452027, "ce_loss_2": 4.058957719802857, "ce_loss_4": 3.6662606596946716, "ce_loss_9": 3.379166769981384, "epoch": 0.747, "grad_norm": 1312.0, "kl_loss_13": 68.81890754699707, "kl_loss_2": 1759.797265625, "kl_loss_4": 944.7041290283203, "kl_loss_9": 268.30206604003905, "learning_rate": 0.00015267081477050133, "loss": 769.2019, "step": 7470 }, { "ce_loss_13": 3.3837154507637024, "ce_loss_17": 3.3446292996406557, "ce_loss_2": 4.122562909126282, "ce_loss_4": 3.7535754799842835, "ce_loss_9": 3.4761662602424623, "epoch": 0.748, "grad_norm": 712.0, "kl_loss_13": 72.16933269500733, "kl_loss_2": 1710.0824951171876, "kl_loss_4": 931.889291381836, "kl_loss_9": 273.9661148071289, "learning_rate": 0.00015153121567235335, "loss": 746.5872, "step": 7480 }, { "ce_loss_13": 3.283895766735077, "ce_loss_17": 3.244150185585022, "ce_loss_2": 4.0544509291648865, "ce_loss_4": 3.661958086490631, "ce_loss_9": 3.374240827560425, "epoch": 0.749, "grad_norm": 704.0, "kl_loss_13": 69.87776603698731, "kl_loss_2": 1792.9642272949218, "kl_loss_4": 965.1159057617188, "kl_loss_9": 272.7176933288574, "learning_rate": 0.00015039512565099468, "loss": 751.652, "step": 7490 }, { "ce_loss_13": 3.346974790096283, "ce_loss_17": 3.307431769371033, "ce_loss_2": 4.096277952194214, "ce_loss_4": 3.716240656375885, "ce_loss_9": 3.435083281993866, "epoch": 0.75, "grad_norm": 824.0, "kl_loss_13": 70.67133522033691, "kl_loss_2": 1747.3429260253906, "kl_loss_4": 947.5887268066406, "kl_loss_9": 273.0171401977539, "learning_rate": 0.00014926255614683932, "loss": 788.2286, "step": 7500 }, { "ce_loss_13": 3.2786112070083617, "ce_loss_17": 3.2404692649841307, "ce_loss_2": 4.0347212433815, "ce_loss_4": 3.6455458045005797, "ce_loss_9": 3.3715378761291506, "epoch": 0.751, "grad_norm": 732.0, "kl_loss_13": 70.24715995788574, "kl_loss_2": 1751.265087890625, "kl_loss_4": 936.3882263183593, "kl_loss_9": 272.389094543457, "learning_rate": 0.0001481335185648498, "loss": 765.7348, "step": 7510 }, { "ce_loss_13": 3.2962894678115844, "ce_loss_17": 3.2575737595558167, "ce_loss_2": 4.060336661338806, "ce_loss_4": 3.676244294643402, "ce_loss_9": 3.391019034385681, "epoch": 0.752, "grad_norm": 876.0, "kl_loss_13": 70.00695781707763, "kl_loss_2": 1750.6751708984375, "kl_loss_4": 946.6380157470703, "kl_loss_9": 275.6133514404297, "learning_rate": 0.0001470080242744218, "loss": 756.1388, "step": 7520 }, { "ce_loss_13": 3.291015386581421, "ce_loss_17": 3.253824019432068, "ce_loss_2": 4.068705368041992, "ce_loss_4": 3.669373023509979, "ce_loss_9": 3.381570076942444, "epoch": 0.753, "grad_norm": 872.0, "kl_loss_13": 69.27585773468017, "kl_loss_2": 1781.7065185546876, "kl_loss_4": 955.3471130371094, "kl_loss_9": 270.36650772094725, "learning_rate": 0.0001458860846092705, "loss": 771.5325, "step": 7530 }, { "ce_loss_13": 3.3303808689117433, "ce_loss_17": 3.2930827856063845, "ce_loss_2": 4.065006506443024, "ce_loss_4": 3.6996832132339477, "ce_loss_9": 3.420238471031189, "epoch": 0.754, "grad_norm": 840.0, "kl_loss_13": 69.98066291809081, "kl_loss_2": 1698.9991882324218, "kl_loss_4": 931.16552734375, "kl_loss_9": 268.8388671875, "learning_rate": 0.00014476771086731566, "loss": 742.7994, "step": 7540 }, { "ce_loss_13": 3.423077344894409, "ce_loss_17": 3.3821932554244993, "ce_loss_2": 4.17581205368042, "ce_loss_4": 3.7954395055770873, "ce_loss_9": 3.5166402459144592, "epoch": 0.755, "grad_norm": 788.0, "kl_loss_13": 73.9686882019043, "kl_loss_2": 1728.0974670410155, "kl_loss_4": 930.5460083007813, "kl_loss_9": 276.084765625, "learning_rate": 0.00014365291431056872, "loss": 776.2555, "step": 7550 }, { "ce_loss_13": 3.2581419229507445, "ce_loss_17": 3.2202382564544676, "ce_loss_2": 4.035905528068542, "ce_loss_4": 3.6437175273895264, "ce_loss_9": 3.3557262897491453, "epoch": 0.756, "grad_norm": 948.0, "kl_loss_13": 72.2882619857788, "kl_loss_2": 1792.8220336914062, "kl_loss_4": 968.6297149658203, "kl_loss_9": 281.67093353271486, "learning_rate": 0.00014254170616501827, "loss": 770.4111, "step": 7560 }, { "ce_loss_13": 3.195040285587311, "ce_loss_17": 3.1560064554214478, "ce_loss_2": 4.0122485756874084, "ce_loss_4": 3.609569180011749, "ce_loss_9": 3.297301399707794, "epoch": 0.757, "grad_norm": 1064.0, "kl_loss_13": 71.20706291198731, "kl_loss_2": 1849.8933044433593, "kl_loss_4": 1010.1989196777344, "kl_loss_9": 285.5230583190918, "learning_rate": 0.0001414340976205183, "loss": 802.3704, "step": 7570 }, { "ce_loss_13": 3.2119127988815306, "ce_loss_17": 3.174885427951813, "ce_loss_2": 4.007180690765381, "ce_loss_4": 3.6032917380332945, "ce_loss_9": 3.3073129057884216, "epoch": 0.758, "grad_norm": 800.0, "kl_loss_13": 70.78567810058594, "kl_loss_2": 1807.2638732910157, "kl_loss_4": 963.6115509033203, "kl_loss_9": 274.7288459777832, "learning_rate": 0.00014033009983067452, "loss": 770.9219, "step": 7580 }, { "ce_loss_13": 3.371620202064514, "ce_loss_17": 3.33318635225296, "ce_loss_2": 4.107252395153045, "ce_loss_4": 3.7321237087249757, "ce_loss_9": 3.459933066368103, "epoch": 0.759, "grad_norm": 908.0, "kl_loss_13": 69.97175979614258, "kl_loss_2": 1699.2034057617188, "kl_loss_4": 914.4719757080078, "kl_loss_9": 266.3812965393066, "learning_rate": 0.00013922972391273224, "loss": 751.4869, "step": 7590 }, { "ce_loss_13": 3.376069927215576, "ce_loss_17": 3.337877118587494, "ce_loss_2": 4.137043678760529, "ce_loss_4": 3.7415095448493956, "ce_loss_9": 3.465313446521759, "epoch": 0.76, "grad_norm": 1072.0, "kl_loss_13": 70.89640998840332, "kl_loss_2": 1749.950537109375, "kl_loss_4": 923.0523040771484, "kl_loss_9": 269.26611404418946, "learning_rate": 0.0001381329809474649, "loss": 764.8768, "step": 7600 }, { "ce_loss_13": 3.272611165046692, "ce_loss_17": 3.232573592662811, "ce_loss_2": 4.078569507598877, "ce_loss_4": 3.673508608341217, "ce_loss_9": 3.371352481842041, "epoch": 0.761, "grad_norm": 900.0, "kl_loss_13": 72.22393836975098, "kl_loss_2": 1831.3680297851563, "kl_loss_4": 984.5613494873047, "kl_loss_9": 281.33798599243164, "learning_rate": 0.0001370398819790621, "loss": 784.8211, "step": 7610 }, { "ce_loss_13": 3.412788820266724, "ce_loss_17": 3.372195541858673, "ce_loss_2": 4.15156922340393, "ce_loss_4": 3.778453195095062, "ce_loss_9": 3.5044657707214357, "epoch": 0.762, "grad_norm": 680.0, "kl_loss_13": 71.23473243713379, "kl_loss_2": 1704.420654296875, "kl_loss_4": 923.1366424560547, "kl_loss_9": 270.8112342834473, "learning_rate": 0.00013595043801501794, "loss": 740.0381, "step": 7620 }, { "ce_loss_13": 3.2104438662528993, "ce_loss_17": 3.1701866149902345, "ce_loss_2": 4.04260162115097, "ce_loss_4": 3.6209548711776733, "ce_loss_9": 3.3081387162208555, "epoch": 0.763, "grad_norm": 1240.0, "kl_loss_13": 71.1663465499878, "kl_loss_2": 1879.5203369140625, "kl_loss_4": 1001.1256713867188, "kl_loss_9": 282.31495513916013, "learning_rate": 0.00013486466002602133, "loss": 786.8135, "step": 7630 }, { "ce_loss_13": 3.3238829135894776, "ce_loss_17": 3.2859877705574037, "ce_loss_2": 4.06448130607605, "ce_loss_4": 3.6895562171936036, "ce_loss_9": 3.414611339569092, "epoch": 0.764, "grad_norm": 844.0, "kl_loss_13": 70.97187271118165, "kl_loss_2": 1728.2161987304687, "kl_loss_4": 934.7195190429687, "kl_loss_9": 270.5533042907715, "learning_rate": 0.00013378255894584462, "loss": 777.6823, "step": 7640 }, { "ce_loss_13": 3.2600265622138975, "ce_loss_17": 3.2197046518325805, "ce_loss_2": 4.051760137081146, "ce_loss_4": 3.6472837567329406, "ce_loss_9": 3.3538818955421448, "epoch": 0.765, "grad_norm": 924.0, "kl_loss_13": 71.37140407562256, "kl_loss_2": 1799.4996520996094, "kl_loss_4": 965.9475006103515, "kl_loss_9": 278.5297454833984, "learning_rate": 0.0001327041456712334, "loss": 776.7692, "step": 7650 }, { "ce_loss_13": 3.298428177833557, "ce_loss_17": 3.2592820644378664, "ce_loss_2": 4.06556681394577, "ce_loss_4": 3.6799978971481324, "ce_loss_9": 3.3925637722015383, "epoch": 0.766, "grad_norm": 980.0, "kl_loss_13": 71.00599212646485, "kl_loss_2": 1772.591778564453, "kl_loss_4": 963.7923431396484, "kl_loss_9": 276.6527587890625, "learning_rate": 0.00013162943106179747, "loss": 778.0987, "step": 7660 }, { "ce_loss_13": 3.281181848049164, "ce_loss_17": 3.24316520690918, "ce_loss_2": 4.04365484714508, "ce_loss_4": 3.651939356327057, "ce_loss_9": 3.3700647592544555, "epoch": 0.767, "grad_norm": 736.0, "kl_loss_13": 70.76591453552246, "kl_loss_2": 1750.6124877929688, "kl_loss_4": 946.7601104736328, "kl_loss_9": 271.6425071716309, "learning_rate": 0.00013055842593990132, "loss": 760.269, "step": 7670 }, { "ce_loss_13": 3.2225858211517333, "ce_loss_17": 3.1849503040313722, "ce_loss_2": 3.9886361718177796, "ce_loss_4": 3.603174602985382, "ce_loss_9": 3.316388738155365, "epoch": 0.768, "grad_norm": 904.0, "kl_loss_13": 68.82950401306152, "kl_loss_2": 1732.0895629882812, "kl_loss_4": 934.3409393310546, "kl_loss_9": 269.33024139404296, "learning_rate": 0.00012949114109055414, "loss": 772.0984, "step": 7680 }, { "ce_loss_13": 3.2685733318328856, "ce_loss_17": 3.230886149406433, "ce_loss_2": 4.043664062023163, "ce_loss_4": 3.6572733879089356, "ce_loss_9": 3.366124129295349, "epoch": 0.769, "grad_norm": 768.0, "kl_loss_13": 69.85037384033203, "kl_loss_2": 1776.966748046875, "kl_loss_4": 955.4688507080078, "kl_loss_9": 275.6646896362305, "learning_rate": 0.00012842758726130281, "loss": 776.2871, "step": 7690 }, { "ce_loss_13": 3.3045955061912538, "ce_loss_17": 3.265139162540436, "ce_loss_2": 4.105229377746582, "ce_loss_4": 3.698564016819, "ce_loss_9": 3.4019991993904113, "epoch": 0.77, "grad_norm": 1032.0, "kl_loss_13": 71.63125114440918, "kl_loss_2": 1811.1598754882812, "kl_loss_4": 967.3991241455078, "kl_loss_9": 280.4630210876465, "learning_rate": 0.00012736777516212267, "loss": 765.7549, "step": 7700 }, { "ce_loss_13": 3.305664074420929, "ce_loss_17": 3.2651020765304564, "ce_loss_2": 4.0766695737838745, "ce_loss_4": 3.6917852401733398, "ce_loss_9": 3.40242223739624, "epoch": 0.771, "grad_norm": 892.0, "kl_loss_13": 71.45978965759278, "kl_loss_2": 1781.9480834960937, "kl_loss_4": 961.7313262939454, "kl_loss_9": 278.37108001708987, "learning_rate": 0.00012631171546530968, "loss": 759.236, "step": 7710 }, { "ce_loss_13": 3.313371980190277, "ce_loss_17": 3.272728908061981, "ce_loss_2": 4.079601192474366, "ce_loss_4": 3.702372431755066, "ce_loss_9": 3.4114127159118652, "epoch": 0.772, "grad_norm": 872.0, "kl_loss_13": 72.43958892822266, "kl_loss_2": 1763.9843139648438, "kl_loss_4": 964.8334869384765, "kl_loss_9": 279.30160369873045, "learning_rate": 0.00012525941880537307, "loss": 779.6013, "step": 7720 }, { "ce_loss_13": 3.350739538669586, "ce_loss_17": 3.3108729243278505, "ce_loss_2": 4.1055583477020265, "ce_loss_4": 3.7289539337158204, "ce_loss_9": 3.4414247274398804, "epoch": 0.773, "grad_norm": 2352.0, "kl_loss_13": 71.25534629821777, "kl_loss_2": 1736.2442199707032, "kl_loss_4": 947.9007110595703, "kl_loss_9": 272.0314407348633, "learning_rate": 0.00012421089577892869, "loss": 762.51, "step": 7730 }, { "ce_loss_13": 3.2997055292129516, "ce_loss_17": 3.2605361342430115, "ce_loss_2": 4.074028778076172, "ce_loss_4": 3.6856356263160706, "ce_loss_9": 3.398003101348877, "epoch": 0.774, "grad_norm": 1080.0, "kl_loss_13": 70.94810771942139, "kl_loss_2": 1778.2087951660155, "kl_loss_4": 962.2582336425781, "kl_loss_9": 278.71899871826173, "learning_rate": 0.0001231661569445919, "loss": 771.0827, "step": 7740 }, { "ce_loss_13": 3.1601253509521485, "ce_loss_17": 3.121994066238403, "ce_loss_2": 3.9486268877983095, "ce_loss_4": 3.5523595929145815, "ce_loss_9": 3.25460444688797, "epoch": 0.775, "grad_norm": 808.0, "kl_loss_13": 69.09685134887695, "kl_loss_2": 1793.243145751953, "kl_loss_4": 957.2483337402343, "kl_loss_9": 272.4994636535645, "learning_rate": 0.00012212521282287093, "loss": 784.6723, "step": 7750 }, { "ce_loss_13": 3.3081719994544985, "ce_loss_17": 3.269343101978302, "ce_loss_2": 4.065685200691223, "ce_loss_4": 3.689472830295563, "ce_loss_9": 3.405069386959076, "epoch": 0.776, "grad_norm": 748.0, "kl_loss_13": 72.37533378601074, "kl_loss_2": 1743.4986999511718, "kl_loss_4": 953.6687194824219, "kl_loss_9": 277.7476356506348, "learning_rate": 0.00012108807389606158, "loss": 779.1344, "step": 7760 }, { "ce_loss_13": 3.3046793580055236, "ce_loss_17": 3.2680047869682314, "ce_loss_2": 4.072437536716461, "ce_loss_4": 3.6793755173683165, "ce_loss_9": 3.398095953464508, "epoch": 0.777, "grad_norm": 948.0, "kl_loss_13": 68.8588794708252, "kl_loss_2": 1749.9745239257813, "kl_loss_4": 931.3860107421875, "kl_loss_9": 265.7514595031738, "learning_rate": 0.00012005475060814159, "loss": 758.0013, "step": 7770 }, { "ce_loss_13": 3.24513418674469, "ce_loss_17": 3.205347514152527, "ce_loss_2": 4.035976684093475, "ce_loss_4": 3.631837856769562, "ce_loss_9": 3.3390307068824767, "epoch": 0.778, "grad_norm": 1112.0, "kl_loss_13": 71.06086196899415, "kl_loss_2": 1820.938134765625, "kl_loss_4": 973.9619323730469, "kl_loss_9": 278.1083953857422, "learning_rate": 0.00011902525336466464, "loss": 777.8083, "step": 7780 }, { "ce_loss_13": 3.2333223700523375, "ce_loss_17": 3.1922889590263366, "ce_loss_2": 4.0394504308700565, "ce_loss_4": 3.6332613825798035, "ce_loss_9": 3.3305997610092164, "epoch": 0.779, "grad_norm": 996.0, "kl_loss_13": 71.51114463806152, "kl_loss_2": 1842.697412109375, "kl_loss_4": 987.1933624267579, "kl_loss_9": 281.46219329833986, "learning_rate": 0.00011799959253265668, "loss": 777.4664, "step": 7790 }, { "ce_loss_13": 3.293571639060974, "ce_loss_17": 3.2538684844970702, "ce_loss_2": 4.073249363899231, "ce_loss_4": 3.674374008178711, "ce_loss_9": 3.3861318469047545, "epoch": 0.78, "grad_norm": 992.0, "kl_loss_13": 72.05066471099853, "kl_loss_2": 1801.1092712402344, "kl_loss_4": 965.7812194824219, "kl_loss_9": 278.4226219177246, "learning_rate": 0.00011697777844051105, "loss": 774.5723, "step": 7800 }, { "ce_loss_13": 3.2768689155578614, "ce_loss_17": 3.2374550819396974, "ce_loss_2": 4.0919262886047365, "ce_loss_4": 3.6746891021728514, "ce_loss_9": 3.37604638338089, "epoch": 0.781, "grad_norm": 1056.0, "kl_loss_13": 71.38128356933593, "kl_loss_2": 1849.8265380859375, "kl_loss_4": 979.9144989013672, "kl_loss_9": 278.7414749145508, "learning_rate": 0.00011595982137788402, "loss": 781.9421, "step": 7810 }, { "ce_loss_13": 3.2553463697433473, "ce_loss_17": 3.2170026302337646, "ce_loss_2": 4.000703608989715, "ce_loss_4": 3.627398419380188, "ce_loss_9": 3.346537911891937, "epoch": 0.782, "grad_norm": 876.0, "kl_loss_13": 69.83545207977295, "kl_loss_2": 1720.7229370117188, "kl_loss_4": 933.8058410644531, "kl_loss_9": 268.9400184631348, "learning_rate": 0.00011494573159559212, "loss": 762.1486, "step": 7820 }, { "ce_loss_13": 3.240817403793335, "ce_loss_17": 3.2020283579826354, "ce_loss_2": 4.018700480461121, "ce_loss_4": 3.6315406918525697, "ce_loss_9": 3.3373527765274047, "epoch": 0.783, "grad_norm": 788.0, "kl_loss_13": 70.06516742706299, "kl_loss_2": 1784.2691650390625, "kl_loss_4": 969.1496063232422, "kl_loss_9": 275.128840637207, "learning_rate": 0.00011393551930550828, "loss": 784.6266, "step": 7830 }, { "ce_loss_13": 3.3773342847824095, "ce_loss_17": 3.337771785259247, "ce_loss_2": 4.118187880516052, "ce_loss_4": 3.749299919605255, "ce_loss_9": 3.4668367743492126, "epoch": 0.784, "grad_norm": 976.0, "kl_loss_13": 72.88168487548828, "kl_loss_2": 1722.021337890625, "kl_loss_4": 940.9131134033203, "kl_loss_9": 276.2025405883789, "learning_rate": 0.00011292919468045875, "loss": 759.5694, "step": 7840 }, { "ce_loss_13": 3.331298661231995, "ce_loss_17": 3.293097233772278, "ce_loss_2": 4.090363371372223, "ce_loss_4": 3.708501470088959, "ce_loss_9": 3.4256391763687133, "epoch": 0.785, "grad_norm": 716.0, "kl_loss_13": 70.71659488677979, "kl_loss_2": 1758.3239807128907, "kl_loss_4": 952.2195709228515, "kl_loss_9": 275.8377281188965, "learning_rate": 0.00011192676785412154, "loss": 758.0768, "step": 7850 }, { "ce_loss_13": 3.2701008558273315, "ce_loss_17": 3.2280179619789124, "ce_loss_2": 4.070445036888122, "ce_loss_4": 3.6703123331069945, "ce_loss_9": 3.367347037792206, "epoch": 0.786, "grad_norm": 1312.0, "kl_loss_13": 71.71622810363769, "kl_loss_2": 1807.7423217773437, "kl_loss_4": 965.1001220703125, "kl_loss_9": 275.57702484130857, "learning_rate": 0.00011092824892092374, "loss": 775.3067, "step": 7860 }, { "ce_loss_13": 3.2034618616104127, "ce_loss_17": 3.16623512506485, "ce_loss_2": 4.00084820985794, "ce_loss_4": 3.602157771587372, "ce_loss_9": 3.298418068885803, "epoch": 0.787, "grad_norm": 664.0, "kl_loss_13": 70.10608253479003, "kl_loss_2": 1820.81669921875, "kl_loss_4": 982.6229064941406, "kl_loss_9": 274.26637802124026, "learning_rate": 0.0001099336479359398, "loss": 770.4278, "step": 7870 }, { "ce_loss_13": 3.327226400375366, "ce_loss_17": 3.289413809776306, "ce_loss_2": 4.066450893878937, "ce_loss_4": 3.6946874141693113, "ce_loss_9": 3.4174214959144593, "epoch": 0.788, "grad_norm": 760.0, "kl_loss_13": 70.14546699523926, "kl_loss_2": 1729.8601928710937, "kl_loss_4": 936.3539489746094, "kl_loss_9": 270.65756072998045, "learning_rate": 0.00010894297491479043, "loss": 763.074, "step": 7880 }, { "ce_loss_13": 3.314143991470337, "ce_loss_17": 3.27738618850708, "ce_loss_2": 4.085150945186615, "ce_loss_4": 3.6851558089256287, "ce_loss_9": 3.4041661500930784, "epoch": 0.789, "grad_norm": 736.0, "kl_loss_13": 70.67064800262452, "kl_loss_2": 1768.3765869140625, "kl_loss_4": 944.1422546386718, "kl_loss_9": 274.3588096618652, "learning_rate": 0.00010795623983354214, "loss": 759.3542, "step": 7890 }, { "ce_loss_13": 3.2119219303131104, "ce_loss_17": 3.1734250664710997, "ce_loss_2": 3.9959056615829467, "ce_loss_4": 3.6035253524780275, "ce_loss_9": 3.309849750995636, "epoch": 0.79, "grad_norm": 972.0, "kl_loss_13": 70.98051776885987, "kl_loss_2": 1806.2532165527343, "kl_loss_4": 973.5648162841796, "kl_loss_9": 282.1634231567383, "learning_rate": 0.00010697345262860636, "loss": 769.9423, "step": 7900 }, { "ce_loss_13": 3.346919822692871, "ce_loss_17": 3.3095391273498533, "ce_loss_2": 4.095262408256531, "ce_loss_4": 3.709192931652069, "ce_loss_9": 3.437203562259674, "epoch": 0.791, "grad_norm": 1072.0, "kl_loss_13": 70.88391819000245, "kl_loss_2": 1729.3513916015625, "kl_loss_4": 928.2963439941407, "kl_loss_9": 271.72196731567385, "learning_rate": 0.00010599462319663906, "loss": 752.6117, "step": 7910 }, { "ce_loss_13": 3.3200356721878053, "ce_loss_17": 3.2812179923057556, "ce_loss_2": 4.047398543357849, "ce_loss_4": 3.6830509424209597, "ce_loss_9": 3.4092038750648497, "epoch": 0.792, "grad_norm": 784.0, "kl_loss_13": 69.72565822601318, "kl_loss_2": 1688.08408203125, "kl_loss_4": 914.8679321289062, "kl_loss_9": 266.0558052062988, "learning_rate": 0.00010501976139444191, "loss": 744.4802, "step": 7920 }, { "ce_loss_13": 3.3473254561424257, "ce_loss_17": 3.3079727411270143, "ce_loss_2": 4.098410105705261, "ce_loss_4": 3.719019615650177, "ce_loss_9": 3.43493013381958, "epoch": 0.793, "grad_norm": 1472.0, "kl_loss_13": 71.42715454101562, "kl_loss_2": 1740.3159423828124, "kl_loss_4": 933.7770782470703, "kl_loss_9": 269.4993034362793, "learning_rate": 0.0001040488770388625, "loss": 768.4107, "step": 7930 }, { "ce_loss_13": 3.2970604300498962, "ce_loss_17": 3.2603843569755555, "ce_loss_2": 4.063302505016327, "ce_loss_4": 3.673347556591034, "ce_loss_9": 3.387963795661926, "epoch": 0.794, "grad_norm": 968.0, "kl_loss_13": 70.85658683776856, "kl_loss_2": 1781.4008056640625, "kl_loss_4": 959.971401977539, "kl_loss_9": 274.77855758666993, "learning_rate": 0.00010308197990669538, "loss": 766.2293, "step": 7940 }, { "ce_loss_13": 3.4099225759506226, "ce_loss_17": 3.3691091775894164, "ce_loss_2": 4.165284764766693, "ce_loss_4": 3.784447419643402, "ce_loss_9": 3.500907635688782, "epoch": 0.795, "grad_norm": 880.0, "kl_loss_13": 73.56418590545654, "kl_loss_2": 1752.826593017578, "kl_loss_4": 951.5711303710938, "kl_loss_9": 278.11546325683594, "learning_rate": 0.0001021190797345839, "loss": 760.0999, "step": 7950 }, { "ce_loss_13": 3.135342812538147, "ce_loss_17": 3.0957832813262938, "ce_loss_2": 3.9510748744010926, "ce_loss_4": 3.5491525173187255, "ce_loss_9": 3.233706223964691, "epoch": 0.796, "grad_norm": 1040.0, "kl_loss_13": 71.97039318084717, "kl_loss_2": 1850.091290283203, "kl_loss_4": 1004.5883972167969, "kl_loss_9": 287.36984634399414, "learning_rate": 0.00010116018621892236, "loss": 782.3515, "step": 7960 }, { "ce_loss_13": 3.341820991039276, "ce_loss_17": 3.3028963685035704, "ce_loss_2": 4.1250855922698975, "ce_loss_4": 3.729242372512817, "ce_loss_9": 3.4384153485298157, "epoch": 0.797, "grad_norm": 836.0, "kl_loss_13": 74.41549568176269, "kl_loss_2": 1800.5012512207031, "kl_loss_4": 978.0294982910157, "kl_loss_9": 285.6070960998535, "learning_rate": 0.00010020530901575753, "loss": 762.0277, "step": 7970 }, { "ce_loss_13": 3.369021511077881, "ce_loss_17": 3.3300336837768554, "ce_loss_2": 4.121864223480225, "ce_loss_4": 3.742775762081146, "ce_loss_9": 3.4580808758735655, "epoch": 0.798, "grad_norm": 688.0, "kl_loss_13": 71.71014213562012, "kl_loss_2": 1744.4852905273438, "kl_loss_4": 954.4276824951172, "kl_loss_9": 276.1943420410156, "learning_rate": 9.925445774069231e-05, "loss": 751.1472, "step": 7980 }, { "ce_loss_13": 3.3181877017021177, "ce_loss_17": 3.2799045562744142, "ce_loss_2": 4.085768818855286, "ce_loss_4": 3.7050577878952025, "ce_loss_9": 3.41319922208786, "epoch": 0.799, "grad_norm": 908.0, "kl_loss_13": 71.46154289245605, "kl_loss_2": 1742.6618408203126, "kl_loss_4": 944.5083557128906, "kl_loss_9": 273.0162544250488, "learning_rate": 9.830764196878872e-05, "loss": 749.2419, "step": 7990 }, { "ce_loss_13": 3.263114702701569, "ce_loss_17": 3.2257192015647886, "ce_loss_2": 4.035439610481262, "ce_loss_4": 3.6414616227149965, "ce_loss_9": 3.3571359753608703, "epoch": 0.8, "grad_norm": 748.0, "kl_loss_13": 69.72845554351807, "kl_loss_2": 1804.6130981445312, "kl_loss_4": 963.6896820068359, "kl_loss_9": 273.2717483520508, "learning_rate": 9.736487123447069e-05, "loss": 769.4352, "step": 8000 }, { "ce_loss_13": 3.215253698825836, "ce_loss_17": 3.175714838504791, "ce_loss_2": 4.042688941955566, "ce_loss_4": 3.6105726838111876, "ce_loss_9": 3.3094853520393372, "epoch": 0.801, "grad_norm": 692.0, "kl_loss_13": 70.90237617492676, "kl_loss_2": 1899.3806396484374, "kl_loss_4": 990.8334075927735, "kl_loss_9": 274.07350997924806, "learning_rate": 9.642615503142926e-05, "loss": 789.7463, "step": 8010 }, { "ce_loss_13": 3.2806339621543885, "ce_loss_17": 3.2416765809059145, "ce_loss_2": 4.056443822383881, "ce_loss_4": 3.6604487776756285, "ce_loss_9": 3.376413810253143, "epoch": 0.802, "grad_norm": 884.0, "kl_loss_13": 70.28888111114502, "kl_loss_2": 1780.5822021484375, "kl_loss_4": 944.0170684814453, "kl_loss_9": 270.7223930358887, "learning_rate": 9.549150281252633e-05, "loss": 755.7942, "step": 8020 }, { "ce_loss_13": 3.3055353283882143, "ce_loss_17": 3.2666314482688903, "ce_loss_2": 4.071729218959808, "ce_loss_4": 3.6816184878349305, "ce_loss_9": 3.398531424999237, "epoch": 0.803, "grad_norm": 852.0, "kl_loss_13": 71.74902591705322, "kl_loss_2": 1772.0785400390625, "kl_loss_4": 942.818148803711, "kl_loss_9": 274.69599533081055, "learning_rate": 9.4560923989699e-05, "loss": 776.9241, "step": 8030 }, { "ce_loss_13": 3.2983041524887087, "ce_loss_17": 3.260856258869171, "ce_loss_2": 4.068617153167724, "ce_loss_4": 3.6743965983390807, "ce_loss_9": 3.3917889475822447, "epoch": 0.804, "grad_norm": 900.0, "kl_loss_13": 70.64652194976807, "kl_loss_2": 1753.8257873535156, "kl_loss_4": 938.5328674316406, "kl_loss_9": 273.6092224121094, "learning_rate": 9.363442793386607e-05, "loss": 777.0836, "step": 8040 }, { "ce_loss_13": 3.2714555144309996, "ce_loss_17": 3.231795275211334, "ce_loss_2": 4.066785430908203, "ce_loss_4": 3.6732001066207887, "ce_loss_9": 3.3679249763488768, "epoch": 0.805, "grad_norm": 984.0, "kl_loss_13": 72.06962203979492, "kl_loss_2": 1801.3849243164063, "kl_loss_4": 984.603271484375, "kl_loss_9": 283.51671295166017, "learning_rate": 9.271202397483213e-05, "loss": 761.9754, "step": 8050 }, { "ce_loss_13": 3.3052454233169555, "ce_loss_17": 3.2672803163528443, "ce_loss_2": 4.0526569247245785, "ce_loss_4": 3.667846715450287, "ce_loss_9": 3.3939131259918214, "epoch": 0.806, "grad_norm": 832.0, "kl_loss_13": 70.0618745803833, "kl_loss_2": 1736.9171203613282, "kl_loss_4": 927.4877319335938, "kl_loss_9": 268.56889114379885, "learning_rate": 9.179372140119524e-05, "loss": 771.3464, "step": 8060 }, { "ce_loss_13": 3.250062882900238, "ce_loss_17": 3.2121487855911255, "ce_loss_2": 3.9981072664260866, "ce_loss_4": 3.618996787071228, "ce_loss_9": 3.339201021194458, "epoch": 0.807, "grad_norm": 968.0, "kl_loss_13": 69.4912302017212, "kl_loss_2": 1744.63623046875, "kl_loss_4": 938.9472076416016, "kl_loss_9": 271.9808044433594, "learning_rate": 9.087952946025175e-05, "loss": 772.0036, "step": 8070 }, { "ce_loss_13": 3.3562884092330934, "ce_loss_17": 3.3175664067268373, "ce_loss_2": 4.0800391912460325, "ce_loss_4": 3.7106515407562255, "ce_loss_9": 3.4446619272232057, "epoch": 0.808, "grad_norm": 704.0, "kl_loss_13": 70.61629962921143, "kl_loss_2": 1696.0239379882812, "kl_loss_4": 904.205810546875, "kl_loss_9": 266.76557540893555, "learning_rate": 8.996945735790446e-05, "loss": 758.6735, "step": 8080 }, { "ce_loss_13": 3.2506332993507385, "ce_loss_17": 3.212847054004669, "ce_loss_2": 4.017482662200928, "ce_loss_4": 3.631740617752075, "ce_loss_9": 3.3438016176223755, "epoch": 0.809, "grad_norm": 756.0, "kl_loss_13": 70.42041702270508, "kl_loss_2": 1766.6884155273438, "kl_loss_4": 954.8519500732422, "kl_loss_9": 271.8013496398926, "learning_rate": 8.906351425856951e-05, "loss": 775.4984, "step": 8090 }, { "ce_loss_13": 3.2348312735557556, "ce_loss_17": 3.196401846408844, "ce_loss_2": 4.024988865852356, "ce_loss_4": 3.624850368499756, "ce_loss_9": 3.3307148933410646, "epoch": 0.81, "grad_norm": 1096.0, "kl_loss_13": 70.86194591522217, "kl_loss_2": 1827.4809509277343, "kl_loss_4": 976.5987243652344, "kl_loss_9": 276.3275917053223, "learning_rate": 8.816170928508365e-05, "loss": 782.7902, "step": 8100 }, { "ce_loss_13": 3.202365779876709, "ce_loss_17": 3.164315390586853, "ce_loss_2": 4.0153038740158085, "ce_loss_4": 3.5968923449516295, "ce_loss_9": 3.3008163809776305, "epoch": 0.811, "grad_norm": 700.0, "kl_loss_13": 70.01141414642333, "kl_loss_2": 1855.7152587890625, "kl_loss_4": 981.6325653076171, "kl_loss_9": 277.6419342041016, "learning_rate": 8.7264051518613e-05, "loss": 779.224, "step": 8110 }, { "ce_loss_13": 3.290323805809021, "ce_loss_17": 3.2542584896087647, "ce_loss_2": 4.046256864070893, "ce_loss_4": 3.662633013725281, "ce_loss_9": 3.3823811054229735, "epoch": 0.812, "grad_norm": 948.0, "kl_loss_13": 68.87970733642578, "kl_loss_2": 1737.5715454101562, "kl_loss_4": 926.0573944091797, "kl_loss_9": 267.5809020996094, "learning_rate": 8.637054999856148e-05, "loss": 759.8539, "step": 8120 }, { "ce_loss_13": 3.2804335355758667, "ce_loss_17": 3.2413512229919434, "ce_loss_2": 4.054854559898376, "ce_loss_4": 3.6653516054153443, "ce_loss_9": 3.3767602682113647, "epoch": 0.813, "grad_norm": 800.0, "kl_loss_13": 70.57294750213623, "kl_loss_2": 1777.4925964355468, "kl_loss_4": 956.8676177978516, "kl_loss_9": 274.3025924682617, "learning_rate": 8.548121372247918e-05, "loss": 779.4507, "step": 8130 }, { "ce_loss_13": 3.349512314796448, "ce_loss_17": 3.3119771838188172, "ce_loss_2": 4.094702482223511, "ce_loss_4": 3.7121527791023254, "ce_loss_9": 3.4385990619659426, "epoch": 0.814, "grad_norm": 960.0, "kl_loss_13": 70.90967864990235, "kl_loss_2": 1739.1879577636719, "kl_loss_4": 929.7545379638672, "kl_loss_9": 271.06103591918946, "learning_rate": 8.459605164597267e-05, "loss": 759.1175, "step": 8140 }, { "ce_loss_13": 3.2360759139060975, "ce_loss_17": 3.197964668273926, "ce_loss_2": 4.00811208486557, "ce_loss_4": 3.6193493843078612, "ce_loss_9": 3.3254319429397583, "epoch": 0.815, "grad_norm": 900.0, "kl_loss_13": 69.83507804870605, "kl_loss_2": 1777.051202392578, "kl_loss_4": 955.8897918701172, "kl_loss_9": 271.30548400878905, "learning_rate": 8.371507268261436e-05, "loss": 771.6216, "step": 8150 }, { "ce_loss_13": 3.3079094648361207, "ce_loss_17": 3.2703657865524294, "ce_loss_2": 4.0787346959114075, "ce_loss_4": 3.692156195640564, "ce_loss_9": 3.4013009071350098, "epoch": 0.816, "grad_norm": 640.0, "kl_loss_13": 70.76464653015137, "kl_loss_2": 1769.2670593261719, "kl_loss_4": 954.974008178711, "kl_loss_9": 275.3981559753418, "learning_rate": 8.283828570385238e-05, "loss": 750.3438, "step": 8160 }, { "ce_loss_13": 3.3092649817466735, "ce_loss_17": 3.2702112197875977, "ce_loss_2": 4.073802161216736, "ce_loss_4": 3.688433313369751, "ce_loss_9": 3.4022815227508545, "epoch": 0.817, "grad_norm": 728.0, "kl_loss_13": 71.31716861724854, "kl_loss_2": 1728.0110778808594, "kl_loss_4": 938.0338684082031, "kl_loss_9": 272.21169128417966, "learning_rate": 8.196569953892202e-05, "loss": 758.1972, "step": 8170 }, { "ce_loss_13": 3.230646347999573, "ce_loss_17": 3.1929505705833434, "ce_loss_2": 4.0089329242706295, "ce_loss_4": 3.6149255514144896, "ce_loss_9": 3.3274157643318176, "epoch": 0.818, "grad_norm": 888.0, "kl_loss_13": 70.58127098083496, "kl_loss_2": 1758.4310180664063, "kl_loss_4": 950.9697113037109, "kl_loss_9": 274.587052154541, "learning_rate": 8.109732297475635e-05, "loss": 758.4494, "step": 8180 }, { "ce_loss_13": 3.1995676875114443, "ce_loss_17": 3.1593656182289123, "ce_loss_2": 4.04011173248291, "ce_loss_4": 3.627755606174469, "ce_loss_9": 3.3016679525375365, "epoch": 0.819, "grad_norm": 796.0, "kl_loss_13": 72.1693660736084, "kl_loss_2": 1876.0396484375, "kl_loss_4": 1012.8940368652344, "kl_loss_9": 286.2684616088867, "learning_rate": 8.023316475589754e-05, "loss": 792.0807, "step": 8190 }, { "ce_loss_13": 3.168242931365967, "ce_loss_17": 3.1274430990219115, "ce_loss_2": 4.0250523686409, "ce_loss_4": 3.5892801761627195, "ce_loss_9": 3.2700828194618223, "epoch": 0.82, "grad_norm": 1320.0, "kl_loss_13": 73.71393585205078, "kl_loss_2": 1926.69619140625, "kl_loss_4": 1025.3121826171875, "kl_loss_9": 291.9488471984863, "learning_rate": 7.937323358440934e-05, "loss": 808.6539, "step": 8200 }, { "ce_loss_13": 3.291755425930023, "ce_loss_17": 3.256150817871094, "ce_loss_2": 4.031732153892517, "ce_loss_4": 3.6551050424575804, "ce_loss_9": 3.379641282558441, "epoch": 0.821, "grad_norm": 708.0, "kl_loss_13": 69.81248378753662, "kl_loss_2": 1714.6736938476563, "kl_loss_4": 931.0094848632813, "kl_loss_9": 268.1382438659668, "learning_rate": 7.851753811978923e-05, "loss": 758.6292, "step": 8210 }, { "ce_loss_13": 3.306903636455536, "ce_loss_17": 3.2677652716636656, "ce_loss_2": 4.0906357884407045, "ce_loss_4": 3.69010044336319, "ce_loss_9": 3.4002402305603026, "epoch": 0.822, "grad_norm": 868.0, "kl_loss_13": 71.25135803222656, "kl_loss_2": 1789.5710693359374, "kl_loss_4": 954.998403930664, "kl_loss_9": 273.52307586669923, "learning_rate": 7.766608697888095e-05, "loss": 763.3939, "step": 8220 }, { "ce_loss_13": 3.3179776072502136, "ce_loss_17": 3.2780291557312013, "ce_loss_2": 4.08966873884201, "ce_loss_4": 3.69758517742157, "ce_loss_9": 3.4125046730041504, "epoch": 0.823, "grad_norm": 1112.0, "kl_loss_13": 72.05121192932128, "kl_loss_2": 1786.785711669922, "kl_loss_4": 958.7785858154297, "kl_loss_9": 277.2726997375488, "learning_rate": 7.681888873578785e-05, "loss": 778.8477, "step": 8230 }, { "ce_loss_13": 3.2508747458457945, "ce_loss_17": 3.2092309474945067, "ce_loss_2": 4.041729521751404, "ce_loss_4": 3.643390250205994, "ce_loss_9": 3.349603259563446, "epoch": 0.824, "grad_norm": 884.0, "kl_loss_13": 72.39484767913818, "kl_loss_2": 1815.295880126953, "kl_loss_4": 973.4565734863281, "kl_loss_9": 281.2719497680664, "learning_rate": 7.597595192178702e-05, "loss": 769.4248, "step": 8240 }, { "ce_loss_13": 3.250980806350708, "ce_loss_17": 3.2109704494476317, "ce_loss_2": 4.054522025585174, "ce_loss_4": 3.645539367198944, "ce_loss_9": 3.34756623506546, "epoch": 0.825, "grad_norm": 768.0, "kl_loss_13": 72.21156806945801, "kl_loss_2": 1856.1005126953125, "kl_loss_4": 995.8562927246094, "kl_loss_9": 282.67078399658203, "learning_rate": 7.513728502524286e-05, "loss": 790.4012, "step": 8250 }, { "ce_loss_13": 3.250458598136902, "ce_loss_17": 3.2125913977622984, "ce_loss_2": 4.01266096830368, "ce_loss_4": 3.622589910030365, "ce_loss_9": 3.3403961658477783, "epoch": 0.826, "grad_norm": 1040.0, "kl_loss_13": 68.84297733306884, "kl_loss_2": 1746.9054443359375, "kl_loss_4": 937.1652435302734, "kl_loss_9": 268.0669677734375, "learning_rate": 7.430289649152156e-05, "loss": 771.2598, "step": 8260 }, { "ce_loss_13": 3.1503018498420716, "ce_loss_17": 3.1115196704864503, "ce_loss_2": 3.9706924200057983, "ce_loss_4": 3.5614026308059694, "ce_loss_9": 3.2497955560684204, "epoch": 0.827, "grad_norm": 900.0, "kl_loss_13": 70.41055603027344, "kl_loss_2": 1863.2432922363282, "kl_loss_4": 1000.8920288085938, "kl_loss_9": 280.12621841430666, "learning_rate": 7.347279472290646e-05, "loss": 778.9465, "step": 8270 }, { "ce_loss_13": 3.2909953236579894, "ce_loss_17": 3.2529433488845827, "ce_loss_2": 4.078157913684845, "ce_loss_4": 3.674878942966461, "ce_loss_9": 3.3848023533821108, "epoch": 0.828, "grad_norm": 744.0, "kl_loss_13": 71.25290489196777, "kl_loss_2": 1796.028582763672, "kl_loss_4": 962.6837554931641, "kl_loss_9": 272.6476791381836, "learning_rate": 7.264698807851328e-05, "loss": 779.5428, "step": 8280 }, { "ce_loss_13": 3.263857388496399, "ce_loss_17": 3.2270354986190797, "ce_loss_2": 4.014006841182709, "ce_loss_4": 3.6293476581573487, "ce_loss_9": 3.351469397544861, "epoch": 0.829, "grad_norm": 840.0, "kl_loss_13": 68.5794059753418, "kl_loss_2": 1724.5256225585938, "kl_loss_4": 925.4598114013672, "kl_loss_9": 267.2643295288086, "learning_rate": 7.182548487420554e-05, "loss": 759.1056, "step": 8290 }, { "ce_loss_13": 3.311834120750427, "ce_loss_17": 3.272626531124115, "ce_loss_2": 4.066555631160736, "ce_loss_4": 3.6880828857421877, "ce_loss_9": 3.4068371653556824, "epoch": 0.83, "grad_norm": 692.0, "kl_loss_13": 71.82446117401123, "kl_loss_2": 1767.156024169922, "kl_loss_4": 956.2914825439453, "kl_loss_9": 277.0776992797852, "learning_rate": 7.100829338251146e-05, "loss": 766.3485, "step": 8300 }, { "ce_loss_13": 3.249209761619568, "ce_loss_17": 3.209535229206085, "ce_loss_2": 4.045548975467682, "ce_loss_4": 3.6445645332336425, "ce_loss_9": 3.344308114051819, "epoch": 0.831, "grad_norm": 984.0, "kl_loss_13": 71.40059185028076, "kl_loss_2": 1808.8923950195312, "kl_loss_4": 975.227816772461, "kl_loss_9": 279.40746307373047, "learning_rate": 7.019542183254046e-05, "loss": 768.1299, "step": 8310 }, { "ce_loss_13": 3.285986268520355, "ce_loss_17": 3.24474892616272, "ce_loss_2": 4.044291996955872, "ce_loss_4": 3.65914249420166, "ce_loss_9": 3.379481887817383, "epoch": 0.832, "grad_norm": 1136.0, "kl_loss_13": 74.21089000701905, "kl_loss_2": 1763.9286071777344, "kl_loss_4": 951.864404296875, "kl_loss_9": 279.7374038696289, "learning_rate": 6.938687840989971e-05, "loss": 768.0131, "step": 8320 }, { "ce_loss_13": 3.227405917644501, "ce_loss_17": 3.1868462920188905, "ce_loss_2": 3.9979352712631226, "ce_loss_4": 3.6121195673942568, "ce_loss_9": 3.3207536339759827, "epoch": 0.833, "grad_norm": 992.0, "kl_loss_13": 71.92461414337158, "kl_loss_2": 1749.97158203125, "kl_loss_4": 951.9216796875, "kl_loss_9": 275.5987823486328, "learning_rate": 6.858267125661271e-05, "loss": 773.1094, "step": 8330 }, { "ce_loss_13": 3.283363175392151, "ce_loss_17": 3.2460866570472717, "ce_loss_2": 4.060705983638764, "ce_loss_4": 3.6730141282081603, "ce_loss_9": 3.380064380168915, "epoch": 0.834, "grad_norm": 1192.0, "kl_loss_13": 69.57496089935303, "kl_loss_2": 1759.7617492675781, "kl_loss_4": 951.299429321289, "kl_loss_9": 272.3924873352051, "learning_rate": 6.778280847103668e-05, "loss": 782.0201, "step": 8340 }, { "ce_loss_13": 3.2931501269340515, "ce_loss_17": 3.2552053213119505, "ce_loss_2": 4.059948515892029, "ce_loss_4": 3.6772831797599794, "ce_loss_9": 3.386395478248596, "epoch": 0.835, "grad_norm": 756.0, "kl_loss_13": 71.99072380065918, "kl_loss_2": 1784.4557189941406, "kl_loss_4": 968.9391723632813, "kl_loss_9": 280.3265556335449, "learning_rate": 6.698729810778065e-05, "loss": 768.9306, "step": 8350 }, { "ce_loss_13": 3.2062022924423217, "ce_loss_17": 3.167280352115631, "ce_loss_2": 3.992838132381439, "ce_loss_4": 3.594524645805359, "ce_loss_9": 3.3012266397476195, "epoch": 0.836, "grad_norm": 1392.0, "kl_loss_13": 68.42381629943847, "kl_loss_2": 1783.9000061035156, "kl_loss_4": 959.2278900146484, "kl_loss_9": 273.95938339233396, "learning_rate": 6.619614817762538e-05, "loss": 774.9958, "step": 8360 }, { "ce_loss_13": 3.170720875263214, "ce_loss_17": 3.131775438785553, "ce_loss_2": 4.007088744640351, "ce_loss_4": 3.580740916728973, "ce_loss_9": 3.26929292678833, "epoch": 0.837, "grad_norm": 808.0, "kl_loss_13": 69.11339492797852, "kl_loss_2": 1886.8944580078125, "kl_loss_4": 999.9775939941406, "kl_loss_9": 279.94310607910154, "learning_rate": 6.540936664744196e-05, "loss": 790.6644, "step": 8370 }, { "ce_loss_13": 3.3124632954597475, "ce_loss_17": 3.272116649150848, "ce_loss_2": 4.103182435035706, "ce_loss_4": 3.7041156768798826, "ce_loss_9": 3.4053141117095946, "epoch": 0.838, "grad_norm": 652.0, "kl_loss_13": 72.02493133544922, "kl_loss_2": 1790.3144104003907, "kl_loss_4": 962.9958099365234, "kl_loss_9": 275.7400604248047, "learning_rate": 6.462696144011149e-05, "loss": 764.9788, "step": 8380 }, { "ce_loss_13": 3.2692295789718626, "ce_loss_17": 3.230850112438202, "ce_loss_2": 4.035164856910706, "ce_loss_4": 3.6583542227745056, "ce_loss_9": 3.3649362325668335, "epoch": 0.839, "grad_norm": 964.0, "kl_loss_13": 72.52585830688477, "kl_loss_2": 1767.6659545898438, "kl_loss_4": 971.1108917236328, "kl_loss_9": 279.3069702148438, "learning_rate": 6.384894043444567e-05, "loss": 762.8141, "step": 8390 }, { "ce_loss_13": 3.295042598247528, "ce_loss_17": 3.257158863544464, "ce_loss_2": 4.079833257198334, "ce_loss_4": 3.6833842754364015, "ce_loss_9": 3.39192214012146, "epoch": 0.84, "grad_norm": 1008.0, "kl_loss_13": 71.43204746246337, "kl_loss_2": 1781.3298583984374, "kl_loss_4": 956.8086730957032, "kl_loss_9": 275.4544845581055, "learning_rate": 6.307531146510753e-05, "loss": 764.5186, "step": 8400 }, { "ce_loss_13": 3.270242619514465, "ce_loss_17": 3.231538414955139, "ce_loss_2": 4.02031763792038, "ce_loss_4": 3.6488274216651915, "ce_loss_9": 3.361456108093262, "epoch": 0.841, "grad_norm": 992.0, "kl_loss_13": 70.6785327911377, "kl_loss_2": 1725.7596313476563, "kl_loss_4": 943.61474609375, "kl_loss_9": 273.6351890563965, "learning_rate": 6.230608232253226e-05, "loss": 751.6663, "step": 8410 }, { "ce_loss_13": 3.229662823677063, "ce_loss_17": 3.19106924533844, "ce_loss_2": 4.043300378322601, "ce_loss_4": 3.634285008907318, "ce_loss_9": 3.324590063095093, "epoch": 0.842, "grad_norm": 1056.0, "kl_loss_13": 71.09436721801758, "kl_loss_2": 1843.1129028320313, "kl_loss_4": 992.1776947021484, "kl_loss_9": 281.0560470581055, "learning_rate": 6.154126075284855e-05, "loss": 770.8019, "step": 8420 }, { "ce_loss_13": 3.3237983703613283, "ce_loss_17": 3.2856169939041138, "ce_loss_2": 4.068436872959137, "ce_loss_4": 3.697681736946106, "ce_loss_9": 3.4135434150695803, "epoch": 0.843, "grad_norm": 964.0, "kl_loss_13": 69.21185264587402, "kl_loss_2": 1709.265692138672, "kl_loss_4": 943.1334503173828, "kl_loss_9": 268.73162231445315, "learning_rate": 6.078085445780129e-05, "loss": 747.3139, "step": 8430 }, { "ce_loss_13": 3.326972723007202, "ce_loss_17": 3.2875264525413512, "ce_loss_2": 4.109538185596466, "ce_loss_4": 3.707826542854309, "ce_loss_9": 3.420196759700775, "epoch": 0.844, "grad_norm": 988.0, "kl_loss_13": 72.10048751831054, "kl_loss_2": 1796.7720275878905, "kl_loss_4": 955.2393432617188, "kl_loss_9": 276.44380645751954, "learning_rate": 6.002487109467347e-05, "loss": 757.8997, "step": 8440 }, { "ce_loss_13": 3.3340129971504213, "ce_loss_17": 3.2949753284454344, "ce_loss_2": 4.08952693939209, "ce_loss_4": 3.7143755912780763, "ce_loss_9": 3.4288853764533997, "epoch": 0.845, "grad_norm": 788.0, "kl_loss_13": 72.3598985671997, "kl_loss_2": 1756.4138244628907, "kl_loss_4": 955.092416381836, "kl_loss_9": 280.5437271118164, "learning_rate": 5.927331827620902e-05, "loss": 761.8184, "step": 8450 }, { "ce_loss_13": 3.3196855664253233, "ce_loss_17": 3.281851589679718, "ce_loss_2": 4.05379341840744, "ce_loss_4": 3.686984992027283, "ce_loss_9": 3.4138710618019106, "epoch": 0.846, "grad_norm": 776.0, "kl_loss_13": 69.6445505142212, "kl_loss_2": 1699.683575439453, "kl_loss_4": 928.4897003173828, "kl_loss_9": 270.78993072509763, "learning_rate": 5.852620357053651e-05, "loss": 758.1724, "step": 8460 }, { "ce_loss_13": 3.3563570737838746, "ce_loss_17": 3.3188074350357057, "ce_loss_2": 4.095889627933502, "ce_loss_4": 3.7254838228225706, "ce_loss_9": 3.4483654141426086, "epoch": 0.847, "grad_norm": 980.0, "kl_loss_13": 69.58336372375489, "kl_loss_2": 1713.8079406738282, "kl_loss_4": 931.7415771484375, "kl_loss_9": 271.48472900390624, "learning_rate": 5.778353450109286e-05, "loss": 753.4014, "step": 8470 }, { "ce_loss_13": 3.3939215540885925, "ce_loss_17": 3.3531723380088807, "ce_loss_2": 4.165327072143555, "ce_loss_4": 3.7743943333625793, "ce_loss_9": 3.4884645462036135, "epoch": 0.848, "grad_norm": 932.0, "kl_loss_13": 72.31379737854004, "kl_loss_2": 1771.1787109375, "kl_loss_4": 953.3261779785156, "kl_loss_9": 278.8832099914551, "learning_rate": 5.7045318546547206e-05, "loss": 763.3915, "step": 8480 }, { "ce_loss_13": 3.2871436715126037, "ce_loss_17": 3.2488813638687133, "ce_loss_2": 4.066386353969574, "ce_loss_4": 3.6715317487716677, "ce_loss_9": 3.381435012817383, "epoch": 0.849, "grad_norm": 900.0, "kl_loss_13": 71.21525192260742, "kl_loss_2": 1788.6527099609375, "kl_loss_4": 959.9057922363281, "kl_loss_9": 273.3738731384277, "learning_rate": 5.631156314072605e-05, "loss": 761.9224, "step": 8490 }, { "ce_loss_13": 3.310305631160736, "ce_loss_17": 3.2725605964660645, "ce_loss_2": 4.052412235736847, "ce_loss_4": 3.6782548785209657, "ce_loss_9": 3.400036323070526, "epoch": 0.85, "grad_norm": 940.0, "kl_loss_13": 70.49066848754883, "kl_loss_2": 1720.9379455566407, "kl_loss_4": 929.7799072265625, "kl_loss_9": 269.5110496520996, "learning_rate": 5.5582275672538315e-05, "loss": 752.5031, "step": 8500 }, { "ce_loss_13": 3.2241469621658325, "ce_loss_17": 3.1831390619277955, "ce_loss_2": 4.047346830368042, "ce_loss_4": 3.6304913997650146, "ce_loss_9": 3.3220279693603514, "epoch": 0.851, "grad_norm": 752.0, "kl_loss_13": 72.84465465545654, "kl_loss_2": 1857.0891418457031, "kl_loss_4": 999.8686706542969, "kl_loss_9": 281.9967445373535, "learning_rate": 5.4857463485900484e-05, "loss": 786.0194, "step": 8510 }, { "ce_loss_13": 3.2842156052589417, "ce_loss_17": 3.2467063069343567, "ce_loss_2": 4.037990629673004, "ce_loss_4": 3.6584272980690002, "ce_loss_9": 3.3805489897727967, "epoch": 0.852, "grad_norm": 1232.0, "kl_loss_13": 69.57874660491943, "kl_loss_2": 1749.3746459960937, "kl_loss_4": 943.728482055664, "kl_loss_9": 272.5829315185547, "learning_rate": 5.413713387966329e-05, "loss": 759.3067, "step": 8520 }, { "ce_loss_13": 3.2964441180229187, "ce_loss_17": 3.2589587688446047, "ce_loss_2": 4.080042326450348, "ce_loss_4": 3.6818908572196962, "ce_loss_9": 3.3896549224853514, "epoch": 0.853, "grad_norm": 1216.0, "kl_loss_13": 72.38710823059083, "kl_loss_2": 1792.3215270996093, "kl_loss_4": 959.0531524658203, "kl_loss_9": 274.66367111206057, "learning_rate": 5.34212941075381e-05, "loss": 770.6358, "step": 8530 }, { "ce_loss_13": 3.307350420951843, "ce_loss_17": 3.2698061943054197, "ce_loss_2": 4.054671609401703, "ce_loss_4": 3.672791314125061, "ce_loss_9": 3.392826998233795, "epoch": 0.854, "grad_norm": 728.0, "kl_loss_13": 69.6476016998291, "kl_loss_2": 1730.8017272949219, "kl_loss_4": 920.2828186035156, "kl_loss_9": 264.1802864074707, "learning_rate": 5.270995137802315e-05, "loss": 754.1865, "step": 8540 }, { "ce_loss_13": 3.243489348888397, "ce_loss_17": 3.2078842401504515, "ce_loss_2": 4.012664020061493, "ce_loss_4": 3.621446192264557, "ce_loss_9": 3.3361738204956053, "epoch": 0.855, "grad_norm": 736.0, "kl_loss_13": 68.58904209136963, "kl_loss_2": 1769.7939025878907, "kl_loss_4": 945.7029113769531, "kl_loss_9": 271.47595977783203, "learning_rate": 5.2003112854332125e-05, "loss": 767.745, "step": 8550 }, { "ce_loss_13": 3.2445086240768433, "ce_loss_17": 3.206673777103424, "ce_loss_2": 4.001877117156982, "ce_loss_4": 3.617502176761627, "ce_loss_9": 3.334064221382141, "epoch": 0.856, "grad_norm": 752.0, "kl_loss_13": 69.51929016113282, "kl_loss_2": 1765.8089904785156, "kl_loss_4": 951.4055114746094, "kl_loss_9": 270.62868423461913, "learning_rate": 5.130078565432089e-05, "loss": 750.355, "step": 8560 }, { "ce_loss_13": 3.312978744506836, "ce_loss_17": 3.2759904861450195, "ce_loss_2": 4.05236177444458, "ce_loss_4": 3.6772449254989623, "ce_loss_9": 3.400533843040466, "epoch": 0.857, "grad_norm": 1184.0, "kl_loss_13": 69.48748779296875, "kl_loss_2": 1731.421649169922, "kl_loss_4": 937.3368255615235, "kl_loss_9": 268.3721839904785, "learning_rate": 5.060297685041659e-05, "loss": 745.1645, "step": 8570 }, { "ce_loss_13": 3.2397432565689086, "ce_loss_17": 3.2003438830375672, "ce_loss_2": 4.030338478088379, "ce_loss_4": 3.628654670715332, "ce_loss_9": 3.3365538835525514, "epoch": 0.858, "grad_norm": 812.0, "kl_loss_13": 72.3453441619873, "kl_loss_2": 1798.8057678222656, "kl_loss_4": 965.1929901123046, "kl_loss_9": 279.8908744812012, "learning_rate": 4.99096934695461e-05, "loss": 778.915, "step": 8580 }, { "ce_loss_13": 3.3056263446807863, "ce_loss_17": 3.266205370426178, "ce_loss_2": 4.073195433616638, "ce_loss_4": 3.687685859203339, "ce_loss_9": 3.395856535434723, "epoch": 0.859, "grad_norm": 584.0, "kl_loss_13": 70.5243480682373, "kl_loss_2": 1767.046942138672, "kl_loss_4": 954.3834838867188, "kl_loss_9": 272.76340255737307, "learning_rate": 4.922094249306558e-05, "loss": 754.2901, "step": 8590 }, { "ce_loss_13": 3.3336752533912657, "ce_loss_17": 3.294740152359009, "ce_loss_2": 4.099012005329132, "ce_loss_4": 3.713496136665344, "ce_loss_9": 3.427042770385742, "epoch": 0.86, "grad_norm": 1000.0, "kl_loss_13": 72.1313404083252, "kl_loss_2": 1759.39765625, "kl_loss_4": 944.4590118408203, "kl_loss_9": 275.7927375793457, "learning_rate": 4.853673085668947e-05, "loss": 750.4285, "step": 8600 }, { "ce_loss_13": 3.351103699207306, "ce_loss_17": 3.312099051475525, "ce_loss_2": 4.120914161205292, "ce_loss_4": 3.7291797757148744, "ce_loss_9": 3.4441885232925413, "epoch": 0.861, "grad_norm": 824.0, "kl_loss_13": 71.3762767791748, "kl_loss_2": 1763.5246337890626, "kl_loss_4": 944.3128356933594, "kl_loss_9": 272.16346435546876, "learning_rate": 4.78570654504214e-05, "loss": 764.83, "step": 8610 }, { "ce_loss_13": 3.2950334191322326, "ce_loss_17": 3.2570829153060914, "ce_loss_2": 4.068877625465393, "ce_loss_4": 3.6807502269744874, "ce_loss_9": 3.386111795902252, "epoch": 0.862, "grad_norm": 800.0, "kl_loss_13": 70.43105850219726, "kl_loss_2": 1788.814599609375, "kl_loss_4": 968.8083038330078, "kl_loss_9": 274.6896087646484, "learning_rate": 4.7181953118484556e-05, "loss": 771.1818, "step": 8620 }, { "ce_loss_13": 3.3184030055999756, "ce_loss_17": 3.2805970191955565, "ce_loss_2": 4.080092549324036, "ce_loss_4": 3.693860375881195, "ce_loss_9": 3.4120346784591673, "epoch": 0.863, "grad_norm": 848.0, "kl_loss_13": 70.71467151641846, "kl_loss_2": 1714.6408752441407, "kl_loss_4": 927.5413970947266, "kl_loss_9": 269.1174514770508, "learning_rate": 4.651140065925269e-05, "loss": 769.4998, "step": 8630 }, { "ce_loss_13": 3.2543252825737, "ce_loss_17": 3.2153276324272158, "ce_loss_2": 4.0182312488555905, "ce_loss_4": 3.625260126590729, "ce_loss_9": 3.346437680721283, "epoch": 0.864, "grad_norm": 820.0, "kl_loss_13": 70.5960153579712, "kl_loss_2": 1772.5950439453125, "kl_loss_4": 949.2362548828125, "kl_loss_9": 272.66626968383787, "learning_rate": 4.58454148251814e-05, "loss": 770.8148, "step": 8640 }, { "ce_loss_13": 3.2696855068206787, "ce_loss_17": 3.229102146625519, "ce_loss_2": 4.069872319698334, "ce_loss_4": 3.668764066696167, "ce_loss_9": 3.3631327509880067, "epoch": 0.865, "grad_norm": 868.0, "kl_loss_13": 70.95263595581055, "kl_loss_2": 1812.3966369628906, "kl_loss_4": 973.0469512939453, "kl_loss_9": 273.66270751953124, "learning_rate": 4.518400232274078e-05, "loss": 767.7344, "step": 8650 }, { "ce_loss_13": 3.2935158729553224, "ce_loss_17": 3.253829777240753, "ce_loss_2": 4.04986002445221, "ce_loss_4": 3.6717586398124693, "ce_loss_9": 3.38398414850235, "epoch": 0.866, "grad_norm": 800.0, "kl_loss_13": 71.67686386108399, "kl_loss_2": 1741.0731506347656, "kl_loss_4": 944.9275756835938, "kl_loss_9": 272.5065689086914, "learning_rate": 4.452716981234745e-05, "loss": 742.4328, "step": 8660 }, { "ce_loss_13": 3.267419683933258, "ce_loss_17": 3.2313178896903993, "ce_loss_2": 4.023157560825348, "ce_loss_4": 3.637619066238403, "ce_loss_9": 3.358725380897522, "epoch": 0.867, "grad_norm": 860.0, "kl_loss_13": 68.30587711334229, "kl_loss_2": 1740.3493225097657, "kl_loss_4": 934.4726257324219, "kl_loss_9": 267.2579002380371, "learning_rate": 4.3874923908297335e-05, "loss": 746.8162, "step": 8670 }, { "ce_loss_13": 3.31860488653183, "ce_loss_17": 3.2785238146781923, "ce_loss_2": 4.093624210357666, "ce_loss_4": 3.705119812488556, "ce_loss_9": 3.413517880439758, "epoch": 0.868, "grad_norm": 1016.0, "kl_loss_13": 72.45567092895507, "kl_loss_2": 1782.8880004882812, "kl_loss_4": 965.7743316650391, "kl_loss_9": 274.9748229980469, "learning_rate": 4.322727117869951e-05, "loss": 764.1808, "step": 8680 }, { "ce_loss_13": 3.3246540427207947, "ce_loss_17": 3.285372281074524, "ce_loss_2": 4.089873361587524, "ce_loss_4": 3.703606963157654, "ce_loss_9": 3.4189262986183167, "epoch": 0.869, "grad_norm": 892.0, "kl_loss_13": 70.91092338562012, "kl_loss_2": 1777.360284423828, "kl_loss_4": 954.4211364746094, "kl_loss_9": 273.534854888916, "learning_rate": 4.2584218145409916e-05, "loss": 760.0275, "step": 8690 }, { "ce_loss_13": 3.3691744565963746, "ce_loss_17": 3.3325124382972717, "ce_loss_2": 4.093606245517731, "ce_loss_4": 3.727621281147003, "ce_loss_9": 3.458368957042694, "epoch": 0.87, "grad_norm": 840.0, "kl_loss_13": 70.12667446136474, "kl_loss_2": 1697.2558349609376, "kl_loss_4": 926.7784637451172, "kl_loss_9": 266.85135955810546, "learning_rate": 4.194577128396521e-05, "loss": 742.6759, "step": 8700 }, { "ce_loss_13": 3.251688504219055, "ce_loss_17": 3.213710355758667, "ce_loss_2": 4.014662778377533, "ce_loss_4": 3.626564681529999, "ce_loss_9": 3.3434009909629823, "epoch": 0.871, "grad_norm": 768.0, "kl_loss_13": 69.05314979553222, "kl_loss_2": 1765.7520751953125, "kl_loss_4": 944.2459381103515, "kl_loss_9": 267.29578399658203, "learning_rate": 4.1311937023518264e-05, "loss": 768.9531, "step": 8710 }, { "ce_loss_13": 3.27214492559433, "ce_loss_17": 3.2352864265441896, "ce_loss_2": 4.068167233467102, "ce_loss_4": 3.640877389907837, "ce_loss_9": 3.360073173046112, "epoch": 0.872, "grad_norm": 796.0, "kl_loss_13": 68.70249729156494, "kl_loss_2": 1826.7659301757812, "kl_loss_4": 930.3711578369141, "kl_loss_9": 261.41344299316404, "learning_rate": 4.0682721746773344e-05, "loss": 761.2753, "step": 8720 }, { "ce_loss_13": 3.1379726767539977, "ce_loss_17": 3.099522340297699, "ce_loss_2": 3.9501651883125306, "ce_loss_4": 3.5429770708084107, "ce_loss_9": 3.2354901313781737, "epoch": 0.873, "grad_norm": 1208.0, "kl_loss_13": 68.82833328247071, "kl_loss_2": 1811.7914794921876, "kl_loss_4": 972.6370147705078, "kl_loss_9": 273.3907341003418, "learning_rate": 4.0058131789920904e-05, "loss": 757.3973, "step": 8730 }, { "ce_loss_13": 3.289105761051178, "ce_loss_17": 3.2499382734298705, "ce_loss_2": 4.047884678840637, "ce_loss_4": 3.6669172763824465, "ce_loss_9": 3.3774807810783387, "epoch": 0.874, "grad_norm": 832.0, "kl_loss_13": 69.40928993225097, "kl_loss_2": 1764.611505126953, "kl_loss_4": 951.75166015625, "kl_loss_9": 268.1239875793457, "learning_rate": 3.9438173442575e-05, "loss": 781.4809, "step": 8740 }, { "ce_loss_13": 3.320170760154724, "ce_loss_17": 3.2803778886795043, "ce_loss_2": 4.067899703979492, "ce_loss_4": 3.691530239582062, "ce_loss_9": 3.4129509568214416, "epoch": 0.875, "grad_norm": 872.0, "kl_loss_13": 69.87012176513672, "kl_loss_2": 1723.4821533203126, "kl_loss_4": 929.8530395507812, "kl_loss_9": 268.4810333251953, "learning_rate": 3.882285294770937e-05, "loss": 754.1662, "step": 8750 }, { "ce_loss_13": 3.283842885494232, "ce_loss_17": 3.245735538005829, "ce_loss_2": 4.028535318374634, "ce_loss_4": 3.652010107040405, "ce_loss_9": 3.373887574672699, "epoch": 0.876, "grad_norm": 740.0, "kl_loss_13": 70.28794174194336, "kl_loss_2": 1731.2416015625, "kl_loss_4": 935.3825469970703, "kl_loss_9": 269.52797775268556, "learning_rate": 3.821217650159453e-05, "loss": 763.3816, "step": 8760 }, { "ce_loss_13": 3.158191645145416, "ce_loss_17": 3.1189328789711, "ce_loss_2": 3.9809357166290282, "ce_loss_4": 3.5679367542266847, "ce_loss_9": 3.2566331028938293, "epoch": 0.877, "grad_norm": 1304.0, "kl_loss_13": 69.88195037841797, "kl_loss_2": 1841.284228515625, "kl_loss_4": 990.7837341308593, "kl_loss_9": 277.99508438110354, "learning_rate": 3.760615025373543e-05, "loss": 778.5645, "step": 8770 }, { "ce_loss_13": 3.330407190322876, "ce_loss_17": 3.2909754276275636, "ce_loss_2": 4.113665688037872, "ce_loss_4": 3.7197707295417786, "ce_loss_9": 3.427734637260437, "epoch": 0.878, "grad_norm": 980.0, "kl_loss_13": 73.42694778442383, "kl_loss_2": 1789.1897705078125, "kl_loss_4": 958.443881225586, "kl_loss_9": 277.80702514648436, "learning_rate": 3.700478030680987e-05, "loss": 777.7257, "step": 8780 }, { "ce_loss_13": 3.3253095865249636, "ce_loss_17": 3.287730133533478, "ce_loss_2": 4.088598692417145, "ce_loss_4": 3.6988863945007324, "ce_loss_9": 3.4193095088005068, "epoch": 0.879, "grad_norm": 772.0, "kl_loss_13": 70.39273853302002, "kl_loss_2": 1752.1946899414063, "kl_loss_4": 939.2807342529297, "kl_loss_9": 270.5202178955078, "learning_rate": 3.6408072716606344e-05, "loss": 755.5979, "step": 8790 }, { "ce_loss_13": 3.2510732650756835, "ce_loss_17": 3.2137936115264893, "ce_loss_2": 4.049818813800812, "ce_loss_4": 3.6421456575393676, "ce_loss_9": 3.3426763296127318, "epoch": 0.88, "grad_norm": 720.0, "kl_loss_13": 70.29140071868896, "kl_loss_2": 1819.5076477050782, "kl_loss_4": 972.0310791015625, "kl_loss_9": 276.21380920410155, "learning_rate": 3.5816033491963716e-05, "loss": 787.8986, "step": 8800 }, { "ce_loss_13": 3.1142725348472595, "ce_loss_17": 3.075247824192047, "ce_loss_2": 3.9192667841911315, "ce_loss_4": 3.5038288831710815, "ce_loss_9": 3.2087692499160765, "epoch": 0.881, "grad_norm": 636.0, "kl_loss_13": 69.20652503967285, "kl_loss_2": 1830.6102783203125, "kl_loss_4": 967.6713928222656, "kl_loss_9": 269.281755065918, "learning_rate": 3.522866859471047e-05, "loss": 767.3122, "step": 8810 }, { "ce_loss_13": 3.34773451089859, "ce_loss_17": 3.3099034905433653, "ce_loss_2": 4.070950448513031, "ce_loss_4": 3.7069722771644593, "ce_loss_9": 3.4367212653160095, "epoch": 0.882, "grad_norm": 968.0, "kl_loss_13": 68.84079093933106, "kl_loss_2": 1672.555108642578, "kl_loss_4": 900.7840759277344, "kl_loss_9": 262.96395416259764, "learning_rate": 3.46459839396045e-05, "loss": 743.6263, "step": 8820 }, { "ce_loss_13": 3.26260107755661, "ce_loss_17": 3.223562455177307, "ce_loss_2": 4.047707068920135, "ce_loss_4": 3.6520734786987306, "ce_loss_9": 3.356485903263092, "epoch": 0.883, "grad_norm": 808.0, "kl_loss_13": 70.78400783538818, "kl_loss_2": 1764.89443359375, "kl_loss_4": 950.646841430664, "kl_loss_9": 271.79892044067384, "learning_rate": 3.406798539427386e-05, "loss": 779.9631, "step": 8830 }, { "ce_loss_13": 3.3257412910461426, "ce_loss_17": 3.287832188606262, "ce_loss_2": 4.086572694778442, "ce_loss_4": 3.70034202337265, "ce_loss_9": 3.4149993896484374, "epoch": 0.884, "grad_norm": 1040.0, "kl_loss_13": 70.3533836364746, "kl_loss_2": 1780.2422241210938, "kl_loss_4": 953.5696105957031, "kl_loss_9": 272.3615303039551, "learning_rate": 3.349467877915746e-05, "loss": 766.4402, "step": 8840 }, { "ce_loss_13": 3.285241413116455, "ce_loss_17": 3.2471628308296205, "ce_loss_2": 4.070745611190796, "ce_loss_4": 3.6734976291656496, "ce_loss_9": 3.3793052554130556, "epoch": 0.885, "grad_norm": 1120.0, "kl_loss_13": 70.29715118408203, "kl_loss_2": 1816.7627563476562, "kl_loss_4": 976.2758880615235, "kl_loss_9": 275.21948318481446, "learning_rate": 3.292606986744667e-05, "loss": 792.1923, "step": 8850 }, { "ce_loss_13": 3.2434609055519106, "ce_loss_17": 3.206607627868652, "ce_loss_2": 4.017269504070282, "ce_loss_4": 3.620222342014313, "ce_loss_9": 3.331614947319031, "epoch": 0.886, "grad_norm": 760.0, "kl_loss_13": 69.23771839141845, "kl_loss_2": 1774.5667114257812, "kl_loss_4": 956.0857360839843, "kl_loss_9": 266.13800888061525, "learning_rate": 3.23621643850267e-05, "loss": 764.0102, "step": 8860 }, { "ce_loss_13": 3.3150471806526185, "ce_loss_17": 3.2769230365753175, "ce_loss_2": 4.073702692985535, "ce_loss_4": 3.692244303226471, "ce_loss_9": 3.410197043418884, "epoch": 0.887, "grad_norm": 976.0, "kl_loss_13": 71.57453002929688, "kl_loss_2": 1763.2067504882812, "kl_loss_4": 960.6644958496094, "kl_loss_9": 275.66893005371094, "learning_rate": 3.180296801041971e-05, "loss": 753.6317, "step": 8870 }, { "ce_loss_13": 3.340101194381714, "ce_loss_17": 3.302070701122284, "ce_loss_2": 4.106904423236847, "ce_loss_4": 3.7093899130821226, "ce_loss_9": 3.431441366672516, "epoch": 0.888, "grad_norm": 628.0, "kl_loss_13": 70.40896244049073, "kl_loss_2": 1772.5249816894532, "kl_loss_4": 935.5421752929688, "kl_loss_9": 267.8436454772949, "learning_rate": 3.124848637472688e-05, "loss": 746.5057, "step": 8880 }, { "ce_loss_13": 3.160101127624512, "ce_loss_17": 3.1225696921348574, "ce_loss_2": 3.940399968624115, "ce_loss_4": 3.5493147373199463, "ce_loss_9": 3.2547220468521116, "epoch": 0.889, "grad_norm": 1008.0, "kl_loss_13": 67.81524658203125, "kl_loss_2": 1787.1769104003906, "kl_loss_4": 962.53515625, "kl_loss_9": 267.5764961242676, "learning_rate": 3.069872506157212e-05, "loss": 762.1292, "step": 8890 }, { "ce_loss_13": 3.263330328464508, "ce_loss_17": 3.2273592352867126, "ce_loss_2": 4.024621450901032, "ce_loss_4": 3.642102038860321, "ce_loss_9": 3.3568263411521913, "epoch": 0.89, "grad_norm": 804.0, "kl_loss_13": 69.50649681091309, "kl_loss_2": 1758.631689453125, "kl_loss_4": 941.1656555175781, "kl_loss_9": 271.157022857666, "learning_rate": 3.0153689607045842e-05, "loss": 756.4652, "step": 8900 }, { "ce_loss_13": 3.166585099697113, "ce_loss_17": 3.127001166343689, "ce_loss_2": 3.9992379188537597, "ce_loss_4": 3.573567008972168, "ce_loss_9": 3.266878294944763, "epoch": 0.891, "grad_norm": 996.0, "kl_loss_13": 71.30703067779541, "kl_loss_2": 1902.9365478515624, "kl_loss_4": 1013.771694946289, "kl_loss_9": 282.7529136657715, "learning_rate": 2.9613385499648926e-05, "loss": 777.2954, "step": 8910 }, { "ce_loss_13": 3.21743483543396, "ce_loss_17": 3.1798877000808714, "ce_loss_2": 3.980141830444336, "ce_loss_4": 3.600106048583984, "ce_loss_9": 3.3116451501846313, "epoch": 0.892, "grad_norm": 864.0, "kl_loss_13": 68.77742424011231, "kl_loss_2": 1738.6429748535156, "kl_loss_4": 944.1782318115235, "kl_loss_9": 268.3657096862793, "learning_rate": 2.9077818180237692e-05, "loss": 761.2621, "step": 8920 }, { "ce_loss_13": 3.260299324989319, "ce_loss_17": 3.2211027264595034, "ce_loss_2": 4.056830155849457, "ce_loss_4": 3.656483030319214, "ce_loss_9": 3.357074999809265, "epoch": 0.893, "grad_norm": 1760.0, "kl_loss_13": 69.7451015472412, "kl_loss_2": 1785.2978942871093, "kl_loss_4": 953.3751007080078, "kl_loss_9": 269.3721778869629, "learning_rate": 2.8546993041969172e-05, "loss": 762.2178, "step": 8930 }, { "ce_loss_13": 3.2998342752456664, "ce_loss_17": 3.2634523510932922, "ce_loss_2": 4.0385368943214415, "ce_loss_4": 3.671085524559021, "ce_loss_9": 3.3905824542045595, "epoch": 0.894, "grad_norm": 796.0, "kl_loss_13": 68.27731208801269, "kl_loss_2": 1728.0355590820313, "kl_loss_4": 936.2373199462891, "kl_loss_9": 267.1129722595215, "learning_rate": 2.802091543024671e-05, "loss": 759.3199, "step": 8940 }, { "ce_loss_13": 3.2947364926338194, "ce_loss_17": 3.2571501612663267, "ce_loss_2": 4.081899428367615, "ce_loss_4": 3.679962158203125, "ce_loss_9": 3.3866132140159606, "epoch": 0.895, "grad_norm": 840.0, "kl_loss_13": 70.01423645019531, "kl_loss_2": 1806.8572692871094, "kl_loss_4": 965.6742645263672, "kl_loss_9": 273.54743881225585, "learning_rate": 2.7499590642665774e-05, "loss": 785.5458, "step": 8950 }, { "ce_loss_13": 3.3065556645393372, "ce_loss_17": 3.2688744187355043, "ce_loss_2": 4.072537469863891, "ce_loss_4": 3.6804751992225646, "ce_loss_9": 3.406609535217285, "epoch": 0.896, "grad_norm": 740.0, "kl_loss_13": 74.98095989227295, "kl_loss_2": 1751.6958984375, "kl_loss_4": 938.6008728027343, "kl_loss_9": 289.8558975219727, "learning_rate": 2.6983023928961405e-05, "loss": 755.7607, "step": 8960 }, { "ce_loss_13": 3.2771018624305723, "ce_loss_17": 3.2386683106422423, "ce_loss_2": 4.048685503005982, "ce_loss_4": 3.660785710811615, "ce_loss_9": 3.3700268983840944, "epoch": 0.897, "grad_norm": 780.0, "kl_loss_13": 70.14976196289062, "kl_loss_2": 1756.1913452148438, "kl_loss_4": 947.1248443603515, "kl_loss_9": 269.67872009277346, "learning_rate": 2.6471220490954628e-05, "loss": 770.3491, "step": 8970 }, { "ce_loss_13": 3.2675058484077453, "ce_loss_17": 3.2324760437011717, "ce_loss_2": 4.026072096824646, "ce_loss_4": 3.62881281375885, "ce_loss_9": 3.354077172279358, "epoch": 0.898, "grad_norm": 840.0, "kl_loss_13": 69.91112670898437, "kl_loss_2": 1751.6456481933594, "kl_loss_4": 933.82841796875, "kl_loss_9": 269.164444732666, "learning_rate": 2.596418548250029e-05, "loss": 763.478, "step": 8980 }, { "ce_loss_13": 3.3039443254470826, "ce_loss_17": 3.265969121456146, "ce_loss_2": 4.057813549041748, "ce_loss_4": 3.682532238960266, "ce_loss_9": 3.398101258277893, "epoch": 0.899, "grad_norm": 828.0, "kl_loss_13": 71.41869773864747, "kl_loss_2": 1756.7818115234375, "kl_loss_4": 951.8295043945312, "kl_loss_9": 274.06252059936526, "learning_rate": 2.5461924009435368e-05, "loss": 753.8367, "step": 8990 }, { "ce_loss_13": 3.298236906528473, "ce_loss_17": 3.2587198138237, "ce_loss_2": 4.062357556819916, "ce_loss_4": 3.6736649870872498, "ce_loss_9": 3.390058147907257, "epoch": 0.9, "grad_norm": 880.0, "kl_loss_13": 71.20307960510254, "kl_loss_2": 1756.3238525390625, "kl_loss_4": 942.694546508789, "kl_loss_9": 271.4934455871582, "learning_rate": 2.4964441129527336e-05, "loss": 775.5805, "step": 9000 }, { "ce_loss_13": 3.3012996196746824, "ce_loss_17": 3.2620099425315856, "ce_loss_2": 4.042165410518646, "ce_loss_4": 3.6681466937065124, "ce_loss_9": 3.3898880243301392, "epoch": 0.901, "grad_norm": 1160.0, "kl_loss_13": 69.23692741394044, "kl_loss_2": 1715.5816650390625, "kl_loss_4": 922.6554473876953, "kl_loss_9": 265.27770233154297, "learning_rate": 2.4471741852423235e-05, "loss": 747.8052, "step": 9010 }, { "ce_loss_13": 3.346455466747284, "ce_loss_17": 3.3067072510719298, "ce_loss_2": 4.1050170183181764, "ce_loss_4": 3.7224635004997255, "ce_loss_9": 3.4403756499290465, "epoch": 0.902, "grad_norm": 736.0, "kl_loss_13": 70.2386646270752, "kl_loss_2": 1725.9721374511719, "kl_loss_4": 930.9395690917969, "kl_loss_9": 270.36619567871094, "learning_rate": 2.3983831139599287e-05, "loss": 754.2608, "step": 9020 }, { "ce_loss_13": 3.2672984123229982, "ce_loss_17": 3.229282486438751, "ce_loss_2": 4.023867189884186, "ce_loss_4": 3.6345030784606935, "ce_loss_9": 3.3552754759788512, "epoch": 0.903, "grad_norm": 648.0, "kl_loss_13": 68.81279144287109, "kl_loss_2": 1725.340350341797, "kl_loss_4": 918.9158569335938, "kl_loss_9": 264.1955795288086, "learning_rate": 2.3500713904311022e-05, "loss": 737.7361, "step": 9030 }, { "ce_loss_13": 3.3072561860084533, "ce_loss_17": 3.269752490520477, "ce_loss_2": 4.04286288022995, "ce_loss_4": 3.6655190706253054, "ce_loss_9": 3.3957794904708862, "epoch": 0.904, "grad_norm": 984.0, "kl_loss_13": 69.29099216461182, "kl_loss_2": 1694.4139587402344, "kl_loss_4": 911.9616882324219, "kl_loss_9": 261.82068862915037, "learning_rate": 2.3022395011543685e-05, "loss": 739.5109, "step": 9040 }, { "ce_loss_13": 3.3339088559150696, "ce_loss_17": 3.2950241684913637, "ce_loss_2": 4.0952001214027405, "ce_loss_4": 3.714173400402069, "ce_loss_9": 3.4319105625152586, "epoch": 0.905, "grad_norm": 912.0, "kl_loss_13": 71.4954969406128, "kl_loss_2": 1753.8956970214845, "kl_loss_4": 951.8698364257813, "kl_loss_9": 276.7628715515137, "learning_rate": 2.2548879277963063e-05, "loss": 773.6446, "step": 9050 }, { "ce_loss_13": 3.253114938735962, "ce_loss_17": 3.215239441394806, "ce_loss_2": 4.0072418570518495, "ce_loss_4": 3.6237417459487915, "ce_loss_9": 3.342426073551178, "epoch": 0.906, "grad_norm": 828.0, "kl_loss_13": 69.18032398223878, "kl_loss_2": 1732.9671569824218, "kl_loss_4": 933.28525390625, "kl_loss_9": 266.85338287353517, "learning_rate": 2.208017147186736e-05, "loss": 737.7998, "step": 9060 }, { "ce_loss_13": 3.2505879163742066, "ce_loss_17": 3.211529290676117, "ce_loss_2": 4.008640742301941, "ce_loss_4": 3.6223735451698302, "ce_loss_9": 3.342756152153015, "epoch": 0.907, "grad_norm": 936.0, "kl_loss_13": 69.45951023101807, "kl_loss_2": 1758.497900390625, "kl_loss_4": 942.9026062011719, "kl_loss_9": 268.5462837219238, "learning_rate": 2.1616276313139227e-05, "loss": 751.1222, "step": 9070 }, { "ce_loss_13": 3.28677442073822, "ce_loss_17": 3.248307991027832, "ce_loss_2": 4.057864952087402, "ce_loss_4": 3.6663498282432556, "ce_loss_9": 3.3798938751220704, "epoch": 0.908, "grad_norm": 844.0, "kl_loss_13": 69.54161643981934, "kl_loss_2": 1760.1901550292969, "kl_loss_4": 946.1911895751953, "kl_loss_9": 270.7872001647949, "learning_rate": 2.1157198473197415e-05, "loss": 767.2781, "step": 9080 }, { "ce_loss_13": 3.347646725177765, "ce_loss_17": 3.3092511177062987, "ce_loss_2": 4.1128313660621645, "ce_loss_4": 3.7317168712615967, "ce_loss_9": 3.4412038564682006, "epoch": 0.909, "grad_norm": 1024.0, "kl_loss_13": 71.05716209411621, "kl_loss_2": 1744.008056640625, "kl_loss_4": 947.9106719970703, "kl_loss_9": 274.9560401916504, "learning_rate": 2.0702942574950812e-05, "loss": 759.0129, "step": 9090 }, { "ce_loss_13": 3.2757676362991335, "ce_loss_17": 3.2370603919029235, "ce_loss_2": 4.05085917711258, "ce_loss_4": 3.658015692234039, "ce_loss_9": 3.3714335680007936, "epoch": 0.91, "grad_norm": 828.0, "kl_loss_13": 70.6139980316162, "kl_loss_2": 1773.1221130371093, "kl_loss_4": 956.3177947998047, "kl_loss_9": 274.6713508605957, "learning_rate": 2.025351319275137e-05, "loss": 763.2487, "step": 9100 }, { "ce_loss_13": 3.3982797265052795, "ce_loss_17": 3.359361159801483, "ce_loss_2": 4.155125546455383, "ce_loss_4": 3.780303680896759, "ce_loss_9": 3.4923812985420226, "epoch": 0.911, "grad_norm": 772.0, "kl_loss_13": 73.26307582855225, "kl_loss_2": 1774.17333984375, "kl_loss_4": 974.4539184570312, "kl_loss_9": 282.6258262634277, "learning_rate": 1.9808914852347816e-05, "loss": 784.291, "step": 9110 }, { "ce_loss_13": 3.2470030188560486, "ce_loss_17": 3.208009135723114, "ce_loss_2": 4.013169312477112, "ce_loss_4": 3.6329050064086914, "ce_loss_9": 3.342044270038605, "epoch": 0.912, "grad_norm": 940.0, "kl_loss_13": 69.41524143218994, "kl_loss_2": 1744.8079162597655, "kl_loss_4": 952.8638854980469, "kl_loss_9": 271.1719367980957, "learning_rate": 1.9369152030840554e-05, "loss": 755.5211, "step": 9120 }, { "ce_loss_13": 3.328555727005005, "ce_loss_17": 3.291154706478119, "ce_loss_2": 4.090105664730072, "ce_loss_4": 3.709492301940918, "ce_loss_9": 3.4200507760047913, "epoch": 0.913, "grad_norm": 792.0, "kl_loss_13": 71.5401647567749, "kl_loss_2": 1769.6208740234374, "kl_loss_4": 956.116665649414, "kl_loss_9": 270.7011413574219, "learning_rate": 1.893422915663645e-05, "loss": 761.0888, "step": 9130 }, { "ce_loss_13": 3.197735035419464, "ce_loss_17": 3.158833086490631, "ce_loss_2": 4.001493084430694, "ce_loss_4": 3.5969720602035524, "ce_loss_9": 3.2945626020431518, "epoch": 0.914, "grad_norm": 880.0, "kl_loss_13": 69.98915824890136, "kl_loss_2": 1819.2923645019532, "kl_loss_4": 977.3836547851563, "kl_loss_9": 276.6566581726074, "learning_rate": 1.850415060940386e-05, "loss": 776.1738, "step": 9140 }, { "ce_loss_13": 3.3226425528526304, "ce_loss_17": 3.2849931478500367, "ce_loss_2": 4.062166357040406, "ce_loss_4": 3.6929720759391786, "ce_loss_9": 3.4111554622650146, "epoch": 0.915, "grad_norm": 928.0, "kl_loss_13": 70.22557373046875, "kl_loss_2": 1711.80859375, "kl_loss_4": 938.1763580322265, "kl_loss_9": 269.89337692260744, "learning_rate": 1.8078920720028978e-05, "loss": 756.1047, "step": 9150 }, { "ce_loss_13": 3.2460474491119387, "ce_loss_17": 3.210865044593811, "ce_loss_2": 3.9874136805534364, "ce_loss_4": 3.6203092336654663, "ce_loss_9": 3.337096083164215, "epoch": 0.916, "grad_norm": 944.0, "kl_loss_13": 68.0163013458252, "kl_loss_2": 1707.0913818359375, "kl_loss_4": 931.6345520019531, "kl_loss_9": 263.9716209411621, "learning_rate": 1.765854377057219e-05, "loss": 761.1198, "step": 9160 }, { "ce_loss_13": 3.2273457169532778, "ce_loss_17": 3.1899821162223816, "ce_loss_2": 3.9820481896400453, "ce_loss_4": 3.595924234390259, "ce_loss_9": 3.3174880385398864, "epoch": 0.917, "grad_norm": 764.0, "kl_loss_13": 68.14966316223145, "kl_loss_2": 1748.0338256835937, "kl_loss_4": 931.5319671630859, "kl_loss_9": 263.98573303222656, "learning_rate": 1.724302399422456e-05, "loss": 756.9239, "step": 9170 }, { "ce_loss_13": 3.188059365749359, "ce_loss_17": 3.150203537940979, "ce_loss_2": 3.963750922679901, "ce_loss_4": 3.5690385460853578, "ce_loss_9": 3.2794461131095884, "epoch": 0.918, "grad_norm": 944.0, "kl_loss_13": 70.40487995147706, "kl_loss_2": 1777.5741821289062, "kl_loss_4": 961.4688934326172, "kl_loss_9": 273.7363990783691, "learning_rate": 1.683236557526574e-05, "loss": 767.8744, "step": 9180 }, { "ce_loss_13": 3.3032285809516906, "ce_loss_17": 3.2665912747383117, "ce_loss_2": 4.032867658138275, "ce_loss_4": 3.657480251789093, "ce_loss_9": 3.390727710723877, "epoch": 0.919, "grad_norm": 740.0, "kl_loss_13": 68.06188373565674, "kl_loss_2": 1685.8097534179688, "kl_loss_4": 901.4606018066406, "kl_loss_9": 260.56498565673826, "learning_rate": 1.6426572649021475e-05, "loss": 749.3797, "step": 9190 }, { "ce_loss_13": 3.3341614603996277, "ce_loss_17": 3.296714127063751, "ce_loss_2": 4.0532737374305725, "ce_loss_4": 3.684324491024017, "ce_loss_9": 3.423136281967163, "epoch": 0.92, "grad_norm": 1004.0, "kl_loss_13": 71.2539436340332, "kl_loss_2": 1687.29228515625, "kl_loss_4": 912.562939453125, "kl_loss_9": 266.74498748779297, "learning_rate": 1.6025649301821876e-05, "loss": 744.0132, "step": 9200 }, { "ce_loss_13": 3.322655665874481, "ce_loss_17": 3.2867024898529054, "ce_loss_2": 4.051113891601562, "ce_loss_4": 3.688315272331238, "ce_loss_9": 3.4139885306358337, "epoch": 0.921, "grad_norm": 1072.0, "kl_loss_13": 69.86746578216552, "kl_loss_2": 1715.9775390625, "kl_loss_4": 933.6245697021484, "kl_loss_9": 271.504940032959, "learning_rate": 1.5629599570960716e-05, "loss": 748.0333, "step": 9210 }, { "ce_loss_13": 3.2324368953704834, "ce_loss_17": 3.1948946714401245, "ce_loss_2": 3.9962414503097534, "ce_loss_4": 3.605444300174713, "ce_loss_9": 3.3203277349472047, "epoch": 0.922, "grad_norm": 888.0, "kl_loss_13": 68.84123592376709, "kl_loss_2": 1762.1533569335938, "kl_loss_4": 940.2346588134766, "kl_loss_9": 268.61144332885743, "learning_rate": 1.5238427444654367e-05, "loss": 755.0762, "step": 9220 }, { "ce_loss_13": 3.2893491268157957, "ce_loss_17": 3.2509705901145933, "ce_loss_2": 4.038589036464691, "ce_loss_4": 3.6586158990859987, "ce_loss_9": 3.3821335554122927, "epoch": 0.923, "grad_norm": 1056.0, "kl_loss_13": 69.26623306274413, "kl_loss_2": 1720.7350646972657, "kl_loss_4": 922.943701171875, "kl_loss_9": 264.8475883483887, "learning_rate": 1.4852136862001764e-05, "loss": 747.6942, "step": 9230 }, { "ce_loss_13": 3.252811241149902, "ce_loss_17": 3.216975140571594, "ce_loss_2": 3.999893867969513, "ce_loss_4": 3.624435234069824, "ce_loss_9": 3.346466529369354, "epoch": 0.924, "grad_norm": 724.0, "kl_loss_13": 67.6477201461792, "kl_loss_2": 1716.8084045410155, "kl_loss_4": 932.0351043701172, "kl_loss_9": 264.6703620910645, "learning_rate": 1.4470731712944884e-05, "loss": 758.1266, "step": 9240 }, { "ce_loss_13": 3.2765080451965334, "ce_loss_17": 3.239612317085266, "ce_loss_2": 4.037005198001862, "ce_loss_4": 3.6546990990638735, "ce_loss_9": 3.3715944051742555, "epoch": 0.925, "grad_norm": 772.0, "kl_loss_13": 69.63426055908204, "kl_loss_2": 1746.720361328125, "kl_loss_4": 935.8729522705078, "kl_loss_9": 272.0946846008301, "learning_rate": 1.4094215838229174e-05, "loss": 772.5833, "step": 9250 }, { "ce_loss_13": 3.247589886188507, "ce_loss_17": 3.2106730580329894, "ce_loss_2": 4.026830673217773, "ce_loss_4": 3.631477081775665, "ce_loss_9": 3.3435757398605346, "epoch": 0.926, "grad_norm": 1000.0, "kl_loss_13": 69.61430225372314, "kl_loss_2": 1786.5716674804687, "kl_loss_4": 955.6620147705078, "kl_loss_9": 273.12003631591796, "learning_rate": 1.372259302936546e-05, "loss": 789.6091, "step": 9260 }, { "ce_loss_13": 3.354861545562744, "ce_loss_17": 3.3138466238975526, "ce_loss_2": 4.110436654090881, "ce_loss_4": 3.7305679082870484, "ce_loss_9": 3.4497798085212708, "epoch": 0.927, "grad_norm": 776.0, "kl_loss_13": 72.69700546264649, "kl_loss_2": 1737.003369140625, "kl_loss_4": 939.8022277832031, "kl_loss_9": 276.0119041442871, "learning_rate": 1.3355867028591206e-05, "loss": 750.552, "step": 9270 }, { "ce_loss_13": 3.26385555267334, "ce_loss_17": 3.2263644456863405, "ce_loss_2": 3.9958433508872986, "ce_loss_4": 3.6246315956115724, "ce_loss_9": 3.3551544904708863, "epoch": 0.928, "grad_norm": 776.0, "kl_loss_13": 68.76478576660156, "kl_loss_2": 1707.2232849121094, "kl_loss_4": 926.3538391113282, "kl_loss_9": 266.91420669555663, "learning_rate": 1.2994041528833267e-05, "loss": 746.6288, "step": 9280 }, { "ce_loss_13": 3.2609859108924866, "ce_loss_17": 3.222886061668396, "ce_loss_2": 4.019745421409607, "ce_loss_4": 3.6354535818099976, "ce_loss_9": 3.3499014139175416, "epoch": 0.929, "grad_norm": 780.0, "kl_loss_13": 68.63880157470703, "kl_loss_2": 1757.2555541992188, "kl_loss_4": 940.3133087158203, "kl_loss_9": 266.6288612365723, "learning_rate": 1.2637120173670358e-05, "loss": 751.512, "step": 9290 }, { "ce_loss_13": 3.2835487365722655, "ce_loss_17": 3.244655930995941, "ce_loss_2": 4.051596641540527, "ce_loss_4": 3.6634366631507875, "ce_loss_9": 3.378000283241272, "epoch": 0.93, "grad_norm": 1296.0, "kl_loss_13": 70.42365875244141, "kl_loss_2": 1761.8828002929688, "kl_loss_4": 952.9559265136719, "kl_loss_9": 273.18845825195314, "learning_rate": 1.2285106557296478e-05, "loss": 757.7055, "step": 9300 }, { "ce_loss_13": 3.1630964636802674, "ce_loss_17": 3.1254794001579285, "ce_loss_2": 3.98508734703064, "ce_loss_4": 3.562894332408905, "ce_loss_9": 3.257805061340332, "epoch": 0.931, "grad_norm": 780.0, "kl_loss_13": 69.35065059661865, "kl_loss_2": 1842.6054992675781, "kl_loss_4": 977.738735961914, "kl_loss_9": 272.13949966430664, "learning_rate": 1.1938004224484989e-05, "loss": 771.8602, "step": 9310 }, { "ce_loss_13": 3.3944175481796264, "ce_loss_17": 3.353794741630554, "ce_loss_2": 4.1406211972236635, "ce_loss_4": 3.7649981141090394, "ce_loss_9": 3.4874671697616577, "epoch": 0.932, "grad_norm": 1104.0, "kl_loss_13": 72.44851455688476, "kl_loss_2": 1742.682635498047, "kl_loss_4": 943.5166168212891, "kl_loss_9": 273.04381866455077, "learning_rate": 1.1595816670552429e-05, "loss": 771.1343, "step": 9320 }, { "ce_loss_13": 3.323309564590454, "ce_loss_17": 3.2835536003112793, "ce_loss_2": 4.059554195404052, "ce_loss_4": 3.683386528491974, "ce_loss_9": 3.412449359893799, "epoch": 0.933, "grad_norm": 1000.0, "kl_loss_13": 71.7368278503418, "kl_loss_2": 1710.4193237304687, "kl_loss_4": 919.0308288574219, "kl_loss_9": 266.2962310791016, "learning_rate": 1.1258547341323699e-05, "loss": 743.8206, "step": 9330 }, { "ce_loss_13": 3.3531580924987794, "ce_loss_17": 3.3139570116996766, "ce_loss_2": 4.0956674933433534, "ce_loss_4": 3.722786843776703, "ce_loss_9": 3.443931555747986, "epoch": 0.934, "grad_norm": 680.0, "kl_loss_13": 70.99226303100586, "kl_loss_2": 1743.4335693359376, "kl_loss_4": 943.1558502197265, "kl_loss_9": 272.2658096313477, "learning_rate": 1.0926199633097156e-05, "loss": 753.9291, "step": 9340 }, { "ce_loss_13": 3.3619842648506166, "ce_loss_17": 3.3256566524505615, "ce_loss_2": 4.070989274978638, "ce_loss_4": 3.71152184009552, "ce_loss_9": 3.446810233592987, "epoch": 0.935, "grad_norm": 748.0, "kl_loss_13": 68.85320167541504, "kl_loss_2": 1681.0407958984374, "kl_loss_4": 915.6587646484375, "kl_loss_9": 264.3614471435547, "learning_rate": 1.0598776892610684e-05, "loss": 758.3277, "step": 9350 }, { "ce_loss_13": 3.174557101726532, "ce_loss_17": 3.1382050037384035, "ce_loss_2": 3.94193377494812, "ce_loss_4": 3.554804193973541, "ce_loss_9": 3.2670865774154665, "epoch": 0.936, "grad_norm": 1096.0, "kl_loss_13": 68.04401416778565, "kl_loss_2": 1760.4666748046875, "kl_loss_4": 945.3590911865234, "kl_loss_9": 266.08999099731443, "learning_rate": 1.0276282417007399e-05, "loss": 749.2618, "step": 9360 }, { "ce_loss_13": 3.3314119935035706, "ce_loss_17": 3.2932984948158266, "ce_loss_2": 4.055443024635315, "ce_loss_4": 3.685200798511505, "ce_loss_9": 3.4210570573806764, "epoch": 0.937, "grad_norm": 800.0, "kl_loss_13": 69.36395244598388, "kl_loss_2": 1693.5143676757812, "kl_loss_4": 913.9628021240235, "kl_loss_9": 265.6849952697754, "learning_rate": 9.958719453803277e-06, "loss": 748.5033, "step": 9370 }, { "ce_loss_13": 3.3231441020965575, "ce_loss_17": 3.283005452156067, "ce_loss_2": 4.085646188259124, "ce_loss_4": 3.704668116569519, "ce_loss_9": 3.4138529539108275, "epoch": 0.938, "grad_norm": 640.0, "kl_loss_13": 70.55078353881837, "kl_loss_2": 1759.910662841797, "kl_loss_4": 956.954931640625, "kl_loss_9": 271.58061294555665, "learning_rate": 9.646091200853802e-06, "loss": 754.8813, "step": 9380 }, { "ce_loss_13": 3.2818185925483703, "ce_loss_17": 3.245718610286713, "ce_loss_2": 4.025053870677948, "ce_loss_4": 3.652445447444916, "ce_loss_9": 3.370171332359314, "epoch": 0.939, "grad_norm": 1312.0, "kl_loss_13": 67.64159717559815, "kl_loss_2": 1702.7493835449218, "kl_loss_4": 921.6332214355468, "kl_loss_9": 264.55640106201173, "learning_rate": 9.338400806321978e-06, "loss": 728.2665, "step": 9390 }, { "ce_loss_13": 3.3152188658714294, "ce_loss_17": 3.275552845001221, "ce_loss_2": 4.059438633918762, "ce_loss_4": 3.6847678899765013, "ce_loss_9": 3.410316598415375, "epoch": 0.94, "grad_norm": 692.0, "kl_loss_13": 71.03326644897462, "kl_loss_2": 1715.1363952636718, "kl_loss_4": 928.6465850830078, "kl_loss_9": 271.49635848999026, "learning_rate": 9.035651368646646e-06, "loss": 743.4288, "step": 9400 }, { "ce_loss_13": 3.3191699028015136, "ce_loss_17": 3.2819036841392517, "ce_loss_2": 4.0510072112083435, "ce_loss_4": 3.6800065636634827, "ce_loss_9": 3.4086018681526182, "epoch": 0.941, "grad_norm": 768.0, "kl_loss_13": 69.9085536956787, "kl_loss_2": 1709.1678588867187, "kl_loss_4": 927.9352905273438, "kl_loss_9": 265.0817222595215, "learning_rate": 8.737845936511335e-06, "loss": 749.88, "step": 9410 }, { "ce_loss_13": 3.2696847438812258, "ce_loss_17": 3.2313279032707216, "ce_loss_2": 4.0388831973075865, "ce_loss_4": 3.6455859422683714, "ce_loss_9": 3.3634910702705385, "epoch": 0.942, "grad_norm": 740.0, "kl_loss_13": 70.54738807678223, "kl_loss_2": 1764.8729553222656, "kl_loss_4": 944.1729400634765, "kl_loss_9": 272.425553894043, "learning_rate": 8.444987508813451e-06, "loss": 753.2954, "step": 9420 }, { "ce_loss_13": 3.2250452876091003, "ce_loss_17": 3.187439298629761, "ce_loss_2": 4.013418805599213, "ce_loss_4": 3.613393557071686, "ce_loss_9": 3.32097065448761, "epoch": 0.943, "grad_norm": 932.0, "kl_loss_13": 71.29195957183838, "kl_loss_2": 1826.9091674804688, "kl_loss_4": 979.7543212890625, "kl_loss_9": 276.6710952758789, "learning_rate": 8.157079034633974e-06, "loss": 775.8956, "step": 9430 }, { "ce_loss_13": 3.2237427115440367, "ce_loss_17": 3.1853844165802, "ce_loss_2": 3.98459529876709, "ce_loss_4": 3.5957419633865357, "ce_loss_9": 3.3150734543800353, "epoch": 0.944, "grad_norm": 1120.0, "kl_loss_13": 69.64744243621826, "kl_loss_2": 1783.77939453125, "kl_loss_4": 956.6733428955079, "kl_loss_9": 270.16100692749023, "learning_rate": 7.874123413208145e-06, "loss": 758.3695, "step": 9440 }, { "ce_loss_13": 3.193096709251404, "ce_loss_17": 3.155752348899841, "ce_loss_2": 3.9789442300796507, "ce_loss_4": 3.581227695941925, "ce_loss_9": 3.2874748349189757, "epoch": 0.945, "grad_norm": 876.0, "kl_loss_13": 69.05770931243896, "kl_loss_2": 1784.2775451660157, "kl_loss_4": 953.4078979492188, "kl_loss_9": 271.03979644775393, "learning_rate": 7.59612349389599e-06, "loss": 766.0222, "step": 9450 }, { "ce_loss_13": 3.2857839107513427, "ce_loss_17": 3.248835825920105, "ce_loss_2": 4.009725487232208, "ce_loss_4": 3.6440842509269715, "ce_loss_9": 3.375727343559265, "epoch": 0.946, "grad_norm": 896.0, "kl_loss_13": 67.77833862304688, "kl_loss_2": 1678.2259338378906, "kl_loss_4": 907.5334350585938, "kl_loss_9": 261.4477348327637, "learning_rate": 7.323082076153509e-06, "loss": 742.3237, "step": 9460 }, { "ce_loss_13": 3.328112506866455, "ce_loss_17": 3.290648400783539, "ce_loss_2": 4.060686004161835, "ce_loss_4": 3.688987469673157, "ce_loss_9": 3.4178372263908385, "epoch": 0.947, "grad_norm": 852.0, "kl_loss_13": 70.47306404113769, "kl_loss_2": 1702.2720703125, "kl_loss_4": 920.1532928466797, "kl_loss_9": 269.27613067626953, "learning_rate": 7.055001909504755e-06, "loss": 757.1909, "step": 9470 }, { "ce_loss_13": 3.3599886775016783, "ce_loss_17": 3.3227712154388427, "ce_loss_2": 4.101234364509582, "ce_loss_4": 3.7277470231056213, "ce_loss_9": 3.4521798849105836, "epoch": 0.948, "grad_norm": 688.0, "kl_loss_13": 69.91639251708985, "kl_loss_2": 1721.4495849609375, "kl_loss_4": 929.4572998046875, "kl_loss_9": 271.32519454956054, "learning_rate": 6.791885693514133e-06, "loss": 753.6105, "step": 9480 }, { "ce_loss_13": 3.2632110476493836, "ce_loss_17": 3.225801682472229, "ce_loss_2": 4.036578476428986, "ce_loss_4": 3.6461342096328737, "ce_loss_9": 3.3560299634933473, "epoch": 0.949, "grad_norm": 852.0, "kl_loss_13": 70.22738361358643, "kl_loss_2": 1787.6828125, "kl_loss_4": 959.2408905029297, "kl_loss_9": 271.782169342041, "learning_rate": 6.533736077758867e-06, "loss": 769.4135, "step": 9490 }, { "ce_loss_13": 3.228712463378906, "ce_loss_17": 3.1906594753265383, "ce_loss_2": 4.0256568670272825, "ce_loss_4": 3.6140143275260925, "ce_loss_9": 3.322402632236481, "epoch": 0.95, "grad_norm": 1048.0, "kl_loss_13": 70.73279113769532, "kl_loss_2": 1830.7975463867188, "kl_loss_4": 965.3727020263672, "kl_loss_9": 273.8419059753418, "learning_rate": 6.2805556618028556e-06, "loss": 767.9269, "step": 9500 }, { "ce_loss_13": 3.3222765564918517, "ce_loss_17": 3.2846568703651426, "ce_loss_2": 4.04825781583786, "ce_loss_4": 3.6733174681663514, "ce_loss_9": 3.4051283955574037, "epoch": 0.951, "grad_norm": 864.0, "kl_loss_13": 68.7835521697998, "kl_loss_2": 1683.3633361816405, "kl_loss_4": 898.0291534423828, "kl_loss_9": 259.48881378173826, "learning_rate": 6.032346995169968e-06, "loss": 721.3496, "step": 9510 }, { "ce_loss_13": 3.3196557998657226, "ce_loss_17": 3.282727527618408, "ce_loss_2": 4.06610267162323, "ce_loss_4": 3.6882732272148133, "ce_loss_9": 3.4062796950340273, "epoch": 0.952, "grad_norm": 864.0, "kl_loss_13": 69.86494426727295, "kl_loss_2": 1729.0597351074218, "kl_loss_4": 936.212890625, "kl_loss_9": 267.50542373657225, "learning_rate": 5.789112577318789e-06, "loss": 746.7356, "step": 9520 }, { "ce_loss_13": 3.3010855078697205, "ce_loss_17": 3.2626437067985536, "ce_loss_2": 4.069916033744812, "ce_loss_4": 3.6735063433647155, "ce_loss_9": 3.3919169664382935, "epoch": 0.953, "grad_norm": 772.0, "kl_loss_13": 70.53233699798584, "kl_loss_2": 1780.1747192382813, "kl_loss_4": 953.9167205810547, "kl_loss_9": 270.62007904052734, "learning_rate": 5.550854857617194e-06, "loss": 751.1559, "step": 9530 }, { "ce_loss_13": 3.2827063322067263, "ce_loss_17": 3.2447108030319214, "ce_loss_2": 4.069734442234039, "ce_loss_4": 3.6681645154953, "ce_loss_9": 3.3760493993759155, "epoch": 0.954, "grad_norm": 932.0, "kl_loss_13": 71.62755641937255, "kl_loss_2": 1796.897705078125, "kl_loss_4": 958.7881378173828, "kl_loss_9": 275.33373107910154, "learning_rate": 5.317576235317756e-06, "loss": 766.1111, "step": 9540 }, { "ce_loss_13": 3.3136985898017883, "ce_loss_17": 3.2769460320472716, "ce_loss_2": 4.0408616065979, "ce_loss_4": 3.669191324710846, "ce_loss_9": 3.402140426635742, "epoch": 0.955, "grad_norm": 972.0, "kl_loss_13": 69.35324230194092, "kl_loss_2": 1673.2431579589843, "kl_loss_4": 898.83974609375, "kl_loss_9": 260.54905395507814, "learning_rate": 5.089279059533658e-06, "loss": 750.5405, "step": 9550 }, { "ce_loss_13": 3.3648689150810243, "ce_loss_17": 3.3249842524528503, "ce_loss_2": 4.105914556980133, "ce_loss_4": 3.7360412955284117, "ce_loss_9": 3.4594300985336304, "epoch": 0.956, "grad_norm": 900.0, "kl_loss_13": 71.78082599639893, "kl_loss_2": 1721.1691772460938, "kl_loss_4": 939.7376190185547, "kl_loss_9": 275.466544342041, "learning_rate": 4.865965629214819e-06, "loss": 750.1981, "step": 9560 }, { "ce_loss_13": 3.3124891042709352, "ce_loss_17": 3.2736460328102113, "ce_loss_2": 4.072525918483734, "ce_loss_4": 3.6866251111030577, "ce_loss_9": 3.4061823725700378, "epoch": 0.957, "grad_norm": 1264.0, "kl_loss_13": 71.31147899627686, "kl_loss_2": 1768.5251342773438, "kl_loss_4": 946.2674346923828, "kl_loss_9": 274.1868476867676, "learning_rate": 4.6476381931251366e-06, "loss": 746.5137, "step": 9570 }, { "ce_loss_13": 3.298474097251892, "ce_loss_17": 3.260497677326202, "ce_loss_2": 4.046734261512756, "ce_loss_4": 3.6683933973312377, "ce_loss_9": 3.390977942943573, "epoch": 0.958, "grad_norm": 1112.0, "kl_loss_13": 69.28888702392578, "kl_loss_2": 1714.635205078125, "kl_loss_4": 926.0194061279296, "kl_loss_9": 266.40323333740236, "learning_rate": 4.434298949819449e-06, "loss": 749.155, "step": 9580 }, { "ce_loss_13": 3.2587222695350646, "ce_loss_17": 3.219885218143463, "ce_loss_2": 4.048289930820465, "ce_loss_4": 3.6507463574409487, "ce_loss_9": 3.35064240694046, "epoch": 0.959, "grad_norm": 788.0, "kl_loss_13": 72.76544532775878, "kl_loss_2": 1839.6946411132812, "kl_loss_4": 989.5889343261719, "kl_loss_9": 280.65069885253905, "learning_rate": 4.2259500476214406e-06, "loss": 773.379, "step": 9590 }, { "ce_loss_13": 3.240147149562836, "ce_loss_17": 3.201702296733856, "ce_loss_2": 4.003677499294281, "ce_loss_4": 3.6168144822120665, "ce_loss_9": 3.332273817062378, "epoch": 0.96, "grad_norm": 752.0, "kl_loss_13": 69.94089698791504, "kl_loss_2": 1777.3868408203125, "kl_loss_4": 952.8343505859375, "kl_loss_9": 270.85561447143556, "learning_rate": 4.02259358460233e-06, "loss": 754.1871, "step": 9600 }, { "ce_loss_13": 3.305615413188934, "ce_loss_17": 3.2665465831756593, "ce_loss_2": 4.051685702800751, "ce_loss_4": 3.675060486793518, "ce_loss_9": 3.398111867904663, "epoch": 0.961, "grad_norm": 960.0, "kl_loss_13": 70.52569160461425, "kl_loss_2": 1709.8893249511718, "kl_loss_4": 921.5408935546875, "kl_loss_9": 268.09111938476565, "learning_rate": 3.8242316085594916e-06, "loss": 742.2291, "step": 9610 }, { "ce_loss_13": 3.190785253047943, "ce_loss_17": 3.1509253859519957, "ce_loss_2": 4.003580093383789, "ce_loss_4": 3.5900234818458556, "ce_loss_9": 3.2888937115669252, "epoch": 0.962, "grad_norm": 692.0, "kl_loss_13": 70.176953125, "kl_loss_2": 1853.9943420410157, "kl_loss_4": 980.0655944824218, "kl_loss_9": 275.7702995300293, "learning_rate": 3.630866116995757e-06, "loss": 782.2559, "step": 9620 }, { "ce_loss_13": 3.3454036831855776, "ce_loss_17": 3.3080078125, "ce_loss_2": 4.0747090697288515, "ce_loss_4": 3.700935626029968, "ce_loss_9": 3.433185613155365, "epoch": 0.963, "grad_norm": 712.0, "kl_loss_13": 69.57482261657715, "kl_loss_2": 1695.5358520507812, "kl_loss_4": 906.6251342773437, "kl_loss_9": 265.3205223083496, "learning_rate": 3.4424990570994797e-06, "loss": 757.5875, "step": 9630 }, { "ce_loss_13": 3.3344528794288637, "ce_loss_17": 3.296273350715637, "ce_loss_2": 4.070157301425934, "ce_loss_4": 3.700620484352112, "ce_loss_9": 3.425011682510376, "epoch": 0.964, "grad_norm": 752.0, "kl_loss_13": 69.56573696136475, "kl_loss_2": 1715.6682678222655, "kl_loss_4": 931.6341369628906, "kl_loss_9": 266.6449722290039, "learning_rate": 3.2591323257248896e-06, "loss": 749.5438, "step": 9640 }, { "ce_loss_13": 3.1882264971733094, "ce_loss_17": 3.1511335372924805, "ce_loss_2": 3.953037369251251, "ce_loss_4": 3.562658357620239, "ce_loss_9": 3.276075565814972, "epoch": 0.965, "grad_norm": 872.0, "kl_loss_13": 68.23988037109375, "kl_loss_2": 1771.6603393554688, "kl_loss_4": 947.5275238037109, "kl_loss_9": 267.8297752380371, "learning_rate": 3.0807677693729385e-06, "loss": 764.7062, "step": 9650 }, { "ce_loss_13": 3.3730565428733827, "ce_loss_17": 3.336056077480316, "ce_loss_2": 4.112894988059997, "ce_loss_4": 3.7414841294288634, "ce_loss_9": 3.4624873042106628, "epoch": 0.966, "grad_norm": 752.0, "kl_loss_13": 70.90753707885742, "kl_loss_2": 1713.3698181152345, "kl_loss_4": 927.716064453125, "kl_loss_9": 266.6476768493652, "learning_rate": 2.9074071841727055e-06, "loss": 739.7657, "step": 9660 }, { "ce_loss_13": 3.296717894077301, "ce_loss_17": 3.2591960549354555, "ce_loss_2": 4.048906767368317, "ce_loss_4": 3.6761029839515684, "ce_loss_9": 3.3900691509246825, "epoch": 0.967, "grad_norm": 652.0, "kl_loss_13": 69.59696311950684, "kl_loss_2": 1729.5003051757812, "kl_loss_4": 946.6276977539062, "kl_loss_9": 269.5311737060547, "learning_rate": 2.739052315863355e-06, "loss": 738.4226, "step": 9670 }, { "ce_loss_13": 3.2766026735305784, "ce_loss_17": 3.2395278930664064, "ce_loss_2": 4.038357901573181, "ce_loss_4": 3.648110294342041, "ce_loss_9": 3.3652175784111025, "epoch": 0.968, "grad_norm": 1080.0, "kl_loss_13": 70.5869369506836, "kl_loss_2": 1758.3627502441407, "kl_loss_4": 933.8951812744141, "kl_loss_9": 264.542862701416, "learning_rate": 2.5757048597765396e-06, "loss": 750.0461, "step": 9680 }, { "ce_loss_13": 3.2924631953239443, "ce_loss_17": 3.2535227656364443, "ce_loss_2": 4.052191209793091, "ce_loss_4": 3.664053797721863, "ce_loss_9": 3.384879839420319, "epoch": 0.969, "grad_norm": 1072.0, "kl_loss_13": 69.67913131713867, "kl_loss_2": 1746.7089233398438, "kl_loss_4": 945.0013824462891, "kl_loss_9": 270.4563980102539, "learning_rate": 2.417366460819359e-06, "loss": 753.9213, "step": 9690 }, { "ce_loss_13": 3.300505042076111, "ce_loss_17": 3.260878264904022, "ce_loss_2": 4.087180781364441, "ce_loss_4": 3.6878235220909117, "ce_loss_9": 3.3978606462478638, "epoch": 0.97, "grad_norm": 2768.0, "kl_loss_13": 71.02720127105712, "kl_loss_2": 1801.905743408203, "kl_loss_4": 961.648291015625, "kl_loss_9": 274.4756118774414, "learning_rate": 2.2640387134577057e-06, "loss": 754.1334, "step": 9700 }, { "ce_loss_13": 3.233382725715637, "ce_loss_17": 3.196085739135742, "ce_loss_2": 3.9578961730003357, "ce_loss_4": 3.587084412574768, "ce_loss_9": 3.320077121257782, "epoch": 0.971, "grad_norm": 816.0, "kl_loss_13": 66.40662364959717, "kl_loss_2": 1661.5213989257813, "kl_loss_4": 899.0106079101563, "kl_loss_9": 257.6714744567871, "learning_rate": 2.115723161700278e-06, "loss": 735.0702, "step": 9710 }, { "ce_loss_13": 3.2087143778800966, "ce_loss_17": 3.169182777404785, "ce_loss_2": 3.998633527755737, "ce_loss_4": 3.590976357460022, "ce_loss_9": 3.302764666080475, "epoch": 0.972, "grad_norm": 852.0, "kl_loss_13": 70.9349552154541, "kl_loss_2": 1806.8160339355468, "kl_loss_4": 960.1527740478516, "kl_loss_9": 273.3882225036621, "learning_rate": 1.9724212990830937e-06, "loss": 772.3686, "step": 9720 }, { "ce_loss_13": 3.347757840156555, "ce_loss_17": 3.310051369667053, "ce_loss_2": 4.111681115627289, "ce_loss_4": 3.7260345101356505, "ce_loss_9": 3.4391422271728516, "epoch": 0.973, "grad_norm": 668.0, "kl_loss_13": 70.93805503845215, "kl_loss_2": 1759.4647155761718, "kl_loss_4": 944.5461242675781, "kl_loss_9": 271.56855697631835, "learning_rate": 1.8341345686543331e-06, "loss": 757.8961, "step": 9730 }, { "ce_loss_13": 3.3323220372200013, "ce_loss_17": 3.2967097997665404, "ce_loss_2": 4.05302050113678, "ce_loss_4": 3.6953728199005127, "ce_loss_9": 3.4248616099357605, "epoch": 0.974, "grad_norm": 1040.0, "kl_loss_13": 69.34260234832763, "kl_loss_2": 1672.5962463378905, "kl_loss_4": 911.3729583740235, "kl_loss_9": 265.0969268798828, "learning_rate": 1.7008643629596864e-06, "loss": 755.02, "step": 9740 }, { "ce_loss_13": 3.31864572763443, "ce_loss_17": 3.2793949127197264, "ce_loss_2": 4.070427584648132, "ce_loss_4": 3.683055078983307, "ce_loss_9": 3.410471832752228, "epoch": 0.975, "grad_norm": 1012.0, "kl_loss_13": 69.98407096862793, "kl_loss_2": 1750.4641174316407, "kl_loss_4": 933.7738891601563, "kl_loss_9": 268.15416793823243, "learning_rate": 1.5726120240288633e-06, "loss": 762.7929, "step": 9750 }, { "ce_loss_13": 3.218663954734802, "ce_loss_17": 3.181781232357025, "ce_loss_2": 3.9690282464027407, "ce_loss_4": 3.5920695543289183, "ce_loss_9": 3.3074343681335447, "epoch": 0.976, "grad_norm": 712.0, "kl_loss_13": 68.4236557006836, "kl_loss_2": 1734.748388671875, "kl_loss_4": 934.9843048095703, "kl_loss_9": 265.47901916503906, "learning_rate": 1.4493788433612708e-06, "loss": 748.722, "step": 9760 }, { "ce_loss_13": 3.335367035865784, "ce_loss_17": 3.297475850582123, "ce_loss_2": 4.103602564334869, "ce_loss_4": 3.7194851636886597, "ce_loss_9": 3.4302751779556275, "epoch": 0.977, "grad_norm": 620.0, "kl_loss_13": 70.56827011108399, "kl_loss_2": 1770.5423828125, "kl_loss_4": 957.4742462158204, "kl_loss_9": 272.5340232849121, "learning_rate": 1.3311660619138578e-06, "loss": 765.3009, "step": 9770 }, { "ce_loss_13": 3.336168646812439, "ce_loss_17": 3.2970511078834535, "ce_loss_2": 4.043975496292115, "ce_loss_4": 3.690586745738983, "ce_loss_9": 3.4224077343940733, "epoch": 0.978, "grad_norm": 904.0, "kl_loss_13": 69.4926971435547, "kl_loss_2": 1652.4708862304688, "kl_loss_4": 911.5842163085938, "kl_loss_9": 264.75740280151365, "learning_rate": 1.2179748700879012e-06, "loss": 746.8001, "step": 9780 }, { "ce_loss_13": 3.263418173789978, "ce_loss_17": 3.2262269020080567, "ce_loss_2": 4.012387645244599, "ce_loss_4": 3.6370876669883727, "ce_loss_9": 3.3552389621734617, "epoch": 0.979, "grad_norm": 1248.0, "kl_loss_13": 69.25509204864503, "kl_loss_2": 1723.763494873047, "kl_loss_4": 927.4136138916016, "kl_loss_9": 265.8643524169922, "learning_rate": 1.1098064077174619e-06, "loss": 753.7449, "step": 9790 }, { "ce_loss_13": 3.292123317718506, "ce_loss_17": 3.253508412837982, "ce_loss_2": 4.069903910160065, "ce_loss_4": 3.6727489948272707, "ce_loss_9": 3.3865288853645326, "epoch": 0.98, "grad_norm": 900.0, "kl_loss_13": 69.55743618011475, "kl_loss_2": 1785.3747497558593, "kl_loss_4": 953.881314086914, "kl_loss_9": 269.7953392028809, "learning_rate": 1.006661764057837e-06, "loss": 757.5746, "step": 9800 }, { "ce_loss_13": 3.29565806388855, "ce_loss_17": 3.259297585487366, "ce_loss_2": 4.048216080665588, "ce_loss_4": 3.6683563709259035, "ce_loss_9": 3.386642026901245, "epoch": 0.981, "grad_norm": 968.0, "kl_loss_13": 69.07173500061035, "kl_loss_2": 1738.47861328125, "kl_loss_4": 937.5677337646484, "kl_loss_9": 266.25218734741213, "learning_rate": 9.085419777743465e-07, "loss": 748.7905, "step": 9810 }, { "ce_loss_13": 3.2435696125030518, "ce_loss_17": 3.207517647743225, "ce_loss_2": 4.002567946910858, "ce_loss_4": 3.617623841762543, "ce_loss_9": 3.333637535572052, "epoch": 0.982, "grad_norm": 800.0, "kl_loss_13": 67.94173545837403, "kl_loss_2": 1745.14423828125, "kl_loss_4": 936.8636291503906, "kl_loss_9": 262.454483795166, "learning_rate": 8.15448036932176e-07, "loss": 739.0622, "step": 9820 }, { "ce_loss_13": 3.291981852054596, "ce_loss_17": 3.2551918506622313, "ce_loss_2": 4.038154816627502, "ce_loss_4": 3.6620378851890565, "ce_loss_9": 3.3822624683380127, "epoch": 0.983, "grad_norm": 648.0, "kl_loss_13": 69.3007080078125, "kl_loss_2": 1744.5392822265626, "kl_loss_4": 948.7087493896485, "kl_loss_9": 269.4570854187012, "learning_rate": 7.273808789862724e-07, "loss": 763.5148, "step": 9830 }, { "ce_loss_13": 3.369704580307007, "ce_loss_17": 3.3324461221694945, "ce_loss_2": 4.108627426624298, "ce_loss_4": 3.735966980457306, "ce_loss_9": 3.461838722229004, "epoch": 0.984, "grad_norm": 732.0, "kl_loss_13": 71.17484645843506, "kl_loss_2": 1731.5099853515626, "kl_loss_4": 940.5303466796875, "kl_loss_9": 271.71323165893557, "learning_rate": 6.443413907720186e-07, "loss": 750.0659, "step": 9840 }, { "ce_loss_13": 3.3022388219833374, "ce_loss_17": 3.2654573798179625, "ce_loss_2": 4.047054243087769, "ce_loss_4": 3.6680091381073, "ce_loss_9": 3.3926560521125793, "epoch": 0.985, "grad_norm": 804.0, "kl_loss_13": 69.3776517868042, "kl_loss_2": 1703.3996826171874, "kl_loss_4": 919.8436981201172, "kl_loss_9": 267.2551780700684, "learning_rate": 5.663304084960185e-07, "loss": 741.2038, "step": 9850 }, { "ce_loss_13": 3.228341591358185, "ce_loss_17": 3.1917863130569457, "ce_loss_2": 4.006502139568329, "ce_loss_4": 3.6165035367012024, "ce_loss_9": 3.321926999092102, "epoch": 0.986, "grad_norm": 628.0, "kl_loss_13": 69.7114330291748, "kl_loss_2": 1775.5684326171875, "kl_loss_4": 950.0264434814453, "kl_loss_9": 269.45206451416016, "learning_rate": 4.933487177280482e-07, "loss": 746.5763, "step": 9860 }, { "ce_loss_13": 3.3282948970794677, "ce_loss_17": 3.2902629494667055, "ce_loss_2": 4.073453938961029, "ce_loss_4": 3.6912740230560304, "ce_loss_9": 3.418955981731415, "epoch": 0.987, "grad_norm": 704.0, "kl_loss_13": 68.88297080993652, "kl_loss_2": 1733.244970703125, "kl_loss_4": 929.6509796142578, "kl_loss_9": 264.36970443725585, "learning_rate": 4.2539705339295075e-07, "loss": 743.9317, "step": 9870 }, { "ce_loss_13": 3.184259068965912, "ce_loss_17": 3.146634590625763, "ce_loss_2": 3.948678719997406, "ce_loss_4": 3.568000066280365, "ce_loss_9": 3.279090178012848, "epoch": 0.988, "grad_norm": 756.0, "kl_loss_13": 69.63558444976806, "kl_loss_2": 1756.38798828125, "kl_loss_4": 957.2883392333985, "kl_loss_9": 278.4889938354492, "learning_rate": 3.6247609976319816e-07, "loss": 749.9763, "step": 9880 }, { "ce_loss_13": 3.2724010348320007, "ce_loss_17": 3.233578050136566, "ce_loss_2": 4.047141480445862, "ce_loss_4": 3.6549957036972045, "ce_loss_9": 3.3679153680801392, "epoch": 0.989, "grad_norm": 1024.0, "kl_loss_13": 70.6495204925537, "kl_loss_2": 1769.112371826172, "kl_loss_4": 951.6210327148438, "kl_loss_9": 272.63334426879885, "learning_rate": 3.0458649045211895e-07, "loss": 773.768, "step": 9890 }, { "ce_loss_13": 3.2451439619064333, "ce_loss_17": 3.205696094036102, "ce_loss_2": 4.007219898700714, "ce_loss_4": 3.629718315601349, "ce_loss_9": 3.3400197505950926, "epoch": 0.99, "grad_norm": 800.0, "kl_loss_13": 70.50988845825195, "kl_loss_2": 1742.4984558105468, "kl_loss_4": 959.8676452636719, "kl_loss_9": 274.8178520202637, "learning_rate": 2.517288084074587e-07, "loss": 772.7277, "step": 9900 }, { "ce_loss_13": 3.2862741708755494, "ce_loss_17": 3.246384072303772, "ce_loss_2": 4.084227788448334, "ce_loss_4": 3.6856178522109984, "ce_loss_9": 3.386175799369812, "epoch": 0.991, "grad_norm": 736.0, "kl_loss_13": 71.46778984069825, "kl_loss_2": 1809.738525390625, "kl_loss_4": 976.1868469238282, "kl_loss_9": 278.51739807128905, "learning_rate": 2.0390358590538505e-07, "loss": 768.0669, "step": 9910 }, { "ce_loss_13": 3.296647012233734, "ce_loss_17": 3.2595779418945314, "ce_loss_2": 4.049106597900391, "ce_loss_4": 3.6708460211753846, "ce_loss_9": 3.3929797291755674, "epoch": 0.992, "grad_norm": 816.0, "kl_loss_13": 70.1730453491211, "kl_loss_2": 1743.3746154785156, "kl_loss_4": 954.984341430664, "kl_loss_9": 275.02985229492185, "learning_rate": 1.61111304545436e-07, "loss": 753.1512, "step": 9920 }, { "ce_loss_13": 3.259925878047943, "ce_loss_17": 3.2214688301086425, "ce_loss_2": 4.006985282897949, "ce_loss_4": 3.6319169282913206, "ce_loss_9": 3.3517908215522767, "epoch": 0.993, "grad_norm": 704.0, "kl_loss_13": 69.38464469909668, "kl_loss_2": 1734.0220458984375, "kl_loss_4": 941.4155578613281, "kl_loss_9": 268.93776626586913, "learning_rate": 1.2335239524541298e-07, "loss": 742.7448, "step": 9930 }, { "ce_loss_13": 3.232028913497925, "ce_loss_17": 3.1935453176498414, "ce_loss_2": 3.9849515080451967, "ce_loss_4": 3.60744651556015, "ce_loss_9": 3.3257043600082397, "epoch": 0.994, "grad_norm": 836.0, "kl_loss_13": 69.2954231262207, "kl_loss_2": 1727.1304809570313, "kl_loss_4": 928.5043914794921, "kl_loss_9": 265.77542266845705, "learning_rate": 9.06272382371065e-08, "loss": 752.2372, "step": 9940 }, { "ce_loss_13": 3.297999620437622, "ce_loss_17": 3.2620956897735596, "ce_loss_2": 4.070128989219666, "ce_loss_4": 3.679764378070831, "ce_loss_9": 3.3932497262954713, "epoch": 0.995, "grad_norm": 696.0, "kl_loss_13": 71.25127658843994, "kl_loss_2": 1781.1347351074219, "kl_loss_4": 961.7238128662109, "kl_loss_9": 274.8592224121094, "learning_rate": 6.293616306246586e-08, "loss": 759.6051, "step": 9950 }, { "ce_loss_13": 3.2957133054733276, "ce_loss_17": 3.2595414757728576, "ce_loss_2": 4.026886129379273, "ce_loss_4": 3.658927488327026, "ce_loss_9": 3.383803868293762, "epoch": 0.996, "grad_norm": 800.0, "kl_loss_13": 68.08552322387695, "kl_loss_2": 1692.548760986328, "kl_loss_4": 916.358920288086, "kl_loss_9": 262.1323112487793, "learning_rate": 4.027944857032395e-08, "loss": 726.9326, "step": 9960 }, { "ce_loss_13": 3.29274468421936, "ce_loss_17": 3.257336509227753, "ce_loss_2": 4.0016671299934385, "ce_loss_4": 3.640883004665375, "ce_loss_9": 3.3762596607208253, "epoch": 0.997, "grad_norm": 676.0, "kl_loss_13": 67.64072341918946, "kl_loss_2": 1639.7742065429688, "kl_loss_4": 890.015170288086, "kl_loss_9": 255.31717681884766, "learning_rate": 2.265732291356626e-08, "loss": 724.3973, "step": 9970 }, { "ce_loss_13": 3.3301588773727415, "ce_loss_17": 3.292578196525574, "ce_loss_2": 4.057374143600464, "ce_loss_4": 3.6901230216026306, "ce_loss_9": 3.4179542779922487, "epoch": 0.998, "grad_norm": 680.0, "kl_loss_13": 69.15830268859864, "kl_loss_2": 1689.9597412109374, "kl_loss_4": 915.1488616943359, "kl_loss_9": 266.0099395751953, "learning_rate": 1.0069963546743833e-08, "loss": 755.9442, "step": 9980 }, { "ce_loss_13": 3.3151816725730896, "ce_loss_17": 3.275153863430023, "ce_loss_2": 4.068166565895081, "ce_loss_4": 3.6815107583999636, "ce_loss_9": 3.405549705028534, "epoch": 0.999, "grad_norm": 792.0, "kl_loss_13": 69.87775192260742, "kl_loss_2": 1741.6067260742188, "kl_loss_4": 939.5706390380859, "kl_loss_9": 270.8768714904785, "learning_rate": 2.517497224463483e-09, "loss": 750.72, "step": 9990 }, { "ce_loss_13": 3.264886772632599, "ce_loss_17": 3.225277531147003, "ce_loss_2": 4.066579639911652, "ce_loss_4": 3.6539052367210387, "ce_loss_9": 3.362174320220947, "epoch": 1.0, "grad_norm": 776.0, "kl_loss_13": 71.4085069656372, "kl_loss_2": 1836.1367431640624, "kl_loss_4": 969.44462890625, "kl_loss_9": 277.70877304077146, "learning_rate": 0.0, "loss": 777.38, "step": 10000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.447557417823109e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }