{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "ce_loss_13": 8.245837211608887, "ce_loss_17": 3.203194260597229, "ce_loss_2": 12.116918087005615, "ce_loss_4": 19.26921272277832, "ce_loss_9": 15.57656478881836, "epoch": 0.0001, "grad_norm": 125952.0, "kl_loss_13": 11612.04248046875, "kl_loss_2": 18340.9765625, "kl_loss_4": 32967.3173828125, "kl_loss_9": 26059.0986328125, "learning_rate": 1e-05, "loss": 23249.668, "step": 1 }, { "ce_loss_13": 6.262006759643555, "ce_loss_17": 3.2416110701031156, "ce_loss_2": 9.207348664601644, "ce_loss_4": 12.576181120342678, "ce_loss_9": 10.658434788386026, "epoch": 0.001, "grad_norm": 21504.0, "kl_loss_13": 6946.584865993924, "kl_loss_2": 11908.856553819445, "kl_loss_4": 18771.619683159723, "kl_loss_9": 15229.311116536459, "learning_rate": 0.0001, "loss": 13270.6684, "step": 10 }, { "ce_loss_13": 4.119381642341613, "ce_loss_17": 3.2502736806869508, "ce_loss_2": 6.785771179199219, "ce_loss_4": 6.746949768066406, "ce_loss_9": 5.307699370384216, "epoch": 0.002, "grad_norm": 5280.0, "kl_loss_13": 1743.6538513183593, "kl_loss_2": 6630.5220703125, "kl_loss_4": 6579.506982421875, "kl_loss_9": 3960.1172485351562, "learning_rate": 0.0002, "loss": 4773.9969, "step": 20 }, { "ce_loss_13": 3.4927234172821047, "ce_loss_17": 3.0495631098747253, "ce_loss_2": 5.970883321762085, "ce_loss_4": 5.638284993171692, "ce_loss_9": 4.39405175447464, "epoch": 0.003, "grad_norm": 2160.0, "kl_loss_13": 872.9093994140625, "kl_loss_2": 5513.04033203125, "kl_loss_4": 4894.046948242188, "kl_loss_9": 2559.5335327148437, "learning_rate": 0.0003, "loss": 3401.0473, "step": 30 }, { "ce_loss_13": 3.547671043872833, "ce_loss_17": 3.2164444804191588, "ce_loss_2": 5.677912878990173, "ce_loss_4": 5.3011287450790405, "ce_loss_9": 4.261575305461884, "epoch": 0.004, "grad_norm": 2000.0, "kl_loss_13": 650.052767944336, "kl_loss_2": 4676.879370117187, "kl_loss_4": 3976.904150390625, "kl_loss_9": 2006.8223205566405, "learning_rate": 0.0004, "loss": 2844.3119, "step": 40 }, { "ce_loss_13": 3.4759989619255065, "ce_loss_17": 3.1865654587745667, "ce_loss_2": 5.485703873634338, "ce_loss_4": 5.052077317237854, "ce_loss_9": 4.077259612083435, "epoch": 0.005, "grad_norm": 2304.0, "kl_loss_13": 567.6999160766602, "kl_loss_2": 4412.083032226563, "kl_loss_4": 3599.5486083984374, "kl_loss_9": 1712.4074951171874, "learning_rate": 0.0005, "loss": 2568.6793, "step": 50 }, { "ce_loss_13": 3.441826868057251, "ce_loss_17": 3.2044201850891114, "ce_loss_2": 5.347141432762146, "ce_loss_4": 4.920163488388061, "ce_loss_9": 3.9821664094924927, "epoch": 0.006, "grad_norm": 2024.0, "kl_loss_13": 485.2277374267578, "kl_loss_2": 4141.191357421875, "kl_loss_4": 3337.098974609375, "kl_loss_9": 1526.653369140625, "learning_rate": 0.0006, "loss": 2384.8809, "step": 60 }, { "ce_loss_13": 3.3561118245124817, "ce_loss_17": 3.124888551235199, "ce_loss_2": 5.261025285720825, "ce_loss_4": 4.78769850730896, "ce_loss_9": 3.864793264865875, "epoch": 0.007, "grad_norm": 2896.0, "kl_loss_13": 481.27857971191406, "kl_loss_2": 4145.036047363281, "kl_loss_4": 3255.6009887695313, "kl_loss_9": 1471.7347290039063, "learning_rate": 0.0007, "loss": 2327.5355, "step": 70 }, { "ce_loss_13": 3.3492482781410216, "ce_loss_17": 3.1225196599960325, "ce_loss_2": 5.220322489738464, "ce_loss_4": 4.740187120437622, "ce_loss_9": 3.8486096024513246, "epoch": 0.008, "grad_norm": 4288.0, "kl_loss_13": 450.43479919433594, "kl_loss_2": 4066.550830078125, "kl_loss_4": 3159.6666259765625, "kl_loss_9": 1417.1455688476562, "learning_rate": 0.0008, "loss": 2290.3338, "step": 80 }, { "ce_loss_13": 3.3400510907173158, "ce_loss_17": 3.0802392244338987, "ce_loss_2": 5.197456169128418, "ce_loss_4": 4.685648941993714, "ce_loss_9": 3.7761051297187804, "epoch": 0.009, "grad_norm": 2864.0, "kl_loss_13": 544.5672927856446, "kl_loss_2": 4127.287915039063, "kl_loss_4": 3172.030029296875, "kl_loss_9": 1414.8119262695313, "learning_rate": 0.0009000000000000001, "loss": 2300.5998, "step": 90 }, { "ce_loss_13": 3.5245481848716738, "ce_loss_17": 3.197898268699646, "ce_loss_2": 5.283472776412964, "ce_loss_4": 4.746220755577087, "ce_loss_9": 3.89434951543808, "epoch": 0.01, "grad_norm": 2496.0, "kl_loss_13": 668.638525390625, "kl_loss_2": 4065.00205078125, "kl_loss_4": 3026.5156982421877, "kl_loss_9": 1377.3786193847657, "learning_rate": 0.001, "loss": 2285.8096, "step": 100 }, { "ce_loss_13": 3.454442536830902, "ce_loss_17": 3.1511964797973633, "ce_loss_2": 5.290353393554687, "ce_loss_4": 4.756244111061096, "ce_loss_9": 3.8903318524360655, "epoch": 0.011, "grad_norm": 2272.0, "kl_loss_13": 605.0197692871094, "kl_loss_2": 4168.639721679688, "kl_loss_4": 3157.1772583007814, "kl_loss_9": 1496.293798828125, "learning_rate": 0.0009999974825027757, "loss": 2345.7531, "step": 110 }, { "ce_loss_13": 3.4478296875953673, "ce_loss_17": 3.214282178878784, "ce_loss_2": 5.254533863067627, "ce_loss_4": 4.679026770591736, "ce_loss_9": 3.8918806195259092, "epoch": 0.012, "grad_norm": 1984.0, "kl_loss_13": 479.9314788818359, "kl_loss_2": 4000.81513671875, "kl_loss_4": 2880.430310058594, "kl_loss_9": 1364.1475036621093, "learning_rate": 0.0009999899300364532, "loss": 2170.741, "step": 120 }, { "ce_loss_13": 3.392778384685516, "ce_loss_17": 3.183854401111603, "ce_loss_2": 5.239187955856323, "ce_loss_4": 4.659725522994995, "ce_loss_9": 3.8580212712287905, "epoch": 0.013, "grad_norm": 1832.0, "kl_loss_13": 431.01489410400393, "kl_loss_2": 3987.960729980469, "kl_loss_4": 2893.437353515625, "kl_loss_9": 1324.4112121582032, "learning_rate": 0.0009999773426770863, "loss": 2176.1145, "step": 130 }, { "ce_loss_13": 3.4126084804534913, "ce_loss_17": 3.221629428863525, "ce_loss_2": 5.179600095748901, "ce_loss_4": 4.640480160713196, "ce_loss_9": 3.838227427005768, "epoch": 0.014, "grad_norm": 1680.0, "kl_loss_13": 402.77818908691404, "kl_loss_2": 3835.3425415039064, "kl_loss_4": 2803.274072265625, "kl_loss_9": 1250.2555114746094, "learning_rate": 0.0009999597205514296, "loss": 2094.5086, "step": 140 }, { "ce_loss_13": 3.3619016647338866, "ce_loss_17": 3.1769514203071596, "ce_loss_2": 5.107105851173401, "ce_loss_4": 4.588386178016663, "ce_loss_9": 3.785622251033783, "epoch": 0.015, "grad_norm": 1512.0, "kl_loss_13": 384.02117614746095, "kl_loss_2": 3768.1231201171877, "kl_loss_4": 2787.8059692382812, "kl_loss_9": 1224.824462890625, "learning_rate": 0.0009999370638369377, "loss": 2052.0465, "step": 150 }, { "ce_loss_13": 3.4005820751190186, "ce_loss_17": 3.2184852957725525, "ce_loss_2": 5.173541307449341, "ce_loss_4": 4.60784420967102, "ce_loss_9": 3.798355770111084, "epoch": 0.016, "grad_norm": 1784.0, "kl_loss_13": 395.0272247314453, "kl_loss_2": 3845.1993896484373, "kl_loss_4": 2780.1154296875, "kl_loss_9": 1202.1837524414063, "learning_rate": 0.000999909372761763, "loss": 2051.0324, "step": 160 }, { "ce_loss_13": 3.3563824772834776, "ce_loss_17": 3.154946196079254, "ce_loss_2": 5.112983512878418, "ce_loss_4": 4.5388916015625, "ce_loss_9": 3.731077456474304, "epoch": 0.017, "grad_norm": 1208.0, "kl_loss_13": 441.8022232055664, "kl_loss_2": 3878.9568725585937, "kl_loss_4": 2776.47412109375, "kl_loss_9": 1190.0721435546875, "learning_rate": 0.0009998766476047546, "loss": 2080.0131, "step": 170 }, { "ce_loss_13": 3.4069248914718626, "ce_loss_17": 3.1887315988540648, "ce_loss_2": 5.138336157798767, "ce_loss_4": 4.577449989318848, "ce_loss_9": 3.76503324508667, "epoch": 0.018, "grad_norm": 1176.0, "kl_loss_13": 432.528498840332, "kl_loss_2": 3817.0463989257814, "kl_loss_4": 2752.0455078125, "kl_loss_9": 1165.6939636230468, "learning_rate": 0.0009998388886954545, "loss": 2052.5229, "step": 180 }, { "ce_loss_13": 3.3545591950416567, "ce_loss_17": 3.1638806462287903, "ce_loss_2": 5.095881795883178, "ce_loss_4": 4.529397130012512, "ce_loss_9": 3.7296519041061402, "epoch": 0.019, "grad_norm": 1264.0, "kl_loss_13": 398.90871276855466, "kl_loss_2": 3791.656286621094, "kl_loss_4": 2732.0404052734375, "kl_loss_9": 1158.1481231689454, "learning_rate": 0.0009997960964140947, "loss": 2013.7441, "step": 190 }, { "ce_loss_13": 3.3469136238098143, "ce_loss_17": 3.1555357933044434, "ce_loss_2": 5.0711424350738525, "ce_loss_4": 4.537726044654846, "ce_loss_9": 3.7215031743049622, "epoch": 0.02, "grad_norm": 1112.0, "kl_loss_13": 414.29385528564455, "kl_loss_2": 3790.6836547851562, "kl_loss_4": 2755.56083984375, "kl_loss_9": 1152.2727325439453, "learning_rate": 0.0009997482711915926, "loss": 2011.382, "step": 200 }, { "ce_loss_13": 3.3071155309677125, "ce_loss_17": 3.131619155406952, "ce_loss_2": 4.990554714202881, "ce_loss_4": 4.447378182411194, "ce_loss_9": 3.6526355147361755, "epoch": 0.021, "grad_norm": 1208.0, "kl_loss_13": 368.4332565307617, "kl_loss_2": 3686.34599609375, "kl_loss_4": 2644.977673339844, "kl_loss_9": 1085.0194458007813, "learning_rate": 0.0009996954135095479, "loss": 1936.2941, "step": 210 }, { "ce_loss_13": 3.368330681324005, "ce_loss_17": 3.203192853927612, "ce_loss_2": 5.023554182052612, "ce_loss_4": 4.469095087051391, "ce_loss_9": 3.723447573184967, "epoch": 0.022, "grad_norm": 1216.0, "kl_loss_13": 346.5491180419922, "kl_loss_2": 3587.663049316406, "kl_loss_4": 2537.7098754882813, "kl_loss_9": 1058.0772766113282, "learning_rate": 0.0009996375239002368, "loss": 1883.5674, "step": 220 }, { "ce_loss_13": 3.4276290774345397, "ce_loss_17": 3.2727778434753416, "ce_loss_2": 5.031824541091919, "ce_loss_4": 4.480432939529419, "ce_loss_9": 3.787499284744263, "epoch": 0.023, "grad_norm": 1280.0, "kl_loss_13": 334.30745697021484, "kl_loss_2": 3476.1241943359373, "kl_loss_4": 2450.0752685546877, "kl_loss_9": 1053.5572967529297, "learning_rate": 0.0009995746029466072, "loss": 1840.5514, "step": 230 }, { "ce_loss_13": 3.223449742794037, "ce_loss_17": 3.0670263051986693, "ce_loss_2": 4.954608368873596, "ce_loss_4": 4.3865398406982425, "ce_loss_9": 3.6171311140060425, "epoch": 0.024, "grad_norm": 1104.0, "kl_loss_13": 336.07812042236327, "kl_loss_2": 3743.14775390625, "kl_loss_4": 2660.5342651367187, "kl_loss_9": 1136.1767944335938, "learning_rate": 0.0009995066512822719, "loss": 1902.8227, "step": 240 }, { "ce_loss_13": 3.3226835250854494, "ce_loss_17": 3.168175756931305, "ce_loss_2": 5.076572394371032, "ce_loss_4": 4.506612014770508, "ce_loss_9": 3.680534768104553, "epoch": 0.025, "grad_norm": 1136.0, "kl_loss_13": 325.8242752075195, "kl_loss_2": 3756.1126953125, "kl_loss_4": 2674.860302734375, "kl_loss_9": 1059.0274932861328, "learning_rate": 0.000999433669591504, "loss": 1876.3104, "step": 250 }, { "ce_loss_13": 3.2331672072410584, "ce_loss_17": 3.0719566226005552, "ce_loss_2": 4.984365653991699, "ce_loss_4": 4.357344591617585, "ce_loss_9": 3.587770068645477, "epoch": 0.026, "grad_norm": 1160.0, "kl_loss_13": 341.86385345458984, "kl_loss_2": 3776.8579711914062, "kl_loss_4": 2582.1751953125, "kl_loss_9": 1055.3525695800781, "learning_rate": 0.000999355658609228, "loss": 1895.5004, "step": 260 }, { "ce_loss_13": 3.272359001636505, "ce_loss_17": 3.096132791042328, "ce_loss_2": 5.022517538070678, "ce_loss_4": 4.407870817184448, "ce_loss_9": 3.6219112873077393, "epoch": 0.027, "grad_norm": 1104.0, "kl_loss_13": 353.37426300048827, "kl_loss_2": 3760.107629394531, "kl_loss_4": 2602.4897216796876, "kl_loss_9": 1055.752737426758, "learning_rate": 0.0009992726191210138, "loss": 1912.4219, "step": 270 }, { "ce_loss_13": 3.310946595668793, "ce_loss_17": 3.1392672419548036, "ce_loss_2": 4.95752317905426, "ce_loss_4": 4.404611146450042, "ce_loss_9": 3.652379024028778, "epoch": 0.028, "grad_norm": 1176.0, "kl_loss_13": 360.9754608154297, "kl_loss_2": 3588.5043212890623, "kl_loss_4": 2543.2757934570313, "kl_loss_9": 1052.491000366211, "learning_rate": 0.0009991845519630679, "loss": 1868.9936, "step": 280 }, { "ce_loss_13": 3.2008662104606627, "ce_loss_17": 3.0263441681861876, "ce_loss_2": 4.856085014343262, "ce_loss_4": 4.303015398979187, "ce_loss_9": 3.5388938903808596, "epoch": 0.029, "grad_norm": 1040.0, "kl_loss_13": 363.8386260986328, "kl_loss_2": 3612.0285522460936, "kl_loss_4": 2563.0463134765623, "kl_loss_9": 1046.3826843261718, "learning_rate": 0.0009990914580222257, "loss": 1888.5355, "step": 290 }, { "ce_loss_13": 3.3314425468444826, "ce_loss_17": 3.16639723777771, "ce_loss_2": 4.893620347976684, "ce_loss_4": 4.355177521705627, "ce_loss_9": 3.6631988167762755, "epoch": 0.03, "grad_norm": 984.0, "kl_loss_13": 361.12161407470705, "kl_loss_2": 3433.845556640625, "kl_loss_4": 2402.5695922851564, "kl_loss_9": 1031.2381103515625, "learning_rate": 0.0009989933382359422, "loss": 1845.8475, "step": 300 }, { "ce_loss_13": 3.3318355441093446, "ce_loss_17": 3.178987145423889, "ce_loss_2": 4.896115136146546, "ce_loss_4": 4.337056875228882, "ce_loss_9": 3.6559376239776613, "epoch": 0.031, "grad_norm": 972.0, "kl_loss_13": 332.8449203491211, "kl_loss_2": 3406.24501953125, "kl_loss_4": 2356.873175048828, "kl_loss_9": 999.1671112060546, "learning_rate": 0.0009988901935922825, "loss": 1793.9715, "step": 310 }, { "ce_loss_13": 3.1816823482513428, "ce_loss_17": 3.0243487119674684, "ce_loss_2": 4.851909255981445, "ce_loss_4": 4.274699199199676, "ce_loss_9": 3.5197362065315247, "epoch": 0.032, "grad_norm": 1072.0, "kl_loss_13": 328.0967269897461, "kl_loss_2": 3633.808972167969, "kl_loss_4": 2519.924853515625, "kl_loss_9": 1019.008544921875, "learning_rate": 0.0009987820251299122, "loss": 1827.6861, "step": 320 }, { "ce_loss_13": 3.2988600850105287, "ce_loss_17": 3.1533907890319823, "ce_loss_2": 4.869427728652954, "ce_loss_4": 4.301612663269043, "ce_loss_9": 3.6177143692970275, "epoch": 0.033, "grad_norm": 1096.0, "kl_loss_13": 308.2405410766602, "kl_loss_2": 3414.5861328125, "kl_loss_4": 2337.975048828125, "kl_loss_9": 962.3565979003906, "learning_rate": 0.0009986688339380862, "loss": 1752.4719, "step": 330 }, { "ce_loss_13": 3.246885049343109, "ce_loss_17": 3.108083176612854, "ce_loss_2": 4.796933245658875, "ce_loss_4": 4.238318455219269, "ce_loss_9": 3.552007722854614, "epoch": 0.034, "grad_norm": 812.0, "kl_loss_13": 301.0337249755859, "kl_loss_2": 3343.612902832031, "kl_loss_4": 2285.6415771484376, "kl_loss_9": 924.484326171875, "learning_rate": 0.0009985506211566387, "loss": 1730.3281, "step": 340 }, { "ce_loss_13": 3.273797357082367, "ce_loss_17": 3.136493670940399, "ce_loss_2": 4.775125026702881, "ce_loss_4": 4.241373467445373, "ce_loss_9": 3.5697874903678892, "epoch": 0.035, "grad_norm": 1032.0, "kl_loss_13": 294.5239418029785, "kl_loss_2": 3276.111437988281, "kl_loss_4": 2258.342663574219, "kl_loss_9": 921.4810943603516, "learning_rate": 0.0009984273879759713, "loss": 1701.0359, "step": 350 }, { "ce_loss_13": 3.3104996562004088, "ce_loss_17": 3.165222942829132, "ce_loss_2": 4.8404416799545285, "ce_loss_4": 4.315926265716553, "ce_loss_9": 3.6346293807029726, "epoch": 0.036, "grad_norm": 1048.0, "kl_loss_13": 306.0645492553711, "kl_loss_2": 3322.6215087890623, "kl_loss_4": 2322.489129638672, "kl_loss_9": 971.6646301269532, "learning_rate": 0.0009982991356370402, "loss": 1752.7691, "step": 360 }, { "ce_loss_13": 3.2871758699417115, "ce_loss_17": 3.144582521915436, "ce_loss_2": 4.7956489562988285, "ce_loss_4": 4.284250044822693, "ce_loss_9": 3.6023415327072144, "epoch": 0.037, "grad_norm": 860.0, "kl_loss_13": 306.3320365905762, "kl_loss_2": 3296.55322265625, "kl_loss_4": 2307.4163452148437, "kl_loss_9": 952.9344543457031, "learning_rate": 0.0009981658654313456, "loss": 1728.3863, "step": 370 }, { "ce_loss_13": 3.3514113068580627, "ce_loss_17": 3.2154078125953673, "ce_loss_2": 4.831361770629883, "ce_loss_4": 4.296992933750152, "ce_loss_9": 3.6497627019882204, "epoch": 0.038, "grad_norm": 892.0, "kl_loss_13": 286.9757354736328, "kl_loss_2": 3220.245751953125, "kl_loss_4": 2214.9276611328123, "kl_loss_9": 908.6463470458984, "learning_rate": 0.000998027578700917, "loss": 1685.0902, "step": 380 }, { "ce_loss_13": 3.2966546773910523, "ce_loss_17": 3.1595390915870665, "ce_loss_2": 4.792065525054932, "ce_loss_4": 4.277795004844665, "ce_loss_9": 3.6049975156784058, "epoch": 0.039, "grad_norm": 852.0, "kl_loss_13": 295.4520408630371, "kl_loss_2": 3276.6158935546873, "kl_loss_4": 2279.613635253906, "kl_loss_9": 930.3630981445312, "learning_rate": 0.0009978842768382998, "loss": 1707.1207, "step": 390 }, { "ce_loss_13": 3.3094278812408446, "ce_loss_17": 3.17583087682724, "ce_loss_2": 4.76841790676117, "ce_loss_4": 4.243080353736877, "ce_loss_9": 3.6015673160552977, "epoch": 0.04, "grad_norm": 1112.0, "kl_loss_13": 291.0220031738281, "kl_loss_2": 3171.0037475585937, "kl_loss_4": 2188.385711669922, "kl_loss_9": 893.6235076904297, "learning_rate": 0.0009977359612865424, "loss": 1653.8094, "step": 400 }, { "ce_loss_13": 3.315619432926178, "ce_loss_17": 3.1770965814590455, "ce_loss_2": 4.781732821464539, "ce_loss_4": 4.25855211019516, "ce_loss_9": 3.622215211391449, "epoch": 0.041, "grad_norm": 912.0, "kl_loss_13": 306.6347946166992, "kl_loss_2": 3209.241320800781, "kl_loss_4": 2211.0861572265626, "kl_loss_9": 926.7490783691406, "learning_rate": 0.0009975826335391806, "loss": 1648.1408, "step": 410 }, { "ce_loss_13": 3.33244446516037, "ce_loss_17": 3.2002769708633423, "ce_loss_2": 4.770659971237182, "ce_loss_4": 4.253687918186188, "ce_loss_9": 3.6277463674545287, "epoch": 0.042, "grad_norm": 852.0, "kl_loss_13": 279.2612892150879, "kl_loss_2": 3132.292858886719, "kl_loss_4": 2164.451251220703, "kl_loss_9": 898.8495544433594, "learning_rate": 0.0009974242951402235, "loss": 1628.7986, "step": 420 }, { "ce_loss_13": 3.3416669011116027, "ce_loss_17": 3.210995614528656, "ce_loss_2": 4.786950802803039, "ce_loss_4": 4.279473066329956, "ce_loss_9": 3.6323882818222044, "epoch": 0.043, "grad_norm": 836.0, "kl_loss_13": 278.0007804870605, "kl_loss_2": 3173.102331542969, "kl_loss_4": 2205.178302001953, "kl_loss_9": 898.5286987304687, "learning_rate": 0.0009972609476841367, "loss": 1623.4425, "step": 430 }, { "ce_loss_13": 3.250875973701477, "ce_loss_17": 3.1214403033256533, "ce_loss_2": 4.747104144096374, "ce_loss_4": 4.223791539669037, "ce_loss_9": 3.550717067718506, "epoch": 0.044, "grad_norm": 936.0, "kl_loss_13": 278.9578063964844, "kl_loss_2": 3234.1185791015623, "kl_loss_4": 2244.054852294922, "kl_loss_9": 895.8857635498047, "learning_rate": 0.0009970925928158272, "loss": 1663.9736, "step": 440 }, { "ce_loss_13": 3.203373897075653, "ce_loss_17": 3.0639839291572573, "ce_loss_2": 4.704200720787048, "ce_loss_4": 4.174713146686554, "ce_loss_9": 3.505053627490997, "epoch": 0.045, "grad_norm": 1008.0, "kl_loss_13": 296.22080688476564, "kl_loss_2": 3297.6390869140623, "kl_loss_4": 2290.6543090820314, "kl_loss_9": 938.6929901123046, "learning_rate": 0.000996919232230627, "loss": 1684.9379, "step": 450 }, { "ce_loss_13": 3.290696454048157, "ce_loss_17": 3.1533886909484865, "ce_loss_2": 4.71325581073761, "ce_loss_4": 4.19888790845871, "ce_loss_9": 3.5651607751846313, "epoch": 0.046, "grad_norm": 880.0, "kl_loss_13": 292.053125, "kl_loss_2": 3139.4821899414064, "kl_loss_4": 2155.3989196777343, "kl_loss_9": 873.195669555664, "learning_rate": 0.0009967408676742752, "loss": 1584.9984, "step": 460 }, { "ce_loss_13": 3.42132054567337, "ce_loss_17": 3.2914034724235535, "ce_loss_2": 4.811346578598022, "ce_loss_4": 4.294545769691467, "ce_loss_9": 3.687683117389679, "epoch": 0.047, "grad_norm": 1016.0, "kl_loss_13": 294.36837844848634, "kl_loss_2": 3055.811877441406, "kl_loss_4": 2075.969854736328, "kl_loss_9": 855.9110748291016, "learning_rate": 0.0009965575009429006, "loss": 1619.1236, "step": 470 }, { "ce_loss_13": 3.214449155330658, "ce_loss_17": 3.0739150166511537, "ce_loss_2": 4.668636918067932, "ce_loss_4": 4.157142472267151, "ce_loss_9": 3.489282858371735, "epoch": 0.048, "grad_norm": 992.0, "kl_loss_13": 299.16979904174804, "kl_loss_2": 3194.0108154296877, "kl_loss_4": 2204.400537109375, "kl_loss_9": 872.0316680908203, "learning_rate": 0.0009963691338830043, "loss": 1619.6937, "step": 480 }, { "ce_loss_13": 3.30702965259552, "ce_loss_17": 3.1685732007026672, "ce_loss_2": 4.7181963443756105, "ce_loss_4": 4.203646314144135, "ce_loss_9": 3.563032793998718, "epoch": 0.049, "grad_norm": 952.0, "kl_loss_13": 295.1540016174316, "kl_loss_2": 3125.740100097656, "kl_loss_4": 2150.886328125, "kl_loss_9": 842.9466033935547, "learning_rate": 0.0009961757683914405, "loss": 1584.7215, "step": 490 }, { "ce_loss_13": 3.288169574737549, "ce_loss_17": 3.1593953371047974, "ce_loss_2": 4.673777222633362, "ce_loss_4": 4.2005146741867065, "ce_loss_9": 3.58009819984436, "epoch": 0.05, "grad_norm": 968.0, "kl_loss_13": 267.9173278808594, "kl_loss_2": 3055.447961425781, "kl_loss_4": 2151.9576049804687, "kl_loss_9": 897.8867614746093, "learning_rate": 0.0009959774064153978, "loss": 1607.1874, "step": 500 }, { "ce_loss_13": 3.283954107761383, "ce_loss_17": 3.167663061618805, "ce_loss_2": 4.649220967292786, "ce_loss_4": 4.162761688232422, "ce_loss_9": 3.545726478099823, "epoch": 0.051, "grad_norm": 816.0, "kl_loss_13": 255.12116165161132, "kl_loss_2": 2997.7501220703125, "kl_loss_4": 2069.501153564453, "kl_loss_9": 830.93466796875, "learning_rate": 0.0009957740499523787, "loss": 1567.6182, "step": 510 }, { "ce_loss_13": 3.302040684223175, "ce_loss_17": 3.184415805339813, "ce_loss_2": 4.69151737689972, "ce_loss_4": 4.185350131988526, "ce_loss_9": 3.5807156205177306, "epoch": 0.052, "grad_norm": 1000.0, "kl_loss_13": 256.7886077880859, "kl_loss_2": 3014.2888427734374, "kl_loss_4": 2067.243231201172, "kl_loss_9": 843.0293792724609, "learning_rate": 0.0009955657010501807, "loss": 1556.7627, "step": 520 }, { "ce_loss_13": 3.272020387649536, "ce_loss_17": 3.152293634414673, "ce_loss_2": 4.684082174301148, "ce_loss_4": 4.1793334722518924, "ce_loss_9": 3.5509377121925354, "epoch": 0.053, "grad_norm": 828.0, "kl_loss_13": 258.3700881958008, "kl_loss_2": 3078.1129516601563, "kl_loss_4": 2105.8355102539062, "kl_loss_9": 846.9102508544922, "learning_rate": 0.000995352361806875, "loss": 1562.7159, "step": 530 }, { "ce_loss_13": 3.307564854621887, "ce_loss_17": 3.1844978094100953, "ce_loss_2": 4.7047443151473995, "ce_loss_4": 4.182905077934265, "ce_loss_9": 3.5815004229545595, "epoch": 0.054, "grad_norm": 876.0, "kl_loss_13": 263.63003768920896, "kl_loss_2": 3079.06552734375, "kl_loss_4": 2088.055554199219, "kl_loss_9": 847.7167785644531, "learning_rate": 0.0009951340343707852, "loss": 1585.4748, "step": 540 }, { "ce_loss_13": 3.3478440284729003, "ce_loss_17": 3.2363200783729553, "ce_loss_2": 4.756931948661804, "ce_loss_4": 4.248503375053406, "ce_loss_9": 3.626732349395752, "epoch": 0.055, "grad_norm": 820.0, "kl_loss_13": 244.08612976074218, "kl_loss_2": 3046.7291748046873, "kl_loss_4": 2076.1069274902343, "kl_loss_9": 822.4989318847656, "learning_rate": 0.0009949107209404665, "loss": 1561.637, "step": 550 }, { "ce_loss_13": 3.2647992372512817, "ce_loss_17": 3.154092490673065, "ce_loss_2": 4.64266676902771, "ce_loss_4": 4.143053829669952, "ce_loss_9": 3.5433424353599547, "epoch": 0.056, "grad_norm": 880.0, "kl_loss_13": 239.78442001342773, "kl_loss_2": 3010.17568359375, "kl_loss_4": 2059.5078308105467, "kl_loss_9": 820.6934234619141, "learning_rate": 0.0009946824237646824, "loss": 1542.7184, "step": 560 }, { "ce_loss_13": 3.215599799156189, "ce_loss_17": 3.104117822647095, "ce_loss_2": 4.624624919891358, "ce_loss_4": 4.125102794170379, "ce_loss_9": 3.4957406878471375, "epoch": 0.057, "grad_norm": 1024.0, "kl_loss_13": 251.91957931518556, "kl_loss_2": 3072.9482299804686, "kl_loss_4": 2121.170025634766, "kl_loss_9": 845.219482421875, "learning_rate": 0.0009944491451423828, "loss": 1596.3019, "step": 570 }, { "ce_loss_13": 3.2231896996498106, "ce_loss_17": 3.101136314868927, "ce_loss_2": 4.642922592163086, "ce_loss_4": 4.14011332988739, "ce_loss_9": 3.5034247159957888, "epoch": 0.058, "grad_norm": 992.0, "kl_loss_13": 263.1824234008789, "kl_loss_2": 3113.314099121094, "kl_loss_4": 2143.1869995117186, "kl_loss_9": 857.413427734375, "learning_rate": 0.0009942108874226813, "loss": 1566.2489, "step": 580 }, { "ce_loss_13": 3.332329547405243, "ce_loss_17": 3.21410493850708, "ce_loss_2": 4.667682480812073, "ce_loss_4": 4.177564013004303, "ce_loss_9": 3.590143620967865, "epoch": 0.059, "grad_norm": 1072.0, "kl_loss_13": 267.44557037353513, "kl_loss_2": 2937.4343505859374, "kl_loss_4": 2008.2415649414063, "kl_loss_9": 816.2921875, "learning_rate": 0.00099396765300483, "loss": 1498.2856, "step": 590 }, { "ce_loss_13": 3.320378637313843, "ce_loss_17": 3.19181113243103, "ce_loss_2": 4.655848789215088, "ce_loss_4": 4.160718262195587, "ce_loss_9": 3.572845149040222, "epoch": 0.06, "grad_norm": 964.0, "kl_loss_13": 273.50896682739256, "kl_loss_2": 2966.854052734375, "kl_loss_4": 2020.4810424804687, "kl_loss_9": 809.3897918701172, "learning_rate": 0.0009937194443381972, "loss": 1519.6391, "step": 600 }, { "ce_loss_13": 3.3402734875679014, "ce_loss_17": 3.2227625966072084, "ce_loss_2": 4.6513392448425295, "ce_loss_4": 4.157649374008178, "ce_loss_9": 3.5860095381736756, "epoch": 0.061, "grad_norm": 876.0, "kl_loss_13": 261.92719650268555, "kl_loss_2": 2905.961279296875, "kl_loss_4": 1960.33046875, "kl_loss_9": 794.5582885742188, "learning_rate": 0.0009934662639222412, "loss": 1520.3426, "step": 610 }, { "ce_loss_13": 3.288265109062195, "ce_loss_17": 3.1717207074165343, "ce_loss_2": 4.675124382972717, "ce_loss_4": 4.158351671695709, "ce_loss_9": 3.562034714221954, "epoch": 0.062, "grad_norm": 912.0, "kl_loss_13": 257.04343566894534, "kl_loss_2": 3043.36982421875, "kl_loss_4": 2060.909814453125, "kl_loss_9": 833.235791015625, "learning_rate": 0.000993208114306486, "loss": 1531.9012, "step": 620 }, { "ce_loss_13": 3.2093312740325928, "ce_loss_17": 3.093514621257782, "ce_loss_2": 4.610535764694214, "ce_loss_4": 4.084069979190827, "ce_loss_9": 3.490491473674774, "epoch": 0.063, "grad_norm": 1016.0, "kl_loss_13": 250.1993179321289, "kl_loss_2": 3051.593884277344, "kl_loss_4": 2049.0211669921873, "kl_loss_9": 849.7222106933593, "learning_rate": 0.0009929449980904952, "loss": 1506.9313, "step": 630 }, { "ce_loss_13": 3.260781669616699, "ce_loss_17": 3.155267858505249, "ce_loss_2": 4.620536375045776, "ce_loss_4": 4.117581927776337, "ce_loss_9": 3.528477704524994, "epoch": 0.064, "grad_norm": 772.0, "kl_loss_13": 238.38918380737306, "kl_loss_2": 2962.8508911132812, "kl_loss_4": 1999.503125, "kl_loss_9": 807.9630187988281, "learning_rate": 0.0009926769179238466, "loss": 1494.3372, "step": 640 }, { "ce_loss_13": 3.3054434776306154, "ce_loss_17": 3.1935141324996947, "ce_loss_2": 4.657902264595032, "ce_loss_4": 4.158003878593445, "ce_loss_9": 3.5845689177513123, "epoch": 0.065, "grad_norm": 920.0, "kl_loss_13": 245.2348831176758, "kl_loss_2": 2980.8562866210937, "kl_loss_4": 2014.990087890625, "kl_loss_9": 839.8653686523437, "learning_rate": 0.000992403876506104, "loss": 1507.8677, "step": 650 }, { "ce_loss_13": 3.2457976222038267, "ce_loss_17": 3.135643982887268, "ce_loss_2": 4.615095782279968, "ce_loss_4": 4.102451252937317, "ce_loss_9": 3.513416254520416, "epoch": 0.066, "grad_norm": 960.0, "kl_loss_13": 238.21534576416016, "kl_loss_2": 2986.016882324219, "kl_loss_4": 2005.8657775878905, "kl_loss_9": 814.6085174560546, "learning_rate": 0.0009921258765867918, "loss": 1511.3381, "step": 660 }, { "ce_loss_13": 3.2168304562568664, "ce_loss_17": 3.1089659929275513, "ce_loss_2": 4.6113745927810665, "ce_loss_4": 4.100262176990509, "ce_loss_9": 3.486327290534973, "epoch": 0.067, "grad_norm": 944.0, "kl_loss_13": 238.76525421142577, "kl_loss_2": 3070.3208740234377, "kl_loss_4": 2079.151824951172, "kl_loss_9": 816.0104278564453, "learning_rate": 0.0009918429209653662, "loss": 1519.7805, "step": 670 }, { "ce_loss_13": 3.265933096408844, "ce_loss_17": 3.1613794565200806, "ce_loss_2": 4.631187629699707, "ce_loss_4": 4.122865617275238, "ce_loss_9": 3.5264532804489135, "epoch": 0.068, "grad_norm": 868.0, "kl_loss_13": 235.20472564697266, "kl_loss_2": 3013.9812622070312, "kl_loss_4": 2027.5386657714844, "kl_loss_9": 796.5073425292969, "learning_rate": 0.0009915550124911866, "loss": 1487.1905, "step": 680 }, { "ce_loss_13": 3.2715242862701417, "ce_loss_17": 3.164630722999573, "ce_loss_2": 4.612001299858093, "ce_loss_4": 4.128691375255585, "ce_loss_9": 3.5321898460388184, "epoch": 0.069, "grad_norm": 1072.0, "kl_loss_13": 233.58517608642578, "kl_loss_2": 2920.829455566406, "kl_loss_4": 1978.2656311035157, "kl_loss_9": 777.0253204345703, "learning_rate": 0.0009912621540634887, "loss": 1480.1011, "step": 690 }, { "ce_loss_13": 3.302413272857666, "ce_loss_17": 3.2039023637771606, "ce_loss_2": 4.601486611366272, "ce_loss_4": 4.105905914306641, "ce_loss_9": 3.5536171793937683, "epoch": 0.07, "grad_norm": 780.0, "kl_loss_13": 218.9342254638672, "kl_loss_2": 2862.6760375976564, "kl_loss_4": 1910.033221435547, "kl_loss_9": 770.5138702392578, "learning_rate": 0.0009909643486313534, "loss": 1457.1656, "step": 700 }, { "ce_loss_13": 3.191205418109894, "ce_loss_17": 3.089118230342865, "ce_loss_2": 4.562660980224609, "ce_loss_4": 4.043074250221252, "ce_loss_9": 3.4587977528572083, "epoch": 0.071, "grad_norm": 924.0, "kl_loss_13": 228.60917053222656, "kl_loss_2": 2996.6495727539063, "kl_loss_4": 2004.634375, "kl_loss_9": 783.8835723876953, "learning_rate": 0.000990661599193678, "loss": 1524.3539, "step": 710 }, { "ce_loss_13": 3.3127217531204223, "ce_loss_17": 3.209806263446808, "ce_loss_2": 4.624217081069946, "ce_loss_4": 4.1476091027259825, "ce_loss_9": 3.567340886592865, "epoch": 0.072, "grad_norm": 892.0, "kl_loss_13": 229.1813949584961, "kl_loss_2": 2889.9541259765624, "kl_loss_4": 1959.398016357422, "kl_loss_9": 771.8566009521485, "learning_rate": 0.0009903539087991462, "loss": 1464.3682, "step": 720 }, { "ce_loss_13": 3.291123640537262, "ce_loss_17": 3.188254976272583, "ce_loss_2": 4.6128562688827515, "ce_loss_4": 4.119845390319824, "ce_loss_9": 3.5449374675750733, "epoch": 0.073, "grad_norm": 936.0, "kl_loss_13": 228.41663208007813, "kl_loss_2": 2910.5075927734374, "kl_loss_4": 1964.818048095703, "kl_loss_9": 783.3948699951172, "learning_rate": 0.0009900412805461966, "loss": 1474.468, "step": 730 }, { "ce_loss_13": 3.3584244012832642, "ce_loss_17": 3.2603103876113892, "ce_loss_2": 4.6467427730560305, "ce_loss_4": 4.159996581077576, "ce_loss_9": 3.605022168159485, "epoch": 0.074, "grad_norm": 1376.0, "kl_loss_13": 219.20079040527344, "kl_loss_2": 2843.9935668945313, "kl_loss_4": 1912.9739196777343, "kl_loss_9": 757.825796508789, "learning_rate": 0.0009897237175829927, "loss": 1456.7439, "step": 740 }, { "ce_loss_13": 3.255482721328735, "ce_loss_17": 3.1463262915611265, "ce_loss_2": 4.585414028167724, "ce_loss_4": 4.092055261135101, "ce_loss_9": 3.502428936958313, "epoch": 0.075, "grad_norm": 1056.0, "kl_loss_13": 236.42009735107422, "kl_loss_2": 2941.9501098632813, "kl_loss_4": 1989.1080749511718, "kl_loss_9": 775.6414337158203, "learning_rate": 0.0009894012231073895, "loss": 1471.8316, "step": 750 }, { "ce_loss_13": 3.3025878787040712, "ce_loss_17": 3.1921103239059447, "ce_loss_2": 4.6062129735946655, "ce_loss_4": 4.113703978061676, "ce_loss_9": 3.5406395554542542, "epoch": 0.076, "grad_norm": 920.0, "kl_loss_13": 245.76659927368163, "kl_loss_2": 2878.5973266601563, "kl_loss_4": 1939.0380126953125, "kl_loss_9": 759.0526550292968, "learning_rate": 0.0009890738003669028, "loss": 1468.4375, "step": 760 }, { "ce_loss_13": 3.2711366295814512, "ce_loss_17": 3.1641719222068785, "ce_loss_2": 4.614434313774109, "ce_loss_4": 4.1120974779129025, "ce_loss_9": 3.525176227092743, "epoch": 0.077, "grad_norm": 860.0, "kl_loss_13": 242.79741363525392, "kl_loss_2": 2964.690075683594, "kl_loss_4": 2004.9281494140625, "kl_loss_9": 786.4657104492187, "learning_rate": 0.0009887414526586764, "loss": 1457.9067, "step": 770 }, { "ce_loss_13": 3.3219969630241395, "ce_loss_17": 3.2179538726806642, "ce_loss_2": 4.629001760482788, "ce_loss_4": 4.129860985279083, "ce_loss_9": 3.566312837600708, "epoch": 0.078, "grad_norm": 864.0, "kl_loss_13": 227.92978286743164, "kl_loss_2": 2875.2292724609374, "kl_loss_4": 1906.8374938964844, "kl_loss_9": 761.5834991455079, "learning_rate": 0.0009884041833294476, "loss": 1422.2779, "step": 780 }, { "ce_loss_13": 3.319532239437103, "ce_loss_17": 3.219510018825531, "ce_loss_2": 4.604642176628113, "ce_loss_4": 4.112042582035064, "ce_loss_9": 3.5662195682525635, "epoch": 0.079, "grad_norm": 980.0, "kl_loss_13": 222.00338287353514, "kl_loss_2": 2833.296484375, "kl_loss_4": 1881.948016357422, "kl_loss_9": 754.9702026367188, "learning_rate": 0.000988061995775515, "loss": 1460.5127, "step": 790 }, { "ce_loss_13": 3.2566551208496093, "ce_loss_17": 3.1524986267089843, "ce_loss_2": 4.52668297290802, "ce_loss_4": 4.057033574581146, "ce_loss_9": 3.509914147853851, "epoch": 0.08, "grad_norm": 912.0, "kl_loss_13": 227.21118698120117, "kl_loss_2": 2832.344384765625, "kl_loss_4": 1919.4224548339844, "kl_loss_9": 769.3831726074219, "learning_rate": 0.0009877148934427035, "loss": 1436.4799, "step": 800 }, { "ce_loss_13": 3.30249844789505, "ce_loss_17": 3.1899492502212525, "ce_loss_2": 4.605924224853515, "ce_loss_4": 4.09448412656784, "ce_loss_9": 3.5394587874412538, "epoch": 0.081, "grad_norm": 1040.0, "kl_loss_13": 236.81161499023438, "kl_loss_2": 2882.2568603515624, "kl_loss_4": 1907.9080749511718, "kl_loss_9": 756.8568481445312, "learning_rate": 0.0009873628798263297, "loss": 1430.0584, "step": 810 }, { "ce_loss_13": 3.2553959012031557, "ce_loss_17": 3.1530211329460145, "ce_loss_2": 4.527966475486755, "ce_loss_4": 4.025527036190033, "ce_loss_9": 3.4859420537948607, "epoch": 0.082, "grad_norm": 848.0, "kl_loss_13": 244.30451049804688, "kl_loss_2": 2814.4177001953126, "kl_loss_4": 1863.2796508789063, "kl_loss_9": 741.8500396728516, "learning_rate": 0.0009870059584711668, "loss": 1453.7934, "step": 820 }, { "ce_loss_13": 3.2679091691970825, "ce_loss_17": 3.160526216030121, "ce_loss_2": 4.546485543251038, "ce_loss_4": 4.062058734893799, "ce_loss_9": 3.5079483866691588, "epoch": 0.083, "grad_norm": 908.0, "kl_loss_13": 229.13085861206054, "kl_loss_2": 2824.4812744140627, "kl_loss_4": 1893.74443359375, "kl_loss_9": 743.4009490966797, "learning_rate": 0.000986644132971409, "loss": 1421.9037, "step": 830 }, { "ce_loss_13": 3.2508782505989076, "ce_loss_17": 3.147918689250946, "ce_loss_2": 4.566097235679626, "ce_loss_4": 4.078214037418365, "ce_loss_9": 3.5056838035583495, "epoch": 0.084, "grad_norm": 816.0, "kl_loss_13": 226.18180313110352, "kl_loss_2": 2877.771838378906, "kl_loss_4": 1940.1273559570313, "kl_loss_9": 762.095327758789, "learning_rate": 0.0009862774069706345, "loss": 1436.819, "step": 840 }, { "ce_loss_13": 3.374106287956238, "ce_loss_17": 3.2794013023376465, "ce_loss_2": 4.61131522655487, "ce_loss_4": 4.138250291347504, "ce_loss_9": 3.6066044330596925, "epoch": 0.085, "grad_norm": 1096.0, "kl_loss_13": 214.66336822509766, "kl_loss_2": 2766.5848754882813, "kl_loss_4": 1856.3738220214843, "kl_loss_9": 738.8373840332031, "learning_rate": 0.000985905784161771, "loss": 1412.0915, "step": 850 }, { "ce_loss_13": 3.301159071922302, "ce_loss_17": 3.2023780703544618, "ce_loss_2": 4.56342580318451, "ce_loss_4": 4.066360580921173, "ce_loss_9": 3.535403859615326, "epoch": 0.086, "grad_norm": 852.0, "kl_loss_13": 215.81713256835937, "kl_loss_2": 2779.8074462890627, "kl_loss_4": 1835.9205078125, "kl_loss_9": 723.7096923828125, "learning_rate": 0.000985529268287055, "loss": 1390.6165, "step": 860 }, { "ce_loss_13": 3.2372225642204286, "ce_loss_17": 3.1359041333198547, "ce_loss_2": 4.546686887741089, "ce_loss_4": 4.038998281955719, "ce_loss_9": 3.4763866543769835, "epoch": 0.087, "grad_norm": 1208.0, "kl_loss_13": 223.39121017456054, "kl_loss_2": 2862.9194946289062, "kl_loss_4": 1890.7715881347656, "kl_loss_9": 740.996142578125, "learning_rate": 0.0009851478631379982, "loss": 1429.6895, "step": 870 }, { "ce_loss_13": 3.301837408542633, "ce_loss_17": 3.195950448513031, "ce_loss_2": 4.575394034385681, "ce_loss_4": 4.080995976924896, "ce_loss_9": 3.5355889201164246, "epoch": 0.088, "grad_norm": 992.0, "kl_loss_13": 227.06667404174806, "kl_loss_2": 2819.738562011719, "kl_loss_4": 1853.1929748535156, "kl_loss_9": 728.0207733154297, "learning_rate": 0.0009847615725553456, "loss": 1405.1773, "step": 880 }, { "ce_loss_13": 3.3436193823814393, "ce_loss_17": 3.2495802879333495, "ce_loss_2": 4.556550931930542, "ce_loss_4": 4.090126383304596, "ce_loss_9": 3.563624370098114, "epoch": 0.089, "grad_norm": 976.0, "kl_loss_13": 211.11851119995117, "kl_loss_2": 2669.558203125, "kl_loss_4": 1778.830877685547, "kl_loss_9": 693.2171966552735, "learning_rate": 0.0009843704004290394, "loss": 1391.1742, "step": 890 }, { "ce_loss_13": 3.2571977376937866, "ce_loss_17": 3.1573795080184937, "ce_loss_2": 4.524980092048645, "ce_loss_4": 4.044141805171966, "ce_loss_9": 3.494728994369507, "epoch": 0.09, "grad_norm": 776.0, "kl_loss_13": 220.56846084594727, "kl_loss_2": 2824.4695434570312, "kl_loss_4": 1900.4137573242188, "kl_loss_9": 740.662060546875, "learning_rate": 0.0009839743506981783, "loss": 1412.9612, "step": 900 }, { "ce_loss_13": 3.1822590231895447, "ce_loss_17": 3.082024037837982, "ce_loss_2": 4.517245936393738, "ce_loss_4": 4.0128992080688475, "ce_loss_9": 3.4349818229675293, "epoch": 0.091, "grad_norm": 932.0, "kl_loss_13": 218.17855911254884, "kl_loss_2": 2951.1318237304686, "kl_loss_4": 1967.5721313476563, "kl_loss_9": 775.0937805175781, "learning_rate": 0.0009835734273509786, "loss": 1442.8287, "step": 910 }, { "ce_loss_13": 3.268250608444214, "ce_loss_17": 3.170508885383606, "ce_loss_2": 4.571528077125549, "ce_loss_4": 4.059476625919342, "ce_loss_9": 3.520617055892944, "epoch": 0.092, "grad_norm": 988.0, "kl_loss_13": 211.56254806518555, "kl_loss_2": 2825.3916015625, "kl_loss_4": 1850.3968017578125, "kl_loss_9": 745.2472381591797, "learning_rate": 0.0009831676344247342, "loss": 1407.3197, "step": 920 }, { "ce_loss_13": 3.284067964553833, "ce_loss_17": 3.198581850528717, "ce_loss_2": 4.525421953201294, "ce_loss_4": 4.044948315620422, "ce_loss_9": 3.5254756212234497, "epoch": 0.093, "grad_norm": 840.0, "kl_loss_13": 202.3430076599121, "kl_loss_2": 2736.1406127929686, "kl_loss_4": 1813.5325988769532, "kl_loss_9": 718.036279296875, "learning_rate": 0.0009827569760057755, "loss": 1393.5624, "step": 930 }, { "ce_loss_13": 3.2077683806419373, "ce_loss_17": 3.1096729755401613, "ce_loss_2": 4.547787547111511, "ce_loss_4": 4.033227443695068, "ce_loss_9": 3.469490075111389, "epoch": 0.094, "grad_norm": 1152.0, "kl_loss_13": 220.52874832153321, "kl_loss_2": 2927.2427734375, "kl_loss_4": 1945.5797485351563, "kl_loss_9": 767.0525177001953, "learning_rate": 0.000982341456229428, "loss": 1420.0414, "step": 940 }, { "ce_loss_13": 3.299692213535309, "ce_loss_17": 3.202546167373657, "ce_loss_2": 4.581137084960938, "ce_loss_4": 4.095329129695893, "ce_loss_9": 3.5443032503128054, "epoch": 0.095, "grad_norm": 936.0, "kl_loss_13": 213.6924285888672, "kl_loss_2": 2836.0372680664063, "kl_loss_4": 1892.16591796875, "kl_loss_9": 754.620736694336, "learning_rate": 0.000981921079279971, "loss": 1391.9723, "step": 950 }, { "ce_loss_13": 3.3109747886657717, "ce_loss_17": 3.2198655366897584, "ce_loss_2": 4.50907678604126, "ce_loss_4": 4.047967004776001, "ce_loss_9": 3.5344821929931642, "epoch": 0.096, "grad_norm": 960.0, "kl_loss_13": 203.3304130554199, "kl_loss_2": 2685.2383666992187, "kl_loss_4": 1790.3744873046876, "kl_loss_9": 709.4701110839844, "learning_rate": 0.0009814958493905962, "loss": 1362.8317, "step": 960 }, { "ce_loss_13": 3.2681641817092895, "ce_loss_17": 3.174164628982544, "ce_loss_2": 4.558875513076782, "ce_loss_4": 4.060932624340057, "ce_loss_9": 3.5090798854827883, "epoch": 0.097, "grad_norm": 952.0, "kl_loss_13": 208.82522888183593, "kl_loss_2": 2827.6786987304686, "kl_loss_4": 1876.4476135253906, "kl_loss_9": 732.2768280029297, "learning_rate": 0.0009810657708433637, "loss": 1429.1148, "step": 970 }, { "ce_loss_13": 3.3462990522384644, "ce_loss_17": 3.2505513191223145, "ce_loss_2": 4.540898418426513, "ce_loss_4": 4.076681923866272, "ce_loss_9": 3.56636620759964, "epoch": 0.098, "grad_norm": 1056.0, "kl_loss_13": 209.25713043212892, "kl_loss_2": 2658.92119140625, "kl_loss_4": 1770.1000732421876, "kl_loss_9": 703.7161193847656, "learning_rate": 0.0009806308479691594, "loss": 1351.686, "step": 980 }, { "ce_loss_13": 3.377233922481537, "ce_loss_17": 3.2492746710777283, "ce_loss_2": 4.594968247413635, "ce_loss_4": 4.129810321331024, "ce_loss_9": 3.5848703742027284, "epoch": 0.099, "grad_norm": 1096.0, "kl_loss_13": 287.80250854492186, "kl_loss_2": 2762.0632080078126, "kl_loss_4": 1856.1412292480468, "kl_loss_9": 730.6600799560547, "learning_rate": 0.0009801910851476522, "loss": 1402.8809, "step": 990 }, { "ce_loss_13": 3.305970883369446, "ce_loss_17": 3.176001238822937, "ce_loss_2": 4.572340750694275, "ce_loss_4": 4.071510076522827, "ce_loss_9": 3.5081840991973876, "epoch": 0.1, "grad_norm": 964.0, "kl_loss_13": 284.42375030517576, "kl_loss_2": 2872.3831176757812, "kl_loss_4": 1909.947979736328, "kl_loss_9": 739.5316497802735, "learning_rate": 0.0009797464868072487, "loss": 1414.8691, "step": 1000 }, { "ce_loss_13": 3.279418241977692, "ce_loss_17": 3.1654094457626343, "ce_loss_2": 4.518069314956665, "ce_loss_4": 4.02565997838974, "ce_loss_9": 3.490999865531921, "epoch": 0.101, "grad_norm": 1128.0, "kl_loss_13": 258.2674430847168, "kl_loss_2": 2788.8846801757813, "kl_loss_4": 1847.487353515625, "kl_loss_9": 731.5081329345703, "learning_rate": 0.0009792970574250492, "loss": 1400.6663, "step": 1010 }, { "ce_loss_13": 3.292120933532715, "ce_loss_17": 3.1848495960235597, "ce_loss_2": 4.530604815483093, "ce_loss_4": 4.046932804584503, "ce_loss_9": 3.510738480091095, "epoch": 0.102, "grad_norm": 868.0, "kl_loss_13": 234.31045532226562, "kl_loss_2": 2766.3355102539062, "kl_loss_4": 1832.100927734375, "kl_loss_9": 711.1432739257813, "learning_rate": 0.0009788428015268028, "loss": 1365.3713, "step": 1020 }, { "ce_loss_13": 3.2847746133804323, "ce_loss_17": 3.190233790874481, "ce_loss_2": 4.518140530586242, "ce_loss_4": 4.030335605144501, "ce_loss_9": 3.5054259061813355, "epoch": 0.103, "grad_norm": 1040.0, "kl_loss_13": 215.07061233520508, "kl_loss_2": 2737.9119262695312, "kl_loss_4": 1803.1291137695312, "kl_loss_9": 703.9826507568359, "learning_rate": 0.0009783837236868609, "loss": 1360.594, "step": 1030 }, { "ce_loss_13": 3.2488949418067934, "ce_loss_17": 3.153828501701355, "ce_loss_2": 4.49981541633606, "ce_loss_4": 4.011671948432922, "ce_loss_9": 3.482884335517883, "epoch": 0.104, "grad_norm": 1080.0, "kl_loss_13": 211.31450119018555, "kl_loss_2": 2745.953894042969, "kl_loss_4": 1809.6199462890625, "kl_loss_9": 721.9542236328125, "learning_rate": 0.0009779198285281327, "loss": 1366.3518, "step": 1040 }, { "ce_loss_13": 3.2448183059692384, "ce_loss_17": 3.1531440377235413, "ce_loss_2": 4.519461226463318, "ce_loss_4": 4.017684829235077, "ce_loss_9": 3.47557373046875, "epoch": 0.105, "grad_norm": 840.0, "kl_loss_13": 202.99858474731445, "kl_loss_2": 2811.4517211914062, "kl_loss_4": 1840.135516357422, "kl_loss_9": 713.690560913086, "learning_rate": 0.0009774511207220368, "loss": 1382.9094, "step": 1050 }, { "ce_loss_13": 3.284769284725189, "ce_loss_17": 3.1939712166786194, "ce_loss_2": 4.555031633377075, "ce_loss_4": 4.041078901290893, "ce_loss_9": 3.515140450000763, "epoch": 0.106, "grad_norm": 796.0, "kl_loss_13": 203.96903915405272, "kl_loss_2": 2777.827502441406, "kl_loss_4": 1801.6379455566407, "kl_loss_9": 712.6347808837891, "learning_rate": 0.0009769776049884564, "loss": 1367.0195, "step": 1060 }, { "ce_loss_13": 3.198607552051544, "ce_loss_17": 3.1053199768066406, "ce_loss_2": 4.501138353347779, "ce_loss_4": 3.98526394367218, "ce_loss_9": 3.439184379577637, "epoch": 0.107, "grad_norm": 1144.0, "kl_loss_13": 206.50607376098634, "kl_loss_2": 2858.7117431640627, "kl_loss_4": 1866.1358337402344, "kl_loss_9": 723.7725769042969, "learning_rate": 0.0009764992860956889, "loss": 1417.184, "step": 1070 }, { "ce_loss_13": 3.3383467197418213, "ce_loss_17": 3.255633223056793, "ce_loss_2": 4.514598083496094, "ce_loss_4": 4.0518736124038695, "ce_loss_9": 3.558040511608124, "epoch": 0.108, "grad_norm": 996.0, "kl_loss_13": 193.55395431518554, "kl_loss_2": 2616.440319824219, "kl_loss_4": 1720.3406494140625, "kl_loss_9": 673.8594696044922, "learning_rate": 0.0009760161688604008, "loss": 1329.9379, "step": 1080 }, { "ce_loss_13": 3.345517909526825, "ce_loss_17": 3.254804825782776, "ce_loss_2": 4.578144526481628, "ce_loss_4": 4.08804349899292, "ce_loss_9": 3.5734084129333494, "epoch": 0.109, "grad_norm": 948.0, "kl_loss_13": 200.05121688842775, "kl_loss_2": 2710.793176269531, "kl_loss_4": 1770.9658569335938, "kl_loss_9": 703.3856597900391, "learning_rate": 0.0009755282581475768, "loss": 1360.3123, "step": 1090 }, { "ce_loss_13": 3.385720765590668, "ce_loss_17": 3.29618159532547, "ce_loss_2": 4.5968094110488895, "ce_loss_4": 4.108812022209167, "ce_loss_9": 3.607908749580383, "epoch": 0.11, "grad_norm": 856.0, "kl_loss_13": 203.5487907409668, "kl_loss_2": 2677.9612060546874, "kl_loss_4": 1750.08759765625, "kl_loss_9": 700.3291900634765, "learning_rate": 0.0009750355588704727, "loss": 1326.8827, "step": 1100 }, { "ce_loss_13": 3.235057520866394, "ce_loss_17": 3.1465519428253175, "ce_loss_2": 4.4831276655197145, "ce_loss_4": 3.9889507055282594, "ce_loss_9": 3.4614131808280946, "epoch": 0.111, "grad_norm": 996.0, "kl_loss_13": 200.0942596435547, "kl_loss_2": 2727.8265014648437, "kl_loss_4": 1791.7176391601563, "kl_loss_9": 693.8158874511719, "learning_rate": 0.0009745380759905647, "loss": 1377.6648, "step": 1110 }, { "ce_loss_13": 3.1875979781150816, "ce_loss_17": 3.1030888676643373, "ce_loss_2": 4.451004838943481, "ce_loss_4": 3.9573415875434876, "ce_loss_9": 3.4160804986953734, "epoch": 0.112, "grad_norm": 1360.0, "kl_loss_13": 196.68862838745116, "kl_loss_2": 2778.4487915039062, "kl_loss_4": 1825.51240234375, "kl_loss_9": 697.3607025146484, "learning_rate": 0.0009740358145174998, "loss": 1403.6008, "step": 1120 }, { "ce_loss_13": 3.3286696791648867, "ce_loss_17": 3.2423221588134767, "ce_loss_2": 4.50739552974701, "ce_loss_4": 4.037946367263794, "ce_loss_9": 3.554660737514496, "epoch": 0.113, "grad_norm": 1032.0, "kl_loss_13": 195.9855224609375, "kl_loss_2": 2627.1833740234374, "kl_loss_4": 1724.3111389160156, "kl_loss_9": 699.7449737548828, "learning_rate": 0.0009735287795090455, "loss": 1332.7893, "step": 1130 }, { "ce_loss_13": 3.2233405351638793, "ce_loss_17": 3.1406787514686583, "ce_loss_2": 4.4660379648208615, "ce_loss_4": 3.9776986479759215, "ce_loss_9": 3.463868188858032, "epoch": 0.114, "grad_norm": 1200.0, "kl_loss_13": 191.02578506469726, "kl_loss_2": 2727.4050537109374, "kl_loss_4": 1799.5895446777345, "kl_loss_9": 709.8186279296875, "learning_rate": 0.0009730169760710386, "loss": 1347.5334, "step": 1140 }, { "ce_loss_13": 3.2993661642074583, "ce_loss_17": 3.214020645618439, "ce_loss_2": 4.524878191947937, "ce_loss_4": 4.0486938714981076, "ce_loss_9": 3.532728040218353, "epoch": 0.115, "grad_norm": 1168.0, "kl_loss_13": 195.39532318115235, "kl_loss_2": 2671.4237548828123, "kl_loss_4": 1764.889630126953, "kl_loss_9": 701.7526702880859, "learning_rate": 0.0009725004093573342, "loss": 1348.1705, "step": 1150 }, { "ce_loss_13": 3.2519322514533995, "ce_loss_17": 3.1604103565216066, "ce_loss_2": 4.47962064743042, "ce_loss_4": 3.9971869707107546, "ce_loss_9": 3.49699524641037, "epoch": 0.116, "grad_norm": 1056.0, "kl_loss_13": 204.54513626098634, "kl_loss_2": 2678.934289550781, "kl_loss_4": 1748.5664611816405, "kl_loss_9": 720.3613555908203, "learning_rate": 0.0009719790845697534, "loss": 1332.2166, "step": 1160 }, { "ce_loss_13": 3.2073302984237673, "ce_loss_17": 3.1191227197647096, "ce_loss_2": 4.389339399337769, "ce_loss_4": 3.9196030497550964, "ce_loss_9": 3.426686441898346, "epoch": 0.117, "grad_norm": 1128.0, "kl_loss_13": 200.11859588623048, "kl_loss_2": 2624.1080932617188, "kl_loss_4": 1717.0731872558595, "kl_loss_9": 687.8316497802734, "learning_rate": 0.0009714530069580309, "loss": 1311.7604, "step": 1170 }, { "ce_loss_13": 3.294953465461731, "ce_loss_17": 3.2026371002197265, "ce_loss_2": 4.513879489898682, "ce_loss_4": 4.039666080474854, "ce_loss_9": 3.538236165046692, "epoch": 0.118, "grad_norm": 1168.0, "kl_loss_13": 204.21649322509765, "kl_loss_2": 2670.8010986328127, "kl_loss_4": 1764.6788391113282, "kl_loss_9": 733.3729858398438, "learning_rate": 0.0009709221818197624, "loss": 1337.2672, "step": 1180 }, { "ce_loss_13": 3.331338405609131, "ce_loss_17": 3.242104744911194, "ce_loss_2": 4.561363887786865, "ce_loss_4": 4.074220752716064, "ce_loss_9": 3.569767677783966, "epoch": 0.119, "grad_norm": 1088.0, "kl_loss_13": 199.29565811157227, "kl_loss_2": 2707.6415405273438, "kl_loss_4": 1771.7481811523437, "kl_loss_9": 721.1656341552734, "learning_rate": 0.0009703866145003512, "loss": 1348.8243, "step": 1190 }, { "ce_loss_13": 3.303205907344818, "ce_loss_17": 3.2188957929611206, "ce_loss_2": 4.495118236541748, "ce_loss_4": 4.03345410823822, "ce_loss_9": 3.5354658484458925, "epoch": 0.12, "grad_norm": 1208.0, "kl_loss_13": 193.17132644653321, "kl_loss_2": 2659.7258544921874, "kl_loss_4": 1753.0074768066406, "kl_loss_9": 708.479605102539, "learning_rate": 0.0009698463103929542, "loss": 1348.5676, "step": 1200 }, { "ce_loss_13": 3.266885006427765, "ce_loss_17": 3.1778912782669066, "ce_loss_2": 4.48817548751831, "ce_loss_4": 4.0104964017868046, "ce_loss_9": 3.4941314339637755, "epoch": 0.121, "grad_norm": 844.0, "kl_loss_13": 199.55910415649413, "kl_loss_2": 2693.4548461914064, "kl_loss_4": 1777.9077026367188, "kl_loss_9": 703.0745422363282, "learning_rate": 0.0009693012749384279, "loss": 1353.958, "step": 1210 }, { "ce_loss_13": 3.2818835616111754, "ce_loss_17": 3.191451632976532, "ce_loss_2": 4.476631903648377, "ce_loss_4": 4.026136028766632, "ce_loss_9": 3.5077857851982115, "epoch": 0.122, "grad_norm": 948.0, "kl_loss_13": 196.7908737182617, "kl_loss_2": 2652.621923828125, "kl_loss_4": 1780.204296875, "kl_loss_9": 696.9551300048828, "learning_rate": 0.0009687515136252732, "loss": 1325.3814, "step": 1220 }, { "ce_loss_13": 3.2415047764778135, "ce_loss_17": 3.1540622234344484, "ce_loss_2": 4.509912896156311, "ce_loss_4": 4.011622071266174, "ce_loss_9": 3.477183485031128, "epoch": 0.123, "grad_norm": 1104.0, "kl_loss_13": 197.74361038208008, "kl_loss_2": 2791.901171875, "kl_loss_4": 1838.36376953125, "kl_loss_9": 718.2526062011718, "learning_rate": 0.0009681970319895803, "loss": 1399.8516, "step": 1230 }, { "ce_loss_13": 3.317516529560089, "ce_loss_17": 3.235230004787445, "ce_loss_2": 4.5138836145401005, "ce_loss_4": 4.046058464050293, "ce_loss_9": 3.5491626501083373, "epoch": 0.124, "grad_norm": 844.0, "kl_loss_13": 194.59980850219728, "kl_loss_2": 2645.33642578125, "kl_loss_4": 1737.356427001953, "kl_loss_9": 688.3965637207032, "learning_rate": 0.0009676378356149733, "loss": 1318.4164, "step": 1240 }, { "ce_loss_13": 3.287678062915802, "ce_loss_17": 3.200864279270172, "ce_loss_2": 4.472741651535034, "ce_loss_4": 3.987990975379944, "ce_loss_9": 3.4946972370147704, "epoch": 0.125, "grad_norm": 1240.0, "kl_loss_13": 196.19946823120117, "kl_loss_2": 2622.074609375, "kl_loss_4": 1703.4868408203124, "kl_loss_9": 665.4978210449219, "learning_rate": 0.0009670739301325534, "loss": 1304.3303, "step": 1250 }, { "ce_loss_13": 3.251143455505371, "ce_loss_17": 3.161188006401062, "ce_loss_2": 4.430971789360046, "ce_loss_4": 3.9714604020118713, "ce_loss_9": 3.4733884215354918, "epoch": 0.126, "grad_norm": 812.0, "kl_loss_13": 200.71605911254883, "kl_loss_2": 2606.991357421875, "kl_loss_4": 1723.56396484375, "kl_loss_9": 684.7825775146484, "learning_rate": 0.0009665053212208426, "loss": 1324.8665, "step": 1260 }, { "ce_loss_13": 3.286880683898926, "ce_loss_17": 3.204228329658508, "ce_loss_2": 4.4972593784332275, "ce_loss_4": 4.0207718968391415, "ce_loss_9": 3.510022056102753, "epoch": 0.127, "grad_norm": 1192.0, "kl_loss_13": 196.8276824951172, "kl_loss_2": 2681.8605346679688, "kl_loss_4": 1758.460205078125, "kl_loss_9": 694.0363739013671, "learning_rate": 0.0009659320146057262, "loss": 1329.1243, "step": 1270 }, { "ce_loss_13": 3.29819837808609, "ce_loss_17": 3.2135840892791747, "ce_loss_2": 4.470524191856384, "ce_loss_4": 4.009433555603027, "ce_loss_9": 3.5127774357795714, "epoch": 0.128, "grad_norm": 916.0, "kl_loss_13": 199.30388717651368, "kl_loss_2": 2607.261511230469, "kl_loss_4": 1717.1849304199218, "kl_loss_9": 668.7663299560547, "learning_rate": 0.0009653540160603955, "loss": 1306.6781, "step": 1280 }, { "ce_loss_13": 3.309871256351471, "ce_loss_17": 3.221308135986328, "ce_loss_2": 4.4761080026626585, "ce_loss_4": 4.0116840362548825, "ce_loss_9": 3.5193127393722534, "epoch": 0.129, "grad_norm": 936.0, "kl_loss_13": 207.4881805419922, "kl_loss_2": 2632.054235839844, "kl_loss_4": 1726.5696716308594, "kl_loss_9": 670.4268432617188, "learning_rate": 0.0009647713314052896, "loss": 1296.7373, "step": 1290 }, { "ce_loss_13": 3.2544073343276976, "ce_loss_17": 3.163151001930237, "ce_loss_2": 4.49944953918457, "ce_loss_4": 4.019238436222077, "ce_loss_9": 3.4850263476371763, "epoch": 0.13, "grad_norm": 1024.0, "kl_loss_13": 204.04417343139647, "kl_loss_2": 2748.844836425781, "kl_loss_4": 1805.3862182617188, "kl_loss_9": 699.5055114746094, "learning_rate": 0.0009641839665080363, "loss": 1349.0986, "step": 1300 }, { "ce_loss_13": 3.217115592956543, "ce_loss_17": 3.1326871633529665, "ce_loss_2": 4.433155751228332, "ce_loss_4": 3.949730694293976, "ce_loss_9": 3.437587261199951, "epoch": 0.131, "grad_norm": 1144.0, "kl_loss_13": 193.7048141479492, "kl_loss_2": 2677.2375, "kl_loss_4": 1743.4517639160156, "kl_loss_9": 682.1249816894531, "learning_rate": 0.0009635919272833937, "loss": 1310.1549, "step": 1310 }, { "ce_loss_13": 3.244895505905151, "ce_loss_17": 3.1582866668701173, "ce_loss_2": 4.4647527694702145, "ce_loss_4": 3.9752745628356934, "ce_loss_9": 3.4737863063812258, "epoch": 0.132, "grad_norm": 1088.0, "kl_loss_13": 196.25409851074218, "kl_loss_2": 2650.9760620117186, "kl_loss_4": 1716.925469970703, "kl_loss_9": 687.0598754882812, "learning_rate": 0.0009629952196931902, "loss": 1290.0225, "step": 1320 }, { "ce_loss_13": 3.241357922554016, "ce_loss_17": 3.1556437611579895, "ce_loss_2": 4.449398493766784, "ce_loss_4": 3.9562542915344237, "ce_loss_9": 3.4533059120178224, "epoch": 0.133, "grad_norm": 816.0, "kl_loss_13": 196.67173233032227, "kl_loss_2": 2673.589306640625, "kl_loss_4": 1732.5205444335938, "kl_loss_9": 672.7849487304687, "learning_rate": 0.0009623938497462645, "loss": 1309.2213, "step": 1330 }, { "ce_loss_13": 3.231060814857483, "ce_loss_17": 3.1450219631195067, "ce_loss_2": 4.431920719146729, "ce_loss_4": 3.9562933325767515, "ce_loss_9": 3.445254957675934, "epoch": 0.134, "grad_norm": 936.0, "kl_loss_13": 197.01858825683593, "kl_loss_2": 2659.0852783203127, "kl_loss_4": 1747.1570068359374, "kl_loss_9": 674.9324920654296, "learning_rate": 0.0009617878234984055, "loss": 1324.4643, "step": 1340 }, { "ce_loss_13": 3.315055227279663, "ce_loss_17": 3.234802782535553, "ce_loss_2": 4.477152037620544, "ce_loss_4": 4.004279613494873, "ce_loss_9": 3.5192012786865234, "epoch": 0.135, "grad_norm": 1312.0, "kl_loss_13": 186.29926528930665, "kl_loss_2": 2561.1720581054688, "kl_loss_4": 1659.4882568359376, "kl_loss_9": 640.2065704345703, "learning_rate": 0.0009611771470522907, "loss": 1284.5598, "step": 1350 }, { "ce_loss_13": 3.2487470388412474, "ce_loss_17": 3.1640228986740113, "ce_loss_2": 4.461760711669922, "ce_loss_4": 3.9678382754325865, "ce_loss_9": 3.4665634632110596, "epoch": 0.136, "grad_norm": 912.0, "kl_loss_13": 187.84272689819335, "kl_loss_2": 2642.635705566406, "kl_loss_4": 1698.8799194335938, "kl_loss_9": 653.8625396728515, "learning_rate": 0.0009605618265574251, "loss": 1280.1448, "step": 1360 }, { "ce_loss_13": 3.214472460746765, "ce_loss_17": 3.1299581646919252, "ce_loss_2": 4.4454809665679935, "ce_loss_4": 3.9873968601226806, "ce_loss_9": 3.4357502818107606, "epoch": 0.137, "grad_norm": 768.0, "kl_loss_13": 192.21915664672852, "kl_loss_2": 2723.652941894531, "kl_loss_4": 1827.2098266601563, "kl_loss_9": 683.973178100586, "learning_rate": 0.0009599418682100792, "loss": 1329.4066, "step": 1370 }, { "ce_loss_13": 3.253819763660431, "ce_loss_17": 3.173033118247986, "ce_loss_2": 4.453915023803711, "ce_loss_4": 3.9669384598731994, "ce_loss_9": 3.473034620285034, "epoch": 0.138, "grad_norm": 1528.0, "kl_loss_13": 189.45810928344727, "kl_loss_2": 2643.3591552734374, "kl_loss_4": 1715.158758544922, "kl_loss_9": 669.6325622558594, "learning_rate": 0.0009593172782532268, "loss": 1305.4308, "step": 1380 }, { "ce_loss_13": 3.286542224884033, "ce_loss_17": 3.206256937980652, "ce_loss_2": 4.468729424476623, "ce_loss_4": 3.9946436524391173, "ce_loss_9": 3.504188370704651, "epoch": 0.139, "grad_norm": 832.0, "kl_loss_13": 185.1733543395996, "kl_loss_2": 2590.571081542969, "kl_loss_4": 1689.0799255371094, "kl_loss_9": 667.5989410400391, "learning_rate": 0.0009586880629764817, "loss": 1283.118, "step": 1390 }, { "ce_loss_13": 3.2239181756973267, "ce_loss_17": 3.1406423687934875, "ce_loss_2": 4.419588565826416, "ce_loss_4": 3.9577874422073362, "ce_loss_9": 3.4435829520225525, "epoch": 0.14, "grad_norm": 1208.0, "kl_loss_13": 186.29568405151366, "kl_loss_2": 2632.61650390625, "kl_loss_4": 1728.332586669922, "kl_loss_9": 672.9785247802735, "learning_rate": 0.0009580542287160348, "loss": 1284.7888, "step": 1400 }, { "ce_loss_13": 3.186039626598358, "ce_loss_17": 3.1046715259552, "ce_loss_2": 4.386249756813049, "ce_loss_4": 3.8989491939544676, "ce_loss_9": 3.4024126648902895, "epoch": 0.141, "grad_norm": 1784.0, "kl_loss_13": 184.34195709228516, "kl_loss_2": 2642.0093627929687, "kl_loss_4": 1707.34677734375, "kl_loss_9": 665.6565856933594, "learning_rate": 0.0009574157818545901, "loss": 1285.8109, "step": 1410 }, { "ce_loss_13": 3.2593000650405886, "ce_loss_17": 3.1767266750335694, "ce_loss_2": 4.4258177995681764, "ce_loss_4": 3.954494500160217, "ce_loss_9": 3.467965912818909, "epoch": 0.142, "grad_norm": 1000.0, "kl_loss_13": 190.62417297363282, "kl_loss_2": 2581.8979736328124, "kl_loss_4": 1670.95986328125, "kl_loss_9": 652.826416015625, "learning_rate": 0.0009567727288213005, "loss": 1300.5624, "step": 1420 }, { "ce_loss_13": 3.243348813056946, "ce_loss_17": 3.152751958370209, "ce_loss_2": 4.437687015533447, "ce_loss_4": 3.956622529029846, "ce_loss_9": 3.463455247879028, "epoch": 0.143, "grad_norm": 1272.0, "kl_loss_13": 208.92820892333984, "kl_loss_2": 2655.5672973632813, "kl_loss_4": 1718.3946350097656, "kl_loss_9": 690.7540466308594, "learning_rate": 0.0009561250760917027, "loss": 1296.2412, "step": 1430 }, { "ce_loss_13": 3.2590105533599854, "ce_loss_17": 3.1690674901008604, "ce_loss_2": 4.4350172758102415, "ce_loss_4": 3.9588972449302675, "ce_loss_9": 3.471857154369354, "epoch": 0.144, "grad_norm": 964.0, "kl_loss_13": 211.25488739013673, "kl_loss_2": 2640.179052734375, "kl_loss_4": 1719.6405517578125, "kl_loss_9": 686.0618316650391, "learning_rate": 0.0009554728301876525, "loss": 1285.432, "step": 1440 }, { "ce_loss_13": 3.3047693252563475, "ce_loss_17": 3.2157714128494264, "ce_loss_2": 4.461184954643249, "ce_loss_4": 3.988552522659302, "ce_loss_9": 3.5201760053634645, "epoch": 0.145, "grad_norm": 900.0, "kl_loss_13": 206.70000534057618, "kl_loss_2": 2566.944055175781, "kl_loss_4": 1664.7619384765626, "kl_loss_9": 677.4077362060547, "learning_rate": 0.0009548159976772592, "loss": 1319.3715, "step": 1450 }, { "ce_loss_13": 3.265376424789429, "ce_loss_17": 3.171922028064728, "ce_loss_2": 4.460851573944092, "ce_loss_4": 3.97399560213089, "ce_loss_9": 3.4832445979118347, "epoch": 0.146, "grad_norm": 936.0, "kl_loss_13": 207.9811233520508, "kl_loss_2": 2668.653466796875, "kl_loss_4": 1725.571514892578, "kl_loss_9": 685.6256256103516, "learning_rate": 0.0009541545851748186, "loss": 1307.916, "step": 1460 }, { "ce_loss_13": 3.121864128112793, "ce_loss_17": 3.0353453516960145, "ce_loss_2": 4.362925410270691, "ce_loss_4": 3.8561588764190673, "ce_loss_9": 3.34004967212677, "epoch": 0.147, "grad_norm": 1096.0, "kl_loss_13": 194.13334732055665, "kl_loss_2": 2699.7970947265626, "kl_loss_4": 1731.8086303710938, "kl_loss_9": 667.9556243896484, "learning_rate": 0.0009534885993407473, "loss": 1307.069, "step": 1470 }, { "ce_loss_13": 3.281321096420288, "ce_loss_17": 3.1976502895355225, "ce_loss_2": 4.477102589607239, "ce_loss_4": 4.010080826282501, "ce_loss_9": 3.4938496351242065, "epoch": 0.148, "grad_norm": 872.0, "kl_loss_13": 190.50678482055665, "kl_loss_2": 2635.22958984375, "kl_loss_4": 1739.0423828125, "kl_loss_9": 656.5610870361328, "learning_rate": 0.0009528180468815154, "loss": 1304.9076, "step": 1480 }, { "ce_loss_13": 3.3260432481765747, "ce_loss_17": 3.246252727508545, "ce_loss_2": 4.48549599647522, "ce_loss_4": 4.018248724937439, "ce_loss_9": 3.538304555416107, "epoch": 0.149, "grad_norm": 1192.0, "kl_loss_13": 189.51629104614258, "kl_loss_2": 2565.716943359375, "kl_loss_4": 1672.4983459472655, "kl_loss_9": 671.9416290283203, "learning_rate": 0.0009521429345495787, "loss": 1279.1693, "step": 1490 }, { "ce_loss_13": 3.31293044090271, "ce_loss_17": 3.2300219416618345, "ce_loss_2": 4.452384424209595, "ce_loss_4": 3.9803717613220213, "ce_loss_9": 3.5235919833183287, "epoch": 0.15, "grad_norm": 1004.0, "kl_loss_13": 190.58819122314452, "kl_loss_2": 2547.9408203125, "kl_loss_4": 1643.4682373046876, "kl_loss_9": 655.9859924316406, "learning_rate": 0.0009514632691433108, "loss": 1279.7461, "step": 1500 }, { "ce_loss_13": 3.279203104972839, "ce_loss_17": 3.191249144077301, "ce_loss_2": 4.442009162902832, "ce_loss_4": 3.97766991853714, "ce_loss_9": 3.4907020807266234, "epoch": 0.151, "grad_norm": 948.0, "kl_loss_13": 191.48612213134766, "kl_loss_2": 2592.095910644531, "kl_loss_4": 1688.652362060547, "kl_loss_9": 675.5984161376953, "learning_rate": 0.0009507790575069346, "loss": 1288.0406, "step": 1510 }, { "ce_loss_13": 3.244783675670624, "ce_loss_17": 3.15552476644516, "ce_loss_2": 4.449785494804383, "ce_loss_4": 3.9698154449462892, "ce_loss_9": 3.4762943744659425, "epoch": 0.152, "grad_norm": 972.0, "kl_loss_13": 192.18615036010743, "kl_loss_2": 2642.100305175781, "kl_loss_4": 1711.9736328125, "kl_loss_9": 695.0744506835938, "learning_rate": 0.0009500903065304539, "loss": 1322.2445, "step": 1520 }, { "ce_loss_13": 3.281165933609009, "ce_loss_17": 3.203299367427826, "ce_loss_2": 4.423307085037232, "ce_loss_4": 3.947160470485687, "ce_loss_9": 3.4900771141052247, "epoch": 0.153, "grad_norm": 1040.0, "kl_loss_13": 179.45922393798827, "kl_loss_2": 2517.097058105469, "kl_loss_4": 1616.0437377929688, "kl_loss_9": 651.5772277832032, "learning_rate": 0.0009493970231495835, "loss": 1265.9131, "step": 1530 }, { "ce_loss_13": 3.2213188648223876, "ce_loss_17": 3.147273373603821, "ce_loss_2": 4.361351060867309, "ce_loss_4": 3.900830078125, "ce_loss_9": 3.4266594529151915, "epoch": 0.154, "grad_norm": 1128.0, "kl_loss_13": 173.9607292175293, "kl_loss_2": 2539.301721191406, "kl_loss_4": 1644.03251953125, "kl_loss_9": 646.0508209228516, "learning_rate": 0.0009486992143456792, "loss": 1251.5666, "step": 1540 }, { "ce_loss_13": 3.2468919277191164, "ce_loss_17": 3.1612582445144652, "ce_loss_2": 4.49159038066864, "ce_loss_4": 4.004345107078552, "ce_loss_9": 3.485341966152191, "epoch": 0.155, "grad_norm": 1272.0, "kl_loss_13": 190.46406478881835, "kl_loss_2": 2736.513623046875, "kl_loss_4": 1780.1145751953125, "kl_loss_9": 701.5022918701172, "learning_rate": 0.0009479968871456679, "loss": 1314.9027, "step": 1550 }, { "ce_loss_13": 3.2169959902763368, "ce_loss_17": 3.1394698143005373, "ce_loss_2": 4.423523116111755, "ce_loss_4": 3.93103688955307, "ce_loss_9": 3.443285310268402, "epoch": 0.156, "grad_norm": 988.0, "kl_loss_13": 179.46360397338867, "kl_loss_2": 2665.304296875, "kl_loss_4": 1716.2525817871094, "kl_loss_9": 691.0308044433593, "learning_rate": 0.0009472900486219768, "loss": 1286.4227, "step": 1560 }, { "ce_loss_13": 3.2078802943229676, "ce_loss_17": 3.133780360221863, "ce_loss_2": 4.368587255477905, "ce_loss_4": 3.9098785400390623, "ce_loss_9": 3.430669057369232, "epoch": 0.157, "grad_norm": 1040.0, "kl_loss_13": 175.05562591552734, "kl_loss_2": 2575.9386352539063, "kl_loss_4": 1681.8322265625, "kl_loss_9": 669.0976928710937, "learning_rate": 0.000946578705892462, "loss": 1277.1063, "step": 1570 }, { "ce_loss_13": 3.2449191331863405, "ce_loss_17": 3.1666473746299744, "ce_loss_2": 4.391902756690979, "ce_loss_4": 3.9190162301063536, "ce_loss_9": 3.455510103702545, "epoch": 0.158, "grad_norm": 984.0, "kl_loss_13": 176.7796714782715, "kl_loss_2": 2530.8050170898437, "kl_loss_4": 1625.0544128417969, "kl_loss_9": 643.6431610107422, "learning_rate": 0.0009458628661203367, "loss": 1266.3047, "step": 1580 }, { "ce_loss_13": 3.249901831150055, "ce_loss_17": 3.171110236644745, "ce_loss_2": 4.457311916351318, "ce_loss_4": 3.964294362068176, "ce_loss_9": 3.4723296403884887, "epoch": 0.159, "grad_norm": 984.0, "kl_loss_13": 181.31310653686523, "kl_loss_2": 2657.1421630859377, "kl_loss_4": 1710.8099060058594, "kl_loss_9": 666.9279144287109, "learning_rate": 0.0009451425365140996, "loss": 1266.4518, "step": 1590 }, { "ce_loss_13": 3.3207091212272646, "ce_loss_17": 3.2401039838790893, "ce_loss_2": 4.460034894943237, "ce_loss_4": 3.993881011009216, "ce_loss_9": 3.528563976287842, "epoch": 0.16, "grad_norm": 976.0, "kl_loss_13": 182.75577621459962, "kl_loss_2": 2532.9882934570314, "kl_loss_4": 1626.806219482422, "kl_loss_9": 644.7626403808594, "learning_rate": 0.0009444177243274617, "loss": 1242.7062, "step": 1600 }, { "ce_loss_13": 3.1875908732414246, "ce_loss_17": 3.1038769602775576, "ce_loss_2": 4.375924420356751, "ce_loss_4": 3.8973376154899597, "ce_loss_9": 3.40350079536438, "epoch": 0.161, "grad_norm": 984.0, "kl_loss_13": 188.69177856445313, "kl_loss_2": 2622.9703979492188, "kl_loss_4": 1706.211669921875, "kl_loss_9": 670.6045684814453, "learning_rate": 0.0009436884368592739, "loss": 1286.3109, "step": 1610 }, { "ce_loss_13": 3.2306648969650267, "ce_loss_17": 3.150356578826904, "ce_loss_2": 4.3817664861679075, "ce_loss_4": 3.9106077432632445, "ce_loss_9": 3.4392393231391907, "epoch": 0.162, "grad_norm": 1208.0, "kl_loss_13": 182.87859497070312, "kl_loss_2": 2554.181591796875, "kl_loss_4": 1647.701885986328, "kl_loss_9": 646.6049133300781, "learning_rate": 0.0009429546814534529, "loss": 1281.8854, "step": 1620 }, { "ce_loss_13": 3.2389976620674132, "ce_loss_17": 3.161899375915527, "ce_loss_2": 4.389292621612549, "ce_loss_4": 3.9234333992004395, "ce_loss_9": 3.443483603000641, "epoch": 0.163, "grad_norm": 876.0, "kl_loss_13": 185.8115982055664, "kl_loss_2": 2531.540673828125, "kl_loss_4": 1627.9187133789062, "kl_loss_9": 636.7353637695312, "learning_rate": 0.0009422164654989072, "loss": 1234.1139, "step": 1630 }, { "ce_loss_13": 3.35833158493042, "ce_loss_17": 3.2711931347846983, "ce_loss_2": 4.476850366592407, "ce_loss_4": 4.018917286396027, "ce_loss_9": 3.5507665395736696, "epoch": 0.164, "grad_norm": 1280.0, "kl_loss_13": 199.6320373535156, "kl_loss_2": 2517.2793090820314, "kl_loss_4": 1633.148175048828, "kl_loss_9": 632.9461883544922, "learning_rate": 0.0009414737964294635, "loss": 1252.5148, "step": 1640 }, { "ce_loss_13": 3.2865842819213866, "ce_loss_17": 3.2051337361335754, "ce_loss_2": 4.396987843513489, "ce_loss_4": 3.9361155033111572, "ce_loss_9": 3.473356032371521, "epoch": 0.165, "grad_norm": 1088.0, "kl_loss_13": 182.2093391418457, "kl_loss_2": 2454.3593139648438, "kl_loss_4": 1572.0215393066405, "kl_loss_9": 608.0177185058594, "learning_rate": 0.000940726681723791, "loss": 1241.7802, "step": 1650 }, { "ce_loss_13": 3.138375794887543, "ce_loss_17": 3.056673324108124, "ce_loss_2": 4.341856670379639, "ce_loss_4": 3.840356147289276, "ce_loss_9": 3.3471007466316225, "epoch": 0.166, "grad_norm": 1384.0, "kl_loss_13": 184.16161422729493, "kl_loss_2": 2672.4370361328124, "kl_loss_4": 1701.6221557617187, "kl_loss_9": 643.5906280517578, "learning_rate": 0.0009399751289053266, "loss": 1246.3816, "step": 1660 }, { "ce_loss_13": 3.3287126183509828, "ce_loss_17": 3.253064739704132, "ce_loss_2": 4.465554118156433, "ce_loss_4": 3.994976794719696, "ce_loss_9": 3.5269273281097413, "epoch": 0.167, "grad_norm": 984.0, "kl_loss_13": 179.20603942871094, "kl_loss_2": 2513.819470214844, "kl_loss_4": 1605.6195495605468, "kl_loss_9": 617.1175933837891, "learning_rate": 0.0009392191455421988, "loss": 1254.4086, "step": 1670 }, { "ce_loss_13": 3.3128968119621276, "ce_loss_17": 3.2315263628959654, "ce_loss_2": 4.448565721511841, "ce_loss_4": 3.9704694867134096, "ce_loss_9": 3.51049165725708, "epoch": 0.168, "grad_norm": 988.0, "kl_loss_13": 188.92223587036133, "kl_loss_2": 2547.3358154296875, "kl_loss_4": 1629.4804748535157, "kl_loss_9": 649.6787933349609, "learning_rate": 0.0009384587392471515, "loss": 1227.6839, "step": 1680 }, { "ce_loss_13": 3.298438286781311, "ce_loss_17": 3.2249420166015623, "ce_loss_2": 4.4021806716918945, "ce_loss_4": 3.9572454929351806, "ce_loss_9": 3.4934035778045653, "epoch": 0.169, "grad_norm": 1136.0, "kl_loss_13": 174.20817108154296, "kl_loss_2": 2469.3099975585938, "kl_loss_4": 1596.5793395996093, "kl_loss_9": 622.9863677978516, "learning_rate": 0.0009376939176774678, "loss": 1219.8741, "step": 1690 }, { "ce_loss_13": 3.276881718635559, "ce_loss_17": 3.201683449745178, "ce_loss_2": 4.416564154624939, "ce_loss_4": 3.9488250017166138, "ce_loss_9": 3.480985176563263, "epoch": 0.17, "grad_norm": 992.0, "kl_loss_13": 176.36187744140625, "kl_loss_2": 2507.9495178222655, "kl_loss_4": 1620.8624938964845, "kl_loss_9": 624.9853118896484, "learning_rate": 0.0009369246885348925, "loss": 1255.4079, "step": 1700 }, { "ce_loss_13": 3.263762593269348, "ce_loss_17": 3.188036346435547, "ce_loss_2": 4.447522163391113, "ce_loss_4": 3.9723405599594117, "ce_loss_9": 3.472601282596588, "epoch": 0.171, "grad_norm": 816.0, "kl_loss_13": 178.61313018798828, "kl_loss_2": 2612.7346801757812, "kl_loss_4": 1686.6918090820313, "kl_loss_9": 637.1985504150391, "learning_rate": 0.0009361510595655545, "loss": 1264.61, "step": 1710 }, { "ce_loss_13": 3.2228453159332275, "ce_loss_17": 3.14246381521225, "ce_loss_2": 4.3903366208076475, "ce_loss_4": 3.913253605365753, "ce_loss_9": 3.43224413394928, "epoch": 0.172, "grad_norm": 1104.0, "kl_loss_13": 183.22847900390624, "kl_loss_2": 2593.470104980469, "kl_loss_4": 1670.820379638672, "kl_loss_9": 652.0130035400391, "learning_rate": 0.0009353730385598887, "loss": 1261.6023, "step": 1720 }, { "ce_loss_13": 3.1585384368896485, "ce_loss_17": 3.079004967212677, "ce_loss_2": 4.354488873481751, "ce_loss_4": 3.8698577284812927, "ce_loss_9": 3.3650123476982117, "epoch": 0.173, "grad_norm": 1080.0, "kl_loss_13": 178.92700805664063, "kl_loss_2": 2626.0948974609373, "kl_loss_4": 1687.1143798828125, "kl_loss_9": 643.9748596191406, "learning_rate": 0.0009345906333525581, "loss": 1277.0397, "step": 1730 }, { "ce_loss_13": 3.200738763809204, "ce_loss_17": 3.1166375041007996, "ce_loss_2": 4.3668485403060915, "ce_loss_4": 3.8916123628616335, "ce_loss_9": 3.410936200618744, "epoch": 0.174, "grad_norm": 1304.0, "kl_loss_13": 192.06573638916015, "kl_loss_2": 2592.806115722656, "kl_loss_4": 1670.9978515625, "kl_loss_9": 663.9850250244141, "learning_rate": 0.0009338038518223745, "loss": 1259.5258, "step": 1740 }, { "ce_loss_13": 3.2624486088752747, "ce_loss_17": 3.175633180141449, "ce_loss_2": 4.428672361373901, "ce_loss_4": 3.9581262946128843, "ce_loss_9": 3.481553077697754, "epoch": 0.175, "grad_norm": 1104.0, "kl_loss_13": 190.7497131347656, "kl_loss_2": 2604.9264404296873, "kl_loss_4": 1682.4797119140626, "kl_loss_9": 679.0449371337891, "learning_rate": 0.0009330127018922195, "loss": 1297.8215, "step": 1750 }, { "ce_loss_13": 3.222031795978546, "ce_loss_17": 3.136512589454651, "ce_loss_2": 4.384543764591217, "ce_loss_4": 3.9043487191200255, "ce_loss_9": 3.417991650104523, "epoch": 0.176, "grad_norm": 1504.0, "kl_loss_13": 188.52411422729492, "kl_loss_2": 2595.811730957031, "kl_loss_4": 1653.3313232421874, "kl_loss_9": 642.8933685302734, "learning_rate": 0.0009322171915289634, "loss": 1265.3272, "step": 1760 }, { "ce_loss_13": 3.256188678741455, "ce_loss_17": 3.1804313182830812, "ce_loss_2": 4.3770142793655396, "ce_loss_4": 3.916848373413086, "ce_loss_9": 3.4582699298858643, "epoch": 0.177, "grad_norm": 972.0, "kl_loss_13": 182.7982391357422, "kl_loss_2": 2506.593884277344, "kl_loss_4": 1615.7811462402344, "kl_loss_9": 639.5718444824219, "learning_rate": 0.0009314173287433873, "loss": 1225.9427, "step": 1770 }, { "ce_loss_13": 3.2405248284339905, "ce_loss_17": 3.161733555793762, "ce_loss_2": 4.379581236839295, "ce_loss_4": 3.907567834854126, "ce_loss_9": 3.445212960243225, "epoch": 0.178, "grad_norm": 944.0, "kl_loss_13": 181.9399169921875, "kl_loss_2": 2536.3139770507814, "kl_loss_4": 1634.8711730957032, "kl_loss_9": 646.9036285400391, "learning_rate": 0.0009306131215901003, "loss": 1230.1917, "step": 1780 }, { "ce_loss_13": 3.268652844429016, "ce_loss_17": 3.191008913516998, "ce_loss_2": 4.41733226776123, "ce_loss_4": 3.9325471997261046, "ce_loss_9": 3.466645562648773, "epoch": 0.179, "grad_norm": 992.0, "kl_loss_13": 179.42753143310546, "kl_loss_2": 2537.5878051757813, "kl_loss_4": 1614.683740234375, "kl_loss_9": 636.4151763916016, "learning_rate": 0.0009298045781674596, "loss": 1217.6664, "step": 1790 }, { "ce_loss_13": 3.2506497621536257, "ce_loss_17": 3.1762409925460817, "ce_loss_2": 4.38077392578125, "ce_loss_4": 3.909109318256378, "ce_loss_9": 3.4528645277023315, "epoch": 0.18, "grad_norm": 1208.0, "kl_loss_13": 172.71835479736328, "kl_loss_2": 2501.61572265625, "kl_loss_4": 1592.382666015625, "kl_loss_9": 624.5138946533203, "learning_rate": 0.0009289917066174886, "loss": 1239.3841, "step": 1800 }, { "ce_loss_13": 3.242199409008026, "ce_loss_17": 3.169390046596527, "ce_loss_2": 4.349033665657044, "ce_loss_4": 3.882836973667145, "ce_loss_9": 3.4302767395973204, "epoch": 0.181, "grad_norm": 1024.0, "kl_loss_13": 171.74287185668945, "kl_loss_2": 2464.4281005859375, "kl_loss_4": 1562.7095886230468, "kl_loss_9": 601.4007720947266, "learning_rate": 0.0009281745151257945, "loss": 1209.8825, "step": 1810 }, { "ce_loss_13": 3.2685648798942566, "ce_loss_17": 3.189663565158844, "ce_loss_2": 4.409343361854553, "ce_loss_4": 3.930420625209808, "ce_loss_9": 3.464329183101654, "epoch": 0.182, "grad_norm": 1464.0, "kl_loss_13": 181.52149353027343, "kl_loss_2": 2528.6295532226563, "kl_loss_4": 1610.6787170410157, "kl_loss_9": 620.9924072265625, "learning_rate": 0.0009273530119214868, "loss": 1246.7359, "step": 1820 }, { "ce_loss_13": 3.3619608283042908, "ce_loss_17": 3.285822570323944, "ce_loss_2": 4.473600506782532, "ce_loss_4": 4.010280549526215, "ce_loss_9": 3.54911128282547, "epoch": 0.183, "grad_norm": 960.0, "kl_loss_13": 179.32927551269532, "kl_loss_2": 2497.52265625, "kl_loss_4": 1594.5492858886719, "kl_loss_9": 610.7064666748047, "learning_rate": 0.0009265272052770935, "loss": 1204.0391, "step": 1830 }, { "ce_loss_13": 3.1914825201034547, "ce_loss_17": 3.113387143611908, "ce_loss_2": 4.360214495658875, "ce_loss_4": 3.876107943058014, "ce_loss_9": 3.393728792667389, "epoch": 0.184, "grad_norm": 1536.0, "kl_loss_13": 174.8393768310547, "kl_loss_2": 2555.389111328125, "kl_loss_4": 1630.0819030761718, "kl_loss_9": 623.1473541259766, "learning_rate": 0.0009256971035084784, "loss": 1239.6832, "step": 1840 }, { "ce_loss_13": 3.1333197474479677, "ce_loss_17": 3.0520315527915955, "ce_loss_2": 4.315753293037415, "ce_loss_4": 3.8390265583992003, "ce_loss_9": 3.342691648006439, "epoch": 0.185, "grad_norm": 1112.0, "kl_loss_13": 178.39919052124023, "kl_loss_2": 2612.957275390625, "kl_loss_4": 1682.3123291015625, "kl_loss_9": 644.5492248535156, "learning_rate": 0.0009248627149747573, "loss": 1251.0012, "step": 1850 }, { "ce_loss_13": 3.321735155582428, "ce_loss_17": 3.2501082181930543, "ce_loss_2": 4.44288444519043, "ce_loss_4": 3.9697200417518617, "ce_loss_9": 3.515633249282837, "epoch": 0.186, "grad_norm": 920.0, "kl_loss_13": 172.0274230957031, "kl_loss_2": 2501.6731567382812, "kl_loss_4": 1588.6938171386719, "kl_loss_9": 614.5717163085938, "learning_rate": 0.0009240240480782129, "loss": 1220.523, "step": 1860 }, { "ce_loss_13": 3.2293495655059816, "ce_loss_17": 3.1538926482200624, "ce_loss_2": 4.3796076536178585, "ce_loss_4": 3.9058475852012635, "ce_loss_9": 3.4308831214904787, "epoch": 0.187, "grad_norm": 972.0, "kl_loss_13": 172.08147354125975, "kl_loss_2": 2550.4640869140626, "kl_loss_4": 1630.90361328125, "kl_loss_9": 627.7829986572266, "learning_rate": 0.0009231811112642122, "loss": 1225.4676, "step": 1870 }, { "ce_loss_13": 3.2666676878929137, "ce_loss_17": 3.1940443873405457, "ce_loss_2": 4.369254851341248, "ce_loss_4": 3.90828161239624, "ce_loss_9": 3.463264262676239, "epoch": 0.188, "grad_norm": 1232.0, "kl_loss_13": 174.07933654785157, "kl_loss_2": 2447.7242065429687, "kl_loss_4": 1567.0455688476563, "kl_loss_9": 612.2089477539063, "learning_rate": 0.0009223339130211192, "loss": 1205.1075, "step": 1880 }, { "ce_loss_13": 3.1309128761291505, "ce_loss_17": 3.058625042438507, "ce_loss_2": 4.302036368846894, "ce_loss_4": 3.808994436264038, "ce_loss_9": 3.325362515449524, "epoch": 0.189, "grad_norm": 828.0, "kl_loss_13": 170.17826919555665, "kl_loss_2": 2578.439562988281, "kl_loss_4": 1625.4724609375, "kl_loss_9": 607.4008087158203, "learning_rate": 0.0009214824618802108, "loss": 1236.62, "step": 1890 }, { "ce_loss_13": 3.305174434185028, "ce_loss_17": 3.227649712562561, "ce_loss_2": 4.428006196022034, "ce_loss_4": 3.9669900059700014, "ce_loss_9": 3.4966753363609313, "epoch": 0.19, "grad_norm": 1096.0, "kl_loss_13": 177.23081588745117, "kl_loss_2": 2484.3692993164063, "kl_loss_4": 1597.2304626464843, "kl_loss_9": 615.6154571533203, "learning_rate": 0.0009206267664155906, "loss": 1247.851, "step": 1900 }, { "ce_loss_13": 3.236591875553131, "ce_loss_17": 3.1556692838668825, "ce_loss_2": 4.383834958076477, "ce_loss_4": 3.9012603998184203, "ce_loss_9": 3.432961881160736, "epoch": 0.191, "grad_norm": 1208.0, "kl_loss_13": 185.43472290039062, "kl_loss_2": 2526.648291015625, "kl_loss_4": 1610.399884033203, "kl_loss_9": 617.7114166259765, "learning_rate": 0.0009197668352441024, "loss": 1237.5637, "step": 1910 }, { "ce_loss_13": 3.29288387298584, "ce_loss_17": 3.2098604321479796, "ce_loss_2": 4.405272650718689, "ce_loss_4": 3.934405505657196, "ce_loss_9": 3.4787180542945864, "epoch": 0.192, "grad_norm": 1544.0, "kl_loss_13": 193.9896026611328, "kl_loss_2": 2494.799951171875, "kl_loss_4": 1585.9344055175782, "kl_loss_9": 610.7688873291015, "learning_rate": 0.0009189026770252437, "loss": 1230.2221, "step": 1920 }, { "ce_loss_13": 3.3155988216400147, "ce_loss_17": 3.231837272644043, "ce_loss_2": 4.432423758506775, "ce_loss_4": 3.9658974170684815, "ce_loss_9": 3.5009405851364135, "epoch": 0.193, "grad_norm": 944.0, "kl_loss_13": 199.73346405029298, "kl_loss_2": 2475.5898193359376, "kl_loss_4": 1583.1226684570313, "kl_loss_9": 612.1580017089843, "learning_rate": 0.000918034300461078, "loss": 1260.4457, "step": 1930 }, { "ce_loss_13": 3.3449774742126466, "ce_loss_17": 3.2589903831481934, "ce_loss_2": 4.432424187660217, "ce_loss_4": 3.9798689246177674, "ce_loss_9": 3.527190065383911, "epoch": 0.194, "grad_norm": 1272.0, "kl_loss_13": 194.0785804748535, "kl_loss_2": 2458.8873657226563, "kl_loss_4": 1575.2092346191407, "kl_loss_9": 614.2167175292968, "learning_rate": 0.0009171617142961477, "loss": 1217.5326, "step": 1940 }, { "ce_loss_13": 3.304656136035919, "ce_loss_17": 3.221306765079498, "ce_loss_2": 4.41871075630188, "ce_loss_4": 3.9499549984931948, "ce_loss_9": 3.499179792404175, "epoch": 0.195, "grad_norm": 1064.0, "kl_loss_13": 190.04144821166992, "kl_loss_2": 2491.202587890625, "kl_loss_4": 1580.2600708007812, "kl_loss_9": 618.8035919189454, "learning_rate": 0.0009162849273173857, "loss": 1221.0037, "step": 1950 }, { "ce_loss_13": 3.2356810927391053, "ce_loss_17": 3.160042715072632, "ce_loss_2": 4.358764338493347, "ce_loss_4": 3.883910024166107, "ce_loss_9": 3.430359184741974, "epoch": 0.196, "grad_norm": 872.0, "kl_loss_13": 180.9440490722656, "kl_loss_2": 2475.846484375, "kl_loss_4": 1568.940167236328, "kl_loss_9": 614.7134246826172, "learning_rate": 0.0009154039483540273, "loss": 1220.1999, "step": 1960 }, { "ce_loss_13": 3.2223536133766175, "ce_loss_17": 3.1467234969139097, "ce_loss_2": 4.346777701377869, "ce_loss_4": 3.8647279739379883, "ce_loss_9": 3.4144640922546388, "epoch": 0.197, "grad_norm": 1240.0, "kl_loss_13": 175.61696166992186, "kl_loss_2": 2511.473046875, "kl_loss_4": 1590.4403991699219, "kl_loss_9": 619.0662567138672, "learning_rate": 0.0009145187862775209, "loss": 1222.0047, "step": 1970 }, { "ce_loss_13": 3.251642882823944, "ce_loss_17": 3.1779204964637757, "ce_loss_2": 4.3648188471794125, "ce_loss_4": 3.9074428915977477, "ce_loss_9": 3.443933606147766, "epoch": 0.198, "grad_norm": 1104.0, "kl_loss_13": 170.7974060058594, "kl_loss_2": 2475.900537109375, "kl_loss_4": 1596.3815124511718, "kl_loss_9": 609.0825561523437, "learning_rate": 0.0009136294500014386, "loss": 1207.0257, "step": 1980 }, { "ce_loss_13": 3.197197771072388, "ce_loss_17": 3.1263050079345702, "ce_loss_2": 4.385333669185639, "ce_loss_4": 3.8912293553352355, "ce_loss_9": 3.404167115688324, "epoch": 0.199, "grad_norm": 1584.0, "kl_loss_13": 169.49581985473634, "kl_loss_2": 2578.220178222656, "kl_loss_4": 1631.1317138671875, "kl_loss_9": 620.5709808349609, "learning_rate": 0.000912735948481387, "loss": 1242.4711, "step": 1990 }, { "ce_loss_13": 3.2254818797111513, "ce_loss_17": 3.1561971783638, "ce_loss_2": 4.349449694156647, "ce_loss_4": 3.886545944213867, "ce_loss_9": 3.4241779923439024, "epoch": 0.2, "grad_norm": 988.0, "kl_loss_13": 171.69295501708984, "kl_loss_2": 2505.286541748047, "kl_loss_4": 1606.9174560546876, "kl_loss_9": 621.2827056884765, "learning_rate": 0.0009118382907149164, "loss": 1203.799, "step": 2000 }, { "ce_loss_13": 3.255273973941803, "ce_loss_17": 3.1816171646118163, "ce_loss_2": 4.367860150337219, "ce_loss_4": 3.9092182993888853, "ce_loss_9": 3.4540134191513063, "epoch": 0.201, "grad_norm": 1136.0, "kl_loss_13": 171.37966613769532, "kl_loss_2": 2460.830773925781, "kl_loss_4": 1574.7275695800781, "kl_loss_9": 610.4031219482422, "learning_rate": 0.0009109364857414306, "loss": 1197.9989, "step": 2010 }, { "ce_loss_13": 3.228543388843536, "ce_loss_17": 3.1526674747467043, "ce_loss_2": 4.337665176391601, "ce_loss_4": 3.871294450759888, "ce_loss_9": 3.4169225335121154, "epoch": 0.202, "grad_norm": 1344.0, "kl_loss_13": 174.14765853881835, "kl_loss_2": 2494.749401855469, "kl_loss_4": 1584.3169555664062, "kl_loss_9": 612.3951843261718, "learning_rate": 0.0009100305426420956, "loss": 1239.3257, "step": 2020 }, { "ce_loss_13": 3.197736442089081, "ce_loss_17": 3.1226819515228272, "ce_loss_2": 4.369256162643433, "ce_loss_4": 3.8711305499076842, "ce_loss_9": 3.390553653240204, "epoch": 0.203, "grad_norm": 1032.0, "kl_loss_13": 172.3013458251953, "kl_loss_2": 2605.1050537109377, "kl_loss_4": 1636.3674926757812, "kl_loss_9": 616.2915466308593, "learning_rate": 0.0009091204705397484, "loss": 1228.6025, "step": 2030 }, { "ce_loss_13": 3.1828688979148865, "ce_loss_17": 3.106009066104889, "ce_loss_2": 4.3495824337005615, "ce_loss_4": 3.8593189835548403, "ce_loss_9": 3.3797409653663637, "epoch": 0.204, "grad_norm": 928.0, "kl_loss_13": 183.13731079101564, "kl_loss_2": 2592.9508666992188, "kl_loss_4": 1641.916925048828, "kl_loss_9": 623.0726470947266, "learning_rate": 0.0009082062785988049, "loss": 1241.6615, "step": 2040 }, { "ce_loss_13": 3.3152880549430845, "ce_loss_17": 3.237951624393463, "ce_loss_2": 4.403791975975037, "ce_loss_4": 3.9412471055984497, "ce_loss_9": 3.4984307646751405, "epoch": 0.205, "grad_norm": 888.0, "kl_loss_13": 178.84955062866212, "kl_loss_2": 2452.1805908203123, "kl_loss_4": 1562.497296142578, "kl_loss_9": 606.3364501953125, "learning_rate": 0.0009072879760251679, "loss": 1217.0081, "step": 2050 }, { "ce_loss_13": 3.2656277656555175, "ce_loss_17": 3.186823320388794, "ce_loss_2": 4.407771277427673, "ce_loss_4": 3.9346683859825133, "ce_loss_9": 3.460373651981354, "epoch": 0.206, "grad_norm": 1096.0, "kl_loss_13": 183.87697372436523, "kl_loss_2": 2556.0197998046874, "kl_loss_4": 1629.0449157714843, "kl_loss_9": 619.8264465332031, "learning_rate": 0.0009063655720661341, "loss": 1230.0994, "step": 2060 }, { "ce_loss_13": 3.314466619491577, "ce_loss_17": 3.228593814373016, "ce_loss_2": 4.396897697448731, "ce_loss_4": 3.937827467918396, "ce_loss_9": 3.4970903038978576, "epoch": 0.207, "grad_norm": 1064.0, "kl_loss_13": 198.95360260009767, "kl_loss_2": 2437.8350463867187, "kl_loss_4": 1566.2887145996094, "kl_loss_9": 608.7262634277344, "learning_rate": 0.000905439076010301, "loss": 1209.3958, "step": 2070 }, { "ce_loss_13": 3.2695337176322936, "ce_loss_17": 3.1824235320091248, "ce_loss_2": 4.391359329223633, "ce_loss_4": 3.9166039347648622, "ce_loss_9": 3.455859911441803, "epoch": 0.208, "grad_norm": 1112.0, "kl_loss_13": 198.64115295410156, "kl_loss_2": 2503.0908325195314, "kl_loss_4": 1586.6750732421874, "kl_loss_9": 612.9619567871093, "learning_rate": 0.0009045084971874737, "loss": 1202.842, "step": 2080 }, { "ce_loss_13": 3.247004246711731, "ce_loss_17": 3.165184569358826, "ce_loss_2": 4.34849796295166, "ce_loss_4": 3.8942025899887085, "ce_loss_9": 3.437330794334412, "epoch": 0.209, "grad_norm": 1048.0, "kl_loss_13": 188.02949066162108, "kl_loss_2": 2466.1105834960936, "kl_loss_4": 1581.7042724609375, "kl_loss_9": 619.7475341796875, "learning_rate": 0.0009035738449685707, "loss": 1228.3738, "step": 2090 }, { "ce_loss_13": 3.1783780336380003, "ce_loss_17": 3.1015420794487, "ce_loss_2": 4.349784111976623, "ce_loss_4": 3.864401626586914, "ce_loss_9": 3.38290011882782, "epoch": 0.21, "grad_norm": 1144.0, "kl_loss_13": 178.06553268432617, "kl_loss_2": 2578.827606201172, "kl_loss_4": 1636.2303344726563, "kl_loss_9": 621.805209350586, "learning_rate": 0.0009026351287655293, "loss": 1217.8744, "step": 2100 }, { "ce_loss_13": 3.370856428146362, "ce_loss_17": 3.3029974818229677, "ce_loss_2": 4.401641869544983, "ce_loss_4": 3.9782015085220337, "ce_loss_9": 3.55935515165329, "epoch": 0.211, "grad_norm": 1752.0, "kl_loss_13": 166.1920181274414, "kl_loss_2": 2318.3716552734377, "kl_loss_4": 1490.68994140625, "kl_loss_9": 584.5986175537109, "learning_rate": 0.0009016923580312113, "loss": 1151.8691, "step": 2110 }, { "ce_loss_13": 3.235159659385681, "ce_loss_17": 3.164220559597015, "ce_loss_2": 4.330569124221801, "ce_loss_4": 3.868649196624756, "ce_loss_9": 3.4228716254234315, "epoch": 0.212, "grad_norm": 1312.0, "kl_loss_13": 170.2442512512207, "kl_loss_2": 2444.163232421875, "kl_loss_4": 1553.7580200195312, "kl_loss_9": 605.9254455566406, "learning_rate": 0.0009007455422593077, "loss": 1214.2754, "step": 2120 }, { "ce_loss_13": 3.247132456302643, "ce_loss_17": 3.175766611099243, "ce_loss_2": 4.393925070762634, "ce_loss_4": 3.9121050119400023, "ce_loss_9": 3.4535219788551332, "epoch": 0.213, "grad_norm": 1264.0, "kl_loss_13": 169.7667152404785, "kl_loss_2": 2533.9401977539064, "kl_loss_4": 1612.2480224609376, "kl_loss_9": 620.3468444824218, "learning_rate": 0.0008997946909842425, "loss": 1228.6486, "step": 2130 }, { "ce_loss_13": 3.261511981487274, "ce_loss_17": 3.1822165966033937, "ce_loss_2": 4.4444811820983885, "ce_loss_4": 3.9535838961601257, "ce_loss_9": 3.466679072380066, "epoch": 0.214, "grad_norm": 1464.0, "kl_loss_13": 175.5771697998047, "kl_loss_2": 2624.862683105469, "kl_loss_4": 1670.9113708496093, "kl_loss_9": 635.0477752685547, "learning_rate": 0.0008988398137810777, "loss": 1225.1609, "step": 2140 }, { "ce_loss_13": 3.2928843975067137, "ce_loss_17": 3.2243247509002684, "ce_loss_2": 4.390626311302185, "ce_loss_4": 3.9335252404212953, "ce_loss_9": 3.4892451405525207, "epoch": 0.215, "grad_norm": 1080.0, "kl_loss_13": 166.29994735717773, "kl_loss_2": 2440.90458984375, "kl_loss_4": 1554.6802185058593, "kl_loss_9": 603.1138671875, "learning_rate": 0.0008978809202654162, "loss": 1184.1196, "step": 2150 }, { "ce_loss_13": 3.269202697277069, "ce_loss_17": 3.196268391609192, "ce_loss_2": 4.374455070495605, "ce_loss_4": 3.914650225639343, "ce_loss_9": 3.4582060813903808, "epoch": 0.216, "grad_norm": 968.0, "kl_loss_13": 166.64142990112305, "kl_loss_2": 2442.7971069335936, "kl_loss_4": 1549.0375915527343, "kl_loss_9": 602.6442138671875, "learning_rate": 0.0008969180200933046, "loss": 1206.3904, "step": 2160 }, { "ce_loss_13": 3.232723796367645, "ce_loss_17": 3.157481050491333, "ce_loss_2": 4.385147833824158, "ce_loss_4": 3.9135856866836547, "ce_loss_9": 3.4439543008804323, "epoch": 0.217, "grad_norm": 1360.0, "kl_loss_13": 174.04514999389647, "kl_loss_2": 2513.339318847656, "kl_loss_4": 1600.955419921875, "kl_loss_9": 625.593295288086, "learning_rate": 0.0008959511229611376, "loss": 1228.7556, "step": 2170 }, { "ce_loss_13": 3.3115162372589113, "ce_loss_17": 3.239002013206482, "ce_loss_2": 4.427860569953919, "ce_loss_4": 3.9602201342582704, "ce_loss_9": 3.520912504196167, "epoch": 0.218, "grad_norm": 1056.0, "kl_loss_13": 178.7967658996582, "kl_loss_2": 2487.41484375, "kl_loss_4": 1584.9494262695312, "kl_loss_9": 638.6561401367187, "learning_rate": 0.0008949802386055581, "loss": 1218.132, "step": 2180 }, { "ce_loss_13": 3.185491108894348, "ce_loss_17": 3.104011261463165, "ce_loss_2": 4.3011164426803585, "ce_loss_4": 3.8323360085487366, "ce_loss_9": 3.3841028451919555, "epoch": 0.219, "grad_norm": 1528.0, "kl_loss_13": 181.5668830871582, "kl_loss_2": 2458.5107421875, "kl_loss_4": 1568.5500793457031, "kl_loss_9": 649.3895080566406, "learning_rate": 0.0008940053768033609, "loss": 1239.2924, "step": 2190 }, { "ce_loss_13": 3.2574360847473143, "ce_loss_17": 3.1868374943733215, "ce_loss_2": 4.356430721282959, "ce_loss_4": 3.8946019887924193, "ce_loss_9": 3.4635474562644957, "epoch": 0.22, "grad_norm": 1064.0, "kl_loss_13": 168.85814361572267, "kl_loss_2": 2450.5212646484374, "kl_loss_4": 1562.5769409179688, "kl_loss_9": 634.4777526855469, "learning_rate": 0.0008930265473713938, "loss": 1205.9708, "step": 2200 }, { "ce_loss_13": 3.2195906043052673, "ce_loss_17": 3.1456341624259947, "ce_loss_2": 4.328716278076172, "ce_loss_4": 3.8702699542045593, "ce_loss_9": 3.4241649746894836, "epoch": 0.221, "grad_norm": 1016.0, "kl_loss_13": 167.63500900268554, "kl_loss_2": 2454.0426025390625, "kl_loss_4": 1564.5342163085938, "kl_loss_9": 618.7196166992187, "learning_rate": 0.0008920437601664579, "loss": 1181.0371, "step": 2210 }, { "ce_loss_13": 3.2132100582122805, "ce_loss_17": 3.142171657085419, "ce_loss_2": 4.328527975082397, "ce_loss_4": 3.864507591724396, "ce_loss_9": 3.416048502922058, "epoch": 0.222, "grad_norm": 1072.0, "kl_loss_13": 168.61766891479493, "kl_loss_2": 2464.6184814453127, "kl_loss_4": 1573.3881958007812, "kl_loss_9": 621.09169921875, "learning_rate": 0.0008910570250852097, "loss": 1184.8296, "step": 2220 }, { "ce_loss_13": 3.3130900979042055, "ce_loss_17": 3.243193793296814, "ce_loss_2": 4.373210859298706, "ce_loss_4": 3.9156973719596864, "ce_loss_9": 3.5026893258094787, "epoch": 0.223, "grad_norm": 1056.0, "kl_loss_13": 162.28300857543945, "kl_loss_2": 2372.620886230469, "kl_loss_4": 1490.9735534667968, "kl_loss_9": 590.5284545898437, "learning_rate": 0.0008900663520640604, "loss": 1160.1016, "step": 2230 }, { "ce_loss_13": 3.2660316228866577, "ce_loss_17": 3.1939164876937864, "ce_loss_2": 4.368876051902771, "ce_loss_4": 3.9000473737716677, "ce_loss_9": 3.462582457065582, "epoch": 0.224, "grad_norm": 972.0, "kl_loss_13": 165.77899093627929, "kl_loss_2": 2467.854736328125, "kl_loss_4": 1555.5611145019532, "kl_loss_9": 606.0369384765625, "learning_rate": 0.0008890717510790764, "loss": 1196.099, "step": 2240 }, { "ce_loss_13": 3.2237872004508974, "ce_loss_17": 3.1545886993408203, "ce_loss_2": 4.3431625127792355, "ce_loss_4": 3.8668596029281614, "ce_loss_9": 3.418202614784241, "epoch": 0.225, "grad_norm": 932.0, "kl_loss_13": 161.6664192199707, "kl_loss_2": 2490.2755493164063, "kl_loss_4": 1573.8761291503906, "kl_loss_9": 608.7557037353515, "learning_rate": 0.0008880732321458784, "loss": 1211.9898, "step": 2250 }, { "ce_loss_13": 3.251143419742584, "ce_loss_17": 3.182486617565155, "ce_loss_2": 4.343449199199677, "ce_loss_4": 3.8893532395362853, "ce_loss_9": 3.455567109584808, "epoch": 0.226, "grad_norm": 1080.0, "kl_loss_13": 163.2533706665039, "kl_loss_2": 2413.1260009765624, "kl_loss_4": 1536.5033874511719, "kl_loss_9": 609.8270050048828, "learning_rate": 0.0008870708053195413, "loss": 1199.8385, "step": 2260 }, { "ce_loss_13": 3.2727303743362426, "ce_loss_17": 3.2080424666404723, "ce_loss_2": 4.353303575515747, "ce_loss_4": 3.892589437961578, "ce_loss_9": 3.459402084350586, "epoch": 0.227, "grad_norm": 1176.0, "kl_loss_13": 157.8570640563965, "kl_loss_2": 2397.659033203125, "kl_loss_4": 1513.7567443847656, "kl_loss_9": 583.5933288574219, "learning_rate": 0.0008860644806944918, "loss": 1173.6873, "step": 2270 }, { "ce_loss_13": 3.2192744851112365, "ce_loss_17": 3.1479940176010133, "ce_loss_2": 4.343790674209595, "ce_loss_4": 3.871932303905487, "ce_loss_9": 3.4208832502365114, "epoch": 0.228, "grad_norm": 1144.0, "kl_loss_13": 165.40411911010742, "kl_loss_2": 2474.957080078125, "kl_loss_4": 1564.2632446289062, "kl_loss_9": 613.3717468261718, "learning_rate": 0.0008850542684044079, "loss": 1174.8881, "step": 2280 }, { "ce_loss_13": 3.182979905605316, "ce_loss_17": 3.1098023653030396, "ce_loss_2": 4.355036759376526, "ce_loss_4": 3.862727773189545, "ce_loss_9": 3.390827941894531, "epoch": 0.229, "grad_norm": 1160.0, "kl_loss_13": 174.97535781860353, "kl_loss_2": 2579.4727783203125, "kl_loss_4": 1627.1920227050782, "kl_loss_9": 634.4140808105469, "learning_rate": 0.0008840401786221159, "loss": 1214.0426, "step": 2290 }, { "ce_loss_13": 3.3293598771095274, "ce_loss_17": 3.2579127073287966, "ce_loss_2": 4.396261429786682, "ce_loss_4": 3.941065287590027, "ce_loss_9": 3.5251535892486574, "epoch": 0.23, "grad_norm": 1056.0, "kl_loss_13": 168.42937545776368, "kl_loss_2": 2375.623291015625, "kl_loss_4": 1498.7406799316407, "kl_loss_9": 600.6245239257812, "learning_rate": 0.000883022221559489, "loss": 1161.0031, "step": 2300 }, { "ce_loss_13": 3.2889331340789796, "ce_loss_17": 3.2135741233825685, "ce_loss_2": 4.392712044715881, "ce_loss_4": 3.925084626674652, "ce_loss_9": 3.4880856394767763, "epoch": 0.231, "grad_norm": 1012.0, "kl_loss_13": 179.8710792541504, "kl_loss_2": 2467.5209228515623, "kl_loss_4": 1559.3661743164062, "kl_loss_9": 618.4212463378906, "learning_rate": 0.0008820004074673434, "loss": 1231.823, "step": 2310 }, { "ce_loss_13": 3.1933369755744936, "ce_loss_17": 3.1252753376960754, "ce_loss_2": 4.294953918457031, "ce_loss_4": 3.8378122806549073, "ce_loss_9": 3.3925586581230163, "epoch": 0.232, "grad_norm": 1560.0, "kl_loss_13": 167.0640899658203, "kl_loss_2": 2459.3835571289064, "kl_loss_4": 1566.9483337402344, "kl_loss_9": 615.2656066894531, "learning_rate": 0.0008809747466353355, "loss": 1186.5007, "step": 2320 }, { "ce_loss_13": 3.2021512031555175, "ce_loss_17": 3.1307880759239195, "ce_loss_2": 4.3199771523475645, "ce_loss_4": 3.838150429725647, "ce_loss_9": 3.400985860824585, "epoch": 0.233, "grad_norm": 960.0, "kl_loss_13": 166.11581573486328, "kl_loss_2": 2469.960095214844, "kl_loss_4": 1549.035821533203, "kl_loss_9": 606.1809661865234, "learning_rate": 0.0008799452493918585, "loss": 1202.8001, "step": 2330 }, { "ce_loss_13": 3.2777753829956056, "ce_loss_17": 3.209915804862976, "ce_loss_2": 4.373718392848969, "ce_loss_4": 3.909938931465149, "ce_loss_9": 3.4673448324203493, "epoch": 0.234, "grad_norm": 1048.0, "kl_loss_13": 164.04096908569335, "kl_loss_2": 2452.0421875, "kl_loss_4": 1547.1481689453126, "kl_loss_9": 601.4567581176758, "learning_rate": 0.0008789119261039385, "loss": 1223.3338, "step": 2340 }, { "ce_loss_13": 3.18933242559433, "ce_loss_17": 3.1203453540802, "ce_loss_2": 4.302351558208466, "ce_loss_4": 3.8373776197433473, "ce_loss_9": 3.3886887788772584, "epoch": 0.235, "grad_norm": 1056.0, "kl_loss_13": 162.94512634277345, "kl_loss_2": 2459.123046875, "kl_loss_4": 1554.942156982422, "kl_loss_9": 601.6080810546875, "learning_rate": 0.0008778747871771292, "loss": 1180.0914, "step": 2350 }, { "ce_loss_13": 3.2376285791397095, "ce_loss_17": 3.1705939888954164, "ce_loss_2": 4.310338973999023, "ce_loss_4": 3.851981055736542, "ce_loss_9": 3.422150433063507, "epoch": 0.236, "grad_norm": 1168.0, "kl_loss_13": 158.25713729858398, "kl_loss_2": 2382.9071533203123, "kl_loss_4": 1502.9647338867187, "kl_loss_9": 577.0218231201172, "learning_rate": 0.0008768338430554083, "loss": 1158.3773, "step": 2360 }, { "ce_loss_13": 3.2555309653282167, "ce_loss_17": 3.184670698642731, "ce_loss_2": 4.343218052387238, "ce_loss_4": 3.8820324540138245, "ce_loss_9": 3.4478107333183288, "epoch": 0.237, "grad_norm": 1256.0, "kl_loss_13": 165.27040176391603, "kl_loss_2": 2410.749963378906, "kl_loss_4": 1532.383807373047, "kl_loss_9": 596.2854827880859, "learning_rate": 0.0008757891042210713, "loss": 1187.5914, "step": 2370 }, { "ce_loss_13": 3.2679835200309753, "ce_loss_17": 3.1996604204177856, "ce_loss_2": 4.349635422229767, "ce_loss_4": 3.9014257550239564, "ce_loss_9": 3.4530524253845214, "epoch": 0.238, "grad_norm": 1168.0, "kl_loss_13": 162.72979431152345, "kl_loss_2": 2401.4757568359373, "kl_loss_4": 1532.94892578125, "kl_loss_9": 591.5622161865234, "learning_rate": 0.0008747405811946271, "loss": 1177.8601, "step": 2380 }, { "ce_loss_13": 3.171390688419342, "ce_loss_17": 3.1004143714904786, "ce_loss_2": 4.323645710945129, "ce_loss_4": 3.834692454338074, "ce_loss_9": 3.3695825934410095, "epoch": 0.239, "grad_norm": 1048.0, "kl_loss_13": 163.4970359802246, "kl_loss_2": 2534.0405395507814, "kl_loss_4": 1590.2084899902343, "kl_loss_9": 614.3838989257813, "learning_rate": 0.0008736882845346905, "loss": 1181.5529, "step": 2390 }, { "ce_loss_13": 3.256571340560913, "ce_loss_17": 3.183486533164978, "ce_loss_2": 4.367242360115052, "ce_loss_4": 3.886058485507965, "ce_loss_9": 3.4580071210861205, "epoch": 0.24, "grad_norm": 876.0, "kl_loss_13": 167.91817626953124, "kl_loss_2": 2434.656634521484, "kl_loss_4": 1517.5751403808595, "kl_loss_9": 606.3812927246094, "learning_rate": 0.0008726322248378774, "loss": 1174.3748, "step": 2400 }, { "ce_loss_13": 3.2598382234573364, "ce_loss_17": 3.191308069229126, "ce_loss_2": 4.393772912025452, "ce_loss_4": 3.9158979296684264, "ce_loss_9": 3.4543803572654723, "epoch": 0.241, "grad_norm": 844.0, "kl_loss_13": 161.7484146118164, "kl_loss_2": 2509.6781616210938, "kl_loss_4": 1581.4444702148437, "kl_loss_9": 598.8506011962891, "learning_rate": 0.0008715724127386971, "loss": 1217.4995, "step": 2410 }, { "ce_loss_13": 3.331840419769287, "ce_loss_17": 3.264649474620819, "ce_loss_2": 4.407275056838989, "ce_loss_4": 3.937208962440491, "ce_loss_9": 3.51151841878891, "epoch": 0.242, "grad_norm": 948.0, "kl_loss_13": 162.88704528808594, "kl_loss_2": 2415.36083984375, "kl_loss_4": 1509.5026977539062, "kl_loss_9": 584.2631759643555, "learning_rate": 0.0008705088589094458, "loss": 1181.8004, "step": 2420 }, { "ce_loss_13": 3.3389374136924745, "ce_loss_17": 3.2720374584198, "ce_loss_2": 4.430340528488159, "ce_loss_4": 3.972409152984619, "ce_loss_9": 3.528220009803772, "epoch": 0.243, "grad_norm": 1040.0, "kl_loss_13": 164.32056274414063, "kl_loss_2": 2435.7047485351563, "kl_loss_4": 1537.0535888671875, "kl_loss_9": 590.5355804443359, "learning_rate": 0.0008694415740600988, "loss": 1192.3914, "step": 2430 }, { "ce_loss_13": 3.205978238582611, "ce_loss_17": 3.1305561304092406, "ce_loss_2": 4.345691251754761, "ce_loss_4": 3.8779913902282717, "ce_loss_9": 3.3808531880378725, "epoch": 0.244, "grad_norm": 1120.0, "kl_loss_13": 192.28745956420897, "kl_loss_2": 2543.264794921875, "kl_loss_4": 1639.93154296875, "kl_loss_9": 588.0655242919922, "learning_rate": 0.0008683705689382025, "loss": 1209.4562, "step": 2440 }, { "ce_loss_13": 3.2994937181472777, "ce_loss_17": 3.2158532619476317, "ce_loss_2": 4.351251614093781, "ce_loss_4": 3.8979491591453552, "ce_loss_9": 3.461684155464172, "epoch": 0.245, "grad_norm": 952.0, "kl_loss_13": 187.56344757080078, "kl_loss_2": 2395.6779174804688, "kl_loss_4": 1509.5264038085938, "kl_loss_9": 574.243083190918, "learning_rate": 0.0008672958543287666, "loss": 1193.1916, "step": 2450 }, { "ce_loss_13": 3.291569185256958, "ce_loss_17": 3.215873694419861, "ce_loss_2": 4.350538802146912, "ce_loss_4": 3.907143604755402, "ce_loss_9": 3.4677678465843202, "epoch": 0.246, "grad_norm": 1504.0, "kl_loss_13": 191.10882644653321, "kl_loss_2": 2380.0281372070312, "kl_loss_4": 1514.4664672851563, "kl_loss_9": 580.7178253173828, "learning_rate": 0.0008662174410541554, "loss": 1165.9505, "step": 2460 }, { "ce_loss_13": 3.2676523447036745, "ce_loss_17": 3.1862968683242796, "ce_loss_2": 4.314630842208862, "ce_loss_4": 3.860619866847992, "ce_loss_9": 3.4369062542915345, "epoch": 0.247, "grad_norm": 984.0, "kl_loss_13": 179.4552963256836, "kl_loss_2": 2372.0970092773437, "kl_loss_4": 1492.142041015625, "kl_loss_9": 572.7284759521484, "learning_rate": 0.0008651353399739787, "loss": 1186.6246, "step": 2470 }, { "ce_loss_13": 3.2845077991485594, "ce_loss_17": 3.212189829349518, "ce_loss_2": 4.361234426498413, "ce_loss_4": 3.9042978167533873, "ce_loss_9": 3.4637476086616514, "epoch": 0.248, "grad_norm": 1016.0, "kl_loss_13": 175.80752258300782, "kl_loss_2": 2396.800048828125, "kl_loss_4": 1512.92890625, "kl_loss_9": 577.5321243286132, "learning_rate": 0.0008640495619849821, "loss": 1171.3414, "step": 2480 }, { "ce_loss_13": 3.2493863701820374, "ce_loss_17": 3.171652042865753, "ce_loss_2": 4.310073709487915, "ce_loss_4": 3.8591334581375123, "ce_loss_9": 3.4222182512283323, "epoch": 0.249, "grad_norm": 1400.0, "kl_loss_13": 173.6839714050293, "kl_loss_2": 2392.72724609375, "kl_loss_4": 1507.1541015625, "kl_loss_9": 578.6557098388672, "learning_rate": 0.0008629601180209381, "loss": 1163.6204, "step": 2490 }, { "ce_loss_13": 3.236731171607971, "ce_loss_17": 3.1655083894729614, "ce_loss_2": 4.314030575752258, "ce_loss_4": 3.849554121494293, "ce_loss_9": 3.4178988099098206, "epoch": 0.25, "grad_norm": 1600.0, "kl_loss_13": 166.4853645324707, "kl_loss_2": 2373.589978027344, "kl_loss_4": 1485.3241638183595, "kl_loss_9": 581.0582550048828, "learning_rate": 0.000861867019052535, "loss": 1177.1547, "step": 2500 }, { "ce_loss_13": 3.1592686057090758, "ce_loss_17": 3.0856385350227358, "ce_loss_2": 4.300999808311462, "ce_loss_4": 3.8131303668022154, "ce_loss_9": 3.350085270404816, "epoch": 0.251, "grad_norm": 1136.0, "kl_loss_13": 164.39508514404298, "kl_loss_2": 2496.231005859375, "kl_loss_4": 1561.564990234375, "kl_loss_9": 594.1875793457032, "learning_rate": 0.0008607702760872678, "loss": 1203.5514, "step": 2510 }, { "ce_loss_13": 3.2638188123703005, "ce_loss_17": 3.1970773577690124, "ce_loss_2": 4.328337180614471, "ce_loss_4": 3.8849018573760987, "ce_loss_9": 3.445836865901947, "epoch": 0.252, "grad_norm": 1184.0, "kl_loss_13": 159.37547760009767, "kl_loss_2": 2360.8461853027343, "kl_loss_4": 1498.2792236328125, "kl_loss_9": 579.0341110229492, "learning_rate": 0.0008596699001693256, "loss": 1182.2393, "step": 2520 }, { "ce_loss_13": 3.280370306968689, "ce_loss_17": 3.2163194537162783, "ce_loss_2": 4.334255194664001, "ce_loss_4": 3.873677706718445, "ce_loss_9": 3.466372084617615, "epoch": 0.253, "grad_norm": 1336.0, "kl_loss_13": 156.86802673339844, "kl_loss_2": 2377.7126220703126, "kl_loss_4": 1483.310107421875, "kl_loss_9": 586.2825103759766, "learning_rate": 0.0008585659023794818, "loss": 1188.3191, "step": 2530 }, { "ce_loss_13": 3.2405317783355714, "ce_loss_17": 3.1731743454933166, "ce_loss_2": 4.372336435317993, "ce_loss_4": 3.9005088210105896, "ce_loss_9": 3.439291274547577, "epoch": 0.254, "grad_norm": 1432.0, "kl_loss_13": 162.73361053466797, "kl_loss_2": 2485.4057739257814, "kl_loss_4": 1573.4989990234376, "kl_loss_9": 610.8084442138672, "learning_rate": 0.0008574582938349817, "loss": 1198.0736, "step": 2540 }, { "ce_loss_13": 3.220080387592316, "ce_loss_17": 3.148034429550171, "ce_loss_2": 4.343723487854004, "ce_loss_4": 3.8852342844009398, "ce_loss_9": 3.4319419264793396, "epoch": 0.255, "grad_norm": 1072.0, "kl_loss_13": 166.51998291015624, "kl_loss_2": 2466.979150390625, "kl_loss_4": 1579.2205993652344, "kl_loss_9": 630.5009887695312, "learning_rate": 0.0008563470856894315, "loss": 1171.9037, "step": 2550 }, { "ce_loss_13": 3.225235605239868, "ce_loss_17": 3.157585322856903, "ce_loss_2": 4.325297284126282, "ce_loss_4": 3.8703621864318847, "ce_loss_9": 3.4286532163619996, "epoch": 0.256, "grad_norm": 1064.0, "kl_loss_13": 156.06654891967773, "kl_loss_2": 2427.3985961914063, "kl_loss_4": 1544.202703857422, "kl_loss_9": 624.6888000488282, "learning_rate": 0.0008552322891326845, "loss": 1180.9875, "step": 2560 }, { "ce_loss_13": 3.191692852973938, "ce_loss_17": 3.1237489700317385, "ce_loss_2": 4.298963868618012, "ce_loss_4": 3.823654294013977, "ce_loss_9": 3.389700245857239, "epoch": 0.257, "grad_norm": 1248.0, "kl_loss_13": 156.65526885986327, "kl_loss_2": 2449.2206176757813, "kl_loss_4": 1537.2764099121093, "kl_loss_9": 622.8494995117187, "learning_rate": 0.0008541139153907296, "loss": 1171.1264, "step": 2570 }, { "ce_loss_13": 3.1523714303970336, "ce_loss_17": 3.085716736316681, "ce_loss_2": 4.2537607789039615, "ce_loss_4": 3.7822983384132387, "ce_loss_9": 3.343126356601715, "epoch": 0.258, "grad_norm": 1192.0, "kl_loss_13": 153.64549560546874, "kl_loss_2": 2446.2069458007813, "kl_loss_4": 1528.4807678222655, "kl_loss_9": 592.7972290039063, "learning_rate": 0.0008529919757255782, "loss": 1186.6359, "step": 2580 }, { "ce_loss_13": 3.1913410305976866, "ce_loss_17": 3.1273189187049866, "ce_loss_2": 4.2308083295822145, "ce_loss_4": 3.7789337038993835, "ce_loss_9": 3.36907354593277, "epoch": 0.259, "grad_norm": 1112.0, "kl_loss_13": 153.28515243530273, "kl_loss_2": 2337.139758300781, "kl_loss_4": 1464.8349853515624, "kl_loss_9": 567.7613189697265, "learning_rate": 0.0008518664814351503, "loss": 1142.2709, "step": 2590 }, { "ce_loss_13": 3.155083882808685, "ce_loss_17": 3.086553168296814, "ce_loss_2": 4.266127622127533, "ce_loss_4": 3.795076847076416, "ce_loss_9": 3.3463833689689637, "epoch": 0.26, "grad_norm": 1008.0, "kl_loss_13": 163.10962829589843, "kl_loss_2": 2467.2045288085938, "kl_loss_4": 1553.3551147460937, "kl_loss_9": 595.849008178711, "learning_rate": 0.0008507374438531607, "loss": 1217.2842, "step": 2600 }, { "ce_loss_13": 3.13476984500885, "ce_loss_17": 3.0698801040649415, "ce_loss_2": 4.226039528846741, "ce_loss_4": 3.759116756916046, "ce_loss_9": 3.3224439144134523, "epoch": 0.261, "grad_norm": 1056.0, "kl_loss_13": 156.09860076904297, "kl_loss_2": 2403.2086547851563, "kl_loss_4": 1513.3333618164063, "kl_loss_9": 579.2591766357422, "learning_rate": 0.0008496048743490053, "loss": 1166.8539, "step": 2610 }, { "ce_loss_13": 3.2819517254829407, "ce_loss_17": 3.215398597717285, "ce_loss_2": 4.328773403167725, "ce_loss_4": 3.892801523208618, "ce_loss_9": 3.461100721359253, "epoch": 0.262, "grad_norm": 1152.0, "kl_loss_13": 159.18943786621094, "kl_loss_2": 2338.8973754882813, "kl_loss_4": 1486.0660217285156, "kl_loss_9": 570.2473754882812, "learning_rate": 0.0008484687843276469, "loss": 1153.5506, "step": 2620 }, { "ce_loss_13": 3.2157720923423767, "ce_loss_17": 3.145278573036194, "ce_loss_2": 4.292907226085663, "ce_loss_4": 3.8370604515075684, "ce_loss_9": 3.3946972489356995, "epoch": 0.263, "grad_norm": 1408.0, "kl_loss_13": 162.22661437988282, "kl_loss_2": 2406.223504638672, "kl_loss_4": 1514.0081848144532, "kl_loss_9": 576.9833374023438, "learning_rate": 0.0008473291852294987, "loss": 1180.2768, "step": 2630 }, { "ce_loss_13": 3.220456433296204, "ce_loss_17": 3.151499891281128, "ce_loss_2": 4.3100086688995365, "ce_loss_4": 3.8514158844947817, "ce_loss_9": 3.408153212070465, "epoch": 0.264, "grad_norm": 1576.0, "kl_loss_13": 160.23906021118165, "kl_loss_2": 2425.66640625, "kl_loss_4": 1527.0668395996095, "kl_loss_9": 583.1130310058594, "learning_rate": 0.0008461860885303114, "loss": 1159.4809, "step": 2640 }, { "ce_loss_13": 3.254705774784088, "ce_loss_17": 3.19010808467865, "ce_loss_2": 4.3067298412323, "ce_loss_4": 3.861544120311737, "ce_loss_9": 3.4349950551986694, "epoch": 0.265, "grad_norm": 1012.0, "kl_loss_13": 155.57027435302734, "kl_loss_2": 2350.6857788085936, "kl_loss_4": 1483.878924560547, "kl_loss_9": 567.7372207641602, "learning_rate": 0.000845039505741056, "loss": 1160.7898, "step": 2650 }, { "ce_loss_13": 3.233179807662964, "ce_loss_17": 3.1637366890907286, "ce_loss_2": 4.321112418174744, "ce_loss_4": 3.8588730812072756, "ce_loss_9": 3.4195252299308776, "epoch": 0.266, "grad_norm": 1184.0, "kl_loss_13": 163.67174911499023, "kl_loss_2": 2435.8692016601562, "kl_loss_4": 1541.1813720703126, "kl_loss_9": 594.2984893798828, "learning_rate": 0.0008438894484078086, "loss": 1209.0862, "step": 2660 }, { "ce_loss_13": 3.2440340638160707, "ce_loss_17": 3.175339770317078, "ce_loss_2": 4.308804273605347, "ce_loss_4": 3.8474526405334473, "ce_loss_9": 3.424735951423645, "epoch": 0.267, "grad_norm": 1160.0, "kl_loss_13": 158.14912185668945, "kl_loss_2": 2381.434460449219, "kl_loss_4": 1492.9729309082031, "kl_loss_9": 571.6585479736328, "learning_rate": 0.0008427359281116334, "loss": 1160.0953, "step": 2670 }, { "ce_loss_13": 3.139516532421112, "ce_loss_17": 3.0721927523612975, "ce_loss_2": 4.254960906505585, "ce_loss_4": 3.7871536731719972, "ce_loss_9": 3.3304852724075316, "epoch": 0.268, "grad_norm": 876.0, "kl_loss_13": 159.5185287475586, "kl_loss_2": 2452.6441040039062, "kl_loss_4": 1542.8196105957031, "kl_loss_9": 576.8506011962891, "learning_rate": 0.0008415789564684673, "loss": 1179.1404, "step": 2680 }, { "ce_loss_13": 3.38873450756073, "ce_loss_17": 3.315239143371582, "ce_loss_2": 4.439826941490173, "ce_loss_4": 3.9916396856307985, "ce_loss_9": 3.5668327689170836, "epoch": 0.269, "grad_norm": 1400.0, "kl_loss_13": 181.40618057250975, "kl_loss_2": 2323.1558837890625, "kl_loss_4": 1469.31552734375, "kl_loss_9": 573.77734375, "learning_rate": 0.0008404185451290017, "loss": 1141.2142, "step": 2690 }, { "ce_loss_13": 3.2647972464561463, "ce_loss_17": 3.186790108680725, "ce_loss_2": 4.332613229751587, "ce_loss_4": 3.868630349636078, "ce_loss_9": 3.436994421482086, "epoch": 0.27, "grad_norm": 1056.0, "kl_loss_13": 174.33276596069337, "kl_loss_2": 2391.900830078125, "kl_loss_4": 1495.1382385253905, "kl_loss_9": 565.5920883178711, "learning_rate": 0.0008392547057785661, "loss": 1156.8433, "step": 2700 }, { "ce_loss_13": 3.1926159858703613, "ce_loss_17": 3.12235666513443, "ce_loss_2": 4.301304054260254, "ce_loss_4": 3.8322774529457093, "ce_loss_9": 3.373118245601654, "epoch": 0.271, "grad_norm": 1336.0, "kl_loss_13": 168.37930450439453, "kl_loss_2": 2504.0964965820312, "kl_loss_4": 1568.2899536132813, "kl_loss_9": 585.0844696044921, "learning_rate": 0.0008380874501370098, "loss": 1158.6948, "step": 2710 }, { "ce_loss_13": 3.1808969378471375, "ce_loss_17": 3.1105502009391786, "ce_loss_2": 4.3013105392456055, "ce_loss_4": 3.8207743167877197, "ce_loss_9": 3.369449770450592, "epoch": 0.272, "grad_norm": 1264.0, "kl_loss_13": 165.28190155029296, "kl_loss_2": 2478.663232421875, "kl_loss_4": 1552.3070007324218, "kl_loss_9": 590.3447906494141, "learning_rate": 0.0008369167899585841, "loss": 1180.1382, "step": 2720 }, { "ce_loss_13": 3.302024173736572, "ce_loss_17": 3.234196436405182, "ce_loss_2": 4.331646609306335, "ce_loss_4": 3.890242040157318, "ce_loss_9": 3.476538383960724, "epoch": 0.273, "grad_norm": 1120.0, "kl_loss_13": 157.59114151000978, "kl_loss_2": 2316.9540893554686, "kl_loss_4": 1464.0277465820313, "kl_loss_9": 559.7096633911133, "learning_rate": 0.0008357427370318238, "loss": 1163.4612, "step": 2730 }, { "ce_loss_13": 3.249239194393158, "ce_loss_17": 3.1840958118438722, "ce_loss_2": 4.335900700092315, "ce_loss_4": 3.872771644592285, "ce_loss_9": 3.4334316492080688, "epoch": 0.274, "grad_norm": 1064.0, "kl_loss_13": 157.17083358764648, "kl_loss_2": 2418.312463378906, "kl_loss_4": 1520.273992919922, "kl_loss_9": 572.335482788086, "learning_rate": 0.0008345653031794292, "loss": 1172.4254, "step": 2740 }, { "ce_loss_13": 3.248675227165222, "ce_loss_17": 3.1838600516319273, "ce_loss_2": 4.322873675823212, "ce_loss_4": 3.8594106078147887, "ce_loss_9": 3.432173991203308, "epoch": 0.275, "grad_norm": 1256.0, "kl_loss_13": 158.3459274291992, "kl_loss_2": 2372.074285888672, "kl_loss_4": 1487.8560119628905, "kl_loss_9": 571.3229461669922, "learning_rate": 0.0008333845002581458, "loss": 1155.4703, "step": 2750 }, { "ce_loss_13": 3.1797487139701843, "ce_loss_17": 3.1138875484466553, "ce_loss_2": 4.287734258174896, "ce_loss_4": 3.8220154285430907, "ce_loss_9": 3.3683555483818055, "epoch": 0.276, "grad_norm": 980.0, "kl_loss_13": 159.59934005737304, "kl_loss_2": 2468.5367065429687, "kl_loss_4": 1555.7626586914062, "kl_loss_9": 584.6411987304688, "learning_rate": 0.0008322003401586462, "loss": 1183.748, "step": 2760 }, { "ce_loss_13": 3.213080632686615, "ce_loss_17": 3.150394916534424, "ce_loss_2": 4.262652182579041, "ce_loss_4": 3.811266016960144, "ce_loss_9": 3.387704312801361, "epoch": 0.277, "grad_norm": 1048.0, "kl_loss_13": 152.44154052734376, "kl_loss_2": 2341.8986938476564, "kl_loss_4": 1468.5275817871093, "kl_loss_9": 555.7105697631836, "learning_rate": 0.0008310128348054094, "loss": 1120.3209, "step": 2770 }, { "ce_loss_13": 3.1813905835151672, "ce_loss_17": 3.1166908025741575, "ce_loss_2": 4.26682071685791, "ce_loss_4": 3.802069163322449, "ce_loss_9": 3.3580470442771913, "epoch": 0.278, "grad_norm": 1288.0, "kl_loss_13": 154.38205184936524, "kl_loss_2": 2405.271826171875, "kl_loss_4": 1502.7876281738281, "kl_loss_9": 570.5797637939453, "learning_rate": 0.0008298219961566008, "loss": 1152.7189, "step": 2780 }, { "ce_loss_13": 3.1453885197639466, "ce_loss_17": 3.081616723537445, "ce_loss_2": 4.264086389541626, "ce_loss_4": 3.7981610774993895, "ce_loss_9": 3.332336151599884, "epoch": 0.279, "grad_norm": 892.0, "kl_loss_13": 155.54904708862304, "kl_loss_2": 2489.413684082031, "kl_loss_4": 1572.5774719238282, "kl_loss_9": 582.2561920166015, "learning_rate": 0.0008286278362039527, "loss": 1166.3323, "step": 2790 }, { "ce_loss_13": 3.1810045719146727, "ce_loss_17": 3.1103263020515444, "ce_loss_2": 4.296833920478821, "ce_loss_4": 3.8195695638656617, "ce_loss_9": 3.360769248008728, "epoch": 0.28, "grad_norm": 872.0, "kl_loss_13": 161.04954223632814, "kl_loss_2": 2494.748254394531, "kl_loss_4": 1561.1195556640625, "kl_loss_9": 574.9254943847657, "learning_rate": 0.0008274303669726426, "loss": 1159.6732, "step": 2800 }, { "ce_loss_13": 3.0888274192810057, "ce_loss_17": 3.0172742366790772, "ce_loss_2": 4.236777710914612, "ce_loss_4": 3.7422790050506594, "ce_loss_9": 3.273540186882019, "epoch": 0.281, "grad_norm": 1072.0, "kl_loss_13": 168.36445999145508, "kl_loss_2": 2530.986120605469, "kl_loss_4": 1564.6460205078124, "kl_loss_9": 575.3854736328125, "learning_rate": 0.0008262296005211721, "loss": 1165.9934, "step": 2810 }, { "ce_loss_13": 3.2056065678596495, "ce_loss_17": 3.139582359790802, "ce_loss_2": 4.314467334747315, "ce_loss_4": 3.840426743030548, "ce_loss_9": 3.3923126339912413, "epoch": 0.282, "grad_norm": 1096.0, "kl_loss_13": 160.40669708251954, "kl_loss_2": 2444.849951171875, "kl_loss_4": 1532.9786010742187, "kl_loss_9": 582.871533203125, "learning_rate": 0.0008250255489412463, "loss": 1161.9223, "step": 2820 }, { "ce_loss_13": 3.309978258609772, "ce_loss_17": 3.2416829466819763, "ce_loss_2": 4.3867769002914425, "ce_loss_4": 3.9256178975105285, "ce_loss_9": 3.4867620348930357, "epoch": 0.283, "grad_norm": 1816.0, "kl_loss_13": 159.3329833984375, "kl_loss_2": 2400.079638671875, "kl_loss_4": 1509.1794494628907, "kl_loss_9": 575.3592864990235, "learning_rate": 0.0008238182243576511, "loss": 1161.2479, "step": 2830 }, { "ce_loss_13": 3.2732188940048217, "ce_loss_17": 3.2095182657241823, "ce_loss_2": 4.279084932804108, "ce_loss_4": 3.849619650840759, "ce_loss_9": 3.4416254281997682, "epoch": 0.284, "grad_norm": 1104.0, "kl_loss_13": 153.90090789794922, "kl_loss_2": 2277.3889343261717, "kl_loss_4": 1434.3886596679688, "kl_loss_9": 550.8199615478516, "learning_rate": 0.0008226076389281315, "loss": 1120.6627, "step": 2840 }, { "ce_loss_13": 3.3116733551025392, "ce_loss_17": 3.2507740259170532, "ce_loss_2": 4.357061219215393, "ce_loss_4": 3.9099013566970826, "ce_loss_9": 3.485238790512085, "epoch": 0.285, "grad_norm": 940.0, "kl_loss_13": 152.02257232666017, "kl_loss_2": 2355.538958740234, "kl_loss_4": 1473.3945678710938, "kl_loss_9": 559.0594009399414, "learning_rate": 0.0008213938048432696, "loss": 1125.0373, "step": 2850 }, { "ce_loss_13": 3.2426870703697204, "ce_loss_17": 3.17085679769516, "ce_loss_2": 4.307324051856995, "ce_loss_4": 3.8451292395591734, "ce_loss_9": 3.4224828124046325, "epoch": 0.286, "grad_norm": 1360.0, "kl_loss_13": 160.1384078979492, "kl_loss_2": 2370.5759521484374, "kl_loss_4": 1472.3720703125, "kl_loss_9": 572.2973831176757, "learning_rate": 0.0008201767343263612, "loss": 1156.1633, "step": 2860 }, { "ce_loss_13": 3.1852338194847105, "ce_loss_17": 3.119821071624756, "ce_loss_2": 4.27563351392746, "ce_loss_4": 3.8214959502220154, "ce_loss_9": 3.367746365070343, "epoch": 0.287, "grad_norm": 1264.0, "kl_loss_13": 156.9891471862793, "kl_loss_2": 2427.7287719726564, "kl_loss_4": 1540.6155151367188, "kl_loss_9": 573.2903503417969, "learning_rate": 0.0008189564396332927, "loss": 1132.4008, "step": 2870 }, { "ce_loss_13": 3.16705527305603, "ce_loss_17": 3.101548802852631, "ce_loss_2": 4.2670722007751465, "ce_loss_4": 3.8008057713508605, "ce_loss_9": 3.3476714730262755, "epoch": 0.288, "grad_norm": 1032.0, "kl_loss_13": 155.40261611938476, "kl_loss_2": 2420.7283264160155, "kl_loss_4": 1511.5582275390625, "kl_loss_9": 568.9872009277344, "learning_rate": 0.0008177329330524181, "loss": 1165.8711, "step": 2880 }, { "ce_loss_13": 3.2226102232933043, "ce_loss_17": 3.157565939426422, "ce_loss_2": 4.277075111865997, "ce_loss_4": 3.8224209189414977, "ce_loss_9": 3.40519917011261, "epoch": 0.289, "grad_norm": 996.0, "kl_loss_13": 153.85085678100586, "kl_loss_2": 2334.1626892089844, "kl_loss_4": 1463.6302062988282, "kl_loss_9": 561.2743194580078, "learning_rate": 0.0008165062269044352, "loss": 1135.0438, "step": 2890 }, { "ce_loss_13": 3.1779353976249696, "ce_loss_17": 3.111630141735077, "ce_loss_2": 4.272741568088532, "ce_loss_4": 3.795286238193512, "ce_loss_9": 3.36032794713974, "epoch": 0.29, "grad_norm": 1064.0, "kl_loss_13": 156.48692779541017, "kl_loss_2": 2431.9177490234374, "kl_loss_4": 1507.5694641113282, "kl_loss_9": 575.4245101928711, "learning_rate": 0.0008152763335422613, "loss": 1167.9848, "step": 2900 }, { "ce_loss_13": 3.1681819319725038, "ce_loss_17": 3.099293220043182, "ce_loss_2": 4.250057470798493, "ce_loss_4": 3.7852508068084716, "ce_loss_9": 3.349228310585022, "epoch": 0.291, "grad_norm": 1208.0, "kl_loss_13": 157.99078292846679, "kl_loss_2": 2402.497448730469, "kl_loss_4": 1511.788507080078, "kl_loss_9": 573.1223114013671, "learning_rate": 0.0008140432653509088, "loss": 1147.5254, "step": 2910 }, { "ce_loss_13": 3.220819914340973, "ce_loss_17": 3.1524434685707092, "ce_loss_2": 4.269567775726318, "ce_loss_4": 3.8220234513282776, "ce_loss_9": 3.400552248954773, "epoch": 0.292, "grad_norm": 1024.0, "kl_loss_13": 162.24704360961914, "kl_loss_2": 2375.408972167969, "kl_loss_4": 1487.8807556152344, "kl_loss_9": 575.9518005371094, "learning_rate": 0.0008128070347473608, "loss": 1138.3451, "step": 2920 }, { "ce_loss_13": 3.2322062611579896, "ce_loss_17": 3.1652246475219727, "ce_loss_2": 4.333968925476074, "ce_loss_4": 3.8537524580955504, "ce_loss_9": 3.411892592906952, "epoch": 0.293, "grad_norm": 1104.0, "kl_loss_13": 161.431795501709, "kl_loss_2": 2445.74111328125, "kl_loss_4": 1517.6073425292968, "kl_loss_9": 574.6339508056641, "learning_rate": 0.0008115676541804455, "loss": 1157.5475, "step": 2930 }, { "ce_loss_13": 3.227903997898102, "ce_loss_17": 3.1622729897499084, "ce_loss_2": 4.28635356426239, "ce_loss_4": 3.832652580738068, "ce_loss_9": 3.404482388496399, "epoch": 0.294, "grad_norm": 1200.0, "kl_loss_13": 156.28535842895508, "kl_loss_2": 2366.579449462891, "kl_loss_4": 1473.5561462402343, "kl_loss_9": 565.0164428710938, "learning_rate": 0.0008103251361307119, "loss": 1156.653, "step": 2940 }, { "ce_loss_13": 3.255156099796295, "ce_loss_17": 3.1889549612998964, "ce_loss_2": 4.3231881856918335, "ce_loss_4": 3.8647888541221618, "ce_loss_9": 3.4339943170547484, "epoch": 0.295, "grad_norm": 1080.0, "kl_loss_13": 157.45657806396486, "kl_loss_2": 2391.686669921875, "kl_loss_4": 1501.926043701172, "kl_loss_9": 568.1345138549805, "learning_rate": 0.0008090794931103026, "loss": 1140.7871, "step": 2950 }, { "ce_loss_13": 3.2350062012672423, "ce_loss_17": 3.174361193180084, "ce_loss_2": 4.285650289058685, "ce_loss_4": 3.8339219808578493, "ce_loss_9": 3.4103875041007994, "epoch": 0.296, "grad_norm": 1040.0, "kl_loss_13": 152.02781677246094, "kl_loss_2": 2320.036328125, "kl_loss_4": 1445.3720092773438, "kl_loss_9": 553.481396484375, "learning_rate": 0.0008078307376628291, "loss": 1132.2182, "step": 2960 }, { "ce_loss_13": 3.299833023548126, "ce_loss_17": 3.2374541044235228, "ce_loss_2": 4.320287299156189, "ce_loss_4": 3.8823325991630555, "ce_loss_9": 3.4714555740356445, "epoch": 0.297, "grad_norm": 1248.0, "kl_loss_13": 148.53265686035155, "kl_loss_2": 2271.287493896484, "kl_loss_4": 1417.8889892578125, "kl_loss_9": 545.032878112793, "learning_rate": 0.000806578882363245, "loss": 1102.3357, "step": 2970 }, { "ce_loss_13": 3.212441349029541, "ce_loss_17": 3.1519762873649597, "ce_loss_2": 4.2551368236541744, "ce_loss_4": 3.810543382167816, "ce_loss_9": 3.390281319618225, "epoch": 0.298, "grad_norm": 1416.0, "kl_loss_13": 150.54759216308594, "kl_loss_2": 2334.7947387695312, "kl_loss_4": 1468.8730407714843, "kl_loss_9": 561.1636810302734, "learning_rate": 0.0008053239398177191, "loss": 1153.8505, "step": 2980 }, { "ce_loss_13": 3.2004388213157653, "ce_loss_17": 3.1383460521698, "ce_loss_2": 4.276726841926575, "ce_loss_4": 3.817077648639679, "ce_loss_9": 3.38042129278183, "epoch": 0.299, "grad_norm": 1376.0, "kl_loss_13": 152.81283836364747, "kl_loss_2": 2363.779895019531, "kl_loss_4": 1480.2968200683595, "kl_loss_9": 560.4363021850586, "learning_rate": 0.0008040659226635089, "loss": 1166.4947, "step": 2990 }, { "ce_loss_13": 3.3245912909507753, "ce_loss_17": 3.2565393328666685, "ce_loss_2": 4.372659683227539, "ce_loss_4": 3.9220868349075317, "ce_loss_9": 3.5069492936134337, "epoch": 0.3, "grad_norm": 1216.0, "kl_loss_13": 158.68586044311525, "kl_loss_2": 2347.0229309082033, "kl_loss_4": 1471.0124938964843, "kl_loss_9": 577.4713195800781, "learning_rate": 0.0008028048435688333, "loss": 1133.2046, "step": 3000 }, { "ce_loss_13": 3.2011621236801147, "ce_loss_17": 3.135597062110901, "ce_loss_2": 4.290722858905792, "ce_loss_4": 3.8268226265907286, "ce_loss_9": 3.3848962664604185, "epoch": 0.301, "grad_norm": 1184.0, "kl_loss_13": 152.4864013671875, "kl_loss_2": 2429.7817626953124, "kl_loss_4": 1516.4819274902343, "kl_loss_9": 570.3181289672851, "learning_rate": 0.0008015407152327448, "loss": 1153.2562, "step": 3010 }, { "ce_loss_13": 3.2436813950538634, "ce_loss_17": 3.178469181060791, "ce_loss_2": 4.318740200996399, "ce_loss_4": 3.8487913250923156, "ce_loss_9": 3.4196946263313293, "epoch": 0.302, "grad_norm": 988.0, "kl_loss_13": 155.82396850585937, "kl_loss_2": 2401.2838745117188, "kl_loss_4": 1485.9017028808594, "kl_loss_9": 566.714582824707, "learning_rate": 0.0008002735503850016, "loss": 1151.6682, "step": 3020 }, { "ce_loss_13": 3.14404753446579, "ce_loss_17": 3.0747854709625244, "ce_loss_2": 4.253287613391876, "ce_loss_4": 3.7722442746162415, "ce_loss_9": 3.3273327469825746, "epoch": 0.303, "grad_norm": 1016.0, "kl_loss_13": 158.19406814575194, "kl_loss_2": 2457.8320922851562, "kl_loss_4": 1525.1825927734376, "kl_loss_9": 576.828091430664, "learning_rate": 0.0007990033617859396, "loss": 1170.6595, "step": 3030 }, { "ce_loss_13": 3.193160855770111, "ce_loss_17": 3.129583740234375, "ce_loss_2": 4.252513670921326, "ce_loss_4": 3.798577535152435, "ce_loss_9": 3.364718425273895, "epoch": 0.304, "grad_norm": 896.0, "kl_loss_13": 153.78710479736327, "kl_loss_2": 2355.1887451171874, "kl_loss_4": 1471.1909057617188, "kl_loss_9": 558.3321548461914, "learning_rate": 0.000797730162226344, "loss": 1112.6169, "step": 3040 }, { "ce_loss_13": 3.215179169178009, "ce_loss_17": 3.1501219272613525, "ce_loss_2": 4.279658055305481, "ce_loss_4": 3.822163736820221, "ce_loss_9": 3.3928865671157835, "epoch": 0.305, "grad_norm": 1384.0, "kl_loss_13": 153.84282150268555, "kl_loss_2": 2364.8491943359377, "kl_loss_4": 1479.1527404785156, "kl_loss_9": 561.7840438842774, "learning_rate": 0.0007964539645273203, "loss": 1130.3143, "step": 3050 }, { "ce_loss_13": 3.2269324898719787, "ce_loss_17": 3.165442419052124, "ce_loss_2": 4.26456583738327, "ce_loss_4": 3.816432571411133, "ce_loss_9": 3.396591603755951, "epoch": 0.306, "grad_norm": 936.0, "kl_loss_13": 148.35995483398438, "kl_loss_2": 2303.788427734375, "kl_loss_4": 1438.92109375, "kl_loss_9": 543.5974334716797, "learning_rate": 0.000795174781540165, "loss": 1126.9539, "step": 3060 }, { "ce_loss_13": 3.3024577021598818, "ce_loss_17": 3.238257920742035, "ce_loss_2": 4.322912240028382, "ce_loss_4": 3.881198751926422, "ce_loss_9": 3.4736642718315123, "epoch": 0.307, "grad_norm": 912.0, "kl_loss_13": 151.55249099731446, "kl_loss_2": 2269.7977905273438, "kl_loss_4": 1416.2910705566405, "kl_loss_9": 543.7117935180664, "learning_rate": 0.0007938926261462366, "loss": 1126.6917, "step": 3070 }, { "ce_loss_13": 3.252531409263611, "ce_loss_17": 3.1861618876457216, "ce_loss_2": 4.288499522209167, "ce_loss_4": 3.831051743030548, "ce_loss_9": 3.4202633142471313, "epoch": 0.308, "grad_norm": 1328.0, "kl_loss_13": 156.2278793334961, "kl_loss_2": 2335.334600830078, "kl_loss_4": 1447.8191772460937, "kl_loss_9": 550.3907333374024, "learning_rate": 0.0007926075112568258, "loss": 1146.5154, "step": 3080 }, { "ce_loss_13": 3.2464492559432983, "ce_loss_17": 3.181445896625519, "ce_loss_2": 4.292973136901855, "ce_loss_4": 3.84365496635437, "ce_loss_9": 3.4248139381408693, "epoch": 0.309, "grad_norm": 860.0, "kl_loss_13": 153.26948318481445, "kl_loss_2": 2335.8649963378907, "kl_loss_4": 1470.2033325195312, "kl_loss_9": 558.1766815185547, "learning_rate": 0.0007913194498130252, "loss": 1114.3188, "step": 3090 }, { "ce_loss_13": 3.1723266720771788, "ce_loss_17": 3.1083881735801695, "ce_loss_2": 4.252477383613586, "ce_loss_4": 3.7924391984939576, "ce_loss_9": 3.355199670791626, "epoch": 0.31, "grad_norm": 1976.0, "kl_loss_13": 155.1093894958496, "kl_loss_2": 2370.7455139160156, "kl_loss_4": 1480.9192626953125, "kl_loss_9": 562.531704711914, "learning_rate": 0.0007900284547855992, "loss": 1145.8815, "step": 3100 }, { "ce_loss_13": 3.1906801581382753, "ce_loss_17": 3.126253354549408, "ce_loss_2": 4.222725510597229, "ce_loss_4": 3.783966934680939, "ce_loss_9": 3.3666825532913207, "epoch": 0.311, "grad_norm": 1304.0, "kl_loss_13": 152.6490852355957, "kl_loss_2": 2319.5681091308593, "kl_loss_4": 1456.0589172363282, "kl_loss_9": 559.1586547851563, "learning_rate": 0.0007887345391748532, "loss": 1143.0985, "step": 3110 }, { "ce_loss_13": 3.3048522114753722, "ce_loss_17": 3.24383544921875, "ce_loss_2": 4.312772250175476, "ce_loss_4": 3.87812625169754, "ce_loss_9": 3.4762377977371215, "epoch": 0.312, "grad_norm": 912.0, "kl_loss_13": 150.15930938720703, "kl_loss_2": 2267.2772705078123, "kl_loss_4": 1425.9346740722656, "kl_loss_9": 544.2236785888672, "learning_rate": 0.0007874377160105036, "loss": 1097.1926, "step": 3120 }, { "ce_loss_13": 3.2103907227516175, "ce_loss_17": 3.143777811527252, "ce_loss_2": 4.296741032600403, "ce_loss_4": 3.8537627935409544, "ce_loss_9": 3.4047881245613096, "epoch": 0.313, "grad_norm": 1304.0, "kl_loss_13": 150.8695846557617, "kl_loss_2": 2417.7692443847654, "kl_loss_4": 1562.9567993164062, "kl_loss_9": 602.8833923339844, "learning_rate": 0.0007861379983515449, "loss": 1188.1447, "step": 3130 }, { "ce_loss_13": 3.288115584850311, "ce_loss_17": 3.225015878677368, "ce_loss_2": 4.329520583152771, "ce_loss_4": 3.8883249044418333, "ce_loss_9": 3.4756605505943297, "epoch": 0.314, "grad_norm": 1016.0, "kl_loss_13": 150.58888320922853, "kl_loss_2": 2320.821423339844, "kl_loss_4": 1465.1744812011718, "kl_loss_9": 579.1295059204101, "learning_rate": 0.0007848353992861195, "loss": 1123.0182, "step": 3140 }, { "ce_loss_13": 3.3636842727661134, "ce_loss_17": 3.2931829929351806, "ce_loss_2": 4.410285210609436, "ce_loss_4": 3.9733946323394775, "ce_loss_9": 3.5541293025016785, "epoch": 0.315, "grad_norm": 1048.0, "kl_loss_13": 160.31154174804686, "kl_loss_2": 2328.236944580078, "kl_loss_4": 1484.042742919922, "kl_loss_9": 589.4030960083007, "learning_rate": 0.0007835299319313853, "loss": 1147.6381, "step": 3150 }, { "ce_loss_13": 3.249498438835144, "ce_loss_17": 3.1869486570358276, "ce_loss_2": 4.27755823135376, "ce_loss_4": 3.833740699291229, "ce_loss_9": 3.427429366111755, "epoch": 0.316, "grad_norm": 1056.0, "kl_loss_13": 152.25080032348632, "kl_loss_2": 2311.8635009765626, "kl_loss_4": 1435.6420349121095, "kl_loss_9": 568.1980636596679, "learning_rate": 0.0007822216094333848, "loss": 1155.0496, "step": 3160 }, { "ce_loss_13": 3.2559966683387755, "ce_loss_17": 3.19338481426239, "ce_loss_2": 4.324941778182984, "ce_loss_4": 3.865340304374695, "ce_loss_9": 3.440066230297089, "epoch": 0.317, "grad_norm": 1096.0, "kl_loss_13": 151.22731170654296, "kl_loss_2": 2377.271032714844, "kl_loss_4": 1482.4694702148438, "kl_loss_9": 574.9463195800781, "learning_rate": 0.0007809104449669101, "loss": 1130.1689, "step": 3170 }, { "ce_loss_13": 3.205196738243103, "ce_loss_17": 3.1431015968322753, "ce_loss_2": 4.243529379367828, "ce_loss_4": 3.791561245918274, "ce_loss_9": 3.3814544320106505, "epoch": 0.318, "grad_norm": 1096.0, "kl_loss_13": 148.97144813537597, "kl_loss_2": 2314.6684387207033, "kl_loss_4": 1438.1063049316406, "kl_loss_9": 557.4238296508789, "learning_rate": 0.0007795964517353734, "loss": 1115.059, "step": 3180 }, { "ce_loss_13": 3.20696964263916, "ce_loss_17": 3.1438522815704344, "ce_loss_2": 4.266876590251923, "ce_loss_4": 3.8088624835014344, "ce_loss_9": 3.3898658633232115, "epoch": 0.319, "grad_norm": 1128.0, "kl_loss_13": 165.41862030029296, "kl_loss_2": 2381.005236816406, "kl_loss_4": 1476.2580322265626, "kl_loss_9": 568.6693588256836, "learning_rate": 0.000778279642970672, "loss": 1117.838, "step": 3190 }, { "ce_loss_13": 3.2137474179267884, "ce_loss_17": 3.1448992013931276, "ce_loss_2": 4.235397756099701, "ce_loss_4": 3.7964614272117614, "ce_loss_9": 3.3810783505439757, "epoch": 0.32, "grad_norm": 1288.0, "kl_loss_13": 169.94136657714844, "kl_loss_2": 2311.4658325195314, "kl_loss_4": 1452.004412841797, "kl_loss_9": 556.6757232666016, "learning_rate": 0.0007769600319330552, "loss": 1112.1271, "step": 3200 }, { "ce_loss_13": 3.2547234535217284, "ce_loss_17": 3.172394323348999, "ce_loss_2": 4.328731942176819, "ce_loss_4": 3.852751278877258, "ce_loss_9": 3.4201572060585024, "epoch": 0.321, "grad_norm": 1104.0, "kl_loss_13": 187.84977188110352, "kl_loss_2": 2396.479626464844, "kl_loss_4": 1486.8917541503906, "kl_loss_9": 561.3488861083985, "learning_rate": 0.0007756376319109917, "loss": 1141.6826, "step": 3210 }, { "ce_loss_13": 3.302193260192871, "ce_loss_17": 3.2170753479003906, "ce_loss_2": 4.317452394962311, "ce_loss_4": 3.8698158383369448, "ce_loss_9": 3.463702642917633, "epoch": 0.322, "grad_norm": 1032.0, "kl_loss_13": 186.3388641357422, "kl_loss_2": 2306.749645996094, "kl_loss_4": 1440.2958374023438, "kl_loss_9": 563.9955520629883, "learning_rate": 0.0007743124562210351, "loss": 1106.6877, "step": 3220 }, { "ce_loss_13": 3.3037513732910155, "ce_loss_17": 3.2336353063583374, "ce_loss_2": 4.3166821837425235, "ce_loss_4": 3.8739144682884215, "ce_loss_9": 3.4672475934028624, "epoch": 0.323, "grad_norm": 1104.0, "kl_loss_13": 174.28631973266602, "kl_loss_2": 2307.5599609375, "kl_loss_4": 1437.7964599609375, "kl_loss_9": 556.7998184204101, "learning_rate": 0.0007729845182076895, "loss": 1127.4451, "step": 3230 }, { "ce_loss_13": 3.2311972856521605, "ce_loss_17": 3.1641186356544493, "ce_loss_2": 4.241222143173218, "ce_loss_4": 3.8035602927207948, "ce_loss_9": 3.402587127685547, "epoch": 0.324, "grad_norm": 1072.0, "kl_loss_13": 159.46905975341798, "kl_loss_2": 2285.189599609375, "kl_loss_4": 1427.4874206542968, "kl_loss_9": 553.4407196044922, "learning_rate": 0.0007716538312432765, "loss": 1132.933, "step": 3240 }, { "ce_loss_13": 3.1945708632469176, "ce_loss_17": 3.1267093300819395, "ce_loss_2": 4.258045554161072, "ce_loss_4": 3.7964141845703123, "ce_loss_9": 3.3753007769584658, "epoch": 0.325, "grad_norm": 1112.0, "kl_loss_13": 160.37776718139648, "kl_loss_2": 2370.964392089844, "kl_loss_4": 1486.5513610839844, "kl_loss_9": 571.4844757080078, "learning_rate": 0.0007703204087277988, "loss": 1141.6326, "step": 3250 }, { "ce_loss_13": 3.282006120681763, "ce_loss_17": 3.2203773736953734, "ce_loss_2": 4.284748280048371, "ce_loss_4": 3.841662037372589, "ce_loss_9": 3.451895225048065, "epoch": 0.326, "grad_norm": 1208.0, "kl_loss_13": 150.4366310119629, "kl_loss_2": 2244.1715881347654, "kl_loss_4": 1386.1337585449219, "kl_loss_9": 539.058497619629, "learning_rate": 0.0007689842640888063, "loss": 1094.1932, "step": 3260 }, { "ce_loss_13": 3.272642362117767, "ce_loss_17": 3.2100208044052123, "ce_loss_2": 4.298172831535339, "ce_loss_4": 3.85098614692688, "ce_loss_9": 3.445296549797058, "epoch": 0.327, "grad_norm": 956.0, "kl_loss_13": 150.65960845947265, "kl_loss_2": 2262.691534423828, "kl_loss_4": 1410.63779296875, "kl_loss_9": 553.1260284423828, "learning_rate": 0.0007676454107812607, "loss": 1105.252, "step": 3270 }, { "ce_loss_13": 3.2149673104286194, "ce_loss_17": 3.153050935268402, "ce_loss_2": 4.281734049320221, "ce_loss_4": 3.8068859457969664, "ce_loss_9": 3.3918928623199465, "epoch": 0.328, "grad_norm": 1144.0, "kl_loss_13": 152.45655517578126, "kl_loss_2": 2388.701483154297, "kl_loss_4": 1453.8044311523438, "kl_loss_9": 560.6139907836914, "learning_rate": 0.0007663038622873999, "loss": 1121.3218, "step": 3280 }, { "ce_loss_13": 3.2625755429267884, "ce_loss_17": 3.2015271544456483, "ce_loss_2": 4.300834918022156, "ce_loss_4": 3.838354003429413, "ce_loss_9": 3.429958176612854, "epoch": 0.329, "grad_norm": 1104.0, "kl_loss_13": 150.86729431152344, "kl_loss_2": 2324.6329711914063, "kl_loss_4": 1433.3134155273438, "kl_loss_9": 549.3743423461914, "learning_rate": 0.0007649596321166025, "loss": 1100.8021, "step": 3290 }, { "ce_loss_13": 3.1663953185081484, "ce_loss_17": 3.105218207836151, "ce_loss_2": 4.183412873744965, "ce_loss_4": 3.7444605469703673, "ce_loss_9": 3.334432578086853, "epoch": 0.33, "grad_norm": 1616.0, "kl_loss_13": 145.85060501098633, "kl_loss_2": 2262.158172607422, "kl_loss_4": 1413.0821594238282, "kl_loss_9": 541.2716659545898, "learning_rate": 0.0007636127338052513, "loss": 1111.6593, "step": 3300 }, { "ce_loss_13": 3.265879511833191, "ce_loss_17": 3.2037213683128356, "ce_loss_2": 4.331912851333618, "ce_loss_4": 3.8695867538452147, "ce_loss_9": 3.4435683608055117, "epoch": 0.331, "grad_norm": 940.0, "kl_loss_13": 150.9625701904297, "kl_loss_2": 2377.02763671875, "kl_loss_4": 1468.2773803710938, "kl_loss_9": 559.4819946289062, "learning_rate": 0.0007622631809165971, "loss": 1115.4734, "step": 3310 }, { "ce_loss_13": 3.2576201438903807, "ce_loss_17": 3.2005860924720766, "ce_loss_2": 4.255062627792358, "ce_loss_4": 3.8225162744522097, "ce_loss_9": 3.422081470489502, "epoch": 0.332, "grad_norm": 1032.0, "kl_loss_13": 141.62738418579102, "kl_loss_2": 2209.023748779297, "kl_loss_4": 1374.7735412597656, "kl_loss_9": 525.4694061279297, "learning_rate": 0.000760910987040623, "loss": 1086.1008, "step": 3320 }, { "ce_loss_13": 3.243612325191498, "ce_loss_17": 3.1816245317459106, "ce_loss_2": 4.3075049877166744, "ce_loss_4": 3.8558592081069945, "ce_loss_9": 3.4239736199378967, "epoch": 0.333, "grad_norm": 1168.0, "kl_loss_13": 150.68810424804687, "kl_loss_2": 2382.2288330078127, "kl_loss_4": 1490.8859436035157, "kl_loss_9": 564.201774597168, "learning_rate": 0.000759556165793906, "loss": 1115.6281, "step": 3330 }, { "ce_loss_13": 3.2569358944892883, "ce_loss_17": 3.19698988199234, "ce_loss_2": 4.301933026313781, "ce_loss_4": 3.8509605526924133, "ce_loss_9": 3.434249830245972, "epoch": 0.334, "grad_norm": 984.0, "kl_loss_13": 147.97396926879884, "kl_loss_2": 2321.010205078125, "kl_loss_4": 1449.370379638672, "kl_loss_9": 556.1116943359375, "learning_rate": 0.000758198730819481, "loss": 1126.7792, "step": 3340 }, { "ce_loss_13": 3.2173620581626894, "ce_loss_17": 3.158857786655426, "ce_loss_2": 4.254974257946015, "ce_loss_4": 3.812070834636688, "ce_loss_9": 3.385625433921814, "epoch": 0.335, "grad_norm": 1112.0, "kl_loss_13": 145.76851654052734, "kl_loss_2": 2321.397100830078, "kl_loss_4": 1452.2245361328125, "kl_loss_9": 542.5366271972656, "learning_rate": 0.0007568386957867032, "loss": 1115.2388, "step": 3350 }, { "ce_loss_13": 3.271975409984589, "ce_loss_17": 3.2107978582382204, "ce_loss_2": 4.307687997817993, "ce_loss_4": 3.8569952249526978, "ce_loss_9": 3.447258937358856, "epoch": 0.336, "grad_norm": 1048.0, "kl_loss_13": 148.24569931030274, "kl_loss_2": 2303.9115661621095, "kl_loss_4": 1431.6387634277344, "kl_loss_9": 552.6152633666992, "learning_rate": 0.0007554760743911103, "loss": 1120.017, "step": 3360 }, { "ce_loss_13": 3.1912718772888184, "ce_loss_17": 3.1337812542915344, "ce_loss_2": 4.2186089038848875, "ce_loss_4": 3.76707683801651, "ce_loss_9": 3.36250239610672, "epoch": 0.337, "grad_norm": 1176.0, "kl_loss_13": 142.47974586486816, "kl_loss_2": 2312.4614685058596, "kl_loss_4": 1419.9174560546876, "kl_loss_9": 535.9155670166016, "learning_rate": 0.0007541108803542846, "loss": 1132.0973, "step": 3370 }, { "ce_loss_13": 3.2337278604507445, "ce_loss_17": 3.171838617324829, "ce_loss_2": 4.276608884334564, "ce_loss_4": 3.8093732595443726, "ce_loss_9": 3.4018531322479246, "epoch": 0.338, "grad_norm": 1088.0, "kl_loss_13": 146.71328125, "kl_loss_2": 2330.150848388672, "kl_loss_4": 1424.2885192871095, "kl_loss_9": 541.9459503173828, "learning_rate": 0.0007527431274237149, "loss": 1159.5068, "step": 3380 }, { "ce_loss_13": 3.2051732659339907, "ce_loss_17": 3.147116720676422, "ce_loss_2": 4.234716486930847, "ce_loss_4": 3.773326241970062, "ce_loss_9": 3.370953857898712, "epoch": 0.339, "grad_norm": 1144.0, "kl_loss_13": 145.39046669006348, "kl_loss_2": 2304.125866699219, "kl_loss_4": 1411.9735046386718, "kl_loss_9": 536.0326232910156, "learning_rate": 0.0007513728293726579, "loss": 1107.4529, "step": 3390 }, { "ce_loss_13": 3.3187707543373106, "ce_loss_17": 3.2572783827781677, "ce_loss_2": 4.3241403818130495, "ce_loss_4": 3.8821993112564086, "ce_loss_9": 3.487199079990387, "epoch": 0.34, "grad_norm": 1288.0, "kl_loss_13": 146.64396209716796, "kl_loss_2": 2271.188995361328, "kl_loss_4": 1406.5373168945312, "kl_loss_9": 543.4974731445312, "learning_rate": 0.00075, "loss": 1092.8582, "step": 3400 }, { "ce_loss_13": 3.3127061486244203, "ce_loss_17": 3.24704624414444, "ce_loss_2": 4.352763652801514, "ce_loss_4": 3.893049967288971, "ce_loss_9": 3.4850422739982605, "epoch": 0.341, "grad_norm": 1112.0, "kl_loss_13": 151.97821655273438, "kl_loss_2": 2334.576800537109, "kl_loss_4": 1438.0738220214844, "kl_loss_9": 556.013232421875, "learning_rate": 0.0007486246531301177, "loss": 1107.3371, "step": 3410 }, { "ce_loss_13": 3.1261796832084654, "ce_loss_17": 3.06277174949646, "ce_loss_2": 4.1624379515647885, "ce_loss_4": 3.718223309516907, "ce_loss_9": 3.2950675010681154, "epoch": 0.342, "grad_norm": 1112.0, "kl_loss_13": 149.521150970459, "kl_loss_2": 2304.6083984375, "kl_loss_4": 1436.80576171875, "kl_loss_9": 541.5665588378906, "learning_rate": 0.0007472468026127384, "loss": 1096.0479, "step": 3420 }, { "ce_loss_13": 3.2584650993347166, "ce_loss_17": 3.1953421592712403, "ce_loss_2": 4.334172916412354, "ce_loss_4": 3.866248273849487, "ce_loss_9": 3.4411001443862914, "epoch": 0.343, "grad_norm": 1424.0, "kl_loss_13": 158.502001953125, "kl_loss_2": 2416.915905761719, "kl_loss_4": 1494.9845336914063, "kl_loss_9": 573.8610061645508, "learning_rate": 0.000745866462322802, "loss": 1143.1998, "step": 3430 }, { "ce_loss_13": 3.2432548999786377, "ce_loss_17": 3.1849469780921935, "ce_loss_2": 4.2538442492485045, "ce_loss_4": 3.8122917294502257, "ce_loss_9": 3.4124765157699586, "epoch": 0.344, "grad_norm": 1096.0, "kl_loss_13": 147.10776596069337, "kl_loss_2": 2259.4493896484373, "kl_loss_4": 1393.3152954101563, "kl_loss_9": 535.3203903198242, "learning_rate": 0.0007444836461603195, "loss": 1097.352, "step": 3440 }, { "ce_loss_13": 3.3016470074653625, "ce_loss_17": 3.2397023320198057, "ce_loss_2": 4.341945815086365, "ce_loss_4": 3.887634980678558, "ce_loss_9": 3.474075531959534, "epoch": 0.345, "grad_norm": 1016.0, "kl_loss_13": 154.09693641662597, "kl_loss_2": 2349.6806701660157, "kl_loss_4": 1459.3677795410156, "kl_loss_9": 559.6774291992188, "learning_rate": 0.0007430983680502344, "loss": 1133.1525, "step": 3450 }, { "ce_loss_13": 3.1477053642272947, "ce_loss_17": 3.0883967161178587, "ce_loss_2": 4.20899977684021, "ce_loss_4": 3.751055192947388, "ce_loss_9": 3.321410799026489, "epoch": 0.346, "grad_norm": 1552.0, "kl_loss_13": 148.25251007080078, "kl_loss_2": 2360.233758544922, "kl_loss_4": 1470.9148742675782, "kl_loss_9": 555.7899490356446, "learning_rate": 0.0007417106419422819, "loss": 1125.7767, "step": 3460 }, { "ce_loss_13": 3.2474496603012084, "ce_loss_17": 3.184881091117859, "ce_loss_2": 4.280569851398468, "ce_loss_4": 3.8262940287590026, "ce_loss_9": 3.4188409447669983, "epoch": 0.347, "grad_norm": 948.0, "kl_loss_13": 148.29722518920897, "kl_loss_2": 2292.6326416015627, "kl_loss_4": 1409.8984130859376, "kl_loss_9": 542.1886779785157, "learning_rate": 0.0007403204818108486, "loss": 1113.5707, "step": 3470 }, { "ce_loss_13": 3.228637766838074, "ce_loss_17": 3.1685121297836303, "ce_loss_2": 4.2711215138435366, "ce_loss_4": 3.8028414607048036, "ce_loss_9": 3.4011463165283202, "epoch": 0.348, "grad_norm": 1208.0, "kl_loss_13": 150.19566650390624, "kl_loss_2": 2353.558056640625, "kl_loss_4": 1430.5952941894532, "kl_loss_9": 551.05166015625, "learning_rate": 0.0007389279016548316, "loss": 1088.5112, "step": 3480 }, { "ce_loss_13": 3.2353877067565917, "ce_loss_17": 3.168398642539978, "ce_loss_2": 4.31832218170166, "ce_loss_4": 3.837770998477936, "ce_loss_9": 3.411174511909485, "epoch": 0.349, "grad_norm": 1040.0, "kl_loss_13": 153.03259658813477, "kl_loss_2": 2415.8524963378904, "kl_loss_4": 1474.031640625, "kl_loss_9": 561.5152221679688, "learning_rate": 0.0007375329154974975, "loss": 1133.8506, "step": 3490 }, { "ce_loss_13": 3.1968216061592103, "ce_loss_17": 3.1390748023986816, "ce_loss_2": 4.216631627082824, "ce_loss_4": 3.771665835380554, "ce_loss_9": 3.363450062274933, "epoch": 0.35, "grad_norm": 828.0, "kl_loss_13": 146.8826446533203, "kl_loss_2": 2269.4626220703126, "kl_loss_4": 1406.673162841797, "kl_loss_9": 535.1948822021484, "learning_rate": 0.0007361355373863414, "loss": 1119.1126, "step": 3500 }, { "ce_loss_13": 3.2356035232543947, "ce_loss_17": 3.1763408303260805, "ce_loss_2": 4.259536981582642, "ce_loss_4": 3.8046111702919005, "ce_loss_9": 3.402137577533722, "epoch": 0.351, "grad_norm": 1024.0, "kl_loss_13": 145.52130050659179, "kl_loss_2": 2260.014050292969, "kl_loss_4": 1383.0173461914062, "kl_loss_9": 531.672900390625, "learning_rate": 0.0007347357813929454, "loss": 1117.5785, "step": 3510 }, { "ce_loss_13": 3.1925937294960023, "ce_loss_17": 3.130200004577637, "ce_loss_2": 4.210647571086883, "ce_loss_4": 3.764564573764801, "ce_loss_9": 3.358327293395996, "epoch": 0.352, "grad_norm": 1320.0, "kl_loss_13": 145.41512908935547, "kl_loss_2": 2270.6834350585937, "kl_loss_4": 1401.1832641601563, "kl_loss_9": 532.2987030029296, "learning_rate": 0.0007333336616128369, "loss": 1114.4848, "step": 3520 }, { "ce_loss_13": 3.162831437587738, "ce_loss_17": 3.1000130414962768, "ce_loss_2": 4.227586054801941, "ce_loss_4": 3.762269580364227, "ce_loss_9": 3.34154292345047, "epoch": 0.353, "grad_norm": 1360.0, "kl_loss_13": 148.98602676391602, "kl_loss_2": 2345.5939514160154, "kl_loss_4": 1448.8812316894532, "kl_loss_9": 553.6644653320312, "learning_rate": 0.0007319291921653463, "loss": 1121.9639, "step": 3530 }, { "ce_loss_13": 3.249232518672943, "ce_loss_17": 3.1846173644065856, "ce_loss_2": 4.3015941143035885, "ce_loss_4": 3.8419970273971558, "ce_loss_9": 3.4276416659355164, "epoch": 0.354, "grad_norm": 1264.0, "kl_loss_13": 150.81182098388672, "kl_loss_2": 2328.228625488281, "kl_loss_4": 1442.2828063964844, "kl_loss_9": 552.777229309082, "learning_rate": 0.0007305223871934656, "loss": 1102.2092, "step": 3540 }, { "ce_loss_13": 3.2185683250427246, "ce_loss_17": 3.1567870378494263, "ce_loss_2": 4.248492741584778, "ce_loss_4": 3.7911778688430786, "ce_loss_9": 3.386900854110718, "epoch": 0.355, "grad_norm": 1072.0, "kl_loss_13": 146.7679130554199, "kl_loss_2": 2296.283331298828, "kl_loss_4": 1412.8982482910155, "kl_loss_9": 538.6669143676758, "learning_rate": 0.0007291132608637052, "loss": 1105.6255, "step": 3550 }, { "ce_loss_13": 3.178242194652557, "ce_loss_17": 3.12276805639267, "ce_loss_2": 4.302333700656891, "ce_loss_4": 3.7869089245796204, "ce_loss_9": 3.354844129085541, "epoch": 0.356, "grad_norm": 1216.0, "kl_loss_13": 142.80283546447754, "kl_loss_2": 2466.674755859375, "kl_loss_4": 1458.3899169921874, "kl_loss_9": 537.4322006225586, "learning_rate": 0.0007277018273659516, "loss": 1144.3326, "step": 3560 }, { "ce_loss_13": 3.2996881127357485, "ce_loss_17": 3.2396285772323608, "ce_loss_2": 4.340237092971802, "ce_loss_4": 3.897028183937073, "ce_loss_9": 3.4774867057800294, "epoch": 0.357, "grad_norm": 1000.0, "kl_loss_13": 149.96279830932616, "kl_loss_2": 2329.6284118652343, "kl_loss_4": 1458.4853576660157, "kl_loss_9": 557.2407135009765, "learning_rate": 0.0007262881009133242, "loss": 1119.3438, "step": 3570 }, { "ce_loss_13": 3.221097409725189, "ce_loss_17": 3.163021421432495, "ce_loss_2": 4.24693284034729, "ce_loss_4": 3.801970827579498, "ce_loss_9": 3.3903760433197023, "epoch": 0.358, "grad_norm": 1160.0, "kl_loss_13": 142.13100357055663, "kl_loss_2": 2275.99423828125, "kl_loss_4": 1409.8691162109376, "kl_loss_9": 529.3317184448242, "learning_rate": 0.0007248720957420329, "loss": 1086.8392, "step": 3580 }, { "ce_loss_13": 3.2255361557006834, "ce_loss_17": 3.169695591926575, "ce_loss_2": 4.2355260968208315, "ce_loss_4": 3.7899292588233946, "ce_loss_9": 3.3940977334976195, "epoch": 0.359, "grad_norm": 1012.0, "kl_loss_13": 142.63518600463868, "kl_loss_2": 2253.330181884766, "kl_loss_4": 1386.0135681152344, "kl_loss_9": 531.0137985229492, "learning_rate": 0.0007234538261112341, "loss": 1116.9816, "step": 3590 }, { "ce_loss_13": 3.265474319458008, "ce_loss_17": 3.2016140699386595, "ce_loss_2": 4.305642938613891, "ce_loss_4": 3.8428993940353395, "ce_loss_9": 3.4374499320983887, "epoch": 0.36, "grad_norm": 1072.0, "kl_loss_13": 147.73601722717285, "kl_loss_2": 2308.5155639648438, "kl_loss_4": 1414.5113647460937, "kl_loss_9": 547.4331771850586, "learning_rate": 0.0007220333063028871, "loss": 1096.4055, "step": 3600 }, { "ce_loss_13": 3.2989066004753114, "ce_loss_17": 3.2356014132499693, "ce_loss_2": 4.422996711730957, "ce_loss_4": 3.9858542323112487, "ce_loss_9": 3.5469090700149537, "epoch": 0.361, "grad_norm": 1480.0, "kl_loss_13": 151.63524017333984, "kl_loss_2": 2514.263232421875, "kl_loss_4": 1652.5789123535155, "kl_loss_9": 695.4253387451172, "learning_rate": 0.0007206105506216106, "loss": 1193.3166, "step": 3610 }, { "ce_loss_13": 3.1783748149871824, "ce_loss_17": 3.1190456986427306, "ce_loss_2": 4.191591489315033, "ce_loss_4": 3.7545589327812197, "ce_loss_9": 3.3499138712882996, "epoch": 0.362, "grad_norm": 1448.0, "kl_loss_13": 144.47329483032226, "kl_loss_2": 2252.7556762695312, "kl_loss_4": 1405.2230529785156, "kl_loss_9": 545.3408447265625, "learning_rate": 0.0007191855733945387, "loss": 1080.2408, "step": 3620 }, { "ce_loss_13": 3.2665766358375548, "ce_loss_17": 3.2080157518386843, "ce_loss_2": 4.296332335472107, "ce_loss_4": 3.8461857795715333, "ce_loss_9": 3.4394288897514342, "epoch": 0.363, "grad_norm": 1056.0, "kl_loss_13": 144.99088439941406, "kl_loss_2": 2282.916888427734, "kl_loss_4": 1411.8398010253907, "kl_loss_9": 539.8447341918945, "learning_rate": 0.0007177583889711762, "loss": 1092.0897, "step": 3630 }, { "ce_loss_13": 3.1844016313552856, "ce_loss_17": 3.1235493302345274, "ce_loss_2": 4.226015019416809, "ce_loss_4": 3.7693402051925657, "ce_loss_9": 3.357278311252594, "epoch": 0.364, "grad_norm": 1012.0, "kl_loss_13": 145.76128997802735, "kl_loss_2": 2327.2415283203127, "kl_loss_4": 1433.749462890625, "kl_loss_9": 546.5354614257812, "learning_rate": 0.0007163290117232541, "loss": 1109.9568, "step": 3640 }, { "ce_loss_13": 3.297709548473358, "ce_loss_17": 3.237687146663666, "ce_loss_2": 4.276693189144135, "ce_loss_4": 3.8386462807655333, "ce_loss_9": 3.4577416300773622, "epoch": 0.365, "grad_norm": 1184.0, "kl_loss_13": 142.86292419433593, "kl_loss_2": 2231.6551879882813, "kl_loss_4": 1367.6371154785156, "kl_loss_9": 527.7468185424805, "learning_rate": 0.0007148974560445859, "loss": 1081.7939, "step": 3650 }, { "ce_loss_13": 3.222912406921387, "ce_loss_17": 3.1636874079704285, "ce_loss_2": 4.2330084323883055, "ce_loss_4": 3.7924039363861084, "ce_loss_9": 3.3899203300476075, "epoch": 0.366, "grad_norm": 928.0, "kl_loss_13": 143.13527603149413, "kl_loss_2": 2241.180841064453, "kl_loss_4": 1385.8086303710938, "kl_loss_9": 534.73125, "learning_rate": 0.0007134637363509209, "loss": 1074.9213, "step": 3660 }, { "ce_loss_13": 3.335157060623169, "ce_loss_17": 3.2772869348526, "ce_loss_2": 4.325956046581268, "ce_loss_4": 3.8889792442321776, "ce_loss_9": 3.4993149757385256, "epoch": 0.367, "grad_norm": 1056.0, "kl_loss_13": 140.72308235168458, "kl_loss_2": 2214.848260498047, "kl_loss_4": 1376.8715209960938, "kl_loss_9": 529.9875915527343, "learning_rate": 0.0007120278670798009, "loss": 1091.1173, "step": 3670 }, { "ce_loss_13": 3.136798989772797, "ce_loss_17": 3.0759010672569276, "ce_loss_2": 4.235171782970428, "ce_loss_4": 3.7663430690765383, "ce_loss_9": 3.3258102416992186, "epoch": 0.368, "grad_norm": 1040.0, "kl_loss_13": 147.1903984069824, "kl_loss_2": 2423.557489013672, "kl_loss_4": 1495.6615356445313, "kl_loss_9": 568.7859619140625, "learning_rate": 0.0007105898626904133, "loss": 1155.8561, "step": 3680 }, { "ce_loss_13": 3.233095121383667, "ce_loss_17": 3.1725640177726744, "ce_loss_2": 4.280826711654663, "ce_loss_4": 3.814694082736969, "ce_loss_9": 3.401129114627838, "epoch": 0.369, "grad_norm": 1168.0, "kl_loss_13": 146.79506340026856, "kl_loss_2": 2306.6810546875, "kl_loss_4": 1412.0461364746093, "kl_loss_9": 540.7714614868164, "learning_rate": 0.0007091497376634463, "loss": 1093.4692, "step": 3690 }, { "ce_loss_13": 3.177489197254181, "ce_loss_17": 3.1213108777999876, "ce_loss_2": 4.201327252388, "ce_loss_4": 3.752693247795105, "ce_loss_9": 3.347600758075714, "epoch": 0.37, "grad_norm": 1224.0, "kl_loss_13": 144.83996353149413, "kl_loss_2": 2272.6609008789064, "kl_loss_4": 1404.1740234375, "kl_loss_9": 537.0068740844727, "learning_rate": 0.0007077075065009433, "loss": 1109.5295, "step": 3700 }, { "ce_loss_13": 3.280963397026062, "ce_loss_17": 3.218790066242218, "ce_loss_2": 4.327343392372131, "ce_loss_4": 3.869922089576721, "ce_loss_9": 3.4605684995651247, "epoch": 0.371, "grad_norm": 1080.0, "kl_loss_13": 151.46605529785157, "kl_loss_2": 2337.507568359375, "kl_loss_4": 1437.551611328125, "kl_loss_9": 556.6773880004882, "learning_rate": 0.0007062631837261557, "loss": 1113.151, "step": 3710 }, { "ce_loss_13": 3.1577559113502502, "ce_loss_17": 3.1025161147117615, "ce_loss_2": 4.190767383575439, "ce_loss_4": 3.7329460263252257, "ce_loss_9": 3.329840636253357, "epoch": 0.372, "grad_norm": 924.0, "kl_loss_13": 142.3006591796875, "kl_loss_2": 2295.3624938964845, "kl_loss_4": 1407.7059448242187, "kl_loss_9": 537.3508773803711, "learning_rate": 0.0007048167838833977, "loss": 1121.9287, "step": 3720 }, { "ce_loss_13": 3.249891424179077, "ce_loss_17": 3.1910805702209473, "ce_loss_2": 4.248119270801544, "ce_loss_4": 3.800618124008179, "ce_loss_9": 3.415862667560577, "epoch": 0.373, "grad_norm": 1008.0, "kl_loss_13": 145.40240554809571, "kl_loss_2": 2251.275341796875, "kl_loss_4": 1370.6163269042968, "kl_loss_9": 534.883023071289, "learning_rate": 0.0007033683215379002, "loss": 1081.681, "step": 3730 }, { "ce_loss_13": 3.230395829677582, "ce_loss_17": 3.1737599849700926, "ce_loss_2": 4.260580706596374, "ce_loss_4": 3.8079190373420717, "ce_loss_9": 3.4071801900863647, "epoch": 0.374, "grad_norm": 1056.0, "kl_loss_13": 141.74846267700195, "kl_loss_2": 2266.902410888672, "kl_loss_4": 1387.6708251953125, "kl_loss_9": 530.994091796875, "learning_rate": 0.0007019178112756625, "loss": 1100.8117, "step": 3740 }, { "ce_loss_13": 3.201353704929352, "ce_loss_17": 3.1425270795822144, "ce_loss_2": 4.225880908966064, "ce_loss_4": 3.772598683834076, "ce_loss_9": 3.3647816658020018, "epoch": 0.375, "grad_norm": 1304.0, "kl_loss_13": 141.04876251220702, "kl_loss_2": 2274.372802734375, "kl_loss_4": 1400.5207092285157, "kl_loss_9": 530.8258392333985, "learning_rate": 0.0007004652677033068, "loss": 1099.8803, "step": 3750 }, { "ce_loss_13": 3.2765305042266846, "ce_loss_17": 3.22227942943573, "ce_loss_2": 4.27452290058136, "ce_loss_4": 3.8275328516960143, "ce_loss_9": 3.440238118171692, "epoch": 0.376, "grad_norm": 1176.0, "kl_loss_13": 139.7890884399414, "kl_loss_2": 2238.3694458007812, "kl_loss_4": 1367.7406066894532, "kl_loss_9": 521.6471252441406, "learning_rate": 0.0006990107054479312, "loss": 1082.2824, "step": 3760 }, { "ce_loss_13": 3.256076622009277, "ce_loss_17": 3.194600772857666, "ce_loss_2": 4.264088082313537, "ce_loss_4": 3.8264092564582826, "ce_loss_9": 3.425094723701477, "epoch": 0.377, "grad_norm": 1232.0, "kl_loss_13": 144.97562484741212, "kl_loss_2": 2255.506219482422, "kl_loss_4": 1399.572735595703, "kl_loss_9": 536.1826797485352, "learning_rate": 0.000697554139156961, "loss": 1093.2152, "step": 3770 }, { "ce_loss_13": 3.245597219467163, "ce_loss_17": 3.186148262023926, "ce_loss_2": 4.275712180137634, "ce_loss_4": 3.821213412284851, "ce_loss_9": 3.4150866270065308, "epoch": 0.378, "grad_norm": 1004.0, "kl_loss_13": 146.35874252319337, "kl_loss_2": 2314.0983276367188, "kl_loss_4": 1416.1824890136718, "kl_loss_9": 546.618765258789, "learning_rate": 0.0006960955834980027, "loss": 1080.7665, "step": 3780 }, { "ce_loss_13": 3.216923546791077, "ce_loss_17": 3.1580015063285827, "ce_loss_2": 4.225869452953338, "ce_loss_4": 3.782306122779846, "ce_loss_9": 3.380349326133728, "epoch": 0.379, "grad_norm": 836.0, "kl_loss_13": 143.18241539001465, "kl_loss_2": 2258.4364990234376, "kl_loss_4": 1401.2352722167968, "kl_loss_9": 533.9924728393555, "learning_rate": 0.0006946350531586958, "loss": 1090.8575, "step": 3790 }, { "ce_loss_13": 3.23983633518219, "ce_loss_17": 3.180867946147919, "ce_loss_2": 4.266278386116028, "ce_loss_4": 3.818061637878418, "ce_loss_9": 3.4105425238609315, "epoch": 0.38, "grad_norm": 1112.0, "kl_loss_13": 142.36459884643554, "kl_loss_2": 2282.0339538574217, "kl_loss_4": 1413.4867797851562, "kl_loss_9": 534.3084548950195, "learning_rate": 0.0006931725628465643, "loss": 1115.1276, "step": 3800 }, { "ce_loss_13": 3.25376056432724, "ce_loss_17": 3.1915287494659426, "ce_loss_2": 4.285311496257782, "ce_loss_4": 3.8337749242782593, "ce_loss_9": 3.419885981082916, "epoch": 0.381, "grad_norm": 1304.0, "kl_loss_13": 144.90459938049315, "kl_loss_2": 2283.720837402344, "kl_loss_4": 1408.787530517578, "kl_loss_9": 537.7331726074219, "learning_rate": 0.0006917081272888696, "loss": 1099.883, "step": 3810 }, { "ce_loss_13": 3.1708564519882203, "ce_loss_17": 3.1096996068954468, "ce_loss_2": 4.234085977077484, "ce_loss_4": 3.7422009468078614, "ce_loss_9": 3.3410732626914976, "epoch": 0.382, "grad_norm": 1168.0, "kl_loss_13": 144.76926918029784, "kl_loss_2": 2372.684216308594, "kl_loss_4": 1412.5823364257812, "kl_loss_9": 543.8266342163085, "learning_rate": 0.0006902417612324615, "loss": 1097.915, "step": 3820 }, { "ce_loss_13": 3.289571964740753, "ce_loss_17": 3.229340147972107, "ce_loss_2": 4.345370495319367, "ce_loss_4": 3.8833075881004335, "ce_loss_9": 3.470554828643799, "epoch": 0.383, "grad_norm": 1152.0, "kl_loss_13": 149.838623046875, "kl_loss_2": 2354.5707397460938, "kl_loss_4": 1445.2726745605469, "kl_loss_9": 556.7191513061523, "learning_rate": 0.00068877347944363, "loss": 1116.3727, "step": 3830 }, { "ce_loss_13": 3.288759171962738, "ce_loss_17": 3.230839025974274, "ce_loss_2": 4.285260081291199, "ce_loss_4": 3.8388131856918335, "ce_loss_9": 3.4526683568954466, "epoch": 0.384, "grad_norm": 1080.0, "kl_loss_13": 144.03519592285156, "kl_loss_2": 2237.6379272460936, "kl_loss_4": 1379.7740539550782, "kl_loss_9": 533.508641052246, "learning_rate": 0.0006873032967079561, "loss": 1094.7779, "step": 3840 }, { "ce_loss_13": 3.275795066356659, "ce_loss_17": 3.2191882848739626, "ce_loss_2": 4.25528713464737, "ce_loss_4": 3.8196405172348022, "ce_loss_9": 3.4371477127075196, "epoch": 0.385, "grad_norm": 1176.0, "kl_loss_13": 142.05069694519042, "kl_loss_2": 2218.6145874023437, "kl_loss_4": 1365.281317138672, "kl_loss_9": 528.4167739868165, "learning_rate": 0.0006858312278301637, "loss": 1067.3784, "step": 3850 }, { "ce_loss_13": 3.309419846534729, "ce_loss_17": 3.2534318566322327, "ce_loss_2": 4.283396553993225, "ce_loss_4": 3.846131241321564, "ce_loss_9": 3.4694788098335265, "epoch": 0.386, "grad_norm": 1216.0, "kl_loss_13": 143.49032974243164, "kl_loss_2": 2215.0926513671875, "kl_loss_4": 1370.4406372070312, "kl_loss_9": 528.8612045288086, "learning_rate": 0.0006843572876339704, "loss": 1071.2264, "step": 3860 }, { "ce_loss_13": 3.2243395805358888, "ce_loss_17": 3.1701308131217956, "ce_loss_2": 4.1989872574806215, "ce_loss_4": 3.7666295051574705, "ce_loss_9": 3.3849170446395873, "epoch": 0.387, "grad_norm": 976.0, "kl_loss_13": 138.2883701324463, "kl_loss_2": 2191.8744140625, "kl_loss_4": 1348.6982971191405, "kl_loss_9": 515.7973876953125, "learning_rate": 0.0006828814909619373, "loss": 1095.1074, "step": 3870 }, { "ce_loss_13": 3.3449166297912596, "ce_loss_17": 3.286892902851105, "ce_loss_2": 4.346810603141785, "ce_loss_4": 3.8929901719093323, "ce_loss_9": 3.5063482761383056, "epoch": 0.388, "grad_norm": 868.0, "kl_loss_13": 145.4689910888672, "kl_loss_2": 2239.009161376953, "kl_loss_4": 1361.2510803222656, "kl_loss_9": 528.7980728149414, "learning_rate": 0.0006814038526753205, "loss": 1061.7154, "step": 3880 }, { "ce_loss_13": 3.2501052141189577, "ce_loss_17": 3.1917136788368223, "ce_loss_2": 4.252227938175201, "ce_loss_4": 3.8069416165351866, "ce_loss_9": 3.4180442214012148, "epoch": 0.389, "grad_norm": 920.0, "kl_loss_13": 142.82520294189453, "kl_loss_2": 2233.081335449219, "kl_loss_4": 1376.3583984375, "kl_loss_9": 530.4266921997071, "learning_rate": 0.0006799243876539213, "loss": 1079.541, "step": 3890 }, { "ce_loss_13": 3.174937331676483, "ce_loss_17": 3.116032266616821, "ce_loss_2": 4.2224876403808596, "ce_loss_4": 3.74836003780365, "ce_loss_9": 3.3466053366661073, "epoch": 0.39, "grad_norm": 1352.0, "kl_loss_13": 142.3447692871094, "kl_loss_2": 2315.2199829101564, "kl_loss_4": 1392.1733459472657, "kl_loss_9": 534.2415222167969, "learning_rate": 0.0006784431107959359, "loss": 1099.9259, "step": 3900 }, { "ce_loss_13": 3.2335155725479128, "ce_loss_17": 3.1751118063926698, "ce_loss_2": 4.290912342071533, "ce_loss_4": 3.821517026424408, "ce_loss_9": 3.4079156160354613, "epoch": 0.391, "grad_norm": 996.0, "kl_loss_13": 144.96173362731935, "kl_loss_2": 2346.229870605469, "kl_loss_4": 1425.8754333496095, "kl_loss_9": 542.3289611816406, "learning_rate": 0.0006769600370178059, "loss": 1099.0344, "step": 3910 }, { "ce_loss_13": 3.200530457496643, "ce_loss_17": 3.1419711232185366, "ce_loss_2": 4.221344840526581, "ce_loss_4": 3.7708722352981567, "ce_loss_9": 3.3685542821884153, "epoch": 0.392, "grad_norm": 1072.0, "kl_loss_13": 141.8190891265869, "kl_loss_2": 2272.987408447266, "kl_loss_4": 1402.2062377929688, "kl_loss_9": 535.7193130493164, "learning_rate": 0.0006754751812540679, "loss": 1070.5279, "step": 3920 }, { "ce_loss_13": 3.2458479166030885, "ce_loss_17": 3.187723922729492, "ce_loss_2": 4.282582712173462, "ce_loss_4": 3.8182594895362856, "ce_loss_9": 3.4138319373130797, "epoch": 0.393, "grad_norm": 1032.0, "kl_loss_13": 145.07766571044922, "kl_loss_2": 2316.783581542969, "kl_loss_4": 1406.7053955078125, "kl_loss_9": 533.075520324707, "learning_rate": 0.0006739885584572025, "loss": 1103.8527, "step": 3930 }, { "ce_loss_13": 3.2687594890594482, "ce_loss_17": 3.2100741147994993, "ce_loss_2": 4.324749565124511, "ce_loss_4": 3.842464029788971, "ce_loss_9": 3.4363712906837462, "epoch": 0.394, "grad_norm": 1640.0, "kl_loss_13": 147.05072326660155, "kl_loss_2": 2373.3624633789063, "kl_loss_4": 1425.4352172851563, "kl_loss_9": 537.4004592895508, "learning_rate": 0.0006725001835974853, "loss": 1097.4596, "step": 3940 }, { "ce_loss_13": 3.266835296154022, "ce_loss_17": 3.2022838473320006, "ce_loss_2": 4.305810964107513, "ce_loss_4": 3.8371684312820435, "ce_loss_9": 3.4357321739196776, "epoch": 0.395, "grad_norm": 1168.0, "kl_loss_13": 147.67078170776367, "kl_loss_2": 2323.375280761719, "kl_loss_4": 1414.3819641113282, "kl_loss_9": 539.8738250732422, "learning_rate": 0.0006710100716628344, "loss": 1081.6447, "step": 3950 }, { "ce_loss_13": 3.2454325437545775, "ce_loss_17": 3.1863141536712645, "ce_loss_2": 4.268840169906616, "ce_loss_4": 3.8242222547531126, "ce_loss_9": 3.4139421820640563, "epoch": 0.396, "grad_norm": 1248.0, "kl_loss_13": 142.41865005493165, "kl_loss_2": 2278.9330505371095, "kl_loss_4": 1400.0128112792968, "kl_loss_9": 529.4596267700196, "learning_rate": 0.0006695182376586602, "loss": 1094.5256, "step": 3960 }, { "ce_loss_13": 3.270889091491699, "ce_loss_17": 3.2131107211112977, "ce_loss_2": 4.242412006855011, "ce_loss_4": 3.8120558500289916, "ce_loss_9": 3.4289388060569763, "epoch": 0.397, "grad_norm": 976.0, "kl_loss_13": 136.08055839538574, "kl_loss_2": 2154.5995910644533, "kl_loss_4": 1317.3354797363281, "kl_loss_9": 506.0668609619141, "learning_rate": 0.000668024696607715, "loss": 1073.0659, "step": 3970 }, { "ce_loss_13": 3.2372169494628906, "ce_loss_17": 3.180163621902466, "ce_loss_2": 4.252505350112915, "ce_loss_4": 3.796736788749695, "ce_loss_9": 3.400821304321289, "epoch": 0.398, "grad_norm": 1312.0, "kl_loss_13": 140.31367378234864, "kl_loss_2": 2270.2838928222654, "kl_loss_4": 1384.3943664550782, "kl_loss_9": 527.7767562866211, "learning_rate": 0.0006665294635499404, "loss": 1082.9027, "step": 3980 }, { "ce_loss_13": 3.2423538088798525, "ce_loss_17": 3.1814489722251893, "ce_loss_2": 4.3047816276550295, "ce_loss_4": 3.8311443090438844, "ce_loss_9": 3.4165687441825865, "epoch": 0.399, "grad_norm": 1384.0, "kl_loss_13": 148.61846618652345, "kl_loss_2": 2379.7642517089844, "kl_loss_4": 1452.3165954589845, "kl_loss_9": 549.0985458374023, "learning_rate": 0.0006650325535423167, "loss": 1106.8416, "step": 3990 }, { "ce_loss_13": 3.2674113273620606, "ce_loss_17": 3.2089606404304503, "ce_loss_2": 4.239629650115967, "ce_loss_4": 3.8048909425735475, "ce_loss_9": 3.4261961698532106, "epoch": 0.4, "grad_norm": 1032.0, "kl_loss_13": 139.04373092651366, "kl_loss_2": 2165.116558837891, "kl_loss_4": 1325.136541748047, "kl_loss_9": 512.5076156616211, "learning_rate": 0.0006635339816587109, "loss": 1068.3164, "step": 4000 }, { "ce_loss_13": 3.208540880680084, "ce_loss_17": 3.146498429775238, "ce_loss_2": 4.249489653110504, "ce_loss_4": 3.7775869250297545, "ce_loss_9": 3.3706496000289916, "epoch": 0.401, "grad_norm": 1080.0, "kl_loss_13": 150.75496444702148, "kl_loss_2": 2335.5356201171876, "kl_loss_4": 1411.4281921386719, "kl_loss_9": 531.2612396240235, "learning_rate": 0.0006620337629897252, "loss": 1088.9196, "step": 4010 }, { "ce_loss_13": 3.218238043785095, "ce_loss_17": 3.1518359541893006, "ce_loss_2": 4.230412554740906, "ce_loss_4": 3.77787264585495, "ce_loss_9": 3.381802034378052, "epoch": 0.402, "grad_norm": 1040.0, "kl_loss_13": 159.8979637145996, "kl_loss_2": 2287.495324707031, "kl_loss_4": 1394.5897277832032, "kl_loss_9": 530.7736282348633, "learning_rate": 0.0006605319126425454, "loss": 1110.2237, "step": 4020 }, { "ce_loss_13": 3.141453719139099, "ce_loss_17": 3.061632287502289, "ce_loss_2": 4.179062056541443, "ce_loss_4": 3.703935217857361, "ce_loss_9": 3.292442739009857, "epoch": 0.403, "grad_norm": 1088.0, "kl_loss_13": 184.18295135498047, "kl_loss_2": 2369.800567626953, "kl_loss_4": 1440.7917724609374, "kl_loss_9": 540.8053848266602, "learning_rate": 0.0006590284457407876, "loss": 1116.9631, "step": 4030 }, { "ce_loss_13": 3.2274596095085144, "ce_loss_17": 3.157707929611206, "ce_loss_2": 4.244430947303772, "ce_loss_4": 3.7824127674102783, "ce_loss_9": 3.3829394340515138, "epoch": 0.404, "grad_norm": 1536.0, "kl_loss_13": 174.13474349975587, "kl_loss_2": 2279.291223144531, "kl_loss_4": 1381.667462158203, "kl_loss_9": 531.105616760254, "learning_rate": 0.0006575233774243465, "loss": 1090.1229, "step": 4040 }, { "ce_loss_13": 3.2229812860488893, "ce_loss_17": 3.148377251625061, "ce_loss_2": 4.239022588729858, "ce_loss_4": 3.7807064771652223, "ce_loss_9": 3.370824730396271, "epoch": 0.405, "grad_norm": 1112.0, "kl_loss_13": 176.86324081420898, "kl_loss_2": 2314.5283935546877, "kl_loss_4": 1409.0387573242188, "kl_loss_9": 532.7137908935547, "learning_rate": 0.0006560167228492435, "loss": 1103.0762, "step": 4050 }, { "ce_loss_13": 3.2612957239151, "ce_loss_17": 3.1980576634407045, "ce_loss_2": 4.2364744424819945, "ce_loss_4": 3.7937767624855043, "ce_loss_9": 3.4135938048362733, "epoch": 0.406, "grad_norm": 1088.0, "kl_loss_13": 159.5461097717285, "kl_loss_2": 2207.3986633300783, "kl_loss_4": 1353.5649353027343, "kl_loss_9": 515.222622680664, "learning_rate": 0.0006545084971874737, "loss": 1085.0493, "step": 4060 }, { "ce_loss_13": 3.225047242641449, "ce_loss_17": 3.1550561666488646, "ce_loss_2": 4.267550551891327, "ce_loss_4": 3.801293730735779, "ce_loss_9": 3.391110193729401, "epoch": 0.407, "grad_norm": 1480.0, "kl_loss_13": 163.24023666381837, "kl_loss_2": 2356.808319091797, "kl_loss_4": 1430.5926574707032, "kl_loss_9": 550.0119644165039, "learning_rate": 0.0006529987156268526, "loss": 1096.7617, "step": 4070 }, { "ce_loss_13": 3.1378898501396177, "ce_loss_17": 3.0731621980667114, "ce_loss_2": 4.183925473690033, "ce_loss_4": 3.710806930065155, "ce_loss_9": 3.3054186344146728, "epoch": 0.408, "grad_norm": 1020.0, "kl_loss_13": 153.0199752807617, "kl_loss_2": 2313.3973754882813, "kl_loss_4": 1408.0865234375, "kl_loss_9": 534.8155807495117, "learning_rate": 0.0006514873933708637, "loss": 1118.4869, "step": 4080 }, { "ce_loss_13": 3.249518036842346, "ce_loss_17": 3.1910915851593016, "ce_loss_2": 4.262415194511414, "ce_loss_4": 3.807234191894531, "ce_loss_9": 3.4136009097099302, "epoch": 0.409, "grad_norm": 1200.0, "kl_loss_13": 146.33588523864745, "kl_loss_2": 2269.849676513672, "kl_loss_4": 1385.7192687988281, "kl_loss_9": 526.6253814697266, "learning_rate": 0.0006499745456385053, "loss": 1081.3832, "step": 4090 }, { "ce_loss_13": 3.218850302696228, "ce_loss_17": 3.1569252967834474, "ce_loss_2": 4.235727477073669, "ce_loss_4": 3.778496766090393, "ce_loss_9": 3.385510790348053, "epoch": 0.41, "grad_norm": 1680.0, "kl_loss_13": 145.96675872802734, "kl_loss_2": 2267.987579345703, "kl_loss_4": 1386.7368225097657, "kl_loss_9": 529.7655319213867, "learning_rate": 0.0006484601876641375, "loss": 1095.2829, "step": 4100 }, { "ce_loss_13": 3.207030248641968, "ce_loss_17": 3.1490207076072694, "ce_loss_2": 4.1859030485153195, "ce_loss_4": 3.749280405044556, "ce_loss_9": 3.3660176157951356, "epoch": 0.411, "grad_norm": 1472.0, "kl_loss_13": 141.7290786743164, "kl_loss_2": 2205.409045410156, "kl_loss_4": 1348.84775390625, "kl_loss_9": 516.6953704833984, "learning_rate": 0.000646944334697328, "loss": 1064.2965, "step": 4110 }, { "ce_loss_13": 3.3107524275779725, "ce_loss_17": 3.2528943061828612, "ce_loss_2": 4.274679255485535, "ce_loss_4": 3.8449880957603453, "ce_loss_9": 3.4716585755348204, "epoch": 0.412, "grad_norm": 1012.0, "kl_loss_13": 141.45433883666993, "kl_loss_2": 2166.1958557128905, "kl_loss_4": 1327.213555908203, "kl_loss_9": 520.3102600097657, "learning_rate": 0.0006454270020026995, "loss": 1046.3652, "step": 4120 }, { "ce_loss_13": 3.2846481561660767, "ce_loss_17": 3.228834879398346, "ce_loss_2": 4.241029262542725, "ce_loss_4": 3.8188376665115356, "ce_loss_9": 3.439929461479187, "epoch": 0.413, "grad_norm": 1032.0, "kl_loss_13": 136.84114837646484, "kl_loss_2": 2150.9038513183596, "kl_loss_4": 1325.6499145507812, "kl_loss_9": 509.27232666015624, "learning_rate": 0.0006439082048597755, "loss": 1040.5602, "step": 4130 }, { "ce_loss_13": 3.269050347805023, "ce_loss_17": 3.2135193824768065, "ce_loss_2": 4.278622674942016, "ce_loss_4": 3.8270646810531614, "ce_loss_9": 3.4363696575164795, "epoch": 0.414, "grad_norm": 1120.0, "kl_loss_13": 140.78020820617675, "kl_loss_2": 2251.0554260253907, "kl_loss_4": 1378.211199951172, "kl_loss_9": 527.0372833251953, "learning_rate": 0.0006423879585628261, "loss": 1080.9277, "step": 4140 }, { "ce_loss_13": 3.228526401519775, "ce_loss_17": 3.1682226181030275, "ce_loss_2": 4.262276649475098, "ce_loss_4": 3.802557277679443, "ce_loss_9": 3.3995023012161254, "epoch": 0.415, "grad_norm": 1224.0, "kl_loss_13": 145.04784660339357, "kl_loss_2": 2306.0456420898436, "kl_loss_4": 1408.7352722167968, "kl_loss_9": 536.5979507446289, "learning_rate": 0.0006408662784207149, "loss": 1099.3712, "step": 4150 }, { "ce_loss_13": 3.1986807227134704, "ce_loss_17": 3.1410440802574158, "ce_loss_2": 4.202910780906677, "ce_loss_4": 3.7563910603523256, "ce_loss_9": 3.361206841468811, "epoch": 0.416, "grad_norm": 1256.0, "kl_loss_13": 139.71601524353028, "kl_loss_2": 2257.1313049316404, "kl_loss_4": 1386.9466857910156, "kl_loss_9": 523.0001327514649, "learning_rate": 0.0006393431797567439, "loss": 1079.359, "step": 4160 }, { "ce_loss_13": 3.270991933345795, "ce_loss_17": 3.2179370760917663, "ce_loss_2": 4.227991509437561, "ce_loss_4": 3.7969707250595093, "ce_loss_9": 3.4277084946632383, "epoch": 0.417, "grad_norm": 872.0, "kl_loss_13": 139.8240867614746, "kl_loss_2": 2170.4001037597654, "kl_loss_4": 1321.7723754882813, "kl_loss_9": 508.6529541015625, "learning_rate": 0.0006378186779084996, "loss": 1030.8232, "step": 4170 }, { "ce_loss_13": 3.111368203163147, "ce_loss_17": 3.0538955330848694, "ce_loss_2": 4.152943015098572, "ce_loss_4": 3.6956923723220827, "ce_loss_9": 3.286008620262146, "epoch": 0.418, "grad_norm": 1168.0, "kl_loss_13": 142.83421096801757, "kl_loss_2": 2283.6093139648438, "kl_loss_4": 1403.3961547851563, "kl_loss_9": 537.1661239624024, "learning_rate": 0.0006362927882276989, "loss": 1096.6818, "step": 4180 }, { "ce_loss_13": 3.294391560554504, "ce_loss_17": 3.236796188354492, "ce_loss_2": 4.274410700798034, "ce_loss_4": 3.833553731441498, "ce_loss_9": 3.4546162605285646, "epoch": 0.419, "grad_norm": 1040.0, "kl_loss_13": 139.23397369384764, "kl_loss_2": 2200.6496337890626, "kl_loss_4": 1333.7858825683593, "kl_loss_9": 518.0154052734375, "learning_rate": 0.000634765526080034, "loss": 1041.789, "step": 4190 }, { "ce_loss_13": 3.3019187569618227, "ce_loss_17": 3.245002019405365, "ce_loss_2": 4.283450675010681, "ce_loss_4": 3.8495335936546327, "ce_loss_9": 3.463364267349243, "epoch": 0.42, "grad_norm": 1024.0, "kl_loss_13": 143.7896286010742, "kl_loss_2": 2209.079345703125, "kl_loss_4": 1355.8980224609375, "kl_loss_9": 529.1122772216797, "learning_rate": 0.0006332369068450174, "loss": 1056.0494, "step": 4200 }, { "ce_loss_13": 3.2413105368614197, "ce_loss_17": 3.1825143814086916, "ce_loss_2": 4.242955183982849, "ce_loss_4": 3.789373552799225, "ce_loss_9": 3.4039177179336546, "epoch": 0.421, "grad_norm": 972.0, "kl_loss_13": 141.43435554504396, "kl_loss_2": 2235.3357421875, "kl_loss_4": 1363.6363464355468, "kl_loss_9": 527.7023330688477, "learning_rate": 0.0006317069459158283, "loss": 1064.7709, "step": 4210 }, { "ce_loss_13": 3.3419814109802246, "ce_loss_17": 3.2849146246910097, "ce_loss_2": 4.305620932579041, "ce_loss_4": 3.8672492623329164, "ce_loss_9": 3.4940837264060973, "epoch": 0.422, "grad_norm": 996.0, "kl_loss_13": 141.1963623046875, "kl_loss_2": 2191.046350097656, "kl_loss_4": 1328.635321044922, "kl_loss_9": 516.1325454711914, "learning_rate": 0.0006301756586991561, "loss": 1055.1852, "step": 4220 }, { "ce_loss_13": 3.1269748330116274, "ce_loss_17": 3.0681017994880677, "ce_loss_2": 4.162196755409241, "ce_loss_4": 3.703963351249695, "ce_loss_9": 3.2904953718185426, "epoch": 0.423, "grad_norm": 1480.0, "kl_loss_13": 140.5515007019043, "kl_loss_2": 2316.8363830566404, "kl_loss_4": 1418.3695007324218, "kl_loss_9": 530.2604705810547, "learning_rate": 0.0006286430606150459, "loss": 1088.2359, "step": 4230 }, { "ce_loss_13": 3.325238800048828, "ce_loss_17": 3.267061173915863, "ce_loss_2": 4.305628073215485, "ce_loss_4": 3.8682989358901976, "ce_loss_9": 3.490296506881714, "epoch": 0.424, "grad_norm": 896.0, "kl_loss_13": 141.34505310058594, "kl_loss_2": 2207.500384521484, "kl_loss_4": 1351.1874450683595, "kl_loss_9": 525.6549011230469, "learning_rate": 0.0006271091670967436, "loss": 1055.2621, "step": 4240 }, { "ce_loss_13": 3.2424882888793944, "ce_loss_17": 3.178206443786621, "ce_loss_2": 4.283252918720246, "ce_loss_4": 3.820539355278015, "ce_loss_9": 3.4150837779045107, "epoch": 0.425, "grad_norm": 1000.0, "kl_loss_13": 147.5232925415039, "kl_loss_2": 2348.8510620117186, "kl_loss_4": 1439.9562377929688, "kl_loss_9": 554.0350677490235, "learning_rate": 0.0006255739935905395, "loss": 1093.5146, "step": 4250 }, { "ce_loss_13": 3.277960979938507, "ce_loss_17": 3.2215050578117372, "ce_loss_2": 4.261611819267273, "ce_loss_4": 3.8155045986175535, "ce_loss_9": 3.438245403766632, "epoch": 0.426, "grad_norm": 1040.0, "kl_loss_13": 141.46482124328614, "kl_loss_2": 2198.6148376464844, "kl_loss_4": 1343.028094482422, "kl_loss_9": 518.2984771728516, "learning_rate": 0.0006240375555556145, "loss": 1088.3504, "step": 4260 }, { "ce_loss_13": 3.279405117034912, "ce_loss_17": 3.2207542419433595, "ce_loss_2": 4.3215786695480345, "ce_loss_4": 3.8599573016166686, "ce_loss_9": 3.449298095703125, "epoch": 0.427, "grad_norm": 976.0, "kl_loss_13": 143.0325469970703, "kl_loss_2": 2297.389367675781, "kl_loss_4": 1405.2832885742187, "kl_loss_9": 536.6649734497071, "learning_rate": 0.000622499868463882, "loss": 1084.6301, "step": 4270 }, { "ce_loss_13": 3.2469609141349793, "ce_loss_17": 3.193158733844757, "ce_loss_2": 4.208359932899475, "ce_loss_4": 3.772513175010681, "ce_loss_9": 3.403375577926636, "epoch": 0.428, "grad_norm": 1416.0, "kl_loss_13": 139.11674118041992, "kl_loss_2": 2174.2932189941407, "kl_loss_4": 1317.2153015136719, "kl_loss_9": 509.6071441650391, "learning_rate": 0.0006209609477998338, "loss": 1051.5127, "step": 4280 }, { "ce_loss_13": 3.3088216185569763, "ce_loss_17": 3.2514103055000305, "ce_loss_2": 4.288022923469543, "ce_loss_4": 3.861828029155731, "ce_loss_9": 3.4687854290008544, "epoch": 0.429, "grad_norm": 1504.0, "kl_loss_13": 141.33465957641602, "kl_loss_2": 2205.5402282714845, "kl_loss_4": 1360.5914794921875, "kl_loss_9": 520.2003005981445, "learning_rate": 0.0006194208090603844, "loss": 1073.3851, "step": 4290 }, { "ce_loss_13": 3.2258095145225525, "ce_loss_17": 3.168082821369171, "ce_loss_2": 4.208843743801117, "ce_loss_4": 3.7678122401237486, "ce_loss_9": 3.3838152170181273, "epoch": 0.43, "grad_norm": 972.0, "kl_loss_13": 136.1593978881836, "kl_loss_2": 2191.6624084472655, "kl_loss_4": 1336.3553405761718, "kl_loss_9": 507.94545288085936, "learning_rate": 0.0006178794677547138, "loss": 1039.1131, "step": 4300 }, { "ce_loss_13": 3.2537111401557923, "ce_loss_17": 3.1996105670928956, "ce_loss_2": 4.262598633766174, "ce_loss_4": 3.810460865497589, "ce_loss_9": 3.4206053972244264, "epoch": 0.431, "grad_norm": 936.0, "kl_loss_13": 141.85072860717773, "kl_loss_2": 2256.732586669922, "kl_loss_4": 1380.2796875, "kl_loss_9": 530.5854919433593, "learning_rate": 0.0006163369394041111, "loss": 1067.6293, "step": 4310 }, { "ce_loss_13": 3.1882086277008055, "ce_loss_17": 3.1299739837646485, "ce_loss_2": 4.211286115646362, "ce_loss_4": 3.756604528427124, "ce_loss_9": 3.3521428227424623, "epoch": 0.432, "grad_norm": 1184.0, "kl_loss_13": 139.46152381896974, "kl_loss_2": 2272.4755859375, "kl_loss_4": 1394.2722961425782, "kl_loss_9": 520.6512222290039, "learning_rate": 0.0006147932395418205, "loss": 1097.5045, "step": 4320 }, { "ce_loss_13": 3.2259851932525634, "ce_loss_17": 3.1691091775894167, "ce_loss_2": 4.214279782772064, "ce_loss_4": 3.7716413497924806, "ce_loss_9": 3.3877723693847654, "epoch": 0.433, "grad_norm": 1072.0, "kl_loss_13": 140.1126495361328, "kl_loss_2": 2221.072473144531, "kl_loss_4": 1350.1669311523438, "kl_loss_9": 520.3444381713867, "learning_rate": 0.0006132483837128823, "loss": 1055.2018, "step": 4330 }, { "ce_loss_13": 3.204589641094208, "ce_loss_17": 3.1478713512420655, "ce_loss_2": 4.226406228542328, "ce_loss_4": 3.7643416047096254, "ce_loss_9": 3.3647199869155884, "epoch": 0.434, "grad_norm": 1120.0, "kl_loss_13": 140.3993827819824, "kl_loss_2": 2283.126818847656, "kl_loss_4": 1373.7911865234375, "kl_loss_9": 521.1708587646484, "learning_rate": 0.0006117023874739772, "loss": 1075.7758, "step": 4340 }, { "ce_loss_13": 3.201284170150757, "ce_loss_17": 3.1425856590270995, "ce_loss_2": 4.2224333643913265, "ce_loss_4": 3.7598430275917054, "ce_loss_9": 3.367708420753479, "epoch": 0.435, "grad_norm": 1136.0, "kl_loss_13": 140.36151657104492, "kl_loss_2": 2274.427783203125, "kl_loss_4": 1379.3215576171874, "kl_loss_9": 526.1111709594727, "learning_rate": 0.0006101552663932703, "loss": 1087.8369, "step": 4350 }, { "ce_loss_13": 3.232711672782898, "ce_loss_17": 3.1720163822174072, "ce_loss_2": 4.224399006366729, "ce_loss_4": 3.783992850780487, "ce_loss_9": 3.3939179182052612, "epoch": 0.436, "grad_norm": 868.0, "kl_loss_13": 142.2940731048584, "kl_loss_2": 2233.6643005371093, "kl_loss_4": 1366.6875427246093, "kl_loss_9": 520.8816528320312, "learning_rate": 0.0006086070360502539, "loss": 1068.5512, "step": 4360 }, { "ce_loss_13": 3.2368351221084595, "ce_loss_17": 3.1797510147094727, "ce_loss_2": 4.231801378726959, "ce_loss_4": 3.785017156600952, "ce_loss_9": 3.3957268357276917, "epoch": 0.437, "grad_norm": 1104.0, "kl_loss_13": 137.63512687683107, "kl_loss_2": 2237.6950256347654, "kl_loss_4": 1366.993829345703, "kl_loss_9": 513.3646606445312, "learning_rate": 0.0006070577120355903, "loss": 1066.3944, "step": 4370 }, { "ce_loss_13": 3.237009358406067, "ce_loss_17": 3.178954708576202, "ce_loss_2": 4.216632223129272, "ce_loss_4": 3.796343410015106, "ce_loss_9": 3.403877317905426, "epoch": 0.438, "grad_norm": 1328.0, "kl_loss_13": 136.9477794647217, "kl_loss_2": 2169.9096252441404, "kl_loss_4": 1347.52724609375, "kl_loss_9": 512.1582641601562, "learning_rate": 0.0006055073099509549, "loss": 1057.4924, "step": 4380 }, { "ce_loss_13": 3.2915359139442444, "ce_loss_17": 3.2354613065719606, "ce_loss_2": 4.267470490932465, "ce_loss_4": 3.821575427055359, "ce_loss_9": 3.450662899017334, "epoch": 0.439, "grad_norm": 1032.0, "kl_loss_13": 139.33467025756835, "kl_loss_2": 2202.695416259766, "kl_loss_4": 1338.1229736328125, "kl_loss_9": 513.8450103759766, "learning_rate": 0.0006039558454088796, "loss": 1069.9477, "step": 4390 }, { "ce_loss_13": 3.2683029651641844, "ce_loss_17": 3.2100586175918577, "ce_loss_2": 4.266280889511108, "ce_loss_4": 3.819291090965271, "ce_loss_9": 3.4346006989479063, "epoch": 0.44, "grad_norm": 1240.0, "kl_loss_13": 139.65477600097657, "kl_loss_2": 2235.1877380371093, "kl_loss_4": 1362.3440307617188, "kl_loss_9": 525.5908569335937, "learning_rate": 0.0006024033340325954, "loss": 1048.7053, "step": 4400 }, { "ce_loss_13": 3.329455244541168, "ce_loss_17": 3.277108299732208, "ce_loss_2": 4.28489305973053, "ce_loss_4": 3.8575692176818848, "ce_loss_9": 3.4874669194221495, "epoch": 0.441, "grad_norm": 1004.0, "kl_loss_13": 134.46086540222169, "kl_loss_2": 2136.056182861328, "kl_loss_4": 1304.4167846679688, "kl_loss_9": 501.4361206054688, "learning_rate": 0.0006008497914558743, "loss": 1037.7922, "step": 4410 }, { "ce_loss_13": 3.2765673398971558, "ce_loss_17": 3.219102692604065, "ce_loss_2": 4.282702565193176, "ce_loss_4": 3.8299104809761046, "ce_loss_9": 3.4462572932243347, "epoch": 0.442, "grad_norm": 836.0, "kl_loss_13": 144.82081146240233, "kl_loss_2": 2253.6763671875, "kl_loss_4": 1370.4649169921875, "kl_loss_9": 532.5985412597656, "learning_rate": 0.0005992952333228728, "loss": 1073.2026, "step": 4420 }, { "ce_loss_13": 3.2163989663124086, "ce_loss_17": 3.1613113284111023, "ce_loss_2": 4.222684049606324, "ce_loss_4": 3.7755476593971253, "ce_loss_9": 3.380954647064209, "epoch": 0.443, "grad_norm": 1128.0, "kl_loss_13": 135.7763469696045, "kl_loss_2": 2257.3896240234376, "kl_loss_4": 1371.2310974121094, "kl_loss_9": 514.2141525268555, "learning_rate": 0.0005977396752879741, "loss": 1066.2371, "step": 4430 }, { "ce_loss_13": 3.1414122939109803, "ce_loss_17": 3.0850576639175413, "ce_loss_2": 4.151449525356293, "ce_loss_4": 3.6996174097061156, "ce_loss_9": 3.3102399349212646, "epoch": 0.444, "grad_norm": 1024.0, "kl_loss_13": 138.2087547302246, "kl_loss_2": 2268.081298828125, "kl_loss_4": 1378.0351013183595, "kl_loss_9": 524.4750213623047, "learning_rate": 0.0005961831330156305, "loss": 1062.3824, "step": 4440 }, { "ce_loss_13": 3.2834069967269897, "ce_loss_17": 3.227842891216278, "ce_loss_2": 4.295534062385559, "ce_loss_4": 3.8469720482826233, "ce_loss_9": 3.448914134502411, "epoch": 0.445, "grad_norm": 1012.0, "kl_loss_13": 138.78630905151368, "kl_loss_2": 2269.0575439453123, "kl_loss_4": 1381.2828369140625, "kl_loss_9": 521.6292785644531, "learning_rate": 0.0005946256221802051, "loss": 1089.5113, "step": 4450 }, { "ce_loss_13": 3.2611006617546083, "ce_loss_17": 3.207739520072937, "ce_loss_2": 4.210397076606751, "ce_loss_4": 3.774532747268677, "ce_loss_9": 3.412037396430969, "epoch": 0.446, "grad_norm": 1424.0, "kl_loss_13": 135.13318481445313, "kl_loss_2": 2150.4615234375, "kl_loss_4": 1301.3334838867188, "kl_loss_9": 499.83845672607424, "learning_rate": 0.0005930671584658151, "loss": 1078.8482, "step": 4460 }, { "ce_loss_13": 3.269974207878113, "ce_loss_17": 3.212189781665802, "ce_loss_2": 4.259310460090637, "ce_loss_4": 3.8123076796531676, "ce_loss_9": 3.4320770859718324, "epoch": 0.447, "grad_norm": 1064.0, "kl_loss_13": 137.83714485168457, "kl_loss_2": 2231.3010681152346, "kl_loss_4": 1361.19599609375, "kl_loss_9": 515.2275115966797, "learning_rate": 0.0005915077575661722, "loss": 1075.276, "step": 4470 }, { "ce_loss_13": 3.2781373858451843, "ce_loss_17": 3.2194784045219422, "ce_loss_2": 4.281469869613647, "ce_loss_4": 3.8381133675575256, "ce_loss_9": 3.441315174102783, "epoch": 0.448, "grad_norm": 1016.0, "kl_loss_13": 143.31453552246094, "kl_loss_2": 2269.3169921875, "kl_loss_4": 1390.1720458984375, "kl_loss_9": 528.45634765625, "learning_rate": 0.000589947435184427, "loss": 1064.3135, "step": 4480 }, { "ce_loss_13": 3.345615029335022, "ce_loss_17": 3.289338934421539, "ce_loss_2": 4.280741190910339, "ce_loss_4": 3.86712327003479, "ce_loss_9": 3.498969566822052, "epoch": 0.449, "grad_norm": 1368.0, "kl_loss_13": 139.1669807434082, "kl_loss_2": 2140.4221313476564, "kl_loss_4": 1325.2461303710938, "kl_loss_9": 510.04296875, "learning_rate": 0.0005883862070330078, "loss": 1050.408, "step": 4490 }, { "ce_loss_13": 3.2837196469306944, "ce_loss_17": 3.2257981181144713, "ce_loss_2": 4.2700411677360535, "ce_loss_4": 3.831962525844574, "ce_loss_9": 3.44888060092926, "epoch": 0.45, "grad_norm": 1080.0, "kl_loss_13": 139.46359939575194, "kl_loss_2": 2235.403009033203, "kl_loss_4": 1366.5756103515625, "kl_loss_9": 523.3026916503907, "learning_rate": 0.0005868240888334653, "loss": 1056.9212, "step": 4500 }, { "ce_loss_13": 3.1653249502182006, "ce_loss_17": 3.108543610572815, "ce_loss_2": 4.195339334011078, "ce_loss_4": 3.730360007286072, "ce_loss_9": 3.3328774333000184, "epoch": 0.451, "grad_norm": 1024.0, "kl_loss_13": 140.5768093109131, "kl_loss_2": 2293.3398620605467, "kl_loss_4": 1396.7931884765626, "kl_loss_9": 528.1096237182617, "learning_rate": 0.0005852610963163119, "loss": 1078.4453, "step": 4510 }, { "ce_loss_13": 3.192566156387329, "ce_loss_17": 3.137960159778595, "ce_loss_2": 4.167438316345215, "ce_loss_4": 3.726328408718109, "ce_loss_9": 3.3511382579803466, "epoch": 0.452, "grad_norm": 1096.0, "kl_loss_13": 136.5985507965088, "kl_loss_2": 2205.8749145507813, "kl_loss_4": 1347.3533447265625, "kl_loss_9": 516.0711990356446, "learning_rate": 0.0005836972452208654, "loss": 1042.992, "step": 4520 }, { "ce_loss_13": 3.1964443027973175, "ce_loss_17": 3.1395262479782104, "ce_loss_2": 4.202695953845978, "ce_loss_4": 3.747763514518738, "ce_loss_9": 3.3560492157936097, "epoch": 0.453, "grad_norm": 1160.0, "kl_loss_13": 138.36513175964356, "kl_loss_2": 2245.2885803222657, "kl_loss_4": 1360.3642456054688, "kl_loss_9": 517.729866027832, "learning_rate": 0.0005821325512950885, "loss": 1065.4764, "step": 4530 }, { "ce_loss_13": 3.2149079561233522, "ce_loss_17": 3.1613360166549684, "ce_loss_2": 4.201947224140167, "ce_loss_4": 3.763677978515625, "ce_loss_9": 3.3721930980682373, "epoch": 0.454, "grad_norm": 1424.0, "kl_loss_13": 133.93580856323243, "kl_loss_2": 2177.999139404297, "kl_loss_4": 1323.984796142578, "kl_loss_9": 501.2599136352539, "learning_rate": 0.0005805670302954321, "loss": 1051.4672, "step": 4540 }, { "ce_loss_13": 3.2255538940429687, "ce_loss_17": 3.1723752617836, "ce_loss_2": 4.203854882717133, "ce_loss_4": 3.763327145576477, "ce_loss_9": 3.3855578303337097, "epoch": 0.455, "grad_norm": 1096.0, "kl_loss_13": 133.75055427551268, "kl_loss_2": 2202.7123046875, "kl_loss_4": 1338.9003356933595, "kl_loss_9": 510.1236938476562, "learning_rate": 0.000579000697986675, "loss": 1040.822, "step": 4550 }, { "ce_loss_13": 3.185821759700775, "ce_loss_17": 3.125026059150696, "ce_loss_2": 4.21089608669281, "ce_loss_4": 3.764529526233673, "ce_loss_9": 3.355712854862213, "epoch": 0.456, "grad_norm": 1136.0, "kl_loss_13": 141.42764282226562, "kl_loss_2": 2284.2400146484374, "kl_loss_4": 1409.0489013671875, "kl_loss_9": 531.6865676879883, "learning_rate": 0.0005774335701417662, "loss": 1067.8127, "step": 4560 }, { "ce_loss_13": 3.175957715511322, "ce_loss_17": 3.121280539035797, "ce_loss_2": 4.2087303638458256, "ce_loss_4": 3.7471463084220886, "ce_loss_9": 3.34022171497345, "epoch": 0.457, "grad_norm": 1248.0, "kl_loss_13": 135.24294357299806, "kl_loss_2": 2308.7422790527344, "kl_loss_4": 1391.1081115722657, "kl_loss_9": 518.5217803955078, "learning_rate": 0.0005758656625416658, "loss": 1074.0018, "step": 4570 }, { "ce_loss_13": 3.2291756749153135, "ce_loss_17": 3.170781981945038, "ce_loss_2": 4.225070238113403, "ce_loss_4": 3.7816842675209044, "ce_loss_9": 3.387883198261261, "epoch": 0.458, "grad_norm": 1120.0, "kl_loss_13": 139.53331031799317, "kl_loss_2": 2212.6359008789063, "kl_loss_4": 1354.746044921875, "kl_loss_9": 515.7745941162109, "learning_rate": 0.0005742969909751859, "loss": 1044.7492, "step": 4580 }, { "ce_loss_13": 3.2437532901763917, "ce_loss_17": 3.1872865557670593, "ce_loss_2": 4.245578122138977, "ce_loss_4": 3.7921226024627686, "ce_loss_9": 3.4025209546089172, "epoch": 0.459, "grad_norm": 888.0, "kl_loss_13": 137.5473960876465, "kl_loss_2": 2255.223736572266, "kl_loss_4": 1354.0939086914063, "kl_loss_9": 516.1319152832032, "learning_rate": 0.0005727275712388318, "loss": 1071.4119, "step": 4590 }, { "ce_loss_13": 3.2640806794166566, "ce_loss_17": 3.2108885407447816, "ce_loss_2": 4.22279646396637, "ce_loss_4": 3.790911781787872, "ce_loss_9": 3.422553527355194, "epoch": 0.46, "grad_norm": 1136.0, "kl_loss_13": 134.95833168029785, "kl_loss_2": 2164.09765625, "kl_loss_4": 1321.9383483886718, "kl_loss_9": 505.1380386352539, "learning_rate": 0.0005711574191366427, "loss": 1044.241, "step": 4600 }, { "ce_loss_13": 3.2194421887397766, "ce_loss_17": 3.1651885986328123, "ce_loss_2": 4.196126163005829, "ce_loss_4": 3.758162033557892, "ce_loss_9": 3.378272223472595, "epoch": 0.461, "grad_norm": 1040.0, "kl_loss_13": 136.2647560119629, "kl_loss_2": 2202.1795349121094, "kl_loss_4": 1340.1302612304687, "kl_loss_9": 508.3653259277344, "learning_rate": 0.0005695865504800327, "loss": 1046.0359, "step": 4610 }, { "ce_loss_13": 3.1593300700187683, "ce_loss_17": 3.098788321018219, "ce_loss_2": 4.241851806640625, "ce_loss_4": 3.762340772151947, "ce_loss_9": 3.335426914691925, "epoch": 0.462, "grad_norm": 1004.0, "kl_loss_13": 142.61250762939454, "kl_loss_2": 2396.3653930664063, "kl_loss_4": 1456.4510192871094, "kl_loss_9": 542.7531051635742, "learning_rate": 0.0005680149810876322, "loss": 1091.3473, "step": 4620 }, { "ce_loss_13": 3.2129239439964294, "ce_loss_17": 3.1580535173416138, "ce_loss_2": 4.218144428730011, "ce_loss_4": 3.7490538954734802, "ce_loss_9": 3.3671084880828857, "epoch": 0.463, "grad_norm": 1004.0, "kl_loss_13": 136.29999275207518, "kl_loss_2": 2237.864385986328, "kl_loss_4": 1340.7416137695313, "kl_loss_9": 510.8120834350586, "learning_rate": 0.0005664427267851271, "loss": 1050.7108, "step": 4630 }, { "ce_loss_13": 3.13439120054245, "ce_loss_17": 3.078427088260651, "ce_loss_2": 4.1404663801193236, "ce_loss_4": 3.6858803153038027, "ce_loss_9": 3.298119854927063, "epoch": 0.464, "grad_norm": 1448.0, "kl_loss_13": 135.20904617309571, "kl_loss_2": 2233.4781188964844, "kl_loss_4": 1343.5119384765626, "kl_loss_9": 507.21962890625, "learning_rate": 0.0005648698034051009, "loss": 1049.3102, "step": 4640 }, { "ce_loss_13": 3.241450309753418, "ce_loss_17": 3.1845844864845274, "ce_loss_2": 4.265048897266388, "ce_loss_4": 3.8023730635643007, "ce_loss_9": 3.4014768838882445, "epoch": 0.465, "grad_norm": 908.0, "kl_loss_13": 136.55409774780273, "kl_loss_2": 2276.3150268554687, "kl_loss_4": 1378.3646240234375, "kl_loss_9": 509.57129821777346, "learning_rate": 0.0005632962267868747, "loss": 1051.4721, "step": 4650 }, { "ce_loss_13": 3.1855497121810914, "ce_loss_17": 3.1305655002593995, "ce_loss_2": 4.1664527416229244, "ce_loss_4": 3.725935137271881, "ce_loss_9": 3.342176878452301, "epoch": 0.466, "grad_norm": 1480.0, "kl_loss_13": 133.37670249938964, "kl_loss_2": 2198.284558105469, "kl_loss_4": 1340.073095703125, "kl_loss_9": 499.1119094848633, "learning_rate": 0.0005617220127763474, "loss": 1055.049, "step": 4660 }, { "ce_loss_13": 3.262643837928772, "ce_loss_17": 3.206638824939728, "ce_loss_2": 4.23648864030838, "ce_loss_4": 3.798210823535919, "ce_loss_9": 3.417121922969818, "epoch": 0.467, "grad_norm": 1008.0, "kl_loss_13": 136.98114585876465, "kl_loss_2": 2185.211322021484, "kl_loss_4": 1337.24296875, "kl_loss_9": 505.3307342529297, "learning_rate": 0.0005601471772258368, "loss": 1055.2172, "step": 4670 }, { "ce_loss_13": 3.2469422936439516, "ce_loss_17": 3.1905743718147277, "ce_loss_2": 4.216200220584869, "ce_loss_4": 3.7766095280647276, "ce_loss_9": 3.4060312867164613, "epoch": 0.468, "grad_norm": 1040.0, "kl_loss_13": 137.14541969299316, "kl_loss_2": 2158.09638671875, "kl_loss_4": 1306.6739990234375, "kl_loss_9": 505.6152114868164, "learning_rate": 0.0005585717359939192, "loss": 1056.5875, "step": 4680 }, { "ce_loss_13": 3.1588857412338256, "ce_loss_17": 3.1055333375930787, "ce_loss_2": 4.129337120056152, "ce_loss_4": 3.694641447067261, "ce_loss_9": 3.31375173330307, "epoch": 0.469, "grad_norm": 956.0, "kl_loss_13": 134.92950706481935, "kl_loss_2": 2171.655059814453, "kl_loss_4": 1329.860333251953, "kl_loss_9": 502.35704345703124, "learning_rate": 0.0005569957049452703, "loss": 1061.9623, "step": 4690 }, { "ce_loss_13": 3.2140878319740294, "ce_loss_17": 3.157088017463684, "ce_loss_2": 4.219216895103455, "ce_loss_4": 3.7708500981330872, "ce_loss_9": 3.3719287276268006, "epoch": 0.47, "grad_norm": 884.0, "kl_loss_13": 137.88646545410157, "kl_loss_2": 2251.818975830078, "kl_loss_4": 1367.3576110839845, "kl_loss_9": 516.6486663818359, "learning_rate": 0.0005554190999505056, "loss": 1065.7701, "step": 4700 }, { "ce_loss_13": 3.340174913406372, "ce_loss_17": 3.281543505191803, "ce_loss_2": 4.3227524042129515, "ce_loss_4": 3.8859183549880982, "ce_loss_9": 3.5031768560409544, "epoch": 0.471, "grad_norm": 892.0, "kl_loss_13": 141.2210479736328, "kl_loss_2": 2227.123455810547, "kl_loss_4": 1369.4485778808594, "kl_loss_9": 524.8275726318359, "learning_rate": 0.0005538419368860196, "loss": 1029.9886, "step": 4710 }, { "ce_loss_13": 3.258028817176819, "ce_loss_17": 3.201567816734314, "ce_loss_2": 4.2379420638084415, "ce_loss_4": 3.7908987402915955, "ce_loss_9": 3.41657475233078, "epoch": 0.472, "grad_norm": 992.0, "kl_loss_13": 137.13487854003907, "kl_loss_2": 2210.2485778808596, "kl_loss_4": 1334.9109680175782, "kl_loss_9": 513.3073226928711, "learning_rate": 0.0005522642316338268, "loss": 1069.5956, "step": 4720 }, { "ce_loss_13": 3.269257152080536, "ce_loss_17": 3.217504823207855, "ce_loss_2": 4.245605778694153, "ce_loss_4": 3.796275007724762, "ce_loss_9": 3.425595462322235, "epoch": 0.473, "grad_norm": 1120.0, "kl_loss_13": 136.4365249633789, "kl_loss_2": 2192.927117919922, "kl_loss_4": 1323.2529602050781, "kl_loss_9": 510.12690887451174, "learning_rate": 0.0005506860000814017, "loss": 1069.1469, "step": 4730 }, { "ce_loss_13": 3.2911235332489013, "ce_loss_17": 3.2394946217536926, "ce_loss_2": 4.244098162651062, "ce_loss_4": 3.8144774079322814, "ce_loss_9": 3.448509895801544, "epoch": 0.474, "grad_norm": 1304.0, "kl_loss_13": 133.57581672668456, "kl_loss_2": 2146.5623168945312, "kl_loss_4": 1312.6512756347656, "kl_loss_9": 502.97911834716797, "learning_rate": 0.0005491072581215186, "loss": 1048.7453, "step": 4740 }, { "ce_loss_13": 3.2882182240486144, "ce_loss_17": 3.2304022908210754, "ce_loss_2": 4.259259605407715, "ce_loss_4": 3.826170098781586, "ce_loss_9": 3.4494891285896303, "epoch": 0.475, "grad_norm": 1064.0, "kl_loss_13": 140.23167266845704, "kl_loss_2": 2213.348944091797, "kl_loss_4": 1362.2956909179688, "kl_loss_9": 520.352766418457, "learning_rate": 0.0005475280216520913, "loss": 1035.411, "step": 4750 }, { "ce_loss_13": 3.2143434166908262, "ce_loss_17": 3.1581347227096557, "ce_loss_2": 4.173523890972137, "ce_loss_4": 3.74121550321579, "ce_loss_9": 3.3637998700141907, "epoch": 0.476, "grad_norm": 1056.0, "kl_loss_13": 134.35184707641602, "kl_loss_2": 2162.1074645996096, "kl_loss_4": 1317.6097778320313, "kl_loss_9": 500.77030487060546, "learning_rate": 0.0005459483065760138, "loss": 1061.3199, "step": 4760 }, { "ce_loss_13": 3.1506337881088258, "ce_loss_17": 3.0972867369651795, "ce_loss_2": 4.2028480052948, "ce_loss_4": 3.725043773651123, "ce_loss_9": 3.3135896563529967, "epoch": 0.477, "grad_norm": 1320.0, "kl_loss_13": 135.86248397827148, "kl_loss_2": 2328.050341796875, "kl_loss_4": 1411.0231384277345, "kl_loss_9": 513.8723983764648, "learning_rate": 0.0005443681288009991, "loss": 1068.6512, "step": 4770 }, { "ce_loss_13": 3.202068865299225, "ce_loss_17": 3.1486364006996155, "ce_loss_2": 4.194301104545593, "ce_loss_4": 3.747242498397827, "ce_loss_9": 3.3596622586250304, "epoch": 0.478, "grad_norm": 1144.0, "kl_loss_13": 133.54805221557618, "kl_loss_2": 2234.293359375, "kl_loss_4": 1344.2996520996094, "kl_loss_9": 504.06092224121096, "learning_rate": 0.0005427875042394199, "loss": 1059.1951, "step": 4780 }, { "ce_loss_13": 3.2362239003181457, "ce_loss_17": 3.1777065634727477, "ce_loss_2": 4.2125295281410216, "ce_loss_4": 3.785688304901123, "ce_loss_9": 3.3973616480827333, "epoch": 0.479, "grad_norm": 1200.0, "kl_loss_13": 137.9409606933594, "kl_loss_2": 2185.4827392578127, "kl_loss_4": 1351.2970825195312, "kl_loss_9": 511.90448608398435, "learning_rate": 0.0005412064488081482, "loss": 1064.308, "step": 4790 }, { "ce_loss_13": 3.2409753680229185, "ce_loss_17": 3.1855759739875795, "ce_loss_2": 4.208216512203217, "ce_loss_4": 3.7636180996894835, "ce_loss_9": 3.3940456032752992, "epoch": 0.48, "grad_norm": 1456.0, "kl_loss_13": 133.97342491149902, "kl_loss_2": 2173.90556640625, "kl_loss_4": 1315.6861145019532, "kl_loss_9": 499.45046081542966, "learning_rate": 0.0005396249784283942, "loss": 1033.6954, "step": 4800 }, { "ce_loss_13": 3.258735382556915, "ce_loss_17": 3.200114297866821, "ce_loss_2": 4.269623613357544, "ce_loss_4": 3.8198778510093687, "ce_loss_9": 3.4199629426002502, "epoch": 0.481, "grad_norm": 1080.0, "kl_loss_13": 141.58736763000488, "kl_loss_2": 2264.653668212891, "kl_loss_4": 1381.7948181152344, "kl_loss_9": 519.3439498901367, "learning_rate": 0.0005380431090255476, "loss": 1066.9398, "step": 4810 }, { "ce_loss_13": 3.254167151451111, "ce_loss_17": 3.202963137626648, "ce_loss_2": 4.217775845527649, "ce_loss_4": 3.7794894099235536, "ce_loss_9": 3.404033064842224, "epoch": 0.482, "grad_norm": 1192.0, "kl_loss_13": 132.02805519104004, "kl_loss_2": 2175.862255859375, "kl_loss_4": 1314.934423828125, "kl_loss_9": 491.5104095458984, "learning_rate": 0.0005364608565290155, "loss": 1032.9521, "step": 4820 }, { "ce_loss_13": 3.2596340417861938, "ce_loss_17": 3.2022710919380186, "ce_loss_2": 4.2500745296478275, "ce_loss_4": 3.7975626230239867, "ce_loss_9": 3.4185283064842222, "epoch": 0.483, "grad_norm": 2008.0, "kl_loss_13": 136.76991233825683, "kl_loss_2": 2217.622674560547, "kl_loss_4": 1334.8709106445312, "kl_loss_9": 506.6666061401367, "learning_rate": 0.0005348782368720626, "loss": 1048.6578, "step": 4830 }, { "ce_loss_13": 3.1943154454231264, "ce_loss_17": 3.1401220679283144, "ce_loss_2": 4.177812600135804, "ce_loss_4": 3.734412372112274, "ce_loss_9": 3.3543110370635985, "epoch": 0.484, "grad_norm": 936.0, "kl_loss_13": 132.41786003112793, "kl_loss_2": 2171.333807373047, "kl_loss_4": 1313.1377563476562, "kl_loss_9": 498.79088287353517, "learning_rate": 0.000533295265991652, "loss": 1047.1073, "step": 4840 }, { "ce_loss_13": 3.26388875246048, "ce_loss_17": 3.209026575088501, "ce_loss_2": 4.2196999311447145, "ce_loss_4": 3.7861626744270325, "ce_loss_9": 3.419498658180237, "epoch": 0.485, "grad_norm": 1224.0, "kl_loss_13": 133.78036727905274, "kl_loss_2": 2146.0129272460936, "kl_loss_4": 1306.4606384277345, "kl_loss_9": 500.0278350830078, "learning_rate": 0.0005317119598282822, "loss": 1027.6814, "step": 4850 }, { "ce_loss_13": 3.267427957057953, "ce_loss_17": 3.2125453114509583, "ce_loss_2": 4.2463397026062015, "ce_loss_4": 3.807596802711487, "ce_loss_9": 3.4268407940864565, "epoch": 0.486, "grad_norm": 1288.0, "kl_loss_13": 135.5311653137207, "kl_loss_2": 2179.989697265625, "kl_loss_4": 1330.0117248535157, "kl_loss_9": 505.967529296875, "learning_rate": 0.0005301283343258293, "loss": 1038.9207, "step": 4860 }, { "ce_loss_13": 3.32303227186203, "ce_loss_17": 3.269377565383911, "ce_loss_2": 4.275118768215179, "ce_loss_4": 3.850271725654602, "ce_loss_9": 3.477081835269928, "epoch": 0.487, "grad_norm": 1312.0, "kl_loss_13": 136.52353172302247, "kl_loss_2": 2155.4018127441404, "kl_loss_4": 1315.6972595214843, "kl_loss_9": 502.24618377685545, "learning_rate": 0.000528544405431384, "loss": 1025.6479, "step": 4870 }, { "ce_loss_13": 3.205915355682373, "ce_loss_17": 3.1481048822402955, "ce_loss_2": 4.190273451805115, "ce_loss_4": 3.759691536426544, "ce_loss_9": 3.3681720495224, "epoch": 0.488, "grad_norm": 1032.0, "kl_loss_13": 138.0819881439209, "kl_loss_2": 2221.1510986328126, "kl_loss_4": 1371.8503723144531, "kl_loss_9": 518.3593811035156, "learning_rate": 0.000526960189095093, "loss": 1060.1495, "step": 4880 }, { "ce_loss_13": 3.191713261604309, "ce_loss_17": 3.137663996219635, "ce_loss_2": 4.1650094270706175, "ce_loss_4": 3.726768064498901, "ce_loss_9": 3.347383975982666, "epoch": 0.489, "grad_norm": 956.0, "kl_loss_13": 133.13926544189454, "kl_loss_2": 2164.1394470214846, "kl_loss_4": 1313.3862548828124, "kl_loss_9": 492.73089294433595, "learning_rate": 0.0005253757012699972, "loss": 1033.7554, "step": 4890 }, { "ce_loss_13": 3.2697774052619932, "ce_loss_17": 3.2150630354881287, "ce_loss_2": 4.237921822071075, "ce_loss_4": 3.797005522251129, "ce_loss_9": 3.4237668633461, "epoch": 0.49, "grad_norm": 1120.0, "kl_loss_13": 134.52528114318847, "kl_loss_2": 2173.1056945800783, "kl_loss_4": 1318.3933898925782, "kl_loss_9": 501.64600830078126, "learning_rate": 0.0005237909579118712, "loss": 1049.9766, "step": 4900 }, { "ce_loss_13": 3.2318019390106203, "ce_loss_17": 3.1749009132385253, "ce_loss_2": 4.23730765581131, "ce_loss_4": 3.783624064922333, "ce_loss_9": 3.3909229397773744, "epoch": 0.491, "grad_norm": 1120.0, "kl_loss_13": 139.48478813171386, "kl_loss_2": 2248.2317749023437, "kl_loss_4": 1358.451190185547, "kl_loss_9": 515.9147048950196, "learning_rate": 0.0005222059749790631, "loss": 1061.1975, "step": 4910 }, { "ce_loss_13": 3.295530927181244, "ce_loss_17": 3.2410000920295716, "ce_loss_2": 4.22360405921936, "ce_loss_4": 3.8085273265838624, "ce_loss_9": 3.4464677929878236, "epoch": 0.492, "grad_norm": 1024.0, "kl_loss_13": 133.82221221923828, "kl_loss_2": 2110.8667114257814, "kl_loss_4": 1289.4907775878905, "kl_loss_9": 494.4731643676758, "learning_rate": 0.0005206207684323337, "loss": 1008.3338, "step": 4920 }, { "ce_loss_13": 3.276183319091797, "ce_loss_17": 3.222785985469818, "ce_loss_2": 4.250223541259766, "ce_loss_4": 3.8108495473861694, "ce_loss_9": 3.4319649219512938, "epoch": 0.493, "grad_norm": 1352.0, "kl_loss_13": 136.65951271057128, "kl_loss_2": 2195.4397521972655, "kl_loss_4": 1335.5635864257813, "kl_loss_9": 510.61053161621095, "learning_rate": 0.000519035354234695, "loss": 1061.7021, "step": 4930 }, { "ce_loss_13": 3.251898002624512, "ce_loss_17": 3.1955731749534606, "ce_loss_2": 4.224755346775055, "ce_loss_4": 3.799663472175598, "ce_loss_9": 3.4151692986488342, "epoch": 0.494, "grad_norm": 1216.0, "kl_loss_13": 137.9657169342041, "kl_loss_2": 2171.855584716797, "kl_loss_4": 1337.397021484375, "kl_loss_9": 512.0497360229492, "learning_rate": 0.0005174497483512506, "loss": 1032.9092, "step": 4940 }, { "ce_loss_13": 3.2971807956695556, "ce_loss_17": 3.2452670335769653, "ce_loss_2": 4.2498260855674745, "ce_loss_4": 3.818261182308197, "ce_loss_9": 3.4485969662666323, "epoch": 0.495, "grad_norm": 1048.0, "kl_loss_13": 132.890877532959, "kl_loss_2": 2166.161505126953, "kl_loss_4": 1318.9433227539062, "kl_loss_9": 500.15086669921874, "learning_rate": 0.0005158639667490339, "loss": 1053.1043, "step": 4950 }, { "ce_loss_13": 3.204316556453705, "ce_loss_17": 3.1496737718582155, "ce_loss_2": 4.184051644802094, "ce_loss_4": 3.7438693165779116, "ce_loss_9": 3.3629273056983946, "epoch": 0.496, "grad_norm": 976.0, "kl_loss_13": 135.28388442993165, "kl_loss_2": 2201.1526000976564, "kl_loss_4": 1345.3450500488282, "kl_loss_9": 513.6176574707031, "learning_rate": 0.0005142780253968481, "loss": 1046.0458, "step": 4960 }, { "ce_loss_13": 3.1585383653640746, "ce_loss_17": 3.1056560754776, "ce_loss_2": 4.124637794494629, "ce_loss_4": 3.684066724777222, "ce_loss_9": 3.309024250507355, "epoch": 0.497, "grad_norm": 1112.0, "kl_loss_13": 130.60840072631837, "kl_loss_2": 2166.1127014160156, "kl_loss_4": 1304.3673522949218, "kl_loss_9": 487.8077133178711, "learning_rate": 0.0005126919402651053, "loss": 1011.3193, "step": 4970 }, { "ce_loss_13": 3.222071182727814, "ce_loss_17": 3.162739098072052, "ce_loss_2": 4.207986128330231, "ce_loss_4": 3.776746702194214, "ce_loss_9": 3.383715271949768, "epoch": 0.498, "grad_norm": 1096.0, "kl_loss_13": 138.16339569091798, "kl_loss_2": 2195.0331787109376, "kl_loss_4": 1350.5455688476563, "kl_loss_9": 508.6073974609375, "learning_rate": 0.0005111057273256647, "loss": 1049.0661, "step": 4980 }, { "ce_loss_13": 3.3202736139297486, "ce_loss_17": 3.2693683743476867, "ce_loss_2": 4.226944077014923, "ce_loss_4": 3.816166079044342, "ce_loss_9": 3.4622004628181458, "epoch": 0.499, "grad_norm": 844.0, "kl_loss_13": 129.69556732177733, "kl_loss_2": 2055.7344360351562, "kl_loss_4": 1258.79853515625, "kl_loss_9": 480.38240203857424, "learning_rate": 0.0005095194025516733, "loss": 999.9001, "step": 4990 }, { "ce_loss_13": 3.2506856441497805, "ce_loss_17": 3.199086034297943, "ce_loss_2": 4.2019412279129025, "ce_loss_4": 3.76758496761322, "ce_loss_9": 3.397927403450012, "epoch": 0.5, "grad_norm": 940.0, "kl_loss_13": 130.52569503784179, "kl_loss_2": 2140.3324462890623, "kl_loss_4": 1295.9594299316407, "kl_loss_9": 490.5307876586914, "learning_rate": 0.000507932981917404, "loss": 1050.7144, "step": 5000 }, { "ce_loss_13": 3.201303684711456, "ce_loss_17": 3.144782042503357, "ce_loss_2": 4.221145558357239, "ce_loss_4": 3.761389744281769, "ce_loss_9": 3.366125202178955, "epoch": 0.501, "grad_norm": 1128.0, "kl_loss_13": 138.91932907104493, "kl_loss_2": 2284.188153076172, "kl_loss_4": 1382.7747741699218, "kl_loss_9": 521.5186538696289, "learning_rate": 0.0005063464813980949, "loss": 1077.2139, "step": 5010 }, { "ce_loss_13": 3.1854868292808534, "ce_loss_17": 3.1316314339637756, "ce_loss_2": 4.164624738693237, "ce_loss_4": 3.7247269630432127, "ce_loss_9": 3.338021993637085, "epoch": 0.502, "grad_norm": 1184.0, "kl_loss_13": 134.51192474365234, "kl_loss_2": 2209.3236755371095, "kl_loss_4": 1344.9856811523437, "kl_loss_9": 503.8586853027344, "learning_rate": 0.0005047599169697884, "loss": 1040.3154, "step": 5020 }, { "ce_loss_13": 3.1273163676261904, "ce_loss_17": 3.0690667152404787, "ce_loss_2": 4.115918350219727, "ce_loss_4": 3.6645180940628053, "ce_loss_9": 3.2855119585990904, "epoch": 0.503, "grad_norm": 1264.0, "kl_loss_13": 134.75112190246583, "kl_loss_2": 2185.707067871094, "kl_loss_4": 1313.5970458984375, "kl_loss_9": 498.97532653808594, "learning_rate": 0.000503173304609171, "loss": 1016.8127, "step": 5030 }, { "ce_loss_13": 3.24262592792511, "ce_loss_17": 3.187458097934723, "ce_loss_2": 4.214921712875366, "ce_loss_4": 3.783408355712891, "ce_loss_9": 3.3980445146560667, "epoch": 0.504, "grad_norm": 924.0, "kl_loss_13": 133.29533157348632, "kl_loss_2": 2175.716418457031, "kl_loss_4": 1329.2711364746094, "kl_loss_9": 498.6507049560547, "learning_rate": 0.0005015866602934111, "loss": 1023.4406, "step": 5040 }, { "ce_loss_13": 3.21926463842392, "ce_loss_17": 3.1603885173797606, "ce_loss_2": 4.2196164727211, "ce_loss_4": 3.7755380868911743, "ce_loss_9": 3.3812211990356444, "epoch": 0.505, "grad_norm": 996.0, "kl_loss_13": 138.86305465698243, "kl_loss_2": 2226.2142700195313, "kl_loss_4": 1363.1834533691406, "kl_loss_9": 520.0560317993164, "learning_rate": 0.0005, "loss": 1048.7143, "step": 5050 }, { "ce_loss_13": 3.2120168089866636, "ce_loss_17": 3.158287060260773, "ce_loss_2": 4.183288502693176, "ce_loss_4": 3.7456084847450257, "ce_loss_9": 3.366370642185211, "epoch": 0.506, "grad_norm": 1376.0, "kl_loss_13": 136.47029647827148, "kl_loss_2": 2184.931671142578, "kl_loss_4": 1333.580145263672, "kl_loss_9": 508.05563201904295, "learning_rate": 0.0004984133397065889, "loss": 1026.5486, "step": 5060 }, { "ce_loss_13": 3.2099183797836304, "ce_loss_17": 3.1534678101539613, "ce_loss_2": 4.21018488407135, "ce_loss_4": 3.7664723873138426, "ce_loss_9": 3.374504506587982, "epoch": 0.507, "grad_norm": 1320.0, "kl_loss_13": 135.16151390075683, "kl_loss_2": 2208.4457153320313, "kl_loss_4": 1338.3543640136718, "kl_loss_9": 507.63246765136716, "learning_rate": 0.0004968266953908291, "loss": 1028.7352, "step": 5070 }, { "ce_loss_13": 3.253543961048126, "ce_loss_17": 3.2022271871566774, "ce_loss_2": 4.242140090465545, "ce_loss_4": 3.793262243270874, "ce_loss_9": 3.408780574798584, "epoch": 0.508, "grad_norm": 1104.0, "kl_loss_13": 132.70711517333984, "kl_loss_2": 2212.339666748047, "kl_loss_4": 1335.3783874511719, "kl_loss_9": 497.7675247192383, "learning_rate": 0.0004952400830302117, "loss": 1039.142, "step": 5080 }, { "ce_loss_13": 3.178085136413574, "ce_loss_17": 3.1244508147239687, "ce_loss_2": 4.190670132637024, "ce_loss_4": 3.7374264001846313, "ce_loss_9": 3.337313449382782, "epoch": 0.509, "grad_norm": 876.0, "kl_loss_13": 136.45259895324708, "kl_loss_2": 2260.521563720703, "kl_loss_4": 1371.6193481445312, "kl_loss_9": 516.6408508300781, "learning_rate": 0.0004936535186019053, "loss": 1045.8287, "step": 5090 }, { "ce_loss_13": 3.277904772758484, "ce_loss_17": 3.2290755033493044, "ce_loss_2": 4.2309854388237, "ce_loss_4": 3.7887255787849425, "ce_loss_9": 3.429102098941803, "epoch": 0.51, "grad_norm": 980.0, "kl_loss_13": 130.1757598876953, "kl_loss_2": 2131.6693603515623, "kl_loss_4": 1280.2371032714843, "kl_loss_9": 485.75081634521484, "learning_rate": 0.000492067018082596, "loss": 1022.1869, "step": 5100 }, { "ce_loss_13": 3.2151978492736815, "ce_loss_17": 3.159080648422241, "ce_loss_2": 4.241300308704377, "ce_loss_4": 3.7795289278030397, "ce_loss_9": 3.380474317073822, "epoch": 0.511, "grad_norm": 1136.0, "kl_loss_13": 137.61385345458984, "kl_loss_2": 2275.185760498047, "kl_loss_4": 1376.8430786132812, "kl_loss_9": 519.0857238769531, "learning_rate": 0.0004904805974483267, "loss": 1077.6825, "step": 5110 }, { "ce_loss_13": 3.326034140586853, "ce_loss_17": 3.2685611844062805, "ce_loss_2": 4.32475733757019, "ce_loss_4": 3.88838849067688, "ce_loss_9": 3.492398464679718, "epoch": 0.512, "grad_norm": 1104.0, "kl_loss_13": 141.92054405212403, "kl_loss_2": 2250.115447998047, "kl_loss_4": 1391.5601806640625, "kl_loss_9": 531.3891952514648, "learning_rate": 0.0004888942726743353, "loss": 1089.7244, "step": 5120 }, { "ce_loss_13": 3.200274407863617, "ce_loss_17": 3.1462685227394105, "ce_loss_2": 4.189842891693115, "ce_loss_4": 3.746474051475525, "ce_loss_9": 3.3571847319602965, "epoch": 0.513, "grad_norm": 1552.0, "kl_loss_13": 136.46018829345704, "kl_loss_2": 2217.6776916503904, "kl_loss_4": 1355.43916015625, "kl_loss_9": 510.4006942749023, "learning_rate": 0.0004873080597348947, "loss": 1053.9887, "step": 5130 }, { "ce_loss_13": 3.0924919962882997, "ce_loss_17": 3.0377991914749147, "ce_loss_2": 4.136486649513245, "ce_loss_4": 3.6663604736328126, "ce_loss_9": 3.254821312427521, "epoch": 0.514, "grad_norm": 1104.0, "kl_loss_13": 133.889599609375, "kl_loss_2": 2305.96884765625, "kl_loss_4": 1389.7805236816407, "kl_loss_9": 507.92732391357424, "learning_rate": 0.0004857219746031519, "loss": 1059.3795, "step": 5140 }, { "ce_loss_13": 3.26264545917511, "ce_loss_17": 3.2070136666297913, "ce_loss_2": 4.222973263263702, "ce_loss_4": 3.785707104206085, "ce_loss_9": 3.4130852818489075, "epoch": 0.515, "grad_norm": 1020.0, "kl_loss_13": 134.39491539001466, "kl_loss_2": 2168.3056701660157, "kl_loss_4": 1311.4709350585938, "kl_loss_9": 499.8022964477539, "learning_rate": 0.0004841360332509663, "loss": 1039.5553, "step": 5150 }, { "ce_loss_13": 3.2168318390846253, "ce_loss_17": 3.164391648769379, "ce_loss_2": 4.176003646850586, "ce_loss_4": 3.7388017058372496, "ce_loss_9": 3.367315924167633, "epoch": 0.516, "grad_norm": 924.0, "kl_loss_13": 129.5806926727295, "kl_loss_2": 2149.677600097656, "kl_loss_4": 1296.674005126953, "kl_loss_9": 491.05738830566406, "learning_rate": 0.0004825502516487497, "loss": 996.7398, "step": 5160 }, { "ce_loss_13": 3.175827443599701, "ce_loss_17": 3.1233877658843996, "ce_loss_2": 4.170264160633087, "ce_loss_4": 3.72345951795578, "ce_loss_9": 3.3318180561065676, "epoch": 0.517, "grad_norm": 1368.0, "kl_loss_13": 133.3531795501709, "kl_loss_2": 2230.0435546875, "kl_loss_4": 1358.2493774414063, "kl_loss_9": 509.1219192504883, "learning_rate": 0.00048096464576530507, "loss": 1057.0379, "step": 5170 }, { "ce_loss_13": 3.2806073904037474, "ce_loss_17": 3.2255469441413878, "ce_loss_2": 4.210289931297302, "ce_loss_4": 3.7867536306381226, "ce_loss_9": 3.4270519495010374, "epoch": 0.518, "grad_norm": 984.0, "kl_loss_13": 133.39203376770018, "kl_loss_2": 2102.7431457519533, "kl_loss_4": 1276.3300354003907, "kl_loss_9": 491.09695587158205, "learning_rate": 0.00047937923156766646, "loss": 1010.4639, "step": 5180 }, { "ce_loss_13": 3.3245773792266844, "ce_loss_17": 3.272130787372589, "ce_loss_2": 4.243328547477722, "ce_loss_4": 3.8309839963912964, "ce_loss_9": 3.4682255506515505, "epoch": 0.519, "grad_norm": 1004.0, "kl_loss_13": 131.4694736480713, "kl_loss_2": 2084.385565185547, "kl_loss_4": 1282.0379821777344, "kl_loss_9": 487.6641311645508, "learning_rate": 0.00047779402502093696, "loss": 1013.3197, "step": 5190 }, { "ce_loss_13": 3.288065457344055, "ce_loss_17": 3.2369380474090574, "ce_loss_2": 4.237306439876557, "ce_loss_4": 3.8122665524482726, "ce_loss_9": 3.4411548733711244, "epoch": 0.52, "grad_norm": 976.0, "kl_loss_13": 133.08018188476564, "kl_loss_2": 2137.024792480469, "kl_loss_4": 1301.208056640625, "kl_loss_9": 492.8338592529297, "learning_rate": 0.0004762090420881289, "loss": 1027.7091, "step": 5200 }, { "ce_loss_13": 3.208534812927246, "ce_loss_17": 3.1561156034469606, "ce_loss_2": 4.15554096698761, "ce_loss_4": 3.73324168920517, "ce_loss_9": 3.36050431728363, "epoch": 0.521, "grad_norm": 1568.0, "kl_loss_13": 132.15848541259766, "kl_loss_2": 2140.581823730469, "kl_loss_4": 1302.3179260253905, "kl_loss_9": 492.4755630493164, "learning_rate": 0.00047462429873000296, "loss": 1005.7602, "step": 5210 }, { "ce_loss_13": 3.2912041306495667, "ce_loss_17": 3.23849618434906, "ce_loss_2": 4.235232472419739, "ce_loss_4": 3.8027437567710876, "ce_loss_9": 3.4434992551803587, "epoch": 0.522, "grad_norm": 1616.0, "kl_loss_13": 133.86591262817382, "kl_loss_2": 2145.210546875, "kl_loss_4": 1287.545635986328, "kl_loss_9": 493.10424957275393, "learning_rate": 0.0004730398109049071, "loss": 1015.4592, "step": 5220 }, { "ce_loss_13": 3.2192367196083067, "ce_loss_17": 3.1628214597702025, "ce_loss_2": 4.221984088420868, "ce_loss_4": 3.7681381702423096, "ce_loss_9": 3.3812098026275637, "epoch": 0.523, "grad_norm": 1768.0, "kl_loss_13": 135.77215042114258, "kl_loss_2": 2247.011602783203, "kl_loss_4": 1359.4958190917969, "kl_loss_9": 513.5448059082031, "learning_rate": 0.000471455594568616, "loss": 1040.0474, "step": 5230 }, { "ce_loss_13": 3.2910205006599424, "ce_loss_17": 3.2370130896568297, "ce_loss_2": 4.222568416595459, "ce_loss_4": 3.802774941921234, "ce_loss_9": 3.44402939081192, "epoch": 0.524, "grad_norm": 1184.0, "kl_loss_13": 133.75257568359376, "kl_loss_2": 2106.518200683594, "kl_loss_4": 1283.7892517089845, "kl_loss_9": 492.6229904174805, "learning_rate": 0.00046987166567417086, "loss": 1023.7068, "step": 5240 }, { "ce_loss_13": 3.212172508239746, "ce_loss_17": 3.1608593344688414, "ce_loss_2": 4.178283941745758, "ce_loss_4": 3.7399025201797484, "ce_loss_9": 3.364052891731262, "epoch": 0.525, "grad_norm": 1200.0, "kl_loss_13": 131.91106262207032, "kl_loss_2": 2165.4636291503907, "kl_loss_4": 1304.9942810058594, "kl_loss_9": 495.7914474487305, "learning_rate": 0.00046828804017171776, "loss": 1001.8737, "step": 5250 }, { "ce_loss_13": 3.2519564867019652, "ce_loss_17": 3.196308982372284, "ce_loss_2": 4.241941142082214, "ce_loss_4": 3.7961438655853272, "ce_loss_9": 3.4126891136169433, "epoch": 0.526, "grad_norm": 1208.0, "kl_loss_13": 135.40493927001953, "kl_loss_2": 2191.02392578125, "kl_loss_4": 1326.1872314453126, "kl_loss_9": 502.3997512817383, "learning_rate": 0.00046670473400834805, "loss": 1044.8854, "step": 5260 }, { "ce_loss_13": 3.192030155658722, "ce_loss_17": 3.1420849442481993, "ce_loss_2": 4.150267624855042, "ce_loss_4": 3.7068399310112, "ce_loss_9": 3.339703416824341, "epoch": 0.527, "grad_norm": 1008.0, "kl_loss_13": 130.03723640441893, "kl_loss_2": 2137.622454833984, "kl_loss_4": 1283.7067993164062, "kl_loss_9": 482.86547088623047, "learning_rate": 0.00046512176312793734, "loss": 1049.1754, "step": 5270 }, { "ce_loss_13": 3.1847679257392882, "ce_loss_17": 3.128868842124939, "ce_loss_2": 4.148423171043396, "ce_loss_4": 3.7098173022270204, "ce_loss_9": 3.3364980459213256, "epoch": 0.528, "grad_norm": 1168.0, "kl_loss_13": 132.88791580200194, "kl_loss_2": 2170.6068908691404, "kl_loss_4": 1316.0835388183593, "kl_loss_9": 496.25191802978514, "learning_rate": 0.00046353914347098467, "loss": 1039.4596, "step": 5280 }, { "ce_loss_13": 3.278215193748474, "ce_loss_17": 3.2252373695373535, "ce_loss_2": 4.246132504940033, "ce_loss_4": 3.805600607395172, "ce_loss_9": 3.4288393020629884, "epoch": 0.529, "grad_norm": 1312.0, "kl_loss_13": 132.18092765808106, "kl_loss_2": 2160.4322875976563, "kl_loss_4": 1304.8613037109376, "kl_loss_9": 489.24100189208986, "learning_rate": 0.0004619568909744524, "loss": 1035.6349, "step": 5290 }, { "ce_loss_13": 3.2832093358039858, "ce_loss_17": 3.227021861076355, "ce_loss_2": 4.233003640174866, "ce_loss_4": 3.8108847975730895, "ce_loss_9": 3.4366191983222962, "epoch": 0.53, "grad_norm": 1328.0, "kl_loss_13": 132.73285179138185, "kl_loss_2": 2121.3479553222655, "kl_loss_4": 1301.2569274902344, "kl_loss_9": 492.59303283691406, "learning_rate": 0.00046037502157160573, "loss": 1030.7157, "step": 5300 }, { "ce_loss_13": 3.158991277217865, "ce_loss_17": 3.105714809894562, "ce_loss_2": 4.146761012077332, "ce_loss_4": 3.7026300191879273, "ce_loss_9": 3.317048418521881, "epoch": 0.531, "grad_norm": 1064.0, "kl_loss_13": 133.3448528289795, "kl_loss_2": 2189.5560302734375, "kl_loss_4": 1326.8446472167968, "kl_loss_9": 504.44376983642576, "learning_rate": 0.00045879355119185207, "loss": 1043.1428, "step": 5310 }, { "ce_loss_13": 3.2406702637672424, "ce_loss_17": 3.187274920940399, "ce_loss_2": 4.220140731334686, "ce_loss_4": 3.778945767879486, "ce_loss_9": 3.3974151372909547, "epoch": 0.532, "grad_norm": 1488.0, "kl_loss_13": 134.67926635742188, "kl_loss_2": 2202.9053283691405, "kl_loss_4": 1335.4775756835938, "kl_loss_9": 508.46104125976564, "learning_rate": 0.0004572124957605803, "loss": 1054.7464, "step": 5320 }, { "ce_loss_13": 3.2571631550788878, "ce_loss_17": 3.2028262853622436, "ce_loss_2": 4.225158095359802, "ce_loss_4": 3.789994442462921, "ce_loss_9": 3.415012741088867, "epoch": 0.533, "grad_norm": 888.0, "kl_loss_13": 134.50125694274902, "kl_loss_2": 2184.2195251464846, "kl_loss_4": 1322.377685546875, "kl_loss_9": 503.02623138427737, "learning_rate": 0.00045563187119900103, "loss": 1021.092, "step": 5330 }, { "ce_loss_13": 3.1055750966072084, "ce_loss_17": 3.0517826795578005, "ce_loss_2": 4.1029151439666744, "ce_loss_4": 3.653390109539032, "ce_loss_9": 3.2625330805778505, "epoch": 0.534, "grad_norm": 1128.0, "kl_loss_13": 132.5312572479248, "kl_loss_2": 2217.793896484375, "kl_loss_4": 1342.1891967773438, "kl_loss_9": 499.63763427734375, "learning_rate": 0.00045405169342398633, "loss": 1043.8357, "step": 5340 }, { "ce_loss_13": 3.1921161651611327, "ce_loss_17": 3.135869061946869, "ce_loss_2": 4.191024851799011, "ce_loss_4": 3.7327986478805544, "ce_loss_9": 3.349143397808075, "epoch": 0.535, "grad_norm": 1144.0, "kl_loss_13": 136.30418968200684, "kl_loss_2": 2229.098095703125, "kl_loss_4": 1338.486151123047, "kl_loss_9": 503.174609375, "learning_rate": 0.0004524719783479088, "loss": 1029.323, "step": 5350 }, { "ce_loss_13": 3.1456685423851014, "ce_loss_17": 3.0922767996788023, "ce_loss_2": 4.1534268140792845, "ce_loss_4": 3.704431414604187, "ce_loss_9": 3.3060895442962646, "epoch": 0.536, "grad_norm": 1040.0, "kl_loss_13": 135.85845870971679, "kl_loss_2": 2255.5453857421876, "kl_loss_4": 1366.2614990234374, "kl_loss_9": 509.90000305175784, "learning_rate": 0.00045089274187848144, "loss": 1034.9277, "step": 5360 }, { "ce_loss_13": 3.264985990524292, "ce_loss_17": 3.2127264380455016, "ce_loss_2": 4.220912146568298, "ce_loss_4": 3.7728867888450623, "ce_loss_9": 3.411216843128204, "epoch": 0.537, "grad_norm": 1504.0, "kl_loss_13": 132.2303981781006, "kl_loss_2": 2172.8490966796876, "kl_loss_4": 1290.3493774414062, "kl_loss_9": 493.3435821533203, "learning_rate": 0.00044931399991859835, "loss": 1017.0732, "step": 5370 }, { "ce_loss_13": 3.1303730368614198, "ce_loss_17": 3.0752500176429747, "ce_loss_2": 4.101239895820617, "ce_loss_4": 3.6601492881774904, "ce_loss_9": 3.284208047389984, "epoch": 0.538, "grad_norm": 988.0, "kl_loss_13": 131.75076866149902, "kl_loss_2": 2184.6258850097656, "kl_loss_4": 1319.6296264648438, "kl_loss_9": 493.5565811157227, "learning_rate": 0.00044773576836617336, "loss": 1021.7246, "step": 5380 }, { "ce_loss_13": 3.2171466588974, "ce_loss_17": 3.162190842628479, "ce_loss_2": 4.19703506231308, "ce_loss_4": 3.7648111939430238, "ce_loss_9": 3.3748027086257935, "epoch": 0.539, "grad_norm": 860.0, "kl_loss_13": 134.25287551879882, "kl_loss_2": 2210.225714111328, "kl_loss_4": 1352.450634765625, "kl_loss_9": 510.7650650024414, "learning_rate": 0.00044615806311398056, "loss": 1058.4185, "step": 5390 }, { "ce_loss_13": 3.2941567420959474, "ce_loss_17": 3.243577218055725, "ce_loss_2": 4.194008111953735, "ce_loss_4": 3.783040940761566, "ce_loss_9": 3.4352632999420165, "epoch": 0.54, "grad_norm": 1128.0, "kl_loss_13": 129.89540672302246, "kl_loss_2": 2061.472998046875, "kl_loss_4": 1257.5868103027344, "kl_loss_9": 481.79625091552737, "learning_rate": 0.00044458090004949454, "loss": 1021.7906, "step": 5400 }, { "ce_loss_13": 3.155569624900818, "ce_loss_17": 3.0976157426834106, "ce_loss_2": 4.1883559107780455, "ce_loss_4": 3.722623872756958, "ce_loss_9": 3.3214789986610413, "epoch": 0.541, "grad_norm": 1360.0, "kl_loss_13": 137.566650390625, "kl_loss_2": 2315.764050292969, "kl_loss_4": 1404.6625244140625, "kl_loss_9": 524.3114639282227, "learning_rate": 0.0004430042950547297, "loss": 1049.3477, "step": 5410 }, { "ce_loss_13": 3.246773374080658, "ce_loss_17": 3.188892090320587, "ce_loss_2": 4.236356163024903, "ce_loss_4": 3.7915536642074583, "ce_loss_9": 3.4077443361282347, "epoch": 0.542, "grad_norm": 1064.0, "kl_loss_13": 137.08602447509764, "kl_loss_2": 2208.137078857422, "kl_loss_4": 1337.8753173828125, "kl_loss_9": 509.88084716796874, "learning_rate": 0.0004414282640060809, "loss": 1038.7643, "step": 5420 }, { "ce_loss_13": 3.3373844981193543, "ce_loss_17": 3.2815580010414123, "ce_loss_2": 4.274537968635559, "ce_loss_4": 3.868986356258392, "ce_loss_9": 3.51566801071167, "epoch": 0.543, "grad_norm": 1056.0, "kl_loss_13": 134.13159561157227, "kl_loss_2": 2105.6742980957033, "kl_loss_4": 1305.07958984375, "kl_loss_9": 549.923291015625, "learning_rate": 0.0004398528227741633, "loss": 1056.3928, "step": 5430 }, { "ce_loss_13": 3.203597700595856, "ce_loss_17": 3.1493515610694884, "ce_loss_2": 4.176466572284698, "ce_loss_4": 3.746497869491577, "ce_loss_9": 3.3762477040290833, "epoch": 0.544, "grad_norm": 1192.0, "kl_loss_13": 135.4837947845459, "kl_loss_2": 2154.3332946777346, "kl_loss_4": 1333.020281982422, "kl_loss_9": 535.9938461303711, "learning_rate": 0.00043827798722365264, "loss": 1052.0586, "step": 5440 }, { "ce_loss_13": 3.318354940414429, "ce_loss_17": 3.264675056934357, "ce_loss_2": 4.252612495422364, "ce_loss_4": 3.827874505519867, "ce_loss_9": 3.471630024909973, "epoch": 0.545, "grad_norm": 1200.0, "kl_loss_13": 135.04343338012694, "kl_loss_2": 2126.4736877441405, "kl_loss_4": 1292.3597839355468, "kl_loss_9": 509.0356414794922, "learning_rate": 0.00043670377321312535, "loss": 1023.5631, "step": 5450 }, { "ce_loss_13": 3.3337458252906798, "ce_loss_17": 3.2813233137130737, "ce_loss_2": 4.250935280323029, "ce_loss_4": 3.831894874572754, "ce_loss_9": 3.4755696296691894, "epoch": 0.546, "grad_norm": 1008.0, "kl_loss_13": 131.79749908447266, "kl_loss_2": 2095.901251220703, "kl_loss_4": 1278.7771484375, "kl_loss_9": 495.3526916503906, "learning_rate": 0.0004351301965948991, "loss": 1026.3041, "step": 5460 }, { "ce_loss_13": 3.234769380092621, "ce_loss_17": 3.1844673991203307, "ce_loss_2": 4.165803074836731, "ce_loss_4": 3.741377794742584, "ce_loss_9": 3.388260471820831, "epoch": 0.547, "grad_norm": 1200.0, "kl_loss_13": 130.0521255493164, "kl_loss_2": 2092.814373779297, "kl_loss_4": 1271.5724792480469, "kl_loss_9": 490.8547622680664, "learning_rate": 0.000433557273214873, "loss": 1017.5789, "step": 5470 }, { "ce_loss_13": 3.2289255619049073, "ce_loss_17": 3.1718102216720583, "ce_loss_2": 4.171566390991211, "ce_loss_4": 3.7412869334220886, "ce_loss_9": 3.3804320335388183, "epoch": 0.548, "grad_norm": 1064.0, "kl_loss_13": 132.98280296325683, "kl_loss_2": 2129.2273376464846, "kl_loss_4": 1285.3636352539063, "kl_loss_9": 500.02489318847654, "learning_rate": 0.000431985018912368, "loss": 1006.1987, "step": 5480 }, { "ce_loss_13": 3.199251711368561, "ce_loss_17": 3.143541431427002, "ce_loss_2": 4.185951912403107, "ce_loss_4": 3.7487940549850465, "ce_loss_9": 3.357284128665924, "epoch": 0.549, "grad_norm": 1004.0, "kl_loss_13": 133.99756317138673, "kl_loss_2": 2215.6486083984373, "kl_loss_4": 1349.2265380859376, "kl_loss_9": 512.2941909790039, "learning_rate": 0.0004304134495199674, "loss": 1018.1971, "step": 5490 }, { "ce_loss_13": 3.2224794387817384, "ce_loss_17": 3.169151949882507, "ce_loss_2": 4.192255032062531, "ce_loss_4": 3.7633113980293276, "ce_loss_9": 3.3868582010269166, "epoch": 0.55, "grad_norm": 1312.0, "kl_loss_13": 133.84683647155762, "kl_loss_2": 2195.102508544922, "kl_loss_4": 1351.3821105957031, "kl_loss_9": 514.1343627929688, "learning_rate": 0.0004288425808633575, "loss": 1033.9531, "step": 5500 }, { "ce_loss_13": 3.2058059334754945, "ce_loss_17": 3.153227412700653, "ce_loss_2": 4.171619081497193, "ce_loss_4": 3.733169734477997, "ce_loss_9": 3.3558686494827272, "epoch": 0.551, "grad_norm": 1136.0, "kl_loss_13": 131.19682273864746, "kl_loss_2": 2168.918328857422, "kl_loss_4": 1316.411883544922, "kl_loss_9": 494.8409881591797, "learning_rate": 0.0004272724287611684, "loss": 1028.4314, "step": 5510 }, { "ce_loss_13": 3.176490139961243, "ce_loss_17": 3.1218523263931273, "ce_loss_2": 4.16993955373764, "ce_loss_4": 3.703540527820587, "ce_loss_9": 3.330122184753418, "epoch": 0.552, "grad_norm": 1004.0, "kl_loss_13": 134.03536376953124, "kl_loss_2": 2222.2029235839846, "kl_loss_4": 1320.7259704589844, "kl_loss_9": 501.89526214599607, "learning_rate": 0.00042570300902481425, "loss": 1033.6307, "step": 5520 }, { "ce_loss_13": 3.210631549358368, "ce_loss_17": 3.1596511006355286, "ce_loss_2": 4.15686526298523, "ce_loss_4": 3.72890887260437, "ce_loss_9": 3.3562500834465028, "epoch": 0.553, "grad_norm": 1288.0, "kl_loss_13": 130.87191467285157, "kl_loss_2": 2162.6354248046873, "kl_loss_4": 1304.2269592285156, "kl_loss_9": 492.1235870361328, "learning_rate": 0.00042413433745833423, "loss": 1021.6552, "step": 5530 }, { "ce_loss_13": 3.2040164947509764, "ce_loss_17": 3.1507420897483827, "ce_loss_2": 4.180953633785248, "ce_loss_4": 3.7361573576927185, "ce_loss_9": 3.3582830667495727, "epoch": 0.554, "grad_norm": 1032.0, "kl_loss_13": 132.4494644165039, "kl_loss_2": 2179.8425354003907, "kl_loss_4": 1315.2587341308595, "kl_loss_9": 498.22752075195314, "learning_rate": 0.0004225664298582339, "loss": 1004.4396, "step": 5540 }, { "ce_loss_13": 3.285593938827515, "ce_loss_17": 3.233135688304901, "ce_loss_2": 4.227004444599151, "ce_loss_4": 3.8036520719528197, "ce_loss_9": 3.4368974804878234, "epoch": 0.555, "grad_norm": 944.0, "kl_loss_13": 129.94357032775878, "kl_loss_2": 2118.2228393554688, "kl_loss_4": 1287.5967407226562, "kl_loss_9": 489.92351684570315, "learning_rate": 0.000420999302013325, "loss": 1005.6581, "step": 5550 }, { "ce_loss_13": 3.187768280506134, "ce_loss_17": 3.1312767386436464, "ce_loss_2": 4.21038703918457, "ce_loss_4": 3.7397583365440368, "ce_loss_9": 3.347144615650177, "epoch": 0.556, "grad_norm": 1248.0, "kl_loss_13": 136.3631275177002, "kl_loss_2": 2268.610418701172, "kl_loss_4": 1352.30615234375, "kl_loss_9": 510.2410949707031, "learning_rate": 0.000419432969704568, "loss": 1032.9252, "step": 5560 }, { "ce_loss_13": 3.2292237162590025, "ce_loss_17": 3.177082085609436, "ce_loss_2": 4.18310284614563, "ce_loss_4": 3.744145452976227, "ce_loss_9": 3.3792643904685975, "epoch": 0.557, "grad_norm": 980.0, "kl_loss_13": 132.28214225769042, "kl_loss_2": 2134.446813964844, "kl_loss_4": 1289.7424621582031, "kl_loss_9": 493.618879699707, "learning_rate": 0.00041786744870491154, "loss": 1042.5356, "step": 5570 }, { "ce_loss_13": 3.171014332771301, "ce_loss_17": 3.1177715182304384, "ce_loss_2": 4.142423605918884, "ce_loss_4": 3.7064967155456543, "ce_loss_9": 3.3284991502761843, "epoch": 0.558, "grad_norm": 972.0, "kl_loss_13": 134.93039207458497, "kl_loss_2": 2183.427178955078, "kl_loss_4": 1331.1433715820312, "kl_loss_9": 506.1816467285156, "learning_rate": 0.0004163027547791347, "loss": 1030.3906, "step": 5580 }, { "ce_loss_13": 3.1522242069244384, "ce_loss_17": 3.0993991136550902, "ce_loss_2": 4.171307468414307, "ce_loss_4": 3.705061507225037, "ce_loss_9": 3.311863899230957, "epoch": 0.559, "grad_norm": 1416.0, "kl_loss_13": 133.92113571166993, "kl_loss_2": 2253.875848388672, "kl_loss_4": 1346.7115905761718, "kl_loss_9": 508.65733489990237, "learning_rate": 0.0004147389036836881, "loss": 1039.0274, "step": 5590 }, { "ce_loss_13": 3.1989952683448792, "ce_loss_17": 3.145903205871582, "ce_loss_2": 4.1762094616889955, "ce_loss_4": 3.7403613090515138, "ce_loss_9": 3.3588144302368166, "epoch": 0.56, "grad_norm": 1224.0, "kl_loss_13": 135.01686058044433, "kl_loss_2": 2191.3487060546877, "kl_loss_4": 1341.5612731933593, "kl_loss_9": 508.8012298583984, "learning_rate": 0.00041317591116653486, "loss": 1055.6337, "step": 5600 }, { "ce_loss_13": 3.2330668926239015, "ce_loss_17": 3.1778844714164736, "ce_loss_2": 4.216384243965149, "ce_loss_4": 3.7711787104606627, "ce_loss_9": 3.392125999927521, "epoch": 0.561, "grad_norm": 1032.0, "kl_loss_13": 136.42426719665528, "kl_loss_2": 2197.669677734375, "kl_loss_4": 1332.3004516601563, "kl_loss_9": 508.9519470214844, "learning_rate": 0.0004116137929669921, "loss": 1025.2678, "step": 5610 }, { "ce_loss_13": 3.2243699073791503, "ce_loss_17": 3.1729207754135134, "ce_loss_2": 4.187157392501831, "ce_loss_4": 3.7499901413917542, "ce_loss_9": 3.380918550491333, "epoch": 0.562, "grad_norm": 1472.0, "kl_loss_13": 130.93699417114257, "kl_loss_2": 2164.2665771484376, "kl_loss_4": 1314.7912902832031, "kl_loss_9": 498.43362884521486, "learning_rate": 0.00041005256481557305, "loss": 1011.9689, "step": 5620 }, { "ce_loss_13": 3.3207542061805726, "ce_loss_17": 3.2686077475547792, "ce_loss_2": 4.2263119459152225, "ce_loss_4": 3.817964506149292, "ce_loss_9": 3.46480907201767, "epoch": 0.563, "grad_norm": 936.0, "kl_loss_13": 129.03732147216797, "kl_loss_2": 2062.6393310546873, "kl_loss_4": 1257.5776977539062, "kl_loss_9": 482.7308776855469, "learning_rate": 0.00040849224243382767, "loss": 998.6781, "step": 5630 }, { "ce_loss_13": 3.180345618724823, "ce_loss_17": 3.127782416343689, "ce_loss_2": 4.147598731517792, "ce_loss_4": 3.7082226634025575, "ce_loss_9": 3.330659508705139, "epoch": 0.564, "grad_norm": 1056.0, "kl_loss_13": 131.76968536376953, "kl_loss_2": 2173.7773864746096, "kl_loss_4": 1314.792254638672, "kl_loss_9": 502.11459197998045, "learning_rate": 0.000406932841534185, "loss": 1010.9795, "step": 5640 }, { "ce_loss_13": 3.145984673500061, "ce_loss_17": 3.0922013878822328, "ce_loss_2": 4.1250264883041385, "ce_loss_4": 3.6886088490486144, "ce_loss_9": 3.300795888900757, "epoch": 0.565, "grad_norm": 1312.0, "kl_loss_13": 132.76808471679686, "kl_loss_2": 2193.359210205078, "kl_loss_4": 1333.0800720214843, "kl_loss_9": 499.2179977416992, "learning_rate": 0.0004053743778197951, "loss": 1052.1918, "step": 5650 }, { "ce_loss_13": 3.249157118797302, "ce_loss_17": 3.192816066741943, "ce_loss_2": 4.213837122917175, "ce_loss_4": 3.779017674922943, "ce_loss_9": 3.4072057247161864, "epoch": 0.566, "grad_norm": 1072.0, "kl_loss_13": 135.29073944091797, "kl_loss_2": 2147.1768493652344, "kl_loss_4": 1304.3086853027344, "kl_loss_9": 500.4528610229492, "learning_rate": 0.0004038168669843697, "loss": 1037.9282, "step": 5660 }, { "ce_loss_13": 3.2034655570983888, "ce_loss_17": 3.1523698568344116, "ce_loss_2": 4.144994485378265, "ce_loss_4": 3.7145391583442686, "ce_loss_9": 3.3537856101989747, "epoch": 0.567, "grad_norm": 1520.0, "kl_loss_13": 130.33058433532716, "kl_loss_2": 2116.760290527344, "kl_loss_4": 1276.805889892578, "kl_loss_9": 489.4864074707031, "learning_rate": 0.000402260324712026, "loss": 1028.4375, "step": 5670 }, { "ce_loss_13": 3.2483932852745054, "ce_loss_17": 3.1986745238304137, "ce_loss_2": 4.234227883815765, "ce_loss_4": 3.7821094512939455, "ce_loss_9": 3.406536591053009, "epoch": 0.568, "grad_norm": 1040.0, "kl_loss_13": 129.7056034088135, "kl_loss_2": 2196.4221740722655, "kl_loss_4": 1318.0563720703126, "kl_loss_9": 496.03491668701173, "learning_rate": 0.00040070476667712743, "loss": 1018.4254, "step": 5680 }, { "ce_loss_13": 3.2774877667427065, "ce_loss_17": 3.2215285301208496, "ce_loss_2": 4.236721122264862, "ce_loss_4": 3.806514632701874, "ce_loss_9": 3.429469978809357, "epoch": 0.569, "grad_norm": 868.0, "kl_loss_13": 132.3956100463867, "kl_loss_2": 2151.301025390625, "kl_loss_4": 1306.724298095703, "kl_loss_9": 495.3632614135742, "learning_rate": 0.0003991502085441259, "loss": 1026.9729, "step": 5690 }, { "ce_loss_13": 3.3139462232589723, "ce_loss_17": 3.261700189113617, "ce_loss_2": 4.225625109672547, "ce_loss_4": 3.8117709279060366, "ce_loss_9": 3.4587910771369934, "epoch": 0.57, "grad_norm": 996.0, "kl_loss_13": 128.47461585998536, "kl_loss_2": 2052.756951904297, "kl_loss_4": 1247.8939636230468, "kl_loss_9": 480.71121368408205, "learning_rate": 0.0003975966659674047, "loss": 1007.0836, "step": 5700 }, { "ce_loss_13": 3.2800772428512572, "ce_loss_17": 3.228550946712494, "ce_loss_2": 4.2356154203414915, "ce_loss_4": 3.800789141654968, "ce_loss_9": 3.4307155132293703, "epoch": 0.571, "grad_norm": 1120.0, "kl_loss_13": 132.43450775146485, "kl_loss_2": 2138.3428588867187, "kl_loss_4": 1292.2898010253907, "kl_loss_9": 493.69357147216795, "learning_rate": 0.0003960441545911204, "loss": 1009.6633, "step": 5710 }, { "ce_loss_13": 3.274647569656372, "ce_loss_17": 3.2232601284980773, "ce_loss_2": 4.217183375358582, "ce_loss_4": 3.7938233494758604, "ce_loss_9": 3.4216581344604493, "epoch": 0.572, "grad_norm": 1192.0, "kl_loss_13": 130.9056011199951, "kl_loss_2": 2138.7324340820314, "kl_loss_4": 1306.0460388183594, "kl_loss_9": 494.74311981201174, "learning_rate": 0.0003944926900490452, "loss": 1017.4797, "step": 5720 }, { "ce_loss_13": 3.189040946960449, "ce_loss_17": 3.1349090933799744, "ce_loss_2": 4.184177494049072, "ce_loss_4": 3.734488534927368, "ce_loss_9": 3.34765282869339, "epoch": 0.573, "grad_norm": 996.0, "kl_loss_13": 133.4018653869629, "kl_loss_2": 2211.316973876953, "kl_loss_4": 1334.711181640625, "kl_loss_9": 508.8348098754883, "learning_rate": 0.0003929422879644099, "loss": 1021.6818, "step": 5730 }, { "ce_loss_13": 3.1942123889923097, "ce_loss_17": 3.142498242855072, "ce_loss_2": 4.151941287517547, "ce_loss_4": 3.7095534682273863, "ce_loss_9": 3.3434372544288635, "epoch": 0.574, "grad_norm": 1176.0, "kl_loss_13": 128.78155403137208, "kl_loss_2": 2139.55869140625, "kl_loss_4": 1293.195068359375, "kl_loss_9": 486.07669830322266, "learning_rate": 0.0003913929639497462, "loss": 995.2121, "step": 5740 }, { "ce_loss_13": 3.1522650957107543, "ce_loss_17": 3.099550783634186, "ce_loss_2": 4.147507619857788, "ce_loss_4": 3.6863506197929383, "ce_loss_9": 3.3036242008209227, "epoch": 0.575, "grad_norm": 1176.0, "kl_loss_13": 129.52433891296386, "kl_loss_2": 2215.996954345703, "kl_loss_4": 1330.6463989257813, "kl_loss_9": 491.7368698120117, "learning_rate": 0.00038984473360672965, "loss": 1016.4816, "step": 5750 }, { "ce_loss_13": 3.159215581417084, "ce_loss_17": 3.1063928246498107, "ce_loss_2": 4.150642693042755, "ce_loss_4": 3.695197355747223, "ce_loss_9": 3.3141119360923765, "epoch": 0.576, "grad_norm": 856.0, "kl_loss_13": 128.82598114013672, "kl_loss_2": 2210.5453369140623, "kl_loss_4": 1317.800146484375, "kl_loss_9": 490.75178680419924, "learning_rate": 0.0003882976125260229, "loss": 1012.6595, "step": 5760 }, { "ce_loss_13": 3.2239240527153017, "ce_loss_17": 3.1707969427108766, "ce_loss_2": 4.190750896930695, "ce_loss_4": 3.7483965277671816, "ce_loss_9": 3.376004767417908, "epoch": 0.577, "grad_norm": 1144.0, "kl_loss_13": 130.94254608154296, "kl_loss_2": 2155.1744018554687, "kl_loss_4": 1296.964910888672, "kl_loss_9": 491.41676330566406, "learning_rate": 0.00038675161628711776, "loss": 1022.7446, "step": 5770 }, { "ce_loss_13": 3.2630710005760193, "ce_loss_17": 3.210341048240662, "ce_loss_2": 4.20232663154602, "ce_loss_4": 3.771031928062439, "ce_loss_9": 3.4094324827194216, "epoch": 0.578, "grad_norm": 1104.0, "kl_loss_13": 130.60945892333984, "kl_loss_2": 2114.451708984375, "kl_loss_4": 1274.0074768066406, "kl_loss_9": 487.78747406005857, "learning_rate": 0.0003852067604581794, "loss": 1034.5994, "step": 5780 }, { "ce_loss_13": 3.213357675075531, "ce_loss_17": 3.1623314380645753, "ce_loss_2": 4.178617370128632, "ce_loss_4": 3.734889566898346, "ce_loss_9": 3.363708531856537, "epoch": 0.579, "grad_norm": 1256.0, "kl_loss_13": 129.9854564666748, "kl_loss_2": 2183.9435424804688, "kl_loss_4": 1317.9291015625, "kl_loss_9": 493.04237365722656, "learning_rate": 0.0003836630605958888, "loss": 1017.9811, "step": 5790 }, { "ce_loss_13": 3.2693671107292177, "ce_loss_17": 3.2181182622909548, "ce_loss_2": 4.221805596351624, "ce_loss_4": 3.78885041475296, "ce_loss_9": 3.421799433231354, "epoch": 0.58, "grad_norm": 1192.0, "kl_loss_13": 131.78593864440919, "kl_loss_2": 2176.81298828125, "kl_loss_4": 1306.9365783691405, "kl_loss_9": 497.6964859008789, "learning_rate": 0.0003821205322452863, "loss": 1056.998, "step": 5800 }, { "ce_loss_13": 3.2440071582794188, "ce_loss_17": 3.1936801195144655, "ce_loss_2": 4.194181382656097, "ce_loss_4": 3.7538015484809875, "ce_loss_9": 3.393703615665436, "epoch": 0.581, "grad_norm": 1016.0, "kl_loss_13": 129.62765464782714, "kl_loss_2": 2139.3076782226562, "kl_loss_4": 1284.9189086914062, "kl_loss_9": 484.87910614013674, "learning_rate": 0.0003805791909396155, "loss": 1017.1694, "step": 5810 }, { "ce_loss_13": 3.1992859959602358, "ce_loss_17": 3.1508589386940002, "ce_loss_2": 4.169664001464843, "ce_loss_4": 3.720565640926361, "ce_loss_9": 3.349891519546509, "epoch": 0.582, "grad_norm": 1352.0, "kl_loss_13": 129.32291641235352, "kl_loss_2": 2162.0898681640624, "kl_loss_4": 1295.7684204101563, "kl_loss_9": 488.28394775390626, "learning_rate": 0.0003790390522001662, "loss": 1028.5563, "step": 5820 }, { "ce_loss_13": 3.1398118257522585, "ce_loss_17": 3.0897351026535036, "ce_loss_2": 4.109020674228669, "ce_loss_4": 3.663999307155609, "ce_loss_9": 3.2862123131752012, "epoch": 0.583, "grad_norm": 1032.0, "kl_loss_13": 128.52812805175782, "kl_loss_2": 2192.261993408203, "kl_loss_4": 1315.5209045410156, "kl_loss_9": 489.6519302368164, "learning_rate": 0.0003775001315361183, "loss": 1012.5699, "step": 5830 }, { "ce_loss_13": 3.241665470600128, "ce_loss_17": 3.186921846866608, "ce_loss_2": 4.219884061813355, "ce_loss_4": 3.7748271465301513, "ce_loss_9": 3.398333990573883, "epoch": 0.584, "grad_norm": 1056.0, "kl_loss_13": 132.4353328704834, "kl_loss_2": 2173.2671142578124, "kl_loss_4": 1307.314959716797, "kl_loss_9": 493.93543853759763, "learning_rate": 0.0003759624444443858, "loss": 1022.8445, "step": 5840 }, { "ce_loss_13": 3.279761719703674, "ce_loss_17": 3.2276388883590696, "ce_loss_2": 4.2155272483825685, "ce_loss_4": 3.7861064195632936, "ce_loss_9": 3.42243732213974, "epoch": 0.585, "grad_norm": 1168.0, "kl_loss_13": 130.31413650512695, "kl_loss_2": 2135.141632080078, "kl_loss_4": 1292.734228515625, "kl_loss_9": 486.45812835693357, "learning_rate": 0.00037442600640946044, "loss": 1002.9861, "step": 5850 }, { "ce_loss_13": 3.2379026412963867, "ce_loss_17": 3.186711013317108, "ce_loss_2": 4.18025815486908, "ce_loss_4": 3.749708187580109, "ce_loss_9": 3.386901295185089, "epoch": 0.586, "grad_norm": 1368.0, "kl_loss_13": 128.97719192504883, "kl_loss_2": 2137.7595092773436, "kl_loss_4": 1291.2993408203124, "kl_loss_9": 489.5702728271484, "learning_rate": 0.00037289083290325663, "loss": 996.3987, "step": 5860 }, { "ce_loss_13": 3.223183000087738, "ce_loss_17": 3.1709699511528013, "ce_loss_2": 4.15801248550415, "ce_loss_4": 3.730294871330261, "ce_loss_9": 3.3676915526390077, "epoch": 0.587, "grad_norm": 956.0, "kl_loss_13": 131.02411994934081, "kl_loss_2": 2102.5575744628904, "kl_loss_4": 1268.2917602539062, "kl_loss_9": 483.7045623779297, "learning_rate": 0.0003713569393849543, "loss": 999.1043, "step": 5870 }, { "ce_loss_13": 3.2732008695602417, "ce_loss_17": 3.2209419131278993, "ce_loss_2": 4.210261130332947, "ce_loss_4": 3.7881874203681947, "ce_loss_9": 3.42228764295578, "epoch": 0.588, "grad_norm": 1032.0, "kl_loss_13": 131.58010749816896, "kl_loss_2": 2132.156384277344, "kl_loss_4": 1298.139630126953, "kl_loss_9": 488.82654571533203, "learning_rate": 0.00036982434130084397, "loss": 1013.0625, "step": 5880 }, { "ce_loss_13": 3.1823778986930846, "ce_loss_17": 3.130832076072693, "ce_loss_2": 4.137702989578247, "ce_loss_4": 3.7032042384147643, "ce_loss_9": 3.3355833888053894, "epoch": 0.589, "grad_norm": 1368.0, "kl_loss_13": 133.46070594787597, "kl_loss_2": 2149.131475830078, "kl_loss_4": 1300.0962036132812, "kl_loss_9": 501.61551971435546, "learning_rate": 0.00036829305408417166, "loss": 1024.4371, "step": 5890 }, { "ce_loss_13": 3.1696176409721373, "ce_loss_17": 3.1138539552688598, "ce_loss_2": 4.159798121452331, "ce_loss_4": 3.7092005014419556, "ce_loss_9": 3.323733127117157, "epoch": 0.59, "grad_norm": 1200.0, "kl_loss_13": 132.95247344970704, "kl_loss_2": 2198.4232849121095, "kl_loss_4": 1329.5057006835937, "kl_loss_9": 497.9648834228516, "learning_rate": 0.0003667630931549826, "loss": 1025.4742, "step": 5900 }, { "ce_loss_13": 3.139581060409546, "ce_loss_17": 3.087868535518646, "ce_loss_2": 4.162065267562866, "ce_loss_4": 3.6882544636726378, "ce_loss_9": 3.29707407951355, "epoch": 0.591, "grad_norm": 1240.0, "kl_loss_13": 131.48533630371094, "kl_loss_2": 2270.6588500976563, "kl_loss_4": 1348.2559936523437, "kl_loss_9": 501.0956344604492, "learning_rate": 0.00036523447391996613, "loss": 1039.9964, "step": 5910 }, { "ce_loss_13": 3.229800522327423, "ce_loss_17": 3.1795202016830446, "ce_loss_2": 4.170403301715851, "ce_loss_4": 3.7378870248794556, "ce_loss_9": 3.3798424363136292, "epoch": 0.592, "grad_norm": 1272.0, "kl_loss_13": 127.81746520996094, "kl_loss_2": 2122.1211669921877, "kl_loss_4": 1275.9772521972657, "kl_loss_9": 483.68453216552734, "learning_rate": 0.00036370721177230114, "loss": 1000.5404, "step": 5920 }, { "ce_loss_13": 3.2272711992263794, "ce_loss_17": 3.174441361427307, "ce_loss_2": 4.205386471748352, "ce_loss_4": 3.751111078262329, "ce_loss_9": 3.382340908050537, "epoch": 0.593, "grad_norm": 972.0, "kl_loss_13": 133.14159812927247, "kl_loss_2": 2174.2038024902345, "kl_loss_4": 1306.2136352539062, "kl_loss_9": 498.0281463623047, "learning_rate": 0.00036218132209150044, "loss": 1021.5758, "step": 5930 }, { "ce_loss_13": 3.180279219150543, "ce_loss_17": 3.123573052883148, "ce_loss_2": 4.190980327129364, "ce_loss_4": 3.7382585763931275, "ce_loss_9": 3.3427863240242006, "epoch": 0.594, "grad_norm": 1032.0, "kl_loss_13": 136.12708206176757, "kl_loss_2": 2261.150262451172, "kl_loss_4": 1361.9492919921875, "kl_loss_9": 510.48377532958983, "learning_rate": 0.0003606568202432562, "loss": 1040.4267, "step": 5940 }, { "ce_loss_13": 3.2481855273246767, "ce_loss_17": 3.1964462637901305, "ce_loss_2": 4.238270390033722, "ce_loss_4": 3.7811824321746825, "ce_loss_9": 3.4021562576293944, "epoch": 0.595, "grad_norm": 1072.0, "kl_loss_13": 132.79216842651368, "kl_loss_2": 2225.1730224609373, "kl_loss_4": 1327.2218627929688, "kl_loss_9": 497.28614807128906, "learning_rate": 0.0003591337215792851, "loss": 1015.3662, "step": 5950 }, { "ce_loss_13": 3.2907931566238404, "ce_loss_17": 3.239170801639557, "ce_loss_2": 4.201977634429932, "ce_loss_4": 3.7864516615867614, "ce_loss_9": 3.435347092151642, "epoch": 0.596, "grad_norm": 1208.0, "kl_loss_13": 128.2599021911621, "kl_loss_2": 2096.9853454589843, "kl_loss_4": 1275.0028686523438, "kl_loss_9": 483.8995834350586, "learning_rate": 0.00035761204143717383, "loss": 1013.235, "step": 5960 }, { "ce_loss_13": 3.2461815237998963, "ce_loss_17": 3.193547213077545, "ce_loss_2": 4.199598157405854, "ce_loss_4": 3.7650540113449096, "ce_loss_9": 3.3914494156837462, "epoch": 0.597, "grad_norm": 1240.0, "kl_loss_13": 131.16183776855468, "kl_loss_2": 2161.549493408203, "kl_loss_4": 1307.6337036132813, "kl_loss_9": 490.7261627197266, "learning_rate": 0.0003560917951402245, "loss": 1038.1078, "step": 5970 }, { "ce_loss_13": 3.2252924919128416, "ce_loss_17": 3.1728481531143187, "ce_loss_2": 4.1842066764831545, "ce_loss_4": 3.7419864892959596, "ce_loss_9": 3.3711967945098875, "epoch": 0.598, "grad_norm": 1112.0, "kl_loss_13": 129.1011734008789, "kl_loss_2": 2145.193048095703, "kl_loss_4": 1291.327197265625, "kl_loss_9": 490.7007202148437, "learning_rate": 0.00035457299799730046, "loss": 1010.8068, "step": 5980 }, { "ce_loss_13": 3.282545638084412, "ce_loss_17": 3.2315930843353273, "ce_loss_2": 4.213894009590149, "ce_loss_4": 3.79153368473053, "ce_loss_9": 3.4328907132148743, "epoch": 0.599, "grad_norm": 888.0, "kl_loss_13": 129.47135429382325, "kl_loss_2": 2105.2073181152346, "kl_loss_4": 1276.262762451172, "kl_loss_9": 486.0889373779297, "learning_rate": 0.0003530556653026721, "loss": 1014.5766, "step": 5990 }, { "ce_loss_13": 3.2029795169830324, "ce_loss_17": 3.1522786259651183, "ce_loss_2": 4.1706165194511415, "ce_loss_4": 3.7232335209846497, "ce_loss_9": 3.352409815788269, "epoch": 0.6, "grad_norm": 3312.0, "kl_loss_13": 128.47716636657714, "kl_loss_2": 2174.0676513671874, "kl_loss_4": 1291.9308471679688, "kl_loss_9": 481.94248657226564, "learning_rate": 0.00035153981233586274, "loss": 1025.868, "step": 6000 }, { "ce_loss_13": 3.178492033481598, "ce_loss_17": 3.1272965908050536, "ce_loss_2": 4.149573051929474, "ce_loss_4": 3.705832135677338, "ce_loss_9": 3.3312960028648377, "epoch": 0.601, "grad_norm": 1064.0, "kl_loss_13": 127.85915641784668, "kl_loss_2": 2159.855822753906, "kl_loss_4": 1299.05146484375, "kl_loss_9": 485.8030700683594, "learning_rate": 0.00035002545436149473, "loss": 1046.7314, "step": 6010 }, { "ce_loss_13": 3.1914022088050844, "ce_loss_17": 3.1379736423492433, "ce_loss_2": 4.16829092502594, "ce_loss_4": 3.7276257395744326, "ce_loss_9": 3.3450494408607483, "epoch": 0.602, "grad_norm": 964.0, "kl_loss_13": 134.62960357666014, "kl_loss_2": 2200.064044189453, "kl_loss_4": 1337.7466247558593, "kl_loss_9": 499.6259002685547, "learning_rate": 0.0003485126066291364, "loss": 1014.4828, "step": 6020 }, { "ce_loss_13": 3.2334627151489257, "ce_loss_17": 3.1823294520378114, "ce_loss_2": 4.203712034225464, "ce_loss_4": 3.765213930606842, "ce_loss_9": 3.3829529643058778, "epoch": 0.603, "grad_norm": 1832.0, "kl_loss_13": 129.0961654663086, "kl_loss_2": 2162.0232177734374, "kl_loss_4": 1301.6566467285156, "kl_loss_9": 482.48655853271487, "learning_rate": 0.0003470012843731476, "loss": 1020.3895, "step": 6030 }, { "ce_loss_13": 3.1725748777389526, "ce_loss_17": 3.1219172358512877, "ce_loss_2": 4.153917360305786, "ce_loss_4": 3.7047430992126467, "ce_loss_9": 3.323031425476074, "epoch": 0.604, "grad_norm": 988.0, "kl_loss_13": 128.22620735168456, "kl_loss_2": 2174.6626892089844, "kl_loss_4": 1307.2364685058594, "kl_loss_9": 485.67259979248047, "learning_rate": 0.00034549150281252633, "loss": 1039.9221, "step": 6040 }, { "ce_loss_13": 3.156225049495697, "ce_loss_17": 3.1052051186561584, "ce_loss_2": 4.103243613243103, "ce_loss_4": 3.676672112941742, "ce_loss_9": 3.3094335913658144, "epoch": 0.605, "grad_norm": 948.0, "kl_loss_13": 130.03401947021484, "kl_loss_2": 2111.7546142578126, "kl_loss_4": 1276.77861328125, "kl_loss_9": 486.36414947509763, "learning_rate": 0.0003439832771507565, "loss": 1000.5956, "step": 6050 }, { "ce_loss_13": 3.1652685165405274, "ce_loss_17": 3.1147342681884767, "ce_loss_2": 4.136275148391723, "ce_loss_4": 3.7000526785850525, "ce_loss_9": 3.317437732219696, "epoch": 0.606, "grad_norm": 1112.0, "kl_loss_13": 129.71337890625, "kl_loss_2": 2177.412384033203, "kl_loss_4": 1315.5586364746093, "kl_loss_9": 489.50466003417966, "learning_rate": 0.0003424766225756537, "loss": 1013.9628, "step": 6060 }, { "ce_loss_13": 3.224382996559143, "ce_loss_17": 3.1717929601669312, "ce_loss_2": 4.186855435371399, "ce_loss_4": 3.7413568615913393, "ce_loss_9": 3.3756890177726744, "epoch": 0.607, "grad_norm": 1004.0, "kl_loss_13": 129.63483924865722, "kl_loss_2": 2154.6836853027344, "kl_loss_4": 1287.5677734375, "kl_loss_9": 492.26964416503904, "learning_rate": 0.00034097155425921255, "loss": 1000.866, "step": 6070 }, { "ce_loss_13": 3.1215175867080687, "ce_loss_17": 3.0668461322784424, "ce_loss_2": 4.104893136024475, "ce_loss_4": 3.6487231254577637, "ce_loss_9": 3.274439311027527, "epoch": 0.608, "grad_norm": 1048.0, "kl_loss_13": 130.64323501586915, "kl_loss_2": 2215.8221923828123, "kl_loss_4": 1315.2972045898437, "kl_loss_9": 493.64261322021486, "learning_rate": 0.0003394680873574546, "loss": 1019.724, "step": 6080 }, { "ce_loss_13": 3.221612584590912, "ce_loss_17": 3.168250393867493, "ce_loss_2": 4.213672697544098, "ce_loss_4": 3.7588576674461365, "ce_loss_9": 3.3779274940490724, "epoch": 0.609, "grad_norm": 1040.0, "kl_loss_13": 130.80139923095703, "kl_loss_2": 2217.8520080566404, "kl_loss_4": 1331.7625732421875, "kl_loss_9": 494.39530181884766, "learning_rate": 0.0003379662370102747, "loss": 1017.0575, "step": 6090 }, { "ce_loss_13": 3.236114704608917, "ce_loss_17": 3.1868168115615845, "ce_loss_2": 4.168602645397186, "ce_loss_4": 3.74908310174942, "ce_loss_9": 3.3852962970733644, "epoch": 0.61, "grad_norm": 1056.0, "kl_loss_13": 129.52690010070802, "kl_loss_2": 2133.054095458984, "kl_loss_4": 1290.4966674804687, "kl_loss_9": 490.9433334350586, "learning_rate": 0.0003364660183412892, "loss": 1019.0139, "step": 6100 }, { "ce_loss_13": 3.2169349193573, "ce_loss_17": 3.1648659348487853, "ce_loss_2": 4.162525868415832, "ce_loss_4": 3.726523590087891, "ce_loss_9": 3.367935848236084, "epoch": 0.611, "grad_norm": 1040.0, "kl_loss_13": 130.39142150878905, "kl_loss_2": 2152.272998046875, "kl_loss_4": 1297.5965270996094, "kl_loss_9": 494.06152801513673, "learning_rate": 0.0003349674464576834, "loss": 1028.7664, "step": 6110 }, { "ce_loss_13": 3.1670435786247255, "ce_loss_17": 3.113419938087463, "ce_loss_2": 4.145492124557495, "ce_loss_4": 3.6948423862457274, "ce_loss_9": 3.3198219656944277, "epoch": 0.612, "grad_norm": 1152.0, "kl_loss_13": 130.71346855163574, "kl_loss_2": 2194.3815979003907, "kl_loss_4": 1310.7994689941406, "kl_loss_9": 492.0315368652344, "learning_rate": 0.00033347053645005966, "loss": 1002.9774, "step": 6120 }, { "ce_loss_13": 3.269877278804779, "ce_loss_17": 3.2185349822044373, "ce_loss_2": 4.192396342754364, "ce_loss_4": 3.7749398946762085, "ce_loss_9": 3.4194448828697204, "epoch": 0.613, "grad_norm": 1136.0, "kl_loss_13": 128.15192909240722, "kl_loss_2": 2083.019189453125, "kl_loss_4": 1268.9011535644531, "kl_loss_9": 482.2656509399414, "learning_rate": 0.00033197530339228485, "loss": 1011.8859, "step": 6130 }, { "ce_loss_13": 3.225922727584839, "ce_loss_17": 3.1722275495529173, "ce_loss_2": 4.186046195030213, "ce_loss_4": 3.751970136165619, "ce_loss_9": 3.382258725166321, "epoch": 0.614, "grad_norm": 1272.0, "kl_loss_13": 131.46910705566407, "kl_loss_2": 2133.47294921875, "kl_loss_4": 1295.141046142578, "kl_loss_9": 497.4697860717773, "learning_rate": 0.00033048176234133967, "loss": 1009.9988, "step": 6140 }, { "ce_loss_13": 3.2190201759338377, "ce_loss_17": 3.1679511189460756, "ce_loss_2": 4.161319148540497, "ce_loss_4": 3.7299156904220583, "ce_loss_9": 3.366185426712036, "epoch": 0.615, "grad_norm": 1256.0, "kl_loss_13": 131.02049140930177, "kl_loss_2": 2132.594122314453, "kl_loss_4": 1294.4288146972656, "kl_loss_9": 490.61582946777344, "learning_rate": 0.0003289899283371657, "loss": 1017.3633, "step": 6150 }, { "ce_loss_13": 3.2317207932472227, "ce_loss_17": 3.181110608577728, "ce_loss_2": 4.203591299057007, "ce_loss_4": 3.756625533103943, "ce_loss_9": 3.382295918464661, "epoch": 0.616, "grad_norm": 1448.0, "kl_loss_13": 128.82185440063478, "kl_loss_2": 2146.947637939453, "kl_loss_4": 1283.8769958496093, "kl_loss_9": 482.7029403686523, "learning_rate": 0.0003274998164025148, "loss": 1026.5387, "step": 6160 }, { "ce_loss_13": 3.266766357421875, "ce_loss_17": 3.215480101108551, "ce_loss_2": 4.213196730613708, "ce_loss_4": 3.7787778258323668, "ce_loss_9": 3.418456768989563, "epoch": 0.617, "grad_norm": 1296.0, "kl_loss_13": 131.57930374145508, "kl_loss_2": 2129.7817932128905, "kl_loss_4": 1292.385076904297, "kl_loss_9": 494.99120330810547, "learning_rate": 0.0003260114415427975, "loss": 1032.8176, "step": 6170 }, { "ce_loss_13": 3.190107834339142, "ce_loss_17": 3.139162743091583, "ce_loss_2": 4.167746233940124, "ce_loss_4": 3.719817781448364, "ce_loss_9": 3.3410837650299072, "epoch": 0.618, "grad_norm": 1048.0, "kl_loss_13": 129.93679962158203, "kl_loss_2": 2185.32041015625, "kl_loss_4": 1304.9652587890625, "kl_loss_9": 484.50521087646484, "learning_rate": 0.0003245248187459323, "loss": 1035.5531, "step": 6180 }, { "ce_loss_13": 3.1807469844818117, "ce_loss_17": 3.132547652721405, "ce_loss_2": 4.106760859489441, "ce_loss_4": 3.6807324171066282, "ce_loss_9": 3.3244411826133726, "epoch": 0.619, "grad_norm": 988.0, "kl_loss_13": 125.5615966796875, "kl_loss_2": 2094.667272949219, "kl_loss_4": 1258.893084716797, "kl_loss_9": 473.79725646972656, "learning_rate": 0.00032303996298219416, "loss": 991.8865, "step": 6190 }, { "ce_loss_13": 3.2585551142692566, "ce_loss_17": 3.2073059678077698, "ce_loss_2": 4.1867189645767215, "ce_loss_4": 3.764932107925415, "ce_loss_9": 3.4036827087402344, "epoch": 0.62, "grad_norm": 1056.0, "kl_loss_13": 127.553950881958, "kl_loss_2": 2078.279748535156, "kl_loss_4": 1258.1490112304687, "kl_loss_9": 473.6962066650391, "learning_rate": 0.00032155688920406414, "loss": 990.6666, "step": 6200 }, { "ce_loss_13": 3.1700613379478453, "ce_loss_17": 3.11624915599823, "ce_loss_2": 4.160960614681244, "ce_loss_4": 3.7094520211219786, "ce_loss_9": 3.320200538635254, "epoch": 0.621, "grad_norm": 2240.0, "kl_loss_13": 131.5586887359619, "kl_loss_2": 2203.723388671875, "kl_loss_4": 1322.1176147460938, "kl_loss_9": 490.03852996826174, "learning_rate": 0.0003200756123460788, "loss": 1043.5803, "step": 6210 }, { "ce_loss_13": 3.206683027744293, "ce_loss_17": 3.1536587119102477, "ce_loss_2": 4.191497254371643, "ce_loss_4": 3.744880759716034, "ce_loss_9": 3.3635590791702272, "epoch": 0.622, "grad_norm": 1032.0, "kl_loss_13": 132.8932300567627, "kl_loss_2": 2204.233435058594, "kl_loss_4": 1323.6894104003907, "kl_loss_9": 500.33721618652345, "learning_rate": 0.00031859614732467957, "loss": 1037.4636, "step": 6220 }, { "ce_loss_13": 3.254208278656006, "ce_loss_17": 3.2018776655197145, "ce_loss_2": 4.194790470600128, "ce_loss_4": 3.758828341960907, "ce_loss_9": 3.4017786502838137, "epoch": 0.623, "grad_norm": 924.0, "kl_loss_13": 127.9259750366211, "kl_loss_2": 2118.521319580078, "kl_loss_4": 1272.2480041503907, "kl_loss_9": 482.4797988891602, "learning_rate": 0.00031711850903806275, "loss": 997.539, "step": 6230 }, { "ce_loss_13": 3.16369389295578, "ce_loss_17": 3.110656213760376, "ce_loss_2": 4.143066620826721, "ce_loss_4": 3.6980632305145265, "ce_loss_9": 3.3198477745056154, "epoch": 0.624, "grad_norm": 1144.0, "kl_loss_13": 132.46065940856934, "kl_loss_2": 2191.6250122070314, "kl_loss_4": 1318.770654296875, "kl_loss_9": 499.1682388305664, "learning_rate": 0.0003156427123660297, "loss": 1013.4273, "step": 6240 }, { "ce_loss_13": 3.2465111136436464, "ce_loss_17": 3.195686936378479, "ce_loss_2": 4.171822130680084, "ce_loss_4": 3.756170332431793, "ce_loss_9": 3.400746989250183, "epoch": 0.625, "grad_norm": 1520.0, "kl_loss_13": 128.69440422058105, "kl_loss_2": 2087.3688049316406, "kl_loss_4": 1274.438427734375, "kl_loss_9": 485.6867385864258, "learning_rate": 0.0003141687721698363, "loss": 1008.8062, "step": 6250 }, { "ce_loss_13": 3.219618356227875, "ce_loss_17": 3.1701894402503967, "ce_loss_2": 4.127357029914856, "ce_loss_4": 3.706602966785431, "ce_loss_9": 3.358958327770233, "epoch": 0.626, "grad_norm": 1360.0, "kl_loss_13": 124.10366249084473, "kl_loss_2": 2050.1990661621094, "kl_loss_4": 1228.9421691894531, "kl_loss_9": 464.08078918457034, "learning_rate": 0.00031269670329204396, "loss": 992.9805, "step": 6260 }, { "ce_loss_13": 3.258836364746094, "ce_loss_17": 3.2088966250419615, "ce_loss_2": 4.179555296897888, "ce_loss_4": 3.7659216642379763, "ce_loss_9": 3.4032238245010378, "epoch": 0.627, "grad_norm": 1128.0, "kl_loss_13": 129.05922622680663, "kl_loss_2": 2086.2775390625, "kl_loss_4": 1270.4315856933595, "kl_loss_9": 483.4391357421875, "learning_rate": 0.00031122652055637015, "loss": 1006.9918, "step": 6270 }, { "ce_loss_13": 3.2206938743591307, "ce_loss_17": 3.1703152894973754, "ce_loss_2": 4.182471299171448, "ce_loss_4": 3.739673984050751, "ce_loss_9": 3.371255946159363, "epoch": 0.628, "grad_norm": 928.0, "kl_loss_13": 129.65279655456544, "kl_loss_2": 2182.9799926757814, "kl_loss_4": 1312.7373107910157, "kl_loss_9": 491.89912109375, "learning_rate": 0.0003097582387675385, "loss": 1004.4152, "step": 6280 }, { "ce_loss_13": 3.2597048759460447, "ce_loss_17": 3.2095568656921385, "ce_loss_2": 4.195190346240997, "ce_loss_4": 3.7695303440093992, "ce_loss_9": 3.408683383464813, "epoch": 0.629, "grad_norm": 1208.0, "kl_loss_13": 129.30196952819824, "kl_loss_2": 2125.948077392578, "kl_loss_4": 1283.5384155273437, "kl_loss_9": 488.7808639526367, "learning_rate": 0.00030829187271113034, "loss": 1001.7178, "step": 6290 }, { "ce_loss_13": 3.249171268939972, "ce_loss_17": 3.199721872806549, "ce_loss_2": 4.184929835796356, "ce_loss_4": 3.7491312623023987, "ce_loss_9": 3.3924484133720396, "epoch": 0.63, "grad_norm": 1632.0, "kl_loss_13": 126.91181449890136, "kl_loss_2": 2092.1454345703123, "kl_loss_4": 1260.2350830078126, "kl_loss_9": 473.36788177490234, "learning_rate": 0.00030682743715343565, "loss": 1011.8248, "step": 6300 }, { "ce_loss_13": 3.1999635815620424, "ce_loss_17": 3.1475187659263613, "ce_loss_2": 4.172281873226166, "ce_loss_4": 3.73866411447525, "ce_loss_9": 3.355207014083862, "epoch": 0.631, "grad_norm": 912.0, "kl_loss_13": 133.29754943847655, "kl_loss_2": 2151.2015869140623, "kl_loss_4": 1310.7513244628906, "kl_loss_9": 495.34071350097656, "learning_rate": 0.0003053649468413043, "loss": 1031.6596, "step": 6310 }, { "ce_loss_13": 3.313090777397156, "ce_loss_17": 3.2586158514022827, "ce_loss_2": 4.246221339702606, "ce_loss_4": 3.8168843150138856, "ce_loss_9": 3.461293375492096, "epoch": 0.632, "grad_norm": 1032.0, "kl_loss_13": 131.5526538848877, "kl_loss_2": 2112.089636230469, "kl_loss_4": 1284.9698364257813, "kl_loss_9": 490.3696807861328, "learning_rate": 0.00030390441650199725, "loss": 1000.348, "step": 6320 }, { "ce_loss_13": 3.2131977438926698, "ce_loss_17": 3.161661946773529, "ce_loss_2": 4.162908935546875, "ce_loss_4": 3.7340385556221007, "ce_loss_9": 3.365513300895691, "epoch": 0.633, "grad_norm": 1096.0, "kl_loss_13": 128.31047248840332, "kl_loss_2": 2118.387255859375, "kl_loss_4": 1278.6016540527344, "kl_loss_9": 483.0653961181641, "learning_rate": 0.00030244586084303903, "loss": 999.3161, "step": 6330 }, { "ce_loss_13": 3.1839751839637755, "ce_loss_17": 3.1333404779434204, "ce_loss_2": 4.154147386550903, "ce_loss_4": 3.720763146877289, "ce_loss_9": 3.3386492133140564, "epoch": 0.634, "grad_norm": 1008.0, "kl_loss_13": 131.95327186584473, "kl_loss_2": 2192.22958984375, "kl_loss_4": 1327.3144897460938, "kl_loss_9": 496.0767120361328, "learning_rate": 0.00030098929455206903, "loss": 1010.0518, "step": 6340 }, { "ce_loss_13": 3.1903794765472413, "ce_loss_17": 3.1393781542778014, "ce_loss_2": 4.145576643943786, "ce_loss_4": 3.6984294414520265, "ce_loss_9": 3.3323403120040895, "epoch": 0.635, "grad_norm": 1080.0, "kl_loss_13": 126.84613990783691, "kl_loss_2": 2154.019696044922, "kl_loss_4": 1283.5807189941406, "kl_loss_9": 480.33143920898436, "learning_rate": 0.00029953473229669324, "loss": 1037.7311, "step": 6350 }, { "ce_loss_13": 3.2186363458633425, "ce_loss_17": 3.1682431578636168, "ce_loss_2": 4.177224063873291, "ce_loss_4": 3.742785966396332, "ce_loss_9": 3.371280241012573, "epoch": 0.636, "grad_norm": 940.0, "kl_loss_13": 127.65351753234863, "kl_loss_2": 2136.384002685547, "kl_loss_4": 1289.66396484375, "kl_loss_9": 486.83058166503906, "learning_rate": 0.00029808218872433767, "loss": 998.3021, "step": 6360 }, { "ce_loss_13": 3.2746296405792235, "ce_loss_17": 3.2223209500312806, "ce_loss_2": 4.207637786865234, "ce_loss_4": 3.7828882575035094, "ce_loss_9": 3.422584354877472, "epoch": 0.637, "grad_norm": 948.0, "kl_loss_13": 128.49758110046386, "kl_loss_2": 2115.7898559570312, "kl_loss_4": 1277.8892517089844, "kl_loss_9": 482.4291137695312, "learning_rate": 0.0002966316784621, "loss": 991.801, "step": 6370 }, { "ce_loss_13": 3.192687964439392, "ce_loss_17": 3.1387984275817873, "ce_loss_2": 4.163321709632873, "ce_loss_4": 3.725724518299103, "ce_loss_9": 3.341627466678619, "epoch": 0.638, "grad_norm": 912.0, "kl_loss_13": 131.47407188415528, "kl_loss_2": 2171.826025390625, "kl_loss_4": 1311.678533935547, "kl_loss_9": 492.5411651611328, "learning_rate": 0.0002951832161166024, "loss": 1003.1099, "step": 6380 }, { "ce_loss_13": 3.2633447885513305, "ce_loss_17": 3.2101088523864747, "ce_loss_2": 4.2188562750816345, "ce_loss_4": 3.787789022922516, "ce_loss_9": 3.4133699417114256, "epoch": 0.639, "grad_norm": 900.0, "kl_loss_13": 131.57433738708497, "kl_loss_2": 2135.934393310547, "kl_loss_4": 1290.1091003417969, "kl_loss_9": 488.0015365600586, "learning_rate": 0.0002937368162738445, "loss": 993.2084, "step": 6390 }, { "ce_loss_13": 3.210015070438385, "ce_loss_17": 3.1634300470352175, "ce_loss_2": 4.153219938278198, "ce_loss_4": 3.7185278296470643, "ce_loss_9": 3.3515684962272645, "epoch": 0.64, "grad_norm": 1096.0, "kl_loss_13": 124.47242279052735, "kl_loss_2": 2132.0518310546877, "kl_loss_4": 1278.2462280273437, "kl_loss_9": 472.7954605102539, "learning_rate": 0.0002922924934990568, "loss": 1013.5273, "step": 6400 }, { "ce_loss_13": 3.149289035797119, "ce_loss_17": 3.0991878867149354, "ce_loss_2": 4.1367835521698, "ce_loss_4": 3.6886163711547852, "ce_loss_9": 3.298277771472931, "epoch": 0.641, "grad_norm": 1056.0, "kl_loss_13": 129.38651847839355, "kl_loss_2": 2210.4976440429687, "kl_loss_4": 1331.8399291992187, "kl_loss_9": 492.8064361572266, "learning_rate": 0.0002908502623365536, "loss": 1019.5078, "step": 6410 }, { "ce_loss_13": 3.081189048290253, "ce_loss_17": 3.030816686153412, "ce_loss_2": 4.084830868244171, "ce_loss_4": 3.6201751470565795, "ce_loss_9": 3.235972213745117, "epoch": 0.642, "grad_norm": 1064.0, "kl_loss_13": 126.50981216430664, "kl_loss_2": 2236.6906860351564, "kl_loss_4": 1332.9084411621093, "kl_loss_9": 493.32887573242186, "learning_rate": 0.0002894101373095867, "loss": 1024.0775, "step": 6420 }, { "ce_loss_13": 3.2900022864341736, "ce_loss_17": 3.240232455730438, "ce_loss_2": 4.217806363105774, "ce_loss_4": 3.795979070663452, "ce_loss_9": 3.435810911655426, "epoch": 0.643, "grad_norm": 1200.0, "kl_loss_13": 130.18300704956056, "kl_loss_2": 2108.4472778320314, "kl_loss_4": 1272.9562927246093, "kl_loss_9": 485.9696578979492, "learning_rate": 0.00028797213292019926, "loss": 1003.0688, "step": 6430 }, { "ce_loss_13": 3.270418703556061, "ce_loss_17": 3.2178316354751586, "ce_loss_2": 4.210356426239014, "ce_loss_4": 3.7777705430984496, "ce_loss_9": 3.4157944202423094, "epoch": 0.644, "grad_norm": 968.0, "kl_loss_13": 130.0338954925537, "kl_loss_2": 2128.2608032226562, "kl_loss_4": 1279.8341979980469, "kl_loss_9": 488.6791137695312, "learning_rate": 0.0002865362636490791, "loss": 1022.8849, "step": 6440 }, { "ce_loss_13": 3.286911189556122, "ce_loss_17": 3.2367611169815063, "ce_loss_2": 4.217555093765259, "ce_loss_4": 3.7942359924316404, "ce_loss_9": 3.431019830703735, "epoch": 0.645, "grad_norm": 1360.0, "kl_loss_13": 127.99123420715333, "kl_loss_2": 2108.8230102539064, "kl_loss_4": 1282.9461730957032, "kl_loss_9": 483.26610107421874, "learning_rate": 0.0002851025439554142, "loss": 999.0702, "step": 6450 }, { "ce_loss_13": 3.272003960609436, "ce_loss_17": 3.216594707965851, "ce_loss_2": 4.197656345367432, "ce_loss_4": 3.7823782444000242, "ce_loss_9": 3.422859716415405, "epoch": 0.646, "grad_norm": 1264.0, "kl_loss_13": 129.48432121276855, "kl_loss_2": 2081.2620361328127, "kl_loss_4": 1270.5076538085937, "kl_loss_9": 485.07206573486326, "learning_rate": 0.00028367098827674573, "loss": 996.067, "step": 6460 }, { "ce_loss_13": 3.1997097969055175, "ce_loss_17": 3.1473405003547668, "ce_loss_2": 4.153258490562439, "ce_loss_4": 3.709350216388702, "ce_loss_9": 3.342746710777283, "epoch": 0.647, "grad_norm": 1040.0, "kl_loss_13": 126.6732219696045, "kl_loss_2": 2126.6481994628907, "kl_loss_4": 1264.577655029297, "kl_loss_9": 476.38092651367185, "learning_rate": 0.00028224161102882397, "loss": 1005.2549, "step": 6470 }, { "ce_loss_13": 3.1756999135017394, "ce_loss_17": 3.1306716084480284, "ce_loss_2": 4.107118356227875, "ce_loss_4": 3.6820710778236387, "ce_loss_9": 3.319660496711731, "epoch": 0.648, "grad_norm": 1272.0, "kl_loss_13": 125.03843040466309, "kl_loss_2": 2088.859942626953, "kl_loss_4": 1269.54296875, "kl_loss_9": 474.3402862548828, "learning_rate": 0.00028081442660546124, "loss": 1000.5469, "step": 6480 }, { "ce_loss_13": 3.2422716856002807, "ce_loss_17": 3.190127694606781, "ce_loss_2": 4.172718966007233, "ce_loss_4": 3.7388591408729552, "ce_loss_9": 3.385196661949158, "epoch": 0.649, "grad_norm": 1264.0, "kl_loss_13": 130.25692329406738, "kl_loss_2": 2104.5678771972657, "kl_loss_4": 1256.6305847167969, "kl_loss_9": 482.76637268066406, "learning_rate": 0.0002793894493783892, "loss": 1002.3529, "step": 6490 }, { "ce_loss_13": 3.259835934638977, "ce_loss_17": 3.2097046256065367, "ce_loss_2": 4.186160254478454, "ce_loss_4": 3.7537524342536925, "ce_loss_9": 3.399143099784851, "epoch": 0.65, "grad_norm": 1280.0, "kl_loss_13": 126.66931762695313, "kl_loss_2": 2094.7777465820313, "kl_loss_4": 1250.23759765625, "kl_loss_9": 473.72518768310545, "learning_rate": 0.0002779666936971129, "loss": 988.9448, "step": 6500 }, { "ce_loss_13": 3.2655163645744323, "ce_loss_17": 3.2162885427474976, "ce_loss_2": 4.220812892913818, "ce_loss_4": 3.7836662411689757, "ce_loss_9": 3.417439317703247, "epoch": 0.651, "grad_norm": 1240.0, "kl_loss_13": 128.76927223205567, "kl_loss_2": 2144.120294189453, "kl_loss_4": 1293.142724609375, "kl_loss_9": 489.02964477539064, "learning_rate": 0.00027654617388876614, "loss": 1015.7054, "step": 6510 }, { "ce_loss_13": 3.2891767501831053, "ce_loss_17": 3.238159012794495, "ce_loss_2": 4.218898463249206, "ce_loss_4": 3.79874507188797, "ce_loss_9": 3.4348693013191225, "epoch": 0.652, "grad_norm": 880.0, "kl_loss_13": 129.32936744689943, "kl_loss_2": 2110.9775329589843, "kl_loss_4": 1277.5022888183594, "kl_loss_9": 479.2207733154297, "learning_rate": 0.0002751279042579672, "loss": 1001.8629, "step": 6520 }, { "ce_loss_13": 3.2326087832450865, "ce_loss_17": 3.1812520384788514, "ce_loss_2": 4.168764638900757, "ce_loss_4": 3.7309839606285093, "ce_loss_9": 3.3788968563079833, "epoch": 0.653, "grad_norm": 1056.0, "kl_loss_13": 125.64515800476075, "kl_loss_2": 2109.486541748047, "kl_loss_4": 1252.6902893066406, "kl_loss_9": 476.9432205200195, "learning_rate": 0.00027371189908667604, "loss": 1012.3269, "step": 6530 }, { "ce_loss_13": 3.284346079826355, "ce_loss_17": 3.230403184890747, "ce_loss_2": 4.265842258930206, "ce_loss_4": 3.819449806213379, "ce_loss_9": 3.4391526222229003, "epoch": 0.654, "grad_norm": 1256.0, "kl_loss_13": 133.72423400878907, "kl_loss_2": 2193.1118774414062, "kl_loss_4": 1317.9974487304687, "kl_loss_9": 494.0942123413086, "learning_rate": 0.00027229817263404863, "loss": 1036.9691, "step": 6540 }, { "ce_loss_13": 3.267036032676697, "ce_loss_17": 3.2153607606887817, "ce_loss_2": 4.1564322233200075, "ce_loss_4": 3.756962776184082, "ce_loss_9": 3.407789874076843, "epoch": 0.655, "grad_norm": 1592.0, "kl_loss_13": 126.88279800415039, "kl_loss_2": 2033.7720153808593, "kl_loss_4": 1242.9439819335937, "kl_loss_9": 472.1895324707031, "learning_rate": 0.0002708867391362948, "loss": 989.6039, "step": 6550 }, { "ce_loss_13": 3.2516453862190247, "ce_loss_17": 3.200037109851837, "ce_loss_2": 4.161775660514832, "ce_loss_4": 3.7357085227966307, "ce_loss_9": 3.3896394968032837, "epoch": 0.656, "grad_norm": 1040.0, "kl_loss_13": 125.41157989501953, "kl_loss_2": 2046.0890075683594, "kl_loss_4": 1215.223876953125, "kl_loss_9": 463.0111740112305, "learning_rate": 0.0002694776128065345, "loss": 988.7551, "step": 6560 }, { "ce_loss_13": 3.185749578475952, "ce_loss_17": 3.133409357070923, "ce_loss_2": 4.128285074234009, "ce_loss_4": 3.703213024139404, "ce_loss_9": 3.332879328727722, "epoch": 0.657, "grad_norm": 1200.0, "kl_loss_13": 129.56677360534667, "kl_loss_2": 2138.4994995117186, "kl_loss_4": 1298.754376220703, "kl_loss_9": 490.5355499267578, "learning_rate": 0.00026807080783465374, "loss": 994.415, "step": 6570 }, { "ce_loss_13": 3.2867953300476076, "ce_loss_17": 3.236121320724487, "ce_loss_2": 4.237146317958832, "ce_loss_4": 3.8103081464767454, "ce_loss_9": 3.4368946075439455, "epoch": 0.658, "grad_norm": 1208.0, "kl_loss_13": 130.34469261169434, "kl_loss_2": 2130.664013671875, "kl_loss_4": 1295.0588317871093, "kl_loss_9": 488.9468338012695, "learning_rate": 0.00026666633838716316, "loss": 1016.3762, "step": 6580 }, { "ce_loss_13": 3.189263570308685, "ce_loss_17": 3.1374593496322634, "ce_loss_2": 4.149912416934967, "ce_loss_4": 3.7104940176010133, "ce_loss_9": 3.340337836742401, "epoch": 0.659, "grad_norm": 1344.0, "kl_loss_13": 130.73049201965333, "kl_loss_2": 2157.892962646484, "kl_loss_4": 1302.415185546875, "kl_loss_9": 491.5246841430664, "learning_rate": 0.00026526421860705474, "loss": 1025.1493, "step": 6590 }, { "ce_loss_13": 3.213017702102661, "ce_loss_17": 3.1631707668304445, "ce_loss_2": 4.169034993648529, "ce_loss_4": 3.73437956571579, "ce_loss_9": 3.362373471260071, "epoch": 0.66, "grad_norm": 1200.0, "kl_loss_13": 131.15017623901366, "kl_loss_2": 2140.445361328125, "kl_loss_4": 1294.1627563476563, "kl_loss_9": 490.9545104980469, "learning_rate": 0.0002638644626136587, "loss": 1005.0781, "step": 6600 }, { "ce_loss_13": 3.2263850927352906, "ce_loss_17": 3.176849550008774, "ce_loss_2": 4.171831655502319, "ce_loss_4": 3.739851486682892, "ce_loss_9": 3.3724207401275637, "epoch": 0.661, "grad_norm": 1104.0, "kl_loss_13": 127.10174674987793, "kl_loss_2": 2111.197882080078, "kl_loss_4": 1278.4236694335937, "kl_loss_9": 479.78526611328124, "learning_rate": 0.00026246708450250255, "loss": 1002.6556, "step": 6610 }, { "ce_loss_13": 3.214787483215332, "ce_loss_17": 3.1638757944107057, "ce_loss_2": 4.146944725513459, "ce_loss_4": 3.7195912837982177, "ce_loss_9": 3.3586400151252747, "epoch": 0.662, "grad_norm": 1120.0, "kl_loss_13": 126.75207710266113, "kl_loss_2": 2105.3037292480467, "kl_loss_4": 1265.7380981445312, "kl_loss_9": 477.26183319091797, "learning_rate": 0.00026107209834516854, "loss": 999.451, "step": 6620 }, { "ce_loss_13": 3.1704505920410155, "ce_loss_17": 3.120170783996582, "ce_loss_2": 4.159893441200256, "ce_loss_4": 3.711583709716797, "ce_loss_9": 3.3204343795776365, "epoch": 0.663, "grad_norm": 1184.0, "kl_loss_13": 128.71349868774413, "kl_loss_2": 2203.2578979492187, "kl_loss_4": 1324.3592468261718, "kl_loss_9": 488.4567565917969, "learning_rate": 0.0002596795181891514, "loss": 1026.6699, "step": 6630 }, { "ce_loss_13": 3.182750606536865, "ce_loss_17": 3.1276692867279055, "ce_loss_2": 4.144295132160186, "ce_loss_4": 3.7062558770179748, "ce_loss_9": 3.3308401226997377, "epoch": 0.664, "grad_norm": 1840.0, "kl_loss_13": 132.57133026123046, "kl_loss_2": 2164.5920837402346, "kl_loss_4": 1306.9556274414062, "kl_loss_9": 495.95313873291013, "learning_rate": 0.000258289358057718, "loss": 1050.9351, "step": 6640 }, { "ce_loss_13": 3.2482026696205137, "ce_loss_17": 3.193741512298584, "ce_loss_2": 4.212718522548675, "ce_loss_4": 3.7724199533462524, "ce_loss_9": 3.40000319480896, "epoch": 0.665, "grad_norm": 1072.0, "kl_loss_13": 133.5645538330078, "kl_loss_2": 2165.2717834472655, "kl_loss_4": 1305.2390075683593, "kl_loss_9": 499.60159454345705, "learning_rate": 0.0002569016319497657, "loss": 1021.1855, "step": 6650 }, { "ce_loss_13": 3.2325037717819214, "ce_loss_17": 3.1772287130355834, "ce_loss_2": 4.187597799301147, "ce_loss_4": 3.7505903840065002, "ce_loss_9": 3.387890040874481, "epoch": 0.666, "grad_norm": 776.0, "kl_loss_13": 132.4701675415039, "kl_loss_2": 2155.7009033203126, "kl_loss_4": 1304.11435546875, "kl_loss_9": 497.5166931152344, "learning_rate": 0.00025551635383968066, "loss": 1029.0193, "step": 6660 }, { "ce_loss_13": 3.1546674609184264, "ce_loss_17": 3.1015603065490724, "ce_loss_2": 4.116156685352325, "ce_loss_4": 3.670377993583679, "ce_loss_9": 3.299027180671692, "epoch": 0.667, "grad_norm": 1288.0, "kl_loss_13": 130.69430046081544, "kl_loss_2": 2175.8706115722657, "kl_loss_4": 1302.7461791992187, "kl_loss_9": 492.6998886108398, "learning_rate": 0.00025413353767719804, "loss": 1018.743, "step": 6670 }, { "ce_loss_13": 3.2089860677719115, "ce_loss_17": 3.1597649693489074, "ce_loss_2": 4.1520216226577755, "ce_loss_4": 3.720534586906433, "ce_loss_9": 3.3542774081230164, "epoch": 0.668, "grad_norm": 912.0, "kl_loss_13": 125.52513236999512, "kl_loss_2": 2136.416613769531, "kl_loss_4": 1289.7745788574218, "kl_loss_9": 479.05759582519534, "learning_rate": 0.0002527531973872617, "loss": 1011.9354, "step": 6680 }, { "ce_loss_13": 3.2230855226516724, "ce_loss_17": 3.171434426307678, "ce_loss_2": 4.161099433898926, "ce_loss_4": 3.7342797040939333, "ce_loss_9": 3.366728365421295, "epoch": 0.669, "grad_norm": 1120.0, "kl_loss_13": 127.30257530212403, "kl_loss_2": 2122.2246276855467, "kl_loss_4": 1280.1904296875, "kl_loss_9": 480.8235061645508, "learning_rate": 0.0002513753468698826, "loss": 1002.3262, "step": 6690 }, { "ce_loss_13": 3.186684989929199, "ce_loss_17": 3.1340667366981507, "ce_loss_2": 4.154286897182464, "ce_loss_4": 3.710269594192505, "ce_loss_9": 3.3389665246009828, "epoch": 0.67, "grad_norm": 1192.0, "kl_loss_13": 130.9047824859619, "kl_loss_2": 2175.153350830078, "kl_loss_4": 1302.5615844726562, "kl_loss_9": 493.9788070678711, "learning_rate": 0.0002500000000000001, "loss": 1020.2188, "step": 6700 }, { "ce_loss_13": 3.298756146430969, "ce_loss_17": 3.2508628487586977, "ce_loss_2": 4.190404391288757, "ce_loss_4": 3.788142991065979, "ce_loss_9": 3.440716302394867, "epoch": 0.671, "grad_norm": 1012.0, "kl_loss_13": 125.61447410583496, "kl_loss_2": 2032.161199951172, "kl_loss_4": 1241.647802734375, "kl_loss_9": 469.51317291259767, "learning_rate": 0.0002486271706273421, "loss": 1013.3594, "step": 6710 }, { "ce_loss_13": 3.2390043139457703, "ce_loss_17": 3.194638454914093, "ce_loss_2": 4.137998723983765, "ce_loss_4": 3.727764165401459, "ce_loss_9": 3.379214346408844, "epoch": 0.672, "grad_norm": 1448.0, "kl_loss_13": 124.17767028808593, "kl_loss_2": 2043.6036743164063, "kl_loss_4": 1235.2710998535156, "kl_loss_9": 467.24889221191404, "learning_rate": 0.0002472568725762853, "loss": 996.9588, "step": 6720 }, { "ce_loss_13": 3.228515100479126, "ce_loss_17": 3.179260289669037, "ce_loss_2": 4.133379244804383, "ce_loss_4": 3.716725027561188, "ce_loss_9": 3.370402956008911, "epoch": 0.673, "grad_norm": 1168.0, "kl_loss_13": 124.16192741394043, "kl_loss_2": 2062.637451171875, "kl_loss_4": 1241.2323913574219, "kl_loss_9": 464.9031127929687, "learning_rate": 0.00024588911964571554, "loss": 985.1641, "step": 6730 }, { "ce_loss_13": 3.2452532172203066, "ce_loss_17": 3.188890743255615, "ce_loss_2": 4.215212368965149, "ce_loss_4": 3.7804683208465577, "ce_loss_9": 3.40309339761734, "epoch": 0.674, "grad_norm": 964.0, "kl_loss_13": 134.8435428619385, "kl_loss_2": 2169.33310546875, "kl_loss_4": 1320.0959838867188, "kl_loss_9": 503.6511764526367, "learning_rate": 0.00024452392560888974, "loss": 1011.4004, "step": 6740 }, { "ce_loss_13": 3.136171591281891, "ce_loss_17": 3.086545693874359, "ce_loss_2": 4.08036699295044, "ce_loss_4": 3.6442180037498475, "ce_loss_9": 3.284239888191223, "epoch": 0.675, "grad_norm": 1136.0, "kl_loss_13": 125.32639236450196, "kl_loss_2": 2123.318218994141, "kl_loss_4": 1275.6392517089844, "kl_loss_9": 476.06225433349607, "learning_rate": 0.00024316130421329695, "loss": 994.5338, "step": 6750 }, { "ce_loss_13": 3.217696189880371, "ce_loss_17": 3.168676507472992, "ce_loss_2": 4.14402482509613, "ce_loss_4": 3.7269794821739195, "ce_loss_9": 3.3610087633132935, "epoch": 0.676, "grad_norm": 984.0, "kl_loss_13": 126.22435035705567, "kl_loss_2": 2083.14296875, "kl_loss_4": 1259.228985595703, "kl_loss_9": 471.91710662841797, "learning_rate": 0.00024180126918051909, "loss": 997.5952, "step": 6760 }, { "ce_loss_13": 3.261302137374878, "ce_loss_17": 3.209616231918335, "ce_loss_2": 4.188631618022919, "ce_loss_4": 3.7591268181800843, "ce_loss_9": 3.4110424399375914, "epoch": 0.677, "grad_norm": 1144.0, "kl_loss_13": 127.95707626342774, "kl_loss_2": 2091.1271484375, "kl_loss_4": 1253.4050537109374, "kl_loss_9": 477.76488037109374, "learning_rate": 0.00024044383420609406, "loss": 985.7389, "step": 6770 }, { "ce_loss_13": 3.270089292526245, "ce_loss_17": 3.2222145199775696, "ce_loss_2": 4.169995367527008, "ce_loss_4": 3.757718729972839, "ce_loss_9": 3.412195098400116, "epoch": 0.678, "grad_norm": 1408.0, "kl_loss_13": 125.79314193725585, "kl_loss_2": 2068.732110595703, "kl_loss_4": 1252.4252319335938, "kl_loss_9": 471.68780822753905, "learning_rate": 0.00023908901295937712, "loss": 1006.3748, "step": 6780 }, { "ce_loss_13": 3.260562789440155, "ce_loss_17": 3.2077584743499754, "ce_loss_2": 4.176387059688568, "ce_loss_4": 3.755878007411957, "ce_loss_9": 3.4027205109596252, "epoch": 0.679, "grad_norm": 1528.0, "kl_loss_13": 127.13752517700195, "kl_loss_2": 2065.2238708496093, "kl_loss_4": 1248.0218017578125, "kl_loss_9": 472.13244781494143, "learning_rate": 0.00023773681908340283, "loss": 1005.6818, "step": 6790 }, { "ce_loss_13": 3.2425795793533325, "ce_loss_17": 3.1866623997688293, "ce_loss_2": 4.2025432825088505, "ce_loss_4": 3.7606682419776916, "ce_loss_9": 3.394603908061981, "epoch": 0.68, "grad_norm": 1456.0, "kl_loss_13": 134.19757652282715, "kl_loss_2": 2184.184014892578, "kl_loss_4": 1318.5625244140624, "kl_loss_9": 503.539404296875, "learning_rate": 0.00023638726619474876, "loss": 1035.5123, "step": 6800 }, { "ce_loss_13": 3.228830122947693, "ce_loss_17": 3.1752259731292725, "ce_loss_2": 4.21261613368988, "ce_loss_4": 3.7664615154266357, "ce_loss_9": 3.382519745826721, "epoch": 0.681, "grad_norm": 1376.0, "kl_loss_13": 130.8710552215576, "kl_loss_2": 2190.7125244140625, "kl_loss_4": 1318.4497436523438, "kl_loss_9": 490.83096771240236, "learning_rate": 0.0002350403678833976, "loss": 1019.0072, "step": 6810 }, { "ce_loss_13": 3.1595004320144655, "ce_loss_17": 3.107949662208557, "ce_loss_2": 4.116284835338592, "ce_loss_4": 3.68047810792923, "ce_loss_9": 3.306431496143341, "epoch": 0.682, "grad_norm": 1064.0, "kl_loss_13": 126.64262771606445, "kl_loss_2": 2160.460656738281, "kl_loss_4": 1308.418701171875, "kl_loss_9": 484.2623321533203, "learning_rate": 0.00023369613771260007, "loss": 1005.0494, "step": 6820 }, { "ce_loss_13": 3.269521141052246, "ce_loss_17": 3.217289674282074, "ce_loss_2": 4.2214888572692875, "ce_loss_4": 3.789712381362915, "ce_loss_9": 3.417080247402191, "epoch": 0.683, "grad_norm": 1344.0, "kl_loss_13": 130.19417037963868, "kl_loss_2": 2146.8581787109374, "kl_loss_4": 1287.4227416992187, "kl_loss_9": 486.63575286865233, "learning_rate": 0.00023235458921873925, "loss": 1017.8075, "step": 6830 }, { "ce_loss_13": 3.2255374670028685, "ce_loss_17": 3.170731008052826, "ce_loss_2": 4.218565154075622, "ce_loss_4": 3.7700939655303953, "ce_loss_9": 3.3827935814857484, "epoch": 0.684, "grad_norm": 1440.0, "kl_loss_13": 133.2099739074707, "kl_loss_2": 2226.6044921875, "kl_loss_4": 1344.1361938476562, "kl_loss_9": 508.7969268798828, "learning_rate": 0.0002310157359111938, "loss": 1047.0678, "step": 6840 }, { "ce_loss_13": 3.116528046131134, "ce_loss_17": 3.0624534487724304, "ce_loss_2": 4.1585693359375, "ce_loss_4": 3.679171395301819, "ce_loss_9": 3.277780222892761, "epoch": 0.685, "grad_norm": 2192.0, "kl_loss_13": 130.5427173614502, "kl_loss_2": 2303.942761230469, "kl_loss_4": 1365.0715270996093, "kl_loss_9": 500.15128021240236, "learning_rate": 0.0002296795912722014, "loss": 1048.5195, "step": 6850 }, { "ce_loss_13": 3.2616600036621093, "ce_loss_17": 3.2107287406921388, "ce_loss_2": 4.17175680398941, "ce_loss_4": 3.757402217388153, "ce_loss_9": 3.407912719249725, "epoch": 0.686, "grad_norm": 1248.0, "kl_loss_13": 127.52902755737304, "kl_loss_2": 2076.4579833984376, "kl_loss_4": 1256.9023986816405, "kl_loss_9": 479.49196166992186, "learning_rate": 0.0002283461687567236, "loss": 981.6411, "step": 6860 }, { "ce_loss_13": 3.3144801020622254, "ce_loss_17": 3.2634603142738343, "ce_loss_2": 4.211235904693604, "ce_loss_4": 3.8075804352760314, "ce_loss_9": 3.4535457372665403, "epoch": 0.687, "grad_norm": 860.0, "kl_loss_13": 127.26225776672364, "kl_loss_2": 2034.533935546875, "kl_loss_4": 1245.3732055664063, "kl_loss_9": 470.0856002807617, "learning_rate": 0.00022701548179231045, "loss": 999.4862, "step": 6870 }, { "ce_loss_13": 3.2640109777450563, "ce_loss_17": 3.2115107655525206, "ce_loss_2": 4.213279771804809, "ce_loss_4": 3.7720092058181764, "ce_loss_9": 3.411078143119812, "epoch": 0.688, "grad_norm": 1008.0, "kl_loss_13": 129.02279777526854, "kl_loss_2": 2132.1004150390627, "kl_loss_4": 1277.8538146972655, "kl_loss_9": 482.372785949707, "learning_rate": 0.00022568754377896516, "loss": 992.0485, "step": 6880 }, { "ce_loss_13": 3.2603373646736147, "ce_loss_17": 3.20809828042984, "ce_loss_2": 4.186764764785766, "ce_loss_4": 3.7585228800773622, "ce_loss_9": 3.4065467000007628, "epoch": 0.689, "grad_norm": 1168.0, "kl_loss_13": 128.52302513122558, "kl_loss_2": 2106.8260803222656, "kl_loss_4": 1272.237530517578, "kl_loss_9": 487.9483367919922, "learning_rate": 0.00022436236808900844, "loss": 996.8523, "step": 6890 }, { "ce_loss_13": 3.155588388442993, "ce_loss_17": 3.102293407917023, "ce_loss_2": 4.111664521694183, "ce_loss_4": 3.677588188648224, "ce_loss_9": 3.3081274390220643, "epoch": 0.69, "grad_norm": 984.0, "kl_loss_13": 128.9507614135742, "kl_loss_2": 2158.237585449219, "kl_loss_4": 1292.9919067382812, "kl_loss_9": 485.42525177001954, "learning_rate": 0.00022303996806694487, "loss": 1004.0758, "step": 6900 }, { "ce_loss_13": 3.230221700668335, "ce_loss_17": 3.181375229358673, "ce_loss_2": 4.169663190841675, "ce_loss_4": 3.73693927526474, "ce_loss_9": 3.372818911075592, "epoch": 0.691, "grad_norm": 1032.0, "kl_loss_13": 125.93045959472656, "kl_loss_2": 2131.7766052246093, "kl_loss_4": 1283.9748046875, "kl_loss_9": 482.19235076904295, "learning_rate": 0.00022172035702932823, "loss": 1000.8458, "step": 6910 }, { "ce_loss_13": 3.275791335105896, "ce_loss_17": 3.2261704325675966, "ce_loss_2": 4.18710310459137, "ce_loss_4": 3.773553967475891, "ce_loss_9": 3.4210479140281675, "epoch": 0.692, "grad_norm": 1088.0, "kl_loss_13": 128.45828552246093, "kl_loss_2": 2052.8288757324217, "kl_loss_4": 1254.5195251464843, "kl_loss_9": 476.3975234985352, "learning_rate": 0.00022040354826462666, "loss": 985.902, "step": 6920 }, { "ce_loss_13": 3.2033557415008547, "ce_loss_17": 3.1543065190315245, "ce_loss_2": 4.146645784378052, "ce_loss_4": 3.712704610824585, "ce_loss_9": 3.3465515613555907, "epoch": 0.693, "grad_norm": 1712.0, "kl_loss_13": 126.86063690185547, "kl_loss_2": 2114.503106689453, "kl_loss_4": 1266.968243408203, "kl_loss_9": 473.1267807006836, "learning_rate": 0.0002190895550330899, "loss": 1003.8106, "step": 6930 }, { "ce_loss_13": 3.1384754300117494, "ce_loss_17": 3.0868613481521607, "ce_loss_2": 4.113702714443207, "ce_loss_4": 3.671848237514496, "ce_loss_9": 3.2935760021209717, "epoch": 0.694, "grad_norm": 1016.0, "kl_loss_13": 129.83692016601563, "kl_loss_2": 2167.951953125, "kl_loss_4": 1300.027362060547, "kl_loss_9": 491.8223388671875, "learning_rate": 0.00021777839056661552, "loss": 1001.3788, "step": 6940 }, { "ce_loss_13": 3.2230561852455137, "ce_loss_17": 3.1724560499191283, "ce_loss_2": 4.156371986865997, "ce_loss_4": 3.7235463738441466, "ce_loss_9": 3.364833080768585, "epoch": 0.695, "grad_norm": 1192.0, "kl_loss_13": 127.21008987426758, "kl_loss_2": 2104.394970703125, "kl_loss_4": 1263.6509033203124, "kl_loss_9": 476.91744842529295, "learning_rate": 0.0002164700680686147, "loss": 987.3193, "step": 6950 }, { "ce_loss_13": 3.2678646445274353, "ce_loss_17": 3.2189462423324584, "ce_loss_2": 4.184778940677643, "ce_loss_4": 3.7625047326087953, "ce_loss_9": 3.4117999792099, "epoch": 0.696, "grad_norm": 1376.0, "kl_loss_13": 128.22282867431642, "kl_loss_2": 2061.4692260742186, "kl_loss_4": 1240.4134826660156, "kl_loss_9": 475.67960052490236, "learning_rate": 0.0002151646007138806, "loss": 986.3299, "step": 6960 }, { "ce_loss_13": 3.1496576189994814, "ce_loss_17": 3.0974999904632567, "ce_loss_2": 4.112265968322754, "ce_loss_4": 3.6719263076782225, "ce_loss_9": 3.2994167804718018, "epoch": 0.697, "grad_norm": 1056.0, "kl_loss_13": 130.18887596130372, "kl_loss_2": 2185.0795349121095, "kl_loss_4": 1311.5925598144531, "kl_loss_9": 491.06762847900393, "learning_rate": 0.00021386200164845526, "loss": 1011.5449, "step": 6970 }, { "ce_loss_13": 3.3225492358207704, "ce_loss_17": 3.272469866275787, "ce_loss_2": 4.21777765750885, "ce_loss_4": 3.806537127494812, "ce_loss_9": 3.4647100806236266, "epoch": 0.698, "grad_norm": 996.0, "kl_loss_13": 126.68879432678223, "kl_loss_2": 2042.4563903808594, "kl_loss_4": 1241.2991760253906, "kl_loss_9": 472.38414459228517, "learning_rate": 0.0002125622839894964, "loss": 979.2768, "step": 6980 }, { "ce_loss_13": 3.267834711074829, "ce_loss_17": 3.2172841787338258, "ce_loss_2": 4.182934987545013, "ce_loss_4": 3.770441246032715, "ce_loss_9": 3.409151029586792, "epoch": 0.699, "grad_norm": 1680.0, "kl_loss_13": 125.93191146850586, "kl_loss_2": 2054.3854370117188, "kl_loss_4": 1239.4584716796876, "kl_loss_9": 468.6172821044922, "learning_rate": 0.00021126546082514663, "loss": 981.0359, "step": 6990 }, { "ce_loss_13": 3.2867854475975036, "ce_loss_17": 3.236895191669464, "ce_loss_2": 4.1952292919158936, "ce_loss_4": 3.778970551490784, "ce_loss_9": 3.427595484256744, "epoch": 0.7, "grad_norm": 1032.0, "kl_loss_13": 127.05775680541993, "kl_loss_2": 2065.2293701171875, "kl_loss_4": 1247.4901062011718, "kl_loss_9": 472.99816131591797, "learning_rate": 0.00020997154521440098, "loss": 979.7843, "step": 7000 }, { "ce_loss_13": 3.236474347114563, "ce_loss_17": 3.1884860157966615, "ce_loss_2": 4.1605191946029665, "ce_loss_4": 3.73172265291214, "ce_loss_9": 3.3765971183776857, "epoch": 0.701, "grad_norm": 1272.0, "kl_loss_13": 125.59776191711425, "kl_loss_2": 2091.3847106933595, "kl_loss_4": 1255.1470031738281, "kl_loss_9": 472.91735992431643, "learning_rate": 0.0002086805501869749, "loss": 979.9674, "step": 7010 }, { "ce_loss_13": 3.207662892341614, "ce_loss_17": 3.1555300831794737, "ce_loss_2": 4.180721688270569, "ce_loss_4": 3.7340385556221007, "ce_loss_9": 3.3620522141456606, "epoch": 0.702, "grad_norm": 948.0, "kl_loss_13": 130.0142951965332, "kl_loss_2": 2187.624139404297, "kl_loss_4": 1311.0609069824218, "kl_loss_9": 495.14405212402346, "learning_rate": 0.0002073924887431744, "loss": 1012.3102, "step": 7020 }, { "ce_loss_13": 3.211353290081024, "ce_loss_17": 3.1630847215652467, "ce_loss_2": 4.156705856323242, "ce_loss_4": 3.723443305492401, "ce_loss_9": 3.3628470420837404, "epoch": 0.703, "grad_norm": 1320.0, "kl_loss_13": 127.96535224914551, "kl_loss_2": 2135.966864013672, "kl_loss_4": 1291.9100646972656, "kl_loss_9": 482.17771606445314, "learning_rate": 0.00020610737385376348, "loss": 1028.4508, "step": 7030 }, { "ce_loss_13": 3.2688358306884764, "ce_loss_17": 3.2177406191825866, "ce_loss_2": 4.1655452966690065, "ce_loss_4": 3.762382650375366, "ce_loss_9": 3.4092161893844604, "epoch": 0.704, "grad_norm": 956.0, "kl_loss_13": 126.3967498779297, "kl_loss_2": 2040.5072448730468, "kl_loss_4": 1246.9435607910157, "kl_loss_9": 469.7414840698242, "learning_rate": 0.00020482521845983521, "loss": 1000.3937, "step": 7040 }, { "ce_loss_13": 3.26496958732605, "ce_loss_17": 3.21316157579422, "ce_loss_2": 4.20962141752243, "ce_loss_4": 3.777859330177307, "ce_loss_9": 3.410719466209412, "epoch": 0.705, "grad_norm": 1840.0, "kl_loss_13": 130.4114227294922, "kl_loss_2": 2136.3337829589846, "kl_loss_4": 1279.6558837890625, "kl_loss_9": 484.60608978271483, "learning_rate": 0.00020354603547267987, "loss": 1014.8296, "step": 7050 }, { "ce_loss_13": 3.245900106430054, "ce_loss_17": 3.193069911003113, "ce_loss_2": 4.213961720466614, "ce_loss_4": 3.7741793394088745, "ce_loss_9": 3.403426718711853, "epoch": 0.706, "grad_norm": 1112.0, "kl_loss_13": 130.1860149383545, "kl_loss_2": 2133.8913330078126, "kl_loss_4": 1285.244158935547, "kl_loss_9": 487.3147918701172, "learning_rate": 0.00020226983777365604, "loss": 1029.8672, "step": 7060 }, { "ce_loss_13": 3.1597655057907104, "ce_loss_17": 3.1103063225746155, "ce_loss_2": 4.1466184258461, "ce_loss_4": 3.69773725271225, "ce_loss_9": 3.3068239808082582, "epoch": 0.707, "grad_norm": 1048.0, "kl_loss_13": 123.43181076049805, "kl_loss_2": 2191.2349365234377, "kl_loss_4": 1310.4918640136718, "kl_loss_9": 473.4569839477539, "learning_rate": 0.00020099663821406056, "loss": 1005.3595, "step": 7070 }, { "ce_loss_13": 3.2530231356620787, "ce_loss_17": 3.2050622701644897, "ce_loss_2": 4.162657201290131, "ce_loss_4": 3.7510608553886415, "ce_loss_9": 3.3951767444610597, "epoch": 0.708, "grad_norm": 1568.0, "kl_loss_13": 125.58062896728515, "kl_loss_2": 2051.1040161132814, "kl_loss_4": 1238.6790710449218, "kl_loss_9": 469.2374328613281, "learning_rate": 0.00019972644961499853, "loss": 999.2383, "step": 7080 }, { "ce_loss_13": 3.2279919028282165, "ce_loss_17": 3.1748767495155334, "ce_loss_2": 4.189191555976867, "ce_loss_4": 3.752892792224884, "ce_loss_9": 3.3765548348426817, "epoch": 0.709, "grad_norm": 1136.0, "kl_loss_13": 130.36332168579102, "kl_loss_2": 2156.944940185547, "kl_loss_4": 1300.1795043945312, "kl_loss_9": 489.624577331543, "learning_rate": 0.00019845928476725522, "loss": 1009.5598, "step": 7090 }, { "ce_loss_13": 3.302613580226898, "ce_loss_17": 3.250015211105347, "ce_loss_2": 4.230489325523377, "ce_loss_4": 3.8056825518608095, "ce_loss_9": 3.451917815208435, "epoch": 0.71, "grad_norm": 836.0, "kl_loss_13": 128.96048164367676, "kl_loss_2": 2097.059704589844, "kl_loss_4": 1262.009942626953, "kl_loss_9": 483.1839935302734, "learning_rate": 0.00019719515643116677, "loss": 1024.8489, "step": 7100 }, { "ce_loss_13": 3.243379294872284, "ce_loss_17": 3.193085825443268, "ce_loss_2": 4.173605489730835, "ce_loss_4": 3.737858772277832, "ce_loss_9": 3.3846262454986573, "epoch": 0.711, "grad_norm": 1160.0, "kl_loss_13": 126.59504852294921, "kl_loss_2": 2094.7278076171874, "kl_loss_4": 1244.7935729980468, "kl_loss_9": 475.54813995361326, "learning_rate": 0.0001959340773364911, "loss": 1003.0396, "step": 7110 }, { "ce_loss_13": 3.2554232478141785, "ce_loss_17": 3.203369653224945, "ce_loss_2": 4.198822510242462, "ce_loss_4": 3.7670948266983033, "ce_loss_9": 3.403228771686554, "epoch": 0.712, "grad_norm": 1456.0, "kl_loss_13": 127.19849090576172, "kl_loss_2": 2124.5647705078127, "kl_loss_4": 1274.8545837402344, "kl_loss_9": 483.360693359375, "learning_rate": 0.0001946760601822809, "loss": 985.0371, "step": 7120 }, { "ce_loss_13": 3.311061453819275, "ce_loss_17": 3.2628830313682555, "ce_loss_2": 4.22496143579483, "ce_loss_4": 3.8060648441314697, "ce_loss_9": 3.4576387763023377, "epoch": 0.713, "grad_norm": 1184.0, "kl_loss_13": 126.19576568603516, "kl_loss_2": 2072.6936279296874, "kl_loss_4": 1240.158673095703, "kl_loss_9": 474.6136276245117, "learning_rate": 0.00019342111763675512, "loss": 971.7702, "step": 7130 }, { "ce_loss_13": 3.303226590156555, "ce_loss_17": 3.254168963432312, "ce_loss_2": 4.203019201755524, "ce_loss_4": 3.798448085784912, "ce_loss_9": 3.446289324760437, "epoch": 0.714, "grad_norm": 992.0, "kl_loss_13": 128.34785995483398, "kl_loss_2": 2045.7513366699218, "kl_loss_4": 1244.635760498047, "kl_loss_9": 475.7839157104492, "learning_rate": 0.00019216926233717085, "loss": 975.0217, "step": 7140 }, { "ce_loss_13": 3.202015590667725, "ce_loss_17": 3.1513328194618224, "ce_loss_2": 4.192325377464295, "ce_loss_4": 3.730834698677063, "ce_loss_9": 3.345203197002411, "epoch": 0.715, "grad_norm": 1584.0, "kl_loss_13": 125.69464721679688, "kl_loss_2": 2212.5761840820314, "kl_loss_4": 1306.0401489257813, "kl_loss_9": 471.4148483276367, "learning_rate": 0.00019092050688969737, "loss": 1020.5283, "step": 7150 }, { "ce_loss_13": 3.2697688937187195, "ce_loss_17": 3.220064175128937, "ce_loss_2": 4.180932438373565, "ce_loss_4": 3.764020323753357, "ce_loss_9": 3.4085710883140563, "epoch": 0.716, "grad_norm": 1248.0, "kl_loss_13": 126.3196418762207, "kl_loss_2": 2107.537561035156, "kl_loss_4": 1267.1171325683595, "kl_loss_9": 473.8479827880859, "learning_rate": 0.00018967486386928817, "loss": 989.2244, "step": 7160 }, { "ce_loss_13": 3.1472651600837707, "ce_loss_17": 3.095741879940033, "ce_loss_2": 4.107452630996704, "ce_loss_4": 3.665388751029968, "ce_loss_9": 3.296068251132965, "epoch": 0.717, "grad_norm": 1488.0, "kl_loss_13": 128.14934158325195, "kl_loss_2": 2168.0446716308593, "kl_loss_4": 1296.629083251953, "kl_loss_9": 487.0473327636719, "learning_rate": 0.00018843234581955443, "loss": 1037.5325, "step": 7170 }, { "ce_loss_13": 3.155932915210724, "ce_loss_17": 3.1010112285614015, "ce_loss_2": 4.112509799003601, "ce_loss_4": 3.6798654317855837, "ce_loss_9": 3.3065207958221436, "epoch": 0.718, "grad_norm": 1664.0, "kl_loss_13": 128.91173782348633, "kl_loss_2": 2148.219268798828, "kl_loss_4": 1307.0668884277343, "kl_loss_9": 486.8367431640625, "learning_rate": 0.00018719296525263924, "loss": 1013.9701, "step": 7180 }, { "ce_loss_13": 3.253066086769104, "ce_loss_17": 3.2031625390052794, "ce_loss_2": 4.148751163482666, "ce_loss_4": 3.7373283982276915, "ce_loss_9": 3.393080806732178, "epoch": 0.719, "grad_norm": 1000.0, "kl_loss_13": 127.21577033996581, "kl_loss_2": 2039.6119506835937, "kl_loss_4": 1227.5170043945313, "kl_loss_9": 468.51355743408203, "learning_rate": 0.0001859567346490913, "loss": 973.4907, "step": 7190 }, { "ce_loss_13": 3.2330114364624025, "ce_loss_17": 3.1802570104598997, "ce_loss_2": 4.1825693368911745, "ce_loss_4": 3.7411630868911745, "ce_loss_9": 3.3799352407455445, "epoch": 0.72, "grad_norm": 1360.0, "kl_loss_13": 129.87507514953614, "kl_loss_2": 2153.0119140625, "kl_loss_4": 1282.8651062011718, "kl_loss_9": 487.0993377685547, "learning_rate": 0.0001847236664577389, "loss": 993.8551, "step": 7200 }, { "ce_loss_13": 3.2566522479057314, "ce_loss_17": 3.206060528755188, "ce_loss_2": 4.15482417345047, "ce_loss_4": 3.7403355717658995, "ce_loss_9": 3.3943248987197876, "epoch": 0.721, "grad_norm": 1424.0, "kl_loss_13": 125.80461082458496, "kl_loss_2": 2040.988787841797, "kl_loss_4": 1230.864971923828, "kl_loss_9": 467.2418960571289, "learning_rate": 0.00018349377309556487, "loss": 966.0771, "step": 7210 }, { "ce_loss_13": 3.1997822642326357, "ce_loss_17": 3.1479795575141907, "ce_loss_2": 4.176438415050507, "ce_loss_4": 3.717824470996857, "ce_loss_9": 3.348016357421875, "epoch": 0.722, "grad_norm": 952.0, "kl_loss_13": 130.05975723266602, "kl_loss_2": 2216.452008056641, "kl_loss_4": 1319.673565673828, "kl_loss_9": 490.21546630859376, "learning_rate": 0.00018226706694758193, "loss": 1021.2022, "step": 7220 }, { "ce_loss_13": 3.2731100797653196, "ce_loss_17": 3.22364399433136, "ce_loss_2": 4.199114847183227, "ce_loss_4": 3.7679973006248475, "ce_loss_9": 3.4154403448104858, "epoch": 0.723, "grad_norm": 1136.0, "kl_loss_13": 126.32540283203124, "kl_loss_2": 2105.754748535156, "kl_loss_4": 1264.9041320800782, "kl_loss_9": 479.7638931274414, "learning_rate": 0.0001810435603667075, "loss": 1019.5595, "step": 7230 }, { "ce_loss_13": 3.1316038250923155, "ce_loss_17": 3.078946924209595, "ce_loss_2": 4.071647381782531, "ce_loss_4": 3.638521111011505, "ce_loss_9": 3.2765212655067444, "epoch": 0.724, "grad_norm": 1000.0, "kl_loss_13": 124.27244911193847, "kl_loss_2": 2111.363635253906, "kl_loss_4": 1268.3391845703125, "kl_loss_9": 470.92822723388673, "learning_rate": 0.0001798232656736389, "loss": 1013.9793, "step": 7240 }, { "ce_loss_13": 3.294299376010895, "ce_loss_17": 3.2451056122779844, "ce_loss_2": 4.193133473396301, "ce_loss_4": 3.7812564849853514, "ce_loss_9": 3.4369985103607177, "epoch": 0.725, "grad_norm": 1016.0, "kl_loss_13": 126.4567657470703, "kl_loss_2": 2035.6247863769531, "kl_loss_4": 1227.691583251953, "kl_loss_9": 467.4934875488281, "learning_rate": 0.0001786061951567303, "loss": 983.6854, "step": 7250 }, { "ce_loss_13": 3.2077063083648683, "ce_loss_17": 3.1547775864601135, "ce_loss_2": 4.148510277271271, "ce_loss_4": 3.7216678977012636, "ce_loss_9": 3.3572837114334106, "epoch": 0.726, "grad_norm": 1136.0, "kl_loss_13": 128.69629669189453, "kl_loss_2": 2110.1603088378906, "kl_loss_4": 1273.0267028808594, "kl_loss_9": 479.6479858398437, "learning_rate": 0.00017739236107186857, "loss": 1008.1355, "step": 7260 }, { "ce_loss_13": 3.304520070552826, "ce_loss_17": 3.2567023277282714, "ce_loss_2": 4.187015008926392, "ce_loss_4": 3.777002382278442, "ce_loss_9": 3.4412336945533752, "epoch": 0.727, "grad_norm": 1224.0, "kl_loss_13": 125.25346145629882, "kl_loss_2": 2009.7740844726563, "kl_loss_4": 1215.6559143066406, "kl_loss_9": 461.9171646118164, "learning_rate": 0.00017618177564234904, "loss": 970.7781, "step": 7270 }, { "ce_loss_13": 3.2738952040672302, "ce_loss_17": 3.228114640712738, "ce_loss_2": 4.164701628684997, "ce_loss_4": 3.7603036522865296, "ce_loss_9": 3.411753571033478, "epoch": 0.728, "grad_norm": 1064.0, "kl_loss_13": 123.41110420227051, "kl_loss_2": 2004.3503784179688, "kl_loss_4": 1220.9600036621093, "kl_loss_9": 456.93900909423826, "learning_rate": 0.00017497445105875377, "loss": 970.5797, "step": 7280 }, { "ce_loss_13": 3.1852272510528565, "ce_loss_17": 3.134671664237976, "ce_loss_2": 4.154527306556702, "ce_loss_4": 3.7042139887809755, "ce_loss_9": 3.33417626619339, "epoch": 0.729, "grad_norm": 920.0, "kl_loss_13": 127.94668693542481, "kl_loss_2": 2174.3722412109373, "kl_loss_4": 1295.8716918945313, "kl_loss_9": 485.3157470703125, "learning_rate": 0.000173770399478828, "loss": 1008.4029, "step": 7290 }, { "ce_loss_13": 3.1091261744499206, "ce_loss_17": 3.060553777217865, "ce_loss_2": 4.043109095096588, "ce_loss_4": 3.618607926368713, "ce_loss_9": 3.2495880007743834, "epoch": 0.73, "grad_norm": 1048.0, "kl_loss_13": 124.6862777709961, "kl_loss_2": 2100.810650634766, "kl_loss_4": 1261.7956176757812, "kl_loss_9": 469.094221496582, "learning_rate": 0.0001725696330273575, "loss": 1014.6064, "step": 7300 }, { "ce_loss_13": 3.292516028881073, "ce_loss_17": 3.2449800252914427, "ce_loss_2": 4.191593599319458, "ce_loss_4": 3.7825557231903075, "ce_loss_9": 3.4312392234802247, "epoch": 0.731, "grad_norm": 1240.0, "kl_loss_13": 125.54147911071777, "kl_loss_2": 2029.4308288574218, "kl_loss_4": 1228.256005859375, "kl_loss_9": 471.09500885009766, "learning_rate": 0.00017137216379604724, "loss": 968.1071, "step": 7310 }, { "ce_loss_13": 3.1744810819625853, "ce_loss_17": 3.121881425380707, "ce_loss_2": 4.124276387691498, "ce_loss_4": 3.678806388378143, "ce_loss_9": 3.3224066853523255, "epoch": 0.732, "grad_norm": 1112.0, "kl_loss_13": 126.33396644592285, "kl_loss_2": 2108.6168579101563, "kl_loss_4": 1249.1590881347656, "kl_loss_9": 469.87144470214844, "learning_rate": 0.00017017800384339925, "loss": 995.0025, "step": 7320 }, { "ce_loss_13": 3.129823160171509, "ce_loss_17": 3.0787100434303283, "ce_loss_2": 4.103810834884643, "ce_loss_4": 3.6614753365516663, "ce_loss_9": 3.2835329294204714, "epoch": 0.733, "grad_norm": 1208.0, "kl_loss_13": 127.63936767578124, "kl_loss_2": 2183.213818359375, "kl_loss_4": 1304.49404296875, "kl_loss_9": 485.121923828125, "learning_rate": 0.00016898716519459073, "loss": 990.5598, "step": 7330 }, { "ce_loss_13": 3.2466770887374876, "ce_loss_17": 3.1937456846237184, "ce_loss_2": 4.225208568572998, "ce_loss_4": 3.7771113991737364, "ce_loss_9": 3.397054898738861, "epoch": 0.734, "grad_norm": 996.0, "kl_loss_13": 131.66384506225586, "kl_loss_2": 2168.567803955078, "kl_loss_4": 1293.8915283203125, "kl_loss_9": 493.8968078613281, "learning_rate": 0.00016779965984135375, "loss": 1005.5984, "step": 7340 }, { "ce_loss_13": 3.167673718929291, "ce_loss_17": 3.117953050136566, "ce_loss_2": 4.114571452140808, "ce_loss_4": 3.6805097699165343, "ce_loss_9": 3.310451102256775, "epoch": 0.735, "grad_norm": 1760.0, "kl_loss_13": 123.7313346862793, "kl_loss_2": 2107.0096435546875, "kl_loss_4": 1253.8605224609375, "kl_loss_9": 465.2525909423828, "learning_rate": 0.00016661549974185424, "loss": 990.3389, "step": 7350 }, { "ce_loss_13": 3.2033540844917296, "ce_loss_17": 3.1527140974998473, "ce_loss_2": 4.14150083065033, "ce_loss_4": 3.708521914482117, "ce_loss_9": 3.3475730776786805, "epoch": 0.736, "grad_norm": 1264.0, "kl_loss_13": 128.52691764831542, "kl_loss_2": 2108.5188171386717, "kl_loss_4": 1262.9774230957032, "kl_loss_9": 477.729914855957, "learning_rate": 0.00016543469682057105, "loss": 983.4039, "step": 7360 }, { "ce_loss_13": 3.2271689414978026, "ce_loss_17": 3.174753713607788, "ce_loss_2": 4.1670044422149655, "ce_loss_4": 3.7377058029174806, "ce_loss_9": 3.3727575302124024, "epoch": 0.737, "grad_norm": 980.0, "kl_loss_13": 129.232816696167, "kl_loss_2": 2111.8417419433595, "kl_loss_4": 1270.102392578125, "kl_loss_9": 483.41763763427736, "learning_rate": 0.00016425726296817632, "loss": 993.1252, "step": 7370 }, { "ce_loss_13": 3.237946331501007, "ce_loss_17": 3.1895334839820864, "ce_loss_2": 4.159746968746186, "ce_loss_4": 3.7420300602912904, "ce_loss_9": 3.3826239585876463, "epoch": 0.738, "grad_norm": 1400.0, "kl_loss_13": 125.55359840393066, "kl_loss_2": 2046.2783142089843, "kl_loss_4": 1244.47666015625, "kl_loss_9": 467.4946624755859, "learning_rate": 0.00016308321004141607, "loss": 980.5523, "step": 7380 }, { "ce_loss_13": 3.19804185628891, "ce_loss_17": 3.146539306640625, "ce_loss_2": 4.139368546009064, "ce_loss_4": 3.710443580150604, "ce_loss_9": 3.345207667350769, "epoch": 0.739, "grad_norm": 1048.0, "kl_loss_13": 129.52734146118163, "kl_loss_2": 2113.7308959960938, "kl_loss_4": 1273.3792907714844, "kl_loss_9": 484.5537796020508, "learning_rate": 0.00016191254986299043, "loss": 988.6155, "step": 7390 }, { "ce_loss_13": 3.2337637424468992, "ce_loss_17": 3.187696373462677, "ce_loss_2": 4.150008869171143, "ce_loss_4": 3.7309409499168398, "ce_loss_9": 3.3693282127380373, "epoch": 0.74, "grad_norm": 1104.0, "kl_loss_13": 123.86594429016114, "kl_loss_2": 2082.6684509277343, "kl_loss_4": 1256.2157348632813, "kl_loss_9": 465.0095504760742, "learning_rate": 0.00016074529422143398, "loss": 998.0524, "step": 7400 }, { "ce_loss_13": 3.1980300188064574, "ce_loss_17": 3.148793113231659, "ce_loss_2": 4.152233827114105, "ce_loss_4": 3.713741862773895, "ce_loss_9": 3.345681869983673, "epoch": 0.741, "grad_norm": 1888.0, "kl_loss_13": 127.49471626281738, "kl_loss_2": 2137.853479003906, "kl_loss_4": 1277.7441345214843, "kl_loss_9": 478.6956329345703, "learning_rate": 0.0001595814548709983, "loss": 1012.0307, "step": 7410 }, { "ce_loss_13": 3.259041702747345, "ce_loss_17": 3.206249749660492, "ce_loss_2": 4.20650943517685, "ce_loss_4": 3.7714171767234803, "ce_loss_9": 3.409318196773529, "epoch": 0.742, "grad_norm": 1344.0, "kl_loss_13": 130.35422134399414, "kl_loss_2": 2141.9607177734374, "kl_loss_4": 1282.8636535644532, "kl_loss_9": 489.5229217529297, "learning_rate": 0.00015842104353153285, "loss": 1004.6426, "step": 7420 }, { "ce_loss_13": 3.27479293346405, "ce_loss_17": 3.2261789798736573, "ce_loss_2": 4.203705608844757, "ce_loss_4": 3.7818409323692324, "ce_loss_9": 3.4212502121925352, "epoch": 0.743, "grad_norm": 936.0, "kl_loss_13": 128.94611930847168, "kl_loss_2": 2100.057403564453, "kl_loss_4": 1274.6831481933593, "kl_loss_9": 480.43113403320314, "learning_rate": 0.0001572640718883667, "loss": 1012.8867, "step": 7430 }, { "ce_loss_13": 3.2125929951667787, "ce_loss_17": 3.1644372582435607, "ce_loss_2": 4.130357956886291, "ce_loss_4": 3.7038000345230104, "ce_loss_9": 3.3506340742111207, "epoch": 0.744, "grad_norm": 1288.0, "kl_loss_13": 124.034135055542, "kl_loss_2": 2068.4636413574217, "kl_loss_4": 1237.4095458984375, "kl_loss_9": 466.54361419677736, "learning_rate": 0.0001561105515921915, "loss": 1002.4773, "step": 7440 }, { "ce_loss_13": 3.068488872051239, "ce_loss_17": 3.020793008804321, "ce_loss_2": 4.0546399593353275, "ce_loss_4": 3.5953676223754885, "ce_loss_9": 3.218417990207672, "epoch": 0.745, "grad_norm": 1144.0, "kl_loss_13": 123.01975784301757, "kl_loss_2": 2195.9939025878907, "kl_loss_4": 1300.5394409179687, "kl_loss_9": 478.4922409057617, "learning_rate": 0.0001549604942589441, "loss": 1001.5135, "step": 7450 }, { "ce_loss_13": 3.247687554359436, "ce_loss_17": 3.1993032574653624, "ce_loss_2": 4.133608877658844, "ce_loss_4": 3.7205206155776978, "ce_loss_9": 3.38311904668808, "epoch": 0.746, "grad_norm": 1152.0, "kl_loss_13": 122.73404388427734, "kl_loss_2": 2000.661181640625, "kl_loss_4": 1199.69306640625, "kl_loss_9": 457.7883010864258, "learning_rate": 0.00015381391146968864, "loss": 967.7152, "step": 7460 }, { "ce_loss_13": 3.221285402774811, "ce_loss_17": 3.175812816619873, "ce_loss_2": 4.162599980831146, "ce_loss_4": 3.7300588846206666, "ce_loss_9": 3.3635896682739257, "epoch": 0.747, "grad_norm": 1408.0, "kl_loss_13": 122.25125007629394, "kl_loss_2": 2098.288720703125, "kl_loss_4": 1255.3549011230468, "kl_loss_9": 467.0405746459961, "learning_rate": 0.00015267081477050133, "loss": 995.8037, "step": 7470 }, { "ce_loss_13": 3.31267991065979, "ce_loss_17": 3.2617206931114198, "ce_loss_2": 4.2146350026130674, "ce_loss_4": 3.8106497287750245, "ce_loss_9": 3.4549197912216187, "epoch": 0.748, "grad_norm": 964.0, "kl_loss_13": 129.9878952026367, "kl_loss_2": 2048.2322387695312, "kl_loss_4": 1246.3707336425782, "kl_loss_9": 478.71266479492186, "learning_rate": 0.00015153121567235335, "loss": 973.3393, "step": 7480 }, { "ce_loss_13": 3.2170090317726134, "ce_loss_17": 3.166588294506073, "ce_loss_2": 4.158045899868012, "ce_loss_4": 3.725597453117371, "ce_loss_9": 3.3561816573143006, "epoch": 0.749, "grad_norm": 1096.0, "kl_loss_13": 127.26076469421386, "kl_loss_2": 2134.4638732910157, "kl_loss_4": 1277.8035583496094, "kl_loss_9": 476.5727020263672, "learning_rate": 0.00015039512565099468, "loss": 973.1391, "step": 7490 }, { "ce_loss_13": 3.2838091254234314, "ce_loss_17": 3.2312169075012207, "ce_loss_2": 4.19412579536438, "ce_loss_4": 3.771588122844696, "ce_loss_9": 3.4210540413856507, "epoch": 0.75, "grad_norm": 1072.0, "kl_loss_13": 126.35832977294922, "kl_loss_2": 2077.666436767578, "kl_loss_4": 1247.6267395019531, "kl_loss_9": 472.2786499023438, "learning_rate": 0.00014926255614683932, "loss": 1018.3461, "step": 7500 }, { "ce_loss_13": 3.216032338142395, "ce_loss_17": 3.164313554763794, "ce_loss_2": 4.130905139446258, "ce_loss_4": 3.7067161202430725, "ce_loss_9": 3.356319618225098, "epoch": 0.751, "grad_norm": 1056.0, "kl_loss_13": 125.73965377807617, "kl_loss_2": 2082.1769409179688, "kl_loss_4": 1241.2722961425782, "kl_loss_9": 472.9007110595703, "learning_rate": 0.0001481335185648498, "loss": 990.5935, "step": 7510 }, { "ce_loss_13": 3.2335606932640077, "ce_loss_17": 3.1832844257354735, "ce_loss_2": 4.153126907348633, "ce_loss_4": 3.73312771320343, "ce_loss_9": 3.381218659877777, "epoch": 0.752, "grad_norm": 1976.0, "kl_loss_13": 125.85300712585449, "kl_loss_2": 2073.513671875, "kl_loss_4": 1248.5780090332032, "kl_loss_9": 475.14833984375, "learning_rate": 0.0001470080242744218, "loss": 977.7824, "step": 7520 }, { "ce_loss_13": 3.2307426810264586, "ce_loss_17": 3.181879985332489, "ce_loss_2": 4.164555346965789, "ce_loss_4": 3.7288846492767336, "ce_loss_9": 3.370301592350006, "epoch": 0.753, "grad_norm": 1184.0, "kl_loss_13": 123.22625007629395, "kl_loss_2": 2104.5072265625, "kl_loss_4": 1256.258526611328, "kl_loss_9": 467.73716125488284, "learning_rate": 0.0001458860846092705, "loss": 995.138, "step": 7530 }, { "ce_loss_13": 3.2703625679016115, "ce_loss_17": 3.2213937997817994, "ce_loss_2": 4.167250466346741, "ce_loss_4": 3.758751058578491, "ce_loss_9": 3.411143970489502, "epoch": 0.754, "grad_norm": 1224.0, "kl_loss_13": 125.65522918701171, "kl_loss_2": 2028.8781982421874, "kl_loss_4": 1232.3266479492188, "kl_loss_9": 469.32029876708987, "learning_rate": 0.00014476771086731566, "loss": 964.3648, "step": 7540 }, { "ce_loss_13": 3.354090988636017, "ce_loss_17": 3.302971291542053, "ce_loss_2": 4.267653489112854, "ce_loss_4": 3.8486554503440855, "ce_loss_9": 3.498868978023529, "epoch": 0.755, "grad_norm": 1056.0, "kl_loss_13": 128.73139190673828, "kl_loss_2": 2059.5609802246095, "kl_loss_4": 1239.7220764160156, "kl_loss_9": 477.4375518798828, "learning_rate": 0.00014365291431056872, "loss": 1006.7977, "step": 7550 }, { "ce_loss_13": 3.1943013429641725, "ce_loss_17": 3.1426864147186278, "ce_loss_2": 4.13987979888916, "ce_loss_4": 3.7095999360084533, "ce_loss_9": 3.3477648973464964, "epoch": 0.756, "grad_norm": 1560.0, "kl_loss_13": 131.05813865661622, "kl_loss_2": 2142.0512878417967, "kl_loss_4": 1288.128594970703, "kl_loss_9": 491.22144470214846, "learning_rate": 0.00014254170616501827, "loss": 1002.1, "step": 7560 }, { "ce_loss_13": 3.133422148227692, "ce_loss_17": 3.08091197013855, "ce_loss_2": 4.124362003803253, "ce_loss_4": 3.6807229042053224, "ce_loss_9": 3.2924073815345762, "epoch": 0.757, "grad_norm": 1432.0, "kl_loss_13": 129.73982009887695, "kl_loss_2": 2197.456756591797, "kl_loss_4": 1336.1477355957031, "kl_loss_9": 496.1264419555664, "learning_rate": 0.0001414340976205183, "loss": 1035.5784, "step": 7570 }, { "ce_loss_13": 3.153609824180603, "ce_loss_17": 3.102141499519348, "ce_loss_2": 4.127480387687683, "ce_loss_4": 3.6679895758628844, "ce_loss_9": 3.297958755493164, "epoch": 0.758, "grad_norm": 1208.0, "kl_loss_13": 126.9780704498291, "kl_loss_2": 2158.9037536621095, "kl_loss_4": 1277.7978881835938, "kl_loss_9": 477.7869583129883, "learning_rate": 0.00014033009983067452, "loss": 996.5772, "step": 7580 }, { "ce_loss_13": 3.3077316641807557, "ce_loss_17": 3.2599094033241274, "ce_loss_2": 4.194436705112457, "ce_loss_4": 3.784743368625641, "ce_loss_9": 3.4460306406021117, "epoch": 0.759, "grad_norm": 920.0, "kl_loss_13": 124.28801689147949, "kl_loss_2": 2010.4709106445312, "kl_loss_4": 1213.457501220703, "kl_loss_9": 460.99468383789065, "learning_rate": 0.00013922972391273224, "loss": 970.7412, "step": 7590 }, { "ce_loss_13": 3.309078001976013, "ce_loss_17": 3.2573913097381593, "ce_loss_2": 4.253790044784546, "ce_loss_4": 3.796337831020355, "ce_loss_9": 3.452209162712097, "epoch": 0.76, "grad_norm": 1576.0, "kl_loss_13": 125.82244911193848, "kl_loss_2": 2102.5746826171876, "kl_loss_4": 1217.8599975585937, "kl_loss_9": 465.9241317749023, "learning_rate": 0.0001381329809474649, "loss": 989.1443, "step": 7600 }, { "ce_loss_13": 3.213409662246704, "ce_loss_17": 3.157852065563202, "ce_loss_2": 4.18861893415451, "ce_loss_4": 3.73837434053421, "ce_loss_9": 3.364268922805786, "epoch": 0.761, "grad_norm": 1256.0, "kl_loss_13": 129.389058303833, "kl_loss_2": 2192.79970703125, "kl_loss_4": 1298.8254089355469, "kl_loss_9": 487.9811813354492, "learning_rate": 0.0001370398819790621, "loss": 1017.7053, "step": 7610 }, { "ce_loss_13": 3.3477409958839415, "ce_loss_17": 3.2976269125938416, "ce_loss_2": 4.2519958972930905, "ce_loss_4": 3.831505310535431, "ce_loss_9": 3.488226580619812, "epoch": 0.762, "grad_norm": 1168.0, "kl_loss_13": 126.8095832824707, "kl_loss_2": 2035.3596069335938, "kl_loss_4": 1224.7359436035156, "kl_loss_9": 469.43028259277344, "learning_rate": 0.00013595043801501794, "loss": 966.7545, "step": 7620 }, { "ce_loss_13": 3.1501991868019106, "ce_loss_17": 3.097886848449707, "ce_loss_2": 4.159565496444702, "ce_loss_4": 3.69558641910553, "ce_loss_9": 3.3046634435653686, "epoch": 0.763, "grad_norm": 1048.0, "kl_loss_13": 128.3656810760498, "kl_loss_2": 2248.670306396484, "kl_loss_4": 1336.1543212890624, "kl_loss_9": 490.68629455566406, "learning_rate": 0.00013486466002602133, "loss": 1021.7495, "step": 7630 }, { "ce_loss_13": 3.259756255149841, "ce_loss_17": 3.210232126712799, "ce_loss_2": 4.153639578819275, "ce_loss_4": 3.7447076439857483, "ce_loss_9": 3.397176778316498, "epoch": 0.764, "grad_norm": 916.0, "kl_loss_13": 126.04254722595215, "kl_loss_2": 2041.2409790039062, "kl_loss_4": 1235.0521911621095, "kl_loss_9": 468.1321136474609, "learning_rate": 0.00013378255894584462, "loss": 1001.5361, "step": 7640 }, { "ce_loss_13": 3.201395773887634, "ce_loss_17": 3.147140073776245, "ce_loss_2": 4.159450614452362, "ce_loss_4": 3.7129719734191893, "ce_loss_9": 3.3528184294700623, "epoch": 0.765, "grad_norm": 1056.0, "kl_loss_13": 129.11712150573732, "kl_loss_2": 2141.319860839844, "kl_loss_4": 1278.6327209472656, "kl_loss_9": 487.01463317871094, "learning_rate": 0.0001327041456712334, "loss": 1006.6482, "step": 7650 }, { "ce_loss_13": 3.235421097278595, "ce_loss_17": 3.1850746512413024, "ce_loss_2": 4.1679966926574705, "ce_loss_4": 3.7398450493812563, "ce_loss_9": 3.385719919204712, "epoch": 0.766, "grad_norm": 1272.0, "kl_loss_13": 128.1650749206543, "kl_loss_2": 2102.338702392578, "kl_loss_4": 1269.908056640625, "kl_loss_9": 484.285302734375, "learning_rate": 0.00013162943106179747, "loss": 1006.1359, "step": 7660 }, { "ce_loss_13": 3.2116881251335143, "ce_loss_17": 3.161814570426941, "ce_loss_2": 4.129674208164215, "ce_loss_4": 3.710439133644104, "ce_loss_9": 3.354893136024475, "epoch": 0.767, "grad_norm": 976.0, "kl_loss_13": 125.90929985046387, "kl_loss_2": 2078.6627502441406, "kl_loss_4": 1258.658758544922, "kl_loss_9": 470.9509704589844, "learning_rate": 0.00013055842593990132, "loss": 987.3365, "step": 7670 }, { "ce_loss_13": 3.1566450238227843, "ce_loss_17": 3.1092918038368227, "ce_loss_2": 4.08530056476593, "ce_loss_4": 3.6578171968460085, "ce_loss_9": 3.304216682910919, "epoch": 0.768, "grad_norm": 1624.0, "kl_loss_13": 124.16149215698242, "kl_loss_2": 2069.364923095703, "kl_loss_4": 1246.6306213378907, "kl_loss_9": 472.05199737548827, "learning_rate": 0.00012949114109055414, "loss": 1004.8703, "step": 7680 }, { "ce_loss_13": 3.201414632797241, "ce_loss_17": 3.150563156604767, "ce_loss_2": 4.146173572540283, "ce_loss_4": 3.713708817958832, "ce_loss_9": 3.350541877746582, "epoch": 0.769, "grad_norm": 1104.0, "kl_loss_13": 127.03784103393555, "kl_loss_2": 2115.0141174316404, "kl_loss_4": 1267.7196655273438, "kl_loss_9": 477.10606384277344, "learning_rate": 0.00012842758726130281, "loss": 1003.9879, "step": 7690 }, { "ce_loss_13": 3.244269919395447, "ce_loss_17": 3.1911988496780395, "ce_loss_2": 4.209400689601898, "ce_loss_4": 3.765560066699982, "ce_loss_9": 3.3980507135391234, "epoch": 0.77, "grad_norm": 1056.0, "kl_loss_13": 129.5151054382324, "kl_loss_2": 2145.383026123047, "kl_loss_4": 1278.5453063964844, "kl_loss_9": 488.20233764648435, "learning_rate": 0.00012736777516212267, "loss": 992.0566, "step": 7700 }, { "ce_loss_13": 3.2410014629364015, "ce_loss_17": 3.1870803117752073, "ce_loss_2": 4.175254607200623, "ce_loss_4": 3.7514858841896057, "ce_loss_9": 3.389737570285797, "epoch": 0.771, "grad_norm": 1288.0, "kl_loss_13": 129.12823143005372, "kl_loss_2": 2104.0761291503904, "kl_loss_4": 1272.2590026855469, "kl_loss_9": 485.8696517944336, "learning_rate": 0.00012631171546530968, "loss": 983.049, "step": 7710 }, { "ce_loss_13": 3.2552671909332274, "ce_loss_17": 3.2009616017341616, "ce_loss_2": 4.178385841846466, "ce_loss_4": 3.76089323759079, "ce_loss_9": 3.4028782844543457, "epoch": 0.772, "grad_norm": 1176.0, "kl_loss_13": 130.4560733795166, "kl_loss_2": 2100.8325622558596, "kl_loss_4": 1278.2070678710938, "kl_loss_9": 484.74622955322263, "learning_rate": 0.00012525941880537307, "loss": 1009.8533, "step": 7720 }, { "ce_loss_13": 3.2874022483825684, "ce_loss_17": 3.2355789303779603, "ce_loss_2": 4.201828730106354, "ce_loss_4": 3.787112605571747, "ce_loss_9": 3.429548645019531, "epoch": 0.773, "grad_norm": 1096.0, "kl_loss_13": 126.114208984375, "kl_loss_2": 2069.4682739257814, "kl_loss_4": 1255.4886413574218, "kl_loss_9": 473.7471008300781, "learning_rate": 0.00012421089577892869, "loss": 989.6941, "step": 7730 }, { "ce_loss_13": 3.2383461236953734, "ce_loss_17": 3.1848219633102417, "ce_loss_2": 4.184846591949463, "ce_loss_4": 3.747152531147003, "ce_loss_9": 3.385123312473297, "epoch": 0.774, "grad_norm": 1544.0, "kl_loss_13": 128.071431350708, "kl_loss_2": 2142.3081970214844, "kl_loss_4": 1279.3335876464844, "kl_loss_9": 484.71143493652346, "learning_rate": 0.0001231661569445919, "loss": 1004.4449, "step": 7740 }, { "ce_loss_13": 3.102243483066559, "ce_loss_17": 3.0518130540847777, "ce_loss_2": 4.056020653247833, "ce_loss_4": 3.6159384489059447, "ce_loss_9": 3.2508856296539306, "epoch": 0.775, "grad_norm": 1056.0, "kl_loss_13": 125.6069465637207, "kl_loss_2": 2134.4912048339843, "kl_loss_4": 1268.175439453125, "kl_loss_9": 475.76094512939454, "learning_rate": 0.00012212521282287093, "loss": 1017.5979, "step": 7750 }, { "ce_loss_13": 3.246686029434204, "ce_loss_17": 3.192774474620819, "ce_loss_2": 4.1642012119293215, "ce_loss_4": 3.7477113723754885, "ce_loss_9": 3.3939606308937074, "epoch": 0.776, "grad_norm": 1200.0, "kl_loss_13": 129.45451545715332, "kl_loss_2": 2082.440295410156, "kl_loss_4": 1258.4498352050782, "kl_loss_9": 481.8603179931641, "learning_rate": 0.00012108807389606158, "loss": 1011.0832, "step": 7760 }, { "ce_loss_13": 3.2439571619033813, "ce_loss_17": 3.193578803539276, "ce_loss_2": 4.173394310474396, "ce_loss_4": 3.743524396419525, "ce_loss_9": 3.38621164560318, "epoch": 0.777, "grad_norm": 1272.0, "kl_loss_13": 124.83784675598145, "kl_loss_2": 2087.276507568359, "kl_loss_4": 1244.2421630859376, "kl_loss_9": 465.41051025390624, "learning_rate": 0.00012005475060814159, "loss": 982.5583, "step": 7770 }, { "ce_loss_13": 3.1777942895889284, "ce_loss_17": 3.1278825402259827, "ce_loss_2": 4.135720062255859, "ce_loss_4": 3.697056698799133, "ce_loss_9": 3.3257087111473083, "epoch": 0.778, "grad_norm": 1072.0, "kl_loss_13": 126.91192779541015, "kl_loss_2": 2155.36162109375, "kl_loss_4": 1288.0360961914062, "kl_loss_9": 478.98959197998045, "learning_rate": 0.00011902525336466464, "loss": 1003.7271, "step": 7780 }, { "ce_loss_13": 3.1722656369209288, "ce_loss_17": 3.118096101284027, "ce_loss_2": 4.1460060477256775, "ce_loss_4": 3.7058478355407716, "ce_loss_9": 3.3225412607192992, "epoch": 0.779, "grad_norm": 1056.0, "kl_loss_13": 129.72147216796876, "kl_loss_2": 2183.284552001953, "kl_loss_4": 1311.1195861816407, "kl_loss_9": 490.60168914794923, "learning_rate": 0.00011799959253265668, "loss": 1005.6384, "step": 7790 }, { "ce_loss_13": 3.2334508538246154, "ce_loss_17": 3.18063223361969, "ce_loss_2": 4.177849841117859, "ce_loss_4": 3.740126538276672, "ce_loss_9": 3.3784268379211424, "epoch": 0.78, "grad_norm": 1112.0, "kl_loss_13": 128.66394805908203, "kl_loss_2": 2132.2561584472655, "kl_loss_4": 1271.9580139160157, "kl_loss_9": 481.4037460327148, "learning_rate": 0.00011697777844051105, "loss": 1001.8172, "step": 7800 }, { "ce_loss_13": 3.2165528655052187, "ce_loss_17": 3.1631170868873597, "ce_loss_2": 4.193563568592071, "ce_loss_4": 3.748032832145691, "ce_loss_9": 3.362986373901367, "epoch": 0.781, "grad_norm": 1232.0, "kl_loss_13": 128.65180740356445, "kl_loss_2": 2191.9769897460938, "kl_loss_4": 1312.2556457519531, "kl_loss_9": 483.41487884521484, "learning_rate": 0.00011595982137788402, "loss": 1014.5664, "step": 7810 }, { "ce_loss_13": 3.196559286117554, "ce_loss_17": 3.147442638874054, "ce_loss_2": 4.107870244979859, "ce_loss_4": 3.687285029888153, "ce_loss_9": 3.3345187425613405, "epoch": 0.782, "grad_norm": 948.0, "kl_loss_13": 125.45768737792969, "kl_loss_2": 2060.0716796875, "kl_loss_4": 1243.3453979492188, "kl_loss_9": 470.28092193603516, "learning_rate": 0.00011494573159559212, "loss": 991.1266, "step": 7820 }, { "ce_loss_13": 3.1791001200675963, "ce_loss_17": 3.1277450919151306, "ce_loss_2": 4.116717517375946, "ce_loss_4": 3.695025587081909, "ce_loss_9": 3.3245227456092836, "epoch": 0.783, "grad_norm": 892.0, "kl_loss_13": 127.19604988098145, "kl_loss_2": 2115.412738037109, "kl_loss_4": 1281.424786376953, "kl_loss_9": 475.48980407714845, "learning_rate": 0.00011393551930550828, "loss": 1017.2857, "step": 7830 }, { "ce_loss_13": 3.3088568449020386, "ce_loss_17": 3.2577559113502503, "ce_loss_2": 4.216470801830292, "ce_loss_4": 3.7981810092926027, "ce_loss_9": 3.4512784838676454, "epoch": 0.784, "grad_norm": 1096.0, "kl_loss_13": 128.35685081481932, "kl_loss_2": 2060.9100830078123, "kl_loss_4": 1244.4179992675781, "kl_loss_9": 476.0572967529297, "learning_rate": 0.00011292919468045875, "loss": 984.3434, "step": 7840 }, { "ce_loss_13": 3.264451634883881, "ce_loss_17": 3.2164299845695496, "ce_loss_2": 4.189800524711609, "ce_loss_4": 3.7672601699829102, "ce_loss_9": 3.4065866827964784, "epoch": 0.785, "grad_norm": 868.0, "kl_loss_13": 127.06847190856934, "kl_loss_2": 2085.861016845703, "kl_loss_4": 1266.5476989746094, "kl_loss_9": 478.61835021972655, "learning_rate": 0.00011192676785412154, "loss": 984.2806, "step": 7850 }, { "ce_loss_13": 3.2120761632919312, "ce_loss_17": 3.1548707604408266, "ce_loss_2": 4.176346850395203, "ce_loss_4": 3.733182764053345, "ce_loss_9": 3.3602697610855103, "epoch": 0.786, "grad_norm": 1280.0, "kl_loss_13": 128.8749740600586, "kl_loss_2": 2152.088970947266, "kl_loss_4": 1286.3571411132812, "kl_loss_9": 482.0912399291992, "learning_rate": 0.00011092824892092374, "loss": 1005.6135, "step": 7860 }, { "ce_loss_13": 3.141354238986969, "ce_loss_17": 3.0913732171058657, "ce_loss_2": 4.1109631896018985, "ce_loss_4": 3.671216332912445, "ce_loss_9": 3.291155445575714, "epoch": 0.787, "grad_norm": 1016.0, "kl_loss_13": 125.5489543914795, "kl_loss_2": 2175.8348693847656, "kl_loss_4": 1307.8104614257813, "kl_loss_9": 480.6966583251953, "learning_rate": 0.0001099336479359398, "loss": 997.819, "step": 7870 }, { "ce_loss_13": 3.2642038106918334, "ce_loss_17": 3.214054584503174, "ce_loss_2": 4.169588470458985, "ce_loss_4": 3.7548114538192747, "ce_loss_9": 3.406172204017639, "epoch": 0.788, "grad_norm": 996.0, "kl_loss_13": 125.2243495941162, "kl_loss_2": 2058.8989135742186, "kl_loss_4": 1242.5901245117188, "kl_loss_9": 468.96156311035156, "learning_rate": 0.00010894297491479043, "loss": 988.4604, "step": 7880 }, { "ce_loss_13": 3.2456247329711916, "ce_loss_17": 3.1946499943733215, "ce_loss_2": 4.174361944198608, "ce_loss_4": 3.7505073070526125, "ce_loss_9": 3.3868361115455627, "epoch": 0.789, "grad_norm": 1048.0, "kl_loss_13": 126.17346992492676, "kl_loss_2": 2084.7672424316406, "kl_loss_4": 1261.6503051757813, "kl_loss_9": 472.44528045654295, "learning_rate": 0.00010795623983354214, "loss": 983.8962, "step": 7890 }, { "ce_loss_13": 3.149270939826965, "ce_loss_17": 3.097381889820099, "ce_loss_2": 4.095205128192902, "ce_loss_4": 3.6656641364097595, "ce_loss_9": 3.300001299381256, "epoch": 0.79, "grad_norm": 1080.0, "kl_loss_13": 130.0659294128418, "kl_loss_2": 2133.253289794922, "kl_loss_4": 1288.7837036132812, "kl_loss_9": 488.53687896728513, "learning_rate": 0.00010697345262860636, "loss": 998.7774, "step": 7900 }, { "ce_loss_13": 3.2836090445518495, "ce_loss_17": 3.2355277299880982, "ce_loss_2": 4.189699244499207, "ce_loss_4": 3.7610695123672486, "ce_loss_9": 3.420949125289917, "epoch": 0.791, "grad_norm": 1088.0, "kl_loss_13": 126.42734985351562, "kl_loss_2": 2072.301403808594, "kl_loss_4": 1234.5135009765625, "kl_loss_9": 472.6247787475586, "learning_rate": 0.00010599462319663906, "loss": 976.3308, "step": 7910 }, { "ce_loss_13": 3.25415917634964, "ce_loss_17": 3.2047107577323914, "ce_loss_2": 4.152405321598053, "ce_loss_4": 3.7373377323150634, "ce_loss_9": 3.39132581949234, "epoch": 0.792, "grad_norm": 980.0, "kl_loss_13": 124.9365177154541, "kl_loss_2": 2034.1901062011718, "kl_loss_4": 1231.3090881347657, "kl_loss_9": 464.5166213989258, "learning_rate": 0.00010501976139444191, "loss": 972.4814, "step": 7920 }, { "ce_loss_13": 3.277890157699585, "ce_loss_17": 3.2278932690620423, "ce_loss_2": 4.194729161262512, "ce_loss_4": 3.7776479601860045, "ce_loss_9": 3.420755851268768, "epoch": 0.793, "grad_norm": 1240.0, "kl_loss_13": 125.55428695678711, "kl_loss_2": 2063.7543762207033, "kl_loss_4": 1243.5776916503905, "kl_loss_9": 466.4507827758789, "learning_rate": 0.0001040488770388625, "loss": 994.2, "step": 7930 }, { "ce_loss_13": 3.231332314014435, "ce_loss_17": 3.182119643688202, "ce_loss_2": 4.164359343051911, "ce_loss_4": 3.7323949456214907, "ce_loss_9": 3.3758636951446532, "epoch": 0.794, "grad_norm": 964.0, "kl_loss_13": 126.59407997131348, "kl_loss_2": 2111.425891113281, "kl_loss_4": 1264.2519592285157, "kl_loss_9": 477.27134399414064, "learning_rate": 0.00010308197990669538, "loss": 990.0729, "step": 7940 }, { "ce_loss_13": 3.3376906156539916, "ce_loss_17": 3.287771928310394, "ce_loss_2": 4.2649976968765255, "ce_loss_4": 3.8344961643218993, "ce_loss_9": 3.4823749899864196, "epoch": 0.795, "grad_norm": 1040.0, "kl_loss_13": 129.93802070617676, "kl_loss_2": 2092.6590942382813, "kl_loss_4": 1259.03115234375, "kl_loss_9": 479.88941192626953, "learning_rate": 0.0001021190797345839, "loss": 983.237, "step": 7950 }, { "ce_loss_13": 3.0758139848709107, "ce_loss_17": 3.0223918914794923, "ce_loss_2": 4.055166244506836, "ce_loss_4": 3.6145185232162476, "ce_loss_9": 3.2287954568862913, "epoch": 0.796, "grad_norm": 1088.0, "kl_loss_13": 130.94144096374512, "kl_loss_2": 2205.909210205078, "kl_loss_4": 1335.6518127441407, "kl_loss_9": 501.0154800415039, "learning_rate": 0.00010116018621892236, "loss": 1010.6875, "step": 7960 }, { "ce_loss_13": 3.2815113067626953, "ce_loss_17": 3.228566384315491, "ce_loss_2": 4.225681006908417, "ce_loss_4": 3.7939828634262085, "ce_loss_9": 3.431769037246704, "epoch": 0.797, "grad_norm": 1064.0, "kl_loss_13": 133.49212493896485, "kl_loss_2": 2143.985400390625, "kl_loss_4": 1297.4296752929688, "kl_loss_9": 496.98585205078126, "learning_rate": 0.00010020530901575753, "loss": 989.0187, "step": 7970 }, { "ce_loss_13": 3.3017742037773132, "ce_loss_17": 3.252815318107605, "ce_loss_2": 4.221998476982117, "ce_loss_4": 3.7967463612556456, "ce_loss_9": 3.445566773414612, "epoch": 0.798, "grad_norm": 1560.0, "kl_loss_13": 128.60725135803222, "kl_loss_2": 2085.548681640625, "kl_loss_4": 1257.310107421875, "kl_loss_9": 478.0802993774414, "learning_rate": 9.925445774069231e-05, "loss": 973.4955, "step": 7980 }, { "ce_loss_13": 3.2569035410881044, "ce_loss_17": 3.203923428058624, "ce_loss_2": 4.185257935523987, "ce_loss_4": 3.758824324607849, "ce_loss_9": 3.4019472122192385, "epoch": 0.799, "grad_norm": 1352.0, "kl_loss_13": 128.38563804626466, "kl_loss_2": 2072.222589111328, "kl_loss_4": 1249.5017028808593, "kl_loss_9": 475.911930847168, "learning_rate": 9.830764196878872e-05, "loss": 970.5723, "step": 7990 }, { "ce_loss_13": 3.204274308681488, "ce_loss_17": 3.1562834978103638, "ce_loss_2": 4.139583873748779, "ce_loss_4": 3.7099375247955324, "ce_loss_9": 3.3487107038497923, "epoch": 0.8, "grad_norm": 848.0, "kl_loss_13": 125.36464805603028, "kl_loss_2": 2138.724267578125, "kl_loss_4": 1282.1040405273438, "kl_loss_9": 476.3313705444336, "learning_rate": 9.736487123447069e-05, "loss": 996.3294, "step": 8000 }, { "ce_loss_13": 3.1565216183662415, "ce_loss_17": 3.1069478750228883, "ce_loss_2": 4.14747828245163, "ce_loss_4": 3.6925066709518433, "ce_loss_9": 3.301124632358551, "epoch": 0.801, "grad_norm": 920.0, "kl_loss_13": 127.88602828979492, "kl_loss_2": 2238.6373046875, "kl_loss_4": 1330.9870300292969, "kl_loss_9": 479.3197219848633, "learning_rate": 9.642615503142926e-05, "loss": 1022.6663, "step": 8010 }, { "ce_loss_13": 3.214006412029266, "ce_loss_17": 3.165544807910919, "ce_loss_2": 4.162649738788605, "ce_loss_4": 3.720465838909149, "ce_loss_9": 3.360601317882538, "epoch": 0.802, "grad_norm": 1088.0, "kl_loss_13": 126.05521697998047, "kl_loss_2": 2126.1510620117188, "kl_loss_4": 1261.9051818847656, "kl_loss_9": 470.97898864746094, "learning_rate": 9.549150281252633e-05, "loss": 982.9814, "step": 8020 }, { "ce_loss_13": 3.2445815801620483, "ce_loss_17": 3.193167281150818, "ce_loss_2": 4.174015378952026, "ce_loss_4": 3.7480205297470093, "ce_loss_9": 3.388844203948975, "epoch": 0.803, "grad_norm": 1048.0, "kl_loss_13": 127.22411231994629, "kl_loss_2": 2110.5955200195312, "kl_loss_4": 1259.428399658203, "kl_loss_9": 476.8070983886719, "learning_rate": 9.4560923989699e-05, "loss": 1004.0042, "step": 8030 }, { "ce_loss_13": 3.2361866354942324, "ce_loss_17": 3.1863423347473145, "ce_loss_2": 4.163020217418671, "ce_loss_4": 3.738401448726654, "ce_loss_9": 3.3825843691825868, "epoch": 0.804, "grad_norm": 1224.0, "kl_loss_13": 127.82188873291015, "kl_loss_2": 2080.760119628906, "kl_loss_4": 1248.896044921875, "kl_loss_9": 476.63584594726564, "learning_rate": 9.363442793386607e-05, "loss": 1006.4851, "step": 8040 }, { "ce_loss_13": 3.20820951461792, "ce_loss_17": 3.1552165508270265, "ce_loss_2": 4.179384446144104, "ce_loss_4": 3.7377692461013794, "ce_loss_9": 3.359268617630005, "epoch": 0.805, "grad_norm": 1688.0, "kl_loss_13": 129.69084663391112, "kl_loss_2": 2159.6863647460937, "kl_loss_4": 1300.0879516601562, "kl_loss_9": 488.3713012695313, "learning_rate": 9.271202397483213e-05, "loss": 988.8803, "step": 8050 }, { "ce_loss_13": 3.239323115348816, "ce_loss_17": 3.1904271125793455, "ce_loss_2": 4.157148838043213, "ce_loss_4": 3.726734757423401, "ce_loss_9": 3.380331075191498, "epoch": 0.806, "grad_norm": 1248.0, "kl_loss_13": 124.55978889465332, "kl_loss_2": 2069.6554443359373, "kl_loss_4": 1232.3144165039062, "kl_loss_9": 467.20772552490234, "learning_rate": 9.179372140119524e-05, "loss": 994.5643, "step": 8060 }, { "ce_loss_13": 3.1790022373199465, "ce_loss_17": 3.1302085757255553, "ce_loss_2": 4.108354699611664, "ce_loss_4": 3.68483704328537, "ce_loss_9": 3.3257157444953918, "epoch": 0.807, "grad_norm": 1032.0, "kl_loss_13": 125.43813591003418, "kl_loss_2": 2092.418096923828, "kl_loss_4": 1249.7414916992188, "kl_loss_9": 471.3716842651367, "learning_rate": 9.087952946025175e-05, "loss": 1002.8725, "step": 8070 }, { "ce_loss_13": 3.292525851726532, "ce_loss_17": 3.243098485469818, "ce_loss_2": 4.1809345245361325, "ce_loss_4": 3.761371874809265, "ce_loss_9": 3.429012668132782, "epoch": 0.808, "grad_norm": 1408.0, "kl_loss_13": 124.30414505004883, "kl_loss_2": 2029.1312377929687, "kl_loss_4": 1207.5740966796875, "kl_loss_9": 460.64671020507814, "learning_rate": 8.996945735790446e-05, "loss": 983.9166, "step": 8080 }, { "ce_loss_13": 3.1915315628051757, "ce_loss_17": 3.141726863384247, "ce_loss_2": 4.111777424812317, "ce_loss_4": 3.6927315831184386, "ce_loss_9": 3.334754204750061, "epoch": 0.809, "grad_norm": 1136.0, "kl_loss_13": 126.1311695098877, "kl_loss_2": 2105.6974243164063, "kl_loss_4": 1274.2767761230468, "kl_loss_9": 476.5439514160156, "learning_rate": 8.906351425856951e-05, "loss": 1003.1598, "step": 8090 }, { "ce_loss_13": 3.1765520334243775, "ce_loss_17": 3.12770094871521, "ce_loss_2": 4.129962289333344, "ce_loss_4": 3.69805029630661, "ce_loss_9": 3.3253371119499207, "epoch": 0.81, "grad_norm": 1096.0, "kl_loss_13": 127.69316215515137, "kl_loss_2": 2145.4662231445313, "kl_loss_4": 1298.3688903808593, "kl_loss_9": 482.0610290527344, "learning_rate": 8.816170928508365e-05, "loss": 1008.3583, "step": 8100 }, { "ce_loss_13": 3.140737473964691, "ce_loss_17": 3.0913586020469666, "ce_loss_2": 4.119320940971375, "ce_loss_4": 3.664652967453003, "ce_loss_9": 3.2921489119529723, "epoch": 0.811, "grad_norm": 980.0, "kl_loss_13": 127.36379470825196, "kl_loss_2": 2185.985949707031, "kl_loss_4": 1299.5416931152345, "kl_loss_9": 483.8924835205078, "learning_rate": 8.7264051518613e-05, "loss": 1007.6236, "step": 8110 }, { "ce_loss_13": 3.2289447665214537, "ce_loss_17": 3.1823450446128847, "ce_loss_2": 4.140631890296936, "ce_loss_4": 3.722241234779358, "ce_loss_9": 3.370582389831543, "epoch": 0.812, "grad_norm": 920.0, "kl_loss_13": 123.79982299804688, "kl_loss_2": 2053.2001403808595, "kl_loss_4": 1236.4767333984375, "kl_loss_9": 463.91027374267577, "learning_rate": 8.637054999856148e-05, "loss": 983.2399, "step": 8120 }, { "ce_loss_13": 3.2212143898010255, "ce_loss_17": 3.1681386709213255, "ce_loss_2": 4.153304266929626, "ce_loss_4": 3.7268579483032225, "ce_loss_9": 3.3677327036857605, "epoch": 0.813, "grad_norm": 1040.0, "kl_loss_13": 128.18593482971193, "kl_loss_2": 2105.4622314453127, "kl_loss_4": 1265.7773315429688, "kl_loss_9": 477.4667999267578, "learning_rate": 8.548121372247918e-05, "loss": 1005.0689, "step": 8130 }, { "ce_loss_13": 3.287048316001892, "ce_loss_17": 3.237959456443787, "ce_loss_2": 4.197725868225097, "ce_loss_4": 3.7694886803627012, "ce_loss_9": 3.423752284049988, "epoch": 0.814, "grad_norm": 1056.0, "kl_loss_13": 125.91325149536132, "kl_loss_2": 2083.3807678222656, "kl_loss_4": 1237.3005310058593, "kl_loss_9": 465.39888153076174, "learning_rate": 8.459605164597267e-05, "loss": 982.3264, "step": 8140 }, { "ce_loss_13": 3.1721022963523864, "ce_loss_17": 3.1242167592048644, "ce_loss_2": 4.1157526016235355, "ce_loss_4": 3.6828698039054872, "ce_loss_9": 3.316565549373627, "epoch": 0.815, "grad_norm": 912.0, "kl_loss_13": 125.71317977905274, "kl_loss_2": 2113.949322509766, "kl_loss_4": 1262.9661376953125, "kl_loss_9": 469.63636627197263, "learning_rate": 8.371507268261436e-05, "loss": 997.232, "step": 8150 }, { "ce_loss_13": 3.246817874908447, "ce_loss_17": 3.1956051349639893, "ce_loss_2": 4.175493860244751, "ce_loss_4": 3.7497007966041567, "ce_loss_9": 3.3897750854492186, "epoch": 0.816, "grad_norm": 888.0, "kl_loss_13": 127.12842330932617, "kl_loss_2": 2096.1856323242187, "kl_loss_4": 1255.7391967773438, "kl_loss_9": 475.40001068115237, "learning_rate": 8.283828570385238e-05, "loss": 974.9973, "step": 8160 }, { "ce_loss_13": 3.2430410385131836, "ce_loss_17": 3.191920280456543, "ce_loss_2": 4.169815516471862, "ce_loss_4": 3.7445862174034117, "ce_loss_9": 3.386352062225342, "epoch": 0.817, "grad_norm": 1120.0, "kl_loss_13": 126.56979026794434, "kl_loss_2": 2062.275427246094, "kl_loss_4": 1237.4530639648438, "kl_loss_9": 470.1497406005859, "learning_rate": 8.196569953892202e-05, "loss": 983.8533, "step": 8170 }, { "ce_loss_13": 3.1701053857803343, "ce_loss_17": 3.1187949419021606, "ce_loss_2": 4.111348032951355, "ce_loss_4": 3.678385090827942, "ce_loss_9": 3.314553141593933, "epoch": 0.818, "grad_norm": 1128.0, "kl_loss_13": 127.51042594909669, "kl_loss_2": 2099.4003295898438, "kl_loss_4": 1262.1439819335938, "kl_loss_9": 477.9965316772461, "learning_rate": 8.109732297475635e-05, "loss": 985.4639, "step": 8180 }, { "ce_loss_13": 3.1423016667366026, "ce_loss_17": 3.0867604494094847, "ce_loss_2": 4.153495299816131, "ce_loss_4": 3.6957584261894225, "ce_loss_9": 3.2995389342308044, "epoch": 0.819, "grad_norm": 1160.0, "kl_loss_13": 131.07748069763184, "kl_loss_2": 2218.056298828125, "kl_loss_4": 1332.5085388183593, "kl_loss_9": 501.1491760253906, "learning_rate": 8.023316475589754e-05, "loss": 1023.5213, "step": 8190 }, { "ce_loss_13": 3.1085451006889344, "ce_loss_17": 3.0530003547668456, "ce_loss_2": 4.144724249839783, "ce_loss_4": 3.667283225059509, "ce_loss_9": 3.2687911868095396, "epoch": 0.82, "grad_norm": 1600.0, "kl_loss_13": 133.31379203796388, "kl_loss_2": 2280.504187011719, "kl_loss_4": 1355.7112976074218, "kl_loss_9": 504.11670837402346, "learning_rate": 7.937323358440934e-05, "loss": 1038.617, "step": 8200 }, { "ce_loss_13": 3.226624619960785, "ce_loss_17": 3.180790066719055, "ce_loss_2": 4.1268556952476505, "ce_loss_4": 3.714967429637909, "ce_loss_9": 3.364724588394165, "epoch": 0.821, "grad_norm": 892.0, "kl_loss_13": 124.42786140441895, "kl_loss_2": 2048.5586669921877, "kl_loss_4": 1237.1432189941406, "kl_loss_9": 466.97570343017577, "learning_rate": 7.851753811978923e-05, "loss": 981.6991, "step": 8210 }, { "ce_loss_13": 3.2426247358322144, "ce_loss_17": 3.1914229869842528, "ce_loss_2": 4.191365051269531, "ce_loss_4": 3.7572955965995787, "ce_loss_9": 3.3855606436729433, "epoch": 0.822, "grad_norm": 952.0, "kl_loss_13": 128.2224449157715, "kl_loss_2": 2130.727642822266, "kl_loss_4": 1282.9359619140625, "kl_loss_9": 477.17317962646484, "learning_rate": 7.766608697888095e-05, "loss": 989.7021, "step": 8220 }, { "ce_loss_13": 3.253460681438446, "ce_loss_17": 3.2030975222587585, "ce_loss_2": 4.197035622596741, "ce_loss_4": 3.7567020297050475, "ce_loss_9": 3.3987293362617494, "epoch": 0.823, "grad_norm": 952.0, "kl_loss_13": 128.83656311035156, "kl_loss_2": 2139.1093322753904, "kl_loss_4": 1279.9349487304687, "kl_loss_9": 483.03794250488284, "learning_rate": 7.681888873578785e-05, "loss": 1009.851, "step": 8230 }, { "ce_loss_13": 3.1833924889564513, "ce_loss_17": 3.1301746726036073, "ce_loss_2": 4.148987996578216, "ce_loss_4": 3.7055625796318052, "ce_loss_9": 3.3394267082214357, "epoch": 0.824, "grad_norm": 1056.0, "kl_loss_13": 130.2201457977295, "kl_loss_2": 2160.3596069335936, "kl_loss_4": 1288.2772521972656, "kl_loss_9": 486.61197052001955, "learning_rate": 7.597595192178702e-05, "loss": 998.367, "step": 8240 }, { "ce_loss_13": 3.180142366886139, "ce_loss_17": 3.1281127452850344, "ce_loss_2": 4.159886491298676, "ce_loss_4": 3.71297425031662, "ce_loss_9": 3.332278788089752, "epoch": 0.825, "grad_norm": 1000.0, "kl_loss_13": 130.03518295288086, "kl_loss_2": 2205.4490173339846, "kl_loss_4": 1320.2739562988281, "kl_loss_9": 491.71746215820315, "learning_rate": 7.513728502524286e-05, "loss": 1020.7828, "step": 8250 }, { "ce_loss_13": 3.182576823234558, "ce_loss_17": 3.135728359222412, "ce_loss_2": 4.1087141871452335, "ce_loss_4": 3.67440345287323, "ce_loss_9": 3.323219323158264, "epoch": 0.826, "grad_norm": 1176.0, "kl_loss_13": 121.72067947387696, "kl_loss_2": 2081.279437255859, "kl_loss_4": 1236.0435668945313, "kl_loss_9": 461.9503463745117, "learning_rate": 7.430289649152156e-05, "loss": 996.9047, "step": 8260 }, { "ce_loss_13": 3.09291867017746, "ce_loss_17": 3.043274295330048, "ce_loss_2": 4.07514499425888, "ce_loss_4": 3.6362467646598815, "ce_loss_9": 3.2507020950317385, "epoch": 0.827, "grad_norm": 1032.0, "kl_loss_13": 128.74892807006836, "kl_loss_2": 2206.761828613281, "kl_loss_4": 1328.0462646484375, "kl_loss_9": 488.0947967529297, "learning_rate": 7.347279472290646e-05, "loss": 1007.4377, "step": 8270 }, { "ce_loss_13": 3.228681480884552, "ce_loss_17": 3.1775402545928957, "ce_loss_2": 4.1790827989578245, "ce_loss_4": 3.73972247838974, "ce_loss_9": 3.3739606738090515, "epoch": 0.828, "grad_norm": 984.0, "kl_loss_13": 126.37668495178222, "kl_loss_2": 2128.5212280273436, "kl_loss_4": 1277.1488525390625, "kl_loss_9": 475.32989196777345, "learning_rate": 7.264698807851328e-05, "loss": 1002.408, "step": 8280 }, { "ce_loss_13": 3.202029120922089, "ce_loss_17": 3.155063569545746, "ce_loss_2": 4.11312050819397, "ce_loss_4": 3.6948535323143004, "ce_loss_9": 3.3443856239318848, "epoch": 0.829, "grad_norm": 884.0, "kl_loss_13": 123.74527702331542, "kl_loss_2": 2066.8341064453125, "kl_loss_4": 1239.1205505371095, "kl_loss_9": 466.5020278930664, "learning_rate": 7.182548487420554e-05, "loss": 987.3731, "step": 8290 }, { "ce_loss_13": 3.2461774349212646, "ce_loss_17": 3.196709167957306, "ce_loss_2": 4.1702738285064695, "ce_loss_4": 3.747819209098816, "ce_loss_9": 3.392817437648773, "epoch": 0.83, "grad_norm": 952.0, "kl_loss_13": 128.17520065307616, "kl_loss_2": 2096.796044921875, "kl_loss_4": 1263.9498046875, "kl_loss_9": 479.71557769775393, "learning_rate": 7.100829338251146e-05, "loss": 991.0747, "step": 8300 }, { "ce_loss_13": 3.1831459760665894, "ce_loss_17": 3.129330635070801, "ce_loss_2": 4.146289324760437, "ce_loss_4": 3.710370886325836, "ce_loss_9": 3.333435320854187, "epoch": 0.831, "grad_norm": 1720.0, "kl_loss_13": 130.27710647583007, "kl_loss_2": 2146.3361206054688, "kl_loss_4": 1289.3548706054687, "kl_loss_9": 485.4897033691406, "learning_rate": 7.019542183254046e-05, "loss": 991.4107, "step": 8310 }, { "ce_loss_13": 3.226519453525543, "ce_loss_17": 3.1718310356140136, "ce_loss_2": 4.149910616874695, "ce_loss_4": 3.723079574108124, "ce_loss_9": 3.3704106092453, "epoch": 0.832, "grad_norm": 1064.0, "kl_loss_13": 132.9045612335205, "kl_loss_2": 2104.517706298828, "kl_loss_4": 1259.8415832519531, "kl_loss_9": 484.4057922363281, "learning_rate": 6.938687840989971e-05, "loss": 991.9902, "step": 8320 }, { "ce_loss_13": 3.1654950380325317, "ce_loss_17": 3.114250290393829, "ce_loss_2": 4.106305718421936, "ce_loss_4": 3.6778552412986754, "ce_loss_9": 3.312594699859619, "epoch": 0.833, "grad_norm": 1152.0, "kl_loss_13": 129.5602699279785, "kl_loss_2": 2109.0392211914063, "kl_loss_4": 1273.910382080078, "kl_loss_9": 480.05247955322267, "learning_rate": 6.858267125661271e-05, "loss": 1007.8496, "step": 8330 }, { "ce_loss_13": 3.222464072704315, "ce_loss_17": 3.1709493041038512, "ce_loss_2": 4.161825239658356, "ce_loss_4": 3.7274927616119387, "ce_loss_9": 3.3714067459106447, "epoch": 0.834, "grad_norm": 1008.0, "kl_loss_13": 126.04385490417481, "kl_loss_2": 2100.5596618652344, "kl_loss_4": 1258.2550109863282, "kl_loss_9": 475.0465805053711, "learning_rate": 6.778280847103668e-05, "loss": 1013.5768, "step": 8340 }, { "ce_loss_13": 3.2283169984817506, "ce_loss_17": 3.174326276779175, "ce_loss_2": 4.157098972797394, "ce_loss_4": 3.7347694873809814, "ce_loss_9": 3.3756808638572693, "epoch": 0.835, "grad_norm": 1520.0, "kl_loss_13": 129.5789581298828, "kl_loss_2": 2112.519494628906, "kl_loss_4": 1281.607000732422, "kl_loss_9": 487.7536087036133, "learning_rate": 6.698729810778065e-05, "loss": 995.7158, "step": 8350 }, { "ce_loss_13": 3.1411638259887695, "ce_loss_17": 3.0904542565345765, "ce_loss_2": 4.098214209079742, "ce_loss_4": 3.6583452224731445, "ce_loss_9": 3.2883453249931334, "epoch": 0.836, "grad_norm": 1432.0, "kl_loss_13": 125.058740234375, "kl_loss_2": 2133.97958984375, "kl_loss_4": 1277.0558837890626, "kl_loss_9": 478.1378799438477, "learning_rate": 6.619614817762538e-05, "loss": 1005.7371, "step": 8360 }, { "ce_loss_13": 3.115946328639984, "ce_loss_17": 3.0638095855712892, "ce_loss_2": 4.121069860458374, "ce_loss_4": 3.6633601427078246, "ce_loss_9": 3.2699984073638917, "epoch": 0.837, "grad_norm": 1144.0, "kl_loss_13": 127.8916404724121, "kl_loss_2": 2242.4721130371095, "kl_loss_4": 1343.97646484375, "kl_loss_9": 493.2196807861328, "learning_rate": 6.540936664744196e-05, "loss": 1024.0834, "step": 8370 }, { "ce_loss_13": 3.249848222732544, "ce_loss_17": 3.1980754017829893, "ce_loss_2": 4.20130842924118, "ce_loss_4": 3.7598718404769897, "ce_loss_9": 3.396473264694214, "epoch": 0.838, "grad_norm": 824.0, "kl_loss_13": 128.7819030761719, "kl_loss_2": 2131.275244140625, "kl_loss_4": 1273.7136657714843, "kl_loss_9": 480.4968597412109, "learning_rate": 6.462696144011149e-05, "loss": 990.5449, "step": 8380 }, { "ce_loss_13": 3.2049333333969114, "ce_loss_17": 3.1557726502418517, "ce_loss_2": 4.130917632579804, "ce_loss_4": 3.713854670524597, "ce_loss_9": 3.3502824306488037, "epoch": 0.839, "grad_norm": 1360.0, "kl_loss_13": 129.40373153686522, "kl_loss_2": 2104.8400695800783, "kl_loss_4": 1277.5943908691406, "kl_loss_9": 484.39180145263674, "learning_rate": 6.384894043444567e-05, "loss": 983.9594, "step": 8390 }, { "ce_loss_13": 3.232502615451813, "ce_loss_17": 3.1830275416374207, "ce_loss_2": 4.182047176361084, "ce_loss_4": 3.748951864242554, "ce_loss_9": 3.3769433736801147, "epoch": 0.84, "grad_norm": 1120.0, "kl_loss_13": 127.51888313293458, "kl_loss_2": 2121.50625, "kl_loss_4": 1277.4314270019531, "kl_loss_9": 477.809748840332, "learning_rate": 6.307531146510753e-05, "loss": 994.931, "step": 8400 }, { "ce_loss_13": 3.208201026916504, "ce_loss_17": 3.157346510887146, "ce_loss_2": 4.114724183082581, "ce_loss_4": 3.7055205583572386, "ce_loss_9": 3.3508978486061096, "epoch": 0.841, "grad_norm": 1152.0, "kl_loss_13": 127.08865966796876, "kl_loss_2": 2051.7865966796876, "kl_loss_4": 1242.8426727294923, "kl_loss_9": 472.36038665771486, "learning_rate": 6.230608232253226e-05, "loss": 970.267, "step": 8410 }, { "ce_loss_13": 3.1662453055381774, "ce_loss_17": 3.114821898937225, "ce_loss_2": 4.15137482881546, "ce_loss_4": 3.7088841438293456, "ce_loss_9": 3.3174098253250124, "epoch": 0.842, "grad_norm": 1320.0, "kl_loss_13": 127.68940696716308, "kl_loss_2": 2184.6794311523436, "kl_loss_4": 1316.0989074707031, "kl_loss_9": 489.0395141601563, "learning_rate": 6.154126075284855e-05, "loss": 999.2672, "step": 8420 }, { "ce_loss_13": 3.256260704994202, "ce_loss_17": 3.2074158430099486, "ce_loss_2": 4.165883505344391, "ce_loss_4": 3.7552166223526, "ce_loss_9": 3.3978078246116636, "epoch": 0.843, "grad_norm": 1056.0, "kl_loss_13": 124.37138862609864, "kl_loss_2": 2037.5476989746094, "kl_loss_4": 1238.5897888183595, "kl_loss_9": 465.6286026000977, "learning_rate": 6.078085445780129e-05, "loss": 968.5152, "step": 8430 }, { "ce_loss_13": 3.264948070049286, "ce_loss_17": 3.2132066249847413, "ce_loss_2": 4.206857228279114, "ce_loss_4": 3.7718899846076965, "ce_loss_9": 3.410208582878113, "epoch": 0.844, "grad_norm": 1136.0, "kl_loss_13": 129.08828010559083, "kl_loss_2": 2132.2358337402343, "kl_loss_4": 1272.34697265625, "kl_loss_9": 478.4906066894531, "learning_rate": 6.002487109467347e-05, "loss": 983.2783, "step": 8440 }, { "ce_loss_13": 3.2689123153686523, "ce_loss_17": 3.216715896129608, "ce_loss_2": 4.1897262811660765, "ce_loss_4": 3.7696678280830382, "ce_loss_9": 3.4159563183784485, "epoch": 0.845, "grad_norm": 1368.0, "kl_loss_13": 129.72224464416504, "kl_loss_2": 2093.787878417969, "kl_loss_4": 1265.2806579589844, "kl_loss_9": 486.01112670898436, "learning_rate": 5.927331827620902e-05, "loss": 984.2834, "step": 8450 }, { "ce_loss_13": 3.2551909923553466, "ce_loss_17": 3.203092074394226, "ce_loss_2": 4.146451878547668, "ce_loss_4": 3.7350058913230897, "ce_loss_9": 3.395671319961548, "epoch": 0.846, "grad_norm": 1020.0, "kl_loss_13": 125.13777008056641, "kl_loss_2": 2025.831768798828, "kl_loss_4": 1221.0529052734375, "kl_loss_9": 466.49583435058594, "learning_rate": 5.852620357053651e-05, "loss": 978.8377, "step": 8460 }, { "ce_loss_13": 3.2912203311920165, "ce_loss_17": 3.243688333034515, "ce_loss_2": 4.196159589290619, "ce_loss_4": 3.784270453453064, "ce_loss_9": 3.4343780875205994, "epoch": 0.847, "grad_norm": 1012.0, "kl_loss_13": 124.77928199768067, "kl_loss_2": 2052.5224487304686, "kl_loss_4": 1236.8224182128906, "kl_loss_9": 470.0700927734375, "learning_rate": 5.778353450109286e-05, "loss": 982.5012, "step": 8470 }, { "ce_loss_13": 3.325550174713135, "ce_loss_17": 3.2735902190208437, "ce_loss_2": 4.264748060703278, "ce_loss_4": 3.830344486236572, "ce_loss_9": 3.4723749995231628, "epoch": 0.848, "grad_norm": 864.0, "kl_loss_13": 129.32002944946288, "kl_loss_2": 2113.9170043945314, "kl_loss_4": 1266.4134338378906, "kl_loss_9": 480.98582611083987, "learning_rate": 5.7045318546547206e-05, "loss": 988.6468, "step": 8480 }, { "ce_loss_13": 3.2200080156326294, "ce_loss_17": 3.1703635573387148, "ce_loss_2": 4.168017518520355, "ce_loss_4": 3.7376600384712217, "ce_loss_9": 3.372846233844757, "epoch": 0.849, "grad_norm": 1408.0, "kl_loss_13": 126.75526733398438, "kl_loss_2": 2133.253063964844, "kl_loss_4": 1278.94619140625, "kl_loss_9": 478.3946533203125, "learning_rate": 5.631156314072605e-05, "loss": 987.0431, "step": 8490 }, { "ce_loss_13": 3.2455901384353636, "ce_loss_17": 3.1961548566818236, "ce_loss_2": 4.145925235748291, "ce_loss_4": 3.7316495776176453, "ce_loss_9": 3.3879377603530885, "epoch": 0.85, "grad_norm": 1152.0, "kl_loss_13": 125.72395133972168, "kl_loss_2": 2056.9944702148437, "kl_loss_4": 1234.8834716796875, "kl_loss_9": 470.5209045410156, "learning_rate": 5.5582275672538315e-05, "loss": 976.6285, "step": 8500 }, { "ce_loss_13": 3.161041462421417, "ce_loss_17": 3.109645414352417, "ce_loss_2": 4.157955026626587, "ce_loss_4": 3.7063544034957885, "ce_loss_9": 3.3146220445632935, "epoch": 0.851, "grad_norm": 1192.0, "kl_loss_13": 129.59199409484864, "kl_loss_2": 2215.375451660156, "kl_loss_4": 1336.8784729003905, "kl_loss_9": 491.09006652832034, "learning_rate": 5.4857463485900484e-05, "loss": 1018.3746, "step": 8510 }, { "ce_loss_13": 3.2192330241203306, "ce_loss_17": 3.1671828627586365, "ce_loss_2": 4.1410913348197935, "ce_loss_4": 3.7218037366867067, "ce_loss_9": 3.3653714179992678, "epoch": 0.852, "grad_norm": 1168.0, "kl_loss_13": 126.36544914245606, "kl_loss_2": 2087.4425048828125, "kl_loss_4": 1256.467303466797, "kl_loss_9": 476.610417175293, "learning_rate": 5.413713387966329e-05, "loss": 985.3994, "step": 8520 }, { "ce_loss_13": 3.2317323088645935, "ce_loss_17": 3.1797499895095824, "ce_loss_2": 4.180454099178315, "ce_loss_4": 3.746845316886902, "ce_loss_9": 3.3752941131591796, "epoch": 0.853, "grad_norm": 1144.0, "kl_loss_13": 127.6666088104248, "kl_loss_2": 2127.820910644531, "kl_loss_4": 1272.0194213867187, "kl_loss_9": 474.3869293212891, "learning_rate": 5.34212941075381e-05, "loss": 997.2857, "step": 8530 }, { "ce_loss_13": 3.247115230560303, "ce_loss_17": 3.2000760674476623, "ce_loss_2": 4.154358160495758, "ce_loss_4": 3.7302248954772947, "ce_loss_9": 3.384089207649231, "epoch": 0.854, "grad_norm": 960.0, "kl_loss_13": 123.00884552001953, "kl_loss_2": 2055.8799682617187, "kl_loss_4": 1222.059033203125, "kl_loss_9": 458.5852844238281, "learning_rate": 5.270995137802315e-05, "loss": 975.316, "step": 8540 }, { "ce_loss_13": 3.1839019536972044, "ce_loss_17": 3.1360046029090882, "ce_loss_2": 4.115815722942353, "ce_loss_4": 3.6813586354255676, "ce_loss_9": 3.3273687839508055, "epoch": 0.855, "grad_norm": 1048.0, "kl_loss_13": 124.62592010498047, "kl_loss_2": 2112.1836853027344, "kl_loss_4": 1262.5085083007812, "kl_loss_9": 474.87132720947267, "learning_rate": 5.2003112854332125e-05, "loss": 996.2535, "step": 8550 }, { "ce_loss_13": 3.180336606502533, "ce_loss_17": 3.1313059091567994, "ce_loss_2": 4.102069139480591, "ce_loss_4": 3.679996120929718, "ce_loss_9": 3.320179009437561, "epoch": 0.856, "grad_norm": 1064.0, "kl_loss_13": 124.25171737670898, "kl_loss_2": 2095.1885498046877, "kl_loss_4": 1266.2832092285157, "kl_loss_9": 471.6594039916992, "learning_rate": 5.130078565432089e-05, "loss": 975.1686, "step": 8560 }, { "ce_loss_13": 3.2431489706039427, "ce_loss_17": 3.1956213235855104, "ce_loss_2": 4.143830144405365, "ce_loss_4": 3.7313476324081423, "ce_loss_9": 3.3797136545181274, "epoch": 0.857, "grad_norm": 1536.0, "kl_loss_13": 123.3634017944336, "kl_loss_2": 2061.9453002929686, "kl_loss_4": 1242.9366455078125, "kl_loss_9": 461.8501510620117, "learning_rate": 5.060297685041659e-05, "loss": 967.0191, "step": 8570 }, { "ce_loss_13": 3.1751491904258726, "ce_loss_17": 3.1232897281646728, "ce_loss_2": 4.134636676311493, "ce_loss_4": 3.688388764858246, "ce_loss_9": 3.3248652815818787, "epoch": 0.858, "grad_norm": 920.0, "kl_loss_13": 129.5445785522461, "kl_loss_2": 2152.307482910156, "kl_loss_4": 1279.9777587890626, "kl_loss_9": 483.5591354370117, "learning_rate": 4.99096934695461e-05, "loss": 1011.6703, "step": 8580 }, { "ce_loss_13": 3.2341961741447447, "ce_loss_17": 3.1844797492027284, "ce_loss_2": 4.173639976978302, "ce_loss_4": 3.7414966940879824, "ce_loss_9": 3.3768649101257324, "epoch": 0.859, "grad_norm": 1208.0, "kl_loss_13": 125.31369743347167, "kl_loss_2": 2098.755267333984, "kl_loss_4": 1261.5017822265625, "kl_loss_9": 472.1582504272461, "learning_rate": 4.922094249306558e-05, "loss": 978.4313, "step": 8590 }, { "ce_loss_13": 3.268276047706604, "ce_loss_17": 3.219838559627533, "ce_loss_2": 4.201024615764618, "ce_loss_4": 3.7704820156097414, "ce_loss_9": 3.4139145493507383, "epoch": 0.86, "grad_norm": 1320.0, "kl_loss_13": 128.71006774902344, "kl_loss_2": 2105.2793518066405, "kl_loss_4": 1261.836163330078, "kl_loss_9": 481.61446075439454, "learning_rate": 4.853673085668947e-05, "loss": 975.468, "step": 8600 }, { "ce_loss_13": 3.285378623008728, "ce_loss_17": 3.234960913658142, "ce_loss_2": 4.216368556022644, "ce_loss_4": 3.783410882949829, "ce_loss_9": 3.424812173843384, "epoch": 0.861, "grad_norm": 1120.0, "kl_loss_13": 125.52713813781739, "kl_loss_2": 2106.820635986328, "kl_loss_4": 1259.9982299804688, "kl_loss_9": 473.1084777832031, "learning_rate": 4.78570654504214e-05, "loss": 993.9369, "step": 8610 }, { "ce_loss_13": 3.2329298734664915, "ce_loss_17": 3.183330309391022, "ce_loss_2": 4.166739642620087, "ce_loss_4": 3.7482675194740294, "ce_loss_9": 3.3793174982070924, "epoch": 0.862, "grad_norm": 968.0, "kl_loss_13": 126.2738468170166, "kl_loss_2": 2117.8194885253906, "kl_loss_4": 1282.2892456054688, "kl_loss_9": 477.62606353759764, "learning_rate": 4.7181953118484556e-05, "loss": 995.9761, "step": 8620 }, { "ce_loss_13": 3.2551379919052126, "ce_loss_17": 3.2071982860565185, "ce_loss_2": 4.168368661403656, "ce_loss_4": 3.749767470359802, "ce_loss_9": 3.3939077496528625, "epoch": 0.863, "grad_norm": 884.0, "kl_loss_13": 124.79238548278809, "kl_loss_2": 2036.817645263672, "kl_loss_4": 1235.8255004882812, "kl_loss_9": 466.5895751953125, "learning_rate": 4.651140065925269e-05, "loss": 994.0337, "step": 8630 }, { "ce_loss_13": 3.1938095092773438, "ce_loss_17": 3.1430015563964844, "ce_loss_2": 4.122222590446472, "ce_loss_4": 3.6837912678718565, "ce_loss_9": 3.334272301197052, "epoch": 0.864, "grad_norm": 1064.0, "kl_loss_13": 125.90898399353027, "kl_loss_2": 2105.9575744628905, "kl_loss_4": 1251.2004028320312, "kl_loss_9": 473.3047103881836, "learning_rate": 4.58454148251814e-05, "loss": 997.4364, "step": 8640 }, { "ce_loss_13": 3.2039818286895754, "ce_loss_17": 3.1515699863433837, "ce_loss_2": 4.164625298976898, "ce_loss_4": 3.7279044508934023, "ce_loss_9": 3.353543698787689, "epoch": 0.865, "grad_norm": 1216.0, "kl_loss_13": 126.02554054260254, "kl_loss_2": 2147.234686279297, "kl_loss_4": 1290.7346435546874, "kl_loss_9": 478.8132888793945, "learning_rate": 4.518400232274078e-05, "loss": 993.4934, "step": 8650 }, { "ce_loss_13": 3.230000042915344, "ce_loss_17": 3.177415680885315, "ce_loss_2": 4.158129477500916, "ce_loss_4": 3.735176920890808, "ce_loss_9": 3.3776259779930116, "epoch": 0.866, "grad_norm": 1096.0, "kl_loss_13": 127.42555541992188, "kl_loss_2": 2093.7327087402346, "kl_loss_4": 1263.7172912597657, "kl_loss_9": 477.8423767089844, "learning_rate": 4.452716981234745e-05, "loss": 968.3529, "step": 8660 }, { "ce_loss_13": 3.2046499490737914, "ce_loss_17": 3.1577785849571227, "ce_loss_2": 4.12251797914505, "ce_loss_4": 3.702546274662018, "ce_loss_9": 3.346600341796875, "epoch": 0.867, "grad_norm": 968.0, "kl_loss_13": 122.5515968322754, "kl_loss_2": 2066.757708740234, "kl_loss_4": 1245.1973388671875, "kl_loss_9": 464.10980834960935, "learning_rate": 4.3874923908297335e-05, "loss": 965.6994, "step": 8670 }, { "ce_loss_13": 3.255208504199982, "ce_loss_17": 3.2043866753578185, "ce_loss_2": 4.198356425762176, "ce_loss_4": 3.7662445425987245, "ce_loss_9": 3.399057853221893, "epoch": 0.868, "grad_norm": 920.0, "kl_loss_13": 127.82231521606445, "kl_loss_2": 2124.005615234375, "kl_loss_4": 1276.7696044921875, "kl_loss_9": 477.6804626464844, "learning_rate": 4.322727117869951e-05, "loss": 988.4502, "step": 8680 }, { "ce_loss_13": 3.2642483472824098, "ce_loss_17": 3.2134419202804567, "ce_loss_2": 4.197689819335937, "ce_loss_4": 3.768456315994263, "ce_loss_9": 3.4078236699104307, "epoch": 0.869, "grad_norm": 1904.0, "kl_loss_13": 127.22393608093262, "kl_loss_2": 2120.3073974609374, "kl_loss_4": 1269.5004028320313, "kl_loss_9": 477.0129425048828, "learning_rate": 4.2584218145409916e-05, "loss": 984.9196, "step": 8690 }, { "ce_loss_13": 3.3035953879356383, "ce_loss_17": 3.2540407538414002, "ce_loss_2": 4.194243943691253, "ce_loss_4": 3.7808672904968263, "ce_loss_9": 3.4402400135993956, "epoch": 0.87, "grad_norm": 924.0, "kl_loss_13": 123.82973365783691, "kl_loss_2": 2040.0159301757812, "kl_loss_4": 1228.4751098632812, "kl_loss_9": 461.49328765869143, "learning_rate": 4.194577128396521e-05, "loss": 965.0408, "step": 8700 }, { "ce_loss_13": 3.1913701415061952, "ce_loss_17": 3.141020154953003, "ce_loss_2": 4.113580524921417, "ce_loss_4": 3.692421293258667, "ce_loss_9": 3.334137749671936, "epoch": 0.871, "grad_norm": 984.0, "kl_loss_13": 123.5216136932373, "kl_loss_2": 2096.4419189453124, "kl_loss_4": 1255.9306884765624, "kl_loss_9": 462.6844787597656, "learning_rate": 4.1311937023518264e-05, "loss": 994.8435, "step": 8710 }, { "ce_loss_13": 3.2077568054199217, "ce_loss_17": 3.1629718780517577, "ce_loss_2": 4.172623693943024, "ce_loss_4": 3.711873912811279, "ce_loss_9": 3.3451624274253846, "epoch": 0.872, "grad_norm": 1064.0, "kl_loss_13": 121.13611946105956, "kl_loss_2": 2171.769171142578, "kl_loss_4": 1260.903302001953, "kl_loss_9": 455.0580337524414, "learning_rate": 4.0682721746773344e-05, "loss": 985.2782, "step": 8720 }, { "ce_loss_13": 3.076204466819763, "ce_loss_17": 3.026162314414978, "ce_loss_2": 4.049081432819366, "ce_loss_4": 3.6081708908081054, "ce_loss_9": 3.2243421196937563, "epoch": 0.873, "grad_norm": 1312.0, "kl_loss_13": 124.63458366394043, "kl_loss_2": 2151.958917236328, "kl_loss_4": 1293.8103759765625, "kl_loss_9": 473.3404846191406, "learning_rate": 4.0058131789920904e-05, "loss": 979.7602, "step": 8730 }, { "ce_loss_13": 3.228232777118683, "ce_loss_17": 3.1803542256355284, "ce_loss_2": 4.146386194229126, "ce_loss_4": 3.7238203406333925, "ce_loss_9": 3.370889735221863, "epoch": 0.874, "grad_norm": 1012.0, "kl_loss_13": 124.09422607421875, "kl_loss_2": 2093.6116577148437, "kl_loss_4": 1256.6304504394532, "kl_loss_9": 472.42902221679685, "learning_rate": 3.9438173442575e-05, "loss": 1014.0436, "step": 8740 }, { "ce_loss_13": 3.254248082637787, "ce_loss_17": 3.203847897052765, "ce_loss_2": 4.1632434010505674, "ce_loss_4": 3.746878433227539, "ce_loss_9": 3.4015512108802795, "epoch": 0.875, "grad_norm": 1128.0, "kl_loss_13": 125.60369529724122, "kl_loss_2": 2053.2536071777345, "kl_loss_4": 1239.6773498535156, "kl_loss_9": 468.52770233154297, "learning_rate": 3.882285294770937e-05, "loss": 975.6334, "step": 8750 }, { "ce_loss_13": 3.217588412761688, "ce_loss_17": 3.170010805130005, "ce_loss_2": 4.1197646975517275, "ce_loss_4": 3.7122427225112915, "ce_loss_9": 3.3599797964096068, "epoch": 0.876, "grad_norm": 1080.0, "kl_loss_13": 124.51647644042968, "kl_loss_2": 2048.0947387695314, "kl_loss_4": 1240.4429260253905, "kl_loss_9": 467.5818466186523, "learning_rate": 3.821217650159453e-05, "loss": 988.6195, "step": 8760 }, { "ce_loss_13": 3.0944710612297057, "ce_loss_17": 3.0418411016464235, "ce_loss_2": 4.073407590389252, "ce_loss_4": 3.6323368906974793, "ce_loss_9": 3.2468117833137513, "epoch": 0.877, "grad_norm": 1320.0, "kl_loss_13": 126.86514892578126, "kl_loss_2": 2174.796911621094, "kl_loss_4": 1316.1718994140624, "kl_loss_9": 485.6607894897461, "learning_rate": 3.760615025373543e-05, "loss": 1005.2988, "step": 8770 }, { "ce_loss_13": 3.268079960346222, "ce_loss_17": 3.2157804369926453, "ce_loss_2": 4.223985576629639, "ce_loss_4": 3.7878326296806337, "ce_loss_9": 3.418534290790558, "epoch": 0.878, "grad_norm": 1416.0, "kl_loss_13": 130.85871047973632, "kl_loss_2": 2135.668536376953, "kl_loss_4": 1274.788800048828, "kl_loss_9": 483.19738311767577, "learning_rate": 3.700478030680987e-05, "loss": 1008.5094, "step": 8780 }, { "ce_loss_13": 3.262276554107666, "ce_loss_17": 3.2135178804397584, "ce_loss_2": 4.190185832977295, "ce_loss_4": 3.7589842438697816, "ce_loss_9": 3.4042428493499757, "epoch": 0.879, "grad_norm": 1192.0, "kl_loss_13": 125.3928295135498, "kl_loss_2": 2088.604376220703, "kl_loss_4": 1244.1440734863281, "kl_loss_9": 467.53342590332034, "learning_rate": 3.6408072716606344e-05, "loss": 979.2428, "step": 8790 }, { "ce_loss_13": 3.192216694355011, "ce_loss_17": 3.1426405906677246, "ce_loss_2": 4.150838589668274, "ce_loss_4": 3.706217885017395, "ce_loss_9": 3.3396780371665953, "epoch": 0.88, "grad_norm": 1392.0, "kl_loss_13": 127.1377555847168, "kl_loss_2": 2162.9733642578126, "kl_loss_4": 1292.4259826660157, "kl_loss_9": 481.8303497314453, "learning_rate": 3.5816033491963716e-05, "loss": 1021.473, "step": 8800 }, { "ce_loss_13": 3.0536105036735535, "ce_loss_17": 3.0038822650909425, "ce_loss_2": 4.047228884696961, "ce_loss_4": 3.5828949570655824, "ce_loss_9": 3.2023893117904665, "epoch": 0.881, "grad_norm": 980.0, "kl_loss_13": 124.73252182006836, "kl_loss_2": 2198.851037597656, "kl_loss_4": 1294.3963317871094, "kl_loss_9": 473.34668731689453, "learning_rate": 3.522866859471047e-05, "loss": 1001.8871, "step": 8810 }, { "ce_loss_13": 3.2817575454711916, "ce_loss_17": 3.234195387363434, "ce_loss_2": 4.163271033763886, "ce_loss_4": 3.7520456194877623, "ce_loss_9": 3.4171226978302003, "epoch": 0.882, "grad_norm": 1200.0, "kl_loss_13": 122.0683967590332, "kl_loss_2": 2002.7965698242188, "kl_loss_4": 1200.9272399902343, "kl_loss_9": 456.8050308227539, "learning_rate": 3.46459839396045e-05, "loss": 967.4033, "step": 8820 }, { "ce_loss_13": 3.201283943653107, "ce_loss_17": 3.1517493963241576, "ce_loss_2": 4.1490461230278015, "ce_loss_4": 3.722565734386444, "ce_loss_9": 3.348989284038544, "epoch": 0.883, "grad_norm": 1256.0, "kl_loss_13": 126.7153091430664, "kl_loss_2": 2106.8726135253905, "kl_loss_4": 1274.0515014648438, "kl_loss_9": 476.4222808837891, "learning_rate": 3.406798539427386e-05, "loss": 1012.7152, "step": 8830 }, { "ce_loss_13": 3.2606215834617616, "ce_loss_17": 3.209340977668762, "ce_loss_2": 4.186807799339294, "ce_loss_4": 3.759765088558197, "ce_loss_9": 3.3993352174758913, "epoch": 0.884, "grad_norm": 1136.0, "kl_loss_13": 126.07500190734864, "kl_loss_2": 2109.742510986328, "kl_loss_4": 1274.7843200683594, "kl_loss_9": 471.37593536376954, "learning_rate": 3.349467877915746e-05, "loss": 991.0214, "step": 8840 }, { "ce_loss_13": 3.2210415601730347, "ce_loss_17": 3.1695794224739076, "ce_loss_2": 4.174704313278198, "ce_loss_4": 3.736547517776489, "ce_loss_9": 3.371638226509094, "epoch": 0.885, "grad_norm": 948.0, "kl_loss_13": 126.3464225769043, "kl_loss_2": 2157.0158264160154, "kl_loss_4": 1286.7587951660157, "kl_loss_9": 478.5762008666992, "learning_rate": 3.292606986744667e-05, "loss": 1021.7641, "step": 8850 }, { "ce_loss_13": 3.1762444615364074, "ce_loss_17": 3.1300561666488647, "ce_loss_2": 4.1212607502937315, "ce_loss_4": 3.6874821305274965, "ce_loss_9": 3.3210744857788086, "epoch": 0.886, "grad_norm": 1328.0, "kl_loss_13": 123.50991020202636, "kl_loss_2": 2117.460479736328, "kl_loss_4": 1266.8730529785157, "kl_loss_9": 467.5690216064453, "learning_rate": 3.23621643850267e-05, "loss": 992.8555, "step": 8860 }, { "ce_loss_13": 3.252179944515228, "ce_loss_17": 3.2019535541534423, "ce_loss_2": 4.175995469093323, "ce_loss_4": 3.750588226318359, "ce_loss_9": 3.3964988708496096, "epoch": 0.887, "grad_norm": 1072.0, "kl_loss_13": 128.64831771850587, "kl_loss_2": 2098.4038146972657, "kl_loss_4": 1270.8568359375, "kl_loss_9": 481.28710784912107, "learning_rate": 3.180296801041971e-05, "loss": 978.2283, "step": 8870 }, { "ce_loss_13": 3.27991042137146, "ce_loss_17": 3.2322956562042235, "ce_loss_2": 4.2062681913375854, "ce_loss_4": 3.7706706285476685, "ce_loss_9": 3.416417968273163, "epoch": 0.888, "grad_norm": 1040.0, "kl_loss_13": 125.17885055541993, "kl_loss_2": 2105.054296875, "kl_loss_4": 1247.4218444824219, "kl_loss_9": 465.64461517333984, "learning_rate": 3.124848637472688e-05, "loss": 969.5379, "step": 8880 }, { "ce_loss_13": 3.101099932193756, "ce_loss_17": 3.052938091754913, "ce_loss_2": 4.052875924110412, "ce_loss_4": 3.6190237998962402, "ce_loss_9": 3.246625506877899, "epoch": 0.889, "grad_norm": 1232.0, "kl_loss_13": 121.84129676818847, "kl_loss_2": 2128.917657470703, "kl_loss_4": 1275.5157775878906, "kl_loss_9": 467.0811401367188, "learning_rate": 3.069872506157212e-05, "loss": 987.4688, "step": 8890 }, { "ce_loss_13": 3.201661002635956, "ce_loss_17": 3.1545156836509705, "ce_loss_2": 4.131963443756104, "ce_loss_4": 3.701005387306213, "ce_loss_9": 3.3471578001976012, "epoch": 0.89, "grad_norm": 1200.0, "kl_loss_13": 124.4577709197998, "kl_loss_2": 2098.184912109375, "kl_loss_4": 1253.4125366210938, "kl_loss_9": 473.9854080200195, "learning_rate": 3.0153689607045842e-05, "loss": 986.501, "step": 8900 }, { "ce_loss_13": 3.106463384628296, "ce_loss_17": 3.054461884498596, "ce_loss_2": 4.107833957672119, "ce_loss_4": 3.6460415840148928, "ce_loss_9": 3.256603312492371, "epoch": 0.891, "grad_norm": 1408.0, "kl_loss_13": 128.70845260620118, "kl_loss_2": 2247.556707763672, "kl_loss_4": 1345.4517639160156, "kl_loss_9": 489.92423858642576, "learning_rate": 2.9613385499648926e-05, "loss": 1002.9125, "step": 8910 }, { "ce_loss_13": 3.156962585449219, "ce_loss_17": 3.107112765312195, "ce_loss_2": 4.08488130569458, "ce_loss_4": 3.6633270025253295, "ce_loss_9": 3.302465832233429, "epoch": 0.892, "grad_norm": 1256.0, "kl_loss_13": 125.15944747924804, "kl_loss_2": 2074.558489990234, "kl_loss_4": 1259.4126403808593, "kl_loss_9": 467.21313934326173, "learning_rate": 2.9077818180237692e-05, "loss": 988.2283, "step": 8920 }, { "ce_loss_13": 3.2006083607673643, "ce_loss_17": 3.149546241760254, "ce_loss_2": 4.154551482200622, "ce_loss_4": 3.716944098472595, "ce_loss_9": 3.3437914729118345, "epoch": 0.893, "grad_norm": 1312.0, "kl_loss_13": 124.67683334350586, "kl_loss_2": 2121.6332946777343, "kl_loss_4": 1270.0214050292968, "kl_loss_9": 472.15856475830077, "learning_rate": 2.8546993041969172e-05, "loss": 990.7046, "step": 8930 }, { "ce_loss_13": 3.2383341908454897, "ce_loss_17": 3.191279435157776, "ce_loss_2": 4.1336840152740475, "ce_loss_4": 3.7235642313957213, "ce_loss_9": 3.3798244476318358, "epoch": 0.894, "grad_norm": 1080.0, "kl_loss_13": 122.92615585327148, "kl_loss_2": 2051.1390747070313, "kl_loss_4": 1234.8792724609375, "kl_loss_9": 465.47203674316404, "learning_rate": 2.802091543024671e-05, "loss": 982.1504, "step": 8940 }, { "ce_loss_13": 3.2360296487808227, "ce_loss_17": 3.186755084991455, "ce_loss_2": 4.182163751125335, "ce_loss_4": 3.741145741939545, "ce_loss_9": 3.375945234298706, "epoch": 0.895, "grad_norm": 1056.0, "kl_loss_13": 126.24630584716797, "kl_loss_2": 2147.514569091797, "kl_loss_4": 1288.052752685547, "kl_loss_9": 474.3138656616211, "learning_rate": 2.7499590642665774e-05, "loss": 1018.1549, "step": 8950 }, { "ce_loss_13": 3.246128523349762, "ce_loss_17": 3.1959332466125487, "ce_loss_2": 4.188498532772064, "ce_loss_4": 3.7463975071907045, "ce_loss_9": 3.3894697785377503, "epoch": 0.896, "grad_norm": 948.0, "kl_loss_13": 126.02671356201172, "kl_loss_2": 2120.703259277344, "kl_loss_4": 1261.5170654296876, "kl_loss_9": 480.7454559326172, "learning_rate": 2.6983023928961405e-05, "loss": 984.5086, "step": 8960 }, { "ce_loss_13": 3.2192285656929016, "ce_loss_17": 3.166797363758087, "ce_loss_2": 4.1469670057296755, "ce_loss_4": 3.721474361419678, "ce_loss_9": 3.3615905165672304, "epoch": 0.897, "grad_norm": 1184.0, "kl_loss_13": 125.70062522888183, "kl_loss_2": 2087.2116455078126, "kl_loss_4": 1259.384228515625, "kl_loss_9": 470.62376556396487, "learning_rate": 2.6471220490954628e-05, "loss": 998.1463, "step": 8970 }, { "ce_loss_13": 3.2073075652122496, "ce_loss_17": 3.1569489002227784, "ce_loss_2": 4.125803303718567, "ce_loss_4": 3.6917981266975404, "ce_loss_9": 3.3408443212509153, "epoch": 0.898, "grad_norm": 1312.0, "kl_loss_13": 123.13876190185547, "kl_loss_2": 2087.336846923828, "kl_loss_4": 1241.9844482421875, "kl_loss_9": 465.28198699951173, "learning_rate": 2.596418548250029e-05, "loss": 989.099, "step": 8980 }, { "ce_loss_13": 3.2425430059432983, "ce_loss_17": 3.1915982246398924, "ce_loss_2": 4.168435001373291, "ce_loss_4": 3.743153285980225, "ce_loss_9": 3.385032820701599, "epoch": 0.899, "grad_norm": 1440.0, "kl_loss_13": 127.38224143981934, "kl_loss_2": 2120.754669189453, "kl_loss_4": 1275.3709594726563, "kl_loss_9": 479.466975402832, "learning_rate": 2.5461924009435368e-05, "loss": 981.307, "step": 8990 }, { "ce_loss_13": 3.2319785952568054, "ce_loss_17": 3.183781051635742, "ce_loss_2": 4.160268795490265, "ce_loss_4": 3.730000877380371, "ce_loss_9": 3.378864920139313, "epoch": 0.9, "grad_norm": 904.0, "kl_loss_13": 127.01592559814453, "kl_loss_2": 2082.503253173828, "kl_loss_4": 1250.9366455078125, "kl_loss_9": 473.8598922729492, "learning_rate": 2.4964441129527336e-05, "loss": 1003.9225, "step": 9000 }, { "ce_loss_13": 3.2342962980270387, "ce_loss_17": 3.186177933216095, "ce_loss_2": 4.137189567089081, "ce_loss_4": 3.7244582295417787, "ce_loss_9": 3.377157950401306, "epoch": 0.901, "grad_norm": 1120.0, "kl_loss_13": 123.69618873596191, "kl_loss_2": 2040.2627319335938, "kl_loss_4": 1228.0798034667969, "kl_loss_9": 461.71703948974607, "learning_rate": 2.4471741852423235e-05, "loss": 968.0867, "step": 9010 }, { "ce_loss_13": 3.280334162712097, "ce_loss_17": 3.230023729801178, "ce_loss_2": 4.205906891822815, "ce_loss_4": 3.779015672206879, "ce_loss_9": 3.4269187808036805, "epoch": 0.902, "grad_norm": 904.0, "kl_loss_13": 126.10594482421875, "kl_loss_2": 2065.7828857421873, "kl_loss_4": 1241.9036926269532, "kl_loss_9": 469.2466812133789, "learning_rate": 2.3983831139599287e-05, "loss": 981.2714, "step": 9020 }, { "ce_loss_13": 3.2038196563720702, "ce_loss_17": 3.154557228088379, "ce_loss_2": 4.13164883852005, "ce_loss_4": 3.6971403837203978, "ce_loss_9": 3.34529230594635, "epoch": 0.903, "grad_norm": 1880.0, "kl_loss_13": 123.34923286437989, "kl_loss_2": 2070.5382446289063, "kl_loss_4": 1228.1986083984375, "kl_loss_9": 458.6819091796875, "learning_rate": 2.3500713904311022e-05, "loss": 960.0121, "step": 9030 }, { "ce_loss_13": 3.244105279445648, "ce_loss_17": 3.195714461803436, "ce_loss_2": 4.136321234703064, "ce_loss_4": 3.7194257020950316, "ce_loss_9": 3.3810742259025575, "epoch": 0.904, "grad_norm": 1320.0, "kl_loss_13": 122.28379554748535, "kl_loss_2": 2016.4735961914062, "kl_loss_4": 1201.1506774902343, "kl_loss_9": 453.38655853271484, "learning_rate": 2.3022395011543685e-05, "loss": 956.8545, "step": 9040 }, { "ce_loss_13": 3.271865975856781, "ce_loss_17": 3.220155143737793, "ce_loss_2": 4.19095299243927, "ce_loss_4": 3.7741986393928526, "ce_loss_9": 3.419701623916626, "epoch": 0.905, "grad_norm": 1064.0, "kl_loss_13": 128.64427490234374, "kl_loss_2": 2092.2466552734377, "kl_loss_4": 1274.2114501953124, "kl_loss_9": 484.1876281738281, "learning_rate": 2.2548879277963063e-05, "loss": 1005.5, "step": 9050 }, { "ce_loss_13": 3.1932810068130495, "ce_loss_17": 3.1446372509002685, "ce_loss_2": 4.1027255177497866, "ce_loss_4": 3.6808401584625243, "ce_loss_9": 3.3333174109458925, "epoch": 0.906, "grad_norm": 1096.0, "kl_loss_13": 123.65237007141113, "kl_loss_2": 2064.5761779785157, "kl_loss_4": 1238.7606018066406, "kl_loss_9": 465.5109069824219, "learning_rate": 2.208017147186736e-05, "loss": 957.0088, "step": 9060 }, { "ce_loss_13": 3.1838820457458494, "ce_loss_17": 3.1336780190467834, "ce_loss_2": 4.110610842704773, "ce_loss_4": 3.685613822937012, "ce_loss_9": 3.3281081438064577, "epoch": 0.907, "grad_norm": 1144.0, "kl_loss_13": 124.22433738708496, "kl_loss_2": 2092.4880493164064, "kl_loss_4": 1257.078857421875, "kl_loss_9": 470.5039657592773, "learning_rate": 2.1616276313139227e-05, "loss": 977.561, "step": 9070 }, { "ce_loss_13": 3.22432826757431, "ce_loss_17": 3.1728204250335694, "ce_loss_2": 4.151443779468536, "ce_loss_4": 3.7218764424324036, "ce_loss_9": 3.3667750120162965, "epoch": 0.908, "grad_norm": 992.0, "kl_loss_13": 125.00688095092774, "kl_loss_2": 2091.067431640625, "kl_loss_4": 1256.3421875, "kl_loss_9": 469.9505126953125, "learning_rate": 2.1157198473197415e-05, "loss": 992.8523, "step": 9080 }, { "ce_loss_13": 3.283832335472107, "ce_loss_17": 3.232787823677063, "ce_loss_2": 4.215191149711609, "ce_loss_4": 3.7866356134414674, "ce_loss_9": 3.4285438656806946, "epoch": 0.909, "grad_norm": 1400.0, "kl_loss_13": 127.63661041259766, "kl_loss_2": 2084.1884887695314, "kl_loss_4": 1255.0216064453125, "kl_loss_9": 478.0837905883789, "learning_rate": 2.0702942574950812e-05, "loss": 985.9205, "step": 9090 }, { "ce_loss_13": 3.216969573497772, "ce_loss_17": 3.1639683723449705, "ce_loss_2": 4.155269372463226, "ce_loss_4": 3.7214656710624694, "ce_loss_9": 3.3645333886146545, "epoch": 0.91, "grad_norm": 1040.0, "kl_loss_13": 128.13418579101562, "kl_loss_2": 2118.728161621094, "kl_loss_4": 1264.2790588378907, "kl_loss_9": 480.4759552001953, "learning_rate": 2.025351319275137e-05, "loss": 994.0701, "step": 9100 }, { "ce_loss_13": 3.3377450942993163, "ce_loss_17": 3.2858991265296935, "ce_loss_2": 4.254519724845887, "ce_loss_4": 3.8394150495529176, "ce_loss_9": 3.481215536594391, "epoch": 0.911, "grad_norm": 1160.0, "kl_loss_13": 129.7705623626709, "kl_loss_2": 2114.2792785644533, "kl_loss_4": 1286.9357849121093, "kl_loss_9": 488.63895721435546, "learning_rate": 1.9808914852347816e-05, "loss": 1015.5373, "step": 9110 }, { "ce_loss_13": 3.1825668811798096, "ce_loss_17": 3.132770538330078, "ce_loss_2": 4.110361468791962, "ce_loss_4": 3.690675151348114, "ce_loss_9": 3.3298569321632385, "epoch": 0.912, "grad_norm": 964.0, "kl_loss_13": 124.83629989624023, "kl_loss_2": 2073.497210693359, "kl_loss_4": 1255.5652648925782, "kl_loss_9": 470.9684432983398, "learning_rate": 1.9369152030840554e-05, "loss": 977.9961, "step": 9120 }, { "ce_loss_13": 3.260647201538086, "ce_loss_17": 3.2124603152275086, "ce_loss_2": 4.195002579689026, "ce_loss_4": 3.764224982261658, "ce_loss_9": 3.4034763097763063, "epoch": 0.913, "grad_norm": 1320.0, "kl_loss_13": 126.34962158203125, "kl_loss_2": 2106.8051940917967, "kl_loss_4": 1263.4680053710938, "kl_loss_9": 469.1458541870117, "learning_rate": 1.893422915663645e-05, "loss": 988.5841, "step": 9130 }, { "ce_loss_13": 3.1396661758422852, "ce_loss_17": 3.086733412742615, "ce_loss_2": 4.111360728740692, "ce_loss_4": 3.6715479493141174, "ce_loss_9": 3.2856866240501406, "epoch": 0.914, "grad_norm": 1344.0, "kl_loss_13": 126.71647987365722, "kl_loss_2": 2174.335974121094, "kl_loss_4": 1305.815362548828, "kl_loss_9": 483.7636154174805, "learning_rate": 1.850415060940386e-05, "loss": 1008.9053, "step": 9140 }, { "ce_loss_13": 3.2563290357589723, "ce_loss_17": 3.2081819653511046, "ce_loss_2": 4.160827016830444, "ce_loss_4": 3.7483625531196596, "ce_loss_9": 3.398584794998169, "epoch": 0.915, "grad_norm": 1232.0, "kl_loss_13": 125.5147315979004, "kl_loss_2": 2054.2741271972654, "kl_loss_4": 1241.45810546875, "kl_loss_9": 466.45863189697263, "learning_rate": 1.8078920720028978e-05, "loss": 980.6484, "step": 9150 }, { "ce_loss_13": 3.1835547924041747, "ce_loss_17": 3.1357635736465452, "ce_loss_2": 4.085174989700318, "ce_loss_4": 3.676545226573944, "ce_loss_9": 3.3244770765304565, "epoch": 0.916, "grad_norm": 1328.0, "kl_loss_13": 121.81910057067871, "kl_loss_2": 2039.647686767578, "kl_loss_4": 1233.1619567871094, "kl_loss_9": 461.53747100830077, "learning_rate": 1.765854377057219e-05, "loss": 987.4318, "step": 9160 }, { "ce_loss_13": 3.160837733745575, "ce_loss_17": 3.111904430389404, "ce_loss_2": 4.085739696025849, "ce_loss_4": 3.655687320232391, "ce_loss_9": 3.3010897755622866, "epoch": 0.917, "grad_norm": 1064.0, "kl_loss_13": 120.65404739379883, "kl_loss_2": 2077.7172668457033, "kl_loss_4": 1239.4229919433594, "kl_loss_9": 456.7491943359375, "learning_rate": 1.724302399422456e-05, "loss": 979.8818, "step": 9170 }, { "ce_loss_13": 3.126928412914276, "ce_loss_17": 3.0757568001747133, "ce_loss_2": 4.063360798358917, "ce_loss_4": 3.63402179479599, "ce_loss_9": 3.2728739738464356, "epoch": 0.918, "grad_norm": 840.0, "kl_loss_13": 127.9131290435791, "kl_loss_2": 2111.3594970703125, "kl_loss_4": 1266.8044006347657, "kl_loss_9": 478.9010009765625, "learning_rate": 1.683236557526574e-05, "loss": 991.9193, "step": 9180 }, { "ce_loss_13": 3.2391092658042906, "ce_loss_17": 3.19199081659317, "ce_loss_2": 4.1185808181762695, "ce_loss_4": 3.7088035583496093, "ce_loss_9": 3.373170495033264, "epoch": 0.919, "grad_norm": 832.0, "kl_loss_13": 121.6255931854248, "kl_loss_2": 2002.3511840820313, "kl_loss_4": 1204.4359802246095, "kl_loss_9": 453.477001953125, "learning_rate": 1.6426572649021475e-05, "loss": 973.9055, "step": 9190 }, { "ce_loss_13": 3.27240104675293, "ce_loss_17": 3.2241369605064394, "ce_loss_2": 4.147839164733886, "ce_loss_4": 3.7413483262062073, "ce_loss_9": 3.409481644630432, "epoch": 0.92, "grad_norm": 920.0, "kl_loss_13": 125.00206451416015, "kl_loss_2": 2017.1478271484375, "kl_loss_4": 1219.8230895996094, "kl_loss_9": 462.11181640625, "learning_rate": 1.6025649301821876e-05, "loss": 967.6553, "step": 9200 }, { "ce_loss_13": 3.2617267966270447, "ce_loss_17": 3.212771201133728, "ce_loss_2": 4.150565218925476, "ce_loss_4": 3.7432140350341796, "ce_loss_9": 3.403185486793518, "epoch": 0.921, "grad_norm": 1344.0, "kl_loss_13": 126.24272270202637, "kl_loss_2": 2039.8053100585937, "kl_loss_4": 1240.6956481933594, "kl_loss_9": 472.7698318481445, "learning_rate": 1.5629599570960716e-05, "loss": 969.2418, "step": 9210 }, { "ce_loss_13": 3.1714923858642576, "ce_loss_17": 3.1215611338615417, "ce_loss_2": 4.099764513969421, "ce_loss_4": 3.671097922325134, "ce_loss_9": 3.3103894233703612, "epoch": 0.922, "grad_norm": 1096.0, "kl_loss_13": 123.92279281616212, "kl_loss_2": 2111.226904296875, "kl_loss_4": 1256.1555786132812, "kl_loss_9": 469.29527435302737, "learning_rate": 1.5238427444654367e-05, "loss": 982.0157, "step": 9220 }, { "ce_loss_13": 3.223889434337616, "ce_loss_17": 3.1753618359565734, "ce_loss_2": 4.140523469448089, "ce_loss_4": 3.720929265022278, "ce_loss_9": 3.367058265209198, "epoch": 0.923, "grad_norm": 1144.0, "kl_loss_13": 124.11885795593261, "kl_loss_2": 2060.348107910156, "kl_loss_4": 1234.4511108398438, "kl_loss_9": 464.55687408447267, "learning_rate": 1.4852136862001764e-05, "loss": 973.0648, "step": 9230 }, { "ce_loss_13": 3.190079414844513, "ce_loss_17": 3.1419080018997194, "ce_loss_2": 4.098129069805145, "ce_loss_4": 3.683347475528717, "ce_loss_9": 3.332545554637909, "epoch": 0.924, "grad_norm": 996.0, "kl_loss_13": 122.17581253051758, "kl_loss_2": 2036.4146240234375, "kl_loss_4": 1232.0136779785157, "kl_loss_9": 462.5317138671875, "learning_rate": 1.4470731712944884e-05, "loss": 982.9046, "step": 9240 }, { "ce_loss_13": 3.2127140522003175, "ce_loss_17": 3.1617493987083436, "ce_loss_2": 4.137983989715576, "ce_loss_4": 3.7195126891136168, "ce_loss_9": 3.360599732398987, "epoch": 0.925, "grad_norm": 920.0, "kl_loss_13": 126.36507148742676, "kl_loss_2": 2075.7155151367188, "kl_loss_4": 1250.0451416015626, "kl_loss_9": 473.1140640258789, "learning_rate": 1.4094215838229174e-05, "loss": 1001.2113, "step": 9250 }, { "ce_loss_13": 3.1899404883384705, "ce_loss_17": 3.1400984168052672, "ce_loss_2": 4.126039385795593, "ce_loss_4": 3.691938269138336, "ce_loss_9": 3.3360198497772218, "epoch": 0.926, "grad_norm": 1232.0, "kl_loss_13": 125.60745887756347, "kl_loss_2": 2117.256268310547, "kl_loss_4": 1263.1666564941406, "kl_loss_9": 474.5420547485352, "learning_rate": 1.372259302936546e-05, "loss": 1024.7519, "step": 9260 }, { "ce_loss_13": 3.296473526954651, "ce_loss_17": 3.2412025928497314, "ce_loss_2": 4.207285916805267, "ce_loss_4": 3.7896566867828367, "ce_loss_9": 3.436729943752289, "epoch": 0.927, "grad_norm": 896.0, "kl_loss_13": 129.88053436279296, "kl_loss_2": 2075.9804504394533, "kl_loss_4": 1258.4855895996093, "kl_loss_9": 477.35028381347655, "learning_rate": 1.3355867028591206e-05, "loss": 977.2746, "step": 9270 }, { "ce_loss_13": 3.1985045671463013, "ce_loss_17": 3.1508318066596983, "ce_loss_2": 4.090269935131073, "ce_loss_4": 3.6798470258712768, "ce_loss_9": 3.339389705657959, "epoch": 0.928, "grad_norm": 980.0, "kl_loss_13": 123.62951774597168, "kl_loss_2": 2035.2508911132813, "kl_loss_4": 1229.1637878417969, "kl_loss_9": 464.2936706542969, "learning_rate": 1.2994041528833267e-05, "loss": 968.881, "step": 9280 }, { "ce_loss_13": 3.192847216129303, "ce_loss_17": 3.1450926542282103, "ce_loss_2": 4.118480837345123, "ce_loss_4": 3.6920429110527038, "ce_loss_9": 3.3378187656402587, "epoch": 0.929, "grad_norm": 1168.0, "kl_loss_13": 122.0996280670166, "kl_loss_2": 2093.828009033203, "kl_loss_4": 1251.359844970703, "kl_loss_9": 467.27857971191406, "learning_rate": 1.2637120173670358e-05, "loss": 975.8633, "step": 9290 }, { "ce_loss_13": 3.217655837535858, "ce_loss_17": 3.167840909957886, "ce_loss_2": 4.157879602909088, "ce_loss_4": 3.723212778568268, "ce_loss_9": 3.3668641805648805, "epoch": 0.93, "grad_norm": 1216.0, "kl_loss_13": 126.36252250671387, "kl_loss_2": 2099.7136169433593, "kl_loss_4": 1262.4313903808593, "kl_loss_9": 474.13000335693357, "learning_rate": 1.2285106557296478e-05, "loss": 984.6294, "step": 9300 }, { "ce_loss_13": 3.1029295802116392, "ce_loss_17": 3.054238200187683, "ce_loss_2": 4.091546428203583, "ce_loss_4": 3.6369534969329833, "ce_loss_9": 3.250718927383423, "epoch": 0.931, "grad_norm": 1096.0, "kl_loss_13": 124.35437889099121, "kl_loss_2": 2192.22099609375, "kl_loss_4": 1305.7919677734376, "kl_loss_9": 477.1180023193359, "learning_rate": 1.1938004224484989e-05, "loss": 1000.5205, "step": 9310 }, { "ce_loss_13": 3.3303967833518984, "ce_loss_17": 3.2795648336410523, "ce_loss_2": 4.238452625274658, "ce_loss_4": 3.8244492530822756, "ce_loss_9": 3.473407542705536, "epoch": 0.932, "grad_norm": 980.0, "kl_loss_13": 127.10131301879883, "kl_loss_2": 2064.8430419921874, "kl_loss_4": 1249.1025268554688, "kl_loss_9": 472.41053619384763, "learning_rate": 1.1595816670552429e-05, "loss": 996.0648, "step": 9320 }, { "ce_loss_13": 3.2579795718193054, "ce_loss_17": 3.206527304649353, "ce_loss_2": 4.1660768866539, "ce_loss_4": 3.739229905605316, "ce_loss_9": 3.394890856742859, "epoch": 0.933, "grad_norm": 1336.0, "kl_loss_13": 126.0157699584961, "kl_loss_2": 2053.550744628906, "kl_loss_4": 1229.2304260253907, "kl_loss_9": 463.6220397949219, "learning_rate": 1.1258547341323699e-05, "loss": 965.4713, "step": 9330 }, { "ce_loss_13": 3.2862266182899473, "ce_loss_17": 3.234578084945679, "ce_loss_2": 4.191927945613861, "ce_loss_4": 3.7736660718917845, "ce_loss_9": 3.429300677776337, "epoch": 0.934, "grad_norm": 1224.0, "kl_loss_13": 126.44803886413574, "kl_loss_2": 2078.278918457031, "kl_loss_4": 1252.2899230957032, "kl_loss_9": 471.8009948730469, "learning_rate": 1.0926199633097156e-05, "loss": 976.3354, "step": 9340 }, { "ce_loss_13": 3.2950539350509644, "ce_loss_17": 3.2490778088569643, "ce_loss_2": 4.167960727214814, "ce_loss_4": 3.764832890033722, "ce_loss_9": 3.4277522921562196, "epoch": 0.935, "grad_norm": 968.0, "kl_loss_13": 122.42971153259278, "kl_loss_2": 2015.5107971191405, "kl_loss_4": 1213.9035522460938, "kl_loss_9": 458.9512908935547, "learning_rate": 1.0598776892610684e-05, "loss": 984.0514, "step": 9350 }, { "ce_loss_13": 3.1137264132499696, "ce_loss_17": 3.065649724006653, "ce_loss_2": 4.051564931869507, "ce_loss_4": 3.62681360244751, "ce_loss_9": 3.254243183135986, "epoch": 0.936, "grad_norm": 1312.0, "kl_loss_13": 121.12364768981934, "kl_loss_2": 2102.82529296875, "kl_loss_4": 1266.5707641601562, "kl_loss_9": 462.1351348876953, "learning_rate": 1.0276282417007399e-05, "loss": 975.3756, "step": 9360 }, { "ce_loss_13": 3.2665770649909973, "ce_loss_17": 3.216655695438385, "ce_loss_2": 4.145647776126862, "ce_loss_4": 3.738947021961212, "ce_loss_9": 3.404145121574402, "epoch": 0.937, "grad_norm": 1112.0, "kl_loss_13": 123.13276863098145, "kl_loss_2": 2024.9439270019532, "kl_loss_4": 1215.5426879882812, "kl_loss_9": 462.4633453369141, "learning_rate": 9.958719453803277e-06, "loss": 971.8545, "step": 9370 }, { "ce_loss_13": 3.2617274045944216, "ce_loss_17": 3.2096463203430177, "ce_loss_2": 4.184611821174622, "ce_loss_4": 3.7648152709007263, "ce_loss_9": 3.405567800998688, "epoch": 0.938, "grad_norm": 1272.0, "kl_loss_13": 125.90461807250976, "kl_loss_2": 2091.3635864257812, "kl_loss_4": 1270.0306030273437, "kl_loss_9": 473.70897064208987, "learning_rate": 9.646091200853802e-06, "loss": 977.7929, "step": 9380 }, { "ce_loss_13": 3.217831039428711, "ce_loss_17": 3.17045134305954, "ce_loss_2": 4.126152837276459, "ce_loss_4": 3.7095869541168214, "ce_loss_9": 3.358121025562286, "epoch": 0.939, "grad_norm": 1472.0, "kl_loss_13": 121.58394393920898, "kl_loss_2": 2035.5031494140626, "kl_loss_4": 1225.6890380859375, "kl_loss_9": 460.6478500366211, "learning_rate": 9.338400806321978e-06, "loss": 946.1783, "step": 9390 }, { "ce_loss_13": 3.249909830093384, "ce_loss_17": 3.200786316394806, "ce_loss_2": 4.1592383742332455, "ce_loss_4": 3.7454639315605163, "ce_loss_9": 3.3932641744613647, "epoch": 0.94, "grad_norm": 768.0, "kl_loss_13": 126.70610084533692, "kl_loss_2": 2050.856689453125, "kl_loss_4": 1240.7946533203126, "kl_loss_9": 472.20652465820314, "learning_rate": 9.035651368646646e-06, "loss": 970.3141, "step": 9400 }, { "ce_loss_13": 3.254878485202789, "ce_loss_17": 3.2067866206169127, "ce_loss_2": 4.151609563827515, "ce_loss_4": 3.737032163143158, "ce_loss_9": 3.3912570118904113, "epoch": 0.941, "grad_norm": 1080.0, "kl_loss_13": 123.21994323730469, "kl_loss_2": 2049.6838989257812, "kl_loss_4": 1237.212939453125, "kl_loss_9": 460.5226821899414, "learning_rate": 8.737845936511335e-06, "loss": 975.573, "step": 9410 }, { "ce_loss_13": 3.205101728439331, "ce_loss_17": 3.1559471130371093, "ce_loss_2": 4.144949054718017, "ce_loss_4": 3.707587778568268, "ce_loss_9": 3.346090483665466, "epoch": 0.942, "grad_norm": 876.0, "kl_loss_13": 125.93998184204102, "kl_loss_2": 2110.3262573242187, "kl_loss_4": 1250.1687438964843, "kl_loss_9": 472.09921112060545, "learning_rate": 8.444987508813451e-06, "loss": 979.1486, "step": 9420 }, { "ce_loss_13": 3.1636532306671143, "ce_loss_17": 3.112474000453949, "ce_loss_2": 4.124286568164825, "ce_loss_4": 3.6788034439086914, "ce_loss_9": 3.309084987640381, "epoch": 0.943, "grad_norm": 1624.0, "kl_loss_13": 127.45991554260254, "kl_loss_2": 2183.6559143066406, "kl_loss_4": 1300.6641357421875, "kl_loss_9": 480.09204559326173, "learning_rate": 8.157079034633974e-06, "loss": 1003.9906, "step": 9430 }, { "ce_loss_13": 3.161477494239807, "ce_loss_17": 3.1127028942108153, "ce_loss_2": 4.0934004306793215, "ce_loss_4": 3.665445053577423, "ce_loss_9": 3.3026885628700255, "epoch": 0.944, "grad_norm": 820.0, "kl_loss_13": 123.90990447998047, "kl_loss_2": 2110.129669189453, "kl_loss_4": 1270.7024353027343, "kl_loss_9": 469.68713226318357, "learning_rate": 7.874123413208145e-06, "loss": 978.5891, "step": 9440 }, { "ce_loss_13": 3.1290234208106993, "ce_loss_17": 3.080064821243286, "ce_loss_2": 4.085705506801605, "ce_loss_4": 3.645690679550171, "ce_loss_9": 3.2780033826828, "epoch": 0.945, "grad_norm": 976.0, "kl_loss_13": 123.41685485839844, "kl_loss_2": 2121.870007324219, "kl_loss_4": 1265.3032958984375, "kl_loss_9": 472.15820770263673, "learning_rate": 7.59612349389599e-06, "loss": 990.3545, "step": 9450 }, { "ce_loss_13": 3.2238157391548157, "ce_loss_17": 3.175503158569336, "ce_loss_2": 4.110353565216064, "ce_loss_4": 3.7005554318428038, "ce_loss_9": 3.36072324514389, "epoch": 0.946, "grad_norm": 1020.0, "kl_loss_13": 121.90590400695801, "kl_loss_2": 2000.7526306152345, "kl_loss_4": 1206.828594970703, "kl_loss_9": 457.64972534179685, "learning_rate": 7.323082076153509e-06, "loss": 964.6886, "step": 9460 }, { "ce_loss_13": 3.2651328802108766, "ce_loss_17": 3.215626835823059, "ce_loss_2": 4.154508507251739, "ce_loss_4": 3.7416013360023497, "ce_loss_9": 3.4016211867332458, "epoch": 0.947, "grad_norm": 1020.0, "kl_loss_13": 126.65930557250977, "kl_loss_2": 2029.3523010253907, "kl_loss_4": 1219.70224609375, "kl_loss_9": 468.4459487915039, "learning_rate": 7.055001909504755e-06, "loss": 985.2045, "step": 9470 }, { "ce_loss_13": 3.295180356502533, "ce_loss_17": 3.246336281299591, "ce_loss_2": 4.198210215568542, "ce_loss_4": 3.7800925254821776, "ce_loss_9": 3.440964138507843, "epoch": 0.948, "grad_norm": 936.0, "kl_loss_13": 125.49421920776368, "kl_loss_2": 2055.226983642578, "kl_loss_4": 1241.0293273925781, "kl_loss_9": 469.61966400146486, "learning_rate": 6.791885693514133e-06, "loss": 979.4119, "step": 9480 }, { "ce_loss_13": 3.2037457823753357, "ce_loss_17": 3.152704358100891, "ce_loss_2": 4.138142609596253, "ce_loss_4": 3.7104042053222654, "ce_loss_9": 3.344703030586243, "epoch": 0.949, "grad_norm": 1096.0, "kl_loss_13": 126.08903121948242, "kl_loss_2": 2118.9346618652344, "kl_loss_4": 1268.087255859375, "kl_loss_9": 469.74722747802736, "learning_rate": 6.533736077758867e-06, "loss": 993.6676, "step": 9490 }, { "ce_loss_13": 3.171399164199829, "ce_loss_17": 3.120036482810974, "ce_loss_2": 4.136746954917908, "ce_loss_4": 3.6882307291030885, "ce_loss_9": 3.319357442855835, "epoch": 0.95, "grad_norm": 1136.0, "kl_loss_13": 127.54952468872071, "kl_loss_2": 2172.968194580078, "kl_loss_4": 1293.3333740234375, "kl_loss_9": 482.79058532714845, "learning_rate": 6.2805556618028556e-06, "loss": 995.8846, "step": 9500 }, { "ce_loss_13": 3.2562387347221375, "ce_loss_17": 3.207328736782074, "ce_loss_2": 4.148194706439972, "ce_loss_4": 3.7281282782554626, "ce_loss_9": 3.3875306963920595, "epoch": 0.951, "grad_norm": 1240.0, "kl_loss_13": 122.4226905822754, "kl_loss_2": 2020.620587158203, "kl_loss_4": 1199.7545471191406, "kl_loss_9": 451.1108688354492, "learning_rate": 6.032346995169968e-06, "loss": 938.7558, "step": 9510 }, { "ce_loss_13": 3.264230751991272, "ce_loss_17": 3.213871192932129, "ce_loss_2": 4.169318628311157, "ce_loss_4": 3.7474931955337523, "ce_loss_9": 3.3999849915504456, "epoch": 0.952, "grad_norm": 1176.0, "kl_loss_13": 124.4639762878418, "kl_loss_2": 2058.7546569824217, "kl_loss_4": 1239.6460815429687, "kl_loss_9": 464.8585662841797, "learning_rate": 5.789112577318789e-06, "loss": 969.5257, "step": 9520 }, { "ce_loss_13": 3.2406904220581056, "ce_loss_17": 3.191251754760742, "ce_loss_2": 4.17564823627472, "ce_loss_4": 3.743658685684204, "ce_loss_9": 3.3809858679771425, "epoch": 0.953, "grad_norm": 880.0, "kl_loss_13": 126.41565399169922, "kl_loss_2": 2126.825061035156, "kl_loss_4": 1276.6847839355469, "kl_loss_9": 472.33983612060547, "learning_rate": 5.550854857617194e-06, "loss": 975.5995, "step": 9530 }, { "ce_loss_13": 3.2228988647460937, "ce_loss_17": 3.1709113478660584, "ce_loss_2": 4.1705172777175905, "ce_loss_4": 3.734542739391327, "ce_loss_9": 3.369017517566681, "epoch": 0.954, "grad_norm": 928.0, "kl_loss_13": 128.48126487731935, "kl_loss_2": 2131.030615234375, "kl_loss_4": 1272.4220703125, "kl_loss_9": 479.76820678710936, "learning_rate": 5.317576235317756e-06, "loss": 995.018, "step": 9540 }, { "ce_loss_13": 3.2498166799545287, "ce_loss_17": 3.199141788482666, "ce_loss_2": 4.139231407642365, "ce_loss_4": 3.7232473850250245, "ce_loss_9": 3.3869931578636168, "epoch": 0.955, "grad_norm": 1176.0, "kl_loss_13": 123.42841300964355, "kl_loss_2": 1994.9124755859375, "kl_loss_4": 1192.3440673828125, "kl_loss_9": 455.0126983642578, "learning_rate": 5.089279059533658e-06, "loss": 975.8289, "step": 9550 }, { "ce_loss_13": 3.30071382522583, "ce_loss_17": 3.2487955689430237, "ce_loss_2": 4.201845860481262, "ce_loss_4": 3.7940438628196715, "ce_loss_9": 3.445224678516388, "epoch": 0.956, "grad_norm": 1000.0, "kl_loss_13": 128.4445671081543, "kl_loss_2": 2056.6497497558594, "kl_loss_4": 1249.894952392578, "kl_loss_9": 477.7227386474609, "learning_rate": 4.865965629214819e-06, "loss": 972.0356, "step": 9560 }, { "ce_loss_13": 3.248912787437439, "ce_loss_17": 3.1993204593658446, "ce_loss_2": 4.177305388450622, "ce_loss_4": 3.7486638903617857, "ce_loss_9": 3.388227391242981, "epoch": 0.957, "grad_norm": 1344.0, "kl_loss_13": 127.39441986083985, "kl_loss_2": 2112.774267578125, "kl_loss_4": 1265.8854919433593, "kl_loss_9": 475.4095520019531, "learning_rate": 4.6476381931251366e-06, "loss": 972.3218, "step": 9570 }, { "ce_loss_13": 3.2345940828323365, "ce_loss_17": 3.183322012424469, "ce_loss_2": 4.137310314178467, "ce_loss_4": 3.7292892694473267, "ce_loss_9": 3.3763051509857176, "epoch": 0.958, "grad_norm": 844.0, "kl_loss_13": 124.14376068115234, "kl_loss_2": 2045.5467529296875, "kl_loss_4": 1237.586474609375, "kl_loss_9": 462.7819885253906, "learning_rate": 4.434298949819449e-06, "loss": 974.5975, "step": 9580 }, { "ce_loss_13": 3.1979801535606383, "ce_loss_17": 3.145884561538696, "ce_loss_2": 4.158722984790802, "ce_loss_4": 3.719670367240906, "ce_loss_9": 3.347254014015198, "epoch": 0.959, "grad_norm": 1256.0, "kl_loss_13": 128.95343589782715, "kl_loss_2": 2186.166943359375, "kl_loss_4": 1319.104541015625, "kl_loss_9": 487.32177581787107, "learning_rate": 4.2259500476214406e-06, "loss": 1006.9165, "step": 9590 }, { "ce_loss_13": 3.177784872055054, "ce_loss_17": 3.126880383491516, "ce_loss_2": 4.111366820335388, "ce_loss_4": 3.675239050388336, "ce_loss_9": 3.319349431991577, "epoch": 0.96, "grad_norm": 1080.0, "kl_loss_13": 124.77213478088379, "kl_loss_2": 2120.651959228516, "kl_loss_4": 1265.057928466797, "kl_loss_9": 471.9594467163086, "learning_rate": 4.02259358460233e-06, "loss": 981.0849, "step": 9600 }, { "ce_loss_13": 3.2416202902793883, "ce_loss_17": 3.191871929168701, "ce_loss_2": 4.147826766967773, "ce_loss_4": 3.727266800403595, "ce_loss_9": 3.3792614579200744, "epoch": 0.961, "grad_norm": 1008.0, "kl_loss_13": 125.40574684143067, "kl_loss_2": 2043.645684814453, "kl_loss_4": 1218.6265930175782, "kl_loss_9": 467.4931701660156, "learning_rate": 3.8242316085594916e-06, "loss": 965.6211, "step": 9610 }, { "ce_loss_13": 3.129376709461212, "ce_loss_17": 3.0780511856079102, "ce_loss_2": 4.116125667095185, "ce_loss_4": 3.6551926374435424, "ce_loss_9": 3.281545543670654, "epoch": 0.962, "grad_norm": 1096.0, "kl_loss_13": 127.64438514709472, "kl_loss_2": 2225.2640014648437, "kl_loss_4": 1310.2885131835938, "kl_loss_9": 485.20188903808594, "learning_rate": 3.630866116995757e-06, "loss": 1017.9228, "step": 9620 }, { "ce_loss_13": 3.28063040971756, "ce_loss_17": 3.231550455093384, "ce_loss_2": 4.170048546791077, "ce_loss_4": 3.7599850058555604, "ce_loss_9": 3.415730583667755, "epoch": 0.963, "grad_norm": 1012.0, "kl_loss_13": 124.0875675201416, "kl_loss_2": 2025.6632202148437, "kl_loss_4": 1213.776123046875, "kl_loss_9": 460.5186569213867, "learning_rate": 3.4424990570994797e-06, "loss": 987.3754, "step": 9630 }, { "ce_loss_13": 3.2675995945930483, "ce_loss_17": 3.2170332074165344, "ce_loss_2": 4.162374007701874, "ce_loss_4": 3.7501341581344603, "ce_loss_9": 3.4064988136291503, "epoch": 0.964, "grad_norm": 892.0, "kl_loss_13": 124.54148216247559, "kl_loss_2": 2043.5998840332031, "kl_loss_4": 1232.287158203125, "kl_loss_9": 463.18982696533203, "learning_rate": 3.2591323257248896e-06, "loss": 971.5918, "step": 9640 }, { "ce_loss_13": 3.124784469604492, "ce_loss_17": 3.0746599674224853, "ce_loss_2": 4.06125146150589, "ce_loss_4": 3.631430685520172, "ce_loss_9": 3.2648494601249696, "epoch": 0.965, "grad_norm": 1200.0, "kl_loss_13": 123.14472312927246, "kl_loss_2": 2100.73798828125, "kl_loss_4": 1261.9340759277343, "kl_loss_9": 467.69382934570314, "learning_rate": 3.0807677693729385e-06, "loss": 991.8986, "step": 9650 }, { "ce_loss_13": 3.3062557220458983, "ce_loss_17": 3.2574764609336855, "ce_loss_2": 4.201003980636597, "ce_loss_4": 3.7909571647644045, "ce_loss_9": 3.44566832780838, "epoch": 0.966, "grad_norm": 1248.0, "kl_loss_13": 124.11007270812988, "kl_loss_2": 2037.4576721191406, "kl_loss_4": 1231.887139892578, "kl_loss_9": 462.4075668334961, "learning_rate": 2.9074071841727055e-06, "loss": 961.5154, "step": 9660 }, { "ce_loss_13": 3.235342967510223, "ce_loss_17": 3.1868611931800843, "ce_loss_2": 4.148071026802063, "ce_loss_4": 3.7373862504959106, "ce_loss_9": 3.381422054767609, "epoch": 0.967, "grad_norm": 1328.0, "kl_loss_13": 126.08542900085449, "kl_loss_2": 2067.251837158203, "kl_loss_4": 1252.8617431640625, "kl_loss_9": 473.15320587158203, "learning_rate": 2.739052315863355e-06, "loss": 960.7016, "step": 9670 }, { "ce_loss_13": 3.210388922691345, "ce_loss_17": 3.1609103322029113, "ce_loss_2": 4.1357337474823, "ce_loss_4": 3.702705669403076, "ce_loss_9": 3.346673572063446, "epoch": 0.968, "grad_norm": 1136.0, "kl_loss_13": 124.91579856872559, "kl_loss_2": 2088.6549865722654, "kl_loss_4": 1247.3663391113282, "kl_loss_9": 461.30359039306643, "learning_rate": 2.5757048597765396e-06, "loss": 969.2777, "step": 9680 }, { "ce_loss_13": 3.2272926211357116, "ce_loss_17": 3.1778834462165833, "ce_loss_2": 4.157880508899689, "ce_loss_4": 3.727294051647186, "ce_loss_9": 3.3687225341796876, "epoch": 0.969, "grad_norm": 1056.0, "kl_loss_13": 123.88517990112305, "kl_loss_2": 2099.731866455078, "kl_loss_4": 1258.9116271972657, "kl_loss_9": 468.5845092773437, "learning_rate": 2.417366460819359e-06, "loss": 980.8228, "step": 9690 }, { "ce_loss_13": 3.2358564734458923, "ce_loss_17": 3.186364459991455, "ce_loss_2": 4.190982413291931, "ce_loss_4": 3.750655484199524, "ce_loss_9": 3.384175479412079, "epoch": 0.97, "grad_norm": 1104.0, "kl_loss_13": 128.23539390563965, "kl_loss_2": 2141.466271972656, "kl_loss_4": 1277.190234375, "kl_loss_9": 478.7344909667969, "learning_rate": 2.2640387134577057e-06, "loss": 980.0844, "step": 9700 }, { "ce_loss_13": 3.1684568881988526, "ce_loss_17": 3.1200954914093018, "ce_loss_2": 4.045890080928802, "ce_loss_4": 3.643348491191864, "ce_loss_9": 3.304252767562866, "epoch": 0.971, "grad_norm": 1120.0, "kl_loss_13": 118.9888671875, "kl_loss_2": 1977.8094421386718, "kl_loss_4": 1189.5280700683593, "kl_loss_9": 446.3478118896484, "learning_rate": 2.115723161700278e-06, "loss": 950.4072, "step": 9710 }, { "ce_loss_13": 3.149448072910309, "ce_loss_17": 3.0986252307891844, "ce_loss_2": 4.10426265001297, "ce_loss_4": 3.661156380176544, "ce_loss_9": 3.2953903317451476, "epoch": 0.972, "grad_norm": 1064.0, "kl_loss_13": 127.82453002929688, "kl_loss_2": 2147.044091796875, "kl_loss_4": 1280.9542358398437, "kl_loss_9": 481.2591491699219, "learning_rate": 1.9724212990830937e-06, "loss": 1001.5318, "step": 9720 }, { "ce_loss_13": 3.284513461589813, "ce_loss_17": 3.2343594074249267, "ce_loss_2": 4.22420608997345, "ce_loss_4": 3.7923627853393556, "ce_loss_9": 3.4308021903038024, "epoch": 0.973, "grad_norm": 880.0, "kl_loss_13": 126.76237907409669, "kl_loss_2": 2118.9266967773438, "kl_loss_4": 1268.1193359375, "kl_loss_9": 474.581884765625, "learning_rate": 1.8341345686543331e-06, "loss": 987.1477, "step": 9730 }, { "ce_loss_13": 3.269551396369934, "ce_loss_17": 3.2222204566001893, "ce_loss_2": 4.151566231250763, "ce_loss_4": 3.742894542217255, "ce_loss_9": 3.409974718093872, "epoch": 0.974, "grad_norm": 1416.0, "kl_loss_13": 123.14921531677246, "kl_loss_2": 2002.3103088378907, "kl_loss_4": 1210.3436584472656, "kl_loss_9": 460.9965362548828, "learning_rate": 1.7008643629596864e-06, "loss": 979.6496, "step": 9740 }, { "ce_loss_13": 3.2518662810325623, "ce_loss_17": 3.201659917831421, "ce_loss_2": 4.170797145366668, "ce_loss_4": 3.740803039073944, "ce_loss_9": 3.3905496120452883, "epoch": 0.975, "grad_norm": 1200.0, "kl_loss_13": 124.72664642333984, "kl_loss_2": 2090.470294189453, "kl_loss_4": 1241.0427490234374, "kl_loss_9": 466.34546356201173, "learning_rate": 1.5726120240288633e-06, "loss": 994.2612, "step": 9750 }, { "ce_loss_13": 3.1562037229537965, "ce_loss_17": 3.1069855093955994, "ce_loss_2": 4.074446082115173, "ce_loss_4": 3.652340459823608, "ce_loss_9": 3.295357358455658, "epoch": 0.976, "grad_norm": 952.0, "kl_loss_13": 123.31254425048829, "kl_loss_2": 2066.401690673828, "kl_loss_4": 1242.8620727539062, "kl_loss_9": 464.1050720214844, "learning_rate": 1.4493788433612708e-06, "loss": 971.6017, "step": 9760 }, { "ce_loss_13": 3.274170434474945, "ce_loss_17": 3.2258036971092223, "ce_loss_2": 4.195810759067536, "ce_loss_4": 3.7765634536743162, "ce_loss_9": 3.417754316329956, "epoch": 0.977, "grad_norm": 840.0, "kl_loss_13": 125.30190086364746, "kl_loss_2": 2099.268420410156, "kl_loss_4": 1268.2574890136718, "kl_loss_9": 475.16844177246094, "learning_rate": 1.3311660619138578e-06, "loss": 989.599, "step": 9770 }, { "ce_loss_13": 3.2723480820655824, "ce_loss_17": 3.223863685131073, "ce_loss_2": 4.138527846336364, "ce_loss_4": 3.7411954164505006, "ce_loss_9": 3.4104616761207582, "epoch": 0.978, "grad_norm": 984.0, "kl_loss_13": 125.73738975524903, "kl_loss_2": 1984.028741455078, "kl_loss_4": 1208.4887084960938, "kl_loss_9": 467.6315032958984, "learning_rate": 1.2179748700879012e-06, "loss": 970.9781, "step": 9780 }, { "ce_loss_13": 3.1942247629165648, "ce_loss_17": 3.145608389377594, "ce_loss_2": 4.114635896682739, "ce_loss_4": 3.6870145559310914, "ce_loss_9": 3.336309015750885, "epoch": 0.979, "grad_norm": 1320.0, "kl_loss_13": 123.52474479675293, "kl_loss_2": 2065.256066894531, "kl_loss_4": 1232.7650329589844, "kl_loss_9": 462.83345489501954, "learning_rate": 1.1098064077174619e-06, "loss": 980.1039, "step": 9790 }, { "ce_loss_13": 3.225843811035156, "ce_loss_17": 3.1733697414398194, "ce_loss_2": 4.170241928100586, "ce_loss_4": 3.7319687724113466, "ce_loss_9": 3.3707484006881714, "epoch": 0.98, "grad_norm": 936.0, "kl_loss_13": 124.23649864196777, "kl_loss_2": 2119.903448486328, "kl_loss_4": 1266.664892578125, "kl_loss_9": 471.4155990600586, "learning_rate": 1.006661764057837e-06, "loss": 980.9118, "step": 9800 }, { "ce_loss_13": 3.232490563392639, "ce_loss_17": 3.1838974237442015, "ce_loss_2": 4.144217872619629, "ce_loss_4": 3.7293247103691103, "ce_loss_9": 3.3732499957084654, "epoch": 0.981, "grad_norm": 1288.0, "kl_loss_13": 123.3721248626709, "kl_loss_2": 2062.827740478516, "kl_loss_4": 1245.1588562011718, "kl_loss_9": 463.1363006591797, "learning_rate": 9.085419777743465e-07, "loss": 968.233, "step": 9810 }, { "ce_loss_13": 3.1816325426101684, "ce_loss_17": 3.1337949514389036, "ce_loss_2": 4.1018049120903015, "ce_loss_4": 3.6814963102340696, "ce_loss_9": 3.3242132306098937, "epoch": 0.982, "grad_norm": 1096.0, "kl_loss_13": 121.63786392211914, "kl_loss_2": 2080.9870056152345, "kl_loss_4": 1253.1997131347657, "kl_loss_9": 460.1046890258789, "learning_rate": 8.15448036932176e-07, "loss": 962.873, "step": 9820 }, { "ce_loss_13": 3.222775232791901, "ce_loss_17": 3.1739914655685424, "ce_loss_2": 4.14130425453186, "ce_loss_4": 3.719357895851135, "ce_loss_9": 3.366613733768463, "epoch": 0.983, "grad_norm": 976.0, "kl_loss_13": 124.51547050476074, "kl_loss_2": 2080.1892150878907, "kl_loss_4": 1255.691357421875, "kl_loss_9": 471.56447143554686, "learning_rate": 7.273808789862724e-07, "loss": 989.4896, "step": 9830 }, { "ce_loss_13": 3.299858570098877, "ce_loss_17": 3.2506449460983275, "ce_loss_2": 4.20594289302826, "ce_loss_4": 3.786842679977417, "ce_loss_9": 3.4402770400047302, "epoch": 0.984, "grad_norm": 992.0, "kl_loss_13": 126.09925537109375, "kl_loss_2": 2071.2891174316405, "kl_loss_4": 1248.3088500976562, "kl_loss_9": 472.49950714111327, "learning_rate": 6.443413907720186e-07, "loss": 973.4876, "step": 9840 }, { "ce_loss_13": 3.24042694568634, "ce_loss_17": 3.191318082809448, "ce_loss_2": 4.158090043067932, "ce_loss_4": 3.730769419670105, "ce_loss_9": 3.381055843830109, "epoch": 0.985, "grad_norm": 1240.0, "kl_loss_13": 125.07364501953126, "kl_loss_2": 2053.5721923828123, "kl_loss_4": 1225.7321350097657, "kl_loss_9": 464.7360107421875, "learning_rate": 5.663304084960185e-07, "loss": 965.8166, "step": 9850 }, { "ce_loss_13": 3.169883835315704, "ce_loss_17": 3.1190903544425965, "ce_loss_2": 4.097458934783935, "ce_loss_4": 3.6755189538002013, "ce_loss_9": 3.3127743721008303, "epoch": 0.986, "grad_norm": 1184.0, "kl_loss_13": 124.74057044982911, "kl_loss_2": 2095.6127014160156, "kl_loss_4": 1263.0129272460938, "kl_loss_9": 470.11038970947266, "learning_rate": 4.933487177280482e-07, "loss": 966.9545, "step": 9860 }, { "ce_loss_13": 3.26495875120163, "ce_loss_17": 3.2180663347244263, "ce_loss_2": 4.161872386932373, "ce_loss_4": 3.7495540261268614, "ce_loss_9": 3.403304624557495, "epoch": 0.987, "grad_norm": 1216.0, "kl_loss_13": 122.3028465270996, "kl_loss_2": 2044.4353637695312, "kl_loss_4": 1235.3431579589844, "kl_loss_9": 458.77367401123047, "learning_rate": 4.2539705339295075e-07, "loss": 962.6967, "step": 9870 }, { "ce_loss_13": 3.123441767692566, "ce_loss_17": 3.073697257041931, "ce_loss_2": 4.055367410182953, "ce_loss_4": 3.6288355350494386, "ce_loss_9": 3.2657039642333983, "epoch": 0.988, "grad_norm": 1096.0, "kl_loss_13": 123.01094436645508, "kl_loss_2": 2099.833673095703, "kl_loss_4": 1265.3508178710938, "kl_loss_9": 469.0618606567383, "learning_rate": 3.6247609976319816e-07, "loss": 971.042, "step": 9880 }, { "ce_loss_13": 3.2125206351280213, "ce_loss_17": 3.1627071142196654, "ce_loss_2": 4.159348356723785, "ce_loss_4": 3.722857928276062, "ce_loss_9": 3.35996561050415, "epoch": 0.989, "grad_norm": 1176.0, "kl_loss_13": 126.69400634765626, "kl_loss_2": 2117.1642822265626, "kl_loss_4": 1264.5933715820313, "kl_loss_9": 475.0468276977539, "learning_rate": 3.0458649045211895e-07, "loss": 1003.3741, "step": 9890 }, { "ce_loss_13": 3.1834314584732057, "ce_loss_17": 3.1319055557250977, "ce_loss_2": 4.12213898897171, "ce_loss_4": 3.6939411520957948, "ce_loss_9": 3.333473098278046, "epoch": 0.99, "grad_norm": 976.0, "kl_loss_13": 127.41187896728516, "kl_loss_2": 2090.6347534179686, "kl_loss_4": 1269.0287536621095, "kl_loss_9": 477.75504913330076, "learning_rate": 2.517288084074587e-07, "loss": 1000.2297, "step": 9900 }, { "ce_loss_13": 3.223750054836273, "ce_loss_17": 3.169668364524841, "ce_loss_2": 4.1834455013275145, "ce_loss_4": 3.7485483288764954, "ce_loss_9": 3.3776402115821837, "epoch": 0.991, "grad_norm": 1048.0, "kl_loss_13": 128.45537338256835, "kl_loss_2": 2146.605078125, "kl_loss_4": 1295.8306518554687, "kl_loss_9": 486.71576538085935, "learning_rate": 2.0390358590538505e-07, "loss": 995.9149, "step": 9910 }, { "ce_loss_13": 3.2347757816314697, "ce_loss_17": 3.1841603994369505, "ce_loss_2": 4.154892480373382, "ce_loss_4": 3.7379157066345217, "ce_loss_9": 3.3806414365768434, "epoch": 0.992, "grad_norm": 776.0, "kl_loss_13": 127.02340507507324, "kl_loss_2": 2088.952520751953, "kl_loss_4": 1269.9161682128906, "kl_loss_9": 475.69568939208983, "learning_rate": 1.61111304545436e-07, "loss": 975.4877, "step": 9920 }, { "ce_loss_13": 3.2007062673568725, "ce_loss_17": 3.1494250535964965, "ce_loss_2": 4.113853752613068, "ce_loss_4": 3.69007865190506, "ce_loss_9": 3.3415071845054625, "epoch": 0.993, "grad_norm": 988.0, "kl_loss_13": 124.35595512390137, "kl_loss_2": 2075.55322265625, "kl_loss_4": 1247.1264038085938, "kl_loss_9": 468.97376251220703, "learning_rate": 1.2335239524541298e-07, "loss": 967.7408, "step": 9930 }, { "ce_loss_13": 3.168844985961914, "ce_loss_17": 3.120987856388092, "ce_loss_2": 4.08349826335907, "ce_loss_4": 3.6675760984420775, "ce_loss_9": 3.3123563647270204, "epoch": 0.994, "grad_norm": 968.0, "kl_loss_13": 124.11492691040038, "kl_loss_2": 2061.1263793945313, "kl_loss_4": 1242.1570068359374, "kl_loss_9": 466.60970611572264, "learning_rate": 9.06272382371065e-08, "loss": 976.9746, "step": 9940 }, { "ce_loss_13": 3.235102653503418, "ce_loss_17": 3.1886075735092163, "ce_loss_2": 4.176844072341919, "ce_loss_4": 3.740602457523346, "ce_loss_9": 3.3821743607521055, "epoch": 0.995, "grad_norm": 1072.0, "kl_loss_13": 127.04208297729492, "kl_loss_2": 2126.5141174316404, "kl_loss_4": 1272.3483032226563, "kl_loss_9": 479.88003997802736, "learning_rate": 6.293616306246586e-08, "loss": 985.1492, "step": 9950 }, { "ce_loss_13": 3.231929361820221, "ce_loss_17": 3.1844713568687437, "ce_loss_2": 4.117704975605011, "ce_loss_4": 3.715014934539795, "ce_loss_9": 3.3697912454605103, "epoch": 0.996, "grad_norm": 1004.0, "kl_loss_13": 121.88327102661133, "kl_loss_2": 2019.7053100585938, "kl_loss_4": 1224.4122802734375, "kl_loss_9": 459.62804260253904, "learning_rate": 4.027944857032395e-08, "loss": 948.9772, "step": 9960 }, { "ce_loss_13": 3.2247726798057554, "ce_loss_17": 3.178985619544983, "ce_loss_2": 4.098883247375488, "ce_loss_4": 3.6907734274864197, "ce_loss_9": 3.358437979221344, "epoch": 0.997, "grad_norm": 896.0, "kl_loss_13": 119.1805362701416, "kl_loss_2": 1971.4253356933593, "kl_loss_4": 1177.7392456054688, "kl_loss_9": 441.6632537841797, "learning_rate": 2.265732291356626e-08, "loss": 940.9627, "step": 9970 }, { "ce_loss_13": 3.267332434654236, "ce_loss_17": 3.2172287464141847, "ce_loss_2": 4.15795384645462, "ce_loss_4": 3.746636116504669, "ce_loss_9": 3.40659499168396, "epoch": 0.998, "grad_norm": 820.0, "kl_loss_13": 124.0533821105957, "kl_loss_2": 2012.9753479003907, "kl_loss_4": 1212.7530822753906, "kl_loss_9": 462.41087341308594, "learning_rate": 1.0069963546743833e-08, "loss": 977.1803, "step": 9980 }, { "ce_loss_13": 3.249015736579895, "ce_loss_17": 3.1974956154823304, "ce_loss_2": 4.166177749633789, "ce_loss_4": 3.743379867076874, "ce_loss_9": 3.392880606651306, "epoch": 0.999, "grad_norm": 908.0, "kl_loss_13": 125.0546661376953, "kl_loss_2": 2078.3261779785157, "kl_loss_4": 1243.9115783691407, "kl_loss_9": 470.76358184814455, "learning_rate": 2.517497224463483e-09, "loss": 974.0283, "step": 9990 }, { "ce_loss_13": 3.1986840963363647, "ce_loss_17": 3.146980345249176, "ce_loss_2": 4.173895561695099, "ce_loss_4": 3.7262062072753905, "ce_loss_9": 3.3516369342803953, "epoch": 1.0, "grad_norm": 1280.0, "kl_loss_13": 127.69902534484864, "kl_loss_2": 2181.7874267578127, "kl_loss_4": 1303.1703796386719, "kl_loss_9": 485.4946563720703, "learning_rate": 0.0, "loss": 1006.608, "step": 10000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.447557417823109e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }