{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "ce_loss_12": 11.027713775634766, "ce_loss_17": 8.15991497039795, "ce_loss_23": 2.965851306915283, "ce_loss_3": 10.737602233886719, "ce_loss_6": 11.190012454986572, "epoch": 0.0001, "grad_norm": 45824.0, "kl_loss_12": 17517.9384765625, "kl_loss_17": 11834.82177734375, "kl_loss_3": 16256.49951171875, "kl_loss_6": 17406.0986328125, "learning_rate": 1e-05, "loss": 16093.7969, "step": 1 }, { "ce_loss_12": 8.402964644961887, "ce_loss_17": 6.732524315516154, "ce_loss_23": 3.0286831193500094, "ce_loss_3": 9.210747030046251, "ce_loss_6": 9.099789328045315, "epoch": 0.001, "grad_norm": 13056.0, "kl_loss_12": 11511.38658311632, "kl_loss_17": 8410.016059027777, "kl_loss_3": 12613.807562934027, "kl_loss_6": 12666.355251736111, "learning_rate": 0.0001, "loss": 11336.5642, "step": 10 }, { "ce_loss_12": 5.460555911064148, "ce_loss_17": 4.456392741203308, "ce_loss_23": 3.0322046637535096, "ce_loss_3": 7.06470832824707, "ce_loss_6": 6.68152768611908, "epoch": 0.002, "grad_norm": 3008.0, "kl_loss_12": 4608.932666015625, "kl_loss_17": 2811.935650634766, "kl_loss_3": 7636.5174560546875, "kl_loss_6": 6889.9828125, "learning_rate": 0.0002, "loss": 5498.3336, "step": 20 }, { "ce_loss_12": 4.363038396835327, "ce_loss_17": 3.561879241466522, "ce_loss_23": 2.842976653575897, "ce_loss_3": 6.049652767181397, "ce_loss_6": 5.463876032829285, "epoch": 0.003, "grad_norm": 2864.0, "kl_loss_12": 2808.842004394531, "kl_loss_17": 1373.155780029297, "kl_loss_3": 6175.716845703125, "kl_loss_6": 5020.55859375, "learning_rate": 0.0003, "loss": 3815.8828, "step": 30 }, { "ce_loss_12": 4.206813776493073, "ce_loss_17": 3.54139963388443, "ce_loss_23": 3.007910418510437, "ce_loss_3": 5.702505111694336, "ce_loss_6": 5.18826208114624, "epoch": 0.004, "grad_norm": 1472.0, "kl_loss_12": 2237.032061767578, "kl_loss_17": 974.1219360351563, "kl_loss_3": 5216.508642578125, "kl_loss_6": 4193.483740234375, "learning_rate": 0.0004, "loss": 3168.891, "step": 40 }, { "ce_loss_12": 3.999967801570892, "ce_loss_17": 3.3918786883354186, "ce_loss_23": 2.9720388770103456, "ce_loss_3": 5.477003407478333, "ce_loss_6": 4.948132801055908, "epoch": 0.005, "grad_norm": 2352.0, "kl_loss_12": 1895.5821166992187, "kl_loss_17": 762.8674591064453, "kl_loss_3": 4879.4290771484375, "kl_loss_6": 3856.8288696289064, "learning_rate": 0.0005, "loss": 2838.7225, "step": 50 }, { "ce_loss_12": 3.9077433347702026, "ce_loss_17": 3.329902720451355, "ce_loss_23": 2.9878324031829835, "ce_loss_3": 5.317534351348877, "ce_loss_6": 4.759640908241272, "epoch": 0.006, "grad_norm": 1752.0, "kl_loss_12": 1724.694140625, "kl_loss_17": 611.9558654785156, "kl_loss_3": 4550.454077148437, "kl_loss_6": 3437.15751953125, "learning_rate": 0.0006, "loss": 2595.8715, "step": 60 }, { "ce_loss_12": 3.709681046009064, "ce_loss_17": 3.2019996762275698, "ce_loss_23": 2.9102911412715913, "ce_loss_3": 5.179440522193909, "ce_loss_6": 4.576542472839355, "epoch": 0.007, "grad_norm": 1256.0, "kl_loss_12": 1498.8394775390625, "kl_loss_17": 528.5001907348633, "kl_loss_3": 4437.479809570313, "kl_loss_6": 3269.5957275390624, "learning_rate": 0.0007, "loss": 2441.3959, "step": 70 }, { "ce_loss_12": 3.632861626148224, "ce_loss_17": 3.1742521047592165, "ce_loss_23": 2.9080315828323364, "ce_loss_3": 5.128901815414428, "ce_loss_6": 4.467587268352508, "epoch": 0.008, "grad_norm": 1616.0, "kl_loss_12": 1368.2988586425781, "kl_loss_17": 472.89087371826173, "kl_loss_3": 4385.906018066406, "kl_loss_6": 3072.7216918945314, "learning_rate": 0.0008, "loss": 2346.1152, "step": 80 }, { "ce_loss_12": 3.569024109840393, "ce_loss_17": 3.1099191904067993, "ce_loss_23": 2.872076082229614, "ce_loss_3": 5.064512729644775, "ce_loss_6": 4.37619469165802, "epoch": 0.009, "grad_norm": 1744.0, "kl_loss_12": 1336.7516662597657, "kl_loss_17": 441.59217834472656, "kl_loss_3": 4326.397912597657, "kl_loss_6": 2987.296838378906, "learning_rate": 0.0009000000000000001, "loss": 2263.0539, "step": 90 }, { "ce_loss_12": 3.666916310787201, "ce_loss_17": 3.244144332408905, "ce_loss_23": 2.976801860332489, "ce_loss_3": 5.146626138687134, "ce_loss_6": 4.481168866157532, "epoch": 0.01, "grad_norm": 1872.0, "kl_loss_12": 1313.0016296386718, "kl_loss_17": 508.8273468017578, "kl_loss_3": 4289.265026855469, "kl_loss_6": 2982.880078125, "learning_rate": 0.001, "loss": 2259.932, "step": 100 }, { "ce_loss_12": 3.6309967994689942, "ce_loss_17": 3.192095232009888, "ce_loss_23": 2.941200518608093, "ce_loss_3": 5.068640613555909, "ce_loss_6": 4.384789574146271, "epoch": 0.011, "grad_norm": 1440.0, "kl_loss_12": 1322.830255126953, "kl_loss_17": 458.69312591552733, "kl_loss_3": 4218.377465820313, "kl_loss_6": 2874.042236328125, "learning_rate": 0.0009999974825027757, "loss": 2212.0162, "step": 110 }, { "ce_loss_12": 3.648383581638336, "ce_loss_17": 3.2187762379646303, "ce_loss_23": 3.0047112464904786, "ce_loss_3": 4.9990413904190065, "ce_loss_6": 4.37911046743393, "epoch": 0.012, "grad_norm": 1168.0, "kl_loss_12": 1230.2259948730468, "kl_loss_17": 396.9843719482422, "kl_loss_3": 4003.4096557617186, "kl_loss_6": 2775.913610839844, "learning_rate": 0.0009999899300364532, "loss": 2079.7395, "step": 120 }, { "ce_loss_12": 3.5785240173339843, "ce_loss_17": 3.1897997498512267, "ce_loss_23": 2.9631885051727296, "ce_loss_3": 5.012343430519104, "ce_loss_6": 4.35903844833374, "epoch": 0.013, "grad_norm": 1000.0, "kl_loss_12": 1155.5515258789062, "kl_loss_17": 420.0738891601562, "kl_loss_3": 4048.8492919921873, "kl_loss_6": 2753.6798706054688, "learning_rate": 0.0009999773426770863, "loss": 2102.8152, "step": 130 }, { "ce_loss_12": 3.590196990966797, "ce_loss_17": 3.2399143815040587, "ce_loss_23": 3.006642484664917, "ce_loss_3": 4.972180771827698, "ce_loss_6": 4.330150175094604, "epoch": 0.014, "grad_norm": 1040.0, "kl_loss_12": 1109.9426330566407, "kl_loss_17": 424.5212860107422, "kl_loss_3": 3912.0217895507812, "kl_loss_6": 2661.2727783203127, "learning_rate": 0.0009999597205514296, "loss": 2044.4775, "step": 140 }, { "ce_loss_12": 3.520315718650818, "ce_loss_17": 3.170581007003784, "ce_loss_23": 2.962330865859985, "ce_loss_3": 4.890919160842896, "ce_loss_6": 4.267568159103393, "epoch": 0.015, "grad_norm": 964.0, "kl_loss_12": 1056.9698181152344, "kl_loss_17": 380.9711288452148, "kl_loss_3": 3821.2818237304687, "kl_loss_6": 2600.436962890625, "learning_rate": 0.0009999370638369377, "loss": 1974.6367, "step": 150 }, { "ce_loss_12": 3.5510578632354735, "ce_loss_17": 3.1863274335861207, "ce_loss_23": 3.0006508588790894, "ce_loss_3": 4.916771864891052, "ce_loss_6": 4.249840044975281, "epoch": 0.016, "grad_norm": 1168.0, "kl_loss_12": 1066.2155700683593, "kl_loss_17": 342.7077606201172, "kl_loss_3": 3826.1323852539062, "kl_loss_6": 2510.92431640625, "learning_rate": 0.000999909372761763, "loss": 1937.6813, "step": 160 }, { "ce_loss_12": 3.492412841320038, "ce_loss_17": 3.1263466000556948, "ce_loss_23": 2.9409194469451903, "ce_loss_3": 4.879445147514343, "ce_loss_6": 4.203813099861145, "epoch": 0.017, "grad_norm": 880.0, "kl_loss_12": 1060.1210083007813, "kl_loss_17": 335.2873870849609, "kl_loss_3": 3886.2561767578127, "kl_loss_6": 2559.63173828125, "learning_rate": 0.0009998766476047546, "loss": 1974.0215, "step": 170 }, { "ce_loss_12": 3.5242559790611265, "ce_loss_17": 3.1614002346992494, "ce_loss_23": 2.975456976890564, "ce_loss_3": 4.904916667938233, "ce_loss_6": 4.241925597190857, "epoch": 0.018, "grad_norm": 788.0, "kl_loss_12": 1035.621078491211, "kl_loss_17": 333.09861907958987, "kl_loss_3": 3837.9483276367187, "kl_loss_6": 2543.54970703125, "learning_rate": 0.0009998388886954545, "loss": 1955.3641, "step": 180 }, { "ce_loss_12": 3.4557549357414246, "ce_loss_17": 3.115843212604523, "ce_loss_23": 2.939819598197937, "ce_loss_3": 4.865855956077576, "ce_loss_6": 4.206141090393066, "epoch": 0.019, "grad_norm": 848.0, "kl_loss_12": 983.1588684082031, "kl_loss_17": 315.833642578125, "kl_loss_3": 3826.3384033203124, "kl_loss_6": 2540.1477905273437, "learning_rate": 0.0009997960964140947, "loss": 1911.9391, "step": 190 }, { "ce_loss_12": 3.4466380953788756, "ce_loss_17": 3.1169066548347475, "ce_loss_23": 2.940021049976349, "ce_loss_3": 4.852689170837403, "ce_loss_6": 4.1931051969528195, "epoch": 0.02, "grad_norm": 816.0, "kl_loss_12": 982.9840148925781, "kl_loss_17": 335.00098419189453, "kl_loss_3": 3858.1364135742188, "kl_loss_6": 2551.3482177734377, "learning_rate": 0.0009997482711915926, "loss": 1915.3297, "step": 200 }, { "ce_loss_12": 3.3939271569252014, "ce_loss_17": 3.0816601395606993, "ce_loss_23": 2.9188946008682253, "ce_loss_3": 4.788400340080261, "ce_loss_6": 4.114619171619415, "epoch": 0.021, "grad_norm": 784.0, "kl_loss_12": 928.4573394775391, "kl_loss_17": 305.8069580078125, "kl_loss_3": 3751.7100341796877, "kl_loss_6": 2432.4765625, "learning_rate": 0.0009996954135095479, "loss": 1853.0043, "step": 210 }, { "ce_loss_12": 3.4684497714042664, "ce_loss_17": 3.1626001238822936, "ce_loss_23": 2.989655339717865, "ce_loss_3": 4.786516451835633, "ce_loss_6": 4.147206258773804, "epoch": 0.022, "grad_norm": 716.0, "kl_loss_12": 920.043212890625, "kl_loss_17": 303.1504531860352, "kl_loss_3": 3612.4007934570313, "kl_loss_6": 2348.7169921875, "learning_rate": 0.0009996375239002368, "loss": 1794.6467, "step": 220 }, { "ce_loss_12": 3.5193947434425352, "ce_loss_17": 3.214660811424255, "ce_loss_23": 3.05567672252655, "ce_loss_3": 4.798125290870667, "ce_loss_6": 4.18331127166748, "epoch": 0.023, "grad_norm": 836.0, "kl_loss_12": 896.357308959961, "kl_loss_17": 297.10578002929685, "kl_loss_3": 3529.04033203125, "kl_loss_6": 2312.949139404297, "learning_rate": 0.0009995746029466072, "loss": 1758.3887, "step": 230 }, { "ce_loss_12": 3.3271968960762024, "ce_loss_17": 3.020535373687744, "ce_loss_23": 2.8603257536888123, "ce_loss_3": 4.72745258808136, "ce_loss_6": 4.0547412753105165, "epoch": 0.024, "grad_norm": 704.0, "kl_loss_12": 917.2084899902344, "kl_loss_17": 300.5008575439453, "kl_loss_3": 3771.9653076171876, "kl_loss_6": 2428.7272094726563, "learning_rate": 0.0009995066512822719, "loss": 1795.3527, "step": 240 }, { "ce_loss_12": 3.4247890949249267, "ce_loss_17": 3.1120498538017274, "ce_loss_23": 2.954189193248749, "ce_loss_3": 4.8511217594146725, "ce_loss_6": 4.178792726993561, "epoch": 0.025, "grad_norm": 872.0, "kl_loss_12": 906.7326904296875, "kl_loss_17": 287.6933708190918, "kl_loss_3": 3802.542761230469, "kl_loss_6": 2477.5896545410155, "learning_rate": 0.000999433669591504, "loss": 1791.8957, "step": 250 }, { "ce_loss_12": 3.3140273809432985, "ce_loss_17": 3.0202324271202086, "ce_loss_23": 2.8615636587142945, "ce_loss_3": 4.698658227920532, "ce_loss_6": 4.027096796035766, "epoch": 0.026, "grad_norm": 936.0, "kl_loss_12": 875.9568481445312, "kl_loss_17": 295.5630393981934, "kl_loss_3": 3720.644665527344, "kl_loss_6": 2388.5712341308595, "learning_rate": 0.000999355658609228, "loss": 1786.2816, "step": 260 }, { "ce_loss_12": 3.3641026854515075, "ce_loss_17": 3.0714613795280457, "ce_loss_23": 2.888243865966797, "ce_loss_3": 4.784202647209168, "ce_loss_6": 4.083048665523529, "epoch": 0.027, "grad_norm": 780.0, "kl_loss_12": 890.0199035644531, "kl_loss_17": 320.20955657958984, "kl_loss_3": 3786.4882202148438, "kl_loss_6": 2406.1595092773437, "learning_rate": 0.0009992726191210138, "loss": 1818.2453, "step": 270 }, { "ce_loss_12": 3.380402755737305, "ce_loss_17": 3.0927987456321717, "ce_loss_23": 2.9245058417320253, "ce_loss_3": 4.725374484062195, "ce_loss_6": 4.070786094665527, "epoch": 0.028, "grad_norm": 864.0, "kl_loss_12": 873.3745178222656, "kl_loss_17": 311.5400817871094, "kl_loss_3": 3613.2262939453126, "kl_loss_6": 2304.922711181641, "learning_rate": 0.0009991845519630679, "loss": 1757.8809, "step": 280 }, { "ce_loss_12": 3.2682459712028504, "ce_loss_17": 2.973331022262573, "ce_loss_23": 2.8167771458625794, "ce_loss_3": 4.632596039772034, "ce_loss_6": 3.960002934932709, "epoch": 0.029, "grad_norm": 708.0, "kl_loss_12": 862.7825988769531, "kl_loss_17": 287.25118255615234, "kl_loss_3": 3642.8630859375, "kl_loss_6": 2318.68984375, "learning_rate": 0.0009990914580222257, "loss": 1770.6906, "step": 290 }, { "ce_loss_12": 3.3801196575164796, "ce_loss_17": 3.098010754585266, "ce_loss_23": 2.9501577854156493, "ce_loss_3": 4.652040600776672, "ce_loss_6": 4.022914433479309, "epoch": 0.03, "grad_norm": 720.0, "kl_loss_12": 830.7660217285156, "kl_loss_17": 263.96018981933594, "kl_loss_3": 3458.6974487304688, "kl_loss_6": 2199.962927246094, "learning_rate": 0.0009989933382359422, "loss": 1727.1951, "step": 300 }, { "ce_loss_12": 3.380411183834076, "ce_loss_17": 3.1013339519500733, "ce_loss_23": 2.9682934045791627, "ce_loss_3": 4.667791080474854, "ce_loss_6": 4.0293840050697325, "epoch": 0.031, "grad_norm": 772.0, "kl_loss_12": 811.3957733154297, "kl_loss_17": 246.9788589477539, "kl_loss_3": 3442.3103881835937, "kl_loss_6": 2183.353387451172, "learning_rate": 0.0009988901935922825, "loss": 1692.7398, "step": 310 }, { "ce_loss_12": 3.2473236680030824, "ce_loss_17": 2.9506585717201235, "ce_loss_23": 2.8136382699012756, "ce_loss_3": 4.609115219116211, "ce_loss_6": 3.9401437640190125, "epoch": 0.032, "grad_norm": 804.0, "kl_loss_12": 848.0315948486328, "kl_loss_17": 251.90018463134766, "kl_loss_3": 3642.8263916015626, "kl_loss_6": 2307.7186950683595, "learning_rate": 0.0009987820251299122, "loss": 1714.9273, "step": 320 }, { "ce_loss_12": 3.3609359502792358, "ce_loss_17": 3.068274986743927, "ce_loss_23": 2.9343497514724732, "ce_loss_3": 4.620841813087464, "ce_loss_6": 3.973675036430359, "epoch": 0.033, "grad_norm": 772.0, "kl_loss_12": 812.5999633789063, "kl_loss_17": 241.0101318359375, "kl_loss_3": 3420.3722534179688, "kl_loss_6": 2130.2982543945313, "learning_rate": 0.0009986688339380862, "loss": 1652.8191, "step": 330 }, { "ce_loss_12": 3.2960716605186464, "ce_loss_17": 3.024462711811066, "ce_loss_23": 2.894329881668091, "ce_loss_3": 4.547412347793579, "ce_loss_6": 3.915596878528595, "epoch": 0.034, "grad_norm": 616.0, "kl_loss_12": 790.2532562255859, "kl_loss_17": 232.27188873291016, "kl_loss_3": 3347.380505371094, "kl_loss_6": 2109.0370239257813, "learning_rate": 0.0009985506211566387, "loss": 1637.9498, "step": 340 }, { "ce_loss_12": 3.3186362624168395, "ce_loss_17": 3.0482547879219055, "ce_loss_23": 2.921323823928833, "ce_loss_3": 4.5384005308151245, "ce_loss_6": 3.9204172134399413, "epoch": 0.035, "grad_norm": 728.0, "kl_loss_12": 766.0471405029297, "kl_loss_17": 230.49155197143554, "kl_loss_3": 3281.216650390625, "kl_loss_6": 2051.8226928710938, "learning_rate": 0.0009984273879759713, "loss": 1596.1112, "step": 350 }, { "ce_loss_12": 3.3629719853401183, "ce_loss_17": 3.0898334860801695, "ce_loss_23": 2.949311065673828, "ce_loss_3": 4.616341137886048, "ce_loss_6": 3.9870093703269958, "epoch": 0.036, "grad_norm": 788.0, "kl_loss_12": 796.4837310791015, "kl_loss_17": 254.82774658203124, "kl_loss_3": 3361.358996582031, "kl_loss_6": 2109.398333740234, "learning_rate": 0.0009982991356370402, "loss": 1651.3512, "step": 360 }, { "ce_loss_12": 3.325030303001404, "ce_loss_17": 3.0661404371261596, "ce_loss_23": 2.927092969417572, "ce_loss_3": 4.571960759162903, "ce_loss_6": 3.94881272315979, "epoch": 0.037, "grad_norm": 672.0, "kl_loss_12": 778.3520477294921, "kl_loss_17": 251.09753952026367, "kl_loss_3": 3342.199499511719, "kl_loss_6": 2113.2349060058596, "learning_rate": 0.0009981658654313456, "loss": 1634.6564, "step": 370 }, { "ce_loss_12": 3.384810674190521, "ce_loss_17": 3.1309564113616943, "ce_loss_23": 2.999415564537048, "ce_loss_3": 4.60067994594574, "ce_loss_6": 3.989968252182007, "epoch": 0.038, "grad_norm": 712.0, "kl_loss_12": 750.3938720703125, "kl_loss_17": 242.80421447753906, "kl_loss_3": 3255.1913208007813, "kl_loss_6": 2043.9017211914063, "learning_rate": 0.000998027578700917, "loss": 1605.898, "step": 380 }, { "ce_loss_12": 3.339533841609955, "ce_loss_17": 3.0749796628952026, "ce_loss_23": 2.9452226638793944, "ce_loss_3": 4.574568462371826, "ce_loss_6": 3.956436836719513, "epoch": 0.039, "grad_norm": 604.0, "kl_loss_12": 756.8385131835937, "kl_loss_17": 238.58657760620116, "kl_loss_3": 3296.7185913085937, "kl_loss_6": 2071.6643005371093, "learning_rate": 0.0009978842768382998, "loss": 1609.5673, "step": 390 }, { "ce_loss_12": 3.3271549105644227, "ce_loss_17": 3.083773648738861, "ce_loss_23": 2.9559371709823608, "ce_loss_3": 4.531307077407837, "ce_loss_6": 3.919724702835083, "epoch": 0.04, "grad_norm": 648.0, "kl_loss_12": 712.7042816162109, "kl_loss_17": 227.77520217895508, "kl_loss_3": 3185.6382080078124, "kl_loss_6": 1979.2585815429688, "learning_rate": 0.0009977359612865424, "loss": 1547.6145, "step": 400 }, { "ce_loss_12": 3.353544569015503, "ce_loss_17": 3.1033652782440186, "ce_loss_23": 2.9658072710037233, "ce_loss_3": 4.566384196281433, "ce_loss_6": 3.953342044353485, "epoch": 0.041, "grad_norm": 708.0, "kl_loss_12": 749.1101745605469, "kl_loss_17": 260.4892097473145, "kl_loss_3": 3245.3873901367188, "kl_loss_6": 2035.0272338867187, "learning_rate": 0.0009975826335391806, "loss": 1560.9627, "step": 410 }, { "ce_loss_12": 3.3578636288642882, "ce_loss_17": 3.122957742214203, "ce_loss_23": 2.988275396823883, "ce_loss_3": 4.5664080619812015, "ce_loss_6": 3.9467087745666505, "epoch": 0.042, "grad_norm": 704.0, "kl_loss_12": 722.9946594238281, "kl_loss_17": 240.11188812255858, "kl_loss_3": 3212.6345458984374, "kl_loss_6": 1997.101141357422, "learning_rate": 0.0009974242951402235, "loss": 1549.8442, "step": 420 }, { "ce_loss_12": 3.370286190509796, "ce_loss_17": 3.122636592388153, "ce_loss_23": 2.9911253452301025, "ce_loss_3": 4.592541313171386, "ce_loss_6": 3.959289622306824, "epoch": 0.043, "grad_norm": 656.0, "kl_loss_12": 739.4854614257813, "kl_loss_17": 239.04573822021484, "kl_loss_3": 3266.663781738281, "kl_loss_6": 2007.7690063476562, "learning_rate": 0.0009972609476841367, "loss": 1543.5379, "step": 430 }, { "ce_loss_12": 3.2838348269462587, "ce_loss_17": 3.0468017935752867, "ce_loss_23": 2.9064618349075317, "ce_loss_3": 4.530054998397827, "ce_loss_6": 3.89483824968338, "epoch": 0.044, "grad_norm": 744.0, "kl_loss_12": 728.234994506836, "kl_loss_17": 247.46401443481446, "kl_loss_3": 3284.870471191406, "kl_loss_6": 2036.9016906738282, "learning_rate": 0.0009970925928158272, "loss": 1577.4176, "step": 440 }, { "ce_loss_12": 3.2323830366134643, "ce_loss_17": 2.9942973256111145, "ce_loss_23": 2.8556951880455017, "ce_loss_3": 4.490892148017883, "ce_loss_6": 3.8565974712371824, "epoch": 0.045, "grad_norm": 620.0, "kl_loss_12": 736.3843292236328, "kl_loss_17": 255.33787460327147, "kl_loss_3": 3333.7939208984376, "kl_loss_6": 2070.1477600097655, "learning_rate": 0.000996919232230627, "loss": 1588.0778, "step": 450 }, { "ce_loss_12": 3.2914091110229493, "ce_loss_17": 3.0677689790725706, "ce_loss_23": 2.941704607009888, "ce_loss_3": 4.490966200828552, "ce_loss_6": 3.880615258216858, "epoch": 0.046, "grad_norm": 748.0, "kl_loss_12": 692.5785369873047, "kl_loss_17": 234.25014266967773, "kl_loss_3": 3176.8756591796873, "kl_loss_6": 1969.9073364257813, "learning_rate": 0.0009967408676742752, "loss": 1487.7033, "step": 460 }, { "ce_loss_12": 3.4376975774765013, "ce_loss_17": 3.204993689060211, "ce_loss_23": 3.0785672545433043, "ce_loss_3": 4.589035892486573, "ce_loss_6": 3.9928988814353943, "epoch": 0.047, "grad_norm": 836.0, "kl_loss_12": 709.9034606933594, "kl_loss_17": 228.83046264648436, "kl_loss_3": 3101.909216308594, "kl_loss_6": 1918.1236389160156, "learning_rate": 0.0009965575009429006, "loss": 1532.509, "step": 470 }, { "ce_loss_12": 3.2385229229927064, "ce_loss_17": 2.984599435329437, "ce_loss_23": 2.8635784864425657, "ce_loss_3": 4.449902439117432, "ce_loss_6": 3.831362020969391, "epoch": 0.048, "grad_norm": 724.0, "kl_loss_12": 722.7018585205078, "kl_loss_17": 220.34202346801757, "kl_loss_3": 3245.0903442382814, "kl_loss_6": 2010.9374084472656, "learning_rate": 0.0009963691338830043, "loss": 1531.1553, "step": 480 }, { "ce_loss_12": 3.305763602256775, "ce_loss_17": 3.0736878156661986, "ce_loss_23": 2.957307231426239, "ce_loss_3": 4.49764199256897, "ce_loss_6": 3.8927690625190734, "epoch": 0.049, "grad_norm": 844.0, "kl_loss_12": 691.3308135986329, "kl_loss_17": 214.79136962890624, "kl_loss_3": 3176.404748535156, "kl_loss_6": 1954.2135070800782, "learning_rate": 0.0009961757683914405, "loss": 1495.7828, "step": 490 }, { "ce_loss_12": 3.3125686287879943, "ce_loss_17": 3.0608574748039246, "ce_loss_23": 2.941720414161682, "ce_loss_3": 4.449414944648742, "ce_loss_6": 3.8704373121261595, "epoch": 0.05, "grad_norm": 724.0, "kl_loss_12": 713.7994873046875, "kl_loss_17": 219.01981506347656, "kl_loss_3": 3097.2513061523437, "kl_loss_6": 1938.0861755371093, "learning_rate": 0.0009959774064153978, "loss": 1505.5656, "step": 500 }, { "ce_loss_12": 3.299354112148285, "ce_loss_17": 3.0693908810615538, "ce_loss_23": 2.959504556655884, "ce_loss_3": 4.431204664707184, "ce_loss_6": 3.84701007604599, "epoch": 0.051, "grad_norm": 624.0, "kl_loss_12": 670.3522155761718, "kl_loss_17": 201.11935424804688, "kl_loss_3": 3017.0482177734375, "kl_loss_6": 1854.9740051269532, "learning_rate": 0.0009957740499523787, "loss": 1465.6556, "step": 510 }, { "ce_loss_12": 3.3268982768058777, "ce_loss_17": 3.08267103433609, "ce_loss_23": 2.9721622109413146, "ce_loss_3": 4.457084774971008, "ce_loss_6": 3.8652746081352234, "epoch": 0.052, "grad_norm": 696.0, "kl_loss_12": 694.7547119140625, "kl_loss_17": 200.21757049560546, "kl_loss_3": 3047.4221069335936, "kl_loss_6": 1886.7686340332032, "learning_rate": 0.0009955657010501807, "loss": 1464.9039, "step": 520 }, { "ce_loss_12": 3.2911906957626345, "ce_loss_17": 3.047533321380615, "ce_loss_23": 2.930856394767761, "ce_loss_3": 4.451212811470032, "ce_loss_6": 3.8664758801460266, "epoch": 0.053, "grad_norm": 896.0, "kl_loss_12": 701.0859619140625, "kl_loss_17": 205.53631591796875, "kl_loss_3": 3102.8578247070313, "kl_loss_6": 1935.0799438476563, "learning_rate": 0.000995352361806875, "loss": 1471.9518, "step": 530 }, { "ce_loss_12": 3.3293033957481386, "ce_loss_17": 3.0855902075767516, "ce_loss_23": 2.9691104531288146, "ce_loss_3": 4.465985608100891, "ce_loss_6": 3.877950441837311, "epoch": 0.054, "grad_norm": 768.0, "kl_loss_12": 701.2718719482422, "kl_loss_17": 213.6372833251953, "kl_loss_3": 3087.834912109375, "kl_loss_6": 1914.8329772949219, "learning_rate": 0.0009951340343707852, "loss": 1498.7925, "step": 540 }, { "ce_loss_12": 3.3613967180252073, "ce_loss_17": 3.127401924133301, "ce_loss_23": 3.0171509265899656, "ce_loss_3": 4.524386692047119, "ce_loss_6": 3.936571645736694, "epoch": 0.055, "grad_norm": 604.0, "kl_loss_12": 668.1853912353515, "kl_loss_17": 200.08008728027343, "kl_loss_3": 3067.0293701171877, "kl_loss_6": 1924.4457397460938, "learning_rate": 0.0009949107209404665, "loss": 1476.3092, "step": 550 }, { "ce_loss_12": 3.27703515291214, "ce_loss_17": 3.0475626826286315, "ce_loss_23": 2.936045062541962, "ce_loss_3": 4.412865042686462, "ce_loss_6": 3.847214663028717, "epoch": 0.056, "grad_norm": 640.0, "kl_loss_12": 660.5824188232422, "kl_loss_17": 194.43747024536134, "kl_loss_3": 3025.404443359375, "kl_loss_6": 1895.8758911132813, "learning_rate": 0.0009946824237646824, "loss": 1449.3217, "step": 560 }, { "ce_loss_12": 3.2381665825843813, "ce_loss_17": 3.0001315474510193, "ce_loss_23": 2.8903649926185606, "ce_loss_3": 4.40073938369751, "ce_loss_6": 3.808553636074066, "epoch": 0.057, "grad_norm": 724.0, "kl_loss_12": 684.8827117919922, "kl_loss_17": 200.08551101684571, "kl_loss_3": 3098.5067626953123, "kl_loss_6": 1913.1522094726563, "learning_rate": 0.0009944491451423828, "loss": 1496.888, "step": 570 }, { "ce_loss_12": 3.2463485479354857, "ce_loss_17": 2.9977986097335814, "ce_loss_23": 2.8845686078071595, "ce_loss_3": 4.425404500961304, "ce_loss_6": 3.823835778236389, "epoch": 0.058, "grad_norm": 828.0, "kl_loss_12": 700.3368286132812, "kl_loss_17": 203.70085220336915, "kl_loss_3": 3160.6127807617186, "kl_loss_6": 1947.4135192871095, "learning_rate": 0.0009942108874226813, "loss": 1473.7351, "step": 580 }, { "ce_loss_12": 3.329897570610046, "ce_loss_17": 3.096381163597107, "ce_loss_23": 2.9894603371620176, "ce_loss_3": 4.442056608200073, "ce_loss_6": 3.874366784095764, "epoch": 0.059, "grad_norm": 784.0, "kl_loss_12": 667.4164886474609, "kl_loss_17": 197.30725631713867, "kl_loss_3": 2966.1633422851564, "kl_loss_6": 1833.4690368652343, "learning_rate": 0.00099396765300483, "loss": 1414.7325, "step": 590 }, { "ce_loss_12": 3.3210636496543886, "ce_loss_17": 3.081927680969238, "ce_loss_23": 2.971800923347473, "ce_loss_3": 4.430813646316528, "ce_loss_6": 3.8526097893714906, "epoch": 0.06, "grad_norm": 912.0, "kl_loss_12": 680.6019836425781, "kl_loss_17": 197.30396728515626, "kl_loss_3": 2995.541357421875, "kl_loss_6": 1854.753192138672, "learning_rate": 0.0009937194443381972, "loss": 1433.0809, "step": 600 }, { "ce_loss_12": 3.3337490797042846, "ce_loss_17": 3.1106279253959657, "ce_loss_23": 3.005161905288696, "ce_loss_3": 4.419551348686218, "ce_loss_6": 3.858152377605438, "epoch": 0.061, "grad_norm": 672.0, "kl_loss_12": 648.7204620361329, "kl_loss_17": 196.67949371337892, "kl_loss_3": 2915.5722045898438, "kl_loss_6": 1795.9505493164063, "learning_rate": 0.0009934662639222412, "loss": 1425.4387, "step": 610 }, { "ce_loss_12": 3.2973732352256775, "ce_loss_17": 3.084644055366516, "ce_loss_23": 2.959382653236389, "ce_loss_3": 4.439628386497498, "ce_loss_6": 3.8573474287986755, "epoch": 0.062, "grad_norm": 828.0, "kl_loss_12": 668.3834808349609, "kl_loss_17": 226.08399047851563, "kl_loss_3": 3057.9033325195314, "kl_loss_6": 1894.1350402832031, "learning_rate": 0.000993208114306486, "loss": 1444.5806, "step": 620 }, { "ce_loss_12": 3.211961007118225, "ce_loss_17": 3.0096256613731383, "ce_loss_23": 2.88716698884964, "ce_loss_3": 4.374472832679748, "ce_loss_6": 3.7805599689483644, "epoch": 0.063, "grad_norm": 752.0, "kl_loss_12": 649.2993743896484, "kl_loss_17": 221.76539993286133, "kl_loss_3": 3058.0645141601562, "kl_loss_6": 1874.6616943359375, "learning_rate": 0.0009929449980904952, "loss": 1417.4091, "step": 630 }, { "ce_loss_12": 3.2723576068878173, "ce_loss_17": 3.0581412434577944, "ce_loss_23": 2.9464841961860655, "ce_loss_3": 4.395217108726501, "ce_loss_6": 3.811108076572418, "epoch": 0.064, "grad_norm": 808.0, "kl_loss_12": 639.2544738769532, "kl_loss_17": 227.6035385131836, "kl_loss_3": 3003.7718872070313, "kl_loss_6": 1840.5796264648438, "learning_rate": 0.0009926769179238466, "loss": 1412.6059, "step": 640 }, { "ce_loss_12": 3.3222896218299867, "ce_loss_17": 3.108461344242096, "ce_loss_23": 2.9796595931053163, "ce_loss_3": 4.43104920387268, "ce_loss_6": 3.864783561229706, "epoch": 0.065, "grad_norm": 716.0, "kl_loss_12": 667.6861999511718, "kl_loss_17": 230.67943572998047, "kl_loss_3": 3005.815673828125, "kl_loss_6": 1862.8342651367188, "learning_rate": 0.000992403876506104, "loss": 1427.4293, "step": 650 }, { "ce_loss_12": 3.2592855095863342, "ce_loss_17": 3.032492387294769, "ce_loss_23": 2.921385622024536, "ce_loss_3": 4.381369090080261, "ce_loss_6": 3.7872144103050234, "epoch": 0.066, "grad_norm": 688.0, "kl_loss_12": 671.0291412353515, "kl_loss_17": 206.38212890625, "kl_loss_3": 2998.441552734375, "kl_loss_6": 1809.7071350097656, "learning_rate": 0.0009921258765867918, "loss": 1421.205, "step": 660 }, { "ce_loss_12": 3.2370525360107423, "ce_loss_17": 3.00315066576004, "ce_loss_23": 2.895841455459595, "ce_loss_3": 4.379059171676635, "ce_loss_6": 3.783574569225311, "epoch": 0.067, "grad_norm": 832.0, "kl_loss_12": 679.0885528564453, "kl_loss_17": 194.57723999023438, "kl_loss_3": 3073.783825683594, "kl_loss_6": 1871.8182250976563, "learning_rate": 0.0009918429209653662, "loss": 1431.0873, "step": 670 }, { "ce_loss_12": 3.299939227104187, "ce_loss_17": 3.046256399154663, "ce_loss_23": 2.94521769285202, "ce_loss_3": 4.396662664413452, "ce_loss_6": 3.8042745709419252, "epoch": 0.068, "grad_norm": 700.0, "kl_loss_12": 693.7759826660156, "kl_loss_17": 188.6014419555664, "kl_loss_3": 3028.060095214844, "kl_loss_6": 1840.6216918945313, "learning_rate": 0.0009915550124911866, "loss": 1401.4508, "step": 680 }, { "ce_loss_12": 3.28766872882843, "ce_loss_17": 3.0496862173080443, "ce_loss_23": 2.9469610452651978, "ce_loss_3": 4.373085260391235, "ce_loss_6": 3.796851623058319, "epoch": 0.069, "grad_norm": 808.0, "kl_loss_12": 666.584228515625, "kl_loss_17": 185.1880355834961, "kl_loss_3": 2915.1931274414064, "kl_loss_6": 1776.2470642089843, "learning_rate": 0.0009912621540634887, "loss": 1391.1077, "step": 690 }, { "ce_loss_12": 3.3050572991371157, "ce_loss_17": 3.0859864115715028, "ce_loss_23": 2.9898800373077394, "ce_loss_3": 4.364710736274719, "ce_loss_6": 3.7870045065879823, "epoch": 0.07, "grad_norm": 920.0, "kl_loss_12": 627.817578125, "kl_loss_17": 173.69211196899414, "kl_loss_3": 2867.003369140625, "kl_loss_6": 1710.6666137695313, "learning_rate": 0.0009909643486313534, "loss": 1356.2304, "step": 700 }, { "ce_loss_12": 3.207731914520264, "ce_loss_17": 2.9801932334899903, "ce_loss_23": 2.8819775700569155, "ce_loss_3": 4.3406788349151615, "ce_loss_6": 3.742799186706543, "epoch": 0.071, "grad_norm": 744.0, "kl_loss_12": 636.164306640625, "kl_loss_17": 175.40868377685547, "kl_loss_3": 3009.928515625, "kl_loss_6": 1818.6133911132813, "learning_rate": 0.000990661599193678, "loss": 1430.1488, "step": 710 }, { "ce_loss_12": 3.2974260449409485, "ce_loss_17": 3.0879198789596556, "ce_loss_23": 2.991551196575165, "ce_loss_3": 4.391224360466003, "ce_loss_6": 3.8298116087913514, "epoch": 0.072, "grad_norm": 936.0, "kl_loss_12": 606.6379913330078, "kl_loss_17": 175.6688980102539, "kl_loss_3": 2895.5029418945314, "kl_loss_6": 1770.1393676757812, "learning_rate": 0.0009903539087991462, "loss": 1362.8802, "step": 720 }, { "ce_loss_12": 3.277816581726074, "ce_loss_17": 3.0691084623336793, "ce_loss_23": 2.974670672416687, "ce_loss_3": 4.373213529586792, "ce_loss_6": 3.8061108589172363, "epoch": 0.073, "grad_norm": 688.0, "kl_loss_12": 607.6536895751954, "kl_loss_17": 177.0436553955078, "kl_loss_3": 2902.8962158203126, "kl_loss_6": 1772.4432495117187, "learning_rate": 0.0009900412805461966, "loss": 1368.357, "step": 730 }, { "ce_loss_12": 3.3488402724266053, "ce_loss_17": 3.1375231623649595, "ce_loss_23": 3.03947719335556, "ce_loss_3": 4.412138867378235, "ce_loss_6": 3.8542447447776795, "epoch": 0.074, "grad_norm": 732.0, "kl_loss_12": 595.930207824707, "kl_loss_17": 170.8621597290039, "kl_loss_3": 2857.8486572265624, "kl_loss_6": 1720.0050415039063, "learning_rate": 0.0009897237175829927, "loss": 1358.9611, "step": 740 }, { "ce_loss_12": 3.242159843444824, "ce_loss_17": 3.0247748136520385, "ce_loss_23": 2.9276575922966, "ce_loss_3": 4.355337357521057, "ce_loss_6": 3.778389298915863, "epoch": 0.075, "grad_norm": 788.0, "kl_loss_12": 619.531216430664, "kl_loss_17": 176.01265029907228, "kl_loss_3": 2965.2418701171873, "kl_loss_6": 1809.105938720703, "learning_rate": 0.0009894012231073895, "loss": 1378.8805, "step": 750 }, { "ce_loss_12": 3.2793312668800354, "ce_loss_17": 3.071231472492218, "ce_loss_23": 2.9780132591724398, "ce_loss_3": 4.380762314796447, "ce_loss_6": 3.808058273792267, "epoch": 0.076, "grad_norm": 880.0, "kl_loss_12": 596.9268798828125, "kl_loss_17": 170.39969711303712, "kl_loss_3": 2894.7660400390623, "kl_loss_6": 1755.1642150878906, "learning_rate": 0.0009890738003669028, "loss": 1372.3736, "step": 760 }, { "ce_loss_12": 3.2576115489006043, "ce_loss_17": 3.042385685443878, "ce_loss_23": 2.9484377264976502, "ce_loss_3": 4.388651013374329, "ce_loss_6": 3.7962119698524477, "epoch": 0.077, "grad_norm": 1760.0, "kl_loss_12": 609.9247650146484, "kl_loss_17": 170.56164016723633, "kl_loss_3": 2988.757531738281, "kl_loss_6": 1815.9149169921875, "learning_rate": 0.0009887414526586764, "loss": 1355.8361, "step": 770 }, { "ce_loss_12": 3.298181438446045, "ce_loss_17": 3.0905930161476136, "ce_loss_23": 2.998029613494873, "ce_loss_3": 4.381036710739136, "ce_loss_6": 3.8145140647888183, "epoch": 0.078, "grad_norm": 760.0, "kl_loss_12": 582.8910247802735, "kl_loss_17": 163.08836669921874, "kl_loss_3": 2856.5729858398436, "kl_loss_6": 1727.3598388671876, "learning_rate": 0.0009884041833294476, "loss": 1319.3967, "step": 780 }, { "ce_loss_12": 3.3041907072067263, "ce_loss_17": 3.0935172557830812, "ce_loss_23": 3.0023789405822754, "ce_loss_3": 4.366575372219086, "ce_loss_6": 3.803239977359772, "epoch": 0.079, "grad_norm": 740.0, "kl_loss_12": 592.9966247558593, "kl_loss_17": 163.48230056762696, "kl_loss_3": 2841.9972534179688, "kl_loss_6": 1702.5080444335938, "learning_rate": 0.000988061995775515, "loss": 1359.5953, "step": 790 }, { "ce_loss_12": 3.2399455189704893, "ce_loss_17": 3.037798821926117, "ce_loss_23": 2.9372796416282654, "ce_loss_3": 4.303165769577026, "ce_loss_6": 3.750383937358856, "epoch": 0.08, "grad_norm": 836.0, "kl_loss_12": 608.0829467773438, "kl_loss_17": 181.16675491333007, "kl_loss_3": 2844.9753662109374, "kl_loss_6": 1732.1241088867187, "learning_rate": 0.0009877148934427035, "loss": 1341.4348, "step": 800 }, { "ce_loss_12": 3.2861321926116944, "ce_loss_17": 3.0791449308395387, "ce_loss_23": 2.976716470718384, "ce_loss_3": 4.362315344810486, "ce_loss_6": 3.7868611097335814, "epoch": 0.081, "grad_norm": 1080.0, "kl_loss_12": 604.6169494628906, "kl_loss_17": 182.59001693725585, "kl_loss_3": 2893.0469970703125, "kl_loss_6": 1735.8979614257812, "learning_rate": 0.0009873628798263297, "loss": 1338.7411, "step": 810 }, { "ce_loss_12": 3.2293206453323364, "ce_loss_17": 3.034642231464386, "ce_loss_23": 2.939870834350586, "ce_loss_3": 4.28888818025589, "ce_loss_6": 3.739218699932098, "epoch": 0.082, "grad_norm": 792.0, "kl_loss_12": 585.2830001831055, "kl_loss_17": 175.42455139160157, "kl_loss_3": 2814.965295410156, "kl_loss_6": 1707.1840454101562, "learning_rate": 0.0009870059584711668, "loss": 1357.3789, "step": 820 }, { "ce_loss_12": 3.2427746772766115, "ce_loss_17": 3.0388410449028016, "ce_loss_23": 2.9463847517967223, "ce_loss_3": 4.313164341449737, "ce_loss_6": 3.7601605892181396, "epoch": 0.083, "grad_norm": 784.0, "kl_loss_12": 577.6069213867188, "kl_loss_17": 166.45866165161132, "kl_loss_3": 2826.7182006835938, "kl_loss_6": 1720.5235717773437, "learning_rate": 0.000986644132971409, "loss": 1323.6652, "step": 830 }, { "ce_loss_12": 3.2438090324401854, "ce_loss_17": 3.0304963111877443, "ce_loss_23": 2.932339680194855, "ce_loss_3": 4.340520238876342, "ce_loss_6": 3.7706413149833677, "epoch": 0.084, "grad_norm": 852.0, "kl_loss_12": 600.9248870849609, "kl_loss_17": 173.94204864501953, "kl_loss_3": 2898.236535644531, "kl_loss_6": 1753.0972961425782, "learning_rate": 0.0009862774069706345, "loss": 1342.0187, "step": 840 }, { "ce_loss_12": 3.3441239953041078, "ce_loss_17": 3.1514394879341125, "ce_loss_23": 3.0594014644622805, "ce_loss_3": 4.38064968585968, "ce_loss_6": 3.8304377436637878, "epoch": 0.085, "grad_norm": 756.0, "kl_loss_12": 577.1782684326172, "kl_loss_17": 166.6529998779297, "kl_loss_3": 2765.473034667969, "kl_loss_6": 1668.087255859375, "learning_rate": 0.000985905784161771, "loss": 1310.3972, "step": 850 }, { "ce_loss_12": 3.2722967624664308, "ce_loss_17": 3.0797245502471924, "ce_loss_23": 2.988293194770813, "ce_loss_3": 4.321183085441589, "ce_loss_6": 3.767487609386444, "epoch": 0.086, "grad_norm": 688.0, "kl_loss_12": 565.3048660278321, "kl_loss_17": 163.8540298461914, "kl_loss_3": 2786.437707519531, "kl_loss_6": 1674.3542053222657, "learning_rate": 0.000985529268287055, "loss": 1295.72, "step": 860 }, { "ce_loss_12": 3.210437059402466, "ce_loss_17": 3.014290964603424, "ce_loss_23": 2.920425224304199, "ce_loss_3": 4.306206333637237, "ce_loss_6": 3.734451103210449, "epoch": 0.087, "grad_norm": 1020.0, "kl_loss_12": 582.5995712280273, "kl_loss_17": 169.39782028198243, "kl_loss_3": 2866.389929199219, "kl_loss_6": 1722.0148559570312, "learning_rate": 0.0009851478631379982, "loss": 1335.3591, "step": 870 }, { "ce_loss_12": 3.2780421018600463, "ce_loss_17": 3.0773184299468994, "ce_loss_23": 2.981848442554474, "ce_loss_3": 4.3347776412963865, "ce_loss_6": 3.7697909474372864, "epoch": 0.088, "grad_norm": 880.0, "kl_loss_12": 581.4047927856445, "kl_loss_17": 169.22406311035155, "kl_loss_3": 2813.3952758789064, "kl_loss_6": 1681.3613159179688, "learning_rate": 0.0009847615725553456, "loss": 1317.7922, "step": 880 }, { "ce_loss_12": 3.308706593513489, "ce_loss_17": 3.1193915247917174, "ce_loss_23": 3.0320719122886657, "ce_loss_3": 4.325762486457824, "ce_loss_6": 3.7881689071655273, "epoch": 0.089, "grad_norm": 728.0, "kl_loss_12": 551.782502746582, "kl_loss_17": 155.5468780517578, "kl_loss_3": 2698.84833984375, "kl_loss_6": 1623.6959045410156, "learning_rate": 0.0009843704004290394, "loss": 1306.5696, "step": 890 }, { "ce_loss_12": 3.2304977893829347, "ce_loss_17": 3.029300880432129, "ce_loss_23": 2.9373438477516176, "ce_loss_3": 4.290966820716858, "ce_loss_6": 3.7355265498161314, "epoch": 0.09, "grad_norm": 652.0, "kl_loss_12": 590.5244323730469, "kl_loss_17": 170.3102066040039, "kl_loss_3": 2834.1925659179688, "kl_loss_6": 1716.8873474121094, "learning_rate": 0.0009839743506981783, "loss": 1320.3553, "step": 900 }, { "ce_loss_12": 3.172462725639343, "ce_loss_17": 2.96781644821167, "ce_loss_23": 2.8684237480163572, "ce_loss_3": 4.273809564113617, "ce_loss_6": 3.6973857641220094, "epoch": 0.091, "grad_norm": 808.0, "kl_loss_12": 602.5951446533203, "kl_loss_17": 176.92565536499023, "kl_loss_3": 2918.4544311523437, "kl_loss_6": 1768.0981994628905, "learning_rate": 0.0009835734273509786, "loss": 1341.5896, "step": 910 }, { "ce_loss_12": 3.2609114408493043, "ce_loss_17": 3.0531147003173826, "ce_loss_23": 2.9561579704284666, "ce_loss_3": 4.325370657444, "ce_loss_6": 3.763854217529297, "epoch": 0.092, "grad_norm": 856.0, "kl_loss_12": 580.0848983764648, "kl_loss_17": 170.60181579589843, "kl_loss_3": 2800.7559448242187, "kl_loss_6": 1674.3670715332032, "learning_rate": 0.0009831676344247342, "loss": 1309.1984, "step": 920 }, { "ce_loss_12": 3.25858074426651, "ce_loss_17": 3.066873002052307, "ce_loss_23": 2.98137309551239, "ce_loss_3": 4.282044243812561, "ce_loss_6": 3.7491400599479676, "epoch": 0.093, "grad_norm": 892.0, "kl_loss_12": 545.7637374877929, "kl_loss_17": 160.86735000610352, "kl_loss_3": 2735.9965576171876, "kl_loss_6": 1650.1052307128907, "learning_rate": 0.0009827569760057755, "loss": 1291.6329, "step": 930 }, { "ce_loss_12": 3.2043312907218935, "ce_loss_17": 2.993998634815216, "ce_loss_23": 2.8952491641044618, "ce_loss_3": 4.315342664718628, "ce_loss_6": 3.740140736103058, "epoch": 0.094, "grad_norm": 1000.0, "kl_loss_12": 590.2115966796875, "kl_loss_17": 173.1332695007324, "kl_loss_3": 2927.032568359375, "kl_loss_6": 1768.2796447753906, "learning_rate": 0.000982341456229428, "loss": 1320.8015, "step": 940 }, { "ce_loss_12": 3.2777076601982116, "ce_loss_17": 3.0761674523353575, "ce_loss_23": 2.986429727077484, "ce_loss_3": 4.346337795257568, "ce_loss_6": 3.786615800857544, "epoch": 0.095, "grad_norm": 748.0, "kl_loss_12": 579.9668441772461, "kl_loss_17": 164.05571441650392, "kl_loss_3": 2845.5396240234377, "kl_loss_6": 1712.9433532714843, "learning_rate": 0.000981921079279971, "loss": 1292.4729, "step": 950 }, { "ce_loss_12": 3.2670510172843934, "ce_loss_17": 3.0814021468162536, "ce_loss_23": 2.997356629371643, "ce_loss_3": 4.266339981555939, "ce_loss_6": 3.732987666130066, "epoch": 0.096, "grad_norm": 716.0, "kl_loss_12": 551.3224319458008, "kl_loss_17": 157.1545051574707, "kl_loss_3": 2686.10380859375, "kl_loss_6": 1609.8591064453126, "learning_rate": 0.0009814958493905962, "loss": 1267.0563, "step": 960 }, { "ce_loss_12": 3.243378257751465, "ce_loss_17": 3.0477044582366943, "ce_loss_23": 2.957053375244141, "ce_loss_3": 4.320212554931641, "ce_loss_6": 3.7490431308746337, "epoch": 0.097, "grad_norm": 772.0, "kl_loss_12": 571.2222473144532, "kl_loss_17": 167.3255302429199, "kl_loss_3": 2832.0980590820313, "kl_loss_6": 1694.2712707519531, "learning_rate": 0.0009810657708433637, "loss": 1335.8684, "step": 970 }, { "ce_loss_12": 3.314019334316254, "ce_loss_17": 3.123671591281891, "ce_loss_23": 3.034430181980133, "ce_loss_3": 4.311093711853028, "ce_loss_6": 3.77311372756958, "epoch": 0.098, "grad_norm": 820.0, "kl_loss_12": 552.7862976074218, "kl_loss_17": 168.19173049926758, "kl_loss_3": 2656.2674072265627, "kl_loss_6": 1591.3033081054687, "learning_rate": 0.0009806308479691594, "loss": 1254.4201, "step": 980 }, { "ce_loss_12": 3.3261463403701783, "ce_loss_17": 3.1423306703567504, "ce_loss_23": 3.032912051677704, "ce_loss_3": 4.357937359809876, "ce_loss_6": 3.80484082698822, "epoch": 0.099, "grad_norm": 804.0, "kl_loss_12": 580.5806060791016, "kl_loss_17": 184.8185920715332, "kl_loss_3": 2754.346044921875, "kl_loss_6": 1644.7821472167968, "learning_rate": 0.0009801910851476522, "loss": 1282.2547, "step": 990 }, { "ce_loss_12": 3.2512516736984254, "ce_loss_17": 3.0605135440826414, "ce_loss_23": 2.9637475967407227, "ce_loss_3": 4.329551267623901, "ce_loss_6": 3.743285083770752, "epoch": 0.1, "grad_norm": 748.0, "kl_loss_12": 582.9849090576172, "kl_loss_17": 174.53399505615235, "kl_loss_3": 2859.1509521484377, "kl_loss_6": 1687.5102966308593, "learning_rate": 0.0009797464868072487, "loss": 1294.724, "step": 1000 }, { "ce_loss_12": 3.2312548875808718, "ce_loss_17": 3.0431445121765135, "ce_loss_23": 2.9497431397438048, "ce_loss_3": 4.286968004703522, "ce_loss_6": 3.7301648378372194, "epoch": 0.101, "grad_norm": 780.0, "kl_loss_12": 578.5622329711914, "kl_loss_17": 171.74994735717775, "kl_loss_3": 2811.974072265625, "kl_loss_6": 1686.0221252441406, "learning_rate": 0.0009792970574250492, "loss": 1304.7269, "step": 1010 }, { "ce_loss_12": 3.2564730048179626, "ce_loss_17": 3.0567847013473513, "ce_loss_23": 2.968216300010681, "ce_loss_3": 4.288906526565552, "ce_loss_6": 3.7319556951522825, "epoch": 0.102, "grad_norm": 924.0, "kl_loss_12": 564.9530090332031, "kl_loss_17": 164.64181747436524, "kl_loss_3": 2745.9426513671874, "kl_loss_6": 1627.5541931152343, "learning_rate": 0.0009788428015268028, "loss": 1261.1484, "step": 1020 }, { "ce_loss_12": 3.261544132232666, "ce_loss_17": 3.06712828874588, "ce_loss_23": 2.9740715503692625, "ce_loss_3": 4.265662479400635, "ce_loss_6": 3.726946938037872, "epoch": 0.103, "grad_norm": 788.0, "kl_loss_12": 576.264567565918, "kl_loss_17": 166.91285705566406, "kl_loss_3": 2714.088330078125, "kl_loss_6": 1614.7607482910157, "learning_rate": 0.0009783837236868609, "loss": 1267.8549, "step": 1030 }, { "ce_loss_12": 3.2258948802948, "ce_loss_17": 3.0288470864295958, "ce_loss_23": 2.936104118824005, "ce_loss_3": 4.249889886379242, "ce_loss_6": 3.698464000225067, "epoch": 0.104, "grad_norm": 760.0, "kl_loss_12": 571.986279296875, "kl_loss_17": 165.31406631469727, "kl_loss_3": 2715.9388916015623, "kl_loss_6": 1618.140185546875, "learning_rate": 0.0009779198285281327, "loss": 1259.2237, "step": 1040 }, { "ce_loss_12": 3.2193004369735716, "ce_loss_17": 3.0228930950164794, "ce_loss_23": 2.9384063124656676, "ce_loss_3": 4.265962791442871, "ce_loss_6": 3.7088080763816835, "epoch": 0.105, "grad_norm": 752.0, "kl_loss_12": 558.9580703735352, "kl_loss_17": 154.56547775268555, "kl_loss_3": 2765.327038574219, "kl_loss_6": 1636.0435363769532, "learning_rate": 0.0009774511207220368, "loss": 1274.2527, "step": 1050 }, { "ce_loss_12": 3.2610196471214294, "ce_loss_17": 3.0684242367744448, "ce_loss_23": 2.978561782836914, "ce_loss_3": 4.308524203300476, "ce_loss_6": 3.7470775604248048, "epoch": 0.106, "grad_norm": 880.0, "kl_loss_12": 561.0937957763672, "kl_loss_17": 159.2811538696289, "kl_loss_3": 2772.8382202148437, "kl_loss_6": 1638.9705444335937, "learning_rate": 0.0009769776049884564, "loss": 1274.5569, "step": 1060 }, { "ce_loss_12": 3.1786951303482054, "ce_loss_17": 2.9803019642829893, "ce_loss_23": 2.8939341425895693, "ce_loss_3": 4.241933369636536, "ce_loss_6": 3.679634761810303, "epoch": 0.107, "grad_norm": 940.0, "kl_loss_12": 566.5046829223633, "kl_loss_17": 157.10613098144532, "kl_loss_3": 2836.979296875, "kl_loss_6": 1695.5368774414062, "learning_rate": 0.0009764992860956889, "loss": 1316.3168, "step": 1070 }, { "ce_loss_12": 3.296342170238495, "ce_loss_17": 3.116221308708191, "ce_loss_23": 3.0379060387611387, "ce_loss_3": 4.2762956619262695, "ce_loss_6": 3.746916079521179, "epoch": 0.108, "grad_norm": 800.0, "kl_loss_12": 527.0881530761719, "kl_loss_17": 147.49216232299804, "kl_loss_3": 2606.6685546875, "kl_loss_6": 1547.9802795410155, "learning_rate": 0.0009760161688604008, "loss": 1232.1656, "step": 1080 }, { "ce_loss_12": 3.315100681781769, "ce_loss_17": 3.1226739048957826, "ce_loss_23": 3.03648464679718, "ce_loss_3": 4.320034456253052, "ce_loss_6": 3.788403558731079, "epoch": 0.109, "grad_norm": 760.0, "kl_loss_12": 553.3834716796875, "kl_loss_17": 152.19630699157716, "kl_loss_3": 2681.25322265625, "kl_loss_6": 1610.068292236328, "learning_rate": 0.0009755282581475768, "loss": 1261.8289, "step": 1090 }, { "ce_loss_12": 3.3554514765739443, "ce_loss_17": 3.1595420360565187, "ce_loss_23": 3.0734885215759276, "ce_loss_3": 4.345818209648132, "ce_loss_6": 3.8123062372207643, "epoch": 0.11, "grad_norm": 744.0, "kl_loss_12": 568.856837463379, "kl_loss_17": 159.01618118286132, "kl_loss_3": 2657.6038818359375, "kl_loss_6": 1583.0754455566407, "learning_rate": 0.0009750355588704727, "loss": 1234.9309, "step": 1100 }, { "ce_loss_12": 3.2044736742973328, "ce_loss_17": 3.0105460405349733, "ce_loss_23": 2.9257432579994203, "ce_loss_3": 4.223796212673188, "ce_loss_6": 3.675619602203369, "epoch": 0.111, "grad_norm": 796.0, "kl_loss_12": 555.5249206542969, "kl_loss_17": 152.13012924194337, "kl_loss_3": 2700.157360839844, "kl_loss_6": 1599.4874572753906, "learning_rate": 0.0009745380759905647, "loss": 1272.1023, "step": 1110 }, { "ce_loss_12": 3.165294623374939, "ce_loss_17": 2.971400058269501, "ce_loss_23": 2.884852170944214, "ce_loss_3": 4.2120347142219545, "ce_loss_6": 3.659716248512268, "epoch": 0.112, "grad_norm": 768.0, "kl_loss_12": 552.1841415405273, "kl_loss_17": 155.7645294189453, "kl_loss_3": 2763.0667358398437, "kl_loss_6": 1642.3040100097655, "learning_rate": 0.0009740358145174998, "loss": 1311.7117, "step": 1120 }, { "ce_loss_12": 3.2912906646728515, "ce_loss_17": 3.1120285749435426, "ce_loss_23": 3.0224124908447267, "ce_loss_3": 4.2669930934906, "ce_loss_6": 3.743947458267212, "epoch": 0.113, "grad_norm": 724.0, "kl_loss_12": 537.3440704345703, "kl_loss_17": 165.69821243286134, "kl_loss_3": 2629.3704833984375, "kl_loss_6": 1580.140069580078, "learning_rate": 0.0009735287795090455, "loss": 1245.9326, "step": 1130 }, { "ce_loss_12": 3.199613857269287, "ce_loss_17": 3.0266300678253173, "ce_loss_23": 2.9283379316329956, "ce_loss_3": 4.228734850883484, "ce_loss_6": 3.688489925861359, "epoch": 0.114, "grad_norm": 852.0, "kl_loss_12": 534.8260620117187, "kl_loss_17": 182.36328735351563, "kl_loss_3": 2714.242395019531, "kl_loss_6": 1629.0295959472655, "learning_rate": 0.0009730169760710386, "loss": 1258.2001, "step": 1140 }, { "ce_loss_12": 3.261527454853058, "ce_loss_17": 3.089632534980774, "ce_loss_23": 2.9942903399467466, "ce_loss_3": 4.275948905944825, "ce_loss_6": 3.733642745018005, "epoch": 0.115, "grad_norm": 1032.0, "kl_loss_12": 527.5316513061523, "kl_loss_17": 171.56800003051757, "kl_loss_3": 2639.7792846679686, "kl_loss_6": 1569.7343627929688, "learning_rate": 0.0009725004093573342, "loss": 1245.8943, "step": 1150 }, { "ce_loss_12": 3.22711728811264, "ce_loss_17": 3.045984649658203, "ce_loss_23": 2.946725940704346, "ce_loss_3": 4.235481929779053, "ce_loss_6": 3.7019497632980345, "epoch": 0.116, "grad_norm": 792.0, "kl_loss_12": 534.4117095947265, "kl_loss_17": 173.247119140625, "kl_loss_3": 2655.5172485351563, "kl_loss_6": 1577.8178833007812, "learning_rate": 0.0009719790845697534, "loss": 1229.9186, "step": 1160 }, { "ce_loss_12": 3.1724898338317873, "ce_loss_17": 2.9979549288749694, "ce_loss_23": 2.912010908126831, "ce_loss_3": 4.142260408401489, "ce_loss_6": 3.6257847785949706, "epoch": 0.117, "grad_norm": 996.0, "kl_loss_12": 512.0660125732422, "kl_loss_17": 154.08805084228516, "kl_loss_3": 2589.680029296875, "kl_loss_6": 1544.665350341797, "learning_rate": 0.0009714530069580309, "loss": 1208.7752, "step": 1170 }, { "ce_loss_12": 3.2630844950675963, "ce_loss_17": 3.078612267971039, "ce_loss_23": 2.991488981246948, "ce_loss_3": 4.273677432537079, "ce_loss_6": 3.739665389060974, "epoch": 0.118, "grad_norm": 916.0, "kl_loss_12": 547.9355621337891, "kl_loss_17": 164.50029220581055, "kl_loss_3": 2683.0645141601562, "kl_loss_6": 1613.4322387695313, "learning_rate": 0.0009709221818197624, "loss": 1239.5195, "step": 1180 }, { "ce_loss_12": 3.306522285938263, "ce_loss_17": 3.119382691383362, "ce_loss_23": 3.0303967595100403, "ce_loss_3": 4.328462171554565, "ce_loss_6": 3.783062732219696, "epoch": 0.119, "grad_norm": 1360.0, "kl_loss_12": 539.4908432006836, "kl_loss_17": 154.80245361328124, "kl_loss_3": 2709.159716796875, "kl_loss_6": 1613.389471435547, "learning_rate": 0.0009703866145003512, "loss": 1249.8813, "step": 1190 }, { "ce_loss_12": 3.2699382424354555, "ce_loss_17": 3.0863998413085936, "ce_loss_23": 3.0056095004081724, "ce_loss_3": 4.255171620845795, "ce_loss_6": 3.7445420384407044, "epoch": 0.12, "grad_norm": 916.0, "kl_loss_12": 530.1376663208008, "kl_loss_17": 145.03771438598633, "kl_loss_3": 2644.3298095703126, "kl_loss_6": 1602.0392944335938, "learning_rate": 0.0009698463103929542, "loss": 1255.4442, "step": 1200 }, { "ce_loss_12": 3.2317134141921997, "ce_loss_17": 3.0455020546913145, "ce_loss_23": 2.96392343044281, "ce_loss_3": 4.257777070999145, "ce_loss_6": 3.7172797322273254, "epoch": 0.121, "grad_norm": 652.0, "kl_loss_12": 538.7726196289062, "kl_loss_17": 150.16256446838378, "kl_loss_3": 2692.4408935546876, "kl_loss_6": 1617.0947509765624, "learning_rate": 0.0009693012749384279, "loss": 1256.565, "step": 1210 }, { "ce_loss_12": 3.2393490433692933, "ce_loss_17": 3.055326855182648, "ce_loss_23": 2.9717833280563353, "ce_loss_3": 4.23775063753128, "ce_loss_6": 3.708564019203186, "epoch": 0.122, "grad_norm": 812.0, "kl_loss_12": 533.0684432983398, "kl_loss_17": 148.2549591064453, "kl_loss_3": 2648.3533569335937, "kl_loss_6": 1581.6532165527344, "learning_rate": 0.0009687515136252732, "loss": 1221.4145, "step": 1220 }, { "ce_loss_12": 3.204907751083374, "ce_loss_17": 3.0170387744903566, "ce_loss_23": 2.9366545915603637, "ce_loss_3": 4.259589290618896, "ce_loss_6": 3.697655403614044, "epoch": 0.123, "grad_norm": 836.0, "kl_loss_12": 535.4695510864258, "kl_loss_17": 145.64403610229493, "kl_loss_3": 2768.4408935546876, "kl_loss_6": 1633.621044921875, "learning_rate": 0.0009681970319895803, "loss": 1285.5104, "step": 1230 }, { "ce_loss_12": 3.283095097541809, "ce_loss_17": 3.0973464846611023, "ce_loss_23": 3.020170819759369, "ce_loss_3": 4.27948043346405, "ce_loss_6": 3.7493759989738464, "epoch": 0.124, "grad_norm": 724.0, "kl_loss_12": 532.69248046875, "kl_loss_17": 142.17120971679688, "kl_loss_3": 2643.1325561523436, "kl_loss_6": 1572.2886047363281, "learning_rate": 0.0009676378356149733, "loss": 1218.1225, "step": 1240 }, { "ce_loss_12": 3.2427470684051514, "ce_loss_17": 3.0617971658706664, "ce_loss_23": 2.986854386329651, "ce_loss_3": 4.217474734783172, "ce_loss_6": 3.691105401515961, "epoch": 0.125, "grad_norm": 1032.0, "kl_loss_12": 514.074739074707, "kl_loss_17": 137.4885368347168, "kl_loss_3": 2594.8048583984373, "kl_loss_6": 1531.531268310547, "learning_rate": 0.0009670739301325534, "loss": 1206.076, "step": 1250 }, { "ce_loss_12": 3.2147034883499144, "ce_loss_17": 3.027522599697113, "ce_loss_23": 2.9497153043746946, "ce_loss_3": 4.191295731067657, "ce_loss_6": 3.679839813709259, "epoch": 0.126, "grad_norm": 720.0, "kl_loss_12": 527.9758728027343, "kl_loss_17": 144.51774787902832, "kl_loss_3": 2607.5860107421877, "kl_loss_6": 1569.477960205078, "learning_rate": 0.0009665053212208426, "loss": 1229.3042, "step": 1260 }, { "ce_loss_12": 3.246464729309082, "ce_loss_17": 3.0629101634025573, "ce_loss_23": 2.9833993911743164, "ce_loss_3": 4.251738870143891, "ce_loss_6": 3.7133025169372558, "epoch": 0.127, "grad_norm": 864.0, "kl_loss_12": 533.1231460571289, "kl_loss_17": 148.98831024169922, "kl_loss_3": 2664.428918457031, "kl_loss_6": 1587.7116638183593, "learning_rate": 0.0009659320146057262, "loss": 1228.6906, "step": 1270 }, { "ce_loss_12": 3.2591238498687742, "ce_loss_17": 3.0797009110450744, "ce_loss_23": 2.998037600517273, "ce_loss_3": 4.238789916038513, "ce_loss_6": 3.7171790480613707, "epoch": 0.128, "grad_norm": 992.0, "kl_loss_12": 518.5297225952148, "kl_loss_17": 148.35939559936523, "kl_loss_3": 2606.920068359375, "kl_loss_6": 1548.8865661621094, "learning_rate": 0.0009653540160603955, "loss": 1210.3549, "step": 1280 }, { "ce_loss_12": 3.2535346031188963, "ce_loss_17": 3.084386110305786, "ce_loss_23": 3.00300794839859, "ce_loss_3": 4.23352427482605, "ce_loss_6": 3.7167670369148254, "epoch": 0.129, "grad_norm": 852.0, "kl_loss_12": 512.0488311767579, "kl_loss_17": 148.47010040283203, "kl_loss_3": 2611.6455810546877, "kl_loss_6": 1550.3183898925781, "learning_rate": 0.0009647713314052896, "loss": 1197.2949, "step": 1290 }, { "ce_loss_12": 3.2205206751823425, "ce_loss_17": 3.0326789259910583, "ce_loss_23": 2.9471697568893434, "ce_loss_3": 4.25264720916748, "ce_loss_6": 3.7139246821403504, "epoch": 0.13, "grad_norm": 768.0, "kl_loss_12": 535.0598526000977, "kl_loss_17": 152.8258155822754, "kl_loss_3": 2725.5468872070314, "kl_loss_6": 1630.4482482910157, "learning_rate": 0.0009641839665080363, "loss": 1248.7602, "step": 1300 }, { "ce_loss_12": 3.184041130542755, "ce_loss_17": 3.0028507471084596, "ce_loss_23": 2.921945583820343, "ce_loss_3": 4.199795937538147, "ce_loss_6": 3.655711257457733, "epoch": 0.131, "grad_norm": 956.0, "kl_loss_12": 509.8348114013672, "kl_loss_17": 144.7429271697998, "kl_loss_3": 2653.9624633789062, "kl_loss_6": 1567.1138244628905, "learning_rate": 0.0009635919272833937, "loss": 1206.475, "step": 1310 }, { "ce_loss_12": 3.212551248073578, "ce_loss_17": 3.026778721809387, "ce_loss_23": 2.946667730808258, "ce_loss_3": 4.211032819747925, "ce_loss_6": 3.672435760498047, "epoch": 0.132, "grad_norm": 852.0, "kl_loss_12": 520.5912582397461, "kl_loss_17": 145.87120895385743, "kl_loss_3": 2639.656628417969, "kl_loss_6": 1553.8392272949218, "learning_rate": 0.0009629952196931902, "loss": 1193.8047, "step": 1320 }, { "ce_loss_12": 3.186399018764496, "ce_loss_17": 3.0172454714775085, "ce_loss_23": 2.9395448088645937, "ce_loss_3": 4.195619773864746, "ce_loss_6": 3.6446786284446717, "epoch": 0.133, "grad_norm": 952.0, "kl_loss_12": 501.36443481445315, "kl_loss_17": 144.396586227417, "kl_loss_3": 2638.032116699219, "kl_loss_6": 1531.3646911621095, "learning_rate": 0.0009623938497462645, "loss": 1198.4854, "step": 1330 }, { "ce_loss_12": 3.1858184933662415, "ce_loss_17": 3.0126922845840456, "ce_loss_23": 2.9323938727378844, "ce_loss_3": 4.1889008522033695, "ce_loss_6": 3.6592318177223206, "epoch": 0.134, "grad_norm": 924.0, "kl_loss_12": 511.55785064697267, "kl_loss_17": 145.21801109313964, "kl_loss_3": 2631.326708984375, "kl_loss_6": 1557.732861328125, "learning_rate": 0.0009617878234984055, "loss": 1217.4046, "step": 1340 }, { "ce_loss_12": 3.2636426448822022, "ce_loss_17": 3.0926445841789247, "ce_loss_23": 3.0167927503585816, "ce_loss_3": 4.219664669036865, "ce_loss_6": 3.698105537891388, "epoch": 0.135, "grad_norm": 916.0, "kl_loss_12": 497.62325286865234, "kl_loss_17": 138.0944309234619, "kl_loss_3": 2526.13603515625, "kl_loss_6": 1484.2228942871093, "learning_rate": 0.0009611771470522907, "loss": 1180.7393, "step": 1350 }, { "ce_loss_12": 3.2092088937759398, "ce_loss_17": 3.0275454759597777, "ce_loss_23": 2.9461109161376955, "ce_loss_3": 4.19175910949707, "ce_loss_6": 3.6639052033424377, "epoch": 0.136, "grad_norm": 856.0, "kl_loss_12": 504.0848876953125, "kl_loss_17": 143.1431613922119, "kl_loss_3": 2579.740380859375, "kl_loss_6": 1511.1883422851563, "learning_rate": 0.0009605618265574251, "loss": 1178.6502, "step": 1360 }, { "ce_loss_12": 3.182911384105682, "ce_loss_17": 3.001040053367615, "ce_loss_23": 2.9205902814865112, "ce_loss_3": 4.194320964813232, "ce_loss_6": 3.6565115213394166, "epoch": 0.137, "grad_norm": 928.0, "kl_loss_12": 532.6784011840821, "kl_loss_17": 147.71124191284179, "kl_loss_3": 2669.027685546875, "kl_loss_6": 1589.4699401855469, "learning_rate": 0.0009599418682100792, "loss": 1215.4029, "step": 1370 }, { "ce_loss_12": 3.2150518894195557, "ce_loss_17": 3.035642147064209, "ce_loss_23": 2.9590429425239564, "ce_loss_3": 4.2062974095344545, "ce_loss_6": 3.6585775136947634, "epoch": 0.138, "grad_norm": 888.0, "kl_loss_12": 514.3810455322266, "kl_loss_17": 138.8052433013916, "kl_loss_3": 2626.3319091796875, "kl_loss_6": 1532.4898864746094, "learning_rate": 0.0009593172782532268, "loss": 1202.498, "step": 1380 }, { "ce_loss_12": 3.2492050409317015, "ce_loss_17": 3.0701080203056335, "ce_loss_23": 2.9916153073310854, "ce_loss_3": 4.2146389245986935, "ce_loss_6": 3.693696665763855, "epoch": 0.139, "grad_norm": 792.0, "kl_loss_12": 509.50305023193357, "kl_loss_17": 140.96539154052735, "kl_loss_3": 2552.8525512695314, "kl_loss_6": 1512.970068359375, "learning_rate": 0.0009586880629764817, "loss": 1181.7294, "step": 1390 }, { "ce_loss_12": 3.1832247495651247, "ce_loss_17": 3.0081828832626343, "ce_loss_23": 2.9266636252403258, "ce_loss_3": 4.176916980743409, "ce_loss_6": 3.6512787103652955, "epoch": 0.14, "grad_norm": 772.0, "kl_loss_12": 509.731379699707, "kl_loss_17": 143.64842796325684, "kl_loss_3": 2588.364465332031, "kl_loss_6": 1531.91630859375, "learning_rate": 0.0009580542287160348, "loss": 1176.7396, "step": 1400 }, { "ce_loss_12": 3.144865083694458, "ce_loss_17": 2.97108371257782, "ce_loss_23": 2.891060936450958, "ce_loss_3": 4.136532425880432, "ce_loss_6": 3.6031069278717043, "epoch": 0.141, "grad_norm": 844.0, "kl_loss_12": 505.5958480834961, "kl_loss_17": 146.13398323059081, "kl_loss_3": 2601.420520019531, "kl_loss_6": 1520.9350463867188, "learning_rate": 0.0009574157818545901, "loss": 1178.8346, "step": 1410 }, { "ce_loss_12": 3.209973669052124, "ce_loss_17": 3.0407453656196592, "ce_loss_23": 2.963426387310028, "ce_loss_3": 4.165764462947846, "ce_loss_6": 3.6472143650054933, "epoch": 0.142, "grad_norm": 908.0, "kl_loss_12": 491.97606353759767, "kl_loss_17": 135.57668418884276, "kl_loss_3": 2528.8818115234376, "kl_loss_6": 1487.7853210449218, "learning_rate": 0.0009567727288213005, "loss": 1190.5035, "step": 1420 }, { "ce_loss_12": 3.197652304172516, "ce_loss_17": 3.0179499268531798, "ce_loss_23": 2.936987841129303, "ce_loss_3": 4.188513946533203, "ce_loss_6": 3.655670428276062, "epoch": 0.143, "grad_norm": 812.0, "kl_loss_12": 511.9153167724609, "kl_loss_17": 145.0369529724121, "kl_loss_3": 2610.760852050781, "kl_loss_6": 1542.4522644042968, "learning_rate": 0.0009561250760917027, "loss": 1185.8725, "step": 1430 }, { "ce_loss_12": 3.207899260520935, "ce_loss_17": 3.0319709539413453, "ce_loss_23": 2.9539456725120545, "ce_loss_3": 4.179163765907288, "ce_loss_6": 3.656933867931366, "epoch": 0.144, "grad_norm": 1064.0, "kl_loss_12": 517.2545806884766, "kl_loss_17": 146.9861888885498, "kl_loss_3": 2600.315710449219, "kl_loss_6": 1545.5985473632813, "learning_rate": 0.0009554728301876525, "loss": 1171.7954, "step": 1440 }, { "ce_loss_12": 3.2509345412254333, "ce_loss_17": 3.0787034749984743, "ce_loss_23": 2.995332622528076, "ce_loss_3": 4.211844718456268, "ce_loss_6": 3.694181203842163, "epoch": 0.145, "grad_norm": 744.0, "kl_loss_12": 508.04298706054686, "kl_loss_17": 149.49794082641603, "kl_loss_3": 2553.605187988281, "kl_loss_6": 1520.3743835449218, "learning_rate": 0.0009548159976772592, "loss": 1214.258, "step": 1450 }, { "ce_loss_12": 3.202357566356659, "ce_loss_17": 3.030918502807617, "ce_loss_23": 2.9502019762992857, "ce_loss_3": 4.206415390968322, "ce_loss_6": 3.6663927674293517, "epoch": 0.146, "grad_norm": 1232.0, "kl_loss_12": 511.38385620117185, "kl_loss_17": 147.039298248291, "kl_loss_3": 2633.431530761719, "kl_loss_6": 1543.5936157226563, "learning_rate": 0.0009541545851748186, "loss": 1194.2137, "step": 1460 }, { "ce_loss_12": 3.083567941188812, "ce_loss_17": 2.910180139541626, "ce_loss_23": 2.829286205768585, "ce_loss_3": 4.110313820838928, "ce_loss_6": 3.5584028244018553, "epoch": 0.147, "grad_norm": 948.0, "kl_loss_12": 504.0245788574219, "kl_loss_17": 143.67748527526857, "kl_loss_3": 2656.521923828125, "kl_loss_6": 1554.0283996582032, "learning_rate": 0.0009534885993407473, "loss": 1201.5422, "step": 1470 }, { "ce_loss_12": 3.228755247592926, "ce_loss_17": 3.060680961608887, "ce_loss_23": 2.9807514429092405, "ce_loss_3": 4.232686340808868, "ce_loss_6": 3.706300401687622, "epoch": 0.148, "grad_norm": 852.0, "kl_loss_12": 493.7154907226562, "kl_loss_17": 142.14012565612794, "kl_loss_3": 2623.7803466796877, "kl_loss_6": 1554.9101318359376, "learning_rate": 0.0009528180468815154, "loss": 1201.7234, "step": 1480 }, { "ce_loss_12": 3.2848423957824706, "ce_loss_17": 3.110364580154419, "ce_loss_23": 3.0347512125968934, "ce_loss_3": 4.23952442407608, "ce_loss_6": 3.733375918865204, "epoch": 0.149, "grad_norm": 1004.0, "kl_loss_12": 506.6558837890625, "kl_loss_17": 142.2900062561035, "kl_loss_3": 2538.5910400390626, "kl_loss_6": 1513.7965209960937, "learning_rate": 0.0009521429345495787, "loss": 1177.942, "step": 1490 }, { "ce_loss_12": 3.254639232158661, "ce_loss_17": 3.0876608014106752, "ce_loss_23": 3.0133098363876343, "ce_loss_3": 4.207061076164246, "ce_loss_6": 3.694426679611206, "epoch": 0.15, "grad_norm": 900.0, "kl_loss_12": 485.9706497192383, "kl_loss_17": 134.57788467407227, "kl_loss_3": 2534.81689453125, "kl_loss_6": 1497.9059631347657, "learning_rate": 0.0009514632691433108, "loss": 1177.4586, "step": 1500 }, { "ce_loss_12": 3.2227351903915404, "ce_loss_17": 3.052746081352234, "ce_loss_23": 2.976072347164154, "ce_loss_3": 4.198372399806976, "ce_loss_6": 3.6763845801353456, "epoch": 0.151, "grad_norm": 904.0, "kl_loss_12": 504.3296157836914, "kl_loss_17": 141.9261100769043, "kl_loss_3": 2578.879296875, "kl_loss_6": 1531.7337280273437, "learning_rate": 0.0009507790575069346, "loss": 1180.6392, "step": 1510 }, { "ce_loss_12": 3.2037697315216063, "ce_loss_17": 3.018774151802063, "ce_loss_23": 2.937272822856903, "ce_loss_3": 4.196621096134185, "ce_loss_6": 3.666115713119507, "epoch": 0.152, "grad_norm": 768.0, "kl_loss_12": 513.6897033691406, "kl_loss_17": 142.41551475524903, "kl_loss_3": 2613.3070068359375, "kl_loss_6": 1544.65146484375, "learning_rate": 0.0009500903065304539, "loss": 1210.4954, "step": 1520 }, { "ce_loss_12": 3.2276593685150146, "ce_loss_17": 3.0604055762290954, "ce_loss_23": 2.9863309025764466, "ce_loss_3": 4.167424845695495, "ce_loss_6": 3.653580594062805, "epoch": 0.153, "grad_norm": 884.0, "kl_loss_12": 483.8401397705078, "kl_loss_17": 136.10087394714355, "kl_loss_3": 2484.0209106445313, "kl_loss_6": 1456.9175537109375, "learning_rate": 0.0009493970231495835, "loss": 1168.7832, "step": 1530 }, { "ce_loss_12": 3.1678776383399962, "ce_loss_17": 3.011521375179291, "ce_loss_23": 2.9375467181205748, "ce_loss_3": 4.114212560653686, "ce_loss_6": 3.602833318710327, "epoch": 0.154, "grad_norm": 908.0, "kl_loss_12": 477.70878448486326, "kl_loss_17": 139.93613510131837, "kl_loss_3": 2506.1269653320314, "kl_loss_6": 1474.152099609375, "learning_rate": 0.0009486992143456792, "loss": 1153.3016, "step": 1540 }, { "ce_loss_12": 3.2115009784698487, "ce_loss_17": 3.0346609592437743, "ce_loss_23": 2.94760525226593, "ce_loss_3": 4.2460575819015505, "ce_loss_6": 3.6979887962341307, "epoch": 0.155, "grad_norm": 860.0, "kl_loss_12": 519.8610046386718, "kl_loss_17": 157.5887466430664, "kl_loss_3": 2708.2066040039062, "kl_loss_6": 1593.1645080566407, "learning_rate": 0.0009479968871456679, "loss": 1207.1498, "step": 1550 }, { "ce_loss_12": 3.174054169654846, "ce_loss_17": 3.0042254090309144, "ce_loss_23": 2.9235622763633726, "ce_loss_3": 4.172900831699371, "ce_loss_6": 3.6299402713775635, "epoch": 0.156, "grad_norm": 952.0, "kl_loss_12": 495.34519653320314, "kl_loss_17": 147.48748016357422, "kl_loss_3": 2616.0154663085937, "kl_loss_6": 1526.7384216308594, "learning_rate": 0.0009472900486219768, "loss": 1173.4233, "step": 1560 }, { "ce_loss_12": 3.166046977043152, "ce_loss_17": 2.994255745410919, "ce_loss_23": 2.9190826416015625, "ce_loss_3": 4.122887992858887, "ce_loss_6": 3.6104302167892457, "epoch": 0.157, "grad_norm": 856.0, "kl_loss_12": 491.6877075195313, "kl_loss_17": 139.1889949798584, "kl_loss_3": 2547.5237548828127, "kl_loss_6": 1506.4313537597657, "learning_rate": 0.000946578705892462, "loss": 1174.5682, "step": 1570 }, { "ce_loss_12": 3.1944627285003664, "ce_loss_17": 3.0258451461791993, "ce_loss_23": 2.951625919342041, "ce_loss_3": 4.150630140304566, "ce_loss_6": 3.6336331605911254, "epoch": 0.158, "grad_norm": 964.0, "kl_loss_12": 479.3685333251953, "kl_loss_17": 133.69121551513672, "kl_loss_3": 2496.5006469726563, "kl_loss_6": 1450.4319641113282, "learning_rate": 0.0009458628661203367, "loss": 1163.8299, "step": 1580 }, { "ce_loss_12": 3.199857234954834, "ce_loss_17": 3.0304608583450316, "ce_loss_23": 2.9556132555007935, "ce_loss_3": 4.197483813762664, "ce_loss_6": 3.6625853896141054, "epoch": 0.159, "grad_norm": 1048.0, "kl_loss_12": 500.5685180664062, "kl_loss_17": 134.92954444885254, "kl_loss_3": 2611.937060546875, "kl_loss_6": 1533.4802795410155, "learning_rate": 0.0009451425365140996, "loss": 1155.3817, "step": 1590 }, { "ce_loss_12": 3.2707769751548765, "ce_loss_17": 3.097273528575897, "ce_loss_23": 3.021830976009369, "ce_loss_3": 4.201311850547791, "ce_loss_6": 3.6886289954185485, "epoch": 0.16, "grad_norm": 876.0, "kl_loss_12": 494.73425140380857, "kl_loss_17": 134.58801956176757, "kl_loss_3": 2474.3725708007814, "kl_loss_6": 1450.2538146972656, "learning_rate": 0.0009444177243274617, "loss": 1134.3242, "step": 1600 }, { "ce_loss_12": 3.145673596858978, "ce_loss_17": 2.964465343952179, "ce_loss_23": 2.886202132701874, "ce_loss_3": 4.129389917850494, "ce_loss_6": 3.5948860883712768, "epoch": 0.161, "grad_norm": 856.0, "kl_loss_12": 515.6139999389649, "kl_loss_17": 142.84942321777345, "kl_loss_3": 2606.3384521484377, "kl_loss_6": 1527.6505615234375, "learning_rate": 0.0009436884368592739, "loss": 1183.754, "step": 1610 }, { "ce_loss_12": 3.186392605304718, "ce_loss_17": 3.013194441795349, "ce_loss_23": 2.93712317943573, "ce_loss_3": 4.130860531330109, "ce_loss_6": 3.6174773812294005, "epoch": 0.162, "grad_norm": 872.0, "kl_loss_12": 492.7932693481445, "kl_loss_17": 142.2027275085449, "kl_loss_3": 2515.2781982421875, "kl_loss_6": 1469.6235412597657, "learning_rate": 0.0009429546814534529, "loss": 1176.3047, "step": 1620 }, { "ce_loss_12": 3.184974658489227, "ce_loss_17": 3.0268144249916076, "ce_loss_23": 2.9497244000434875, "ce_loss_3": 4.144134628772735, "ce_loss_6": 3.611461842060089, "epoch": 0.163, "grad_norm": 848.0, "kl_loss_12": 488.8755386352539, "kl_loss_17": 142.62328567504883, "kl_loss_3": 2515.289831542969, "kl_loss_6": 1463.6363830566406, "learning_rate": 0.0009422164654989072, "loss": 1137.5589, "step": 1630 }, { "ce_loss_12": 3.29748820066452, "ce_loss_17": 3.1312389373779297, "ce_loss_23": 3.053789258003235, "ce_loss_3": 4.228594875335693, "ce_loss_6": 3.7183935761451723, "epoch": 0.164, "grad_norm": 848.0, "kl_loss_12": 488.71146697998046, "kl_loss_17": 141.67050704956054, "kl_loss_3": 2480.7939575195314, "kl_loss_6": 1452.5811706542968, "learning_rate": 0.0009414737964294635, "loss": 1149.8963, "step": 1640 }, { "ce_loss_12": 3.2155895709991453, "ce_loss_17": 3.062555527687073, "ce_loss_23": 2.989735221862793, "ce_loss_3": 4.130454444885254, "ce_loss_6": 3.6353893160820006, "epoch": 0.165, "grad_norm": 828.0, "kl_loss_12": 461.0352813720703, "kl_loss_17": 136.16140022277833, "kl_loss_3": 2398.9843627929686, "kl_loss_6": 1397.462957763672, "learning_rate": 0.000940726681723791, "loss": 1135.6822, "step": 1650 }, { "ce_loss_12": 3.0923858523368835, "ce_loss_17": 2.9247390151023867, "ce_loss_23": 2.8481791257858275, "ce_loss_3": 4.089363622665405, "ce_loss_6": 3.5547229528427122, "epoch": 0.166, "grad_norm": 892.0, "kl_loss_12": 491.5706314086914, "kl_loss_17": 140.37573165893554, "kl_loss_3": 2631.875341796875, "kl_loss_6": 1530.0634033203125, "learning_rate": 0.0009399751289053266, "loss": 1147.4444, "step": 1660 }, { "ce_loss_12": 3.2702404379844667, "ce_loss_17": 3.105391335487366, "ce_loss_23": 3.0320743322372437, "ce_loss_3": 4.202740812301636, "ce_loss_6": 3.697451651096344, "epoch": 0.167, "grad_norm": 1032.0, "kl_loss_12": 474.04954681396487, "kl_loss_17": 133.29354972839354, "kl_loss_3": 2468.889294433594, "kl_loss_6": 1437.5356201171876, "learning_rate": 0.0009392191455421988, "loss": 1149.0023, "step": 1670 }, { "ce_loss_12": 3.2619558930397035, "ce_loss_17": 3.0974825978279115, "ce_loss_23": 3.0210219264030456, "ce_loss_3": 4.192395412921906, "ce_loss_6": 3.6811763405799867, "epoch": 0.168, "grad_norm": 796.0, "kl_loss_12": 493.43628845214846, "kl_loss_17": 139.0162052154541, "kl_loss_3": 2491.118200683594, "kl_loss_6": 1461.1446044921875, "learning_rate": 0.0009384587392471515, "loss": 1121.5728, "step": 1680 }, { "ce_loss_12": 3.237948274612427, "ce_loss_17": 3.0769503355026244, "ce_loss_23": 3.007404637336731, "ce_loss_3": 4.150859999656677, "ce_loss_6": 3.659588062763214, "epoch": 0.169, "grad_norm": 988.0, "kl_loss_12": 471.13790740966795, "kl_loss_17": 131.52924728393555, "kl_loss_3": 2426.4301513671876, "kl_loss_6": 1429.9769104003906, "learning_rate": 0.0009376939176774678, "loss": 1119.1035, "step": 1690 }, { "ce_loss_12": 3.221153450012207, "ce_loss_17": 3.0557522654533384, "ce_loss_23": 2.978453016281128, "ce_loss_3": 4.167772364616394, "ce_loss_6": 3.648601543903351, "epoch": 0.17, "grad_norm": 904.0, "kl_loss_12": 479.8827072143555, "kl_loss_17": 138.62282676696776, "kl_loss_3": 2484.6701049804688, "kl_loss_6": 1445.5995483398438, "learning_rate": 0.0009369246885348925, "loss": 1158.6926, "step": 1700 }, { "ce_loss_12": 3.211071527004242, "ce_loss_17": 3.0465246558189394, "ce_loss_23": 2.9701019644737245, "ce_loss_3": 4.180959832668305, "ce_loss_6": 3.659099280834198, "epoch": 0.171, "grad_norm": 960.0, "kl_loss_12": 484.2249557495117, "kl_loss_17": 137.59749946594238, "kl_loss_3": 2560.4348388671874, "kl_loss_6": 1507.995050048828, "learning_rate": 0.0009361510595655545, "loss": 1161.0425, "step": 1710 }, { "ce_loss_12": 3.175062394142151, "ce_loss_17": 3.0082626700401307, "ce_loss_23": 2.929028642177582, "ce_loss_3": 4.122467339038849, "ce_loss_6": 3.617999482154846, "epoch": 0.172, "grad_norm": 1056.0, "kl_loss_12": 486.29063415527344, "kl_loss_17": 137.6948272705078, "kl_loss_3": 2509.7924682617186, "kl_loss_6": 1487.8664428710938, "learning_rate": 0.0009353730385598887, "loss": 1146.3372, "step": 1720 }, { "ce_loss_12": 3.106047534942627, "ce_loss_17": 2.9398711800575255, "ce_loss_23": 2.868954026699066, "ce_loss_3": 4.094696950912476, "ce_loss_6": 3.555660057067871, "epoch": 0.173, "grad_norm": 1008.0, "kl_loss_12": 480.9398559570312, "kl_loss_17": 131.35497131347657, "kl_loss_3": 2565.51640625, "kl_loss_6": 1486.1780517578125, "learning_rate": 0.0009345906333525581, "loss": 1160.8581, "step": 1730 }, { "ce_loss_12": 3.145271563529968, "ce_loss_17": 2.98147029876709, "ce_loss_23": 2.9027063608169557, "ce_loss_3": 4.109023904800415, "ce_loss_6": 3.5946555733680725, "epoch": 0.174, "grad_norm": 1072.0, "kl_loss_12": 481.08248443603514, "kl_loss_17": 140.1177764892578, "kl_loss_3": 2530.2619873046874, "kl_loss_6": 1495.8839233398437, "learning_rate": 0.0009338038518223745, "loss": 1142.776, "step": 1740 }, { "ce_loss_12": 3.209577441215515, "ce_loss_17": 3.0408326625823974, "ce_loss_23": 2.9612845182418823, "ce_loss_3": 4.166906929016113, "ce_loss_6": 3.65167076587677, "epoch": 0.175, "grad_norm": 788.0, "kl_loss_12": 500.57960510253906, "kl_loss_17": 143.46262283325194, "kl_loss_3": 2559.1065063476562, "kl_loss_6": 1501.0489196777344, "learning_rate": 0.0009330127018922195, "loss": 1190.3428, "step": 1750 }, { "ce_loss_12": 3.1649666905403135, "ce_loss_17": 3.0007057666778563, "ce_loss_23": 2.928002047538757, "ce_loss_3": 4.12604067325592, "ce_loss_6": 3.60684871673584, "epoch": 0.176, "grad_norm": 908.0, "kl_loss_12": 485.37786254882815, "kl_loss_17": 137.02273483276366, "kl_loss_3": 2536.665869140625, "kl_loss_6": 1478.797186279297, "learning_rate": 0.0009322171915289634, "loss": 1156.0584, "step": 1760 }, { "ce_loss_12": 3.197109591960907, "ce_loss_17": 3.0425989389419557, "ce_loss_23": 2.9691351413726808, "ce_loss_3": 4.1132927775383, "ce_loss_6": 3.619373655319214, "epoch": 0.177, "grad_norm": 852.0, "kl_loss_12": 477.05029296875, "kl_loss_17": 136.0932559967041, "kl_loss_3": 2461.841271972656, "kl_loss_6": 1455.2650573730468, "learning_rate": 0.0009314173287433873, "loss": 1123.975, "step": 1770 }, { "ce_loss_12": 3.188371479511261, "ce_loss_17": 3.0140843629837035, "ce_loss_23": 2.940100073814392, "ce_loss_3": 4.126374185085297, "ce_loss_6": 3.613701510429382, "epoch": 0.178, "grad_norm": 996.0, "kl_loss_12": 499.4425277709961, "kl_loss_17": 136.04979972839357, "kl_loss_3": 2514.0261108398436, "kl_loss_6": 1475.0163818359374, "learning_rate": 0.0009306131215901003, "loss": 1132.1109, "step": 1780 }, { "ce_loss_12": 3.2126179456710817, "ce_loss_17": 3.0473007202148437, "ce_loss_23": 2.9747347474098205, "ce_loss_3": 4.1394585251808165, "ce_loss_6": 3.638213038444519, "epoch": 0.179, "grad_norm": 932.0, "kl_loss_12": 494.14884796142576, "kl_loss_17": 131.85014839172362, "kl_loss_3": 2461.104040527344, "kl_loss_6": 1451.3593811035157, "learning_rate": 0.0009298045781674596, "loss": 1116.6383, "step": 1790 }, { "ce_loss_12": 3.1938929080963137, "ce_loss_17": 3.0310275554656982, "ce_loss_23": 2.958340787887573, "ce_loss_3": 4.108129847049713, "ce_loss_6": 3.6047824025154114, "epoch": 0.18, "grad_norm": 916.0, "kl_loss_12": 475.7731002807617, "kl_loss_17": 129.48459243774414, "kl_loss_3": 2430.8353759765623, "kl_loss_6": 1416.467462158203, "learning_rate": 0.0009289917066174886, "loss": 1130.1948, "step": 1800 }, { "ce_loss_12": 3.1844147086143493, "ce_loss_17": 3.0295247793197633, "ce_loss_23": 2.9598302364349367, "ce_loss_3": 4.082317817211151, "ce_loss_6": 3.588042676448822, "epoch": 0.181, "grad_norm": 1008.0, "kl_loss_12": 455.54750061035156, "kl_loss_17": 125.22672424316406, "kl_loss_3": 2396.4680908203127, "kl_loss_6": 1387.836181640625, "learning_rate": 0.0009281745151257945, "loss": 1099.9523, "step": 1810 }, { "ce_loss_12": 3.2134677290916445, "ce_loss_17": 3.048130762577057, "ce_loss_23": 2.9771452069282534, "ce_loss_3": 4.141134595870971, "ce_loss_6": 3.630683124065399, "epoch": 0.182, "grad_norm": 856.0, "kl_loss_12": 470.7882507324219, "kl_loss_17": 128.9965015411377, "kl_loss_3": 2445.9111450195314, "kl_loss_6": 1418.3800537109375, "learning_rate": 0.0009273530119214868, "loss": 1128.6573, "step": 1820 }, { "ce_loss_12": 3.2906126141548158, "ce_loss_17": 3.1292755126953127, "ce_loss_23": 3.0643419146537783, "ce_loss_3": 4.215589344501495, "ce_loss_6": 3.7132033705711365, "epoch": 0.183, "grad_norm": 876.0, "kl_loss_12": 464.4059265136719, "kl_loss_17": 125.72085189819336, "kl_loss_3": 2454.9328735351564, "kl_loss_6": 1432.4993469238282, "learning_rate": 0.0009265272052770935, "loss": 1104.2849, "step": 1830 }, { "ce_loss_12": 3.1393822193145753, "ce_loss_17": 2.973209011554718, "ce_loss_23": 2.9009034633636475, "ce_loss_3": 4.102531969547272, "ce_loss_6": 3.575673007965088, "epoch": 0.184, "grad_norm": 1096.0, "kl_loss_12": 468.7420257568359, "kl_loss_17": 125.73902816772461, "kl_loss_3": 2499.4821166992188, "kl_loss_6": 1452.368017578125, "learning_rate": 0.0009256971035084784, "loss": 1135.5918, "step": 1840 }, { "ce_loss_12": 3.088402235507965, "ce_loss_17": 2.912094330787659, "ce_loss_23": 2.8388179183006286, "ce_loss_3": 4.067012321949005, "ce_loss_6": 3.541650187969208, "epoch": 0.185, "grad_norm": 868.0, "kl_loss_12": 492.3648712158203, "kl_loss_17": 129.6828712463379, "kl_loss_3": 2577.0054931640625, "kl_loss_6": 1514.3743774414063, "learning_rate": 0.0009248627149747573, "loss": 1154.1738, "step": 1850 }, { "ce_loss_12": 3.2601833939552307, "ce_loss_17": 3.0977912306785584, "ce_loss_23": 3.030880701541901, "ce_loss_3": 4.1803746342659, "ce_loss_6": 3.678359270095825, "epoch": 0.186, "grad_norm": 884.0, "kl_loss_12": 469.87674102783205, "kl_loss_17": 125.56081008911133, "kl_loss_3": 2448.435095214844, "kl_loss_6": 1429.397119140625, "learning_rate": 0.0009240240480782129, "loss": 1120.9773, "step": 1860 }, { "ce_loss_12": 3.169607436656952, "ce_loss_17": 3.004458963871002, "ce_loss_23": 2.9340516090393067, "ce_loss_3": 4.123180067539215, "ce_loss_6": 3.614936649799347, "epoch": 0.187, "grad_norm": 804.0, "kl_loss_12": 471.2584976196289, "kl_loss_17": 126.98157424926758, "kl_loss_3": 2493.7326477050783, "kl_loss_6": 1472.6423034667969, "learning_rate": 0.0009231811112642122, "loss": 1127.0547, "step": 1870 }, { "ce_loss_12": 3.2120662569999694, "ce_loss_17": 3.0467937111854555, "ce_loss_23": 2.978515994548798, "ce_loss_3": 4.115826559066773, "ce_loss_6": 3.6161744952201844, "epoch": 0.188, "grad_norm": 900.0, "kl_loss_12": 468.25464324951173, "kl_loss_17": 126.36864395141602, "kl_loss_3": 2423.8376586914064, "kl_loss_6": 1419.7891235351562, "learning_rate": 0.0009223339130211192, "loss": 1110.4283, "step": 1880 }, { "ce_loss_12": 3.0797258496284483, "ce_loss_17": 2.9191071927547454, "ce_loss_23": 2.8500725448131563, "ce_loss_3": 4.0366758584976195, "ce_loss_6": 3.508550250530243, "epoch": 0.189, "grad_norm": 800.0, "kl_loss_12": 460.5701538085938, "kl_loss_17": 122.64981727600097, "kl_loss_3": 2504.1417602539063, "kl_loss_6": 1442.12265625, "learning_rate": 0.0009214824618802108, "loss": 1126.9061, "step": 1890 }, { "ce_loss_12": 3.2441630601882934, "ce_loss_17": 3.0844648480415344, "ce_loss_23": 3.0122289299964904, "ce_loss_3": 4.170050525665284, "ce_loss_6": 3.666511356830597, "epoch": 0.19, "grad_norm": 1016.0, "kl_loss_12": 471.48859252929685, "kl_loss_17": 129.77956199645996, "kl_loss_3": 2428.87451171875, "kl_loss_6": 1423.535089111328, "learning_rate": 0.0009206267664155906, "loss": 1142.1124, "step": 1900 }, { "ce_loss_12": 3.181778848171234, "ce_loss_17": 3.0219834446907043, "ce_loss_23": 2.9443792939186095, "ce_loss_3": 4.116687369346619, "ce_loss_6": 3.6083747029304503, "epoch": 0.191, "grad_norm": 960.0, "kl_loss_12": 467.8783721923828, "kl_loss_17": 134.10535469055176, "kl_loss_3": 2458.371057128906, "kl_loss_6": 1428.5719116210937, "learning_rate": 0.0009197668352441024, "loss": 1127.5221, "step": 1910 }, { "ce_loss_12": 3.2185313224792482, "ce_loss_17": 3.059431421756744, "ce_loss_23": 2.990352964401245, "ce_loss_3": 4.132525777816772, "ce_loss_6": 3.630076193809509, "epoch": 0.192, "grad_norm": 1160.0, "kl_loss_12": 462.2439666748047, "kl_loss_17": 130.64373245239258, "kl_loss_3": 2414.4199462890624, "kl_loss_6": 1409.8724487304687, "learning_rate": 0.0009189026770252437, "loss": 1114.2314, "step": 1920 }, { "ce_loss_12": 3.2457741260528565, "ce_loss_17": 3.0870991349220276, "ce_loss_23": 3.0153459668159486, "ce_loss_3": 4.164921474456787, "ce_loss_6": 3.6658156156539916, "epoch": 0.193, "grad_norm": 1048.0, "kl_loss_12": 464.60990142822266, "kl_loss_17": 132.1713623046875, "kl_loss_3": 2423.4216552734374, "kl_loss_6": 1410.253729248047, "learning_rate": 0.000918034300461078, "loss": 1145.4147, "step": 1930 }, { "ce_loss_12": 3.2675069332122804, "ce_loss_17": 3.1138991475105287, "ce_loss_23": 3.0397408604621887, "ce_loss_3": 4.165933740139008, "ce_loss_6": 3.682149875164032, "epoch": 0.194, "grad_norm": 1400.0, "kl_loss_12": 462.9583801269531, "kl_loss_17": 134.64812736511232, "kl_loss_3": 2387.39130859375, "kl_loss_6": 1404.232928466797, "learning_rate": 0.0009171617142961477, "loss": 1102.495, "step": 1940 }, { "ce_loss_12": 3.232120132446289, "ce_loss_17": 3.074093961715698, "ce_loss_23": 3.004840636253357, "ce_loss_3": 4.141224360466003, "ce_loss_6": 3.6394020080566407, "epoch": 0.195, "grad_norm": 888.0, "kl_loss_12": 454.7131286621094, "kl_loss_17": 126.47698211669922, "kl_loss_3": 2401.7656982421877, "kl_loss_6": 1396.0390380859376, "learning_rate": 0.0009162849273173857, "loss": 1100.0949, "step": 1950 }, { "ce_loss_12": 3.176328718662262, "ce_loss_17": 3.018619990348816, "ce_loss_23": 2.9515690922737123, "ce_loss_3": 4.0918882369995115, "ce_loss_6": 3.5899390459060667, "epoch": 0.196, "grad_norm": 1176.0, "kl_loss_12": 454.46217041015626, "kl_loss_17": 123.6001823425293, "kl_loss_3": 2400.2062561035154, "kl_loss_6": 1389.1139221191406, "learning_rate": 0.0009154039483540273, "loss": 1104.5887, "step": 1960 }, { "ce_loss_12": 3.1507588386535645, "ce_loss_17": 2.997128760814667, "ce_loss_23": 2.92929230928421, "ce_loss_3": 4.0766734957695006, "ce_loss_6": 3.5648243904113768, "epoch": 0.197, "grad_norm": 1016.0, "kl_loss_12": 456.2421676635742, "kl_loss_17": 125.6520278930664, "kl_loss_3": 2431.879040527344, "kl_loss_6": 1407.830712890625, "learning_rate": 0.0009145187862775209, "loss": 1102.6521, "step": 1970 }, { "ce_loss_12": 3.1837910532951357, "ce_loss_17": 3.0291913986206054, "ce_loss_23": 2.9613188028335573, "ce_loss_3": 4.097797703742981, "ce_loss_6": 3.601459336280823, "epoch": 0.198, "grad_norm": 1168.0, "kl_loss_12": 460.4912338256836, "kl_loss_17": 123.57428398132325, "kl_loss_3": 2414.3682495117187, "kl_loss_6": 1414.0907043457032, "learning_rate": 0.0009136294500014386, "loss": 1100.5844, "step": 1980 }, { "ce_loss_12": 3.150496172904968, "ce_loss_17": 2.980268383026123, "ce_loss_23": 2.9135174036026, "ce_loss_3": 4.127268588542938, "ce_loss_6": 3.5919087529182434, "epoch": 0.199, "grad_norm": 1224.0, "kl_loss_12": 480.1836532592773, "kl_loss_17": 124.83656845092773, "kl_loss_3": 2533.9186401367188, "kl_loss_6": 1465.3629028320313, "learning_rate": 0.000912735948481387, "loss": 1143.2436, "step": 1990 }, { "ce_loss_12": 3.1754995465278624, "ce_loss_17": 3.009310555458069, "ce_loss_23": 2.9422292590141295, "ce_loss_3": 4.082965612411499, "ce_loss_6": 3.591770112514496, "epoch": 0.2, "grad_norm": 1056.0, "kl_loss_12": 480.6165740966797, "kl_loss_17": 125.33241271972656, "kl_loss_3": 2438.861468505859, "kl_loss_6": 1432.2340881347657, "learning_rate": 0.0009118382907149164, "loss": 1097.0717, "step": 2000 }, { "ce_loss_12": 3.196602463722229, "ce_loss_17": 3.0341225266456604, "ce_loss_23": 2.967613184452057, "ce_loss_3": 4.100062417984009, "ce_loss_6": 3.6124598026275634, "epoch": 0.201, "grad_norm": 844.0, "kl_loss_12": 474.27958526611326, "kl_loss_17": 124.60223999023438, "kl_loss_3": 2397.216931152344, "kl_loss_6": 1413.5718933105468, "learning_rate": 0.0009109364857414306, "loss": 1089.634, "step": 2010 }, { "ce_loss_12": 3.1681824326515198, "ce_loss_17": 3.0062640428543093, "ce_loss_23": 2.936304271221161, "ce_loss_3": 4.072107815742493, "ce_loss_6": 3.5693987131118776, "epoch": 0.202, "grad_norm": 1080.0, "kl_loss_12": 464.28491668701173, "kl_loss_17": 123.07480049133301, "kl_loss_3": 2408.75986328125, "kl_loss_6": 1396.0414733886719, "learning_rate": 0.0009100305426420956, "loss": 1123.4555, "step": 2020 }, { "ce_loss_12": 3.1313969492912292, "ce_loss_17": 2.9723105907440184, "ce_loss_23": 2.9064919352531433, "ce_loss_3": 4.095436239242554, "ce_loss_6": 3.5720829606056212, "epoch": 0.203, "grad_norm": 992.0, "kl_loss_12": 464.30756225585935, "kl_loss_17": 121.7504940032959, "kl_loss_3": 2525.1046875, "kl_loss_6": 1458.3286682128905, "learning_rate": 0.0009091204705397484, "loss": 1120.6155, "step": 2030 }, { "ce_loss_12": 3.115929698944092, "ce_loss_17": 2.956736671924591, "ce_loss_23": 2.887710380554199, "ce_loss_3": 4.086121809482575, "ce_loss_6": 3.560148632526398, "epoch": 0.204, "grad_norm": 1376.0, "kl_loss_12": 464.2669448852539, "kl_loss_17": 129.4686809539795, "kl_loss_3": 2529.653125, "kl_loss_6": 1470.6206970214844, "learning_rate": 0.0009082062785988049, "loss": 1131.6215, "step": 2040 }, { "ce_loss_12": 3.245707297325134, "ce_loss_17": 3.098770356178284, "ce_loss_23": 3.0233798742294313, "ce_loss_3": 4.126246273517609, "ce_loss_6": 3.6379145860671995, "epoch": 0.205, "grad_norm": 960.0, "kl_loss_12": 453.8625259399414, "kl_loss_17": 140.02084579467774, "kl_loss_3": 2379.939306640625, "kl_loss_6": 1388.1914367675781, "learning_rate": 0.0009072879760251679, "loss": 1105.3567, "step": 2050 }, { "ce_loss_12": 3.199712705612183, "ce_loss_17": 3.0443119049072265, "ce_loss_23": 2.9701660871505737, "ce_loss_3": 4.1467338562011715, "ce_loss_6": 3.630892109870911, "epoch": 0.206, "grad_norm": 1064.0, "kl_loss_12": 460.1679229736328, "kl_loss_17": 135.61270484924316, "kl_loss_3": 2507.5966186523438, "kl_loss_6": 1452.1191162109376, "learning_rate": 0.0009063655720661341, "loss": 1115.2066, "step": 2060 }, { "ce_loss_12": 3.2333725929260253, "ce_loss_17": 3.0829896688461305, "ce_loss_23": 3.0119826197624207, "ce_loss_3": 4.128749263286591, "ce_loss_6": 3.6334719181060793, "epoch": 0.207, "grad_norm": 976.0, "kl_loss_12": 454.76455841064455, "kl_loss_17": 131.2639488220215, "kl_loss_3": 2376.0222778320312, "kl_loss_6": 1381.4890625, "learning_rate": 0.000905439076010301, "loss": 1093.0187, "step": 2070 }, { "ce_loss_12": 3.1996248722076417, "ce_loss_17": 3.0418588161468505, "ce_loss_23": 2.9662808060646055, "ce_loss_3": 4.1204675197601315, "ce_loss_6": 3.622704017162323, "epoch": 0.208, "grad_norm": 1136.0, "kl_loss_12": 462.43045196533205, "kl_loss_17": 133.24030723571778, "kl_loss_3": 2422.5058837890624, "kl_loss_6": 1413.2706298828125, "learning_rate": 0.0009045084971874737, "loss": 1085.4412, "step": 2080 }, { "ce_loss_12": 3.181397581100464, "ce_loss_17": 3.0283178806304933, "ce_loss_23": 2.9539447903633116, "ce_loss_3": 4.090775012969971, "ce_loss_6": 3.594604218006134, "epoch": 0.209, "grad_norm": 940.0, "kl_loss_12": 458.8741989135742, "kl_loss_17": 135.37420806884765, "kl_loss_3": 2409.614776611328, "kl_loss_6": 1406.745947265625, "learning_rate": 0.0009035738449685707, "loss": 1119.5962, "step": 2090 }, { "ce_loss_12": 3.1231150388717652, "ce_loss_17": 2.9623324155807493, "ce_loss_23": 2.8883072257041933, "ce_loss_3": 4.075538074970245, "ce_loss_6": 3.5636983036994936, "epoch": 0.21, "grad_norm": 880.0, "kl_loss_12": 464.61413421630857, "kl_loss_17": 129.5325019836426, "kl_loss_3": 2488.8315490722657, "kl_loss_6": 1451.3852172851562, "learning_rate": 0.0009026351287655293, "loss": 1103.3046, "step": 2100 }, { "ce_loss_12": 3.2977046012878417, "ce_loss_17": 3.1490105032920837, "ce_loss_23": 3.085929501056671, "ce_loss_3": 4.141742074489594, "ce_loss_6": 3.6752121686935424, "epoch": 0.211, "grad_norm": 1000.0, "kl_loss_12": 438.23717041015624, "kl_loss_17": 120.08209075927735, "kl_loss_3": 2275.33037109375, "kl_loss_6": 1330.7261840820313, "learning_rate": 0.0009016923580312113, "loss": 1046.0168, "step": 2110 }, { "ce_loss_12": 3.162338900566101, "ce_loss_17": 3.011285150051117, "ce_loss_23": 2.9455466270446777, "ce_loss_3": 4.060781872272491, "ce_loss_6": 3.5741167187690737, "epoch": 0.212, "grad_norm": 964.0, "kl_loss_12": 454.36209259033205, "kl_loss_17": 122.61332359313965, "kl_loss_3": 2368.1356811523438, "kl_loss_6": 1377.1678588867187, "learning_rate": 0.0009007455422593077, "loss": 1105.0621, "step": 2120 }, { "ce_loss_12": 3.191146957874298, "ce_loss_17": 3.0287877798080443, "ce_loss_23": 2.960277593135834, "ce_loss_3": 4.120349848270417, "ce_loss_6": 3.6093490719795227, "epoch": 0.213, "grad_norm": 1096.0, "kl_loss_12": 475.2200164794922, "kl_loss_17": 124.79928970336914, "kl_loss_3": 2481.0699829101563, "kl_loss_6": 1431.2489135742187, "learning_rate": 0.0008997946909842425, "loss": 1116.5895, "step": 2130 }, { "ce_loss_12": 3.215824568271637, "ce_loss_17": 3.037301540374756, "ce_loss_23": 2.9644772887229918, "ce_loss_3": 4.186346101760864, "ce_loss_6": 3.6540252685546877, "epoch": 0.214, "grad_norm": 988.0, "kl_loss_12": 500.931103515625, "kl_loss_17": 131.5737678527832, "kl_loss_3": 2556.2944519042967, "kl_loss_6": 1484.178729248047, "learning_rate": 0.0008988398137810777, "loss": 1118.9062, "step": 2140 }, { "ce_loss_12": 3.234931528568268, "ce_loss_17": 3.0760345458984375, "ce_loss_23": 3.0077293157577514, "ce_loss_3": 4.124112021923065, "ce_loss_6": 3.6427318930625914, "epoch": 0.215, "grad_norm": 984.0, "kl_loss_12": 464.1569320678711, "kl_loss_17": 121.77989540100097, "kl_loss_3": 2378.131787109375, "kl_loss_6": 1391.8884033203126, "learning_rate": 0.0008978809202654162, "loss": 1079.9229, "step": 2150 }, { "ce_loss_12": 3.212024140357971, "ce_loss_17": 3.052365016937256, "ce_loss_23": 2.9833517551422117, "ce_loss_3": 4.109187686443329, "ce_loss_6": 3.6157562851905825, "epoch": 0.216, "grad_norm": 1176.0, "kl_loss_12": 460.35668487548827, "kl_loss_17": 122.84857292175293, "kl_loss_3": 2358.084600830078, "kl_loss_6": 1370.2854736328125, "learning_rate": 0.0008969180200933046, "loss": 1091.8207, "step": 2160 }, { "ce_loss_12": 3.179626762866974, "ce_loss_17": 3.0116968154907227, "ce_loss_23": 2.9410921812057493, "ce_loss_3": 4.11946210861206, "ce_loss_6": 3.608268916606903, "epoch": 0.217, "grad_norm": 1184.0, "kl_loss_12": 475.6806076049805, "kl_loss_17": 127.56976089477538, "kl_loss_3": 2447.6637939453126, "kl_loss_6": 1422.2298706054687, "learning_rate": 0.0008959511229611376, "loss": 1118.1624, "step": 2170 }, { "ce_loss_12": 3.2447680592536927, "ce_loss_17": 3.0842313051223753, "ce_loss_23": 3.0126760601997375, "ce_loss_3": 4.148605918884277, "ce_loss_6": 3.6437505006790163, "epoch": 0.218, "grad_norm": 1096.0, "kl_loss_12": 463.01764221191405, "kl_loss_17": 129.428275680542, "kl_loss_3": 2418.147393798828, "kl_loss_6": 1401.0833679199218, "learning_rate": 0.0008949802386055581, "loss": 1098.8416, "step": 2180 }, { "ce_loss_12": 3.114654839038849, "ce_loss_17": 2.9563396692276003, "ce_loss_23": 2.88834308385849, "ce_loss_3": 4.020838689804077, "ce_loss_6": 3.529152476787567, "epoch": 0.219, "grad_norm": 792.0, "kl_loss_12": 456.43510284423826, "kl_loss_17": 126.8063793182373, "kl_loss_3": 2362.814306640625, "kl_loss_6": 1375.901593017578, "learning_rate": 0.0008940053768033609, "loss": 1109.1713, "step": 2190 }, { "ce_loss_12": 3.1917166471481324, "ce_loss_17": 3.041363799571991, "ce_loss_23": 2.971977376937866, "ce_loss_3": 4.0816752195358275, "ce_loss_6": 3.5965646266937257, "epoch": 0.22, "grad_norm": 820.0, "kl_loss_12": 450.95191802978513, "kl_loss_17": 124.02299880981445, "kl_loss_3": 2371.4427490234375, "kl_loss_6": 1382.6896728515626, "learning_rate": 0.0008930265473713938, "loss": 1082.2387, "step": 2200 }, { "ce_loss_12": 3.1572428226470945, "ce_loss_17": 3.003165936470032, "ce_loss_23": 2.9327398777008056, "ce_loss_3": 4.064587521553039, "ce_loss_6": 3.566171145439148, "epoch": 0.221, "grad_norm": 860.0, "kl_loss_12": 453.63380737304686, "kl_loss_17": 127.90091209411621, "kl_loss_3": 2376.099816894531, "kl_loss_6": 1378.780255126953, "learning_rate": 0.0008920437601664579, "loss": 1066.1129, "step": 2210 }, { "ce_loss_12": 3.160594344139099, "ce_loss_17": 3.002773475646973, "ce_loss_23": 2.9288374304771425, "ce_loss_3": 4.075528597831726, "ce_loss_6": 3.578319561481476, "epoch": 0.222, "grad_norm": 1072.0, "kl_loss_12": 465.2222396850586, "kl_loss_17": 133.5637550354004, "kl_loss_3": 2424.0534545898436, "kl_loss_6": 1415.0123779296875, "learning_rate": 0.0008910570250852097, "loss": 1082.2885, "step": 2220 }, { "ce_loss_12": 3.236863040924072, "ce_loss_17": 3.0909160494804384, "ce_loss_23": 3.020583617687225, "ce_loss_3": 4.092880666255951, "ce_loss_6": 3.612149167060852, "epoch": 0.223, "grad_norm": 944.0, "kl_loss_12": 444.67178802490236, "kl_loss_17": 127.97508087158204, "kl_loss_3": 2295.9597778320312, "kl_loss_6": 1331.6259887695312, "learning_rate": 0.0008900663520640604, "loss": 1054.1037, "step": 2230 }, { "ce_loss_12": 3.201057267189026, "ce_loss_17": 3.048276162147522, "ce_loss_23": 2.9798381328582764, "ce_loss_3": 4.093599379062653, "ce_loss_6": 3.5987416982650755, "epoch": 0.224, "grad_norm": 1032.0, "kl_loss_12": 452.34375, "kl_loss_17": 126.09738502502441, "kl_loss_3": 2378.9890625, "kl_loss_6": 1368.7392456054688, "learning_rate": 0.0008890717510790764, "loss": 1085.1502, "step": 2240 }, { "ce_loss_12": 3.1611531615257262, "ce_loss_17": 3.0081787943840026, "ce_loss_23": 2.941867744922638, "ce_loss_3": 4.073286545276642, "ce_loss_6": 3.5713559865951536, "epoch": 0.225, "grad_norm": 980.0, "kl_loss_12": 449.97607879638673, "kl_loss_17": 120.34653587341309, "kl_loss_3": 2397.9912719726562, "kl_loss_6": 1380.2239501953125, "learning_rate": 0.0008880732321458784, "loss": 1093.934, "step": 2250 }, { "ce_loss_12": 3.194337809085846, "ce_loss_17": 3.0360872030258177, "ce_loss_23": 2.968069648742676, "ce_loss_3": 4.076019847393036, "ce_loss_6": 3.5874541282653807, "epoch": 0.226, "grad_norm": 772.0, "kl_loss_12": 453.4205032348633, "kl_loss_17": 121.36964416503906, "kl_loss_3": 2351.7071899414063, "kl_loss_6": 1369.5370178222656, "learning_rate": 0.0008870708053195413, "loss": 1092.1357, "step": 2260 }, { "ce_loss_12": 3.211677670478821, "ce_loss_17": 3.060397219657898, "ce_loss_23": 2.994785559177399, "ce_loss_3": 4.074770200252533, "ce_loss_6": 3.60477374792099, "epoch": 0.227, "grad_norm": 1016.0, "kl_loss_12": 438.2804779052734, "kl_loss_17": 117.40936431884765, "kl_loss_3": 2312.314312744141, "kl_loss_6": 1352.8179443359375, "learning_rate": 0.0008860644806944918, "loss": 1065.2646, "step": 2270 }, { "ce_loss_12": 3.1623907327651977, "ce_loss_17": 3.0015741109848024, "ce_loss_23": 2.9344356179237367, "ce_loss_3": 4.077270436286926, "ce_loss_6": 3.569185900688171, "epoch": 0.228, "grad_norm": 884.0, "kl_loss_12": 458.0107681274414, "kl_loss_17": 121.36178131103516, "kl_loss_3": 2420.191510009766, "kl_loss_6": 1396.1072570800782, "learning_rate": 0.0008850542684044079, "loss": 1068.0572, "step": 2280 }, { "ce_loss_12": 3.132009541988373, "ce_loss_17": 2.966815006732941, "ce_loss_23": 2.8952775597572327, "ce_loss_3": 4.08059937953949, "ce_loss_6": 3.5586902499198914, "epoch": 0.229, "grad_norm": 1012.0, "kl_loss_12": 469.1373596191406, "kl_loss_17": 125.32361526489258, "kl_loss_3": 2500.100225830078, "kl_loss_6": 1446.2523620605468, "learning_rate": 0.0008840401786221159, "loss": 1094.5689, "step": 2290 }, { "ce_loss_12": 3.2482141852378845, "ce_loss_17": 3.0996639013290403, "ce_loss_23": 3.0369733691215517, "ce_loss_3": 4.129230093955994, "ce_loss_6": 3.6413492441177366, "epoch": 0.23, "grad_norm": 884.0, "kl_loss_12": 434.2909759521484, "kl_loss_17": 113.66693344116212, "kl_loss_3": 2317.9691650390623, "kl_loss_6": 1341.140283203125, "learning_rate": 0.000883022221559489, "loss": 1051.0217, "step": 2300 }, { "ce_loss_12": 3.2230896234512327, "ce_loss_17": 3.068192791938782, "ce_loss_23": 3.0025680303573608, "ce_loss_3": 4.128422045707703, "ce_loss_6": 3.628089118003845, "epoch": 0.231, "grad_norm": 1000.0, "kl_loss_12": 443.489567565918, "kl_loss_17": 118.53048629760742, "kl_loss_3": 2399.0051025390626, "kl_loss_6": 1382.7301330566406, "learning_rate": 0.0008820004074673434, "loss": 1112.2879, "step": 2310 }, { "ce_loss_12": 3.1339518427848816, "ce_loss_17": 2.9779538989067076, "ce_loss_23": 2.9133885741233825, "ce_loss_3": 4.032837772369385, "ce_loss_6": 3.536777412891388, "epoch": 0.232, "grad_norm": 852.0, "kl_loss_12": 444.94561157226565, "kl_loss_17": 117.30743064880372, "kl_loss_3": 2404.906109619141, "kl_loss_6": 1389.1506958007812, "learning_rate": 0.0008809747466353355, "loss": 1071.7189, "step": 2320 }, { "ce_loss_12": 3.1373749494552614, "ce_loss_17": 2.982857036590576, "ce_loss_23": 2.9156289100646973, "ce_loss_3": 4.049996149539948, "ce_loss_6": 3.54087575674057, "epoch": 0.233, "grad_norm": 984.0, "kl_loss_12": 440.93540344238284, "kl_loss_17": 120.87570838928222, "kl_loss_3": 2391.51513671875, "kl_loss_6": 1362.7610961914063, "learning_rate": 0.0008799452493918585, "loss": 1085.2658, "step": 2330 }, { "ce_loss_12": 3.2044550657272337, "ce_loss_17": 3.0486762285232545, "ce_loss_23": 2.9857989072799684, "ce_loss_3": 4.096554934978485, "ce_loss_6": 3.6082839727401734, "epoch": 0.234, "grad_norm": 884.0, "kl_loss_12": 438.3496551513672, "kl_loss_17": 117.50934257507325, "kl_loss_3": 2366.7732543945312, "kl_loss_6": 1372.0487976074219, "learning_rate": 0.0008789119261039385, "loss": 1108.4551, "step": 2340 }, { "ce_loss_12": 3.1221569895744326, "ce_loss_17": 2.9757789373397827, "ce_loss_23": 2.905420696735382, "ce_loss_3": 4.026392912864685, "ce_loss_6": 3.5348900079727175, "epoch": 0.235, "grad_norm": 1012.0, "kl_loss_12": 442.50169982910154, "kl_loss_17": 124.05238304138183, "kl_loss_3": 2375.810498046875, "kl_loss_6": 1379.4071655273438, "learning_rate": 0.0008778747871771292, "loss": 1059.9826, "step": 2350 }, { "ce_loss_12": 3.166475248336792, "ce_loss_17": 3.0180999040603638, "ce_loss_23": 2.9576680302619933, "ce_loss_3": 4.037302458286286, "ce_loss_6": 3.55890007019043, "epoch": 0.236, "grad_norm": 1008.0, "kl_loss_12": 424.8075119018555, "kl_loss_17": 121.03117790222169, "kl_loss_3": 2302.770233154297, "kl_loss_6": 1329.6474670410157, "learning_rate": 0.0008768338430554083, "loss": 1043.8572, "step": 2360 }, { "ce_loss_12": 3.1851816058158873, "ce_loss_17": 3.042919707298279, "ce_loss_23": 2.9646944642066955, "ce_loss_3": 4.070168995857239, "ce_loss_6": 3.5822435140609743, "epoch": 0.237, "grad_norm": 1004.0, "kl_loss_12": 445.9438781738281, "kl_loss_17": 137.5145179748535, "kl_loss_3": 2323.822509765625, "kl_loss_6": 1348.6503540039062, "learning_rate": 0.0008757891042210713, "loss": 1073.7238, "step": 2370 }, { "ce_loss_12": 3.195702016353607, "ce_loss_17": 3.0519804120063783, "ce_loss_23": 2.9797383666038515, "ce_loss_3": 4.07917971611023, "ce_loss_6": 3.590975046157837, "epoch": 0.238, "grad_norm": 1032.0, "kl_loss_12": 435.39076232910156, "kl_loss_17": 136.03977317810057, "kl_loss_3": 2320.868212890625, "kl_loss_6": 1343.4944396972655, "learning_rate": 0.0008747405811946271, "loss": 1066.0041, "step": 2380 }, { "ce_loss_12": 3.106317698955536, "ce_loss_17": 2.969082570075989, "ce_loss_23": 2.8858501434326174, "ce_loss_3": 4.052463281154632, "ce_loss_6": 3.5339264392852785, "epoch": 0.239, "grad_norm": 936.0, "kl_loss_12": 450.8729217529297, "kl_loss_17": 147.4965450286865, "kl_loss_3": 2465.299914550781, "kl_loss_6": 1418.1998718261718, "learning_rate": 0.0008736882845346905, "loss": 1078.8531, "step": 2390 }, { "ce_loss_12": 3.189059865474701, "ce_loss_17": 3.0437240958213807, "ce_loss_23": 2.967745578289032, "ce_loss_3": 4.093241429328918, "ce_loss_6": 3.5936091780662536, "epoch": 0.24, "grad_norm": 932.0, "kl_loss_12": 443.7699142456055, "kl_loss_17": 135.94134826660155, "kl_loss_3": 2352.109338378906, "kl_loss_6": 1349.3434997558593, "learning_rate": 0.0008726322248378774, "loss": 1059.4339, "step": 2400 }, { "ce_loss_12": 3.1886565566062925, "ce_loss_17": 3.0453128695487974, "ce_loss_23": 2.97672917842865, "ce_loss_3": 4.110389256477356, "ce_loss_6": 3.5999670028686523, "epoch": 0.241, "grad_norm": 904.0, "kl_loss_12": 431.85757598876955, "kl_loss_17": 126.28680725097657, "kl_loss_3": 2421.1709594726562, "kl_loss_6": 1384.5716857910156, "learning_rate": 0.0008715724127386971, "loss": 1099.6822, "step": 2410 }, { "ce_loss_12": 3.2481358289718627, "ce_loss_17": 3.1088537335395814, "ce_loss_23": 3.042147898674011, "ce_loss_3": 4.126234018802643, "ce_loss_6": 3.647673797607422, "epoch": 0.242, "grad_norm": 836.0, "kl_loss_12": 427.24434967041014, "kl_loss_17": 123.11380424499512, "kl_loss_3": 2319.6755432128907, "kl_loss_6": 1339.1436645507813, "learning_rate": 0.0008705088589094458, "loss": 1064.8498, "step": 2420 }, { "ce_loss_12": 3.265853762626648, "ce_loss_17": 3.1238319754600523, "ce_loss_23": 3.055406653881073, "ce_loss_3": 4.153356218338013, "ce_loss_6": 3.6705095887184145, "epoch": 0.243, "grad_norm": 896.0, "kl_loss_12": 431.32056427001953, "kl_loss_17": 122.70526237487793, "kl_loss_3": 2335.2805053710936, "kl_loss_6": 1359.439990234375, "learning_rate": 0.0008694415740600988, "loss": 1067.6548, "step": 2430 }, { "ce_loss_12": 3.1318012833595277, "ce_loss_17": 2.9968308091163633, "ce_loss_23": 2.9192454338073732, "ce_loss_3": 4.06026520729065, "ce_loss_6": 3.544374239444733, "epoch": 0.244, "grad_norm": 1024.0, "kl_loss_12": 437.3179306030273, "kl_loss_17": 133.74676437377929, "kl_loss_3": 2428.2571411132812, "kl_loss_6": 1388.4118103027345, "learning_rate": 0.0008683705689382025, "loss": 1077.0226, "step": 2440 }, { "ce_loss_12": 3.203168284893036, "ce_loss_17": 3.0576744079589844, "ce_loss_23": 2.9963975667953493, "ce_loss_3": 4.066239619255066, "ce_loss_6": 3.5964288830757143, "epoch": 0.245, "grad_norm": 960.0, "kl_loss_12": 425.4798583984375, "kl_loss_17": 120.27335968017579, "kl_loss_3": 2308.929821777344, "kl_loss_6": 1336.3246276855468, "learning_rate": 0.0008672958543287666, "loss": 1071.6658, "step": 2450 }, { "ce_loss_12": 3.2136458277702333, "ce_loss_17": 3.070457136631012, "ce_loss_23": 3.0044744491577147, "ce_loss_3": 4.076639878749847, "ce_loss_6": 3.60375052690506, "epoch": 0.246, "grad_norm": 1096.0, "kl_loss_12": 434.4812072753906, "kl_loss_17": 120.4621150970459, "kl_loss_3": 2290.6869506835938, "kl_loss_6": 1332.1749755859375, "learning_rate": 0.0008662174410541554, "loss": 1044.3947, "step": 2460 }, { "ce_loss_12": 3.176317536830902, "ce_loss_17": 3.0331140756607056, "ce_loss_23": 2.97104926109314, "ce_loss_3": 4.0398347735404965, "ce_loss_6": 3.5594708561897277, "epoch": 0.247, "grad_norm": 1048.0, "kl_loss_12": 426.33206329345705, "kl_loss_17": 114.9592658996582, "kl_loss_3": 2284.038134765625, "kl_loss_6": 1314.4482360839843, "learning_rate": 0.0008651353399739787, "loss": 1069.7281, "step": 2470 }, { "ce_loss_12": 3.209623324871063, "ce_loss_17": 3.057310092449188, "ce_loss_23": 2.994291377067566, "ce_loss_3": 4.081614565849304, "ce_loss_6": 3.5966670632362367, "epoch": 0.248, "grad_norm": 988.0, "kl_loss_12": 431.0287292480469, "kl_loss_17": 114.07895278930664, "kl_loss_3": 2297.882537841797, "kl_loss_6": 1327.6509521484375, "learning_rate": 0.0008640495619849821, "loss": 1048.9321, "step": 2480 }, { "ce_loss_12": 3.1695863127708437, "ce_loss_17": 3.0206604480743406, "ce_loss_23": 2.9585460782051087, "ce_loss_3": 4.034311413764954, "ce_loss_6": 3.5579578995704653, "epoch": 0.249, "grad_norm": 1240.0, "kl_loss_12": 427.37953186035156, "kl_loss_17": 114.5454231262207, "kl_loss_3": 2294.754217529297, "kl_loss_6": 1322.4585266113281, "learning_rate": 0.0008629601180209381, "loss": 1039.6441, "step": 2490 }, { "ce_loss_12": 3.1601105690002442, "ce_loss_17": 3.0158230662345886, "ce_loss_23": 2.9512540102005005, "ce_loss_3": 4.029484963417053, "ce_loss_6": 3.548910307884216, "epoch": 0.25, "grad_norm": 1072.0, "kl_loss_12": 424.3945587158203, "kl_loss_17": 114.85591850280761, "kl_loss_3": 2280.948974609375, "kl_loss_6": 1314.254864501953, "learning_rate": 0.000861867019052535, "loss": 1051.2812, "step": 2500 }, { "ce_loss_12": 3.0931338906288146, "ce_loss_17": 2.941186046600342, "ce_loss_23": 2.8769421696662905, "ce_loss_3": 4.009210073947907, "ce_loss_6": 3.507812964916229, "epoch": 0.251, "grad_norm": 940.0, "kl_loss_12": 436.55210571289064, "kl_loss_17": 114.65191955566407, "kl_loss_3": 2379.95205078125, "kl_loss_6": 1371.4806030273437, "learning_rate": 0.0008607702760872678, "loss": 1074.0451, "step": 2510 }, { "ce_loss_12": 3.188943064212799, "ce_loss_17": 3.040559208393097, "ce_loss_23": 2.979034185409546, "ce_loss_3": 4.04866658449173, "ce_loss_6": 3.579585540294647, "epoch": 0.252, "grad_norm": 1144.0, "kl_loss_12": 425.6948501586914, "kl_loss_17": 112.35841369628906, "kl_loss_3": 2257.3027099609376, "kl_loss_6": 1316.080010986328, "learning_rate": 0.0008596699001693256, "loss": 1060.0463, "step": 2520 }, { "ce_loss_12": 3.199018669128418, "ce_loss_17": 3.056893455982208, "ce_loss_23": 2.996263098716736, "ce_loss_3": 4.052443480491638, "ce_loss_6": 3.5751360654830933, "epoch": 0.253, "grad_norm": 996.0, "kl_loss_12": 428.9387924194336, "kl_loss_17": 111.91889839172363, "kl_loss_3": 2277.7624450683593, "kl_loss_6": 1303.7885803222657, "learning_rate": 0.0008585659023794818, "loss": 1060.6101, "step": 2530 }, { "ce_loss_12": 3.175670659542084, "ce_loss_17": 3.021572208404541, "ce_loss_23": 2.9581612825393675, "ce_loss_3": 4.091776359081268, "ce_loss_6": 3.5962265133857727, "epoch": 0.254, "grad_norm": 1424.0, "kl_loss_12": 441.34724884033204, "kl_loss_17": 116.35274925231934, "kl_loss_3": 2388.9944702148437, "kl_loss_6": 1388.9312561035156, "learning_rate": 0.0008574582938349817, "loss": 1065.3421, "step": 2540 }, { "ce_loss_12": 3.1620734691619874, "ce_loss_17": 3.002826249599457, "ce_loss_23": 2.9333348870277405, "ce_loss_3": 4.068507182598114, "ce_loss_6": 3.5735328078269957, "epoch": 0.255, "grad_norm": 900.0, "kl_loss_12": 454.44627380371094, "kl_loss_17": 121.72102546691895, "kl_loss_3": 2384.1946350097655, "kl_loss_6": 1385.3286499023438, "learning_rate": 0.0008563470856894315, "loss": 1052.7078, "step": 2550 }, { "ce_loss_12": 3.148736262321472, "ce_loss_17": 2.999965786933899, "ce_loss_23": 2.9350846290588377, "ce_loss_3": 4.050228786468506, "ce_loss_6": 3.5579028367996215, "epoch": 0.256, "grad_norm": 1012.0, "kl_loss_12": 432.3045715332031, "kl_loss_17": 114.02141952514648, "kl_loss_3": 2349.2908142089846, "kl_loss_6": 1367.2147155761718, "learning_rate": 0.0008552322891326845, "loss": 1056.7793, "step": 2560 }, { "ce_loss_12": 3.1203397512435913, "ce_loss_17": 2.9703470706939696, "ce_loss_23": 2.9060755014419555, "ce_loss_3": 4.019045674800873, "ce_loss_6": 3.527764308452606, "epoch": 0.257, "grad_norm": 952.0, "kl_loss_12": 431.4568145751953, "kl_loss_17": 113.82027130126953, "kl_loss_3": 2349.3852966308596, "kl_loss_6": 1351.4473205566405, "learning_rate": 0.0008541139153907296, "loss": 1045.5965, "step": 2570 }, { "ce_loss_12": 3.0819831013679506, "ce_loss_17": 2.9337033152580263, "ce_loss_23": 2.87243572473526, "ce_loss_3": 3.977305507659912, "ce_loss_6": 3.48627450466156, "epoch": 0.258, "grad_norm": 884.0, "kl_loss_12": 425.45489501953125, "kl_loss_17": 110.58322982788086, "kl_loss_3": 2335.633154296875, "kl_loss_6": 1345.2675415039062, "learning_rate": 0.0008529919757255782, "loss": 1064.3127, "step": 2580 }, { "ce_loss_12": 3.107268238067627, "ce_loss_17": 2.967454993724823, "ce_loss_23": 2.909093964099884, "ce_loss_3": 3.9463349342346192, "ce_loss_6": 3.485192346572876, "epoch": 0.259, "grad_norm": 988.0, "kl_loss_12": 414.2021087646484, "kl_loss_17": 111.3341178894043, "kl_loss_3": 2240.2564697265625, "kl_loss_6": 1291.5567199707032, "learning_rate": 0.0008518664814351503, "loss": 1027.255, "step": 2590 }, { "ce_loss_12": 3.0857400417327883, "ce_loss_17": 2.9320381045341493, "ce_loss_23": 2.8670764684677126, "ce_loss_3": 3.991806662082672, "ce_loss_6": 3.48983781337738, "epoch": 0.26, "grad_norm": 1064.0, "kl_loss_12": 441.3775390625, "kl_loss_17": 120.65605735778809, "kl_loss_3": 2370.8674560546874, "kl_loss_6": 1358.5048706054688, "learning_rate": 0.0008507374438531607, "loss": 1100.9932, "step": 2600 }, { "ce_loss_12": 3.0624029636383057, "ce_loss_17": 2.9160149931907653, "ce_loss_23": 2.855610120296478, "ce_loss_3": 3.9515944004058836, "ce_loss_6": 3.459726858139038, "epoch": 0.261, "grad_norm": 1072.0, "kl_loss_12": 426.2549850463867, "kl_loss_17": 115.3815818786621, "kl_loss_3": 2323.017303466797, "kl_loss_6": 1336.0733459472656, "learning_rate": 0.0008496048743490053, "loss": 1046.5982, "step": 2610 }, { "ce_loss_12": 3.2101902842521666, "ce_loss_17": 3.0653091192245485, "ce_loss_23": 3.000552237033844, "ce_loss_3": 4.061716663837433, "ce_loss_6": 3.59234459400177, "epoch": 0.262, "grad_norm": 824.0, "kl_loss_12": 426.0384750366211, "kl_loss_17": 115.67604293823243, "kl_loss_3": 2261.2772827148438, "kl_loss_6": 1312.9195495605468, "learning_rate": 0.0008484687843276469, "loss": 1040.8359, "step": 2620 }, { "ce_loss_12": 3.14266951084137, "ce_loss_17": 2.99924293756485, "ce_loss_23": 2.9332231998443605, "ce_loss_3": 4.01922596693039, "ce_loss_6": 3.536488151550293, "epoch": 0.263, "grad_norm": 1296.0, "kl_loss_12": 436.6288177490234, "kl_loss_17": 118.81113357543946, "kl_loss_3": 2318.0344421386717, "kl_loss_6": 1339.8821228027343, "learning_rate": 0.0008473291852294987, "loss": 1064.5347, "step": 2630 }, { "ce_loss_12": 3.1522043466567995, "ce_loss_17": 3.0062794923782348, "ce_loss_23": 2.938192343711853, "ce_loss_3": 4.026766860485077, "ce_loss_6": 3.545170855522156, "epoch": 0.264, "grad_norm": 1208.0, "kl_loss_12": 429.8892822265625, "kl_loss_17": 119.1082977294922, "kl_loss_3": 2327.79921875, "kl_loss_6": 1337.9722473144532, "learning_rate": 0.0008461860885303114, "loss": 1042.7791, "step": 2640 }, { "ce_loss_12": 3.1773787260055544, "ce_loss_17": 3.032958376407623, "ce_loss_23": 2.9696513175964356, "ce_loss_3": 4.03357914686203, "ce_loss_6": 3.5610399723052977, "epoch": 0.265, "grad_norm": 1304.0, "kl_loss_12": 421.92885131835936, "kl_loss_17": 116.15681076049805, "kl_loss_3": 2260.550634765625, "kl_loss_6": 1304.3487976074218, "learning_rate": 0.000845039505741056, "loss": 1044.8391, "step": 2650 }, { "ce_loss_12": 3.1606823325157167, "ce_loss_17": 3.0121636748313905, "ce_loss_23": 2.9492172598838806, "ce_loss_3": 4.040087330341339, "ce_loss_6": 3.5578425526618958, "epoch": 0.266, "grad_norm": 1168.0, "kl_loss_12": 441.35157623291013, "kl_loss_17": 117.9492961883545, "kl_loss_3": 2351.2228820800783, "kl_loss_6": 1366.394580078125, "learning_rate": 0.0008438894484078086, "loss": 1089.5979, "step": 2660 }, { "ce_loss_12": 3.1689205169677734, "ce_loss_17": 3.0273826241493227, "ce_loss_23": 2.9663585782051087, "ce_loss_3": 4.033535480499268, "ce_loss_6": 3.5576565623283387, "epoch": 0.267, "grad_norm": 968.0, "kl_loss_12": 421.36059722900393, "kl_loss_17": 116.55185966491699, "kl_loss_3": 2282.9838684082033, "kl_loss_6": 1319.715594482422, "learning_rate": 0.0008427359281116334, "loss": 1043.9596, "step": 2670 }, { "ce_loss_12": 3.0684268474578857, "ce_loss_17": 2.9236847519874574, "ce_loss_23": 2.860447108745575, "ce_loss_3": 3.9806047320365905, "ce_loss_6": 3.484599268436432, "epoch": 0.268, "grad_norm": 1128.0, "kl_loss_12": 426.485041809082, "kl_loss_17": 115.8799388885498, "kl_loss_3": 2370.3961486816406, "kl_loss_6": 1367.2152770996095, "learning_rate": 0.0008415789564684673, "loss": 1059.0766, "step": 2680 }, { "ce_loss_12": 3.29848473072052, "ce_loss_17": 3.1556638956069945, "ce_loss_23": 3.0902445912361145, "ce_loss_3": 4.151888036727906, "ce_loss_6": 3.6841147541999817, "epoch": 0.269, "grad_norm": 876.0, "kl_loss_12": 427.7517349243164, "kl_loss_17": 117.3643009185791, "kl_loss_3": 2245.7098876953123, "kl_loss_6": 1308.4502197265624, "learning_rate": 0.0008404185451290017, "loss": 1024.183, "step": 2690 }, { "ce_loss_12": 3.178904449939728, "ce_loss_17": 3.034889113903046, "ce_loss_23": 2.9719478368759153, "ce_loss_3": 4.050006449222565, "ce_loss_6": 3.565938127040863, "epoch": 0.27, "grad_norm": 940.0, "kl_loss_12": 420.5887252807617, "kl_loss_17": 113.05890274047852, "kl_loss_3": 2283.9570190429686, "kl_loss_6": 1309.367413330078, "learning_rate": 0.0008392547057785661, "loss": 1031.9732, "step": 2700 }, { "ce_loss_12": 3.1153995633125304, "ce_loss_17": 2.967165696620941, "ce_loss_23": 2.9075494527816774, "ce_loss_3": 4.030291855335236, "ce_loss_6": 3.5235714077949525, "epoch": 0.271, "grad_norm": 944.0, "kl_loss_12": 431.7582672119141, "kl_loss_17": 113.72574844360352, "kl_loss_3": 2407.028039550781, "kl_loss_6": 1371.627508544922, "learning_rate": 0.0008380874501370098, "loss": 1038.0814, "step": 2710 }, { "ce_loss_12": 3.107207548618317, "ce_loss_17": 2.9570419549942017, "ce_loss_23": 2.892813432216644, "ce_loss_3": 4.019529783725739, "ce_loss_6": 3.521112060546875, "epoch": 0.272, "grad_norm": 1032.0, "kl_loss_12": 437.33069915771483, "kl_loss_17": 114.94045181274414, "kl_loss_3": 2386.276776123047, "kl_loss_6": 1373.3397644042968, "learning_rate": 0.0008369167899585841, "loss": 1059.7808, "step": 2720 }, { "ce_loss_12": 3.21390700340271, "ce_loss_17": 3.072514033317566, "ce_loss_23": 3.011313247680664, "ce_loss_3": 4.039605271816254, "ce_loss_6": 3.5885566592216493, "epoch": 0.273, "grad_norm": 824.0, "kl_loss_12": 416.95458068847654, "kl_loss_17": 111.16670646667481, "kl_loss_3": 2206.9629821777344, "kl_loss_6": 1289.4851623535155, "learning_rate": 0.0008357427370318238, "loss": 1043.6662, "step": 2730 }, { "ce_loss_12": 3.170186769962311, "ce_loss_17": 3.0258415222167967, "ce_loss_23": 2.9628575444221497, "ce_loss_3": 4.049800419807434, "ce_loss_6": 3.5638309955596923, "epoch": 0.274, "grad_norm": 1020.0, "kl_loss_12": 423.51794586181643, "kl_loss_17": 110.81505393981934, "kl_loss_3": 2313.176416015625, "kl_loss_6": 1318.6596801757812, "learning_rate": 0.0008345653031794292, "loss": 1050.7198, "step": 2740 }, { "ce_loss_12": 3.1771156549453736, "ce_loss_17": 3.0280372500419617, "ce_loss_23": 2.965675616264343, "ce_loss_3": 4.041014420986175, "ce_loss_6": 3.56474187374115, "epoch": 0.275, "grad_norm": 1272.0, "kl_loss_12": 423.6371154785156, "kl_loss_17": 111.59429397583008, "kl_loss_3": 2275.3608947753905, "kl_loss_6": 1306.164373779297, "learning_rate": 0.0008333845002581458, "loss": 1033.1295, "step": 2750 }, { "ce_loss_12": 3.1093812108039858, "ce_loss_17": 2.958666718006134, "ce_loss_23": 2.8973594903945923, "ce_loss_3": 4.003350925445557, "ce_loss_6": 3.521108365058899, "epoch": 0.276, "grad_norm": 868.0, "kl_loss_12": 435.3954574584961, "kl_loss_17": 113.91388969421386, "kl_loss_3": 2364.9137329101563, "kl_loss_6": 1371.7270202636719, "learning_rate": 0.0008322003401586462, "loss": 1063.1645, "step": 2760 }, { "ce_loss_12": 3.1297270178794863, "ce_loss_17": 2.990075874328613, "ce_loss_23": 2.9308247327804566, "ce_loss_3": 3.977475678920746, "ce_loss_6": 3.5078927755355833, "epoch": 0.277, "grad_norm": 992.0, "kl_loss_12": 408.00756683349607, "kl_loss_17": 106.93959312438965, "kl_loss_3": 2226.0518310546877, "kl_loss_6": 1278.1108642578124, "learning_rate": 0.0008310128348054094, "loss": 995.3268, "step": 2770 }, { "ce_loss_12": 3.109842586517334, "ce_loss_17": 2.965323638916016, "ce_loss_23": 2.905602777004242, "ce_loss_3": 3.9729790091514587, "ce_loss_6": 3.4934093952178955, "epoch": 0.278, "grad_norm": 1012.0, "kl_loss_12": 418.95628051757814, "kl_loss_17": 108.78140983581542, "kl_loss_3": 2277.008166503906, "kl_loss_6": 1308.7492614746093, "learning_rate": 0.0008298219961566008, "loss": 1030.3977, "step": 2780 }, { "ce_loss_12": 3.0712546944618224, "ce_loss_17": 2.926204741001129, "ce_loss_23": 2.8665321707725524, "ce_loss_3": 3.9693758845329286, "ce_loss_6": 3.480190944671631, "epoch": 0.279, "grad_norm": 1304.0, "kl_loss_12": 432.71239166259767, "kl_loss_17": 110.66368026733399, "kl_loss_3": 2384.9975463867186, "kl_loss_6": 1379.5857421875, "learning_rate": 0.0008286278362039527, "loss": 1040.0875, "step": 2790 }, { "ce_loss_12": 3.099072754383087, "ce_loss_17": 2.9543874859809875, "ce_loss_23": 2.8932689309120176, "ce_loss_3": 4.00441962480545, "ce_loss_6": 3.516203057765961, "epoch": 0.28, "grad_norm": 1024.0, "kl_loss_12": 423.47198181152345, "kl_loss_17": 112.17585258483886, "kl_loss_3": 2383.3101196289062, "kl_loss_6": 1379.0210693359375, "learning_rate": 0.0008274303669726426, "loss": 1041.7575, "step": 2800 }, { "ce_loss_12": 3.018801951408386, "ce_loss_17": 2.870440900325775, "ce_loss_23": 2.8090245962142943, "ce_loss_3": 3.937907099723816, "ce_loss_6": 3.436682391166687, "epoch": 0.281, "grad_norm": 848.0, "kl_loss_12": 422.47137451171875, "kl_loss_17": 111.5949893951416, "kl_loss_3": 2397.2886169433596, "kl_loss_6": 1368.81572265625, "learning_rate": 0.0008262296005211721, "loss": 1038.3342, "step": 2810 }, { "ce_loss_12": 3.1327635169029238, "ce_loss_17": 2.9874435663223267, "ce_loss_23": 2.925761067867279, "ce_loss_3": 4.023141646385193, "ce_loss_6": 3.5383160591125487, "epoch": 0.282, "grad_norm": 1064.0, "kl_loss_12": 425.43332824707034, "kl_loss_17": 112.3005428314209, "kl_loss_3": 2330.1076599121093, "kl_loss_6": 1347.2308227539063, "learning_rate": 0.0008250255489412463, "loss": 1035.3857, "step": 2820 }, { "ce_loss_12": 3.2296959280967714, "ce_loss_17": 3.0806389093399047, "ce_loss_23": 3.019387435913086, "ce_loss_3": 4.097268283367157, "ce_loss_6": 3.613387179374695, "epoch": 0.283, "grad_norm": 952.0, "kl_loss_12": 422.14527893066406, "kl_loss_17": 112.8663158416748, "kl_loss_3": 2287.187646484375, "kl_loss_6": 1308.6017761230469, "learning_rate": 0.0008238182243576511, "loss": 1037.0014, "step": 2830 }, { "ce_loss_12": 3.187533068656921, "ce_loss_17": 3.0509860515594482, "ce_loss_23": 2.9890125155448914, "ce_loss_3": 3.9914371371269226, "ce_loss_6": 3.5471837282180787, "epoch": 0.284, "grad_norm": 1072.0, "kl_loss_12": 410.6913238525391, "kl_loss_17": 115.82479705810547, "kl_loss_3": 2155.456982421875, "kl_loss_6": 1247.588641357422, "learning_rate": 0.0008226076389281315, "loss": 1002.2791, "step": 2840 }, { "ce_loss_12": 3.2360986948013304, "ce_loss_17": 3.095022213459015, "ce_loss_23": 3.0317818760871886, "ce_loss_3": 4.068842041492462, "ce_loss_6": 3.6044496297836304, "epoch": 0.285, "grad_norm": 1072.0, "kl_loss_12": 420.0049194335937, "kl_loss_17": 118.48114700317383, "kl_loss_3": 2255.8744262695313, "kl_loss_6": 1300.3296691894532, "learning_rate": 0.0008213938048432696, "loss": 1004.6877, "step": 2850 }, { "ce_loss_12": 3.1623266458511354, "ce_loss_17": 3.021411645412445, "ce_loss_23": 2.9557711124420165, "ce_loss_3": 4.008575391769409, "ce_loss_6": 3.5391808271408083, "epoch": 0.286, "grad_norm": 964.0, "kl_loss_12": 424.7983062744141, "kl_loss_17": 118.95002555847168, "kl_loss_3": 2251.5058654785157, "kl_loss_6": 1290.7272216796875, "learning_rate": 0.0008201767343263612, "loss": 1030.3887, "step": 2860 }, { "ce_loss_12": 3.106607210636139, "ce_loss_17": 2.961516261100769, "ce_loss_23": 2.8994472503662108, "ce_loss_3": 3.9940276861190798, "ce_loss_6": 3.5045013189315797, "epoch": 0.287, "grad_norm": 1040.0, "kl_loss_12": 418.3986389160156, "kl_loss_17": 114.02130393981933, "kl_loss_3": 2321.1509399414062, "kl_loss_6": 1338.0280334472657, "learning_rate": 0.0008189564396332927, "loss": 1011.8268, "step": 2870 }, { "ce_loss_12": 3.0934667229652404, "ce_loss_17": 2.956806683540344, "ce_loss_23": 2.8943321824073793, "ce_loss_3": 3.9781769156455993, "ce_loss_6": 3.4973119139671325, "epoch": 0.288, "grad_norm": 1064.0, "kl_loss_12": 411.86312408447264, "kl_loss_17": 113.3759391784668, "kl_loss_3": 2299.268927001953, "kl_loss_6": 1324.3412292480468, "learning_rate": 0.0008177329330524181, "loss": 1036.632, "step": 2880 }, { "ce_loss_12": 3.1487688541412355, "ce_loss_17": 3.0031996607780456, "ce_loss_23": 2.940968418121338, "ce_loss_3": 3.9927698016166686, "ce_loss_6": 3.5189738869667053, "epoch": 0.289, "grad_norm": 904.0, "kl_loss_12": 412.4456451416016, "kl_loss_17": 113.02369689941406, "kl_loss_3": 2214.7519104003904, "kl_loss_6": 1272.6784545898438, "learning_rate": 0.0008165062269044352, "loss": 1015.1071, "step": 2890 }, { "ce_loss_12": 3.1071110010147094, "ce_loss_17": 2.9628792285919188, "ce_loss_23": 2.9007930755615234, "ce_loss_3": 3.983273184299469, "ce_loss_6": 3.494569385051727, "epoch": 0.29, "grad_norm": 956.0, "kl_loss_12": 430.44505767822267, "kl_loss_17": 114.5558017730713, "kl_loss_3": 2310.420068359375, "kl_loss_6": 1315.7907348632812, "learning_rate": 0.0008152763335422613, "loss": 1043.0416, "step": 2900 }, { "ce_loss_12": 3.0977216720581056, "ce_loss_17": 2.9489872455596924, "ce_loss_23": 2.8858672261238096, "ce_loss_3": 3.9666316509246826, "ce_loss_6": 3.4889026165008543, "epoch": 0.291, "grad_norm": 1208.0, "kl_loss_12": 426.3074234008789, "kl_loss_17": 115.46015243530273, "kl_loss_3": 2301.8380798339845, "kl_loss_6": 1327.6948974609375, "learning_rate": 0.0008140432653509088, "loss": 1028.4336, "step": 2910 }, { "ce_loss_12": 3.1420980095863342, "ce_loss_17": 2.9944337725639345, "ce_loss_23": 2.9315978407859804, "ce_loss_3": 3.993865358829498, "ce_loss_6": 3.530861973762512, "epoch": 0.292, "grad_norm": 1232.0, "kl_loss_12": 428.00020904541014, "kl_loss_17": 113.76680603027344, "kl_loss_3": 2282.623883056641, "kl_loss_6": 1311.62802734375, "learning_rate": 0.0008128070347473608, "loss": 1026.46, "step": 2920 }, { "ce_loss_12": 3.1560147762298585, "ce_loss_17": 3.0118520140647886, "ce_loss_23": 2.948830020427704, "ce_loss_3": 4.042136788368225, "ce_loss_6": 3.5510075807571413, "epoch": 0.293, "grad_norm": 1064.0, "kl_loss_12": 429.2475128173828, "kl_loss_17": 113.55529251098633, "kl_loss_3": 2338.4506896972657, "kl_loss_6": 1337.592578125, "learning_rate": 0.0008115676541804455, "loss": 1029.9361, "step": 2930 }, { "ce_loss_12": 3.1561770677566527, "ce_loss_17": 3.0117817163467406, "ce_loss_23": 2.950511121749878, "ce_loss_3": 3.995435917377472, "ce_loss_6": 3.5341237783432007, "epoch": 0.294, "grad_norm": 932.0, "kl_loss_12": 416.59341583251955, "kl_loss_17": 111.01059761047364, "kl_loss_3": 2253.631555175781, "kl_loss_6": 1306.9321044921876, "learning_rate": 0.0008103251361307119, "loss": 1031.5532, "step": 2940 }, { "ce_loss_12": 3.1843759298324583, "ce_loss_17": 3.038155961036682, "ce_loss_23": 2.9773050785064696, "ce_loss_3": 4.043914568424225, "ce_loss_6": 3.5680703282356263, "epoch": 0.295, "grad_norm": 1224.0, "kl_loss_12": 420.00162506103516, "kl_loss_17": 111.82075843811035, "kl_loss_3": 2276.9667785644533, "kl_loss_6": 1320.3095397949219, "learning_rate": 0.0008090794931103026, "loss": 1022.3766, "step": 2950 }, { "ce_loss_12": 3.154700481891632, "ce_loss_17": 3.0144895553588866, "ce_loss_23": 2.9569777607917787, "ce_loss_3": 4.001096451282502, "ce_loss_6": 3.536224627494812, "epoch": 0.296, "grad_norm": 1128.0, "kl_loss_12": 408.6188430786133, "kl_loss_17": 106.91668930053712, "kl_loss_3": 2230.6013305664064, "kl_loss_6": 1284.477362060547, "learning_rate": 0.0008078307376628291, "loss": 1015.5539, "step": 2960 }, { "ce_loss_12": 3.216846990585327, "ce_loss_17": 3.07712641954422, "ce_loss_23": 3.020001995563507, "ce_loss_3": 4.0298261642456055, "ce_loss_6": 3.5871922373771667, "epoch": 0.297, "grad_norm": 1056.0, "kl_loss_12": 403.0998001098633, "kl_loss_17": 105.29769783020019, "kl_loss_3": 2154.7777099609375, "kl_loss_6": 1254.5455261230468, "learning_rate": 0.000806578882363245, "loss": 982.7731, "step": 2970 }, { "ce_loss_12": 3.1353084087371825, "ce_loss_17": 2.9982158064842226, "ce_loss_23": 2.9405212998390198, "ce_loss_3": 3.9612733006477354, "ce_loss_6": 3.5113443732261658, "epoch": 0.298, "grad_norm": 1040.0, "kl_loss_12": 405.961865234375, "kl_loss_17": 105.14496688842773, "kl_loss_3": 2208.7371337890627, "kl_loss_6": 1287.3426086425782, "learning_rate": 0.0008053239398177191, "loss": 1026.2143, "step": 2980 }, { "ce_loss_12": 3.1283560276031492, "ce_loss_17": 2.9837308287620545, "ce_loss_23": 2.9255617380142214, "ce_loss_3": 3.985158312320709, "ce_loss_6": 3.516181480884552, "epoch": 0.299, "grad_norm": 1184.0, "kl_loss_12": 409.7952484130859, "kl_loss_17": 106.93139038085937, "kl_loss_3": 2250.543109130859, "kl_loss_6": 1297.1369995117188, "learning_rate": 0.0008040659226635089, "loss": 1039.9756, "step": 2990 }, { "ce_loss_12": 3.2462770462036135, "ce_loss_17": 3.0983072400093077, "ce_loss_23": 3.034535789489746, "ce_loss_3": 4.075495111942291, "ce_loss_6": 3.623253059387207, "epoch": 0.3, "grad_norm": 1208.0, "kl_loss_12": 430.03038177490237, "kl_loss_17": 115.25655326843261, "kl_loss_3": 2238.4564697265623, "kl_loss_6": 1295.3731628417968, "learning_rate": 0.0008028048435688333, "loss": 1009.2599, "step": 3000 }, { "ce_loss_12": 3.1294857978820803, "ce_loss_17": 2.9839457511901855, "ce_loss_23": 2.9239335656166077, "ce_loss_3": 4.003237676620484, "ce_loss_6": 3.5273142576217653, "epoch": 0.301, "grad_norm": 1088.0, "kl_loss_12": 417.4100051879883, "kl_loss_17": 108.43363456726074, "kl_loss_3": 2304.9576171875, "kl_loss_6": 1328.046453857422, "learning_rate": 0.0008015407152327448, "loss": 1026.3252, "step": 3010 }, { "ce_loss_12": 3.164993369579315, "ce_loss_17": 3.02325359582901, "ce_loss_23": 2.9631327986717224, "ce_loss_3": 4.018820178508759, "ce_loss_6": 3.544352960586548, "epoch": 0.302, "grad_norm": 1016.0, "kl_loss_12": 413.7855987548828, "kl_loss_17": 110.33978080749512, "kl_loss_3": 2281.8075317382813, "kl_loss_6": 1308.32470703125, "learning_rate": 0.0008002735503850016, "loss": 1024.3284, "step": 3020 }, { "ce_loss_12": 3.0686097145080566, "ce_loss_17": 2.923471248149872, "ce_loss_23": 2.862740170955658, "ce_loss_3": 3.9519243001937867, "ce_loss_6": 3.4707289814949034, "epoch": 0.303, "grad_norm": 1176.0, "kl_loss_12": 420.33300933837893, "kl_loss_17": 111.22417945861817, "kl_loss_3": 2320.516729736328, "kl_loss_6": 1341.770770263672, "learning_rate": 0.0007990033617859396, "loss": 1043.5717, "step": 3030 }, { "ce_loss_12": 3.1193864822387694, "ce_loss_17": 2.9834821939468386, "ce_loss_23": 2.918761098384857, "ce_loss_3": 3.9739678502082825, "ce_loss_6": 3.4987839579582216, "epoch": 0.304, "grad_norm": 1024.0, "kl_loss_12": 407.8907638549805, "kl_loss_17": 115.28406105041503, "kl_loss_3": 2240.3181579589846, "kl_loss_6": 1293.2356872558594, "learning_rate": 0.000797730162226344, "loss": 991.3312, "step": 3040 }, { "ce_loss_12": 3.1359513640403747, "ce_loss_17": 2.9954004645347596, "ce_loss_23": 2.9337677240371702, "ce_loss_3": 3.989997386932373, "ce_loss_6": 3.5289689540863036, "epoch": 0.305, "grad_norm": 908.0, "kl_loss_12": 412.1574142456055, "kl_loss_17": 116.06979484558106, "kl_loss_3": 2259.043005371094, "kl_loss_6": 1315.9446960449218, "learning_rate": 0.0007964539645273203, "loss": 1015.7215, "step": 3050 }, { "ce_loss_12": 3.1464040160179136, "ce_loss_17": 3.0107370972633363, "ce_loss_23": 2.9521604895591738, "ce_loss_3": 3.9734048008918763, "ce_loss_6": 3.518921947479248, "epoch": 0.306, "grad_norm": 1192.0, "kl_loss_12": 399.71173400878905, "kl_loss_17": 107.98187713623047, "kl_loss_3": 2186.635870361328, "kl_loss_6": 1261.9838256835938, "learning_rate": 0.000795174781540165, "loss": 1006.6209, "step": 3060 }, { "ce_loss_12": 3.2190658688545226, "ce_loss_17": 3.0822181344032287, "ce_loss_23": 3.0252692461013795, "ce_loss_3": 4.028954362869262, "ce_loss_6": 3.583014357089996, "epoch": 0.307, "grad_norm": 1520.0, "kl_loss_12": 402.89805297851564, "kl_loss_17": 109.53241271972657, "kl_loss_3": 2168.199774169922, "kl_loss_6": 1259.1837097167968, "learning_rate": 0.0007938926261462366, "loss": 1009.859, "step": 3070 }, { "ce_loss_12": 3.1720401525497435, "ce_loss_17": 3.0377658724784853, "ce_loss_23": 2.975878930091858, "ce_loss_3": 3.985373318195343, "ce_loss_6": 3.5269261479377745, "epoch": 0.308, "grad_norm": 1296.0, "kl_loss_12": 408.71085662841796, "kl_loss_17": 110.68863906860352, "kl_loss_3": 2193.952880859375, "kl_loss_6": 1263.3701293945312, "learning_rate": 0.0007926075112568258, "loss": 1018.659, "step": 3080 }, { "ce_loss_12": 3.166626286506653, "ce_loss_17": 3.029184174537659, "ce_loss_23": 2.9667676091194153, "ce_loss_3": 4.016194677352905, "ce_loss_6": 3.544252622127533, "epoch": 0.309, "grad_norm": 908.0, "kl_loss_12": 411.1470260620117, "kl_loss_17": 113.14122123718262, "kl_loss_3": 2241.7541931152346, "kl_loss_6": 1290.6083190917968, "learning_rate": 0.0007913194498130252, "loss": 998.1091, "step": 3090 }, { "ce_loss_12": 3.1077103972435, "ce_loss_17": 2.9612428307533265, "ce_loss_23": 2.899013900756836, "ce_loss_3": 3.96414829492569, "ce_loss_6": 3.490557086467743, "epoch": 0.31, "grad_norm": 1176.0, "kl_loss_12": 415.7887680053711, "kl_loss_17": 116.05563888549804, "kl_loss_3": 2242.3578491210938, "kl_loss_6": 1294.021533203125, "learning_rate": 0.0007900284547855992, "loss": 1022.7701, "step": 3100 }, { "ce_loss_12": 3.1130651116371153, "ce_loss_17": 2.9763598322868345, "ce_loss_23": 2.9147239446640016, "ce_loss_3": 3.942672574520111, "ce_loss_6": 3.481655478477478, "epoch": 0.311, "grad_norm": 1136.0, "kl_loss_12": 406.75770263671876, "kl_loss_17": 111.87238311767578, "kl_loss_3": 2207.929296875, "kl_loss_6": 1275.2953369140625, "learning_rate": 0.0007887345391748532, "loss": 1026.1137, "step": 3110 }, { "ce_loss_12": 3.2158465147018434, "ce_loss_17": 3.081123912334442, "ce_loss_23": 3.02043753862381, "ce_loss_3": 4.023183381557464, "ce_loss_6": 3.572895038127899, "epoch": 0.312, "grad_norm": 1136.0, "kl_loss_12": 398.6086669921875, "kl_loss_17": 112.47258186340332, "kl_loss_3": 2146.368359375, "kl_loss_6": 1239.7026672363281, "learning_rate": 0.0007874377160105036, "loss": 972.8377, "step": 3120 }, { "ce_loss_12": 3.127926063537598, "ce_loss_17": 2.9957722187042237, "ce_loss_23": 2.934756410121918, "ce_loss_3": 4.002281177043915, "ce_loss_6": 3.5005795121192933, "epoch": 0.313, "grad_norm": 912.0, "kl_loss_12": 396.9777374267578, "kl_loss_17": 111.40082015991212, "kl_loss_3": 2294.8144287109376, "kl_loss_6": 1272.8683288574218, "learning_rate": 0.0007861379983515449, "loss": 1040.7336, "step": 3130 }, { "ce_loss_12": 3.2019168853759767, "ce_loss_17": 3.067536175251007, "ce_loss_23": 3.005921757221222, "ce_loss_3": 4.045781588554382, "ce_loss_6": 3.590694487094879, "epoch": 0.314, "grad_norm": 1004.0, "kl_loss_12": 407.66050109863284, "kl_loss_17": 112.10332107543945, "kl_loss_3": 2232.9395751953125, "kl_loss_6": 1292.6397766113282, "learning_rate": 0.0007848353992861195, "loss": 1001.023, "step": 3140 }, { "ce_loss_12": 3.2831211566925047, "ce_loss_17": 3.1478240966796873, "ce_loss_23": 3.074808359146118, "ce_loss_3": 4.130173242092132, "ce_loss_6": 3.669738221168518, "epoch": 0.315, "grad_norm": 1056.0, "kl_loss_12": 420.8632064819336, "kl_loss_17": 134.64935264587402, "kl_loss_3": 2235.048028564453, "kl_loss_6": 1306.2391418457032, "learning_rate": 0.0007835299319313853, "loss": 1029.2203, "step": 3150 }, { "ce_loss_12": 3.1660799860954283, "ce_loss_17": 3.03582626581192, "ce_loss_23": 2.974584233760834, "ce_loss_3": 3.981108212471008, "ce_loss_6": 3.528446316719055, "epoch": 0.316, "grad_norm": 1144.0, "kl_loss_12": 403.32799835205077, "kl_loss_17": 116.29181365966797, "kl_loss_3": 2173.312121582031, "kl_loss_6": 1250.9722595214844, "learning_rate": 0.0007822216094333848, "loss": 1024.1848, "step": 3160 }, { "ce_loss_12": 3.18140105009079, "ce_loss_17": 3.042064738273621, "ce_loss_23": 2.979595422744751, "ce_loss_3": 4.035410952568054, "ce_loss_6": 3.564517879486084, "epoch": 0.317, "grad_norm": 848.0, "kl_loss_12": 412.0792602539062, "kl_loss_17": 119.08126792907714, "kl_loss_3": 2249.047265625, "kl_loss_6": 1287.4177551269531, "learning_rate": 0.0007809104449669101, "loss": 1006.7281, "step": 3170 }, { "ce_loss_12": 3.1243847727775576, "ce_loss_17": 2.987361800670624, "ce_loss_23": 2.92877117395401, "ce_loss_3": 3.943815493583679, "ce_loss_6": 3.4851326823234556, "epoch": 0.318, "grad_norm": 1008.0, "kl_loss_12": 405.4247024536133, "kl_loss_17": 112.87823867797852, "kl_loss_3": 2179.9481811523438, "kl_loss_6": 1255.689697265625, "learning_rate": 0.0007795964517353734, "loss": 994.6248, "step": 3180 }, { "ce_loss_12": 3.126452827453613, "ce_loss_17": 2.989318549633026, "ce_loss_23": 2.9306680083274843, "ce_loss_3": 3.9847754001617433, "ce_loss_6": 3.5115119576454163, "epoch": 0.319, "grad_norm": 1104.0, "kl_loss_12": 416.9341049194336, "kl_loss_17": 116.0937271118164, "kl_loss_3": 2259.7885925292967, "kl_loss_6": 1294.2023132324218, "learning_rate": 0.000778279642970672, "loss": 991.9023, "step": 3190 }, { "ce_loss_12": 3.130779039859772, "ce_loss_17": 2.993109381198883, "ce_loss_23": 2.9349839210510256, "ce_loss_3": 3.9459033608436584, "ce_loss_6": 3.4922481060028074, "epoch": 0.32, "grad_norm": 1144.0, "kl_loss_12": 412.4380615234375, "kl_loss_17": 111.97399520874023, "kl_loss_3": 2188.5126892089843, "kl_loss_6": 1262.0484252929687, "learning_rate": 0.0007769600319330552, "loss": 982.9381, "step": 3200 }, { "ce_loss_12": 3.1565820813179015, "ce_loss_17": 3.0179872632026674, "ce_loss_23": 2.9545223474502564, "ce_loss_3": 4.0231255173683165, "ce_loss_6": 3.5447113275527955, "epoch": 0.321, "grad_norm": 1384.0, "kl_loss_12": 412.7830841064453, "kl_loss_17": 114.25502891540528, "kl_loss_3": 2265.739636230469, "kl_loss_6": 1294.0114990234374, "learning_rate": 0.0007756376319109917, "loss": 1008.1775, "step": 3210 }, { "ce_loss_12": 3.1978549718856812, "ce_loss_17": 3.056639587879181, "ce_loss_23": 2.9975166797637938, "ce_loss_3": 4.011419260501862, "ce_loss_6": 3.5697445034980775, "epoch": 0.322, "grad_norm": 980.0, "kl_loss_12": 413.4813034057617, "kl_loss_17": 111.76018180847169, "kl_loss_3": 2179.5991760253905, "kl_loss_6": 1270.4315612792968, "learning_rate": 0.0007743124562210351, "loss": 977.0854, "step": 3220 }, { "ce_loss_12": 3.214629316329956, "ce_loss_17": 3.075866627693176, "ce_loss_23": 3.0133888483047486, "ce_loss_3": 4.0257228255271915, "ce_loss_6": 3.5786707997322083, "epoch": 0.323, "grad_norm": 1160.0, "kl_loss_12": 407.91126861572263, "kl_loss_17": 111.01350746154785, "kl_loss_3": 2191.774792480469, "kl_loss_6": 1270.8741333007813, "learning_rate": 0.0007729845182076895, "loss": 997.8597, "step": 3230 }, { "ce_loss_12": 3.139386999607086, "ce_loss_17": 3.0055894374847414, "ce_loss_23": 2.949425721168518, "ce_loss_3": 3.9491932153701783, "ce_loss_6": 3.50347980260849, "epoch": 0.324, "grad_norm": 1216.0, "kl_loss_12": 401.8107543945313, "kl_loss_17": 105.48669738769532, "kl_loss_3": 2156.247454833984, "kl_loss_6": 1246.0010620117187, "learning_rate": 0.0007716538312432765, "loss": 1005.3651, "step": 3240 }, { "ce_loss_12": 3.1097494840621946, "ce_loss_17": 2.9684781432151794, "ce_loss_23": 2.9068726897239685, "ce_loss_3": 3.9634464383125305, "ce_loss_6": 3.4876855969429017, "epoch": 0.325, "grad_norm": 944.0, "kl_loss_12": 412.408186340332, "kl_loss_17": 113.86479721069335, "kl_loss_3": 2249.631207275391, "kl_loss_6": 1285.4653747558593, "learning_rate": 0.0007703204087277988, "loss": 1014.0202, "step": 3250 }, { "ce_loss_12": 3.195712912082672, "ce_loss_17": 3.066792869567871, "ce_loss_23": 3.0043872356414796, "ce_loss_3": 3.9889792919158937, "ce_loss_6": 3.5458029985427855, "epoch": 0.326, "grad_norm": 1232.0, "kl_loss_12": 389.23892669677736, "kl_loss_17": 119.9925926208496, "kl_loss_3": 2122.7849975585937, "kl_loss_6": 1216.2618896484375, "learning_rate": 0.0007689842640888063, "loss": 977.3232, "step": 3260 }, { "ce_loss_12": 3.1939032673835754, "ce_loss_17": 3.067763018608093, "ce_loss_23": 2.9996448755264282, "ce_loss_3": 4.016518783569336, "ce_loss_6": 3.563250422477722, "epoch": 0.327, "grad_norm": 1096.0, "kl_loss_12": 402.7244369506836, "kl_loss_17": 128.78552703857423, "kl_loss_3": 2164.721325683594, "kl_loss_6": 1253.0573181152345, "learning_rate": 0.0007676454107812607, "loss": 996.1429, "step": 3270 }, { "ce_loss_12": 3.135709273815155, "ce_loss_17": 3.0018914699554444, "ce_loss_23": 2.9393561482429504, "ce_loss_3": 3.9782461881637574, "ce_loss_6": 3.510867726802826, "epoch": 0.328, "grad_norm": 1224.0, "kl_loss_12": 409.4145446777344, "kl_loss_17": 121.70905494689941, "kl_loss_3": 2225.8669372558593, "kl_loss_6": 1269.7090148925781, "learning_rate": 0.0007663038622873999, "loss": 993.0643, "step": 3280 }, { "ce_loss_12": 3.175209379196167, "ce_loss_17": 3.0521249651908873, "ce_loss_23": 2.9861778616905212, "ce_loss_3": 4.006668412685395, "ce_loss_6": 3.546959364414215, "epoch": 0.329, "grad_norm": 1040.0, "kl_loss_12": 399.50186309814455, "kl_loss_17": 125.75395736694335, "kl_loss_3": 2192.2456604003905, "kl_loss_6": 1260.66640625, "learning_rate": 0.0007649596321166025, "loss": 980.2414, "step": 3290 }, { "ce_loss_12": 3.0792182087898254, "ce_loss_17": 2.95730984210968, "ce_loss_23": 2.8904613494873046, "ce_loss_3": 3.8960200428962706, "ce_loss_6": 3.4497509241104125, "epoch": 0.33, "grad_norm": 1128.0, "kl_loss_12": 392.9219116210937, "kl_loss_17": 117.4855842590332, "kl_loss_3": 2138.7407165527343, "kl_loss_6": 1245.0063598632812, "learning_rate": 0.0007636127338052513, "loss": 985.1443, "step": 3300 }, { "ce_loss_12": 3.1815504312515257, "ce_loss_17": 3.0464391469955445, "ce_loss_23": 2.982821536064148, "ce_loss_3": 4.0343057513237, "ce_loss_6": 3.563375544548035, "epoch": 0.331, "grad_norm": 848.0, "kl_loss_12": 407.66025695800784, "kl_loss_17": 122.59229011535645, "kl_loss_3": 2252.943640136719, "kl_loss_6": 1288.318182373047, "learning_rate": 0.0007622631809165971, "loss": 996.1619, "step": 3310 }, { "ce_loss_12": 3.1675572395324707, "ce_loss_17": 3.047197496891022, "ce_loss_23": 2.9831395506858827, "ce_loss_3": 3.9600903511047365, "ce_loss_6": 3.5189494490623474, "epoch": 0.332, "grad_norm": 1160.0, "kl_loss_12": 377.3573394775391, "kl_loss_17": 111.5566593170166, "kl_loss_3": 2082.12412109375, "kl_loss_6": 1190.3214294433594, "learning_rate": 0.000760910987040623, "loss": 962.3131, "step": 3320 }, { "ce_loss_12": 3.166386032104492, "ce_loss_17": 3.025036633014679, "ce_loss_23": 2.9654740929603576, "ce_loss_3": 4.026007843017578, "ce_loss_6": 3.5486589670181274, "epoch": 0.333, "grad_norm": 908.0, "kl_loss_12": 409.37377166748047, "kl_loss_17": 114.46134300231934, "kl_loss_3": 2264.3786010742188, "kl_loss_6": 1299.5189331054687, "learning_rate": 0.000759556165793906, "loss": 991.0748, "step": 3330 }, { "ce_loss_12": 3.175990545749664, "ce_loss_17": 3.0406521558761597, "ce_loss_23": 2.9793988823890687, "ce_loss_3": 4.003675639629364, "ce_loss_6": 3.5533421277999877, "epoch": 0.334, "grad_norm": 728.0, "kl_loss_12": 401.5468505859375, "kl_loss_17": 109.53638153076172, "kl_loss_3": 2201.6816833496096, "kl_loss_6": 1274.5880859375, "learning_rate": 0.000758198730819481, "loss": 1004.2113, "step": 3340 }, { "ce_loss_12": 3.136075222492218, "ce_loss_17": 3.002777171134949, "ce_loss_23": 2.947541129589081, "ce_loss_3": 3.9716083288192747, "ce_loss_6": 3.5095011830329894, "epoch": 0.335, "grad_norm": 1032.0, "kl_loss_12": 391.4615737915039, "kl_loss_17": 104.82349853515625, "kl_loss_3": 2192.2221618652343, "kl_loss_6": 1263.1376037597656, "learning_rate": 0.0007568386957867032, "loss": 988.185, "step": 3350 }, { "ce_loss_12": 3.1859456062316895, "ce_loss_17": 3.051733136177063, "ce_loss_23": 2.990495729446411, "ce_loss_3": 4.007290017604828, "ce_loss_6": 3.5615636348724364, "epoch": 0.336, "grad_norm": 1152.0, "kl_loss_12": 401.7861862182617, "kl_loss_17": 107.89192390441895, "kl_loss_3": 2179.39140625, "kl_loss_6": 1257.9797607421874, "learning_rate": 0.0007554760743911103, "loss": 996.4357, "step": 3360 }, { "ce_loss_12": 3.113254189491272, "ce_loss_17": 2.98225884437561, "ce_loss_23": 2.9265295028686524, "ce_loss_3": 3.9415154337882994, "ce_loss_6": 3.487048614025116, "epoch": 0.337, "grad_norm": 1056.0, "kl_loss_12": 390.0379333496094, "kl_loss_17": 104.20697250366212, "kl_loss_3": 2187.164660644531, "kl_loss_6": 1248.6148010253905, "learning_rate": 0.0007541108803542846, "loss": 1012.4851, "step": 3370 }, { "ce_loss_12": 3.148558294773102, "ce_loss_17": 3.0167269706726074, "ce_loss_23": 2.960436737537384, "ce_loss_3": 3.965547299385071, "ce_loss_6": 3.5091021418571473, "epoch": 0.338, "grad_norm": 1216.0, "kl_loss_12": 395.17091369628906, "kl_loss_17": 104.0645637512207, "kl_loss_3": 2196.504290771484, "kl_loss_6": 1249.5260681152345, "learning_rate": 0.0007527431274237149, "loss": 1036.4824, "step": 3380 }, { "ce_loss_12": 3.1180466532707216, "ce_loss_17": 2.9879327178001405, "ce_loss_23": 2.933281934261322, "ce_loss_3": 3.93823961019516, "ce_loss_6": 3.4857409834861754, "epoch": 0.339, "grad_norm": 1216.0, "kl_loss_12": 389.51560363769534, "kl_loss_17": 103.63929252624511, "kl_loss_3": 2158.0156433105467, "kl_loss_6": 1235.6661071777344, "learning_rate": 0.0007513728293726579, "loss": 980.855, "step": 3390 }, { "ce_loss_12": 3.2241431713104247, "ce_loss_17": 3.0904944062232973, "ce_loss_23": 3.03415367603302, "ce_loss_3": 4.0267979264259335, "ce_loss_6": 3.5803756713867188, "epoch": 0.34, "grad_norm": 1032.0, "kl_loss_12": 397.006591796875, "kl_loss_17": 105.09075622558593, "kl_loss_3": 2149.0771911621096, "kl_loss_6": 1240.8625610351562, "learning_rate": 0.00075, "loss": 970.1844, "step": 3400 }, { "ce_loss_12": 3.221427297592163, "ce_loss_17": 3.0799491167068482, "ce_loss_23": 3.0230050921440124, "ce_loss_3": 4.0491295337677, "ce_loss_6": 3.5963475704193115, "epoch": 0.341, "grad_norm": 1104.0, "kl_loss_12": 401.5384887695312, "kl_loss_17": 105.64668006896973, "kl_loss_3": 2198.474645996094, "kl_loss_6": 1263.7601989746095, "learning_rate": 0.0007486246531301177, "loss": 983.0185, "step": 3410 }, { "ce_loss_12": 3.0349387645721437, "ce_loss_17": 2.9008808970451354, "ce_loss_23": 2.846748244762421, "ce_loss_3": 3.8713913917541505, "ce_loss_6": 3.411065125465393, "epoch": 0.342, "grad_norm": 892.0, "kl_loss_12": 390.5525131225586, "kl_loss_17": 102.45408782958984, "kl_loss_3": 2174.7390075683593, "kl_loss_6": 1251.728338623047, "learning_rate": 0.0007472468026127384, "loss": 966.3946, "step": 3420 }, { "ce_loss_12": 3.18413405418396, "ce_loss_17": 3.043762445449829, "ce_loss_23": 2.980748248100281, "ce_loss_3": 4.039393293857574, "ce_loss_6": 3.5582253456115724, "epoch": 0.343, "grad_norm": 1200.0, "kl_loss_12": 418.48243255615233, "kl_loss_17": 110.33060188293457, "kl_loss_3": 2272.084197998047, "kl_loss_6": 1294.8700378417968, "learning_rate": 0.000745866462322802, "loss": 1008.09, "step": 3430 }, { "ce_loss_12": 3.154288816452026, "ce_loss_17": 3.0218498349189757, "ce_loss_23": 2.9685943603515623, "ce_loss_3": 3.9574212312698362, "ce_loss_6": 3.511477017402649, "epoch": 0.344, "grad_norm": 868.0, "kl_loss_12": 389.3679931640625, "kl_loss_17": 101.62167663574219, "kl_loss_3": 2132.1400817871095, "kl_loss_6": 1221.7820739746094, "learning_rate": 0.0007444836461603195, "loss": 972.9533, "step": 3440 }, { "ce_loss_12": 3.223675560951233, "ce_loss_17": 3.084799551963806, "ce_loss_23": 3.0252248644828796, "ce_loss_3": 4.05693109035492, "ce_loss_6": 3.6015345811843873, "epoch": 0.345, "grad_norm": 1020.0, "kl_loss_12": 418.050358581543, "kl_loss_17": 111.50422401428223, "kl_loss_3": 2230.067041015625, "kl_loss_6": 1304.375439453125, "learning_rate": 0.0007430983680502344, "loss": 1014.8229, "step": 3450 }, { "ce_loss_12": 3.069409763813019, "ce_loss_17": 2.930784726142883, "ce_loss_23": 2.873788070678711, "ce_loss_3": 3.9135555267333983, "ce_loss_6": 3.4463586807250977, "epoch": 0.346, "grad_norm": 972.0, "kl_loss_12": 400.5516082763672, "kl_loss_17": 104.2048439025879, "kl_loss_3": 2217.041046142578, "kl_loss_6": 1268.8075622558595, "learning_rate": 0.0007417106419422819, "loss": 995.1109, "step": 3460 }, { "ce_loss_12": 3.158154022693634, "ce_loss_17": 3.0228779196739195, "ce_loss_23": 2.966097557544708, "ce_loss_3": 3.970812749862671, "ce_loss_6": 3.5208982706069945, "epoch": 0.347, "grad_norm": 880.0, "kl_loss_12": 398.12904663085936, "kl_loss_17": 104.80950889587402, "kl_loss_3": 2142.826318359375, "kl_loss_6": 1236.2609802246093, "learning_rate": 0.0007403204818108486, "loss": 988.4357, "step": 3470 }, { "ce_loss_12": 3.1484235644340517, "ce_loss_17": 3.0128443241119385, "ce_loss_23": 2.954823875427246, "ce_loss_3": 3.9678571701049803, "ce_loss_6": 3.507073760032654, "epoch": 0.348, "grad_norm": 1136.0, "kl_loss_12": 403.04648742675784, "kl_loss_17": 105.83292236328126, "kl_loss_3": 2219.6487243652346, "kl_loss_6": 1265.0550537109375, "learning_rate": 0.0007389279016548316, "loss": 964.9225, "step": 3480 }, { "ce_loss_12": 3.15326087474823, "ce_loss_17": 3.01158367395401, "ce_loss_23": 2.9500399947166445, "ce_loss_3": 4.019190549850464, "ce_loss_6": 3.534512388706207, "epoch": 0.349, "grad_norm": 1216.0, "kl_loss_12": 410.7277206420898, "kl_loss_17": 107.77377014160156, "kl_loss_3": 2273.4454956054688, "kl_loss_6": 1284.2046264648438, "learning_rate": 0.0007375329154974975, "loss": 1005.2963, "step": 3490 }, { "ce_loss_12": 3.1056708455085755, "ce_loss_17": 2.9722748041152953, "ce_loss_23": 2.9173484206199647, "ce_loss_3": 3.9198298692703246, "ce_loss_6": 3.4674972891807556, "epoch": 0.35, "grad_norm": 1192.0, "kl_loss_12": 387.7169982910156, "kl_loss_17": 102.49424819946289, "kl_loss_3": 2145.902227783203, "kl_loss_6": 1230.2499633789062, "learning_rate": 0.0007361355373863414, "loss": 993.6381, "step": 3500 }, { "ce_loss_12": 3.152825403213501, "ce_loss_17": 3.017298102378845, "ce_loss_23": 2.9622105717658997, "ce_loss_3": 3.956553077697754, "ce_loss_6": 3.513141703605652, "epoch": 0.351, "grad_norm": 1144.0, "kl_loss_12": 388.43240814208986, "kl_loss_17": 102.02892303466797, "kl_loss_3": 2127.050341796875, "kl_loss_6": 1218.60732421875, "learning_rate": 0.0007347357813929454, "loss": 989.0248, "step": 3510 }, { "ce_loss_12": 3.1057863593101502, "ce_loss_17": 2.977063238620758, "ce_loss_23": 2.92062748670578, "ce_loss_3": 3.9094584822654723, "ce_loss_6": 3.465732955932617, "epoch": 0.352, "grad_norm": 1472.0, "kl_loss_12": 384.3619110107422, "kl_loss_17": 102.11101379394532, "kl_loss_3": 2122.9948669433593, "kl_loss_6": 1216.6124328613282, "learning_rate": 0.0007333336616128369, "loss": 986.3984, "step": 3520 }, { "ce_loss_12": 3.0813703656196596, "ce_loss_17": 2.9437156200408934, "ce_loss_23": 2.886933445930481, "ce_loss_3": 3.9318979501724245, "ce_loss_6": 3.4582404851913453, "epoch": 0.353, "grad_norm": 1184.0, "kl_loss_12": 403.52349243164065, "kl_loss_17": 104.87230758666992, "kl_loss_3": 2217.4830932617188, "kl_loss_6": 1260.3309204101563, "learning_rate": 0.0007319291921653463, "loss": 993.8051, "step": 3530 }, { "ce_loss_12": 3.168711709976196, "ce_loss_17": 3.026899981498718, "ce_loss_23": 2.9669880628585816, "ce_loss_3": 4.009432983398438, "ce_loss_6": 3.540644121170044, "epoch": 0.354, "grad_norm": 1264.0, "kl_loss_12": 407.08264617919923, "kl_loss_17": 106.93651161193847, "kl_loss_3": 2216.938299560547, "kl_loss_6": 1263.7690002441407, "learning_rate": 0.0007305223871934656, "loss": 982.6785, "step": 3540 }, { "ce_loss_12": 3.130829155445099, "ce_loss_17": 2.995125424861908, "ce_loss_23": 2.9383193969726564, "ce_loss_3": 3.943132388591766, "ce_loss_6": 3.4956005454063415, "epoch": 0.355, "grad_norm": 1040.0, "kl_loss_12": 396.400700378418, "kl_loss_17": 103.8346160888672, "kl_loss_3": 2159.5034057617186, "kl_loss_6": 1242.3389953613282, "learning_rate": 0.0007291132608637052, "loss": 979.3562, "step": 3550 }, { "ce_loss_12": 3.1063939929008484, "ce_loss_17": 2.9719778656959535, "ce_loss_23": 2.917952299118042, "ce_loss_3": 4.00426549911499, "ce_loss_6": 3.5009910464286804, "epoch": 0.356, "grad_norm": 964.0, "kl_loss_12": 387.28465576171874, "kl_loss_17": 100.82154541015625, "kl_loss_3": 2316.0281677246094, "kl_loss_6": 1301.2512268066407, "learning_rate": 0.0007277018273659516, "loss": 1012.7887, "step": 3560 }, { "ce_loss_12": 3.2234113931655886, "ce_loss_17": 3.083226215839386, "ce_loss_23": 3.0215823650360107, "ce_loss_3": 4.039709770679474, "ce_loss_6": 3.591693603992462, "epoch": 0.357, "grad_norm": 1408.0, "kl_loss_12": 412.0844360351563, "kl_loss_17": 109.77905807495117, "kl_loss_3": 2195.4359375, "kl_loss_6": 1277.4073791503906, "learning_rate": 0.0007262881009133242, "loss": 989.1793, "step": 3570 }, { "ce_loss_12": 3.1356869101524354, "ce_loss_17": 3.0047407031059263, "ce_loss_23": 2.951461207866669, "ce_loss_3": 3.9500752568244932, "ce_loss_6": 3.49868620634079, "epoch": 0.358, "grad_norm": 1384.0, "kl_loss_12": 387.04153594970705, "kl_loss_17": 100.57007904052735, "kl_loss_3": 2136.6033447265627, "kl_loss_6": 1228.8325775146484, "learning_rate": 0.0007248720957420329, "loss": 961.4793, "step": 3580 }, { "ce_loss_12": 3.1339301228523255, "ce_loss_17": 3.0050941705703735, "ce_loss_23": 2.952645552158356, "ce_loss_3": 3.929820716381073, "ce_loss_6": 3.487267017364502, "epoch": 0.359, "grad_norm": 1048.0, "kl_loss_12": 381.19686431884764, "kl_loss_17": 100.02181625366211, "kl_loss_3": 2124.4488525390625, "kl_loss_6": 1210.181591796875, "learning_rate": 0.0007234538261112341, "loss": 985.0872, "step": 3590 }, { "ce_loss_12": 3.1808794498443604, "ce_loss_17": 3.0438300013542174, "ce_loss_23": 2.9860937118530275, "ce_loss_3": 4.011133432388306, "ce_loss_6": 3.5475757718086243, "epoch": 0.36, "grad_norm": 804.0, "kl_loss_12": 395.88465728759763, "kl_loss_17": 103.27263717651367, "kl_loss_3": 2172.810369873047, "kl_loss_6": 1242.1477661132812, "learning_rate": 0.0007220333063028871, "loss": 967.4588, "step": 3600 }, { "ce_loss_12": 3.221625530719757, "ce_loss_17": 3.082091951370239, "ce_loss_23": 3.0237446427345276, "ce_loss_3": 4.1428028583526615, "ce_loss_6": 3.6395716786384584, "epoch": 0.361, "grad_norm": 956.0, "kl_loss_12": 410.17210083007814, "kl_loss_17": 107.04776420593262, "kl_loss_3": 2412.4030822753907, "kl_loss_6": 1372.7015441894532, "learning_rate": 0.0007206105506216106, "loss": 1036.6594, "step": 3610 }, { "ce_loss_12": 3.0932498931884767, "ce_loss_17": 2.9613340497016907, "ce_loss_23": 2.9070735931396485, "ce_loss_3": 3.895069921016693, "ce_loss_6": 3.4558199644088745, "epoch": 0.362, "grad_norm": 1104.0, "kl_loss_12": 383.33282775878905, "kl_loss_17": 100.10506706237793, "kl_loss_3": 2114.540673828125, "kl_loss_6": 1220.074530029297, "learning_rate": 0.0007191855733945387, "loss": 949.5305, "step": 3620 }, { "ce_loss_12": 3.182743549346924, "ce_loss_17": 3.0513375759124757, "ce_loss_23": 2.9913695096969604, "ce_loss_3": 3.9969239115715025, "ce_loss_6": 3.5442299962043764, "epoch": 0.363, "grad_norm": 1248.0, "kl_loss_12": 384.91429290771487, "kl_loss_17": 101.22749214172363, "kl_loss_3": 2147.6256896972654, "kl_loss_6": 1226.1741882324218, "learning_rate": 0.0007177583889711762, "loss": 964.0354, "step": 3630 }, { "ce_loss_12": 3.1005003809928895, "ce_loss_17": 2.966002714633942, "ce_loss_23": 2.9100066781044007, "ce_loss_3": 3.9228270649909973, "ce_loss_6": 3.4714234709739684, "epoch": 0.364, "grad_norm": 972.0, "kl_loss_12": 394.1941940307617, "kl_loss_17": 102.08047103881836, "kl_loss_3": 2186.592639160156, "kl_loss_6": 1261.8254516601562, "learning_rate": 0.0007163290117232541, "loss": 981.3645, "step": 3640 }, { "ce_loss_12": 3.1968397974967955, "ce_loss_17": 3.0696872591972353, "ce_loss_23": 3.0185195207595825, "ce_loss_3": 3.967725968360901, "ce_loss_6": 3.5456802606582642, "epoch": 0.365, "grad_norm": 1320.0, "kl_loss_12": 380.22156524658203, "kl_loss_17": 99.77042198181152, "kl_loss_3": 2085.5939819335936, "kl_loss_6": 1208.4053955078125, "learning_rate": 0.0007148974560445859, "loss": 959.1965, "step": 3650 }, { "ce_loss_12": 3.129593849182129, "ce_loss_17": 2.9993977189064025, "ce_loss_23": 2.9464277625083923, "ce_loss_3": 3.928598666191101, "ce_loss_6": 3.4893518686294556, "epoch": 0.366, "grad_norm": 1016.0, "kl_loss_12": 382.19481658935547, "kl_loss_17": 98.97349815368652, "kl_loss_3": 2099.6594787597655, "kl_loss_6": 1206.849383544922, "learning_rate": 0.0007134637363509209, "loss": 948.0132, "step": 3660 }, { "ce_loss_12": 3.2401368737220766, "ce_loss_17": 3.1115305542945864, "ce_loss_23": 3.0598105430603026, "ce_loss_3": 4.030146300792694, "ce_loss_6": 3.593411421775818, "epoch": 0.367, "grad_norm": 1024.0, "kl_loss_12": 381.5959442138672, "kl_loss_17": 98.28689727783203, "kl_loss_3": 2094.195349121094, "kl_loss_6": 1206.679653930664, "learning_rate": 0.0007120278670798009, "loss": 964.0335, "step": 3670 }, { "ce_loss_12": 3.0648820638656615, "ce_loss_17": 2.91995313167572, "ce_loss_23": 2.863781201839447, "ce_loss_3": 3.9347721934318542, "ce_loss_6": 3.457567012310028, "epoch": 0.368, "grad_norm": 1280.0, "kl_loss_12": 407.0378677368164, "kl_loss_17": 102.9833999633789, "kl_loss_3": 2293.2660949707033, "kl_loss_6": 1310.9484741210938, "learning_rate": 0.0007105898626904133, "loss": 1019.8857, "step": 3680 }, { "ce_loss_12": 3.148142862319946, "ce_loss_17": 3.014118826389313, "ce_loss_23": 2.95814106464386, "ce_loss_3": 3.96722731590271, "ce_loss_6": 3.5096484780311585, "epoch": 0.369, "grad_norm": 1072.0, "kl_loss_12": 391.33440551757815, "kl_loss_17": 101.28652992248536, "kl_loss_3": 2147.1658264160155, "kl_loss_6": 1225.1833374023438, "learning_rate": 0.0007091497376634463, "loss": 960.4797, "step": 3690 }, { "ce_loss_12": 3.094625687599182, "ce_loss_17": 2.96570303440094, "ce_loss_23": 2.9103384971618653, "ce_loss_3": 3.9048052191734315, "ce_loss_6": 3.4554564237594603, "epoch": 0.37, "grad_norm": 1032.0, "kl_loss_12": 387.18639221191404, "kl_loss_17": 104.48425025939942, "kl_loss_3": 2125.8362426757812, "kl_loss_6": 1220.0202087402345, "learning_rate": 0.0007077075065009433, "loss": 983.7176, "step": 3700 }, { "ce_loss_12": 3.194843280315399, "ce_loss_17": 3.0574195265769957, "ce_loss_23": 2.999555218219757, "ce_loss_3": 4.023444163799286, "ce_loss_6": 3.5645259976387025, "epoch": 0.371, "grad_norm": 936.0, "kl_loss_12": 398.9264434814453, "kl_loss_17": 108.23813056945801, "kl_loss_3": 2180.979901123047, "kl_loss_6": 1252.6944702148437, "learning_rate": 0.0007062631837261557, "loss": 977.1026, "step": 3710 }, { "ce_loss_12": 3.0720986843109133, "ce_loss_17": 2.9433573603630068, "ce_loss_23": 2.8886937618255617, "ce_loss_3": 3.883027505874634, "ce_loss_6": 3.4352410793304444, "epoch": 0.372, "grad_norm": 1184.0, "kl_loss_12": 382.10267028808596, "kl_loss_17": 100.58302421569825, "kl_loss_3": 2137.9196533203126, "kl_loss_6": 1225.8107116699218, "learning_rate": 0.0007048167838833977, "loss": 986.0539, "step": 3720 }, { "ce_loss_12": 3.1614155411720275, "ce_loss_17": 3.027226221561432, "ce_loss_23": 2.9710015296936034, "ce_loss_3": 3.949637258052826, "ce_loss_6": 3.511720824241638, "epoch": 0.373, "grad_norm": 1344.0, "kl_loss_12": 385.9076919555664, "kl_loss_17": 103.10694885253906, "kl_loss_3": 2117.4567626953126, "kl_loss_6": 1209.1029052734375, "learning_rate": 0.0007033683215379002, "loss": 961.2854, "step": 3730 }, { "ce_loss_12": 3.1443222165107727, "ce_loss_17": 3.0109015107154846, "ce_loss_23": 2.957130324840546, "ce_loss_3": 3.9426589131355287, "ce_loss_6": 3.500744605064392, "epoch": 0.374, "grad_norm": 952.0, "kl_loss_12": 381.2915771484375, "kl_loss_17": 99.72256965637207, "kl_loss_3": 2122.5682678222656, "kl_loss_6": 1217.1044494628907, "learning_rate": 0.0007019178112756625, "loss": 974.4352, "step": 3740 }, { "ce_loss_12": 3.1132125854492188, "ce_loss_17": 2.9844059467315676, "ce_loss_23": 2.930648481845856, "ce_loss_3": 3.9250273585319517, "ce_loss_6": 3.4753824830055238, "epoch": 0.375, "grad_norm": 916.0, "kl_loss_12": 382.26414642333987, "kl_loss_17": 99.75811614990235, "kl_loss_3": 2113.0847900390627, "kl_loss_6": 1220.0052917480468, "learning_rate": 0.0007004652677033068, "loss": 969.0986, "step": 3750 }, { "ce_loss_12": 3.177477180957794, "ce_loss_17": 3.0574225068092344, "ce_loss_23": 3.007947289943695, "ce_loss_3": 3.964920425415039, "ce_loss_6": 3.5296141505241394, "epoch": 0.376, "grad_norm": 908.0, "kl_loss_12": 369.6541061401367, "kl_loss_17": 97.30980415344239, "kl_loss_3": 2076.9691162109375, "kl_loss_6": 1183.470895385742, "learning_rate": 0.0006990107054479312, "loss": 948.6415, "step": 3760 }, { "ce_loss_12": 3.164940822124481, "ce_loss_17": 3.033370578289032, "ce_loss_23": 2.9760269999504088, "ce_loss_3": 3.9709452509880068, "ce_loss_6": 3.528740930557251, "epoch": 0.377, "grad_norm": 1296.0, "kl_loss_12": 389.17967681884767, "kl_loss_17": 102.7863712310791, "kl_loss_3": 2143.7166748046875, "kl_loss_6": 1238.860577392578, "learning_rate": 0.000697554139156961, "loss": 972.5478, "step": 3770 }, { "ce_loss_12": 3.1600608825683594, "ce_loss_17": 3.0301866054534914, "ce_loss_23": 2.974429488182068, "ce_loss_3": 3.9714036107063295, "ce_loss_6": 3.5234702229499817, "epoch": 0.378, "grad_norm": 1096.0, "kl_loss_12": 394.4216369628906, "kl_loss_17": 103.2165542602539, "kl_loss_3": 2147.6901123046873, "kl_loss_6": 1238.7892700195312, "learning_rate": 0.0006960955834980027, "loss": 952.6748, "step": 3780 }, { "ce_loss_12": 3.127192723751068, "ce_loss_17": 2.997998797893524, "ce_loss_23": 2.9416239619255067, "ce_loss_3": 3.928993654251099, "ce_loss_6": 3.4925607323646544, "epoch": 0.379, "grad_norm": 1072.0, "kl_loss_12": 382.92139739990233, "kl_loss_17": 99.97526016235352, "kl_loss_3": 2108.2826599121095, "kl_loss_6": 1218.4925964355468, "learning_rate": 0.0006946350531586958, "loss": 957.6444, "step": 3790 }, { "ce_loss_12": 3.1599705696105955, "ce_loss_17": 3.0228681206703185, "ce_loss_23": 2.9705450177192687, "ce_loss_3": 3.9659302115440367, "ce_loss_6": 3.5193197250366213, "epoch": 0.38, "grad_norm": 900.0, "kl_loss_12": 384.08446350097654, "kl_loss_17": 98.96566047668458, "kl_loss_3": 2138.1342041015623, "kl_loss_6": 1222.6171569824219, "learning_rate": 0.0006931725628465643, "loss": 978.7352, "step": 3800 }, { "ce_loss_12": 3.164839673042297, "ce_loss_17": 3.0324612498283385, "ce_loss_23": 2.974766230583191, "ce_loss_3": 3.9797155141830443, "ce_loss_6": 3.528802454471588, "epoch": 0.381, "grad_norm": 1012.0, "kl_loss_12": 384.5140380859375, "kl_loss_17": 100.59394874572754, "kl_loss_3": 2130.6960021972654, "kl_loss_6": 1219.2356811523437, "learning_rate": 0.0006917081272888696, "loss": 963.9451, "step": 3810 }, { "ce_loss_12": 3.0861921548843383, "ce_loss_17": 2.952903139591217, "ce_loss_23": 2.897795629501343, "ce_loss_3": 3.943486976623535, "ce_loss_6": 3.455733525753021, "epoch": 0.382, "grad_norm": 1048.0, "kl_loss_12": 392.62345428466796, "kl_loss_17": 102.91732864379883, "kl_loss_3": 2244.1899475097657, "kl_loss_6": 1248.053887939453, "learning_rate": 0.0006902417612324615, "loss": 974.233, "step": 3820 }, { "ce_loss_12": 3.211138737201691, "ce_loss_17": 3.065601098537445, "ce_loss_23": 3.0089004755020143, "ce_loss_3": 4.046250092983246, "ce_loss_6": 3.582586634159088, "epoch": 0.383, "grad_norm": 968.0, "kl_loss_12": 410.6596969604492, "kl_loss_17": 106.09553604125976, "kl_loss_3": 2204.2128967285157, "kl_loss_6": 1265.9451049804688, "learning_rate": 0.00068877347944363, "loss": 985.0914, "step": 3830 }, { "ce_loss_12": 3.1957462310791014, "ce_loss_17": 3.0704895853996277, "ce_loss_23": 3.0156718850135804, "ce_loss_3": 3.9832470297813414, "ce_loss_6": 3.5446235060691835, "epoch": 0.384, "grad_norm": 1112.0, "kl_loss_12": 388.15425567626954, "kl_loss_17": 101.4664520263672, "kl_loss_3": 2101.9537353515625, "kl_loss_6": 1207.7802978515624, "learning_rate": 0.0006873032967079561, "loss": 970.1563, "step": 3840 }, { "ce_loss_12": 3.1780274152755736, "ce_loss_17": 3.0505093932151794, "ce_loss_23": 2.996346962451935, "ce_loss_3": 3.950271713733673, "ce_loss_6": 3.5170023918151854, "epoch": 0.385, "grad_norm": 952.0, "kl_loss_12": 382.70263824462893, "kl_loss_17": 100.49216232299804, "kl_loss_3": 2076.2669921875, "kl_loss_6": 1188.8818450927733, "learning_rate": 0.0006858312278301637, "loss": 942.3533, "step": 3850 }, { "ce_loss_12": 3.2134106755256653, "ce_loss_17": 3.085762679576874, "ce_loss_23": 3.035565769672394, "ce_loss_3": 3.979679024219513, "ce_loss_6": 3.5507980227470397, "epoch": 0.386, "grad_norm": 1208.0, "kl_loss_12": 382.0628173828125, "kl_loss_17": 100.69847984313965, "kl_loss_3": 2070.21513671875, "kl_loss_6": 1192.517266845703, "learning_rate": 0.0006843572876339704, "loss": 945.9352, "step": 3860 }, { "ce_loss_12": 3.1366180539131165, "ce_loss_17": 3.0096086144447325, "ce_loss_23": 2.9578014850616454, "ce_loss_3": 3.8984736084938048, "ce_loss_6": 3.4832229256629943, "epoch": 0.387, "grad_norm": 1048.0, "kl_loss_12": 374.30377197265625, "kl_loss_17": 97.24090919494628, "kl_loss_3": 2047.601544189453, "kl_loss_6": 1186.3757263183593, "learning_rate": 0.0006828814909619373, "loss": 972.7648, "step": 3870 }, { "ce_loss_12": 3.257560980319977, "ce_loss_17": 3.126918005943298, "ce_loss_23": 3.0692105889320374, "ce_loss_3": 4.041906023025513, "ce_loss_6": 3.5994970917701723, "epoch": 0.388, "grad_norm": 980.0, "kl_loss_12": 389.04183197021484, "kl_loss_17": 102.8961841583252, "kl_loss_3": 2084.578723144531, "kl_loss_6": 1191.1536590576172, "learning_rate": 0.0006814038526753205, "loss": 936.5445, "step": 3880 }, { "ce_loss_12": 3.1633546233177183, "ce_loss_17": 3.030035877227783, "ce_loss_23": 2.975773298740387, "ce_loss_3": 3.945274126529694, "ce_loss_6": 3.508275043964386, "epoch": 0.389, "grad_norm": 992.0, "kl_loss_12": 383.5569595336914, "kl_loss_17": 99.5365421295166, "kl_loss_3": 2082.189862060547, "kl_loss_6": 1198.423110961914, "learning_rate": 0.0006799243876539213, "loss": 948.3916, "step": 3890 }, { "ce_loss_12": 3.090026044845581, "ce_loss_17": 2.960541236400604, "ce_loss_23": 2.9071210980415345, "ce_loss_3": 3.9189464807510377, "ce_loss_6": 3.452379596233368, "epoch": 0.39, "grad_norm": 1568.0, "kl_loss_12": 384.93935241699216, "kl_loss_17": 99.81928863525391, "kl_loss_3": 2171.3757263183593, "kl_loss_6": 1219.7862731933594, "learning_rate": 0.0006784431107959359, "loss": 972.2284, "step": 3900 }, { "ce_loss_12": 3.150925076007843, "ce_loss_17": 3.0121824502944947, "ce_loss_23": 2.9566686391830443, "ce_loss_3": 3.9797996401786806, "ce_loss_6": 3.520882248878479, "epoch": 0.391, "grad_norm": 1112.0, "kl_loss_12": 393.58673706054685, "kl_loss_17": 101.93662071228027, "kl_loss_3": 2192.713427734375, "kl_loss_6": 1248.4283935546875, "learning_rate": 0.0006769600370178059, "loss": 965.526, "step": 3910 }, { "ce_loss_12": 3.111063516139984, "ce_loss_17": 2.980947160720825, "ce_loss_23": 2.925340807437897, "ce_loss_3": 3.9266233801841737, "ce_loss_6": 3.473023760318756, "epoch": 0.392, "grad_norm": 880.0, "kl_loss_12": 384.3804428100586, "kl_loss_17": 97.82368049621581, "kl_loss_3": 2136.1855529785157, "kl_loss_6": 1223.2701232910156, "learning_rate": 0.0006754751812540679, "loss": 943.424, "step": 3920 }, { "ce_loss_12": 3.162268328666687, "ce_loss_17": 3.02713885307312, "ce_loss_23": 2.971815264225006, "ce_loss_3": 3.9698534488677977, "ce_loss_6": 3.5212791562080383, "epoch": 0.393, "grad_norm": 1120.0, "kl_loss_12": 389.7220794677734, "kl_loss_17": 102.36843185424804, "kl_loss_3": 2160.457275390625, "kl_loss_6": 1239.3633728027344, "learning_rate": 0.0006739885584572025, "loss": 975.3045, "step": 3930 }, { "ce_loss_12": 3.1774211168289184, "ce_loss_17": 3.0457595705986025, "ce_loss_23": 2.989675688743591, "ce_loss_3": 4.002368414402008, "ce_loss_6": 3.538395082950592, "epoch": 0.394, "grad_norm": 1208.0, "kl_loss_12": 387.89739379882815, "kl_loss_17": 101.88522567749024, "kl_loss_3": 2200.5525756835937, "kl_loss_6": 1235.5010864257813, "learning_rate": 0.0006725001835974853, "loss": 958.2978, "step": 3940 }, { "ce_loss_12": 3.176982545852661, "ce_loss_17": 3.0421844482421876, "ce_loss_23": 2.986680102348328, "ce_loss_3": 3.984161686897278, "ce_loss_6": 3.5369669675827025, "epoch": 0.395, "grad_norm": 1256.0, "kl_loss_12": 395.7901412963867, "kl_loss_17": 102.56606636047363, "kl_loss_3": 2153.0941528320313, "kl_loss_6": 1235.6717407226563, "learning_rate": 0.0006710100716628344, "loss": 949.3996, "step": 3950 }, { "ce_loss_12": 3.163419556617737, "ce_loss_17": 3.0290326714515685, "ce_loss_23": 2.972023379802704, "ce_loss_3": 3.9705038547515867, "ce_loss_6": 3.530128002166748, "epoch": 0.396, "grad_norm": 1032.0, "kl_loss_12": 388.55382232666017, "kl_loss_17": 100.28390426635742, "kl_loss_3": 2136.8793762207033, "kl_loss_6": 1231.7751403808593, "learning_rate": 0.0006695182376586602, "loss": 969.2206, "step": 3960 }, { "ce_loss_12": 3.177538585662842, "ce_loss_17": 3.0523517608642576, "ce_loss_23": 2.9978424191474913, "ce_loss_3": 3.936390829086304, "ce_loss_6": 3.5100929021835325, "epoch": 0.397, "grad_norm": 1168.0, "kl_loss_12": 365.78756256103514, "kl_loss_17": 95.89293823242187, "kl_loss_3": 2016.74169921875, "kl_loss_6": 1147.0362548828125, "learning_rate": 0.000668024696607715, "loss": 951.3905, "step": 3970 }, { "ce_loss_12": 3.143988335132599, "ce_loss_17": 3.01651873588562, "ce_loss_23": 2.96545991897583, "ce_loss_3": 3.9358643174171446, "ce_loss_6": 3.496285951137543, "epoch": 0.398, "grad_norm": 1064.0, "kl_loss_12": 378.8801467895508, "kl_loss_17": 98.12645530700684, "kl_loss_3": 2110.0565856933595, "kl_loss_6": 1205.5675537109375, "learning_rate": 0.0006665294635499404, "loss": 949.7793, "step": 3980 }, { "ce_loss_12": 3.163826417922974, "ce_loss_17": 3.029632306098938, "ce_loss_23": 2.9687330961227416, "ce_loss_3": 3.9991076827049254, "ce_loss_6": 3.538382375240326, "epoch": 0.399, "grad_norm": 984.0, "kl_loss_12": 401.962060546875, "kl_loss_17": 106.9761344909668, "kl_loss_3": 2226.279400634766, "kl_loss_6": 1279.4245971679688, "learning_rate": 0.0006650325535423167, "loss": 979.8545, "step": 3990 }, { "ce_loss_12": 3.168172299861908, "ce_loss_17": 3.0481725573539733, "ce_loss_23": 2.990405547618866, "ce_loss_3": 3.929426848888397, "ce_loss_6": 3.5102165937423706, "epoch": 0.4, "grad_norm": 972.0, "kl_loss_12": 368.17052001953124, "kl_loss_17": 111.31200065612794, "kl_loss_3": 2006.9190979003906, "kl_loss_6": 1151.369775390625, "learning_rate": 0.0006635339816587109, "loss": 940.1266, "step": 4000 }, { "ce_loss_12": 3.114549422264099, "ce_loss_17": 2.989661431312561, "ce_loss_23": 2.9311137080192564, "ce_loss_3": 3.935007870197296, "ce_loss_6": 3.475666677951813, "epoch": 0.401, "grad_norm": 976.0, "kl_loss_12": 387.16961059570315, "kl_loss_17": 109.62789192199708, "kl_loss_3": 2164.3400817871093, "kl_loss_6": 1229.5659454345703, "learning_rate": 0.0006620337629897252, "loss": 960.3354, "step": 4010 }, { "ce_loss_12": 3.126537036895752, "ce_loss_17": 2.9946191906929016, "ce_loss_23": 2.936546230316162, "ce_loss_3": 3.928071904182434, "ce_loss_6": 3.478977394104004, "epoch": 0.402, "grad_norm": 916.0, "kl_loss_12": 387.9364669799805, "kl_loss_17": 106.62837104797363, "kl_loss_3": 2123.9172973632812, "kl_loss_6": 1210.9631713867188, "learning_rate": 0.0006605319126425454, "loss": 973.7484, "step": 4020 }, { "ce_loss_12": 3.041149842739105, "ce_loss_17": 2.909222424030304, "ce_loss_23": 2.8535015285015106, "ce_loss_3": 3.876503324508667, "ce_loss_6": 3.4090627312660216, "epoch": 0.403, "grad_norm": 1088.0, "kl_loss_12": 385.33301696777346, "kl_loss_17": 103.06710166931153, "kl_loss_3": 2199.996661376953, "kl_loss_6": 1249.3460327148437, "learning_rate": 0.0006590284457407876, "loss": 975.139, "step": 4030 }, { "ce_loss_12": 3.128358006477356, "ce_loss_17": 2.9978256225585938, "ce_loss_23": 2.9422669768333436, "ce_loss_3": 3.930923879146576, "ce_loss_6": 3.486605441570282, "epoch": 0.404, "grad_norm": 1216.0, "kl_loss_12": 383.995783996582, "kl_loss_17": 102.69088897705078, "kl_loss_3": 2117.4116149902343, "kl_loss_6": 1210.769482421875, "learning_rate": 0.0006575233774243465, "loss": 953.6563, "step": 4040 }, { "ce_loss_12": 3.1257184743881226, "ce_loss_17": 2.9924083948135376, "ce_loss_23": 2.936680054664612, "ce_loss_3": 3.937123966217041, "ce_loss_6": 3.488031816482544, "epoch": 0.405, "grad_norm": 1232.0, "kl_loss_12": 389.3181213378906, "kl_loss_17": 101.56726264953613, "kl_loss_3": 2144.496929931641, "kl_loss_6": 1222.655938720703, "learning_rate": 0.0006560167228492435, "loss": 958.7957, "step": 4050 }, { "ce_loss_12": 3.1630433559417725, "ce_loss_17": 3.0304596543312075, "ce_loss_23": 2.980211853981018, "ce_loss_3": 3.93538019657135, "ce_loss_6": 3.5052495121955873, "epoch": 0.406, "grad_norm": 1168.0, "kl_loss_12": 370.5975006103516, "kl_loss_17": 96.3271987915039, "kl_loss_3": 2065.861767578125, "kl_loss_6": 1190.795327758789, "learning_rate": 0.0006545084971874737, "loss": 951.2504, "step": 4060 }, { "ce_loss_12": 3.1312262296676634, "ce_loss_17": 2.9931742548942566, "ce_loss_23": 2.9370701909065247, "ce_loss_3": 3.95162593126297, "ce_loss_6": 3.501697599887848, "epoch": 0.407, "grad_norm": 884.0, "kl_loss_12": 396.94751892089846, "kl_loss_17": 103.59187126159668, "kl_loss_3": 2175.806359863281, "kl_loss_6": 1249.0946655273438, "learning_rate": 0.0006529987156268526, "loss": 953.4985, "step": 4070 }, { "ce_loss_12": 3.0533276081085203, "ce_loss_17": 2.915659213066101, "ce_loss_23": 2.8591691732406614, "ce_loss_3": 3.8715276956558227, "ce_loss_6": 3.4168421745300295, "epoch": 0.408, "grad_norm": 968.0, "kl_loss_12": 383.2756820678711, "kl_loss_17": 99.68055114746093, "kl_loss_3": 2138.8466552734376, "kl_loss_6": 1219.7853637695312, "learning_rate": 0.0006514873933708637, "loss": 980.2502, "step": 4080 }, { "ce_loss_12": 3.1608598113059996, "ce_loss_17": 3.0313393831253053, "ce_loss_23": 2.977697968482971, "ce_loss_3": 3.956602382659912, "ce_loss_6": 3.5155387043952944, "epoch": 0.409, "grad_norm": 1120.0, "kl_loss_12": 375.5689437866211, "kl_loss_17": 96.74662399291992, "kl_loss_3": 2107.760119628906, "kl_loss_6": 1192.8095153808595, "learning_rate": 0.0006499745456385053, "loss": 942.5886, "step": 4090 }, { "ce_loss_12": 3.1293053030967712, "ce_loss_17": 2.9983129143714904, "ce_loss_23": 2.941512739658356, "ce_loss_3": 3.9319061040878296, "ce_loss_6": 3.484344244003296, "epoch": 0.41, "grad_norm": 1144.0, "kl_loss_12": 382.61876373291017, "kl_loss_17": 99.34859237670898, "kl_loss_3": 2114.9715087890627, "kl_loss_6": 1213.1263122558594, "learning_rate": 0.0006484601876641375, "loss": 961.2418, "step": 4100 }, { "ce_loss_12": 3.1114142775535583, "ce_loss_17": 2.9875324964523315, "ce_loss_23": 2.935147023200989, "ce_loss_3": 3.874464249610901, "ce_loss_6": 3.456193673610687, "epoch": 0.411, "grad_norm": 988.0, "kl_loss_12": 372.14844207763673, "kl_loss_17": 97.36947631835938, "kl_loss_3": 2045.9664733886718, "kl_loss_6": 1174.0520263671874, "learning_rate": 0.000646944334697328, "loss": 932.9962, "step": 4110 }, { "ce_loss_12": 3.220192623138428, "ce_loss_17": 3.0896286606788634, "ce_loss_23": 3.037607753276825, "ce_loss_3": 3.9769805669784546, "ce_loss_6": 3.554104244709015, "epoch": 0.412, "grad_norm": 1160.0, "kl_loss_12": 372.82581939697263, "kl_loss_17": 97.23840293884277, "kl_loss_3": 2013.3982482910155, "kl_loss_6": 1167.341049194336, "learning_rate": 0.0006454270020026995, "loss": 917.0743, "step": 4120 }, { "ce_loss_12": 3.185134708881378, "ce_loss_17": 3.0607090830802917, "ce_loss_23": 3.0103032112121584, "ce_loss_3": 3.940109670162201, "ce_loss_6": 3.519068014621735, "epoch": 0.413, "grad_norm": 784.0, "kl_loss_12": 364.3710479736328, "kl_loss_17": 93.57289505004883, "kl_loss_3": 2003.3096008300781, "kl_loss_6": 1152.3987396240234, "learning_rate": 0.0006439082048597755, "loss": 914.7779, "step": 4130 }, { "ce_loss_12": 3.177386391162872, "ce_loss_17": 3.0462804079055785, "ce_loss_23": 2.994649851322174, "ce_loss_3": 3.9677168011665342, "ce_loss_6": 3.5374234557151794, "epoch": 0.414, "grad_norm": 1064.0, "kl_loss_12": 381.20457153320314, "kl_loss_17": 96.80270919799804, "kl_loss_3": 2106.9815979003906, "kl_loss_6": 1217.2957641601563, "learning_rate": 0.0006423879585628261, "loss": 950.1129, "step": 4140 }, { "ce_loss_12": 3.1385213136672974, "ce_loss_17": 3.006602096557617, "ce_loss_23": 2.9507169365882873, "ce_loss_3": 3.9623284220695494, "ce_loss_6": 3.5014745593070984, "epoch": 0.415, "grad_norm": 968.0, "kl_loss_12": 391.1500961303711, "kl_loss_17": 100.56959648132325, "kl_loss_3": 2171.917236328125, "kl_loss_6": 1231.1592224121093, "learning_rate": 0.0006408662784207149, "loss": 967.7506, "step": 4150 }, { "ce_loss_12": 3.1071489095687865, "ce_loss_17": 2.980406606197357, "ce_loss_23": 2.9294055104255676, "ce_loss_3": 3.9036468267440796, "ce_loss_6": 3.4644244074821473, "epoch": 0.416, "grad_norm": 1200.0, "kl_loss_12": 374.96314697265626, "kl_loss_17": 96.86763343811035, "kl_loss_3": 2105.97939453125, "kl_loss_6": 1211.642266845703, "learning_rate": 0.0006393431797567439, "loss": 948.6432, "step": 4160 }, { "ce_loss_12": 3.1838504314422607, "ce_loss_17": 3.0583144307136534, "ce_loss_23": 3.005660831928253, "ce_loss_3": 3.9278244495391847, "ce_loss_6": 3.5054065704345705, "epoch": 0.417, "grad_norm": 972.0, "kl_loss_12": 371.84607696533203, "kl_loss_17": 97.47408409118653, "kl_loss_3": 2020.6477111816407, "kl_loss_6": 1155.8765777587892, "learning_rate": 0.0006378186779084996, "loss": 905.9456, "step": 4170 }, { "ce_loss_12": 3.020306038856506, "ce_loss_17": 2.8910472273826597, "ce_loss_23": 2.8401397347450255, "ce_loss_3": 3.8421530604362486, "ce_loss_6": 3.3880983710289003, "epoch": 0.418, "grad_norm": 1040.0, "kl_loss_12": 381.0573959350586, "kl_loss_17": 98.93289833068847, "kl_loss_3": 2131.9426147460936, "kl_loss_6": 1223.2824462890626, "learning_rate": 0.0006362927882276989, "loss": 962.801, "step": 4180 }, { "ce_loss_12": 3.1948235034942627, "ce_loss_17": 3.0719444274902346, "ce_loss_23": 3.0190091133117676, "ce_loss_3": 3.9587900042533875, "ce_loss_6": 3.534806561470032, "epoch": 0.419, "grad_norm": 1072.0, "kl_loss_12": 362.573291015625, "kl_loss_17": 95.47424354553223, "kl_loss_3": 2034.4496154785156, "kl_loss_6": 1160.2906799316406, "learning_rate": 0.000634765526080034, "loss": 907.3845, "step": 4190 }, { "ce_loss_12": 3.209235632419586, "ce_loss_17": 3.0822045683860777, "ce_loss_23": 3.0283665299415587, "ce_loss_3": 3.9763604760169984, "ce_loss_6": 3.5511908173561095, "epoch": 0.42, "grad_norm": 900.0, "kl_loss_12": 375.2816680908203, "kl_loss_17": 98.58403778076172, "kl_loss_3": 2044.1361572265625, "kl_loss_6": 1181.4364013671875, "learning_rate": 0.0006332369068450174, "loss": 924.5047, "step": 4200 }, { "ce_loss_12": 3.148201882839203, "ce_loss_17": 3.0171813011169433, "ce_loss_23": 2.965887463092804, "ce_loss_3": 3.9352025747299195, "ce_loss_6": 3.504900002479553, "epoch": 0.421, "grad_norm": 1168.0, "kl_loss_12": 375.26795501708983, "kl_loss_17": 98.06200065612794, "kl_loss_3": 2075.0803649902346, "kl_loss_6": 1197.796453857422, "learning_rate": 0.0006317069459158283, "loss": 932.6061, "step": 4210 }, { "ce_loss_12": 3.2403309464454653, "ce_loss_17": 3.1184786796569823, "ce_loss_23": 3.0642859816551207, "ce_loss_3": 3.9955074787139893, "ce_loss_6": 3.5689878702163695, "epoch": 0.422, "grad_norm": 1232.0, "kl_loss_12": 370.9367172241211, "kl_loss_17": 98.52574310302734, "kl_loss_3": 2026.5843505859375, "kl_loss_6": 1160.4238342285157, "learning_rate": 0.0006301756586991561, "loss": 921.3623, "step": 4220 }, { "ce_loss_12": 3.0353715300559996, "ce_loss_17": 2.9087610244750977, "ce_loss_23": 2.85482771396637, "ce_loss_3": 3.8594297885894777, "ce_loss_6": 3.405829298496246, "epoch": 0.423, "grad_norm": 1408.0, "kl_loss_12": 382.88744964599607, "kl_loss_17": 98.91964225769043, "kl_loss_3": 2180.594561767578, "kl_loss_6": 1240.1722412109375, "learning_rate": 0.0006286430606150459, "loss": 959.4756, "step": 4230 }, { "ce_loss_12": 3.2323502779006956, "ce_loss_17": 3.105207896232605, "ce_loss_23": 3.053818738460541, "ce_loss_3": 4.012285470962524, "ce_loss_6": 3.5798202991485595, "epoch": 0.424, "grad_norm": 852.0, "kl_loss_12": 379.60814971923827, "kl_loss_17": 98.71002159118652, "kl_loss_3": 2080.068084716797, "kl_loss_6": 1186.7841796875, "learning_rate": 0.0006271091670967436, "loss": 936.2564, "step": 4240 }, { "ce_loss_12": 3.1621394395828246, "ce_loss_17": 3.0232860922813414, "ce_loss_23": 2.9645561575889587, "ce_loss_3": 3.9861019134521483, "ce_loss_6": 3.53287935256958, "epoch": 0.425, "grad_norm": 1272.0, "kl_loss_12": 398.60731048583983, "kl_loss_17": 102.87269020080566, "kl_loss_3": 2185.2963439941404, "kl_loss_6": 1259.7050170898438, "learning_rate": 0.0006255739935905395, "loss": 960.5764, "step": 4250 }, { "ce_loss_12": 3.186166834831238, "ce_loss_17": 3.057102692127228, "ce_loss_23": 3.004942548274994, "ce_loss_3": 3.951607370376587, "ce_loss_6": 3.523983860015869, "epoch": 0.426, "grad_norm": 1020.0, "kl_loss_12": 375.788801574707, "kl_loss_17": 98.5140495300293, "kl_loss_3": 2047.4385620117187, "kl_loss_6": 1168.862127685547, "learning_rate": 0.0006240375555556145, "loss": 959.7219, "step": 4260 }, { "ce_loss_12": 3.1884969115257262, "ce_loss_17": 3.055912899971008, "ce_loss_23": 3.0020667195320128, "ce_loss_3": 4.003851306438446, "ce_loss_6": 3.5518761396408083, "epoch": 0.427, "grad_norm": 1040.0, "kl_loss_12": 385.05956878662107, "kl_loss_17": 99.31076583862304, "kl_loss_3": 2143.3416748046875, "kl_loss_6": 1220.8311340332032, "learning_rate": 0.000622499868463882, "loss": 952.8904, "step": 4270 }, { "ce_loss_12": 3.1552950739860535, "ce_loss_17": 3.0333962559700014, "ce_loss_23": 2.982784461975098, "ce_loss_3": 3.8959374904632567, "ce_loss_6": 3.484722626209259, "epoch": 0.428, "grad_norm": 1096.0, "kl_loss_12": 364.5569732666016, "kl_loss_17": 97.09578018188476, "kl_loss_3": 2005.030322265625, "kl_loss_6": 1149.582730102539, "learning_rate": 0.0006209609477998338, "loss": 922.2639, "step": 4280 }, { "ce_loss_12": 3.213744878768921, "ce_loss_17": 3.0866466403007506, "ce_loss_23": 3.0286855936050414, "ce_loss_3": 3.989298403263092, "ce_loss_6": 3.5574489712715147, "epoch": 0.429, "grad_norm": 1008.0, "kl_loss_12": 376.99034729003904, "kl_loss_17": 100.63436355590821, "kl_loss_3": 2063.216760253906, "kl_loss_6": 1188.7937561035155, "learning_rate": 0.0006194208090603844, "loss": 947.5668, "step": 4290 }, { "ce_loss_12": 3.1322065472602842, "ce_loss_17": 3.008976089954376, "ce_loss_23": 2.955915665626526, "ce_loss_3": 3.9148309588432313, "ce_loss_6": 3.4751729369163513, "epoch": 0.43, "grad_norm": 1360.0, "kl_loss_12": 365.5053421020508, "kl_loss_17": 96.05026359558106, "kl_loss_3": 2053.0044128417967, "kl_loss_6": 1161.1391235351562, "learning_rate": 0.0006178794677547138, "loss": 916.7221, "step": 4300 }, { "ce_loss_12": 3.1605747818946837, "ce_loss_17": 3.0332695603370667, "ce_loss_23": 2.981340205669403, "ce_loss_3": 3.9655585885047913, "ce_loss_6": 3.5191497564315797, "epoch": 0.431, "grad_norm": 928.0, "kl_loss_12": 384.50793609619143, "kl_loss_17": 100.78701591491699, "kl_loss_3": 2113.747119140625, "kl_loss_6": 1213.9062072753907, "learning_rate": 0.0006163369394041111, "loss": 947.5874, "step": 4310 }, { "ce_loss_12": 3.1003188490867615, "ce_loss_17": 2.9701900720596313, "ce_loss_23": 2.913443350791931, "ce_loss_3": 3.9075787544250487, "ce_loss_6": 3.458979332447052, "epoch": 0.432, "grad_norm": 1120.0, "kl_loss_12": 377.0905960083008, "kl_loss_17": 99.02350425720215, "kl_loss_3": 2128.7301391601563, "kl_loss_6": 1217.1617401123046, "learning_rate": 0.0006147932395418205, "loss": 971.5845, "step": 4320 }, { "ce_loss_12": 3.140204668045044, "ce_loss_17": 3.011886489391327, "ce_loss_23": 2.9580222249031065, "ce_loss_3": 3.903342294692993, "ce_loss_6": 3.4775962233543396, "epoch": 0.433, "grad_norm": 1040.0, "kl_loss_12": 377.0015670776367, "kl_loss_17": 98.69484672546386, "kl_loss_3": 2057.992724609375, "kl_loss_6": 1186.7297576904298, "learning_rate": 0.0006132483837128823, "loss": 925.6322, "step": 4330 }, { "ce_loss_12": 3.114229154586792, "ce_loss_17": 2.9869271516799927, "ce_loss_23": 2.935202646255493, "ce_loss_3": 3.9059092879295347, "ce_loss_6": 3.4597684383392333, "epoch": 0.434, "grad_norm": 1232.0, "kl_loss_12": 375.26554107666016, "kl_loss_17": 97.48203544616699, "kl_loss_3": 2104.536700439453, "kl_loss_6": 1186.8380126953125, "learning_rate": 0.0006117023874739772, "loss": 939.7988, "step": 4340 }, { "ce_loss_12": 3.1105947494506836, "ce_loss_17": 2.98136168718338, "ce_loss_23": 2.929484283924103, "ce_loss_3": 3.9065537333488463, "ce_loss_6": 3.461872565746307, "epoch": 0.435, "grad_norm": 1312.0, "kl_loss_12": 379.8814270019531, "kl_loss_17": 97.82222518920898, "kl_loss_3": 2112.0099365234373, "kl_loss_6": 1206.9251831054687, "learning_rate": 0.0006101552663932703, "loss": 959.5463, "step": 4350 }, { "ce_loss_12": 3.1442917227745055, "ce_loss_17": 3.0152896285057067, "ce_loss_23": 2.9616350173950194, "ce_loss_3": 3.9271872639656067, "ce_loss_6": 3.4907243251800537, "epoch": 0.436, "grad_norm": 1056.0, "kl_loss_12": 380.0096099853516, "kl_loss_17": 99.47969856262208, "kl_loss_3": 2090.22939453125, "kl_loss_6": 1197.7749267578124, "learning_rate": 0.0006086070360502539, "loss": 938.3344, "step": 4360 }, { "ce_loss_12": 3.14714058637619, "ce_loss_17": 3.0191622018814086, "ce_loss_23": 2.9650089621543883, "ce_loss_3": 3.93569530248642, "ce_loss_6": 3.4921825528144836, "epoch": 0.437, "grad_norm": 1104.0, "kl_loss_12": 375.81161346435545, "kl_loss_17": 96.95657653808594, "kl_loss_3": 2095.228955078125, "kl_loss_6": 1190.774935913086, "learning_rate": 0.0006070577120355903, "loss": 940.5141, "step": 4370 }, { "ce_loss_12": 3.1431150674819945, "ce_loss_17": 3.0185340642929077, "ce_loss_23": 2.963430368900299, "ce_loss_3": 3.907380211353302, "ce_loss_6": 3.4939981341362, "epoch": 0.438, "grad_norm": 1064.0, "kl_loss_12": 364.2071594238281, "kl_loss_17": 94.97382392883301, "kl_loss_3": 2012.2494140625, "kl_loss_6": 1164.0711059570312, "learning_rate": 0.0006055073099509549, "loss": 926.577, "step": 4380 }, { "ce_loss_12": 3.2006898403167723, "ce_loss_17": 3.081389880180359, "ce_loss_23": 3.027166259288788, "ce_loss_3": 3.9555254697799684, "ce_loss_6": 3.5358532547950743, "epoch": 0.439, "grad_norm": 980.0, "kl_loss_12": 371.9804412841797, "kl_loss_17": 97.23721199035644, "kl_loss_3": 2038.4843444824219, "kl_loss_6": 1173.850146484375, "learning_rate": 0.0006039558454088796, "loss": 939.1463, "step": 4390 }, { "ce_loss_12": 3.1692385196685793, "ce_loss_17": 3.0419707775115965, "ce_loss_23": 2.985661518573761, "ce_loss_3": 3.955595600605011, "ce_loss_6": 3.5197404265403747, "epoch": 0.44, "grad_norm": 1064.0, "kl_loss_12": 377.35308837890625, "kl_loss_17": 99.0156379699707, "kl_loss_3": 2082.0146545410157, "kl_loss_6": 1190.1516021728517, "learning_rate": 0.0006024033340325954, "loss": 919.5555, "step": 4400 }, { "ce_loss_12": 3.235099709033966, "ce_loss_17": 3.113989222049713, "ce_loss_23": 3.060835361480713, "ce_loss_3": 3.9762577414512634, "ce_loss_6": 3.561458909511566, "epoch": 0.441, "grad_norm": 980.0, "kl_loss_12": 361.0665008544922, "kl_loss_17": 93.74198760986329, "kl_loss_3": 1977.7686767578125, "kl_loss_6": 1137.2376220703125, "learning_rate": 0.0006008497914558743, "loss": 909.0549, "step": 4410 }, { "ce_loss_12": 3.1947230458259583, "ce_loss_17": 3.059620177745819, "ce_loss_23": 3.004351055622101, "ce_loss_3": 3.9716932654380797, "ce_loss_6": 3.537883198261261, "epoch": 0.442, "grad_norm": 980.0, "kl_loss_12": 386.25874786376954, "kl_loss_17": 101.76474266052246, "kl_loss_3": 2093.4546142578124, "kl_loss_6": 1198.059033203125, "learning_rate": 0.0005992952333228728, "loss": 941.9635, "step": 4420 }, { "ce_loss_12": 3.125914978981018, "ce_loss_17": 3.0019068121910095, "ce_loss_23": 2.9484395027160644, "ce_loss_3": 3.9122184991836546, "ce_loss_6": 3.4705653548240663, "epoch": 0.443, "grad_norm": 1096.0, "kl_loss_12": 370.42543334960936, "kl_loss_17": 95.97517318725586, "kl_loss_3": 2097.5520629882812, "kl_loss_6": 1187.7123474121095, "learning_rate": 0.0005977396752879741, "loss": 933.6965, "step": 4430 }, { "ce_loss_12": 3.059049963951111, "ce_loss_17": 2.9283310770988464, "ce_loss_23": 2.877157747745514, "ce_loss_3": 3.843676042556763, "ce_loss_6": 3.412899672985077, "epoch": 0.444, "grad_norm": 1344.0, "kl_loss_12": 375.6527328491211, "kl_loss_17": 94.45379600524902, "kl_loss_3": 2104.6698791503904, "kl_loss_6": 1208.5965942382813, "learning_rate": 0.0005961831330156305, "loss": 929.7277, "step": 4440 }, { "ce_loss_12": 3.1946892142295837, "ce_loss_17": 3.0648535251617433, "ce_loss_23": 3.0128404259681703, "ce_loss_3": 3.9888625264167787, "ce_loss_6": 3.547437810897827, "epoch": 0.445, "grad_norm": 992.0, "kl_loss_12": 371.147119140625, "kl_loss_17": 95.90944900512696, "kl_loss_3": 2109.418914794922, "kl_loss_6": 1197.6739135742187, "learning_rate": 0.0005946256221802051, "loss": 956.1494, "step": 4450 }, { "ce_loss_12": 3.1555089116096497, "ce_loss_17": 3.0345204710960387, "ce_loss_23": 2.986414337158203, "ce_loss_3": 3.897111940383911, "ce_loss_6": 3.481855571269989, "epoch": 0.446, "grad_norm": 1160.0, "kl_loss_12": 356.09037628173826, "kl_loss_17": 93.4267032623291, "kl_loss_3": 1978.97578125, "kl_loss_6": 1134.439404296875, "learning_rate": 0.0005930671584658151, "loss": 946.3023, "step": 4460 }, { "ce_loss_12": 3.1738540887832642, "ce_loss_17": 3.047953486442566, "ce_loss_23": 2.9957111835479737, "ce_loss_3": 3.9466116666793822, "ce_loss_6": 3.5165825486183167, "epoch": 0.447, "grad_norm": 912.0, "kl_loss_12": 373.43541717529297, "kl_loss_17": 96.29221115112304, "kl_loss_3": 2065.424639892578, "kl_loss_6": 1179.5342651367187, "learning_rate": 0.0005915077575661722, "loss": 939.1451, "step": 4470 }, { "ce_loss_12": 3.187889552116394, "ce_loss_17": 3.0592567563056945, "ce_loss_23": 3.0062554359436033, "ce_loss_3": 3.9646561622619627, "ce_loss_6": 3.530262494087219, "epoch": 0.448, "grad_norm": 792.0, "kl_loss_12": 382.846728515625, "kl_loss_17": 99.91208915710449, "kl_loss_3": 2096.3197631835938, "kl_loss_6": 1201.9329803466796, "learning_rate": 0.000589947435184427, "loss": 929.466, "step": 4480 }, { "ce_loss_12": 3.2426429748535157, "ce_loss_17": 3.1219165563583373, "ce_loss_23": 3.069086766242981, "ce_loss_3": 3.97622035741806, "ce_loss_6": 3.5719561100006105, "epoch": 0.449, "grad_norm": 956.0, "kl_loss_12": 369.9361053466797, "kl_loss_17": 97.55784950256347, "kl_loss_3": 2008.1300842285157, "kl_loss_6": 1160.9046325683594, "learning_rate": 0.0005883862070330078, "loss": 922.6117, "step": 4490 }, { "ce_loss_12": 3.188134014606476, "ce_loss_17": 3.0585672497749328, "ce_loss_23": 3.007031726837158, "ce_loss_3": 3.9629687309265136, "ce_loss_6": 3.5271342396736145, "epoch": 0.45, "grad_norm": 1264.0, "kl_loss_12": 377.42894134521487, "kl_loss_17": 97.14705848693848, "kl_loss_3": 2071.5281982421875, "kl_loss_6": 1186.3728515625, "learning_rate": 0.0005868240888334653, "loss": 927.5315, "step": 4500 }, { "ce_loss_12": 3.079320514202118, "ce_loss_17": 2.9505565047264097, "ce_loss_23": 2.894635808467865, "ce_loss_3": 3.8832791805267335, "ce_loss_6": 3.433999705314636, "epoch": 0.451, "grad_norm": 1040.0, "kl_loss_12": 379.57346801757814, "kl_loss_17": 97.66094398498535, "kl_loss_3": 2117.039208984375, "kl_loss_6": 1208.289532470703, "learning_rate": 0.0005852610963163119, "loss": 945.0416, "step": 4510 }, { "ce_loss_12": 3.098237121105194, "ce_loss_17": 2.971955049037933, "ce_loss_23": 2.921879529953003, "ce_loss_3": 3.8654279828071596, "ce_loss_6": 3.443196380138397, "epoch": 0.452, "grad_norm": 1200.0, "kl_loss_12": 367.5508056640625, "kl_loss_17": 95.00853729248047, "kl_loss_3": 2048.9790588378905, "kl_loss_6": 1173.7472778320312, "learning_rate": 0.0005836972452208654, "loss": 917.1092, "step": 4520 }, { "ce_loss_12": 3.0992449522018433, "ce_loss_17": 2.9783245265483855, "ce_loss_23": 2.92637876868248, "ce_loss_3": 3.888488268852234, "ce_loss_6": 3.4487679481506346, "epoch": 0.453, "grad_norm": 1216.0, "kl_loss_12": 372.78917388916017, "kl_loss_17": 97.17004623413087, "kl_loss_3": 2088.7503173828127, "kl_loss_6": 1194.8785522460937, "learning_rate": 0.0005821325512950885, "loss": 936.7268, "step": 4530 }, { "ce_loss_12": 3.124701368808746, "ce_loss_17": 2.9975485682487486, "ce_loss_23": 2.947983968257904, "ce_loss_3": 3.8898465156555178, "ce_loss_6": 3.468051314353943, "epoch": 0.454, "grad_norm": 1004.0, "kl_loss_12": 361.3929946899414, "kl_loss_17": 94.27552947998046, "kl_loss_3": 2016.76611328125, "kl_loss_6": 1158.698434448242, "learning_rate": 0.0005805670302954321, "loss": 926.6719, "step": 4540 }, { "ce_loss_12": 3.1272517681121825, "ce_loss_17": 3.0075201272964476, "ce_loss_23": 2.954355263710022, "ce_loss_3": 3.896501433849335, "ce_loss_6": 3.466426062583923, "epoch": 0.455, "grad_norm": 1032.0, "kl_loss_12": 365.66592864990236, "kl_loss_17": 94.69521942138672, "kl_loss_3": 2050.3198303222657, "kl_loss_6": 1168.731463623047, "learning_rate": 0.000579000697986675, "loss": 918.8904, "step": 4550 }, { "ce_loss_12": 3.0973682522773744, "ce_loss_17": 2.9664054989814757, "ce_loss_23": 2.9097727060317995, "ce_loss_3": 3.914142680168152, "ce_loss_6": 3.4645751833915712, "epoch": 0.456, "grad_norm": 836.0, "kl_loss_12": 383.7973129272461, "kl_loss_17": 98.52790298461915, "kl_loss_3": 2142.2695434570314, "kl_loss_6": 1232.745263671875, "learning_rate": 0.0005774335701417662, "loss": 941.8937, "step": 4560 }, { "ce_loss_12": 3.0880584955215453, "ce_loss_17": 2.960580062866211, "ce_loss_23": 2.908521521091461, "ce_loss_3": 3.900552272796631, "ce_loss_6": 3.445142912864685, "epoch": 0.457, "grad_norm": 936.0, "kl_loss_12": 373.11179809570314, "kl_loss_17": 94.61213645935058, "kl_loss_3": 2147.471405029297, "kl_loss_6": 1208.840509033203, "learning_rate": 0.0005758656625416658, "loss": 940.7441, "step": 4570 }, { "ce_loss_12": 3.1407280206680297, "ce_loss_17": 3.015048694610596, "ce_loss_23": 2.9619325399398804, "ce_loss_3": 3.9227320909500123, "ce_loss_6": 3.4882035851478577, "epoch": 0.458, "grad_norm": 960.0, "kl_loss_12": 373.65845947265626, "kl_loss_17": 97.12631492614746, "kl_loss_3": 2067.6907592773437, "kl_loss_6": 1185.5317321777343, "learning_rate": 0.0005742969909751859, "loss": 918.8994, "step": 4580 }, { "ce_loss_12": 3.1578575372695923, "ce_loss_17": 3.030971610546112, "ce_loss_23": 2.9798396110534666, "ce_loss_3": 3.932333254814148, "ce_loss_6": 3.497673225402832, "epoch": 0.459, "grad_norm": 996.0, "kl_loss_12": 373.4551177978516, "kl_loss_17": 96.02978591918945, "kl_loss_3": 2071.560614013672, "kl_loss_6": 1177.2343200683595, "learning_rate": 0.0005727275712388318, "loss": 936.2139, "step": 4590 }, { "ce_loss_12": 3.166626787185669, "ce_loss_17": 3.0451488375663756, "ce_loss_23": 2.9955615639686584, "ce_loss_3": 3.912241852283478, "ce_loss_6": 3.4970428705215455, "epoch": 0.46, "grad_norm": 1120.0, "kl_loss_12": 359.3360260009766, "kl_loss_17": 92.84842300415039, "kl_loss_3": 2001.1885681152344, "kl_loss_6": 1146.0368438720702, "learning_rate": 0.0005711574191366427, "loss": 913.1519, "step": 4600 }, { "ce_loss_12": 3.1245293021202087, "ce_loss_17": 3.0007786989212035, "ce_loss_23": 2.950817012786865, "ce_loss_3": 3.8915412545204164, "ce_loss_6": 3.461117720603943, "epoch": 0.461, "grad_norm": 944.0, "kl_loss_12": 364.3477020263672, "kl_loss_17": 93.56401329040527, "kl_loss_3": 2047.8031494140625, "kl_loss_6": 1158.8289428710937, "learning_rate": 0.0005695865504800327, "loss": 914.1635, "step": 4610 }, { "ce_loss_12": 3.0745707035064695, "ce_loss_17": 2.939027392864227, "ce_loss_23": 2.8857049703598023, "ce_loss_3": 3.928879237174988, "ce_loss_6": 3.456984746456146, "epoch": 0.462, "grad_norm": 1104.0, "kl_loss_12": 392.5043655395508, "kl_loss_17": 99.27418518066406, "kl_loss_3": 2230.4755615234376, "kl_loss_6": 1273.105810546875, "learning_rate": 0.0005680149810876322, "loss": 958.7232, "step": 4620 }, { "ce_loss_12": 3.1221439361572267, "ce_loss_17": 2.9936205983161925, "ce_loss_23": 2.9434250235557555, "ce_loss_3": 3.906174600124359, "ce_loss_6": 3.469303858280182, "epoch": 0.463, "grad_norm": 968.0, "kl_loss_12": 367.0161819458008, "kl_loss_17": 94.79076499938965, "kl_loss_3": 2075.8826171875, "kl_loss_6": 1182.3226745605468, "learning_rate": 0.0005664427267851271, "loss": 926.7291, "step": 4630 }, { "ce_loss_12": 3.0404475688934327, "ce_loss_17": 2.914381778240204, "ce_loss_23": 2.863242745399475, "ce_loss_3": 3.8283162117004395, "ce_loss_6": 3.390611732006073, "epoch": 0.464, "grad_norm": 1048.0, "kl_loss_12": 364.2582321166992, "kl_loss_17": 93.55969924926758, "kl_loss_3": 2070.353057861328, "kl_loss_6": 1176.0620391845703, "learning_rate": 0.0005648698034051009, "loss": 921.2185, "step": 4640 }, { "ce_loss_12": 3.1544418692588807, "ce_loss_17": 3.0264967560768126, "ce_loss_23": 2.973117482662201, "ce_loss_3": 3.952759790420532, "ce_loss_6": 3.5050989270210264, "epoch": 0.465, "grad_norm": 980.0, "kl_loss_12": 368.0677200317383, "kl_loss_17": 95.34860954284667, "kl_loss_3": 2107.058544921875, "kl_loss_6": 1191.1350402832031, "learning_rate": 0.0005632962267868747, "loss": 922.8924, "step": 4650 }, { "ce_loss_12": 3.086225914955139, "ce_loss_17": 2.9620978832244873, "ce_loss_23": 2.912720251083374, "ce_loss_3": 3.851222813129425, "ce_loss_6": 3.42860985994339, "epoch": 0.466, "grad_norm": 1136.0, "kl_loss_12": 358.4297149658203, "kl_loss_17": 91.42180213928222, "kl_loss_3": 2045.6289489746093, "kl_loss_6": 1169.5674255371093, "learning_rate": 0.0005617220127763474, "loss": 924.683, "step": 4660 }, { "ce_loss_12": 3.1667036652565, "ce_loss_17": 3.041495943069458, "ce_loss_23": 2.991810345649719, "ce_loss_3": 3.9326154589653015, "ce_loss_6": 3.5035176277160645, "epoch": 0.467, "grad_norm": 1024.0, "kl_loss_12": 368.75465240478513, "kl_loss_17": 94.89134712219239, "kl_loss_3": 2039.9618713378907, "kl_loss_6": 1157.8766632080078, "learning_rate": 0.0005601471772258368, "loss": 925.9684, "step": 4670 }, { "ce_loss_12": 3.148939609527588, "ce_loss_17": 3.0280992150306703, "ce_loss_23": 2.9784193873405456, "ce_loss_3": 3.905656802654266, "ce_loss_6": 3.4869115591049193, "epoch": 0.468, "grad_norm": 876.0, "kl_loss_12": 363.73346557617185, "kl_loss_17": 94.9500545501709, "kl_loss_3": 1996.96025390625, "kl_loss_6": 1144.792901611328, "learning_rate": 0.0005585717359939192, "loss": 927.9094, "step": 4680 }, { "ce_loss_12": 3.0635921597480773, "ce_loss_17": 2.9403783679008484, "ce_loss_23": 2.890295124053955, "ce_loss_3": 3.8274804949760437, "ce_loss_6": 3.4014335751533507, "epoch": 0.469, "grad_norm": 1032.0, "kl_loss_12": 362.45594482421876, "kl_loss_17": 93.71986045837403, "kl_loss_3": 2028.2100341796875, "kl_loss_6": 1157.830874633789, "learning_rate": 0.0005569957049452703, "loss": 932.6887, "step": 4690 }, { "ce_loss_12": 3.1199838757514953, "ce_loss_17": 2.9960068225860597, "ce_loss_23": 2.9443670392036436, "ce_loss_3": 3.908629536628723, "ce_loss_6": 3.465496504306793, "epoch": 0.47, "grad_norm": 1392.0, "kl_loss_12": 372.039079284668, "kl_loss_17": 97.96028518676758, "kl_loss_3": 2081.887371826172, "kl_loss_6": 1184.3350677490234, "learning_rate": 0.0005554190999505056, "loss": 934.4084, "step": 4700 }, { "ce_loss_12": 3.241987943649292, "ce_loss_17": 3.112707030773163, "ce_loss_23": 3.0603973388671877, "ce_loss_3": 4.018198192119598, "ce_loss_6": 3.590498960018158, "epoch": 0.471, "grad_norm": 1112.0, "kl_loss_12": 382.9134002685547, "kl_loss_17": 98.71521873474121, "kl_loss_3": 2076.810894775391, "kl_loss_6": 1192.9006103515626, "learning_rate": 0.0005538419368860196, "loss": 902.3693, "step": 4710 }, { "ce_loss_12": 3.163465344905853, "ce_loss_17": 3.040043067932129, "ce_loss_23": 2.9864439606666564, "ce_loss_3": 3.9220748066902162, "ce_loss_6": 3.4972716927528382, "epoch": 0.472, "grad_norm": 952.0, "kl_loss_12": 371.4324691772461, "kl_loss_17": 96.35890159606933, "kl_loss_3": 2035.002734375, "kl_loss_6": 1170.274884033203, "learning_rate": 0.0005522642316338268, "loss": 940.209, "step": 4720 }, { "ce_loss_12": 3.1730767607688906, "ce_loss_17": 3.0522518277168276, "ce_loss_23": 3.00358167886734, "ce_loss_3": 3.935346007347107, "ce_loss_6": 3.5060906767845155, "epoch": 0.473, "grad_norm": 1056.0, "kl_loss_12": 368.5505630493164, "kl_loss_17": 95.56421852111816, "kl_loss_3": 2031.9708740234375, "kl_loss_6": 1157.9430053710937, "learning_rate": 0.0005506860000814017, "loss": 940.0187, "step": 4730 }, { "ce_loss_12": 3.19408575296402, "ce_loss_17": 3.076188790798187, "ce_loss_23": 3.028001570701599, "ce_loss_3": 3.931719958782196, "ce_loss_6": 3.524571084976196, "epoch": 0.474, "grad_norm": 992.0, "kl_loss_12": 359.36453399658205, "kl_loss_17": 91.8622055053711, "kl_loss_3": 1987.8832092285156, "kl_loss_6": 1147.102392578125, "learning_rate": 0.0005491072581215186, "loss": 917.3689, "step": 4740 }, { "ce_loss_12": 3.195579099655151, "ce_loss_17": 3.068587398529053, "ce_loss_23": 3.0141397595405577, "ce_loss_3": 3.949463403224945, "ce_loss_6": 3.5331989526748657, "epoch": 0.475, "grad_norm": 1120.0, "kl_loss_12": 373.22421112060545, "kl_loss_17": 98.32062301635742, "kl_loss_3": 2049.5393127441407, "kl_loss_6": 1178.9929168701171, "learning_rate": 0.0005475280216520913, "loss": 907.7835, "step": 4750 }, { "ce_loss_12": 3.116903507709503, "ce_loss_17": 2.995867669582367, "ce_loss_23": 2.945862424373627, "ce_loss_3": 3.8652034997940063, "ce_loss_6": 3.447264242172241, "epoch": 0.476, "grad_norm": 1240.0, "kl_loss_12": 358.89072570800784, "kl_loss_17": 92.7595603942871, "kl_loss_3": 1993.6135681152343, "kl_loss_6": 1138.1050079345703, "learning_rate": 0.0005459483065760138, "loss": 924.2568, "step": 4760 }, { "ce_loss_12": 3.064214551448822, "ce_loss_17": 2.9375820994377135, "ce_loss_23": 2.889118230342865, "ce_loss_3": 3.888459062576294, "ce_loss_6": 3.4316441893577574, "epoch": 0.477, "grad_norm": 1072.0, "kl_loss_12": 369.9542205810547, "kl_loss_17": 94.08586616516114, "kl_loss_3": 2152.941357421875, "kl_loss_6": 1222.8913269042969, "learning_rate": 0.0005443681288009991, "loss": 939.2311, "step": 4770 }, { "ce_loss_12": 3.1037795662879946, "ce_loss_17": 2.985719680786133, "ce_loss_23": 2.9347479224205015, "ce_loss_3": 3.8800550937652587, "ce_loss_6": 3.45303328037262, "epoch": 0.478, "grad_norm": 952.0, "kl_loss_12": 362.2948623657227, "kl_loss_17": 93.37007637023926, "kl_loss_3": 2054.378765869141, "kl_loss_6": 1174.0726287841796, "learning_rate": 0.0005427875042394199, "loss": 924.9251, "step": 4780 }, { "ce_loss_12": 3.148194468021393, "ce_loss_17": 3.0246357798576353, "ce_loss_23": 2.9726595759391783, "ce_loss_3": 3.9054253578186033, "ce_loss_6": 3.4833449959754943, "epoch": 0.479, "grad_norm": 952.0, "kl_loss_12": 369.14268493652344, "kl_loss_17": 97.6407917022705, "kl_loss_3": 2021.9542907714845, "kl_loss_6": 1162.2347930908204, "learning_rate": 0.0005412064488081482, "loss": 930.3105, "step": 4790 }, { "ce_loss_12": 3.137005865573883, "ce_loss_17": 3.0189573764801025, "ce_loss_23": 2.967451441287994, "ce_loss_3": 3.8918606281280517, "ce_loss_6": 3.468139088153839, "epoch": 0.48, "grad_norm": 1128.0, "kl_loss_12": 352.2945220947266, "kl_loss_17": 92.58756942749024, "kl_loss_3": 1984.3945434570312, "kl_loss_6": 1131.745068359375, "learning_rate": 0.0005396249784283942, "loss": 899.6717, "step": 4800 }, { "ce_loss_12": 3.1608723282814024, "ce_loss_17": 3.037445676326752, "ce_loss_23": 2.983047103881836, "ce_loss_3": 3.955673432350159, "ce_loss_6": 3.5141472697257994, "epoch": 0.481, "grad_norm": 1160.0, "kl_loss_12": 373.91689453125, "kl_loss_17": 97.65786895751953, "kl_loss_3": 2108.4563049316407, "kl_loss_6": 1214.4124877929687, "learning_rate": 0.0005380431090255476, "loss": 939.9953, "step": 4810 }, { "ce_loss_12": 3.1526288866996763, "ce_loss_17": 3.036070692539215, "ce_loss_23": 2.987703096866608, "ce_loss_3": 3.9051117300987244, "ce_loss_6": 3.4906025767326354, "epoch": 0.482, "grad_norm": 992.0, "kl_loss_12": 350.09937438964846, "kl_loss_17": 90.7152587890625, "kl_loss_3": 1995.6268737792968, "kl_loss_6": 1147.331491088867, "learning_rate": 0.0005364608565290155, "loss": 901.2539, "step": 4820 }, { "ce_loss_12": 3.163371217250824, "ce_loss_17": 3.04144287109375, "ce_loss_23": 2.988657009601593, "ce_loss_3": 3.927390766143799, "ce_loss_6": 3.5088889718055727, "epoch": 0.483, "grad_norm": 1080.0, "kl_loss_12": 367.5927398681641, "kl_loss_17": 95.43120765686035, "kl_loss_3": 2036.2733276367187, "kl_loss_6": 1174.790771484375, "learning_rate": 0.0005348782368720626, "loss": 919.0738, "step": 4830 }, { "ce_loss_12": 3.1053726196289064, "ce_loss_17": 2.9821720719337463, "ce_loss_23": 2.932705020904541, "ce_loss_3": 3.8632062315940856, "ce_loss_6": 3.4345524430274965, "epoch": 0.484, "grad_norm": 1296.0, "kl_loss_12": 355.072282409668, "kl_loss_17": 91.17291488647462, "kl_loss_3": 2003.0337890625, "kl_loss_6": 1133.6176971435548, "learning_rate": 0.000533295265991652, "loss": 911.5024, "step": 4840 }, { "ce_loss_12": 3.170707905292511, "ce_loss_17": 3.0478097796440125, "ce_loss_23": 2.995159554481506, "ce_loss_3": 3.919725406169891, "ce_loss_6": 3.5044790148735045, "epoch": 0.485, "grad_norm": 1000.0, "kl_loss_12": 361.09333953857424, "kl_loss_17": 92.84121818542481, "kl_loss_3": 1998.759344482422, "kl_loss_6": 1147.2231689453124, "learning_rate": 0.0005317119598282822, "loss": 902.2107, "step": 4850 }, { "ce_loss_12": 3.171194279193878, "ce_loss_17": 3.047281098365784, "ce_loss_23": 2.99584538936615, "ce_loss_3": 3.935044729709625, "ce_loss_6": 3.5136531472206114, "epoch": 0.486, "grad_norm": 1264.0, "kl_loss_12": 368.4178924560547, "kl_loss_17": 94.40206680297851, "kl_loss_3": 2022.4554504394532, "kl_loss_6": 1157.0905395507812, "learning_rate": 0.0005301283343258293, "loss": 910.2678, "step": 4860 }, { "ce_loss_12": 3.225717294216156, "ce_loss_17": 3.102791738510132, "ce_loss_23": 3.052597498893738, "ce_loss_3": 3.970711553096771, "ce_loss_6": 3.5527042150497437, "epoch": 0.487, "grad_norm": 1240.0, "kl_loss_12": 364.56507263183596, "kl_loss_17": 97.28554267883301, "kl_loss_3": 1994.2457397460937, "kl_loss_6": 1140.4625762939454, "learning_rate": 0.000528544405431384, "loss": 898.1916, "step": 4870 }, { "ce_loss_12": 3.1127439618110655, "ce_loss_17": 2.9833473324775697, "ce_loss_23": 2.9293392419815065, "ce_loss_3": 3.8848477840423583, "ce_loss_6": 3.4620235204696654, "epoch": 0.488, "grad_norm": 936.0, "kl_loss_12": 375.3154983520508, "kl_loss_17": 95.93160438537598, "kl_loss_3": 2063.953057861328, "kl_loss_6": 1193.917742919922, "learning_rate": 0.000526960189095093, "loss": 928.2098, "step": 4880 }, { "ce_loss_12": 3.0926320910453797, "ce_loss_17": 2.9746638536453247, "ce_loss_23": 2.9252131819725036, "ce_loss_3": 3.852759027481079, "ce_loss_6": 3.428392505645752, "epoch": 0.489, "grad_norm": 1224.0, "kl_loss_12": 355.71148986816405, "kl_loss_17": 92.43701934814453, "kl_loss_3": 1994.89287109375, "kl_loss_6": 1139.5907104492187, "learning_rate": 0.0005253757012699972, "loss": 904.277, "step": 4890 }, { "ce_loss_12": 3.1681107759475706, "ce_loss_17": 3.044089150428772, "ce_loss_23": 2.995683467388153, "ce_loss_3": 3.917013776302338, "ce_loss_6": 3.4973212122917174, "epoch": 0.49, "grad_norm": 1256.0, "kl_loss_12": 363.9573379516602, "kl_loss_17": 93.36282806396484, "kl_loss_3": 2007.33369140625, "kl_loss_6": 1150.3894775390625, "learning_rate": 0.0005237909579118712, "loss": 922.1821, "step": 4900 }, { "ce_loss_12": 3.143178117275238, "ce_loss_17": 3.012323999404907, "ce_loss_23": 2.957292902469635, "ce_loss_3": 3.92088303565979, "ce_loss_6": 3.4861607670784, "epoch": 0.491, "grad_norm": 1112.0, "kl_loss_12": 376.0688873291016, "kl_loss_17": 97.68312149047851, "kl_loss_3": 2079.636608886719, "kl_loss_6": 1186.3177856445313, "learning_rate": 0.0005222059749790631, "loss": 928.2946, "step": 4910 }, { "ce_loss_12": 3.1950509667396547, "ce_loss_17": 3.075741231441498, "ce_loss_23": 3.024643123149872, "ce_loss_3": 3.918723690509796, "ce_loss_6": 3.5106842637062075, "epoch": 0.492, "grad_norm": 900.0, "kl_loss_12": 355.10656890869143, "kl_loss_17": 92.66862754821777, "kl_loss_3": 1956.1288940429688, "kl_loss_6": 1122.1956909179687, "learning_rate": 0.0005206207684323337, "loss": 884.5479, "step": 4920 }, { "ce_loss_12": 3.177818274497986, "ce_loss_17": 3.055434787273407, "ce_loss_23": 3.0034319281578066, "ce_loss_3": 3.9366918325424196, "ce_loss_6": 3.518252801895142, "epoch": 0.493, "grad_norm": 1192.0, "kl_loss_12": 367.2648208618164, "kl_loss_17": 95.72131729125977, "kl_loss_3": 2022.9251403808594, "kl_loss_6": 1167.5449096679688, "learning_rate": 0.000519035354234695, "loss": 928.926, "step": 4930 }, { "ce_loss_12": 3.1576064467430114, "ce_loss_17": 3.0290244936943056, "ce_loss_23": 2.9728461861610413, "ce_loss_3": 3.9173558950424194, "ce_loss_6": 3.4981216549873353, "epoch": 0.494, "grad_norm": 1224.0, "kl_loss_12": 372.4341445922852, "kl_loss_17": 98.32537727355957, "kl_loss_3": 2013.0077697753907, "kl_loss_6": 1171.3818298339843, "learning_rate": 0.0005174497483512506, "loss": 901.1791, "step": 4940 }, { "ce_loss_12": 3.193759799003601, "ce_loss_17": 3.075028383731842, "ce_loss_23": 3.0255154371261597, "ce_loss_3": 3.9381660342216493, "ce_loss_6": 3.526125121116638, "epoch": 0.495, "grad_norm": 944.0, "kl_loss_12": 356.26817474365237, "kl_loss_17": 92.91269378662109, "kl_loss_3": 2008.5030517578125, "kl_loss_6": 1151.6687255859374, "learning_rate": 0.0005158639667490339, "loss": 917.3115, "step": 4950 }, { "ce_loss_12": 3.112211060523987, "ce_loss_17": 2.986400294303894, "ce_loss_23": 2.936023163795471, "ce_loss_3": 3.8713951110839844, "ce_loss_6": 3.450876069068909, "epoch": 0.496, "grad_norm": 1016.0, "kl_loss_12": 363.6869583129883, "kl_loss_17": 93.26083564758301, "kl_loss_3": 2023.2762573242187, "kl_loss_6": 1160.9851287841798, "learning_rate": 0.0005142780253968481, "loss": 911.3936, "step": 4960 }, { "ce_loss_12": 3.0601794362068175, "ce_loss_17": 2.942017126083374, "ce_loss_23": 2.8925352931022643, "ce_loss_3": 3.8043387174606322, "ce_loss_6": 3.383818244934082, "epoch": 0.497, "grad_norm": 1032.0, "kl_loss_12": 351.1629837036133, "kl_loss_17": 91.15731086730958, "kl_loss_3": 1988.4179992675781, "kl_loss_6": 1128.8858367919922, "learning_rate": 0.0005126919402651053, "loss": 883.4854, "step": 4970 }, { "ce_loss_12": 3.1314974069595336, "ce_loss_17": 3.004086196422577, "ce_loss_23": 2.949925649166107, "ce_loss_3": 3.9035497426986696, "ce_loss_6": 3.4774454474449157, "epoch": 0.498, "grad_norm": 1232.0, "kl_loss_12": 369.9680694580078, "kl_loss_17": 97.28400802612305, "kl_loss_3": 2040.5506774902344, "kl_loss_6": 1173.4661041259765, "learning_rate": 0.0005111057273256647, "loss": 918.4297, "step": 4980 }, { "ce_loss_12": 3.2182634353637694, "ce_loss_17": 3.1030057668685913, "ce_loss_23": 3.053433656692505, "ce_loss_3": 3.9287408590316772, "ce_loss_6": 3.5249813079833983, "epoch": 0.499, "grad_norm": 968.0, "kl_loss_12": 346.4434982299805, "kl_loss_17": 89.57510108947754, "kl_loss_3": 1916.7842163085938, "kl_loss_6": 1099.401171875, "learning_rate": 0.0005095194025516733, "loss": 873.9033, "step": 4990 }, { "ce_loss_12": 3.149612510204315, "ce_loss_17": 3.0297858476638795, "ce_loss_23": 2.983256685733795, "ce_loss_3": 3.8866697907447816, "ce_loss_6": 3.4731158018112183, "epoch": 0.5, "grad_norm": 1184.0, "kl_loss_12": 351.9287445068359, "kl_loss_17": 89.64690284729004, "kl_loss_3": 1978.0472778320313, "kl_loss_6": 1126.4499816894531, "learning_rate": 0.000507932981917404, "loss": 919.7188, "step": 5000 }, { "ce_loss_12": 3.1033830046653748, "ce_loss_17": 2.9746859550476072, "ce_loss_23": 2.9226545691490173, "ce_loss_3": 3.9122170090675352, "ce_loss_6": 3.4662346839904785, "epoch": 0.501, "grad_norm": 1096.0, "kl_loss_12": 374.91630401611326, "kl_loss_17": 97.60987014770508, "kl_loss_3": 2124.3409912109373, "kl_loss_6": 1212.289617919922, "learning_rate": 0.0005063464813980949, "loss": 944.7277, "step": 5010 }, { "ce_loss_12": 3.0833741784095765, "ce_loss_17": 2.9644308209419252, "ce_loss_23": 2.91454701423645, "ce_loss_3": 3.8580423831939696, "ce_loss_6": 3.424077236652374, "epoch": 0.502, "grad_norm": 956.0, "kl_loss_12": 361.62098236083983, "kl_loss_17": 93.23923530578614, "kl_loss_3": 2055.8266784667967, "kl_loss_6": 1173.4408447265625, "learning_rate": 0.0005047599169697884, "loss": 910.5002, "step": 5020 }, { "ce_loss_12": 3.037166953086853, "ce_loss_17": 2.9079468607902528, "ce_loss_23": 2.855610752105713, "ce_loss_3": 3.8071164965629576, "ce_loss_6": 3.3815191626548766, "epoch": 0.503, "grad_norm": 1192.0, "kl_loss_12": 360.40843505859374, "kl_loss_17": 93.68807334899903, "kl_loss_3": 2028.0902526855468, "kl_loss_6": 1152.7849853515625, "learning_rate": 0.000503173304609171, "loss": 890.5323, "step": 5030 }, { "ce_loss_12": 3.143783712387085, "ce_loss_17": 3.0220723867416384, "ce_loss_23": 2.97117475271225, "ce_loss_3": 3.904027593135834, "ce_loss_6": 3.487496542930603, "epoch": 0.504, "grad_norm": 1232.0, "kl_loss_12": 358.4159912109375, "kl_loss_17": 92.68148765563964, "kl_loss_3": 2011.387628173828, "kl_loss_6": 1162.4315704345704, "learning_rate": 0.0005015866602934111, "loss": 893.0471, "step": 5040 }, { "ce_loss_12": 3.1251502990722657, "ce_loss_17": 2.9951428532600404, "ce_loss_23": 2.943488895893097, "ce_loss_3": 3.918997848033905, "ce_loss_6": 3.479409730434418, "epoch": 0.505, "grad_norm": 1344.0, "kl_loss_12": 379.06178131103513, "kl_loss_17": 98.6830223083496, "kl_loss_3": 2080.636865234375, "kl_loss_6": 1195.0086212158203, "learning_rate": 0.0005, "loss": 921.599, "step": 5050 }, { "ce_loss_12": 3.1191251039505006, "ce_loss_17": 2.9923450350761414, "ce_loss_23": 2.94129501581192, "ce_loss_3": 3.8779847741127016, "ce_loss_6": 3.4552345752716063, "epoch": 0.506, "grad_norm": 1328.0, "kl_loss_12": 367.75157928466797, "kl_loss_17": 96.9900463104248, "kl_loss_3": 2014.9070922851563, "kl_loss_6": 1160.6669158935547, "learning_rate": 0.0004984133397065889, "loss": 897.3158, "step": 5060 }, { "ce_loss_12": 3.1227738738059996, "ce_loss_17": 2.994934821128845, "ce_loss_23": 2.941101610660553, "ce_loss_3": 3.906764030456543, "ce_loss_6": 3.4776443004608155, "epoch": 0.507, "grad_norm": 900.0, "kl_loss_12": 364.47747497558595, "kl_loss_17": 94.02899131774902, "kl_loss_3": 2045.6871948242188, "kl_loss_6": 1174.2436950683593, "learning_rate": 0.0004968266953908291, "loss": 902.484, "step": 5070 }, { "ce_loss_12": 3.1509108543395996, "ce_loss_17": 3.033639430999756, "ce_loss_23": 2.9840845227241517, "ce_loss_3": 3.928315889835358, "ce_loss_6": 3.496352505683899, "epoch": 0.508, "grad_norm": 1320.0, "kl_loss_12": 357.41788177490236, "kl_loss_17": 91.3696117401123, "kl_loss_3": 2039.4884704589845, "kl_loss_6": 1157.7149505615234, "learning_rate": 0.0004952400830302117, "loss": 910.76, "step": 5080 }, { "ce_loss_12": 3.091415858268738, "ce_loss_17": 2.965611445903778, "ce_loss_23": 2.913990008831024, "ce_loss_3": 3.8812857627868653, "ce_loss_6": 3.446223962306976, "epoch": 0.509, "grad_norm": 952.0, "kl_loss_12": 371.3483428955078, "kl_loss_17": 94.62365684509277, "kl_loss_3": 2075.6438903808594, "kl_loss_6": 1190.573193359375, "learning_rate": 0.0004936535186019053, "loss": 913.7413, "step": 5090 }, { "ce_loss_12": 3.1742780089378355, "ce_loss_17": 3.0596349954605104, "ce_loss_23": 3.0131367444992065, "ce_loss_3": 3.909999895095825, "ce_loss_6": 3.497714567184448, "epoch": 0.51, "grad_norm": 680.0, "kl_loss_12": 346.65772705078126, "kl_loss_17": 89.82437286376953, "kl_loss_3": 1948.2659973144532, "kl_loss_6": 1110.781576538086, "learning_rate": 0.000492067018082596, "loss": 888.6368, "step": 5100 }, { "ce_loss_12": 3.125352656841278, "ce_loss_17": 2.9971254229545594, "ce_loss_23": 2.9447287559509276, "ce_loss_3": 3.9266711831092835, "ce_loss_6": 3.488107669353485, "epoch": 0.511, "grad_norm": 1128.0, "kl_loss_12": 373.9530364990234, "kl_loss_17": 95.72911529541015, "kl_loss_3": 2099.6183837890626, "kl_loss_6": 1202.3802612304687, "learning_rate": 0.0004904805974483267, "loss": 945.6771, "step": 5110 }, { "ce_loss_12": 3.238801729679108, "ce_loss_17": 3.1054012417793273, "ce_loss_23": 3.050589954853058, "ce_loss_3": 4.021122086048126, "ce_loss_6": 3.5947796225547792, "epoch": 0.512, "grad_norm": 1176.0, "kl_loss_12": 387.00370330810546, "kl_loss_17": 100.9587459564209, "kl_loss_3": 2088.327673339844, "kl_loss_6": 1220.1336486816406, "learning_rate": 0.0004888942726743353, "loss": 958.7537, "step": 5120 }, { "ce_loss_12": 3.107498383522034, "ce_loss_17": 2.981675314903259, "ce_loss_23": 2.9318625807762144, "ce_loss_3": 3.886816442012787, "ce_loss_6": 3.460335433483124, "epoch": 0.513, "grad_norm": 1008.0, "kl_loss_12": 368.3828353881836, "kl_loss_17": 94.39456787109376, "kl_loss_3": 2080.064324951172, "kl_loss_6": 1183.9714385986329, "learning_rate": 0.0004873080597348947, "loss": 928.1082, "step": 5130 }, { "ce_loss_12": 3.0038116216659545, "ce_loss_17": 2.8782567024230956, "ce_loss_23": 2.8273938059806825, "ce_loss_3": 3.823837125301361, "ce_loss_6": 3.3672744631767273, "epoch": 0.514, "grad_norm": 1128.0, "kl_loss_12": 364.96567687988284, "kl_loss_17": 92.57419853210449, "kl_loss_3": 2139.435168457031, "kl_loss_6": 1213.2633575439454, "learning_rate": 0.0004857219746031519, "loss": 928.2652, "step": 5140 }, { "ce_loss_12": 3.161403167247772, "ce_loss_17": 3.0452697038650514, "ce_loss_23": 2.992450773715973, "ce_loss_3": 3.908959686756134, "ce_loss_6": 3.492176127433777, "epoch": 0.515, "grad_norm": 920.0, "kl_loss_12": 359.31848602294923, "kl_loss_17": 93.00241546630859, "kl_loss_3": 1990.7133911132812, "kl_loss_6": 1138.233218383789, "learning_rate": 0.0004841360332509663, "loss": 903.4719, "step": 5150 }, { "ce_loss_12": 3.1129663228988647, "ce_loss_17": 2.9991484761238096, "ce_loss_23": 2.9499995946884154, "ce_loss_3": 3.863976776599884, "ce_loss_6": 3.447406566143036, "epoch": 0.516, "grad_norm": 936.0, "kl_loss_12": 350.9371063232422, "kl_loss_17": 89.56887741088867, "kl_loss_3": 1980.1753356933593, "kl_loss_6": 1128.3441619873047, "learning_rate": 0.0004825502516487497, "loss": 868.0496, "step": 5160 }, { "ce_loss_12": 3.082901954650879, "ce_loss_17": 2.963200068473816, "ce_loss_23": 2.9151252150535583, "ce_loss_3": 3.8651347398757934, "ce_loss_6": 3.434497356414795, "epoch": 0.517, "grad_norm": 1264.0, "kl_loss_12": 364.8398162841797, "kl_loss_17": 92.54983940124512, "kl_loss_3": 2064.647216796875, "kl_loss_6": 1177.4377349853517, "learning_rate": 0.00048096464576530507, "loss": 924.965, "step": 5170 }, { "ce_loss_12": 3.1856318593025206, "ce_loss_17": 3.0665177941322326, "ce_loss_23": 3.0153189420700075, "ce_loss_3": 3.9054733991622923, "ce_loss_6": 3.5001368641853334, "epoch": 0.518, "grad_norm": 872.0, "kl_loss_12": 352.8712661743164, "kl_loss_17": 93.02919006347656, "kl_loss_3": 1937.8787841796875, "kl_loss_6": 1115.0434326171876, "learning_rate": 0.00047937923156766646, "loss": 884.1963, "step": 5180 }, { "ce_loss_12": 3.2206114172935485, "ce_loss_17": 3.107780420780182, "ce_loss_23": 3.057665026187897, "ce_loss_3": 3.9358922600746156, "ce_loss_6": 3.5398075461387633, "epoch": 0.519, "grad_norm": 928.0, "kl_loss_12": 355.00037841796876, "kl_loss_17": 91.45009002685546, "kl_loss_3": 1950.4388427734375, "kl_loss_6": 1120.3299621582032, "learning_rate": 0.00047779402502093696, "loss": 888.757, "step": 5190 }, { "ce_loss_12": 3.19130482673645, "ce_loss_17": 3.0732568383216856, "ce_loss_23": 3.023844051361084, "ce_loss_3": 3.9327818512916566, "ce_loss_6": 3.529018151760101, "epoch": 0.52, "grad_norm": 976.0, "kl_loss_12": 357.42832336425784, "kl_loss_17": 92.59470291137696, "kl_loss_3": 1972.9936462402343, "kl_loss_6": 1136.743600463867, "learning_rate": 0.0004762090420881289, "loss": 900.1426, "step": 5200 }, { "ce_loss_12": 3.107156443595886, "ce_loss_17": 2.9926238656044006, "ce_loss_23": 2.942973482608795, "ce_loss_3": 3.8481027603149416, "ce_loss_6": 3.4392590045928957, "epoch": 0.521, "grad_norm": 904.0, "kl_loss_12": 353.58110961914065, "kl_loss_17": 92.02993736267089, "kl_loss_3": 1987.946844482422, "kl_loss_6": 1144.1151733398438, "learning_rate": 0.00047462429873000296, "loss": 883.161, "step": 5210 }, { "ce_loss_12": 3.1907013177871706, "ce_loss_17": 3.072627866268158, "ce_loss_23": 3.019580602645874, "ce_loss_3": 3.920751178264618, "ce_loss_6": 3.5119782209396364, "epoch": 0.522, "grad_norm": 1280.0, "kl_loss_12": 359.06115875244143, "kl_loss_17": 94.30262756347656, "kl_loss_3": 1971.3868774414063, "kl_loss_6": 1127.5974426269531, "learning_rate": 0.0004730398109049071, "loss": 888.6526, "step": 5220 }, { "ce_loss_12": 3.1245632767677307, "ce_loss_17": 2.9969809293746947, "ce_loss_23": 2.9452990651130677, "ce_loss_3": 3.9162464261054994, "ce_loss_6": 3.4832543969154357, "epoch": 0.523, "grad_norm": 1104.0, "kl_loss_12": 372.46168823242186, "kl_loss_17": 95.79502906799317, "kl_loss_3": 2096.8451416015623, "kl_loss_6": 1210.0787658691406, "learning_rate": 0.000471455594568616, "loss": 919.6003, "step": 5230 }, { "ce_loss_12": 3.182738959789276, "ce_loss_17": 3.0637787580490112, "ce_loss_23": 3.0149332642555238, "ce_loss_3": 3.9052466630935667, "ce_loss_6": 3.50392689704895, "epoch": 0.524, "grad_norm": 960.0, "kl_loss_12": 356.23197479248046, "kl_loss_17": 92.7229564666748, "kl_loss_3": 1947.691229248047, "kl_loss_6": 1117.3941131591796, "learning_rate": 0.00046987166567417086, "loss": 896.7503, "step": 5240 }, { "ce_loss_12": 3.1106405377388002, "ce_loss_17": 2.9953038454055787, "ce_loss_23": 2.945481073856354, "ce_loss_3": 3.8631661891937257, "ce_loss_6": 3.451376569271088, "epoch": 0.525, "grad_norm": 924.0, "kl_loss_12": 354.53553771972656, "kl_loss_17": 90.6540023803711, "kl_loss_3": 1989.9297302246093, "kl_loss_6": 1144.197021484375, "learning_rate": 0.00046828804017171776, "loss": 872.9939, "step": 5250 }, { "ce_loss_12": 3.1602378487586975, "ce_loss_17": 3.0360376477241515, "ce_loss_23": 2.982585108280182, "ce_loss_3": 3.9386395692825316, "ce_loss_6": 3.5046775221824644, "epoch": 0.526, "grad_norm": 896.0, "kl_loss_12": 362.33819274902345, "kl_loss_17": 94.27711219787598, "kl_loss_3": 2038.4729858398437, "kl_loss_6": 1152.3221740722656, "learning_rate": 0.00046670473400834805, "loss": 916.7203, "step": 5260 }, { "ce_loss_12": 3.089023697376251, "ce_loss_17": 2.9738704323768617, "ce_loss_23": 2.9252676248550413, "ce_loss_3": 3.8307597994804383, "ce_loss_6": 3.415070629119873, "epoch": 0.527, "grad_norm": 1144.0, "kl_loss_12": 348.49229125976564, "kl_loss_17": 89.61092948913574, "kl_loss_3": 1966.6946228027343, "kl_loss_6": 1126.227001953125, "learning_rate": 0.00046512176312793734, "loss": 918.152, "step": 5270 }, { "ce_loss_12": 3.0861546993255615, "ce_loss_17": 2.9649991750717164, "ce_loss_23": 2.9146908640861513, "ce_loss_3": 3.834387981891632, "ce_loss_6": 3.420784020423889, "epoch": 0.528, "grad_norm": 968.0, "kl_loss_12": 354.7429397583008, "kl_loss_17": 91.66234436035157, "kl_loss_3": 1994.0042297363282, "kl_loss_6": 1143.562939453125, "learning_rate": 0.00046353914347098467, "loss": 904.6952, "step": 5280 }, { "ce_loss_12": 3.188432586193085, "ce_loss_17": 3.0671284794807434, "ce_loss_23": 3.0156295895576477, "ce_loss_3": 3.933156156539917, "ce_loss_6": 3.518996202945709, "epoch": 0.529, "grad_norm": 1272.0, "kl_loss_12": 351.3106292724609, "kl_loss_17": 91.73496208190917, "kl_loss_3": 1971.3292175292968, "kl_loss_6": 1125.4321075439452, "learning_rate": 0.0004619568909744524, "loss": 900.5209, "step": 5290 }, { "ce_loss_12": 3.18534996509552, "ce_loss_17": 3.066155529022217, "ce_loss_23": 3.015329587459564, "ce_loss_3": 3.9257039070129394, "ce_loss_6": 3.5112979888916014, "epoch": 0.53, "grad_norm": 1120.0, "kl_loss_12": 355.6421432495117, "kl_loss_17": 91.74192085266114, "kl_loss_3": 1977.2301391601563, "kl_loss_6": 1131.9021057128907, "learning_rate": 0.00046037502157160573, "loss": 901.8163, "step": 5300 }, { "ce_loss_12": 3.0694101095199584, "ce_loss_17": 2.9440566062927247, "ce_loss_23": 2.893705356121063, "ce_loss_3": 3.827756106853485, "ce_loss_6": 3.4043239831924437, "epoch": 0.531, "grad_norm": 1104.0, "kl_loss_12": 363.43805236816405, "kl_loss_17": 93.20171585083008, "kl_loss_3": 2020.3906494140624, "kl_loss_6": 1151.7597961425781, "learning_rate": 0.00045879355119185207, "loss": 907.058, "step": 5310 }, { "ce_loss_12": 3.1483601093292237, "ce_loss_17": 3.0286761045455934, "ce_loss_23": 2.9779433488845823, "ce_loss_3": 3.91595743894577, "ce_loss_6": 3.4881858229637146, "epoch": 0.532, "grad_norm": 1760.0, "kl_loss_12": 366.58548431396486, "kl_loss_17": 95.05085411071778, "kl_loss_3": 2042.227117919922, "kl_loss_6": 1169.8006713867187, "learning_rate": 0.0004572124957605803, "loss": 922.2746, "step": 5320 }, { "ce_loss_12": 3.1587769985198975, "ce_loss_17": 3.033687078952789, "ce_loss_23": 2.9832841396331786, "ce_loss_3": 3.9054378986358644, "ce_loss_6": 3.4926193952560425, "epoch": 0.533, "grad_norm": 936.0, "kl_loss_12": 361.675846862793, "kl_loss_17": 92.17733879089356, "kl_loss_3": 1999.2652709960937, "kl_loss_6": 1142.4123748779298, "learning_rate": 0.00045563187119900103, "loss": 892.2507, "step": 5330 }, { "ce_loss_12": 3.011014699935913, "ce_loss_17": 2.887872743606567, "ce_loss_23": 2.8407442569732666, "ce_loss_3": 3.7862492918968202, "ce_loss_6": 3.3594083547592164, "epoch": 0.534, "grad_norm": 1136.0, "kl_loss_12": 360.4061248779297, "kl_loss_17": 91.58968658447266, "kl_loss_3": 2058.1018188476564, "kl_loss_6": 1166.63935546875, "learning_rate": 0.00045405169342398633, "loss": 914.1059, "step": 5340 }, { "ce_loss_12": 3.1034294724464417, "ce_loss_17": 2.979544150829315, "ce_loss_23": 2.928696060180664, "ce_loss_3": 3.8748332500457763, "ce_loss_6": 3.445829749107361, "epoch": 0.535, "grad_norm": 1136.0, "kl_loss_12": 363.433837890625, "kl_loss_17": 94.02782135009765, "kl_loss_3": 2045.0534240722657, "kl_loss_6": 1166.5263275146485, "learning_rate": 0.0004524719783479088, "loss": 894.2564, "step": 5350 }, { "ce_loss_12": 3.063448226451874, "ce_loss_17": 2.9326701164245605, "ce_loss_23": 2.881749415397644, "ce_loss_3": 3.8512691020965577, "ce_loss_6": 3.412831723690033, "epoch": 0.536, "grad_norm": 1336.0, "kl_loss_12": 367.49913635253904, "kl_loss_17": 93.98178405761719, "kl_loss_3": 2083.023223876953, "kl_loss_6": 1178.4531219482421, "learning_rate": 0.00045089274187848144, "loss": 901.6736, "step": 5360 }, { "ce_loss_12": 3.166870927810669, "ce_loss_17": 3.049614202976227, "ce_loss_23": 3.0012126564979553, "ce_loss_3": 3.898930561542511, "ce_loss_6": 3.487534236907959, "epoch": 0.537, "grad_norm": 1352.0, "kl_loss_12": 352.57630462646483, "kl_loss_17": 91.50114479064942, "kl_loss_3": 1975.2602416992188, "kl_loss_6": 1119.994873046875, "learning_rate": 0.00044931399991859835, "loss": 886.5867, "step": 5370 }, { "ce_loss_12": 3.029424798488617, "ce_loss_17": 2.9093127846717834, "ce_loss_23": 2.859842073917389, "ce_loss_3": 3.7813164949417115, "ce_loss_6": 3.365421676635742, "epoch": 0.538, "grad_norm": 1224.0, "kl_loss_12": 357.1093246459961, "kl_loss_17": 90.75511360168457, "kl_loss_3": 2008.6002685546875, "kl_loss_6": 1151.0897399902344, "learning_rate": 0.00044773576836617336, "loss": 891.3827, "step": 5380 }, { "ce_loss_12": 3.126391017436981, "ce_loss_17": 3.002385115623474, "ce_loss_23": 2.950945734977722, "ce_loss_3": 3.9017954230308534, "ce_loss_6": 3.4763889789581297, "epoch": 0.539, "grad_norm": 1128.0, "kl_loss_12": 366.2182907104492, "kl_loss_17": 93.43424530029297, "kl_loss_3": 2048.6008544921874, "kl_loss_6": 1176.9113891601562, "learning_rate": 0.00044615806311398056, "loss": 925.7355, "step": 5390 }, { "ce_loss_12": 3.190409755706787, "ce_loss_17": 3.0776893973350523, "ce_loss_23": 3.0306317090988157, "ce_loss_3": 3.882905352115631, "ce_loss_6": 3.4969664454460143, "epoch": 0.54, "grad_norm": 852.0, "kl_loss_12": 345.8917999267578, "kl_loss_17": 88.83585586547852, "kl_loss_3": 1907.6739685058594, "kl_loss_6": 1096.0950744628906, "learning_rate": 0.00044458090004949454, "loss": 895.4913, "step": 5400 }, { "ce_loss_12": 3.075972247123718, "ce_loss_17": 2.9414425611495973, "ce_loss_23": 2.8867051243782043, "ce_loss_3": 3.8747236490249635, "ce_loss_6": 3.4382914423942568, "epoch": 0.541, "grad_norm": 1032.0, "kl_loss_12": 375.124333190918, "kl_loss_17": 95.7294204711914, "kl_loss_3": 2125.295440673828, "kl_loss_6": 1218.7462677001954, "learning_rate": 0.0004430042950547297, "loss": 912.583, "step": 5410 }, { "ce_loss_12": 3.1478776812553404, "ce_loss_17": 3.0218735575675963, "ce_loss_23": 2.9717148423194883, "ce_loss_3": 3.9084969997406005, "ce_loss_6": 3.4848543286323546, "epoch": 0.542, "grad_norm": 1008.0, "kl_loss_12": 369.2810485839844, "kl_loss_17": 96.3768123626709, "kl_loss_3": 2033.286114501953, "kl_loss_6": 1161.711181640625, "learning_rate": 0.0004414282640060809, "loss": 906.5489, "step": 5420 }, { "ce_loss_12": 3.229024577140808, "ce_loss_17": 3.109649395942688, "ce_loss_23": 3.057064139842987, "ce_loss_3": 3.9660748839378357, "ce_loss_6": 3.563118839263916, "epoch": 0.543, "grad_norm": 1216.0, "kl_loss_12": 354.3711135864258, "kl_loss_17": 92.91755104064941, "kl_loss_3": 1950.7111328125, "kl_loss_6": 1131.4530395507813, "learning_rate": 0.0004398528227741633, "loss": 886.8607, "step": 5430 }, { "ce_loss_12": 3.112036645412445, "ce_loss_17": 2.9836097359657288, "ce_loss_23": 2.9331039428710937, "ce_loss_3": 3.8796893119812013, "ce_loss_6": 3.4567083358764648, "epoch": 0.544, "grad_norm": 1360.0, "kl_loss_12": 360.694091796875, "kl_loss_17": 94.16159896850586, "kl_loss_3": 2016.5906494140625, "kl_loss_6": 1165.4428131103516, "learning_rate": 0.00043827798722365264, "loss": 913.6246, "step": 5440 }, { "ce_loss_12": 3.2171663522720335, "ce_loss_17": 3.099006462097168, "ce_loss_23": 3.050920236110687, "ce_loss_3": 3.942546272277832, "ce_loss_6": 3.540075421333313, "epoch": 0.545, "grad_norm": 1320.0, "kl_loss_12": 358.2791030883789, "kl_loss_17": 93.43666496276856, "kl_loss_3": 1943.8274780273437, "kl_loss_6": 1128.9095367431642, "learning_rate": 0.00043670377321312535, "loss": 880.9171, "step": 5450 }, { "ce_loss_12": 3.2231846690177917, "ce_loss_17": 3.108734202384949, "ce_loss_23": 3.0587079524993896, "ce_loss_3": 3.938978838920593, "ce_loss_6": 3.542694401741028, "epoch": 0.546, "grad_norm": 1184.0, "kl_loss_12": 357.0698211669922, "kl_loss_17": 92.49217147827149, "kl_loss_3": 1930.8526916503906, "kl_loss_6": 1119.0527770996093, "learning_rate": 0.0004351301965948991, "loss": 899.8764, "step": 5460 }, { "ce_loss_12": 3.1383449912071226, "ce_loss_17": 3.0184853315353393, "ce_loss_23": 2.969007098674774, "ce_loss_3": 3.85242840051651, "ce_loss_6": 3.4543243527412413, "epoch": 0.547, "grad_norm": 944.0, "kl_loss_12": 350.8236831665039, "kl_loss_17": 90.69459686279296, "kl_loss_3": 1917.282891845703, "kl_loss_6": 1102.7481719970704, "learning_rate": 0.000433557273214873, "loss": 882.44, "step": 5470 }, { "ce_loss_12": 3.1253939270973206, "ce_loss_17": 3.000547182559967, "ce_loss_23": 2.9495556354522705, "ce_loss_3": 3.8606873035430906, "ce_loss_6": 3.4494663953781126, "epoch": 0.548, "grad_norm": 1256.0, "kl_loss_12": 356.8171081542969, "kl_loss_17": 92.74531326293945, "kl_loss_3": 1953.0103088378905, "kl_loss_6": 1111.0361450195312, "learning_rate": 0.000431985018912368, "loss": 875.3377, "step": 5480 }, { "ce_loss_12": 3.1010571360588073, "ce_loss_17": 2.97558354139328, "ce_loss_23": 2.925135314464569, "ce_loss_3": 3.869184398651123, "ce_loss_6": 3.4456772685050963, "epoch": 0.549, "grad_norm": 1128.0, "kl_loss_12": 365.4259338378906, "kl_loss_17": 93.24345054626465, "kl_loss_3": 2039.9272033691407, "kl_loss_6": 1174.1738891601562, "learning_rate": 0.0004304134495199674, "loss": 888.229, "step": 5490 }, { "ce_loss_12": 3.135417914390564, "ce_loss_17": 3.0078897953033445, "ce_loss_23": 2.957447588443756, "ce_loss_3": 3.89321106672287, "ce_loss_6": 3.4768668532371523, "epoch": 0.55, "grad_norm": 1112.0, "kl_loss_12": 370.8565933227539, "kl_loss_17": 93.26599197387695, "kl_loss_3": 2049.9021118164064, "kl_loss_6": 1183.0644409179688, "learning_rate": 0.0004288425808633575, "loss": 902.8559, "step": 5500 }, { "ce_loss_12": 3.1100064396858214, "ce_loss_17": 2.9872559189796446, "ce_loss_23": 2.9383681416511536, "ce_loss_3": 3.861984574794769, "ce_loss_6": 3.4384658455848696, "epoch": 0.551, "grad_norm": 1352.0, "kl_loss_12": 354.1197265625, "kl_loss_17": 90.87067832946778, "kl_loss_3": 2005.800714111328, "kl_loss_6": 1139.4069519042969, "learning_rate": 0.0004272724287611684, "loss": 899.1554, "step": 5510 }, { "ce_loss_12": 3.0869866490364073, "ce_loss_17": 2.9635907649993896, "ce_loss_23": 2.911494505405426, "ce_loss_3": 3.8518809914588927, "ce_loss_6": 3.418315315246582, "epoch": 0.552, "grad_norm": 980.0, "kl_loss_12": 359.54785308837893, "kl_loss_17": 92.89949645996094, "kl_loss_3": 2041.656689453125, "kl_loss_6": 1154.5992065429687, "learning_rate": 0.00042570300902481425, "loss": 903.5547, "step": 5520 }, { "ce_loss_12": 3.111093783378601, "ce_loss_17": 2.9967490673065185, "ce_loss_23": 2.9479554176330565, "ce_loss_3": 3.8493823170661927, "ce_loss_6": 3.4331010580062866, "epoch": 0.553, "grad_norm": 1176.0, "kl_loss_12": 350.27253875732424, "kl_loss_17": 90.20486984252929, "kl_loss_3": 1977.1471374511718, "kl_loss_6": 1123.4446014404298, "learning_rate": 0.00042413433745833423, "loss": 887.315, "step": 5530 }, { "ce_loss_12": 3.1101133584976197, "ce_loss_17": 2.987442672252655, "ce_loss_23": 2.9393104672431947, "ce_loss_3": 3.866848182678223, "ce_loss_6": 3.4454284429550173, "epoch": 0.554, "grad_norm": 1040.0, "kl_loss_12": 356.58823547363284, "kl_loss_17": 91.02802352905273, "kl_loss_3": 2003.3032836914062, "kl_loss_6": 1143.326953125, "learning_rate": 0.0004225664298582339, "loss": 874.8443, "step": 5540 }, { "ce_loss_12": 3.1919366121292114, "ce_loss_17": 3.0718690514564515, "ce_loss_23": 3.021695065498352, "ce_loss_3": 3.918123209476471, "ce_loss_6": 3.508436751365662, "epoch": 0.555, "grad_norm": 1176.0, "kl_loss_12": 349.13452911376953, "kl_loss_17": 90.69793014526367, "kl_loss_3": 1941.2161254882812, "kl_loss_6": 1109.2159545898437, "learning_rate": 0.000420999302013325, "loss": 875.6727, "step": 5550 }, { "ce_loss_12": 3.0942121744155884, "ce_loss_17": 2.96941956281662, "ce_loss_23": 2.9174832582473753, "ce_loss_3": 3.8882731795310974, "ce_loss_6": 3.4454726815223693, "epoch": 0.556, "grad_norm": 1352.0, "kl_loss_12": 368.905517578125, "kl_loss_17": 95.6106086730957, "kl_loss_3": 2071.4515686035156, "kl_loss_6": 1175.006658935547, "learning_rate": 0.000419432969704568, "loss": 896.6355, "step": 5560 }, { "ce_loss_12": 3.1318360328674317, "ce_loss_17": 3.011006569862366, "ce_loss_23": 2.9601503133773805, "ce_loss_3": 3.866017699241638, "ce_loss_6": 3.4546066999435423, "epoch": 0.557, "grad_norm": 852.0, "kl_loss_12": 353.10416107177736, "kl_loss_17": 91.07056617736816, "kl_loss_3": 1960.4804931640624, "kl_loss_6": 1120.8563903808595, "learning_rate": 0.00041786744870491154, "loss": 909.5014, "step": 5570 }, { "ce_loss_12": 3.0762211441993714, "ce_loss_17": 2.9487279295921325, "ce_loss_23": 2.897049582004547, "ce_loss_3": 3.8321069359779356, "ce_loss_6": 3.4129802227020263, "epoch": 0.558, "grad_norm": 972.0, "kl_loss_12": 362.3402847290039, "kl_loss_17": 93.73015022277832, "kl_loss_3": 2016.0342102050781, "kl_loss_6": 1164.3839385986328, "learning_rate": 0.0004163027547791347, "loss": 899.6561, "step": 5580 }, { "ce_loss_12": 3.063253676891327, "ce_loss_17": 2.9407611727714538, "ce_loss_23": 2.8906123280525207, "ce_loss_3": 3.8545337080955506, "ce_loss_6": 3.412693679332733, "epoch": 0.559, "grad_norm": 1240.0, "kl_loss_12": 360.7710876464844, "kl_loss_17": 92.40084419250488, "kl_loss_3": 2056.2872314453125, "kl_loss_6": 1158.0092742919921, "learning_rate": 0.0004147389036836881, "loss": 903.7966, "step": 5590 }, { "ce_loss_12": 3.1042871475219727, "ce_loss_17": 2.980177104473114, "ce_loss_23": 2.93089816570282, "ce_loss_3": 3.870959210395813, "ce_loss_6": 3.4439244508743285, "epoch": 0.56, "grad_norm": 1208.0, "kl_loss_12": 358.4499801635742, "kl_loss_17": 92.21180305480956, "kl_loss_3": 2018.989453125, "kl_loss_6": 1156.2593170166015, "learning_rate": 0.00041317591116653486, "loss": 913.3369, "step": 5600 }, { "ce_loss_12": 3.1313674330711363, "ce_loss_17": 3.0127056002616883, "ce_loss_23": 2.9640170097351075, "ce_loss_3": 3.897299957275391, "ce_loss_6": 3.473569095134735, "epoch": 0.561, "grad_norm": 912.0, "kl_loss_12": 366.00707244873047, "kl_loss_17": 93.89572296142578, "kl_loss_3": 2028.6972534179688, "kl_loss_6": 1155.3702209472656, "learning_rate": 0.0004116137929669921, "loss": 891.6586, "step": 5610 }, { "ce_loss_12": 3.136381483078003, "ce_loss_17": 3.0133532643318177, "ce_loss_23": 2.963248634338379, "ce_loss_3": 3.884948122501373, "ce_loss_6": 3.463275647163391, "epoch": 0.562, "grad_norm": 1320.0, "kl_loss_12": 352.8741882324219, "kl_loss_17": 89.78136978149413, "kl_loss_3": 1996.60400390625, "kl_loss_6": 1136.3984741210938, "learning_rate": 0.00041005256481557305, "loss": 880.7673, "step": 5620 }, { "ce_loss_12": 3.2142569780349732, "ce_loss_17": 3.0997274041175844, "ce_loss_23": 3.051528239250183, "ce_loss_3": 3.918693208694458, "ce_loss_6": 3.5266610264778135, "epoch": 0.563, "grad_norm": 1012.0, "kl_loss_12": 343.21154327392577, "kl_loss_17": 88.66844177246094, "kl_loss_3": 1894.733349609375, "kl_loss_6": 1093.9771453857422, "learning_rate": 0.00040849224243382767, "loss": 871.3256, "step": 5630 }, { "ce_loss_12": 3.0870473980903625, "ce_loss_17": 2.965904676914215, "ce_loss_23": 2.9143540382385256, "ce_loss_3": 3.8411280512809753, "ce_loss_6": 3.418475079536438, "epoch": 0.564, "grad_norm": 1216.0, "kl_loss_12": 355.9902709960937, "kl_loss_17": 91.11196594238281, "kl_loss_3": 2005.0414489746095, "kl_loss_6": 1145.2955535888673, "learning_rate": 0.000406932841534185, "loss": 880.5096, "step": 5640 }, { "ce_loss_12": 3.051490914821625, "ce_loss_17": 2.9329665184020994, "ce_loss_23": 2.8812998056411745, "ce_loss_3": 3.816966009140015, "ce_loss_6": 3.391273760795593, "epoch": 0.565, "grad_norm": 1120.0, "kl_loss_12": 360.7217727661133, "kl_loss_17": 93.10445556640624, "kl_loss_3": 2024.0443725585938, "kl_loss_6": 1157.3559936523438, "learning_rate": 0.0004053743778197951, "loss": 920.442, "step": 5650 }, { "ce_loss_12": 3.1533095955848696, "ce_loss_17": 3.0284717679023743, "ce_loss_23": 2.9783113837242126, "ce_loss_3": 3.901489055156708, "ce_loss_6": 3.4922434449195863, "epoch": 0.566, "grad_norm": 1152.0, "kl_loss_12": 365.6111724853516, "kl_loss_17": 94.32415237426758, "kl_loss_3": 1988.0268127441407, "kl_loss_6": 1152.2390167236329, "learning_rate": 0.0004038168669843697, "loss": 908.1027, "step": 5660 }, { "ce_loss_12": 3.1055307507514955, "ce_loss_17": 2.988659369945526, "ce_loss_23": 2.9381643414497374, "ce_loss_3": 3.8319229960441588, "ce_loss_6": 3.4249443411827087, "epoch": 0.567, "grad_norm": 1088.0, "kl_loss_12": 349.71607971191406, "kl_loss_17": 90.56960487365723, "kl_loss_3": 1944.890283203125, "kl_loss_6": 1114.1278930664062, "learning_rate": 0.000402260324712026, "loss": 898.1495, "step": 5670 }, { "ce_loss_12": 3.14718611240387, "ce_loss_17": 3.0279541492462156, "ce_loss_23": 2.979733180999756, "ce_loss_3": 3.918551576137543, "ce_loss_6": 3.4888805150985718, "epoch": 0.568, "grad_norm": 1144.0, "kl_loss_12": 354.7816497802734, "kl_loss_17": 89.66108932495118, "kl_loss_3": 2034.6115295410157, "kl_loss_6": 1157.8374237060548, "learning_rate": 0.00040070476667712743, "loss": 888.1705, "step": 5680 }, { "ce_loss_12": 3.1779404878616333, "ce_loss_17": 3.058204233646393, "ce_loss_23": 3.006940519809723, "ce_loss_3": 3.9126431226730345, "ce_loss_6": 3.4999541878700255, "epoch": 0.569, "grad_norm": 868.0, "kl_loss_12": 354.8095184326172, "kl_loss_17": 91.35061798095703, "kl_loss_3": 1974.644305419922, "kl_loss_6": 1131.2801147460937, "learning_rate": 0.0003991502085441259, "loss": 894.1547, "step": 5690 }, { "ce_loss_12": 3.2082398533821106, "ce_loss_17": 3.09513920545578, "ce_loss_23": 3.046538519859314, "ce_loss_3": 3.9162412881851196, "ce_loss_6": 3.5189447522163393, "epoch": 0.57, "grad_norm": 1004.0, "kl_loss_12": 345.07321014404295, "kl_loss_17": 89.30585174560547, "kl_loss_3": 1900.429345703125, "kl_loss_6": 1088.3466583251952, "learning_rate": 0.0003975966659674047, "loss": 882.0586, "step": 5700 }, { "ce_loss_12": 3.188878560066223, "ce_loss_17": 3.067433404922485, "ce_loss_23": 3.0170228600502016, "ce_loss_3": 3.924565625190735, "ce_loss_6": 3.514917719364166, "epoch": 0.571, "grad_norm": 1376.0, "kl_loss_12": 353.6061096191406, "kl_loss_17": 91.92852973937988, "kl_loss_3": 1958.1591064453125, "kl_loss_6": 1125.2135955810547, "learning_rate": 0.0003960441545911204, "loss": 877.4781, "step": 5710 }, { "ce_loss_12": 3.1661776185035704, "ce_loss_17": 3.0493478536605836, "ce_loss_23": 2.999690568447113, "ce_loss_3": 3.904485309123993, "ce_loss_6": 3.4924548625946046, "epoch": 0.572, "grad_norm": 1344.0, "kl_loss_12": 354.6435806274414, "kl_loss_17": 90.59515380859375, "kl_loss_3": 1981.4042236328125, "kl_loss_6": 1136.697817993164, "learning_rate": 0.0003944926900490452, "loss": 883.3597, "step": 5720 }, { "ce_loss_12": 3.1009980201721192, "ce_loss_17": 2.973039472103119, "ce_loss_23": 2.9208882093429565, "ce_loss_3": 3.871054542064667, "ce_loss_6": 3.4438048243522643, "epoch": 0.573, "grad_norm": 1192.0, "kl_loss_12": 362.32491302490234, "kl_loss_17": 92.63758544921875, "kl_loss_3": 2032.8689392089843, "kl_loss_6": 1159.4469604492188, "learning_rate": 0.0003929422879644099, "loss": 889.1459, "step": 5730 }, { "ce_loss_12": 3.0974637866020203, "ce_loss_17": 2.983030641078949, "ce_loss_23": 2.9348677277565, "ce_loss_3": 3.8249285221099854, "ce_loss_6": 3.4108102679252625, "epoch": 0.574, "grad_norm": 1192.0, "kl_loss_12": 344.463232421875, "kl_loss_17": 89.30107688903809, "kl_loss_3": 1950.4291198730468, "kl_loss_6": 1106.0761596679688, "learning_rate": 0.0003913929639497462, "loss": 860.5516, "step": 5740 }, { "ce_loss_12": 3.0569748997688295, "ce_loss_17": 2.934243309497833, "ce_loss_23": 2.8869590044021605, "ce_loss_3": 3.823574161529541, "ce_loss_6": 3.3891812562942505, "epoch": 0.575, "grad_norm": 1128.0, "kl_loss_12": 351.39185638427733, "kl_loss_17": 89.34204635620117, "kl_loss_3": 2018.800634765625, "kl_loss_6": 1142.7618347167968, "learning_rate": 0.00038984473360672965, "loss": 879.9574, "step": 5750 }, { "ce_loss_12": 3.064702033996582, "ce_loss_17": 2.9443649649620056, "ce_loss_23": 2.8942911982536317, "ce_loss_3": 3.836893093585968, "ce_loss_6": 3.4024426221847532, "epoch": 0.576, "grad_norm": 1200.0, "kl_loss_12": 351.22936859130857, "kl_loss_17": 89.25144119262696, "kl_loss_3": 2022.8355651855468, "kl_loss_6": 1151.775244140625, "learning_rate": 0.0003882976125260229, "loss": 880.2629, "step": 5760 }, { "ce_loss_12": 3.129497694969177, "ce_loss_17": 3.010597312450409, "ce_loss_23": 2.9593307852745054, "ce_loss_3": 3.8632505893707276, "ce_loss_6": 3.454318583011627, "epoch": 0.577, "grad_norm": 1000.0, "kl_loss_12": 351.70952758789065, "kl_loss_17": 90.51013984680176, "kl_loss_3": 1959.9725219726563, "kl_loss_6": 1122.036703491211, "learning_rate": 0.00038675161628711776, "loss": 886.1361, "step": 5770 }, { "ce_loss_12": 3.1580193042755127, "ce_loss_17": 3.042417895793915, "ce_loss_23": 2.9932226300239564, "ce_loss_3": 3.894358551502228, "ce_loss_6": 3.482797610759735, "epoch": 0.578, "grad_norm": 960.0, "kl_loss_12": 350.21394805908204, "kl_loss_17": 91.14322891235352, "kl_loss_3": 1943.130877685547, "kl_loss_6": 1110.1543884277344, "learning_rate": 0.0003852067604581794, "loss": 901.2336, "step": 5780 }, { "ce_loss_12": 3.109062159061432, "ce_loss_17": 2.992610514163971, "ce_loss_23": 2.943720591068268, "ce_loss_3": 3.8661123752593993, "ce_loss_6": 3.4403353691101075, "epoch": 0.579, "grad_norm": 1096.0, "kl_loss_12": 351.67835693359376, "kl_loss_17": 89.08604125976562, "kl_loss_3": 2006.901873779297, "kl_loss_6": 1143.3357177734374, "learning_rate": 0.0003836630605958888, "loss": 884.7488, "step": 5790 }, { "ce_loss_12": 3.161776900291443, "ce_loss_17": 3.0456367135047913, "ce_loss_23": 2.998029279708862, "ce_loss_3": 3.9008978247642516, "ce_loss_6": 3.494714057445526, "epoch": 0.58, "grad_norm": 1288.0, "kl_loss_12": 353.23523712158203, "kl_loss_17": 91.2992992401123, "kl_loss_3": 1989.3494140625, "kl_loss_6": 1139.3370666503906, "learning_rate": 0.0003821205322452863, "loss": 922.5059, "step": 5800 }, { "ce_loss_12": 3.1459306120872497, "ce_loss_17": 3.0311925053596496, "ce_loss_23": 2.9824758887290956, "ce_loss_3": 3.872205066680908, "ce_loss_6": 3.459067690372467, "epoch": 0.581, "grad_norm": 1784.0, "kl_loss_12": 347.05757904052734, "kl_loss_17": 89.70176620483399, "kl_loss_3": 1963.4107666015625, "kl_loss_6": 1114.5962188720703, "learning_rate": 0.0003805791909396155, "loss": 884.1924, "step": 5810 }, { "ce_loss_12": 3.1018191814422607, "ce_loss_17": 2.9827213287353516, "ce_loss_23": 2.9357261180877687, "ce_loss_3": 3.847515308856964, "ce_loss_6": 3.431482744216919, "epoch": 0.582, "grad_norm": 1004.0, "kl_loss_12": 345.5039291381836, "kl_loss_17": 88.27181510925293, "kl_loss_3": 1973.841162109375, "kl_loss_6": 1116.8025634765625, "learning_rate": 0.0003790390522001662, "loss": 890.882, "step": 5820 }, { "ce_loss_12": 3.045158123970032, "ce_loss_17": 2.93050411939621, "ce_loss_23": 2.881169056892395, "ce_loss_3": 3.7953080892562867, "ce_loss_6": 3.3693145155906676, "epoch": 0.583, "grad_norm": 984.0, "kl_loss_12": 346.33858489990234, "kl_loss_17": 88.57795143127441, "kl_loss_3": 1998.8186462402343, "kl_loss_6": 1127.1401947021484, "learning_rate": 0.0003775001315361183, "loss": 877.8606, "step": 5830 }, { "ce_loss_12": 3.143208909034729, "ce_loss_17": 3.019788587093353, "ce_loss_23": 2.968058371543884, "ce_loss_3": 3.902362859249115, "ce_loss_6": 3.47598592042923, "epoch": 0.584, "grad_norm": 852.0, "kl_loss_12": 352.2950210571289, "kl_loss_17": 90.95420379638672, "kl_loss_3": 1999.0019226074219, "kl_loss_6": 1129.328665161133, "learning_rate": 0.0003759624444443858, "loss": 892.2592, "step": 5840 }, { "ce_loss_12": 3.1722511053085327, "ce_loss_17": 3.0544722199440004, "ce_loss_23": 3.0061080932617186, "ce_loss_3": 3.895999026298523, "ce_loss_6": 3.4860278964042664, "epoch": 0.585, "grad_norm": 996.0, "kl_loss_12": 348.66165618896486, "kl_loss_17": 89.14413490295411, "kl_loss_3": 1957.5979736328125, "kl_loss_6": 1110.2203674316406, "learning_rate": 0.00037442600640946044, "loss": 871.4254, "step": 5850 }, { "ce_loss_12": 3.134334754943848, "ce_loss_17": 3.0207717776298524, "ce_loss_23": 2.9735180854797365, "ce_loss_3": 3.863844406604767, "ce_loss_6": 3.453572678565979, "epoch": 0.586, "grad_norm": 1160.0, "kl_loss_12": 350.11250457763674, "kl_loss_17": 88.69996490478516, "kl_loss_3": 1947.98056640625, "kl_loss_6": 1116.696987915039, "learning_rate": 0.00037289083290325663, "loss": 862.4559, "step": 5860 }, { "ce_loss_12": 3.1132630348205566, "ce_loss_17": 2.996178901195526, "ce_loss_23": 2.9467347741127012, "ce_loss_3": 3.843463182449341, "ce_loss_6": 3.4376041889190674, "epoch": 0.587, "grad_norm": 996.0, "kl_loss_12": 342.12928619384763, "kl_loss_17": 89.54325790405274, "kl_loss_3": 1928.3241943359376, "kl_loss_6": 1100.103323364258, "learning_rate": 0.0003713569393849543, "loss": 870.1561, "step": 5870 }, { "ce_loss_12": 3.1706372618675234, "ce_loss_17": 3.0503373622894285, "ce_loss_23": 3.0024590611457826, "ce_loss_3": 3.9031503558158875, "ce_loss_6": 3.4925827860832213, "epoch": 0.588, "grad_norm": 1192.0, "kl_loss_12": 354.43677673339846, "kl_loss_17": 91.15572128295898, "kl_loss_3": 1967.8707458496094, "kl_loss_6": 1126.3148254394532, "learning_rate": 0.00036982434130084397, "loss": 883.001, "step": 5880 }, { "ce_loss_12": 3.0874892592430117, "ce_loss_17": 2.966780185699463, "ce_loss_23": 2.9160982012748717, "ce_loss_3": 3.820379447937012, "ce_loss_6": 3.4098856925964354, "epoch": 0.589, "grad_norm": 1224.0, "kl_loss_12": 354.72835235595704, "kl_loss_17": 91.79812850952149, "kl_loss_3": 1963.2278564453125, "kl_loss_6": 1119.7965881347657, "learning_rate": 0.00036829305408417166, "loss": 889.332, "step": 5890 }, { "ce_loss_12": 3.077235984802246, "ce_loss_17": 2.9538180470466613, "ce_loss_23": 2.9021143913269043, "ce_loss_3": 3.838461446762085, "ce_loss_6": 3.4159860849380492, "epoch": 0.59, "grad_norm": 1012.0, "kl_loss_12": 355.88343353271483, "kl_loss_17": 91.19095001220703, "kl_loss_3": 2016.5178649902343, "kl_loss_6": 1152.0101684570313, "learning_rate": 0.0003667630931549826, "loss": 892.2932, "step": 5900 }, { "ce_loss_12": 3.045335614681244, "ce_loss_17": 2.9218624353408815, "ce_loss_23": 2.872580373287201, "ce_loss_3": 3.8366833329200745, "ce_loss_6": 3.394924187660217, "epoch": 0.591, "grad_norm": 1144.0, "kl_loss_12": 360.2607162475586, "kl_loss_17": 91.28438262939453, "kl_loss_3": 2077.546520996094, "kl_loss_6": 1174.0156555175781, "learning_rate": 0.00036523447391996613, "loss": 908.0979, "step": 5910 }, { "ce_loss_12": 3.126784014701843, "ce_loss_17": 3.012990856170654, "ce_loss_23": 2.967097759246826, "ce_loss_3": 3.8611212372779846, "ce_loss_6": 3.4536351919174195, "epoch": 0.592, "grad_norm": 984.0, "kl_loss_12": 345.7523956298828, "kl_loss_17": 88.32137145996094, "kl_loss_3": 1947.22578125, "kl_loss_6": 1109.581494140625, "learning_rate": 0.00036370721177230114, "loss": 871.7516, "step": 5920 }, { "ce_loss_12": 3.1307328581809997, "ce_loss_17": 3.011709153652191, "ce_loss_23": 2.962051284313202, "ce_loss_3": 3.8877512216567993, "ce_loss_6": 3.470838952064514, "epoch": 0.593, "grad_norm": 1216.0, "kl_loss_12": 358.9762557983398, "kl_loss_17": 92.42395324707032, "kl_loss_3": 2001.0065490722657, "kl_loss_6": 1151.392578125, "learning_rate": 0.00036218132209150044, "loss": 890.91, "step": 5930 }, { "ce_loss_12": 3.0935922861099243, "ce_loss_17": 2.9630477666854858, "ce_loss_23": 2.908335840702057, "ce_loss_3": 3.875299608707428, "ce_loss_6": 3.4436821937561035, "epoch": 0.594, "grad_norm": 1064.0, "kl_loss_12": 368.01280364990237, "kl_loss_17": 95.11238098144531, "kl_loss_3": 2086.326556396484, "kl_loss_6": 1186.2103179931642, "learning_rate": 0.0003606568202432562, "loss": 908.7041, "step": 5940 }, { "ce_loss_12": 3.1535122275352476, "ce_loss_17": 3.0376246333122254, "ce_loss_23": 2.988776755332947, "ce_loss_3": 3.9159383296966555, "ce_loss_6": 3.4887673020362855, "epoch": 0.595, "grad_norm": 1352.0, "kl_loss_12": 354.77533264160155, "kl_loss_17": 91.6845832824707, "kl_loss_3": 2039.5684753417968, "kl_loss_6": 1158.846859741211, "learning_rate": 0.0003591337215792851, "loss": 885.2172, "step": 5950 }, { "ce_loss_12": 3.1813810586929323, "ce_loss_17": 3.066852557659149, "ce_loss_23": 3.0204002499580382, "ce_loss_3": 3.9011024475097655, "ce_loss_6": 3.5039899230003355, "epoch": 0.596, "grad_norm": 1232.0, "kl_loss_12": 344.7813095092773, "kl_loss_17": 88.03176994323731, "kl_loss_3": 1934.431787109375, "kl_loss_6": 1117.7573455810548, "learning_rate": 0.00035761204143717383, "loss": 882.9086, "step": 5960 }, { "ce_loss_12": 3.1446377277374267, "ce_loss_17": 3.025981771945953, "ce_loss_23": 2.9761717557907104, "ce_loss_3": 3.878723645210266, "ce_loss_6": 3.468522787094116, "epoch": 0.597, "grad_norm": 1256.0, "kl_loss_12": 353.09497833251953, "kl_loss_17": 91.71636924743652, "kl_loss_3": 1978.5257202148437, "kl_loss_6": 1137.846499633789, "learning_rate": 0.0003560917951402245, "loss": 906.2232, "step": 5970 }, { "ce_loss_12": 3.1228631258010866, "ce_loss_17": 3.007214438915253, "ce_loss_23": 2.9603391289711, "ce_loss_3": 3.8574273109436037, "ce_loss_6": 3.4532262802124025, "epoch": 0.598, "grad_norm": 1360.0, "kl_loss_12": 346.21971282958987, "kl_loss_17": 89.21910247802734, "kl_loss_3": 1960.0101745605468, "kl_loss_6": 1125.2134948730468, "learning_rate": 0.00035457299799730046, "loss": 879.1287, "step": 5980 }, { "ce_loss_12": 3.1803099870681764, "ce_loss_17": 3.0637570142745973, "ce_loss_23": 3.0157534003257753, "ce_loss_3": 3.9152540445327757, "ce_loss_6": 3.5102675199508666, "epoch": 0.599, "grad_norm": 1432.0, "kl_loss_12": 349.6127532958984, "kl_loss_17": 89.91424903869628, "kl_loss_3": 1961.0085571289062, "kl_loss_6": 1123.6217254638673, "learning_rate": 0.0003530556653026721, "loss": 887.6521, "step": 5990 }, { "ce_loss_12": 3.1075904726982118, "ce_loss_17": 2.9898393511772157, "ce_loss_23": 2.9403223633766173, "ce_loss_3": 3.8573669672012327, "ce_loss_6": 3.430279052257538, "epoch": 0.6, "grad_norm": 1856.0, "kl_loss_12": 342.2793685913086, "kl_loss_17": 89.26798706054687, "kl_loss_3": 1981.4362060546875, "kl_loss_6": 1109.620657348633, "learning_rate": 0.00035153981233586274, "loss": 887.7717, "step": 6000 }, { "ce_loss_12": 3.074912130832672, "ce_loss_17": 2.9589345335960386, "ce_loss_23": 2.9115482330322267, "ce_loss_3": 3.8282949447631838, "ce_loss_6": 3.4091708064079285, "epoch": 0.601, "grad_norm": 1168.0, "kl_loss_12": 343.47901458740233, "kl_loss_17": 87.6121109008789, "kl_loss_3": 1983.5056091308593, "kl_loss_6": 1129.1260070800781, "learning_rate": 0.00035002545436149473, "loss": 916.9419, "step": 6010 }, { "ce_loss_12": 3.092540967464447, "ce_loss_17": 2.97232928276062, "ce_loss_23": 2.9233181953430174, "ce_loss_3": 3.8545306801795958, "ce_loss_6": 3.435434603691101, "epoch": 0.602, "grad_norm": 1048.0, "kl_loss_12": 360.00843353271483, "kl_loss_17": 92.88990707397461, "kl_loss_3": 2020.6975036621093, "kl_loss_6": 1152.5501831054687, "learning_rate": 0.0003485126066291364, "loss": 883.2751, "step": 6020 }, { "ce_loss_12": 3.124794828891754, "ce_loss_17": 3.0092039346694945, "ce_loss_23": 2.9585169315338136, "ce_loss_3": 3.880570352077484, "ce_loss_6": 3.4638235807418822, "epoch": 0.603, "grad_norm": 1184.0, "kl_loss_12": 346.15485229492185, "kl_loss_17": 88.88919639587402, "kl_loss_3": 1991.0694091796875, "kl_loss_6": 1138.7969970703125, "learning_rate": 0.0003470012843731476, "loss": 891.7182, "step": 6030 }, { "ce_loss_12": 3.0740342855453493, "ce_loss_17": 2.957772696018219, "ce_loss_23": 2.9076455354690554, "ce_loss_3": 3.8349327206611634, "ce_loss_6": 3.418463349342346, "epoch": 0.604, "grad_norm": 1040.0, "kl_loss_12": 349.4450286865234, "kl_loss_17": 89.32358436584472, "kl_loss_3": 2008.395458984375, "kl_loss_6": 1149.2588104248048, "learning_rate": 0.00034549150281252633, "loss": 911.2477, "step": 6040 }, { "ce_loss_12": 3.063339948654175, "ce_loss_17": 2.9427681088447573, "ce_loss_23": 2.8925299048423767, "ce_loss_3": 3.7877164006233217, "ce_loss_6": 3.3812638998031614, "epoch": 0.605, "grad_norm": 1120.0, "kl_loss_12": 349.8405731201172, "kl_loss_17": 90.2714687347412, "kl_loss_3": 1932.9745483398438, "kl_loss_6": 1105.6616638183593, "learning_rate": 0.0003439832771507565, "loss": 873.6602, "step": 6050 }, { "ce_loss_12": 3.0673495173454284, "ce_loss_17": 2.9466548919677735, "ce_loss_23": 2.8978442192077636, "ce_loss_3": 3.8202001094818114, "ce_loss_6": 3.41012419462204, "epoch": 0.606, "grad_norm": 904.0, "kl_loss_12": 350.63096923828124, "kl_loss_17": 89.38655128479004, "kl_loss_3": 2003.3099365234375, "kl_loss_6": 1148.8616790771484, "learning_rate": 0.0003424766225756537, "loss": 880.6595, "step": 6060 }, { "ce_loss_12": 3.1238632917404177, "ce_loss_17": 3.0024963498115538, "ce_loss_23": 2.953491246700287, "ce_loss_3": 3.8675854563713075, "ce_loss_6": 3.450518810749054, "epoch": 0.607, "grad_norm": 1168.0, "kl_loss_12": 352.34055786132814, "kl_loss_17": 89.9672866821289, "kl_loss_3": 1970.7400146484374, "kl_loss_6": 1118.7429931640625, "learning_rate": 0.00034097155425921255, "loss": 871.059, "step": 6070 }, { "ce_loss_12": 3.0279074907302856, "ce_loss_17": 2.9091128587722777, "ce_loss_23": 2.8589402437210083, "ce_loss_3": 3.785420262813568, "ce_loss_6": 3.3633985996246336, "epoch": 0.608, "grad_norm": 960.0, "kl_loss_12": 353.95676879882814, "kl_loss_17": 90.35716094970704, "kl_loss_3": 2011.920391845703, "kl_loss_6": 1137.7992767333985, "learning_rate": 0.0003394680873574546, "loss": 886.3698, "step": 6080 }, { "ce_loss_12": 3.125973951816559, "ce_loss_17": 3.0037576794624328, "ce_loss_23": 2.953854131698608, "ce_loss_3": 3.8894197821617125, "ce_loss_6": 3.4591413021087645, "epoch": 0.609, "grad_norm": 928.0, "kl_loss_12": 353.62838592529295, "kl_loss_17": 91.09500427246094, "kl_loss_3": 2018.036395263672, "kl_loss_6": 1138.0875274658204, "learning_rate": 0.0003379662370102747, "loss": 882.4516, "step": 6090 }, { "ce_loss_12": 3.1325398445129395, "ce_loss_17": 3.0157789826393127, "ce_loss_23": 2.9670695900917052, "ce_loss_3": 3.8562929153442385, "ce_loss_6": 3.4493812918663025, "epoch": 0.61, "grad_norm": 1216.0, "kl_loss_12": 348.48892974853516, "kl_loss_17": 88.45648956298828, "kl_loss_3": 1969.1524963378906, "kl_loss_6": 1122.2777770996095, "learning_rate": 0.0003364660183412892, "loss": 886.7492, "step": 6100 }, { "ce_loss_12": 3.119068741798401, "ce_loss_17": 3.000202751159668, "ce_loss_23": 2.9513747334480285, "ce_loss_3": 3.848421800136566, "ce_loss_6": 3.448009467124939, "epoch": 0.611, "grad_norm": 1176.0, "kl_loss_12": 352.0840560913086, "kl_loss_17": 90.15714569091797, "kl_loss_3": 1971.662255859375, "kl_loss_6": 1134.4475860595703, "learning_rate": 0.0003349674464576834, "loss": 894.9472, "step": 6110 }, { "ce_loss_12": 3.074628984928131, "ce_loss_17": 2.9548948645591735, "ce_loss_23": 2.9055315613746644, "ce_loss_3": 3.8279956340789796, "ce_loss_6": 3.4047624468803406, "epoch": 0.612, "grad_norm": 868.0, "kl_loss_12": 350.29241943359375, "kl_loss_17": 90.4896282196045, "kl_loss_3": 2006.4168212890625, "kl_loss_6": 1133.1514801025392, "learning_rate": 0.00033347053645005966, "loss": 869.4762, "step": 6120 }, { "ce_loss_12": 3.1577037930488587, "ce_loss_17": 3.0422616958618165, "ce_loss_23": 2.9943039655685424, "ce_loss_3": 3.8772876262664795, "ce_loss_6": 3.4845842838287355, "epoch": 0.613, "grad_norm": 1528.0, "kl_loss_12": 342.38953399658203, "kl_loss_17": 88.46079177856446, "kl_loss_3": 1917.6266235351563, "kl_loss_6": 1107.1890045166015, "learning_rate": 0.00033197530339228485, "loss": 876.5715, "step": 6130 }, { "ce_loss_12": 3.1251278638839723, "ce_loss_17": 3.0028708815574645, "ce_loss_23": 2.9523780941963196, "ce_loss_3": 3.8694166779518127, "ce_loss_6": 3.4621509075164796, "epoch": 0.614, "grad_norm": 1040.0, "kl_loss_12": 354.7750839233398, "kl_loss_17": 92.17405433654785, "kl_loss_3": 1964.5522583007812, "kl_loss_6": 1132.422689819336, "learning_rate": 0.00033048176234133967, "loss": 878.5187, "step": 6140 }, { "ce_loss_12": 3.112536537647247, "ce_loss_17": 2.9995103001594545, "ce_loss_23": 2.952008378505707, "ce_loss_3": 3.850481855869293, "ce_loss_6": 3.4409647226333617, "epoch": 0.615, "grad_norm": 1160.0, "kl_loss_12": 352.7617446899414, "kl_loss_17": 89.95292663574219, "kl_loss_3": 1976.0133422851563, "kl_loss_6": 1132.9945495605468, "learning_rate": 0.0003289899283371657, "loss": 888.2209, "step": 6150 }, { "ce_loss_12": 3.129842388629913, "ce_loss_17": 3.0115858197212217, "ce_loss_23": 2.9639342188835145, "ce_loss_3": 3.8740663528442383, "ce_loss_6": 3.4587475419044496, "epoch": 0.616, "grad_norm": 1216.0, "kl_loss_12": 342.78956146240233, "kl_loss_17": 88.99765205383301, "kl_loss_3": 1968.557421875, "kl_loss_6": 1118.0649200439452, "learning_rate": 0.0003274998164025148, "loss": 890.9506, "step": 6160 }, { "ce_loss_12": 3.1652899265289305, "ce_loss_17": 3.044243943691254, "ce_loss_23": 2.994454729557037, "ce_loss_3": 3.891256046295166, "ce_loss_6": 3.4863043069839477, "epoch": 0.617, "grad_norm": 1152.0, "kl_loss_12": 351.40711822509763, "kl_loss_17": 91.19853973388672, "kl_loss_3": 1949.5838562011718, "kl_loss_6": 1121.7305847167968, "learning_rate": 0.0003260114415427975, "loss": 901.9518, "step": 6170 }, { "ce_loss_12": 3.083340036869049, "ce_loss_17": 2.966404664516449, "ce_loss_23": 2.9196917533874513, "ce_loss_3": 3.8479635953903197, "ce_loss_6": 3.4319477438926698, "epoch": 0.618, "grad_norm": 1056.0, "kl_loss_12": 348.9749816894531, "kl_loss_17": 89.25509567260742, "kl_loss_3": 2011.7194213867188, "kl_loss_6": 1152.0232543945312, "learning_rate": 0.0003245248187459323, "loss": 905.0213, "step": 6180 }, { "ce_loss_12": 3.0779054880142214, "ce_loss_17": 2.9654456257820128, "ce_loss_23": 2.9188602209091186, "ce_loss_3": 3.7875309348106385, "ce_loss_6": 3.390846574306488, "epoch": 0.619, "grad_norm": 920.0, "kl_loss_12": 337.221875, "kl_loss_17": 86.04836540222168, "kl_loss_3": 1908.584539794922, "kl_loss_6": 1083.1012786865235, "learning_rate": 0.00032303996298219416, "loss": 861.1268, "step": 6190 }, { "ce_loss_12": 3.1569186449050903, "ce_loss_17": 3.0399226546287537, "ce_loss_23": 2.9923227071762084, "ce_loss_3": 3.869111704826355, "ce_loss_6": 3.474336361885071, "epoch": 0.62, "grad_norm": 1088.0, "kl_loss_12": 340.1462677001953, "kl_loss_17": 87.56219062805175, "kl_loss_3": 1890.5591369628905, "kl_loss_6": 1085.7106719970702, "learning_rate": 0.00032155688920406414, "loss": 857.2023, "step": 6200 }, { "ce_loss_12": 3.066182005405426, "ce_loss_17": 2.9488109350204468, "ce_loss_23": 2.9000027894973757, "ce_loss_3": 3.8342225193977355, "ce_loss_6": 3.4069128751754763, "epoch": 0.621, "grad_norm": 972.0, "kl_loss_12": 350.70646514892576, "kl_loss_17": 90.66474761962891, "kl_loss_3": 2010.2277587890626, "kl_loss_6": 1134.7549102783203, "learning_rate": 0.0003200756123460788, "loss": 904.0088, "step": 6210 }, { "ce_loss_12": 3.1090978145599366, "ce_loss_17": 2.9894903898239136, "ce_loss_23": 2.93879611492157, "ce_loss_3": 3.8707842230796814, "ce_loss_6": 3.4465802669525147, "epoch": 0.622, "grad_norm": 1368.0, "kl_loss_12": 359.2712600708008, "kl_loss_17": 92.23049201965333, "kl_loss_3": 2029.2839599609374, "kl_loss_6": 1159.1133087158203, "learning_rate": 0.00031859614732467957, "loss": 903.4491, "step": 6220 }, { "ce_loss_12": 3.156878316402435, "ce_loss_17": 3.0391077756881715, "ce_loss_23": 2.9916378617286683, "ce_loss_3": 3.8727733850479127, "ce_loss_6": 3.472846496105194, "epoch": 0.623, "grad_norm": 1216.0, "kl_loss_12": 341.9202194213867, "kl_loss_17": 88.34065742492676, "kl_loss_3": 1915.2870239257813, "kl_loss_6": 1094.0911712646484, "learning_rate": 0.00031711850903806275, "loss": 864.2437, "step": 6230 }, { "ce_loss_12": 3.071133351325989, "ce_loss_17": 2.9489038586616516, "ce_loss_23": 2.898697924613953, "ce_loss_3": 3.8311947345733643, "ce_loss_6": 3.404621160030365, "epoch": 0.624, "grad_norm": 928.0, "kl_loss_12": 356.1224136352539, "kl_loss_17": 91.2601390838623, "kl_loss_3": 2014.823046875, "kl_loss_6": 1142.6964630126954, "learning_rate": 0.0003156427123660297, "loss": 883.211, "step": 6240 }, { "ce_loss_12": 3.1468286871910096, "ce_loss_17": 3.0284937620162964, "ce_loss_23": 2.9805765390396117, "ce_loss_3": 3.8659072875976563, "ce_loss_6": 3.4674535155296327, "epoch": 0.625, "grad_norm": 932.0, "kl_loss_12": 346.74879150390626, "kl_loss_17": 88.12692947387696, "kl_loss_3": 1929.708056640625, "kl_loss_6": 1108.6400970458985, "learning_rate": 0.0003141687721698363, "loss": 882.9596, "step": 6250 }, { "ce_loss_12": 3.1200728535652162, "ce_loss_17": 3.0045360565185546, "ce_loss_23": 2.9588016986846926, "ce_loss_3": 3.823894214630127, "ce_loss_6": 3.420210433006287, "epoch": 0.626, "grad_norm": 1056.0, "kl_loss_12": 330.5125015258789, "kl_loss_17": 85.87870597839355, "kl_loss_3": 1873.3631591796875, "kl_loss_6": 1061.2933288574218, "learning_rate": 0.00031269670329204396, "loss": 861.3563, "step": 6260 }, { "ce_loss_12": 3.1616135358810427, "ce_loss_17": 3.044730508327484, "ce_loss_23": 2.998891019821167, "ce_loss_3": 3.868626630306244, "ce_loss_6": 3.478573274612427, "epoch": 0.627, "grad_norm": 1104.0, "kl_loss_12": 343.72743530273436, "kl_loss_17": 88.55654296875, "kl_loss_3": 1918.8240783691406, "kl_loss_6": 1103.6495300292968, "learning_rate": 0.00031122652055637015, "loss": 874.6357, "step": 6270 }, { "ce_loss_12": 3.117260479927063, "ce_loss_17": 3.0006085872650146, "ce_loss_23": 2.9568544030189514, "ce_loss_3": 3.8618600845336912, "ce_loss_6": 3.44553599357605, "epoch": 0.628, "grad_norm": 924.0, "kl_loss_12": 351.38975219726564, "kl_loss_17": 89.64115447998047, "kl_loss_3": 1987.9572021484375, "kl_loss_6": 1137.688525390625, "learning_rate": 0.0003097582387675385, "loss": 872.7838, "step": 6280 }, { "ce_loss_12": 3.1549283385276796, "ce_loss_17": 3.0403485178947447, "ce_loss_23": 2.9937935709953307, "ce_loss_3": 3.8901216626167296, "ce_loss_6": 3.482912063598633, "epoch": 0.629, "grad_norm": 1064.0, "kl_loss_12": 348.3187423706055, "kl_loss_17": 89.56165046691895, "kl_loss_3": 1971.6549133300782, "kl_loss_6": 1121.3032287597657, "learning_rate": 0.00030829187271113034, "loss": 875.4443, "step": 6290 }, { "ce_loss_12": 3.1473076701164246, "ce_loss_17": 3.0363417267799377, "ce_loss_23": 2.9871679186820983, "ce_loss_3": 3.8660532355308534, "ce_loss_6": 3.465396547317505, "epoch": 0.63, "grad_norm": 1120.0, "kl_loss_12": 336.1268035888672, "kl_loss_17": 87.23834419250488, "kl_loss_3": 1905.2057373046875, "kl_loss_6": 1086.9825592041016, "learning_rate": 0.00030682743715343565, "loss": 874.1156, "step": 6300 }, { "ce_loss_12": 3.099404048919678, "ce_loss_17": 2.978138637542725, "ce_loss_23": 2.928191804885864, "ce_loss_3": 3.8511648416519164, "ce_loss_6": 3.4375685691833495, "epoch": 0.631, "grad_norm": 1184.0, "kl_loss_12": 354.819580078125, "kl_loss_17": 91.24221687316894, "kl_loss_3": 1976.64677734375, "kl_loss_6": 1133.5737030029297, "learning_rate": 0.0003053649468413043, "loss": 895.8477, "step": 6310 }, { "ce_loss_12": 3.2084802389144897, "ce_loss_17": 3.089839148521423, "ce_loss_23": 3.039863979816437, "ce_loss_3": 3.9332666873931883, "ce_loss_6": 3.529804158210754, "epoch": 0.632, "grad_norm": 1352.0, "kl_loss_12": 349.0574127197266, "kl_loss_17": 90.16268653869629, "kl_loss_3": 1947.8064453125, "kl_loss_6": 1119.1037353515626, "learning_rate": 0.00030390441650199725, "loss": 874.6631, "step": 6320 }, { "ce_loss_12": 3.109989643096924, "ce_loss_17": 2.993617391586304, "ce_loss_23": 2.9476993680000305, "ce_loss_3": 3.8355260372161863, "ce_loss_6": 3.4362170815467836, "epoch": 0.633, "grad_norm": 932.0, "kl_loss_12": 341.5677261352539, "kl_loss_17": 87.88154983520508, "kl_loss_3": 1922.833917236328, "kl_loss_6": 1101.4575500488281, "learning_rate": 0.00030244586084303903, "loss": 865.0869, "step": 6330 }, { "ce_loss_12": 3.092487156391144, "ce_loss_17": 2.970741927623749, "ce_loss_23": 2.922047734260559, "ce_loss_3": 3.845520186424255, "ce_loss_6": 3.432330071926117, "epoch": 0.634, "grad_norm": 856.0, "kl_loss_12": 354.30458679199216, "kl_loss_17": 90.62174263000489, "kl_loss_3": 2000.5931579589844, "kl_loss_6": 1151.717950439453, "learning_rate": 0.00030098929455206903, "loss": 877.8803, "step": 6340 }, { "ce_loss_12": 3.0834580183029177, "ce_loss_17": 2.968330907821655, "ce_loss_23": 2.9212151169776917, "ce_loss_3": 3.8257362484931945, "ce_loss_6": 3.4121026039123534, "epoch": 0.635, "grad_norm": 1232.0, "kl_loss_12": 341.7967956542969, "kl_loss_17": 87.78205146789551, "kl_loss_3": 1968.1894470214843, "kl_loss_6": 1118.4420776367188, "learning_rate": 0.00029953473229669324, "loss": 897.0741, "step": 6350 }, { "ce_loss_12": 3.1118922233581543, "ce_loss_17": 2.9967432618141174, "ce_loss_23": 2.949505364894867, "ce_loss_3": 3.8615681648254396, "ce_loss_6": 3.448829710483551, "epoch": 0.636, "grad_norm": 1088.0, "kl_loss_12": 349.56189880371096, "kl_loss_17": 88.25876350402832, "kl_loss_3": 1980.3615417480469, "kl_loss_6": 1137.783837890625, "learning_rate": 0.00029808218872433767, "loss": 875.4202, "step": 6360 }, { "ce_loss_12": 3.1684186816215516, "ce_loss_17": 3.054055690765381, "ce_loss_23": 3.0063771963119508, "ce_loss_3": 3.8962082982063295, "ce_loss_6": 3.487860989570618, "epoch": 0.637, "grad_norm": 900.0, "kl_loss_12": 340.72945098876954, "kl_loss_17": 87.79001235961914, "kl_loss_3": 1942.4515502929687, "kl_loss_6": 1100.2603485107422, "learning_rate": 0.0002966316784621, "loss": 862.3001, "step": 6370 }, { "ce_loss_12": 3.0944795846939086, "ce_loss_17": 2.9716971039772035, "ce_loss_23": 2.9208733558654787, "ce_loss_3": 3.8506593465805055, "ce_loss_6": 3.4324075102806093, "epoch": 0.638, "grad_norm": 876.0, "kl_loss_12": 353.5914505004883, "kl_loss_17": 89.74720306396485, "kl_loss_3": 1994.5528137207032, "kl_loss_6": 1142.38095703125, "learning_rate": 0.0002951832161166024, "loss": 872.7703, "step": 6380 }, { "ce_loss_12": 3.1604527473449706, "ce_loss_17": 3.0406524538993835, "ce_loss_23": 2.988952672481537, "ce_loss_3": 3.8952385902404787, "ce_loss_6": 3.4893474698066713, "epoch": 0.639, "grad_norm": 960.0, "kl_loss_12": 353.39493865966796, "kl_loss_17": 90.45221481323242, "kl_loss_3": 1959.9931701660157, "kl_loss_6": 1130.186865234375, "learning_rate": 0.0002937368162738445, "loss": 868.2771, "step": 6390 }, { "ce_loss_12": 3.1031800866127015, "ce_loss_17": 2.991485595703125, "ce_loss_23": 2.9482226371765137, "ce_loss_3": 3.8349567890167235, "ce_loss_6": 3.4268946051597595, "epoch": 0.64, "grad_norm": 1376.0, "kl_loss_12": 335.4634048461914, "kl_loss_17": 85.38014450073243, "kl_loss_3": 1946.2797729492188, "kl_loss_6": 1106.7311584472657, "learning_rate": 0.0002922924934990568, "loss": 884.1594, "step": 6400 }, { "ce_loss_12": 3.0481751799583434, "ce_loss_17": 2.928518009185791, "ce_loss_23": 2.8782082557678224, "ce_loss_3": 3.8162668228149412, "ce_loss_6": 3.392408585548401, "epoch": 0.641, "grad_norm": 888.0, "kl_loss_12": 347.2042831420898, "kl_loss_17": 88.85644226074218, "kl_loss_3": 2026.7457641601563, "kl_loss_6": 1156.6705841064454, "learning_rate": 0.0002908502623365536, "loss": 888.1911, "step": 6410 }, { "ce_loss_12": 2.985366642475128, "ce_loss_17": 2.8656646013259888, "ce_loss_23": 2.8177972435951233, "ce_loss_3": 3.7540844678878784, "ce_loss_6": 3.326254391670227, "epoch": 0.642, "grad_norm": 1072.0, "kl_loss_12": 346.08390350341796, "kl_loss_17": 87.24323692321778, "kl_loss_3": 2033.0718933105468, "kl_loss_6": 1149.7964965820313, "learning_rate": 0.0002894101373095867, "loss": 888.3158, "step": 6420 }, { "ce_loss_12": 3.192607414722443, "ce_loss_17": 3.0764593362808226, "ce_loss_23": 3.0285913348197937, "ce_loss_3": 3.91159108877182, "ce_loss_6": 3.5078271865844726, "epoch": 0.643, "grad_norm": 1144.0, "kl_loss_12": 346.41454162597654, "kl_loss_17": 90.39042320251465, "kl_loss_3": 1931.5027893066406, "kl_loss_6": 1098.9398071289063, "learning_rate": 0.00028797213292019926, "loss": 868.6021, "step": 6430 }, { "ce_loss_12": 3.1715727925300596, "ce_loss_17": 3.0562381982803344, "ce_loss_23": 3.00568106174469, "ce_loss_3": 3.901808428764343, "ce_loss_6": 3.4961312413215637, "epoch": 0.644, "grad_norm": 1080.0, "kl_loss_12": 351.4957702636719, "kl_loss_17": 90.0567756652832, "kl_loss_3": 1957.8979187011719, "kl_loss_6": 1124.9115966796876, "learning_rate": 0.0002865362636490791, "loss": 894.7577, "step": 6440 }, { "ce_loss_12": 3.181258165836334, "ce_loss_17": 3.0656901359558106, "ce_loss_23": 3.017771327495575, "ce_loss_3": 3.9070634961128237, "ce_loss_6": 3.498952269554138, "epoch": 0.645, "grad_norm": 800.0, "kl_loss_12": 344.0481979370117, "kl_loss_17": 87.99182357788087, "kl_loss_3": 1944.129833984375, "kl_loss_6": 1109.7974243164062, "learning_rate": 0.0002851025439554142, "loss": 868.3958, "step": 6450 }, { "ce_loss_12": 3.163985526561737, "ce_loss_17": 3.0450597405433655, "ce_loss_23": 2.9969645500183106, "ce_loss_3": 3.8787487506866456, "ce_loss_6": 3.4883544564247133, "epoch": 0.646, "grad_norm": 1024.0, "kl_loss_12": 343.316162109375, "kl_loss_17": 88.75982627868652, "kl_loss_3": 1901.1232666015626, "kl_loss_6": 1103.8402252197266, "learning_rate": 0.00028367098827674573, "loss": 861.943, "step": 6460 }, { "ce_loss_12": 3.100857067108154, "ce_loss_17": 2.9854223847389223, "ce_loss_23": 2.9373781800270082, "ce_loss_3": 3.8324575304985045, "ce_loss_6": 3.418183147907257, "epoch": 0.647, "grad_norm": 1012.0, "kl_loss_12": 340.87472381591795, "kl_loss_17": 87.60878677368164, "kl_loss_3": 1938.8650695800782, "kl_loss_6": 1095.55888671875, "learning_rate": 0.00028224161102882397, "loss": 876.8292, "step": 6470 }, { "ce_loss_12": 3.0689453721046447, "ce_loss_17": 2.957372415065765, "ce_loss_23": 2.912537622451782, "ce_loss_3": 3.794622015953064, "ce_loss_6": 3.3898186564445494, "epoch": 0.648, "grad_norm": 1064.0, "kl_loss_12": 338.6136016845703, "kl_loss_17": 86.14839630126953, "kl_loss_3": 1925.9338500976562, "kl_loss_6": 1106.3623138427733, "learning_rate": 0.00028081442660546124, "loss": 873.1646, "step": 6480 }, { "ce_loss_12": 3.135506010055542, "ce_loss_17": 3.0200687527656553, "ce_loss_23": 2.9733571290969847, "ce_loss_3": 3.8508325934410097, "ce_loss_6": 3.450616705417633, "epoch": 0.649, "grad_norm": 944.0, "kl_loss_12": 346.6316909790039, "kl_loss_17": 88.86284599304199, "kl_loss_3": 1919.2559936523437, "kl_loss_6": 1095.6363037109375, "learning_rate": 0.0002793894493783892, "loss": 873.3639, "step": 6490 }, { "ce_loss_12": 3.148543083667755, "ce_loss_17": 3.0385451436042787, "ce_loss_23": 2.9913148522377013, "ce_loss_3": 3.8768768310546875, "ce_loss_6": 3.472771918773651, "epoch": 0.65, "grad_norm": 944.0, "kl_loss_12": 337.93117370605466, "kl_loss_17": 86.5095645904541, "kl_loss_3": 1913.8453186035156, "kl_loss_6": 1097.8296691894532, "learning_rate": 0.0002779666936971129, "loss": 864.3172, "step": 6500 }, { "ce_loss_12": 3.1691304445266724, "ce_loss_17": 3.050932431221008, "ce_loss_23": 3.0028540015220644, "ce_loss_3": 3.907598543167114, "ce_loss_6": 3.50301855802536, "epoch": 0.651, "grad_norm": 1048.0, "kl_loss_12": 347.24015655517576, "kl_loss_17": 87.80660743713379, "kl_loss_3": 1963.5269592285156, "kl_loss_6": 1127.437063598633, "learning_rate": 0.00027654617388876614, "loss": 881.9879, "step": 6510 }, { "ce_loss_12": 3.1799694776535032, "ce_loss_17": 3.067358613014221, "ce_loss_23": 3.017275059223175, "ce_loss_3": 3.9080140829086303, "ce_loss_6": 3.501030957698822, "epoch": 0.652, "grad_norm": 868.0, "kl_loss_12": 345.9472351074219, "kl_loss_17": 89.5890552520752, "kl_loss_3": 1949.9092651367187, "kl_loss_6": 1110.8510864257812, "learning_rate": 0.0002751279042579672, "loss": 876.3406, "step": 6520 }, { "ce_loss_12": 3.1285813450813293, "ce_loss_17": 3.014330744743347, "ce_loss_23": 2.9665272355079653, "ce_loss_3": 3.847626578807831, "ce_loss_6": 3.4419679284095763, "epoch": 0.653, "grad_norm": 1288.0, "kl_loss_12": 337.32386627197263, "kl_loss_17": 86.3776023864746, "kl_loss_3": 1921.6126403808594, "kl_loss_6": 1091.7433349609375, "learning_rate": 0.00027371189908667604, "loss": 878.335, "step": 6530 }, { "ce_loss_12": 3.188789117336273, "ce_loss_17": 3.0666088819503785, "ce_loss_23": 3.015404152870178, "ce_loss_3": 3.9438202381134033, "ce_loss_6": 3.523683476448059, "epoch": 0.654, "grad_norm": 1192.0, "kl_loss_12": 356.82942962646484, "kl_loss_17": 91.85579376220703, "kl_loss_3": 1994.953546142578, "kl_loss_6": 1134.5701141357422, "learning_rate": 0.00027229817263404863, "loss": 899.9428, "step": 6540 }, { "ce_loss_12": 3.1566753506660463, "ce_loss_17": 3.047822904586792, "ce_loss_23": 3.000984263420105, "ce_loss_3": 3.8460690855979918, "ce_loss_6": 3.467942750453949, "epoch": 0.655, "grad_norm": 1232.0, "kl_loss_12": 336.4552856445313, "kl_loss_17": 86.81032180786133, "kl_loss_3": 1874.8006896972656, "kl_loss_6": 1083.900930786133, "learning_rate": 0.0002708867391362948, "loss": 859.1926, "step": 6550 }, { "ce_loss_12": 3.1401477456092834, "ce_loss_17": 3.0290218114852907, "ce_loss_23": 2.9835850477218626, "ce_loss_3": 3.841712236404419, "ce_loss_6": 3.4439943075180053, "epoch": 0.656, "grad_norm": 1136.0, "kl_loss_12": 329.05927124023435, "kl_loss_17": 85.45395736694336, "kl_loss_3": 1860.962646484375, "kl_loss_6": 1059.7574798583985, "learning_rate": 0.0002694776128065345, "loss": 856.4867, "step": 6560 }, { "ce_loss_12": 3.0816213965415953, "ce_loss_17": 2.9665581941604615, "ce_loss_23": 2.9174054503440856, "ce_loss_3": 3.8168569803237915, "ce_loss_6": 3.410453128814697, "epoch": 0.657, "grad_norm": 1112.0, "kl_loss_12": 349.16869201660154, "kl_loss_17": 88.75262565612793, "kl_loss_3": 1967.9637023925782, "kl_loss_6": 1130.642840576172, "learning_rate": 0.00026807080783465374, "loss": 865.674, "step": 6570 }, { "ce_loss_12": 3.185134494304657, "ce_loss_17": 3.0686640977859496, "ce_loss_23": 3.022062087059021, "ce_loss_3": 3.925693726539612, "ce_loss_6": 3.520680642127991, "epoch": 0.658, "grad_norm": 996.0, "kl_loss_12": 348.81055450439453, "kl_loss_17": 89.27261466979981, "kl_loss_3": 1972.7971313476562, "kl_loss_6": 1136.3583374023438, "learning_rate": 0.00026666633838716316, "loss": 888.1977, "step": 6580 }, { "ce_loss_12": 3.094407868385315, "ce_loss_17": 2.976101267337799, "ce_loss_23": 2.924073362350464, "ce_loss_3": 3.838667631149292, "ce_loss_6": 3.427040421962738, "epoch": 0.659, "grad_norm": 1408.0, "kl_loss_12": 355.3506591796875, "kl_loss_17": 90.87181396484375, "kl_loss_3": 1980.0582946777345, "kl_loss_6": 1138.5891876220703, "learning_rate": 0.00026526421860705474, "loss": 891.5605, "step": 6590 }, { "ce_loss_12": 3.112147605419159, "ce_loss_17": 2.9922482490539553, "ce_loss_23": 2.943576455116272, "ce_loss_3": 3.8509011030197144, "ce_loss_6": 3.446483051776886, "epoch": 0.66, "grad_norm": 980.0, "kl_loss_12": 350.5442840576172, "kl_loss_17": 90.05998497009277, "kl_loss_3": 1955.520330810547, "kl_loss_6": 1126.2021484375, "learning_rate": 0.0002638644626136587, "loss": 870.4355, "step": 6600 }, { "ce_loss_12": 3.125984454154968, "ce_loss_17": 3.0119236409664154, "ce_loss_23": 2.965866357088089, "ce_loss_3": 3.8583549737930296, "ce_loss_6": 3.4497647523880004, "epoch": 0.661, "grad_norm": 864.0, "kl_loss_12": 341.7684814453125, "kl_loss_17": 86.04758720397949, "kl_loss_3": 1936.7129333496093, "kl_loss_6": 1113.954165649414, "learning_rate": 0.00026246708450250255, "loss": 873.6549, "step": 6610 }, { "ce_loss_12": 3.111504018306732, "ce_loss_17": 2.9966780304908753, "ce_loss_23": 2.9501025676727295, "ce_loss_3": 3.827069544792175, "ce_loss_6": 3.4274869561195374, "epoch": 0.662, "grad_norm": 1272.0, "kl_loss_12": 341.32257080078125, "kl_loss_17": 88.25752258300781, "kl_loss_3": 1922.753759765625, "kl_loss_6": 1097.9517578125, "learning_rate": 0.00026107209834516854, "loss": 864.8915, "step": 6620 }, { "ce_loss_12": 3.0738446593284605, "ce_loss_17": 2.9582096576690673, "ce_loss_23": 2.9104017496109007, "ce_loss_3": 3.843344843387604, "ce_loss_6": 3.418946015834808, "epoch": 0.663, "grad_norm": 1232.0, "kl_loss_12": 348.487255859375, "kl_loss_17": 88.84328269958496, "kl_loss_3": 2021.6365051269531, "kl_loss_6": 1148.4022888183595, "learning_rate": 0.0002596795181891514, "loss": 896.4338, "step": 6630 }, { "ce_loss_12": 3.082929515838623, "ce_loss_17": 2.9603469371795654, "ce_loss_23": 2.910764241218567, "ce_loss_3": 3.821289229393005, "ce_loss_6": 3.41152765750885, "epoch": 0.664, "grad_norm": 1280.0, "kl_loss_12": 355.551774597168, "kl_loss_17": 92.20106925964356, "kl_loss_3": 1979.1425537109376, "kl_loss_6": 1138.566717529297, "learning_rate": 0.000258289358057718, "loss": 917.6799, "step": 6640 }, { "ce_loss_12": 3.148861360549927, "ce_loss_17": 3.025576043128967, "ce_loss_23": 2.977057957649231, "ce_loss_3": 3.89022433757782, "ce_loss_6": 3.4762032985687257, "epoch": 0.665, "grad_norm": 940.0, "kl_loss_12": 359.8484146118164, "kl_loss_17": 93.93804359436035, "kl_loss_3": 1993.680645751953, "kl_loss_6": 1136.3337860107422, "learning_rate": 0.0002569016319497657, "loss": 894.0275, "step": 6650 }, { "ce_loss_12": 3.139300358295441, "ce_loss_17": 3.019014894962311, "ce_loss_23": 2.9669580936431883, "ce_loss_3": 3.8815082907676697, "ce_loss_6": 3.470949947834015, "epoch": 0.666, "grad_norm": 1016.0, "kl_loss_12": 359.97652435302734, "kl_loss_17": 93.55386581420899, "kl_loss_3": 1994.8902160644532, "kl_loss_6": 1142.2246398925781, "learning_rate": 0.00025551635383968066, "loss": 903.9031, "step": 6660 }, { "ce_loss_12": 3.047380030155182, "ce_loss_17": 2.9349704504013063, "ce_loss_23": 2.8862363934516906, "ce_loss_3": 3.791304576396942, "ce_loss_6": 3.378193163871765, "epoch": 0.667, "grad_norm": 1304.0, "kl_loss_12": 349.3817199707031, "kl_loss_17": 89.86996917724609, "kl_loss_3": 1976.2346069335938, "kl_loss_6": 1131.2955963134766, "learning_rate": 0.00025413353767719804, "loss": 888.0734, "step": 6670 }, { "ce_loss_12": 3.107878494262695, "ce_loss_17": 2.9940646171569822, "ce_loss_23": 2.948066794872284, "ce_loss_3": 3.8429067969322204, "ce_loss_6": 3.4393661618232727, "epoch": 0.668, "grad_norm": 1536.0, "kl_loss_12": 344.29654083251955, "kl_loss_17": 86.87348518371581, "kl_loss_3": 1965.0328430175782, "kl_loss_6": 1133.0926055908203, "learning_rate": 0.0002527531973872617, "loss": 882.2666, "step": 6680 }, { "ce_loss_12": 3.1252907514572144, "ce_loss_17": 3.0092085719108583, "ce_loss_23": 2.9613227605819703, "ce_loss_3": 3.836043107509613, "ce_loss_6": 3.437214457988739, "epoch": 0.669, "grad_norm": 1088.0, "kl_loss_12": 342.63561553955077, "kl_loss_17": 87.7429012298584, "kl_loss_3": 1920.3994262695312, "kl_loss_6": 1103.2438995361329, "learning_rate": 0.0002513753468698826, "loss": 866.9679, "step": 6690 }, { "ce_loss_12": 3.090426170825958, "ce_loss_17": 2.971407890319824, "ce_loss_23": 2.9234967947006227, "ce_loss_3": 3.8281372785568237, "ce_loss_6": 3.412463891506195, "epoch": 0.67, "grad_norm": 1232.0, "kl_loss_12": 352.12214813232424, "kl_loss_17": 89.99090042114258, "kl_loss_3": 1990.1664123535156, "kl_loss_6": 1134.3204223632813, "learning_rate": 0.0002500000000000001, "loss": 886.1559, "step": 6700 }, { "ce_loss_12": 3.1942543387413025, "ce_loss_17": 3.0864752650260927, "ce_loss_23": 3.0406001448631286, "ce_loss_3": 3.8843395829200746, "ce_loss_6": 3.4964444279670714, "epoch": 0.671, "grad_norm": 980.0, "kl_loss_12": 336.6088241577148, "kl_loss_17": 86.05256309509278, "kl_loss_3": 1867.6397338867187, "kl_loss_6": 1070.227133178711, "learning_rate": 0.0002486271706273421, "loss": 887.4312, "step": 6710 }, { "ce_loss_12": 3.1309102058410643, "ce_loss_17": 3.022629773616791, "ce_loss_23": 2.977077305316925, "ce_loss_3": 3.8247514367103577, "ce_loss_6": 3.442046856880188, "epoch": 0.672, "grad_norm": 1032.0, "kl_loss_12": 332.57128143310547, "kl_loss_17": 85.41795845031739, "kl_loss_3": 1874.3389892578125, "kl_loss_6": 1078.250180053711, "learning_rate": 0.0002472568725762853, "loss": 869.0738, "step": 6720 }, { "ce_loss_12": 3.1272271275520325, "ce_loss_17": 3.0170477271080016, "ce_loss_23": 2.9705397963523863, "ce_loss_3": 3.8198402643203737, "ce_loss_6": 3.4317238688468934, "epoch": 0.673, "grad_norm": 1408.0, "kl_loss_12": 330.74756927490233, "kl_loss_17": 84.99472198486328, "kl_loss_3": 1873.458514404297, "kl_loss_6": 1063.2597229003907, "learning_rate": 0.00024588911964571554, "loss": 855.2377, "step": 6730 }, { "ce_loss_12": 3.143399214744568, "ce_loss_17": 3.024151027202606, "ce_loss_23": 2.9710415840148925, "ce_loss_3": 3.900429677963257, "ce_loss_6": 3.483956551551819, "epoch": 0.674, "grad_norm": 904.0, "kl_loss_12": 359.8742080688477, "kl_loss_17": 94.40918464660645, "kl_loss_3": 1998.6011291503905, "kl_loss_6": 1151.3028686523437, "learning_rate": 0.00024452392560888974, "loss": 879.9549, "step": 6740 }, { "ce_loss_12": 3.037467384338379, "ce_loss_17": 2.920524501800537, "ce_loss_23": 2.872658061981201, "ce_loss_3": 3.7702141404151917, "ce_loss_6": 3.3598833203315737, "epoch": 0.675, "grad_norm": 824.0, "kl_loss_12": 333.89322967529296, "kl_loss_17": 85.252885055542, "kl_loss_3": 1944.8146240234375, "kl_loss_6": 1102.9042541503907, "learning_rate": 0.00024316130421329695, "loss": 859.0686, "step": 6750 }, { "ce_loss_12": 3.110549819469452, "ce_loss_17": 2.9969101905822755, "ce_loss_23": 2.9517931759357454, "ce_loss_3": 3.8281806349754333, "ce_loss_6": 3.4278959035873413, "epoch": 0.676, "grad_norm": 980.0, "kl_loss_12": 339.3283462524414, "kl_loss_17": 87.01463661193847, "kl_loss_3": 1923.8144470214843, "kl_loss_6": 1103.3825225830078, "learning_rate": 0.00024180126918051909, "loss": 870.1158, "step": 6760 }, { "ce_loss_12": 3.157505512237549, "ce_loss_17": 3.043172836303711, "ce_loss_23": 2.996259617805481, "ce_loss_3": 3.8729994893074036, "ce_loss_6": 3.469846773147583, "epoch": 0.677, "grad_norm": 1144.0, "kl_loss_12": 343.28330078125, "kl_loss_17": 87.83675193786621, "kl_loss_3": 1921.9919799804688, "kl_loss_6": 1093.9075073242188, "learning_rate": 0.00024044383420609406, "loss": 859.0797, "step": 6770 }, { "ce_loss_12": 3.167113757133484, "ce_loss_17": 3.0563209533691404, "ce_loss_23": 3.0103363037109374, "ce_loss_3": 3.8601980209350586, "ce_loss_6": 3.4720909357070924, "epoch": 0.678, "grad_norm": 1184.0, "kl_loss_12": 333.5920715332031, "kl_loss_17": 86.2124641418457, "kl_loss_3": 1889.3303771972655, "kl_loss_6": 1086.4960693359376, "learning_rate": 0.00023908901295937712, "loss": 873.1142, "step": 6780 }, { "ce_loss_12": 3.1507126450538636, "ce_loss_17": 3.0366512298583985, "ce_loss_23": 2.989604079723358, "ce_loss_3": 3.8650084137916565, "ce_loss_6": 3.472484755516052, "epoch": 0.679, "grad_norm": 1456.0, "kl_loss_12": 336.9345169067383, "kl_loss_17": 87.20873107910157, "kl_loss_3": 1891.5245971679688, "kl_loss_6": 1082.5831726074218, "learning_rate": 0.00023773681908340283, "loss": 879.5051, "step": 6790 }, { "ce_loss_12": 3.135229218006134, "ce_loss_17": 3.01503472328186, "ce_loss_23": 2.96337548494339, "ce_loss_3": 3.88761568069458, "ce_loss_6": 3.474038767814636, "epoch": 0.68, "grad_norm": 1512.0, "kl_loss_12": 359.9027587890625, "kl_loss_17": 93.6316131591797, "kl_loss_3": 2011.5318969726563, "kl_loss_6": 1154.1799377441407, "learning_rate": 0.00023638726619474876, "loss": 906.6357, "step": 6800 }, { "ce_loss_12": 3.1319125294685364, "ce_loss_17": 3.0071893095970155, "ce_loss_23": 2.956976556777954, "ce_loss_3": 3.8915478587150574, "ce_loss_6": 3.468007779121399, "epoch": 0.681, "grad_norm": 1208.0, "kl_loss_12": 354.7691177368164, "kl_loss_17": 90.4473373413086, "kl_loss_3": 1999.4680297851562, "kl_loss_6": 1150.2789642333985, "learning_rate": 0.0002350403678833976, "loss": 885.597, "step": 6810 }, { "ce_loss_12": 3.0541595101356505, "ce_loss_17": 2.937679874897003, "ce_loss_23": 2.8889075756072997, "ce_loss_3": 3.793595790863037, "ce_loss_6": 3.3809202313423157, "epoch": 0.682, "grad_norm": 888.0, "kl_loss_12": 339.1263671875, "kl_loss_17": 86.2660758972168, "kl_loss_3": 1964.4327758789063, "kl_loss_6": 1122.2195220947265, "learning_rate": 0.00023369613771260007, "loss": 871.1181, "step": 6820 }, { "ce_loss_12": 3.1654935002326967, "ce_loss_17": 3.0478216648101806, "ce_loss_23": 3.000415086746216, "ce_loss_3": 3.908292329311371, "ce_loss_6": 3.494389367103577, "epoch": 0.683, "grad_norm": 992.0, "kl_loss_12": 349.88165588378905, "kl_loss_17": 89.48608589172363, "kl_loss_3": 1978.239434814453, "kl_loss_6": 1126.9151611328125, "learning_rate": 0.00023235458921873925, "loss": 888.1568, "step": 6830 }, { "ce_loss_12": 3.133337688446045, "ce_loss_17": 3.009847807884216, "ce_loss_23": 2.9573482275009155, "ce_loss_3": 3.9090792179107665, "ce_loss_6": 3.488210666179657, "epoch": 0.684, "grad_norm": 1216.0, "kl_loss_12": 364.1305847167969, "kl_loss_17": 93.46566314697266, "kl_loss_3": 2065.647784423828, "kl_loss_6": 1187.9217010498046, "learning_rate": 0.0002310157359111938, "loss": 916.889, "step": 6840 }, { "ce_loss_12": 3.0303258895874023, "ce_loss_17": 2.9021595239639284, "ce_loss_23": 2.8519881129264832, "ce_loss_3": 3.840841066837311, "ce_loss_6": 3.3887616753578187, "epoch": 0.685, "grad_norm": 1696.0, "kl_loss_12": 356.69389190673826, "kl_loss_17": 90.0155418395996, "kl_loss_3": 2106.9754943847656, "kl_loss_6": 1189.3093841552734, "learning_rate": 0.0002296795912722014, "loss": 915.0975, "step": 6850 }, { "ce_loss_12": 3.1557368278503417, "ce_loss_17": 3.0395566701889036, "ce_loss_23": 2.9924458980560305, "ce_loss_3": 3.868706929683685, "ce_loss_6": 3.4693487644195558, "epoch": 0.686, "grad_norm": 904.0, "kl_loss_12": 342.0593780517578, "kl_loss_17": 88.2308448791504, "kl_loss_3": 1928.0559936523437, "kl_loss_6": 1095.3077056884765, "learning_rate": 0.0002283461687567236, "loss": 855.7789, "step": 6860 }, { "ce_loss_12": 3.2024538993835447, "ce_loss_17": 3.0915324687957764, "ce_loss_23": 3.044475185871124, "ce_loss_3": 3.900763988494873, "ce_loss_6": 3.5116928100585936, "epoch": 0.687, "grad_norm": 748.0, "kl_loss_12": 335.2772933959961, "kl_loss_17": 87.18923454284668, "kl_loss_3": 1872.83974609375, "kl_loss_6": 1073.4641540527343, "learning_rate": 0.00022701548179231045, "loss": 868.1582, "step": 6870 }, { "ce_loss_12": 3.164746177196503, "ce_loss_17": 3.047294557094574, "ce_loss_23": 2.99811338186264, "ce_loss_3": 3.9032017946243287, "ce_loss_6": 3.4921378016471865, "epoch": 0.688, "grad_norm": 1008.0, "kl_loss_12": 348.18983459472656, "kl_loss_17": 90.73311271667481, "kl_loss_3": 1962.2305603027344, "kl_loss_6": 1121.4248413085938, "learning_rate": 0.00022568754377896516, "loss": 866.5005, "step": 6880 }, { "ce_loss_12": 3.156641185283661, "ce_loss_17": 3.0423587560653687, "ce_loss_23": 2.9942126631736756, "ce_loss_3": 3.866535186767578, "ce_loss_6": 3.4736101984977723, "epoch": 0.689, "grad_norm": 1080.0, "kl_loss_12": 345.99285278320315, "kl_loss_17": 87.87312545776368, "kl_loss_3": 1916.1283081054687, "kl_loss_6": 1102.4018157958985, "learning_rate": 0.00022436236808900844, "loss": 861.774, "step": 6890 }, { "ce_loss_12": 3.0551820278167723, "ce_loss_17": 2.9351372838020326, "ce_loss_23": 2.885222589969635, "ce_loss_3": 3.7950286746025084, "ce_loss_6": 3.3820699572563173, "epoch": 0.69, "grad_norm": 1224.0, "kl_loss_12": 347.7524154663086, "kl_loss_17": 88.94291534423829, "kl_loss_3": 1973.8519226074218, "kl_loss_6": 1127.2658599853517, "learning_rate": 0.00022303996806694487, "loss": 875.0857, "step": 6900 }, { "ce_loss_12": 3.1233952760696413, "ce_loss_17": 3.0083462238311767, "ce_loss_23": 2.962642455101013, "ce_loss_3": 3.8505759239196777, "ce_loss_6": 3.449055218696594, "epoch": 0.691, "grad_norm": 1004.0, "kl_loss_12": 338.82100067138674, "kl_loss_17": 85.58787803649902, "kl_loss_3": 1945.1169677734374, "kl_loss_6": 1111.6448516845703, "learning_rate": 0.00022172035702932823, "loss": 868.8034, "step": 6910 }, { "ce_loss_12": 3.1731364607810972, "ce_loss_17": 3.061260771751404, "ce_loss_23": 3.012500262260437, "ce_loss_3": 3.884373688697815, "ce_loss_6": 3.4877657175064085, "epoch": 0.692, "grad_norm": 1240.0, "kl_loss_12": 338.7787292480469, "kl_loss_17": 88.33567352294922, "kl_loss_3": 1898.0742065429688, "kl_loss_6": 1094.025244140625, "learning_rate": 0.00022040354826462666, "loss": 861.8279, "step": 6920 }, { "ce_loss_12": 3.098303437232971, "ce_loss_17": 2.9883047938346863, "ce_loss_23": 2.9403274059295654, "ce_loss_3": 3.8265650987625124, "ce_loss_6": 3.4196027517318726, "epoch": 0.693, "grad_norm": 1128.0, "kl_loss_12": 334.6059051513672, "kl_loss_17": 86.24344863891602, "kl_loss_3": 1942.0025817871094, "kl_loss_6": 1100.2935638427734, "learning_rate": 0.0002190895550330899, "loss": 879.4563, "step": 6930 }, { "ce_loss_12": 3.0402795314788817, "ce_loss_17": 2.9218369483947755, "ce_loss_23": 2.870386278629303, "ce_loss_3": 3.793871819972992, "ce_loss_6": 3.3839752197265627, "epoch": 0.694, "grad_norm": 1048.0, "kl_loss_12": 351.19903564453125, "kl_loss_17": 90.00217399597167, "kl_loss_3": 1991.8451843261719, "kl_loss_6": 1148.4777099609375, "learning_rate": 0.00021777839056661552, "loss": 872.2974, "step": 6940 }, { "ce_loss_12": 3.1183370471000673, "ce_loss_17": 3.0035004377365113, "ce_loss_23": 2.95679976940155, "ce_loss_3": 3.8357904314994813, "ce_loss_6": 3.437212681770325, "epoch": 0.695, "grad_norm": 852.0, "kl_loss_12": 337.89208068847654, "kl_loss_17": 87.83383026123047, "kl_loss_3": 1916.7237243652344, "kl_loss_6": 1096.3088836669922, "learning_rate": 0.0002164700680686147, "loss": 853.3035, "step": 6950 }, { "ce_loss_12": 3.163424181938171, "ce_loss_17": 3.0453892946243286, "ce_loss_23": 2.998319923877716, "ce_loss_3": 3.864614713191986, "ce_loss_6": 3.4790354490280153, "epoch": 0.696, "grad_norm": 964.0, "kl_loss_12": 342.2417327880859, "kl_loss_17": 88.33109970092774, "kl_loss_3": 1880.5371337890624, "kl_loss_6": 1082.1784576416017, "learning_rate": 0.0002151646007138806, "loss": 855.9359, "step": 6960 }, { "ce_loss_12": 3.052464473247528, "ce_loss_17": 2.939312756061554, "ce_loss_23": 2.888992202281952, "ce_loss_3": 3.794123411178589, "ce_loss_6": 3.389705038070679, "epoch": 0.697, "grad_norm": 940.0, "kl_loss_12": 350.82405548095704, "kl_loss_17": 90.2900203704834, "kl_loss_3": 1985.181329345703, "kl_loss_6": 1139.6411743164062, "learning_rate": 0.00021386200164845526, "loss": 878.1575, "step": 6970 }, { "ce_loss_12": 3.2136444211006165, "ce_loss_17": 3.10131151676178, "ce_loss_23": 3.0563345670700075, "ce_loss_3": 3.9051543354988096, "ce_loss_6": 3.5153585076332092, "epoch": 0.698, "grad_norm": 832.0, "kl_loss_12": 337.0234146118164, "kl_loss_17": 86.35291862487793, "kl_loss_3": 1872.8874267578126, "kl_loss_6": 1080.3144561767579, "learning_rate": 0.0002125622839894964, "loss": 852.6143, "step": 6980 }, { "ce_loss_12": 3.1607075691223145, "ce_loss_17": 3.0520373225212096, "ce_loss_23": 3.0057042121887205, "ce_loss_3": 3.8714554071426392, "ce_loss_6": 3.475553238391876, "epoch": 0.699, "grad_norm": 1064.0, "kl_loss_12": 332.19777221679686, "kl_loss_17": 86.07902908325195, "kl_loss_3": 1877.6947509765625, "kl_loss_6": 1072.3549865722657, "learning_rate": 0.00021126546082514663, "loss": 851.5051, "step": 6990 }, { "ce_loss_12": 3.1810610413551332, "ce_loss_17": 3.069001257419586, "ce_loss_23": 3.023350381851196, "ce_loss_3": 3.877126181125641, "ce_loss_6": 3.493105447292328, "epoch": 0.7, "grad_norm": 1320.0, "kl_loss_12": 338.05188903808596, "kl_loss_17": 87.35322494506836, "kl_loss_3": 1883.8476501464843, "kl_loss_6": 1090.8296142578124, "learning_rate": 0.00020997154521440098, "loss": 851.0469, "step": 7000 }, { "ce_loss_12": 3.1302197575569153, "ce_loss_17": 3.016958808898926, "ce_loss_23": 2.9736286282539366, "ce_loss_3": 3.8471718311309813, "ce_loss_6": 3.4467532515525816, "epoch": 0.701, "grad_norm": 804.0, "kl_loss_12": 336.49234161376955, "kl_loss_17": 85.4226303100586, "kl_loss_3": 1913.783056640625, "kl_loss_6": 1095.0326568603516, "learning_rate": 0.0002086805501869749, "loss": 852.4146, "step": 7010 }, { "ce_loss_12": 3.109818232059479, "ce_loss_17": 2.988493525981903, "ce_loss_23": 2.9379398584365846, "ce_loss_3": 3.852206456661224, "ce_loss_6": 3.4425936937332153, "epoch": 0.702, "grad_norm": 1096.0, "kl_loss_12": 353.8260467529297, "kl_loss_17": 90.05978813171387, "kl_loss_3": 1995.5918395996093, "kl_loss_6": 1145.5393829345703, "learning_rate": 0.0002073924887431744, "loss": 878.7024, "step": 7020 }, { "ce_loss_12": 3.1103074669837953, "ce_loss_17": 2.996567392349243, "ce_loss_23": 2.9493033409118654, "ce_loss_3": 3.8395477652549745, "ce_loss_6": 3.4339751482009886, "epoch": 0.703, "grad_norm": 1160.0, "kl_loss_12": 344.3564910888672, "kl_loss_17": 87.64359970092774, "kl_loss_3": 1950.38740234375, "kl_loss_6": 1118.0618469238282, "learning_rate": 0.00020610737385376348, "loss": 890.4451, "step": 7030 }, { "ce_loss_12": 3.1601242661476134, "ce_loss_17": 3.0453317403793334, "ce_loss_23": 2.998131000995636, "ce_loss_3": 3.851083850860596, "ce_loss_6": 3.4640984535217285, "epoch": 0.704, "grad_norm": 1064.0, "kl_loss_12": 337.7087921142578, "kl_loss_17": 86.78579254150391, "kl_loss_3": 1862.4344177246094, "kl_loss_6": 1071.2653228759766, "learning_rate": 0.00020482521845983521, "loss": 868.2957, "step": 7040 }, { "ce_loss_12": 3.1671868681907656, "ce_loss_17": 3.0499786496162415, "ce_loss_23": 2.9989811420440673, "ce_loss_3": 3.8885140657424926, "ce_loss_6": 3.4780256271362306, "epoch": 0.705, "grad_norm": 1272.0, "kl_loss_12": 346.44360504150393, "kl_loss_17": 90.91349830627442, "kl_loss_3": 1948.5189453125, "kl_loss_6": 1106.0490844726562, "learning_rate": 0.00020354603547267987, "loss": 880.9363, "step": 7050 }, { "ce_loss_12": 3.1477226972579957, "ce_loss_17": 3.0317694783210754, "ce_loss_23": 2.9822145223617555, "ce_loss_3": 3.893599569797516, "ce_loss_6": 3.4841063261032104, "epoch": 0.706, "grad_norm": 948.0, "kl_loss_12": 349.3151168823242, "kl_loss_17": 89.96737899780274, "kl_loss_3": 1960.026806640625, "kl_loss_6": 1124.279348754883, "learning_rate": 0.00020226983777365604, "loss": 898.4166, "step": 7060 }, { "ce_loss_12": 3.055716669559479, "ce_loss_17": 2.9457828521728517, "ce_loss_23": 2.8991126537323, "ce_loss_3": 3.8222302317619326, "ce_loss_6": 3.3887375950813294, "epoch": 0.707, "grad_norm": 1064.0, "kl_loss_12": 335.7860305786133, "kl_loss_17": 85.3084270477295, "kl_loss_3": 1999.919091796875, "kl_loss_6": 1111.3100799560548, "learning_rate": 0.00020099663821406056, "loss": 872.2579, "step": 7070 }, { "ce_loss_12": 3.149730110168457, "ce_loss_17": 3.0381961107254027, "ce_loss_23": 2.9923747301101686, "ce_loss_3": 3.8555148601531983, "ce_loss_6": 3.462322497367859, "epoch": 0.708, "grad_norm": 1336.0, "kl_loss_12": 332.4214065551758, "kl_loss_17": 86.18175811767578, "kl_loss_3": 1885.1385131835937, "kl_loss_6": 1081.348455810547, "learning_rate": 0.00019972644961499853, "loss": 866.4621, "step": 7080 }, { "ce_loss_12": 3.128303360939026, "ce_loss_17": 3.0102851152420045, "ce_loss_23": 2.9599443554878233, "ce_loss_3": 3.876959776878357, "ce_loss_6": 3.456597423553467, "epoch": 0.709, "grad_norm": 1020.0, "kl_loss_12": 352.72449493408203, "kl_loss_17": 90.64090385437012, "kl_loss_3": 1994.6687866210937, "kl_loss_6": 1131.3768310546875, "learning_rate": 0.00019845928476725522, "loss": 882.0244, "step": 7090 }, { "ce_loss_12": 3.2001837849617005, "ce_loss_17": 3.0850595355033876, "ce_loss_23": 3.034242641925812, "ce_loss_3": 3.914669561386108, "ce_loss_6": 3.519736313819885, "epoch": 0.71, "grad_norm": 728.0, "kl_loss_12": 345.6641387939453, "kl_loss_17": 89.23448104858399, "kl_loss_3": 1916.251934814453, "kl_loss_6": 1110.915444946289, "learning_rate": 0.00019719515643116677, "loss": 895.3077, "step": 7100 }, { "ce_loss_12": 3.1400701642036437, "ce_loss_17": 3.025814187526703, "ce_loss_23": 2.9760770320892336, "ce_loss_3": 3.8354206919670104, "ce_loss_6": 3.4461462020874025, "epoch": 0.711, "grad_norm": 1048.0, "kl_loss_12": 337.6152114868164, "kl_loss_17": 87.09186248779297, "kl_loss_3": 1894.4208740234376, "kl_loss_6": 1079.6950958251953, "learning_rate": 0.0001959340773364911, "loss": 868.769, "step": 7110 }, { "ce_loss_12": 3.1553873419761658, "ce_loss_17": 3.0396679520606993, "ce_loss_23": 2.991921770572662, "ce_loss_3": 3.885389029979706, "ce_loss_6": 3.4764964818954467, "epoch": 0.712, "grad_norm": 840.0, "kl_loss_12": 343.460107421875, "kl_loss_17": 87.91554718017578, "kl_loss_3": 1938.1870910644532, "kl_loss_6": 1113.4761871337892, "learning_rate": 0.0001946760601822809, "loss": 854.4359, "step": 7120 }, { "ce_loss_12": 3.2049408555030823, "ce_loss_17": 3.0892166614532472, "ce_loss_23": 3.0430971145629884, "ce_loss_3": 3.9063705444335937, "ce_loss_6": 3.5126201152801513, "epoch": 0.713, "grad_norm": 1056.0, "kl_loss_12": 338.95025939941405, "kl_loss_17": 86.48576545715332, "kl_loss_3": 1890.3039306640626, "kl_loss_6": 1079.1358520507813, "learning_rate": 0.00019342111763675512, "loss": 841.8441, "step": 7130 }, { "ce_loss_12": 3.2013929009437563, "ce_loss_17": 3.090296244621277, "ce_loss_23": 3.039723289012909, "ce_loss_3": 3.8926509261131286, "ce_loss_6": 3.5071531534194946, "epoch": 0.714, "grad_norm": 1880.0, "kl_loss_12": 339.79481201171876, "kl_loss_17": 88.69997673034668, "kl_loss_3": 1879.4584411621095, "kl_loss_6": 1082.5133972167969, "learning_rate": 0.00019216926233717085, "loss": 849.3725, "step": 7140 }, { "ce_loss_12": 3.095425474643707, "ce_loss_17": 2.9842472314834594, "ce_loss_23": 2.9373265862464906, "ce_loss_3": 3.8718192100524904, "ce_loss_6": 3.442226326465607, "epoch": 0.715, "grad_norm": 1048.0, "kl_loss_12": 337.72288665771487, "kl_loss_17": 87.01406669616699, "kl_loss_3": 2025.1176879882812, "kl_loss_6": 1144.7917572021483, "learning_rate": 0.00019092050688969737, "loss": 886.4156, "step": 7150 }, { "ce_loss_12": 3.160278844833374, "ce_loss_17": 3.0518559575080872, "ce_loss_23": 3.003028392791748, "ce_loss_3": 3.862757754325867, "ce_loss_6": 3.4689655780792235, "epoch": 0.716, "grad_norm": 1024.0, "kl_loss_12": 334.71154327392577, "kl_loss_17": 86.16739501953126, "kl_loss_3": 1914.939208984375, "kl_loss_6": 1089.51484375, "learning_rate": 0.00018967486386928817, "loss": 852.5919, "step": 7160 }, { "ce_loss_12": 3.0488394856452943, "ce_loss_17": 2.930952322483063, "ce_loss_23": 2.881786513328552, "ce_loss_3": 3.789450442790985, "ce_loss_6": 3.3785236358642576, "epoch": 0.717, "grad_norm": 1168.0, "kl_loss_12": 345.0021209716797, "kl_loss_17": 87.61870803833008, "kl_loss_3": 1971.9166870117188, "kl_loss_6": 1127.8095123291016, "learning_rate": 0.00018843234581955443, "loss": 904.8341, "step": 7170 }, { "ce_loss_12": 3.054139864444733, "ce_loss_17": 2.93440922498703, "ce_loss_23": 2.8850677728652956, "ce_loss_3": 3.8047616720199584, "ce_loss_6": 3.3870360374450685, "epoch": 0.718, "grad_norm": 1392.0, "kl_loss_12": 345.5686233520508, "kl_loss_17": 88.29729309082032, "kl_loss_3": 1980.608221435547, "kl_loss_6": 1131.4310211181642, "learning_rate": 0.00018719296525263924, "loss": 881.9613, "step": 7180 }, { "ce_loss_12": 3.1456296682357787, "ce_loss_17": 3.035650098323822, "ce_loss_23": 2.9887816667556764, "ce_loss_3": 3.8410556077957154, "ce_loss_6": 3.4524468541145326, "epoch": 0.719, "grad_norm": 1120.0, "kl_loss_12": 334.7310394287109, "kl_loss_17": 86.69911155700683, "kl_loss_3": 1863.3901123046876, "kl_loss_6": 1063.2905212402343, "learning_rate": 0.0001859567346490913, "loss": 844.3992, "step": 7190 }, { "ce_loss_12": 3.1302160382270814, "ce_loss_17": 3.0147815585136413, "ce_loss_23": 2.9667599201202393, "ce_loss_3": 3.867290210723877, "ce_loss_6": 3.4623439788818358, "epoch": 0.72, "grad_norm": 1176.0, "kl_loss_12": 349.16916809082034, "kl_loss_17": 90.70890274047852, "kl_loss_3": 1963.2574035644532, "kl_loss_6": 1129.784686279297, "learning_rate": 0.0001847236664577389, "loss": 867.5368, "step": 7200 }, { "ce_loss_12": 3.1433603167533875, "ce_loss_17": 3.035125803947449, "ce_loss_23": 2.9883386731147765, "ce_loss_3": 3.834554946422577, "ce_loss_6": 3.444643223285675, "epoch": 0.721, "grad_norm": 1048.0, "kl_loss_12": 332.42013092041014, "kl_loss_17": 86.5483169555664, "kl_loss_3": 1854.5558959960938, "kl_loss_6": 1068.553970336914, "learning_rate": 0.00018349377309556487, "loss": 837.7305, "step": 7210 }, { "ce_loss_12": 3.0995816230773925, "ce_loss_17": 2.9839762926101683, "ce_loss_23": 2.936460256576538, "ce_loss_3": 3.8673662662506105, "ce_loss_6": 3.4319155335426332, "epoch": 0.722, "grad_norm": 1360.0, "kl_loss_12": 350.46849212646487, "kl_loss_17": 88.42042655944825, "kl_loss_3": 2043.6094055175781, "kl_loss_6": 1153.0741485595704, "learning_rate": 0.00018226706694758193, "loss": 891.0816, "step": 7220 }, { "ce_loss_12": 3.170437383651733, "ce_loss_17": 3.0576380372047423, "ce_loss_23": 3.010854959487915, "ce_loss_3": 3.889014732837677, "ce_loss_6": 3.4898629069328306, "epoch": 0.723, "grad_norm": 968.0, "kl_loss_12": 346.222639465332, "kl_loss_17": 87.71672706604004, "kl_loss_3": 1940.7088317871094, "kl_loss_6": 1117.3578521728516, "learning_rate": 0.0001810435603667075, "loss": 890.0791, "step": 7230 }, { "ce_loss_12": 3.0272300839424133, "ce_loss_17": 2.91408885717392, "ce_loss_23": 2.866550886631012, "ce_loss_3": 3.7645853042602537, "ce_loss_6": 3.3573010325431825, "epoch": 0.724, "grad_norm": 1200.0, "kl_loss_12": 335.91829986572264, "kl_loss_17": 85.53136253356934, "kl_loss_3": 1942.4877502441407, "kl_loss_6": 1104.8149505615233, "learning_rate": 0.0001798232656736389, "loss": 885.0737, "step": 7240 }, { "ce_loss_12": 3.1877684593200684, "ce_loss_17": 3.0757036089897154, "ce_loss_23": 3.0268973112106323, "ce_loss_3": 3.8765776991844176, "ce_loss_6": 3.4895286560058594, "epoch": 0.725, "grad_norm": 1008.0, "kl_loss_12": 334.43251953125, "kl_loss_17": 87.21447486877442, "kl_loss_3": 1844.7488708496094, "kl_loss_6": 1054.6623291015626, "learning_rate": 0.0001786061951567303, "loss": 851.126, "step": 7250 }, { "ce_loss_12": 3.1058161854743958, "ce_loss_17": 2.985197162628174, "ce_loss_23": 2.93839852809906, "ce_loss_3": 3.833547604084015, "ce_loss_6": 3.426920509338379, "epoch": 0.726, "grad_norm": 1168.0, "kl_loss_12": 345.34449768066406, "kl_loss_17": 88.04669914245605, "kl_loss_3": 1946.1626953125, "kl_loss_6": 1114.6494232177733, "learning_rate": 0.00017739236107186857, "loss": 881.4555, "step": 7260 }, { "ce_loss_12": 3.193452262878418, "ce_loss_17": 3.0852574586868284, "ce_loss_23": 3.0407148003578186, "ce_loss_3": 3.8763112664222716, "ce_loss_6": 3.491565001010895, "epoch": 0.727, "grad_norm": 1048.0, "kl_loss_12": 330.26270599365233, "kl_loss_17": 85.11148948669434, "kl_loss_3": 1837.509814453125, "kl_loss_6": 1056.963153076172, "learning_rate": 0.00017618177564234904, "loss": 842.5387, "step": 7270 }, { "ce_loss_12": 3.164004623889923, "ce_loss_17": 3.053889298439026, "ce_loss_23": 3.007025444507599, "ce_loss_3": 3.852389669418335, "ce_loss_6": 3.4695786118507383, "epoch": 0.728, "grad_norm": 832.0, "kl_loss_12": 326.8455780029297, "kl_loss_17": 84.78852577209473, "kl_loss_3": 1833.0364624023437, "kl_loss_6": 1058.3569793701172, "learning_rate": 0.00017497445105875377, "loss": 844.2723, "step": 7280 }, { "ce_loss_12": 3.0891406416893004, "ce_loss_17": 2.967900788784027, "ce_loss_23": 2.9209889650344847, "ce_loss_3": 3.8261526226997375, "ce_loss_6": 3.4110962629318236, "epoch": 0.729, "grad_norm": 1232.0, "kl_loss_12": 348.3873992919922, "kl_loss_17": 88.26057052612305, "kl_loss_3": 1980.3658569335937, "kl_loss_6": 1120.3544342041016, "learning_rate": 0.000173770399478828, "loss": 873.916, "step": 7290 }, { "ce_loss_12": 3.0114962816238404, "ce_loss_17": 2.900802159309387, "ce_loss_23": 2.8546555995941163, "ce_loss_3": 3.731195402145386, "ce_loss_6": 3.3254734754562376, "epoch": 0.73, "grad_norm": 1344.0, "kl_loss_12": 330.6724685668945, "kl_loss_17": 84.6192008972168, "kl_loss_3": 1910.844775390625, "kl_loss_6": 1080.101806640625, "learning_rate": 0.0001725696330273575, "loss": 879.4715, "step": 7300 }, { "ce_loss_12": 3.1867899537086486, "ce_loss_17": 3.0713447093963624, "ce_loss_23": 3.0244084358215333, "ce_loss_3": 3.8816495537757874, "ce_loss_6": 3.4958046793937685, "epoch": 0.731, "grad_norm": 1128.0, "kl_loss_12": 329.4728759765625, "kl_loss_17": 85.06861877441406, "kl_loss_3": 1856.8272705078125, "kl_loss_6": 1061.183837890625, "learning_rate": 0.00017137216379604724, "loss": 838.392, "step": 7310 }, { "ce_loss_12": 3.0677610874176025, "ce_loss_17": 2.9559975385665895, "ce_loss_23": 2.9073064804077147, "ce_loss_3": 3.7928367018699647, "ce_loss_6": 3.3871406435966493, "epoch": 0.732, "grad_norm": 1056.0, "kl_loss_12": 333.28870391845703, "kl_loss_17": 86.70836715698242, "kl_loss_3": 1921.3812133789063, "kl_loss_6": 1090.7506713867188, "learning_rate": 0.00017017800384339925, "loss": 863.3447, "step": 7320 }, { "ce_loss_12": 3.0337472200393676, "ce_loss_17": 2.9114068865776064, "ce_loss_23": 2.862027609348297, "ce_loss_3": 3.785917317867279, "ce_loss_6": 3.3646507382392885, "epoch": 0.733, "grad_norm": 1120.0, "kl_loss_12": 343.82637176513674, "kl_loss_17": 86.75051422119141, "kl_loss_3": 1992.6470275878905, "kl_loss_6": 1134.6840881347657, "learning_rate": 0.00016898716519459073, "loss": 862.7164, "step": 7330 }, { "ce_loss_12": 3.1520249962806703, "ce_loss_17": 3.0307239174842833, "ce_loss_23": 2.9793155193328857, "ce_loss_3": 3.9048298597335815, "ce_loss_6": 3.485947072505951, "epoch": 0.734, "grad_norm": 960.0, "kl_loss_12": 353.37522888183594, "kl_loss_17": 90.54953956604004, "kl_loss_3": 1989.7527282714843, "kl_loss_6": 1132.9010803222657, "learning_rate": 0.00016779965984135375, "loss": 875.7061, "step": 7340 }, { "ce_loss_12": 3.0715187668800352, "ce_loss_17": 2.9542425632476808, "ce_loss_23": 2.907315993309021, "ce_loss_3": 3.7954859375953673, "ce_loss_6": 3.3883111596107485, "epoch": 0.735, "grad_norm": 1184.0, "kl_loss_12": 332.02828521728514, "kl_loss_17": 85.16774787902833, "kl_loss_3": 1912.3802062988282, "kl_loss_6": 1084.7641662597657, "learning_rate": 0.00016661549974185424, "loss": 856.5238, "step": 7350 }, { "ce_loss_12": 3.0979945421218873, "ce_loss_17": 2.9852666735649107, "ce_loss_23": 2.9374090909957884, "ce_loss_3": 3.814216876029968, "ce_loss_6": 3.4164140701293944, "epoch": 0.736, "grad_norm": 1032.0, "kl_loss_12": 340.04807434082034, "kl_loss_17": 88.28477745056152, "kl_loss_3": 1913.0336975097657, "kl_loss_6": 1095.8302642822266, "learning_rate": 0.00016543469682057105, "loss": 852.7598, "step": 7360 }, { "ce_loss_12": 3.1197781205177306, "ce_loss_17": 3.0080280542373656, "ce_loss_23": 2.958756446838379, "ce_loss_3": 3.840847039222717, "ce_loss_6": 3.441115641593933, "epoch": 0.737, "grad_norm": 924.0, "kl_loss_12": 341.9563293457031, "kl_loss_17": 88.53944358825683, "kl_loss_3": 1920.9719665527343, "kl_loss_6": 1102.9890533447265, "learning_rate": 0.00016425726296817632, "loss": 859.2916, "step": 7370 }, { "ce_loss_12": 3.1310214161872865, "ce_loss_17": 3.0205642342567445, "ce_loss_23": 2.972715675830841, "ce_loss_3": 3.8499599575996397, "ce_loss_6": 3.447257339954376, "epoch": 0.738, "grad_norm": 1072.0, "kl_loss_12": 334.7860809326172, "kl_loss_17": 86.60757408142089, "kl_loss_3": 1892.2875061035156, "kl_loss_6": 1087.2785888671874, "learning_rate": 0.00016308321004141607, "loss": 857.2181, "step": 7380 }, { "ce_loss_12": 3.0963312149047852, "ce_loss_17": 2.9749263405799864, "ce_loss_23": 2.9255574226379393, "ce_loss_3": 3.832499289512634, "ce_loss_6": 3.4212956547737123, "epoch": 0.739, "grad_norm": 948.0, "kl_loss_12": 348.71044921875, "kl_loss_17": 90.2692081451416, "kl_loss_3": 1962.9808471679687, "kl_loss_6": 1123.849609375, "learning_rate": 0.00016191254986299043, "loss": 862.7074, "step": 7390 }, { "ce_loss_12": 3.1230055689811707, "ce_loss_17": 3.017664575576782, "ce_loss_23": 2.9723396420478823, "ce_loss_3": 3.840611970424652, "ce_loss_6": 3.440969097614288, "epoch": 0.74, "grad_norm": 1488.0, "kl_loss_12": 331.56529541015624, "kl_loss_17": 84.85293769836426, "kl_loss_3": 1911.277490234375, "kl_loss_6": 1093.4230743408202, "learning_rate": 0.00016074529422143398, "loss": 870.5579, "step": 7400 }, { "ce_loss_12": 3.0909118056297302, "ce_loss_17": 2.977470803260803, "ce_loss_23": 2.930717635154724, "ce_loss_3": 3.830814278125763, "ce_loss_6": 3.4192137718200684, "epoch": 0.741, "grad_norm": 3680.0, "kl_loss_12": 342.2099548339844, "kl_loss_17": 88.08484802246093, "kl_loss_3": 1951.8666381835938, "kl_loss_6": 1113.004623413086, "learning_rate": 0.0001595814548709983, "loss": 878.5271, "step": 7410 }, { "ce_loss_12": 3.160782778263092, "ce_loss_17": 3.0384599566459656, "ce_loss_23": 2.989315950870514, "ce_loss_3": 3.892155075073242, "ce_loss_6": 3.485103762149811, "epoch": 0.742, "grad_norm": 1264.0, "kl_loss_12": 353.0607345581055, "kl_loss_17": 90.35011444091796, "kl_loss_3": 1965.493115234375, "kl_loss_6": 1121.4259765625, "learning_rate": 0.00015842104353153285, "loss": 877.4688, "step": 7420 }, { "ce_loss_12": 3.1701106309890745, "ce_loss_17": 3.052255642414093, "ce_loss_23": 3.003518533706665, "ce_loss_3": 3.886456573009491, "ce_loss_6": 3.490762734413147, "epoch": 0.743, "grad_norm": 980.0, "kl_loss_12": 343.64434204101565, "kl_loss_17": 88.13621711730957, "kl_loss_3": 1922.4880676269531, "kl_loss_6": 1111.2306365966797, "learning_rate": 0.0001572640718883667, "loss": 884.9933, "step": 7430 }, { "ce_loss_12": 3.1077826380729676, "ce_loss_17": 2.9971907138824463, "ce_loss_23": 2.952656555175781, "ce_loss_3": 3.8170455574989317, "ce_loss_6": 3.420013201236725, "epoch": 0.744, "grad_norm": 1032.0, "kl_loss_12": 331.71873168945314, "kl_loss_17": 84.61126899719238, "kl_loss_3": 1899.858135986328, "kl_loss_6": 1076.7710723876953, "learning_rate": 0.0001561105515921915, "loss": 873.8502, "step": 7440 }, { "ce_loss_12": 2.9765826940536497, "ce_loss_17": 2.858693015575409, "ce_loss_23": 2.815770745277405, "ce_loss_3": 3.734588646888733, "ce_loss_6": 3.314176118373871, "epoch": 0.745, "grad_norm": 1072.0, "kl_loss_12": 336.96295471191405, "kl_loss_17": 84.19045066833496, "kl_loss_3": 2006.8513732910155, "kl_loss_6": 1138.3247772216796, "learning_rate": 0.0001549604942589441, "loss": 870.7094, "step": 7450 }, { "ce_loss_12": 3.143989932537079, "ce_loss_17": 3.035001349449158, "ce_loss_23": 2.9899677276611327, "ce_loss_3": 3.828385293483734, "ce_loss_6": 3.4482077240943907, "epoch": 0.746, "grad_norm": 876.0, "kl_loss_12": 324.0212677001953, "kl_loss_17": 84.12775382995605, "kl_loss_3": 1829.0653442382813, "kl_loss_6": 1048.3529968261719, "learning_rate": 0.00015381391146968864, "loss": 841.7859, "step": 7460 }, { "ce_loss_12": 3.1158527016639708, "ce_loss_17": 3.0048242568969727, "ce_loss_23": 2.9606425762176514, "ce_loss_3": 3.841553473472595, "ce_loss_6": 3.434686744213104, "epoch": 0.747, "grad_norm": 876.0, "kl_loss_12": 329.89722595214846, "kl_loss_17": 83.96846466064453, "kl_loss_3": 1906.8902648925782, "kl_loss_6": 1084.9622039794922, "learning_rate": 0.00015267081477050133, "loss": 861.179, "step": 7470 }, { "ce_loss_12": 3.213250148296356, "ce_loss_17": 3.096405303478241, "ce_loss_23": 3.0469411969184876, "ce_loss_3": 3.907569968700409, "ce_loss_6": 3.520617687702179, "epoch": 0.748, "grad_norm": 700.0, "kl_loss_12": 343.44225158691404, "kl_loss_17": 89.3300106048584, "kl_loss_3": 1879.0430603027344, "kl_loss_6": 1088.609292602539, "learning_rate": 0.00015153121567235335, "loss": 846.3131, "step": 7480 }, { "ce_loss_12": 3.1084712862968447, "ce_loss_17": 2.9978137254714965, "ce_loss_23": 2.9507338762283326, "ce_loss_3": 3.8373326897621154, "ce_loss_6": 3.4332410097122192, "epoch": 0.749, "grad_norm": 884.0, "kl_loss_12": 339.2166152954102, "kl_loss_17": 86.84514045715332, "kl_loss_3": 1947.7872985839845, "kl_loss_6": 1114.5451385498047, "learning_rate": 0.00015039512565099468, "loss": 846.359, "step": 7490 }, { "ce_loss_12": 3.1697430849075316, "ce_loss_17": 3.0568503975868224, "ce_loss_23": 3.0112765431404114, "ce_loss_3": 3.877428340911865, "ce_loss_6": 3.4780817866325378, "epoch": 0.75, "grad_norm": 1216.0, "kl_loss_12": 334.6671508789062, "kl_loss_17": 86.38764419555665, "kl_loss_3": 1901.6724182128905, "kl_loss_6": 1086.621160888672, "learning_rate": 0.00014926255614683932, "loss": 888.325, "step": 7500 }, { "ce_loss_12": 3.1050979256629945, "ce_loss_17": 2.9946041226387026, "ce_loss_23": 2.9476001858711243, "ce_loss_3": 3.813271498680115, "ce_loss_6": 3.409119975566864, "epoch": 0.751, "grad_norm": 944.0, "kl_loss_12": 335.94759063720704, "kl_loss_17": 85.60544509887696, "kl_loss_3": 1916.4962829589845, "kl_loss_6": 1076.0567474365234, "learning_rate": 0.0001481335185648498, "loss": 862.1184, "step": 7510 }, { "ce_loss_12": 3.128680205345154, "ce_loss_17": 3.0160374522209166, "ce_loss_23": 2.9687310814857484, "ce_loss_3": 3.846818244457245, "ce_loss_6": 3.4491943359375, "epoch": 0.752, "grad_norm": 876.0, "kl_loss_12": 336.658674621582, "kl_loss_17": 85.4277328491211, "kl_loss_3": 1906.6753662109375, "kl_loss_6": 1090.7926818847657, "learning_rate": 0.0001470080242744218, "loss": 851.4889, "step": 7520 }, { "ce_loss_12": 3.1233017206192017, "ce_loss_17": 3.0149004459381104, "ce_loss_23": 2.9699440360069276, "ce_loss_3": 3.8468761801719666, "ce_loss_6": 3.4403663873672485, "epoch": 0.753, "grad_norm": 1112.0, "kl_loss_12": 329.28695526123045, "kl_loss_17": 84.30590324401855, "kl_loss_3": 1924.2193359375, "kl_loss_6": 1089.3094604492187, "learning_rate": 0.0001458860846092705, "loss": 864.8695, "step": 7530 }, { "ce_loss_12": 3.1665912747383116, "ce_loss_17": 3.0538085103034973, "ce_loss_23": 3.006639873981476, "ce_loss_3": 3.859456181526184, "ce_loss_6": 3.472790026664734, "epoch": 0.754, "grad_norm": 1088.0, "kl_loss_12": 331.51774444580076, "kl_loss_17": 86.2380859375, "kl_loss_3": 1861.3445129394531, "kl_loss_6": 1068.8755920410156, "learning_rate": 0.00014476771086731566, "loss": 836.1793, "step": 7540 }, { "ce_loss_12": 3.250907337665558, "ce_loss_17": 3.1357580065727233, "ce_loss_23": 3.0857962131500245, "ce_loss_3": 3.951405715942383, "ce_loss_6": 3.560814690589905, "epoch": 0.755, "grad_norm": 984.0, "kl_loss_12": 341.4471206665039, "kl_loss_17": 91.6925064086914, "kl_loss_3": 1883.063543701172, "kl_loss_6": 1079.9726928710938, "learning_rate": 0.00014365291431056872, "loss": 877.0996, "step": 7550 }, { "ce_loss_12": 3.0967212200164793, "ce_loss_17": 2.9790724515914917, "ce_loss_23": 2.9304062604904173, "ce_loss_3": 3.826195013523102, "ce_loss_6": 3.4224355697631834, "epoch": 0.756, "grad_norm": 1112.0, "kl_loss_12": 350.49027557373046, "kl_loss_17": 89.87589607238769, "kl_loss_3": 1960.891094970703, "kl_loss_6": 1128.4765869140624, "learning_rate": 0.00014254170616501827, "loss": 871.8258, "step": 7560 }, { "ce_loss_12": 3.0440022349357605, "ce_loss_17": 2.922041583061218, "ce_loss_23": 2.870465707778931, "ce_loss_3": 3.807706618309021, "ce_loss_6": 3.387653279304504, "epoch": 0.757, "grad_norm": 1232.0, "kl_loss_12": 355.4050354003906, "kl_loss_17": 90.11079940795898, "kl_loss_3": 2026.7626831054688, "kl_loss_6": 1164.5902770996095, "learning_rate": 0.0001414340976205183, "loss": 902.8793, "step": 7570 }, { "ce_loss_12": 3.054443883895874, "ce_loss_17": 2.935328185558319, "ce_loss_23": 2.888164484500885, "ce_loss_3": 3.7937758922576905, "ce_loss_6": 3.3804110169410704, "epoch": 0.758, "grad_norm": 1248.0, "kl_loss_12": 341.5797866821289, "kl_loss_17": 86.31509780883789, "kl_loss_3": 1966.69814453125, "kl_loss_6": 1117.404315185547, "learning_rate": 0.00014033009983067452, "loss": 866.4357, "step": 7580 }, { "ce_loss_12": 3.192322778701782, "ce_loss_17": 3.083972525596619, "ce_loss_23": 3.0374709248542784, "ce_loss_3": 3.8759530782699585, "ce_loss_6": 3.4928683996200562, "epoch": 0.759, "grad_norm": 1184.0, "kl_loss_12": 327.23913726806643, "kl_loss_17": 84.71457977294922, "kl_loss_3": 1846.6355773925782, "kl_loss_6": 1052.3153900146485, "learning_rate": 0.00013922972391273224, "loss": 846.8314, "step": 7590 }, { "ce_loss_12": 3.1961251497268677, "ce_loss_17": 3.0865646481513975, "ce_loss_23": 3.0396140217781067, "ce_loss_3": 3.923987865447998, "ce_loss_6": 3.4996038913726806, "epoch": 0.76, "grad_norm": 1368.0, "kl_loss_12": 335.3072052001953, "kl_loss_17": 87.41547546386718, "kl_loss_3": 1925.2472839355469, "kl_loss_6": 1060.9364440917968, "learning_rate": 0.0001381329809474649, "loss": 860.7051, "step": 7600 }, { "ce_loss_12": 3.120570731163025, "ce_loss_17": 2.9973181962966917, "ce_loss_23": 2.946481966972351, "ce_loss_3": 3.8685464024543763, "ce_loss_6": 3.451310873031616, "epoch": 0.761, "grad_norm": 1256.0, "kl_loss_12": 350.7531967163086, "kl_loss_17": 89.52051429748535, "kl_loss_3": 1992.3995178222656, "kl_loss_6": 1138.2024780273437, "learning_rate": 0.0001370398819790621, "loss": 880.6619, "step": 7610 }, { "ce_loss_12": 3.2387707591056825, "ce_loss_17": 3.125054621696472, "ce_loss_23": 3.0756537199020384, "ce_loss_3": 3.933832108974457, "ce_loss_6": 3.54216411113739, "epoch": 0.762, "grad_norm": 1536.0, "kl_loss_12": 336.23710479736326, "kl_loss_17": 87.17769012451171, "kl_loss_3": 1866.5935974121094, "kl_loss_6": 1064.9635650634766, "learning_rate": 0.00013595043801501794, "loss": 841.4134, "step": 7620 }, { "ce_loss_12": 3.0535167813301087, "ce_loss_17": 2.9316890597343446, "ce_loss_23": 2.8828993320465086, "ce_loss_3": 3.835411822795868, "ce_loss_6": 3.4029742956161497, "epoch": 0.763, "grad_norm": 1216.0, "kl_loss_12": 349.1750427246094, "kl_loss_17": 88.26301383972168, "kl_loss_3": 2051.720281982422, "kl_loss_6": 1157.8002868652343, "learning_rate": 0.00013486466002602133, "loss": 884.99, "step": 7630 }, { "ce_loss_12": 3.1495963454246523, "ce_loss_17": 3.039739465713501, "ce_loss_23": 2.994281256198883, "ce_loss_3": 3.836189568042755, "ce_loss_6": 3.449015259742737, "epoch": 0.764, "grad_norm": 936.0, "kl_loss_12": 333.25688018798826, "kl_loss_17": 85.82595405578613, "kl_loss_3": 1866.9323425292969, "kl_loss_6": 1069.9278411865234, "learning_rate": 0.00013378255894584462, "loss": 871.2354, "step": 7640 }, { "ce_loss_12": 3.094685697555542, "ce_loss_17": 2.97695597410202, "ce_loss_23": 2.9276471614837645, "ce_loss_3": 3.837893486022949, "ce_loss_6": 3.421032023429871, "epoch": 0.765, "grad_norm": 1072.0, "kl_loss_12": 346.69294891357424, "kl_loss_17": 89.06358261108399, "kl_loss_3": 1963.3975646972656, "kl_loss_6": 1115.4526763916015, "learning_rate": 0.0001327041456712334, "loss": 876.7345, "step": 7650 }, { "ce_loss_12": 3.131850790977478, "ce_loss_17": 3.0180388569831846, "ce_loss_23": 2.969780373573303, "ce_loss_3": 3.849341535568237, "ce_loss_6": 3.4498900294303896, "epoch": 0.766, "grad_norm": 1232.0, "kl_loss_12": 344.56713409423827, "kl_loss_17": 88.26175918579102, "kl_loss_3": 1933.8005432128907, "kl_loss_6": 1108.826953125, "learning_rate": 0.00013162943106179747, "loss": 874.494, "step": 7660 }, { "ce_loss_12": 3.1083985328674317, "ce_loss_17": 2.994685435295105, "ce_loss_23": 2.948492395877838, "ce_loss_3": 3.8132495284080505, "ce_loss_6": 3.419460141658783, "epoch": 0.767, "grad_norm": 1168.0, "kl_loss_12": 333.51930847167966, "kl_loss_17": 86.19657096862792, "kl_loss_3": 1899.5893188476562, "kl_loss_6": 1087.605291748047, "learning_rate": 0.00013055842593990132, "loss": 856.2435, "step": 7670 }, { "ce_loss_12": 3.061744201183319, "ce_loss_17": 2.94765499830246, "ce_loss_23": 2.902495539188385, "ce_loss_3": 3.770342934131622, "ce_loss_6": 3.374074196815491, "epoch": 0.768, "grad_norm": 864.0, "kl_loss_12": 333.0192306518555, "kl_loss_17": 84.84715881347657, "kl_loss_3": 1887.3901062011719, "kl_loss_6": 1079.4616302490235, "learning_rate": 0.00012949114109055414, "loss": 870.6168, "step": 7680 }, { "ce_loss_12": 3.1028016567230225, "ce_loss_17": 2.9852315068244932, "ce_loss_23": 2.938108468055725, "ce_loss_3": 3.8279983043670653, "ce_loss_6": 3.423141694068909, "epoch": 0.769, "grad_norm": 988.0, "kl_loss_12": 343.0029541015625, "kl_loss_17": 87.91576690673828, "kl_loss_3": 1937.405712890625, "kl_loss_6": 1109.458090209961, "learning_rate": 0.00012842758726130281, "loss": 876.5707, "step": 7690 }, { "ce_loss_12": 3.149825596809387, "ce_loss_17": 3.0305551290512085, "ce_loss_23": 2.9809163212776184, "ce_loss_3": 3.894960582256317, "ce_loss_6": 3.4808794498443603, "epoch": 0.77, "grad_norm": 1496.0, "kl_loss_12": 347.4535766601563, "kl_loss_17": 89.44935035705566, "kl_loss_3": 1967.200506591797, "kl_loss_6": 1123.7082885742188, "learning_rate": 0.00012736777516212267, "loss": 862.7064, "step": 7700 }, { "ce_loss_12": 3.1452438831329346, "ce_loss_17": 3.028602695465088, "ce_loss_23": 2.9798243761062624, "ce_loss_3": 3.8623616099357605, "ce_loss_6": 3.4660856246948244, "epoch": 0.771, "grad_norm": 1016.0, "kl_loss_12": 346.66766967773435, "kl_loss_17": 88.37929077148438, "kl_loss_3": 1928.3756225585937, "kl_loss_6": 1109.7884368896484, "learning_rate": 0.00012631171546530968, "loss": 855.5857, "step": 7710 }, { "ce_loss_12": 3.151771867275238, "ce_loss_17": 3.0342516899108887, "ce_loss_23": 2.983285999298096, "ce_loss_3": 3.868927764892578, "ce_loss_6": 3.4712912917137144, "epoch": 0.772, "grad_norm": 952.0, "kl_loss_12": 347.64147186279297, "kl_loss_17": 90.16236190795898, "kl_loss_3": 1936.895196533203, "kl_loss_6": 1113.5256469726562, "learning_rate": 0.00012525941880537307, "loss": 878.1221, "step": 7720 }, { "ce_loss_12": 3.181893002986908, "ce_loss_17": 3.068693768978119, "ce_loss_23": 3.0194692611694336, "ce_loss_3": 3.8950560331344604, "ce_loss_6": 3.498482322692871, "epoch": 0.773, "grad_norm": 1004.0, "kl_loss_12": 336.63256683349607, "kl_loss_17": 86.24755783081055, "kl_loss_3": 1900.8433837890625, "kl_loss_6": 1094.9439544677734, "learning_rate": 0.00012421089577892869, "loss": 859.998, "step": 7730 }, { "ce_loss_12": 3.137857365608215, "ce_loss_17": 3.022036409378052, "ce_loss_23": 2.974807071685791, "ce_loss_3": 3.8699662685394287, "ce_loss_6": 3.463915300369263, "epoch": 0.774, "grad_norm": 1280.0, "kl_loss_12": 343.84002380371095, "kl_loss_17": 86.91533393859864, "kl_loss_3": 1949.8188903808593, "kl_loss_6": 1111.0187286376954, "learning_rate": 0.0001231661569445919, "loss": 872.7768, "step": 7740 }, { "ce_loss_12": 3.00217719078064, "ce_loss_17": 2.8886006474494934, "ce_loss_23": 2.842159616947174, "ce_loss_3": 3.737683582305908, "ce_loss_6": 3.327630090713501, "epoch": 0.775, "grad_norm": 1088.0, "kl_loss_12": 338.36918182373046, "kl_loss_17": 85.87432556152343, "kl_loss_3": 1941.089581298828, "kl_loss_6": 1107.751934814453, "learning_rate": 0.00012212521282287093, "loss": 881.309, "step": 7750 }, { "ce_loss_12": 3.144416570663452, "ce_loss_17": 3.0234966397285463, "ce_loss_23": 2.975559389591217, "ce_loss_3": 3.850447392463684, "ce_loss_6": 3.460440826416016, "epoch": 0.776, "grad_norm": 1232.0, "kl_loss_12": 345.28298034667966, "kl_loss_17": 89.02879981994629, "kl_loss_3": 1899.7856140136719, "kl_loss_6": 1097.8369415283203, "learning_rate": 0.00012108807389606158, "loss": 873.9021, "step": 7760 }, { "ce_loss_12": 3.1395771861076356, "ce_loss_17": 3.029573917388916, "ce_loss_23": 2.9835257172584533, "ce_loss_3": 3.853815996646881, "ce_loss_6": 3.4518202900886537, "epoch": 0.777, "grad_norm": 976.0, "kl_loss_12": 333.39319763183596, "kl_loss_17": 85.62209701538086, "kl_loss_3": 1905.5265747070312, "kl_loss_6": 1087.9601959228517, "learning_rate": 0.00012005475060814159, "loss": 858.8643, "step": 7770 }, { "ce_loss_12": 3.0747068762779235, "ce_loss_17": 2.959312844276428, "ce_loss_23": 2.9127468824386598, "ce_loss_3": 3.827551472187042, "ce_loss_6": 3.4109951853752136, "epoch": 0.778, "grad_norm": 1104.0, "kl_loss_12": 343.21605529785154, "kl_loss_17": 87.49714851379395, "kl_loss_3": 1972.1259399414062, "kl_loss_6": 1126.4033752441405, "learning_rate": 0.00011902525336466464, "loss": 873.4488, "step": 7780 }, { "ce_loss_12": 3.0730289697647093, "ce_loss_17": 2.9537671327590944, "ce_loss_23": 2.904254841804504, "ce_loss_3": 3.8248645424842835, "ce_loss_6": 3.4150359272956847, "epoch": 0.779, "grad_norm": 952.0, "kl_loss_12": 349.9987197875977, "kl_loss_17": 89.52234764099121, "kl_loss_3": 1999.2891967773437, "kl_loss_6": 1142.511083984375, "learning_rate": 0.00011799959253265668, "loss": 875.4477, "step": 7790 }, { "ce_loss_12": 3.123792839050293, "ce_loss_17": 3.0112548232078553, "ce_loss_23": 2.9621198654174803, "ce_loss_3": 3.8543333888053892, "ce_loss_6": 3.442795205116272, "epoch": 0.78, "grad_norm": 1176.0, "kl_loss_12": 345.0049163818359, "kl_loss_17": 89.56121673583985, "kl_loss_3": 1956.1147583007812, "kl_loss_6": 1116.1721801757812, "learning_rate": 0.00011697777844051105, "loss": 874.4314, "step": 7800 }, { "ce_loss_12": 3.1173406839370728, "ce_loss_17": 3.001301908493042, "ce_loss_23": 2.9508003115653993, "ce_loss_3": 3.874517786502838, "ce_loss_6": 3.4531596899032593, "epoch": 0.781, "grad_norm": 1240.0, "kl_loss_12": 345.39916534423827, "kl_loss_17": 89.18724632263184, "kl_loss_3": 2014.996337890625, "kl_loss_6": 1148.1400665283204, "learning_rate": 0.00011595982137788402, "loss": 883.8523, "step": 7810 }, { "ce_loss_12": 3.0881298422813415, "ce_loss_17": 2.9777971744537353, "ce_loss_23": 2.9313037514686586, "ce_loss_3": 3.7849818110466003, "ce_loss_6": 3.3961219668388365, "epoch": 0.782, "grad_norm": 1096.0, "kl_loss_12": 333.4578491210938, "kl_loss_17": 85.8090145111084, "kl_loss_3": 1869.9925354003906, "kl_loss_6": 1074.4398956298828, "learning_rate": 0.00011494573159559212, "loss": 856.6957, "step": 7820 }, { "ce_loss_12": 3.081245505809784, "ce_loss_17": 2.9624974012374876, "ce_loss_23": 2.9150551199913024, "ce_loss_3": 3.804460048675537, "ce_loss_6": 3.403433883190155, "epoch": 0.783, "grad_norm": 1128.0, "kl_loss_12": 341.8298904418945, "kl_loss_17": 88.31861038208008, "kl_loss_3": 1931.8560729980468, "kl_loss_6": 1109.651821899414, "learning_rate": 0.00011393551930550828, "loss": 881.5422, "step": 7830 }, { "ce_loss_12": 3.1982470750808716, "ce_loss_17": 3.0831907868385313, "ce_loss_23": 3.036144268512726, "ce_loss_3": 3.900965392589569, "ce_loss_6": 3.509566235542297, "epoch": 0.784, "grad_norm": 976.0, "kl_loss_12": 338.9253555297852, "kl_loss_17": 88.68736457824707, "kl_loss_3": 1885.1073425292968, "kl_loss_6": 1077.3505157470704, "learning_rate": 0.00011292919468045875, "loss": 855.8523, "step": 7840 }, { "ce_loss_12": 3.1600067257881164, "ce_loss_17": 3.0468934297561647, "ce_loss_23": 2.9999232292175293, "ce_loss_3": 3.8751365542411804, "ce_loss_6": 3.4719273805618287, "epoch": 0.785, "grad_norm": 740.0, "kl_loss_12": 340.442431640625, "kl_loss_17": 86.88551063537598, "kl_loss_3": 1915.2488525390625, "kl_loss_6": 1095.1250030517579, "learning_rate": 0.00011192676785412154, "loss": 854.8797, "step": 7850 }, { "ce_loss_12": 3.0997593998908997, "ce_loss_17": 2.983069372177124, "ce_loss_23": 2.9338279128074647, "ce_loss_3": 3.8563276171684264, "ce_loss_6": 3.438832497596741, "epoch": 0.786, "grad_norm": 1096.0, "kl_loss_12": 342.4455062866211, "kl_loss_17": 88.51511421203614, "kl_loss_3": 1975.3369506835938, "kl_loss_6": 1121.7781005859374, "learning_rate": 0.00011092824892092374, "loss": 877.8574, "step": 7860 }, { "ce_loss_12": 3.0392884254455566, "ce_loss_17": 2.924478280544281, "ce_loss_23": 2.878358209133148, "ce_loss_3": 3.789666223526001, "ce_loss_6": 3.373346519470215, "epoch": 0.787, "grad_norm": 944.0, "kl_loss_12": 343.36658477783203, "kl_loss_17": 86.76849899291992, "kl_loss_3": 1991.3924743652344, "kl_loss_6": 1129.0875396728516, "learning_rate": 0.0001099336479359398, "loss": 869.0258, "step": 7870 }, { "ce_loss_12": 3.153367805480957, "ce_loss_17": 3.041387641429901, "ce_loss_23": 2.9948896527290345, "ce_loss_3": 3.8533716440200805, "ce_loss_6": 3.459426212310791, "epoch": 0.788, "grad_norm": 868.0, "kl_loss_12": 333.95757446289065, "kl_loss_17": 86.1301658630371, "kl_loss_3": 1893.079901123047, "kl_loss_6": 1074.507797241211, "learning_rate": 0.00010894297491479043, "loss": 861.5433, "step": 7880 }, { "ce_loss_12": 3.138637971878052, "ce_loss_17": 3.026522934436798, "ce_loss_23": 2.980404424667358, "ce_loss_3": 3.85728714466095, "ce_loss_6": 3.4558101177215574, "epoch": 0.789, "grad_norm": 1048.0, "kl_loss_12": 336.65147552490237, "kl_loss_17": 86.43539237976074, "kl_loss_3": 1911.0948852539063, "kl_loss_6": 1093.9242156982423, "learning_rate": 0.00010795623983354214, "loss": 856.4976, "step": 7890 }, { "ce_loss_12": 3.048935604095459, "ce_loss_17": 2.9298244953155517, "ce_loss_23": 2.8829049706459045, "ce_loss_3": 3.783765459060669, "ce_loss_6": 3.3757433891296387, "epoch": 0.79, "grad_norm": 1064.0, "kl_loss_12": 348.3014938354492, "kl_loss_17": 89.91097488403321, "kl_loss_3": 1957.4156311035156, "kl_loss_6": 1125.400372314453, "learning_rate": 0.00010697345262860636, "loss": 867.2932, "step": 7900 }, { "ce_loss_12": 3.1738720655441286, "ce_loss_17": 3.0629943370819093, "ce_loss_23": 3.017685425281525, "ce_loss_3": 3.871600615978241, "ce_loss_6": 3.4759154319763184, "epoch": 0.791, "grad_norm": 964.0, "kl_loss_12": 334.6929733276367, "kl_loss_17": 86.53432846069336, "kl_loss_3": 1884.543878173828, "kl_loss_6": 1070.356005859375, "learning_rate": 0.00010599462319663906, "loss": 846.766, "step": 7910 }, { "ce_loss_12": 3.1430269718170165, "ce_loss_17": 3.0329277157783507, "ce_loss_23": 2.9859153389930726, "ce_loss_3": 3.834847366809845, "ce_loss_6": 3.4506487131118773, "epoch": 0.792, "grad_norm": 904.0, "kl_loss_12": 331.15181121826174, "kl_loss_17": 85.92568435668946, "kl_loss_3": 1852.7552917480468, "kl_loss_6": 1062.5147399902344, "learning_rate": 0.00010501976139444191, "loss": 840.7529, "step": 7920 }, { "ce_loss_12": 3.1669753551483155, "ce_loss_17": 3.0577998042106627, "ce_loss_23": 3.0104619026184083, "ce_loss_3": 3.8746066093444824, "ce_loss_6": 3.4819621324539183, "epoch": 0.793, "grad_norm": 1296.0, "kl_loss_12": 332.4377197265625, "kl_loss_17": 87.50224800109864, "kl_loss_3": 1898.9986511230468, "kl_loss_6": 1078.7464477539063, "learning_rate": 0.0001040488770388625, "loss": 866.2617, "step": 7930 }, { "ce_loss_12": 3.1284486651420593, "ce_loss_17": 3.0125857830047607, "ce_loss_23": 2.9665513753890993, "ce_loss_3": 3.844206702709198, "ce_loss_6": 3.443767809867859, "epoch": 0.794, "grad_norm": 1160.0, "kl_loss_12": 340.0913330078125, "kl_loss_17": 86.53030662536621, "kl_loss_3": 1935.131103515625, "kl_loss_6": 1105.029623413086, "learning_rate": 0.00010308197990669538, "loss": 862.9814, "step": 7940 }, { "ce_loss_12": 3.2380952715873716, "ce_loss_17": 3.122182333469391, "ce_loss_23": 3.070917046070099, "ce_loss_3": 3.947544777393341, "ce_loss_6": 3.5485092639923095, "epoch": 0.795, "grad_norm": 840.0, "kl_loss_12": 347.7717346191406, "kl_loss_17": 90.15585250854492, "kl_loss_3": 1925.0963745117188, "kl_loss_6": 1109.9462158203125, "learning_rate": 0.0001021190797345839, "loss": 859.2271, "step": 7950 }, { "ce_loss_12": 2.982831633090973, "ce_loss_17": 2.862782156467438, "ce_loss_23": 2.813780164718628, "ce_loss_3": 3.741881549358368, "ce_loss_6": 3.326892077922821, "epoch": 0.796, "grad_norm": 948.0, "kl_loss_12": 356.4972442626953, "kl_loss_17": 90.53372077941894, "kl_loss_3": 2013.8299743652344, "kl_loss_6": 1158.7713195800782, "learning_rate": 0.00010116018621892236, "loss": 880.7428, "step": 7960 }, { "ce_loss_12": 3.1805360555648803, "ce_loss_17": 3.0628061056137086, "ce_loss_23": 3.014175093173981, "ce_loss_3": 3.913769769668579, "ce_loss_6": 3.50421302318573, "epoch": 0.797, "grad_norm": 1168.0, "kl_loss_12": 355.2409729003906, "kl_loss_17": 92.6179973602295, "kl_loss_3": 1972.5615600585938, "kl_loss_6": 1139.47099609375, "learning_rate": 0.00010020530901575753, "loss": 861.009, "step": 7970 }, { "ce_loss_12": 3.196563351154327, "ce_loss_17": 3.083208155632019, "ce_loss_23": 3.0344853520393373, "ce_loss_3": 3.9116249561309813, "ce_loss_6": 3.5146140575408937, "epoch": 0.798, "grad_norm": 1020.0, "kl_loss_12": 343.97747192382815, "kl_loss_17": 88.96430740356445, "kl_loss_3": 1921.1753173828124, "kl_loss_6": 1103.5718902587892, "learning_rate": 9.925445774069231e-05, "loss": 848.7297, "step": 7980 }, { "ce_loss_12": 3.1499200463294983, "ce_loss_17": 3.034737801551819, "ce_loss_23": 2.986804115772247, "ce_loss_3": 3.862005889415741, "ce_loss_6": 3.4680118560791016, "epoch": 0.799, "grad_norm": 1004.0, "kl_loss_12": 335.8933639526367, "kl_loss_17": 87.28639183044433, "kl_loss_3": 1894.1492004394531, "kl_loss_6": 1091.5393951416015, "learning_rate": 9.830764196878872e-05, "loss": 839.0573, "step": 7990 }, { "ce_loss_12": 3.100402855873108, "ce_loss_17": 2.9863922595977783, "ce_loss_23": 2.9403823256492614, "ce_loss_3": 3.817837917804718, "ce_loss_6": 3.4157135605812075, "epoch": 0.8, "grad_norm": 868.0, "kl_loss_12": 337.6185836791992, "kl_loss_17": 86.11444549560547, "kl_loss_3": 1953.2614868164062, "kl_loss_6": 1107.330581665039, "learning_rate": 9.736487123447069e-05, "loss": 864.8945, "step": 8000 }, { "ce_loss_12": 3.0528087258338927, "ce_loss_17": 2.940442717075348, "ce_loss_23": 2.8930461049079894, "ce_loss_3": 3.8335362315177917, "ce_loss_6": 3.400530993938446, "epoch": 0.801, "grad_norm": 1272.0, "kl_loss_12": 343.6899673461914, "kl_loss_17": 87.17022972106933, "kl_loss_3": 2055.389855957031, "kl_loss_6": 1163.5688232421876, "learning_rate": 9.642615503142926e-05, "loss": 892.4299, "step": 8010 }, { "ce_loss_12": 3.114699602127075, "ce_loss_17": 2.99845632314682, "ce_loss_23": 2.949676924943924, "ce_loss_3": 3.836538827419281, "ce_loss_6": 3.439238655567169, "epoch": 0.802, "grad_norm": 888.0, "kl_loss_12": 334.9020050048828, "kl_loss_17": 86.2359634399414, "kl_loss_3": 1921.4569213867187, "kl_loss_6": 1090.2377807617188, "learning_rate": 9.549150281252633e-05, "loss": 850.6668, "step": 8020 }, { "ce_loss_12": 3.1417679309844972, "ce_loss_17": 3.025522565841675, "ce_loss_23": 2.9773662090301514, "ce_loss_3": 3.8609896898269653, "ce_loss_6": 3.461854100227356, "epoch": 0.803, "grad_norm": 1048.0, "kl_loss_12": 341.2946090698242, "kl_loss_17": 87.30495986938476, "kl_loss_3": 1931.938671875, "kl_loss_6": 1098.1311889648437, "learning_rate": 9.4560923989699e-05, "loss": 874.5707, "step": 8030 }, { "ce_loss_12": 3.132170486450195, "ce_loss_17": 3.016941010951996, "ce_loss_23": 2.9699743747711183, "ce_loss_3": 3.850451099872589, "ce_loss_6": 3.449750316143036, "epoch": 0.804, "grad_norm": 900.0, "kl_loss_12": 342.9051147460938, "kl_loss_17": 87.89902000427246, "kl_loss_3": 1910.6194458007812, "kl_loss_6": 1091.275827026367, "learning_rate": 9.363442793386607e-05, "loss": 880.9614, "step": 8040 }, { "ce_loss_12": 3.1098117351531984, "ce_loss_17": 2.9859721541404722, "ce_loss_23": 2.9361562609672545, "ce_loss_3": 3.861330676078796, "ce_loss_6": 3.4467449426651, "epoch": 0.805, "grad_norm": 1416.0, "kl_loss_12": 350.2294982910156, "kl_loss_17": 89.05590744018555, "kl_loss_3": 1981.568341064453, "kl_loss_6": 1135.2284088134766, "learning_rate": 9.271202397483213e-05, "loss": 859.5157, "step": 8050 }, { "ce_loss_12": 3.13518203496933, "ce_loss_17": 3.0245556592941285, "ce_loss_23": 2.97843199968338, "ce_loss_3": 3.836519110202789, "ce_loss_6": 3.4399279713630677, "epoch": 0.806, "grad_norm": 912.0, "kl_loss_12": 335.34840240478513, "kl_loss_17": 86.3288730621338, "kl_loss_3": 1890.5824157714844, "kl_loss_6": 1081.4476745605468, "learning_rate": 9.179372140119524e-05, "loss": 867.2266, "step": 8060 }, { "ce_loss_12": 3.0809193730354307, "ce_loss_17": 2.9720325112342834, "ce_loss_23": 2.925378918647766, "ce_loss_3": 3.795903038978577, "ce_loss_6": 3.3943702578544617, "epoch": 0.807, "grad_norm": 1048.0, "kl_loss_12": 333.8464828491211, "kl_loss_17": 85.5449089050293, "kl_loss_3": 1908.2876770019532, "kl_loss_6": 1088.5864166259767, "learning_rate": 9.087952946025175e-05, "loss": 870.8111, "step": 8070 }, { "ce_loss_12": 3.1861281633377074, "ce_loss_17": 3.076117146015167, "ce_loss_23": 3.0290566802024843, "ce_loss_3": 3.8652841329574583, "ce_loss_6": 3.4856042742729185, "epoch": 0.808, "grad_norm": 1000.0, "kl_loss_12": 327.95980072021484, "kl_loss_17": 85.12698631286621, "kl_loss_3": 1835.6515197753906, "kl_loss_6": 1050.8891906738281, "learning_rate": 8.996945735790446e-05, "loss": 856.2945, "step": 8080 }, { "ce_loss_12": 3.0848318457603456, "ce_loss_17": 2.9725335121154783, "ce_loss_23": 2.9265166401863096, "ce_loss_3": 3.7977205395698546, "ce_loss_6": 3.4063352584838866, "epoch": 0.809, "grad_norm": 1136.0, "kl_loss_12": 336.2959716796875, "kl_loss_17": 86.1030330657959, "kl_loss_3": 1917.192578125, "kl_loss_6": 1102.2066314697265, "learning_rate": 8.906351425856951e-05, "loss": 870.3629, "step": 8090 }, { "ce_loss_12": 3.071573090553284, "ce_loss_17": 2.9595313549041746, "ce_loss_23": 2.9116446137428285, "ce_loss_3": 3.8096127271652223, "ce_loss_6": 3.4040393114089964, "epoch": 0.81, "grad_norm": 1112.0, "kl_loss_12": 340.50690307617185, "kl_loss_17": 87.05605773925781, "kl_loss_3": 1975.457861328125, "kl_loss_6": 1130.7320648193358, "learning_rate": 8.816170928508365e-05, "loss": 881.4886, "step": 8100 }, { "ce_loss_12": 3.04113427400589, "ce_loss_17": 2.9222721576690676, "ce_loss_23": 2.875444030761719, "ce_loss_3": 3.8006600379943847, "ce_loss_6": 3.37364467382431, "epoch": 0.811, "grad_norm": 952.0, "kl_loss_12": 346.2675354003906, "kl_loss_17": 87.38955802917481, "kl_loss_3": 2010.8611755371094, "kl_loss_6": 1136.984552001953, "learning_rate": 8.7264051518613e-05, "loss": 873.9125, "step": 8110 }, { "ce_loss_12": 3.1181398153305055, "ce_loss_17": 3.007498395442963, "ce_loss_23": 2.962209713459015, "ce_loss_3": 3.821540927886963, "ce_loss_6": 3.4283391356468202, "epoch": 0.812, "grad_norm": 1064.0, "kl_loss_12": 328.8416946411133, "kl_loss_17": 84.78668899536133, "kl_loss_3": 1879.4911193847656, "kl_loss_6": 1073.913980102539, "learning_rate": 8.637054999856148e-05, "loss": 858.3178, "step": 8120 }, { "ce_loss_12": 3.1156737446784972, "ce_loss_17": 2.999300456047058, "ce_loss_23": 2.9509395956993103, "ce_loss_3": 3.8430278182029722, "ce_loss_6": 3.4434648156166077, "epoch": 0.813, "grad_norm": 932.0, "kl_loss_12": 343.0796173095703, "kl_loss_17": 87.72957420349121, "kl_loss_3": 1928.7910522460938, "kl_loss_6": 1103.8207946777343, "learning_rate": 8.548121372247918e-05, "loss": 875.3312, "step": 8130 }, { "ce_loss_12": 3.1797402143478393, "ce_loss_17": 3.072401428222656, "ce_loss_23": 3.0259174704551697, "ce_loss_3": 3.880035877227783, "ce_loss_6": 3.4850649356842043, "epoch": 0.814, "grad_norm": 1360.0, "kl_loss_12": 333.69579162597654, "kl_loss_17": 86.78376846313476, "kl_loss_3": 1905.3100463867188, "kl_loss_6": 1079.9478851318358, "learning_rate": 8.459605164597267e-05, "loss": 853.8594, "step": 8140 }, { "ce_loss_12": 3.0721118688583373, "ce_loss_17": 2.96068993806839, "ce_loss_23": 2.9150487184524536, "ce_loss_3": 3.8026944279670714, "ce_loss_6": 3.4011393070220945, "epoch": 0.815, "grad_norm": 796.0, "kl_loss_12": 336.3142486572266, "kl_loss_17": 86.4559268951416, "kl_loss_3": 1935.2485229492188, "kl_loss_6": 1104.572802734375, "learning_rate": 8.371507268261436e-05, "loss": 873.4855, "step": 8150 }, { "ce_loss_12": 3.1365590333938598, "ce_loss_17": 3.0222877860069275, "ce_loss_23": 2.97467303276062, "ce_loss_3": 3.8597262263298036, "ce_loss_6": 3.45579069852829, "epoch": 0.816, "grad_norm": 932.0, "kl_loss_12": 338.98358764648435, "kl_loss_17": 86.73196105957031, "kl_loss_3": 1914.698223876953, "kl_loss_6": 1087.3596771240234, "learning_rate": 8.283828570385238e-05, "loss": 840.1645, "step": 8160 }, { "ce_loss_12": 3.1392388939857483, "ce_loss_17": 3.0261239886283873, "ce_loss_23": 2.978219211101532, "ce_loss_3": 3.853709506988525, "ce_loss_6": 3.455215263366699, "epoch": 0.817, "grad_norm": 864.0, "kl_loss_12": 335.7754180908203, "kl_loss_17": 86.84048919677734, "kl_loss_3": 1893.3304809570313, "kl_loss_6": 1083.5698669433593, "learning_rate": 8.196569953892202e-05, "loss": 855.8519, "step": 8170 }, { "ce_loss_12": 3.071793007850647, "ce_loss_17": 2.9521210312843325, "ce_loss_23": 2.90672482252121, "ce_loss_3": 3.79334899187088, "ce_loss_6": 3.3872650504112243, "epoch": 0.818, "grad_norm": 1264.0, "kl_loss_12": 343.29752960205076, "kl_loss_17": 87.74607048034667, "kl_loss_3": 1917.1433898925782, "kl_loss_6": 1101.1959533691406, "learning_rate": 8.109732297475635e-05, "loss": 856.9976, "step": 8180 }, { "ce_loss_12": 3.044912302494049, "ce_loss_17": 2.9229483723640444, "ce_loss_23": 2.8729085326194763, "ce_loss_3": 3.828094220161438, "ce_loss_6": 3.4056942462921143, "epoch": 0.819, "grad_norm": 1032.0, "kl_loss_12": 352.4162796020508, "kl_loss_17": 89.06830749511718, "kl_loss_3": 2030.4641357421874, "kl_loss_6": 1173.2847595214844, "learning_rate": 8.023316475589754e-05, "loss": 893.1128, "step": 8190 }, { "ce_loss_12": 3.016770374774933, "ce_loss_17": 2.8914675116539, "ce_loss_23": 2.839850068092346, "ce_loss_3": 3.8171961426734926, "ce_loss_6": 3.369654881954193, "epoch": 0.82, "grad_norm": 1616.0, "kl_loss_12": 363.68381652832034, "kl_loss_17": 93.56408424377442, "kl_loss_3": 2083.283099365234, "kl_loss_6": 1186.300814819336, "learning_rate": 7.937323358440934e-05, "loss": 905.9598, "step": 8200 }, { "ce_loss_12": 3.12202787399292, "ce_loss_17": 3.014573073387146, "ce_loss_23": 2.9696925044059754, "ce_loss_3": 3.817815911769867, "ce_loss_6": 3.4261608719825745, "epoch": 0.821, "grad_norm": 1040.0, "kl_loss_12": 329.6168685913086, "kl_loss_17": 85.38657341003417, "kl_loss_3": 1873.8861389160156, "kl_loss_6": 1069.3458221435546, "learning_rate": 7.851753811978923e-05, "loss": 855.5345, "step": 8210 }, { "ce_loss_12": 3.140878129005432, "ce_loss_17": 3.0264705181121827, "ce_loss_23": 2.9791894435882567, "ce_loss_3": 3.8736305713653563, "ce_loss_6": 3.462307298183441, "epoch": 0.822, "grad_norm": 984.0, "kl_loss_12": 339.41978149414064, "kl_loss_17": 88.28028259277343, "kl_loss_3": 1955.4935302734375, "kl_loss_6": 1115.261944580078, "learning_rate": 7.766608697888095e-05, "loss": 862.4045, "step": 8220 }, { "ce_loss_12": 3.1515311360359193, "ce_loss_17": 3.036611783504486, "ce_loss_23": 2.9878187775611877, "ce_loss_3": 3.8725401878356935, "ce_loss_6": 3.470382332801819, "epoch": 0.823, "grad_norm": 952.0, "kl_loss_12": 342.2221923828125, "kl_loss_17": 89.53544616699219, "kl_loss_3": 1944.193017578125, "kl_loss_6": 1110.061944580078, "learning_rate": 7.681888873578785e-05, "loss": 875.9349, "step": 8230 }, { "ce_loss_12": 3.082021200656891, "ce_loss_17": 2.9599790930747987, "ce_loss_23": 2.9113461136817933, "ce_loss_3": 3.8219818115234374, "ce_loss_6": 3.4070132970809937, "epoch": 0.824, "grad_norm": 1024.0, "kl_loss_12": 350.0909225463867, "kl_loss_17": 90.64294090270997, "kl_loss_3": 1968.9792602539062, "kl_loss_6": 1120.007046508789, "learning_rate": 7.597595192178702e-05, "loss": 868.1928, "step": 8240 }, { "ce_loss_12": 3.085583007335663, "ce_loss_17": 2.968785548210144, "ce_loss_23": 2.9183531761169434, "ce_loss_3": 3.834295225143433, "ce_loss_6": 3.4201825857162476, "epoch": 0.825, "grad_norm": 1272.0, "kl_loss_12": 349.5534942626953, "kl_loss_17": 89.26756324768067, "kl_loss_3": 2006.1392211914062, "kl_loss_6": 1147.5581146240233, "learning_rate": 7.513728502524286e-05, "loss": 885.6963, "step": 8250 }, { "ce_loss_12": 3.0719595193862914, "ce_loss_17": 2.9636343240737917, "ce_loss_23": 2.9201133489608764, "ce_loss_3": 3.7877281069755555, "ce_loss_6": 3.389252471923828, "epoch": 0.826, "grad_norm": 1176.0, "kl_loss_12": 328.5639221191406, "kl_loss_17": 83.89287757873535, "kl_loss_3": 1886.0435302734375, "kl_loss_6": 1071.7314575195312, "learning_rate": 7.430289649152156e-05, "loss": 863.2703, "step": 8260 }, { "ce_loss_12": 2.9939164757728576, "ce_loss_17": 2.8743756294250487, "ce_loss_23": 2.8279815316200256, "ce_loss_3": 3.760581076145172, "ce_loss_6": 3.338099813461304, "epoch": 0.827, "grad_norm": 1216.0, "kl_loss_12": 348.948991394043, "kl_loss_17": 87.2121368408203, "kl_loss_3": 2031.262713623047, "kl_loss_6": 1162.1164459228517, "learning_rate": 7.347279472290646e-05, "loss": 875.545, "step": 8270 }, { "ce_loss_12": 3.126525890827179, "ce_loss_17": 3.014357566833496, "ce_loss_23": 2.966294693946838, "ce_loss_3": 3.8614994049072267, "ce_loss_6": 3.4501480221748353, "epoch": 0.828, "grad_norm": 1208.0, "kl_loss_12": 340.5355484008789, "kl_loss_17": 87.52486991882324, "kl_loss_3": 1956.3447204589843, "kl_loss_6": 1108.7800689697265, "learning_rate": 7.264698807851328e-05, "loss": 876.5871, "step": 8280 }, { "ce_loss_12": 3.0982959866523743, "ce_loss_17": 2.9892932772636414, "ce_loss_23": 2.9442530274391174, "ce_loss_3": 3.8029705762863157, "ce_loss_6": 3.4126698970794678, "epoch": 0.829, "grad_norm": 908.0, "kl_loss_12": 334.2535629272461, "kl_loss_17": 85.19641418457032, "kl_loss_3": 1887.4463623046875, "kl_loss_6": 1079.3946502685546, "learning_rate": 7.182548487420554e-05, "loss": 855.0638, "step": 8290 }, { "ce_loss_12": 3.1432425498962404, "ce_loss_17": 3.0315305590629578, "ce_loss_23": 2.9838999629020693, "ce_loss_3": 3.854983127117157, "ce_loss_6": 3.4639780282974244, "epoch": 0.83, "grad_norm": 1064.0, "kl_loss_12": 340.45317993164065, "kl_loss_17": 88.31193199157715, "kl_loss_3": 1916.8724243164063, "kl_loss_6": 1103.293978881836, "learning_rate": 7.100829338251146e-05, "loss": 858.6983, "step": 8300 }, { "ce_loss_12": 3.0850770592689516, "ce_loss_17": 2.9665639281272886, "ce_loss_23": 2.9138702034950255, "ce_loss_3": 3.8345587968826296, "ce_loss_6": 3.4193907499313356, "epoch": 0.831, "grad_norm": 1344.0, "kl_loss_12": 347.6676956176758, "kl_loss_17": 89.90985565185547, "kl_loss_3": 1973.8903259277345, "kl_loss_6": 1123.459619140625, "learning_rate": 7.019542183254046e-05, "loss": 864.3824, "step": 8310 }, { "ce_loss_12": 3.1197735190391542, "ce_loss_17": 3.005459153652191, "ce_loss_23": 2.953192150592804, "ce_loss_3": 3.8333282589912416, "ce_loss_6": 3.435945522785187, "epoch": 0.832, "grad_norm": 1080.0, "kl_loss_12": 348.20618896484376, "kl_loss_17": 93.78602905273438, "kl_loss_3": 1924.5072937011719, "kl_loss_6": 1107.3147766113282, "learning_rate": 6.938687840989971e-05, "loss": 864.4219, "step": 8320 }, { "ce_loss_12": 3.06420384645462, "ce_loss_17": 2.948847937583923, "ce_loss_23": 2.901505208015442, "ce_loss_3": 3.7880621790885924, "ce_loss_6": 3.3873242259025576, "epoch": 0.833, "grad_norm": 1080.0, "kl_loss_12": 345.3905944824219, "kl_loss_17": 89.02677841186524, "kl_loss_3": 1930.5537353515624, "kl_loss_6": 1102.0173278808593, "learning_rate": 6.858267125661271e-05, "loss": 875.7271, "step": 8330 }, { "ce_loss_12": 3.117763566970825, "ce_loss_17": 3.0019028782844543, "ce_loss_23": 2.955692636966705, "ce_loss_3": 3.8445546865463256, "ce_loss_6": 3.442584776878357, "epoch": 0.834, "grad_norm": 1080.0, "kl_loss_12": 338.07284088134764, "kl_loss_17": 86.10532608032227, "kl_loss_3": 1933.6854675292968, "kl_loss_6": 1102.285546875, "learning_rate": 6.778280847103668e-05, "loss": 881.1859, "step": 8340 }, { "ce_loss_12": 3.1274876832962035, "ce_loss_17": 3.0086837649345397, "ce_loss_23": 2.9614779591560363, "ce_loss_3": 3.8465988516807554, "ce_loss_6": 3.445386898517609, "epoch": 0.835, "grad_norm": 896.0, "kl_loss_12": 348.3533416748047, "kl_loss_17": 89.17362174987792, "kl_loss_3": 1939.704248046875, "kl_loss_6": 1107.0311492919923, "learning_rate": 6.698729810778065e-05, "loss": 860.6041, "step": 8350 }, { "ce_loss_12": 3.040190947055817, "ce_loss_17": 2.92340030670166, "ce_loss_23": 2.877941405773163, "ce_loss_3": 3.7735044956207275, "ce_loss_6": 3.3638616800308228, "epoch": 0.836, "grad_norm": 1176.0, "kl_loss_12": 334.4034454345703, "kl_loss_17": 85.51121520996094, "kl_loss_3": 1925.1980590820312, "kl_loss_6": 1095.8139862060548, "learning_rate": 6.619614817762538e-05, "loss": 867.955, "step": 8360 }, { "ce_loss_12": 3.0219133734703063, "ce_loss_17": 2.900725555419922, "ce_loss_23": 2.8514190554618835, "ce_loss_3": 3.804880404472351, "ce_loss_6": 3.36659722328186, "epoch": 0.837, "grad_norm": 996.0, "kl_loss_12": 348.86095428466797, "kl_loss_17": 86.93252220153809, "kl_loss_3": 2054.0288818359377, "kl_loss_6": 1159.649105834961, "learning_rate": 6.540936664744196e-05, "loss": 893.201, "step": 8370 }, { "ce_loss_12": 3.1456708312034607, "ce_loss_17": 3.0280561089515685, "ce_loss_23": 2.9808369278907776, "ce_loss_3": 3.8817422866821287, "ce_loss_6": 3.4683672308921816, "epoch": 0.838, "grad_norm": 892.0, "kl_loss_12": 344.3450592041016, "kl_loss_17": 87.86822891235352, "kl_loss_3": 1957.4243774414062, "kl_loss_6": 1115.648455810547, "learning_rate": 6.462696144011149e-05, "loss": 858.9438, "step": 8380 }, { "ce_loss_12": 3.107890796661377, "ce_loss_17": 2.9926645755767822, "ce_loss_23": 2.9458924531936646, "ce_loss_3": 3.8240071654319765, "ce_loss_6": 3.4266958951950075, "epoch": 0.839, "grad_norm": 1072.0, "kl_loss_12": 346.38677673339845, "kl_loss_17": 88.94830360412598, "kl_loss_3": 1925.6028625488282, "kl_loss_6": 1109.2246826171875, "learning_rate": 6.384894043444567e-05, "loss": 856.969, "step": 8390 }, { "ce_loss_12": 3.1257767200469972, "ce_loss_17": 3.009605872631073, "ce_loss_23": 2.9617841124534605, "ce_loss_3": 3.8602431058883666, "ce_loss_6": 3.455742073059082, "epoch": 0.84, "grad_norm": 1096.0, "kl_loss_12": 343.82138977050784, "kl_loss_17": 88.10531845092774, "kl_loss_3": 1947.715557861328, "kl_loss_6": 1115.6705200195313, "learning_rate": 6.307531146510753e-05, "loss": 867.9402, "step": 8400 }, { "ce_loss_12": 3.106555771827698, "ce_loss_17": 2.9946197390556337, "ce_loss_23": 2.945484471321106, "ce_loss_3": 3.8135352969169616, "ce_loss_6": 3.4177019596099854, "epoch": 0.841, "grad_norm": 1144.0, "kl_loss_12": 339.7760940551758, "kl_loss_17": 87.89406509399414, "kl_loss_3": 1883.2806030273437, "kl_loss_6": 1087.097219848633, "learning_rate": 6.230608232253226e-05, "loss": 845.5771, "step": 8410 }, { "ce_loss_12": 3.071239709854126, "ce_loss_17": 2.951303815841675, "ce_loss_23": 2.901365566253662, "ce_loss_3": 3.838588201999664, "ce_loss_6": 3.4165117025375364, "epoch": 0.842, "grad_norm": 1168.0, "kl_loss_12": 347.6035659790039, "kl_loss_17": 88.24365577697753, "kl_loss_3": 2011.380517578125, "kl_loss_6": 1143.0382446289063, "learning_rate": 6.154126075284855e-05, "loss": 871.0586, "step": 8420 }, { "ce_loss_12": 3.1536173582077027, "ce_loss_17": 3.040555489063263, "ce_loss_23": 2.9942591905593874, "ce_loss_3": 3.85747230052948, "ce_loss_6": 3.466206908226013, "epoch": 0.843, "grad_norm": 948.0, "kl_loss_12": 331.64387664794924, "kl_loss_17": 85.18689041137695, "kl_loss_3": 1886.2900390625, "kl_loss_6": 1079.935122680664, "learning_rate": 6.078085445780129e-05, "loss": 841.4134, "step": 8430 }, { "ce_loss_12": 3.157128167152405, "ce_loss_17": 3.0442301392555238, "ce_loss_23": 2.998222291469574, "ce_loss_3": 3.8884846687316896, "ce_loss_6": 3.4774367213249207, "epoch": 0.844, "grad_norm": 1112.0, "kl_loss_12": 338.92323608398436, "kl_loss_17": 86.8118854522705, "kl_loss_3": 1954.67890625, "kl_loss_6": 1101.4988372802734, "learning_rate": 6.002487109467347e-05, "loss": 854.1086, "step": 8440 }, { "ce_loss_12": 3.163812208175659, "ce_loss_17": 3.0485057711601256, "ce_loss_23": 3.0002379298210142, "ce_loss_3": 3.879017102718353, "ce_loss_6": 3.482664239406586, "epoch": 0.845, "grad_norm": 976.0, "kl_loss_12": 347.79635467529295, "kl_loss_17": 89.86136512756347, "kl_loss_3": 1939.941583251953, "kl_loss_6": 1113.0343994140626, "learning_rate": 5.927331827620902e-05, "loss": 858.4156, "step": 8450 }, { "ce_loss_12": 3.149532949924469, "ce_loss_17": 3.0329127073287965, "ce_loss_23": 2.9848304629325866, "ce_loss_3": 3.836488330364227, "ce_loss_6": 3.4556496262550356, "epoch": 0.846, "grad_norm": 984.0, "kl_loss_12": 333.4063385009766, "kl_loss_17": 85.9187614440918, "kl_loss_3": 1856.5706665039063, "kl_loss_6": 1064.6796936035157, "learning_rate": 5.852620357053651e-05, "loss": 851.8359, "step": 8460 }, { "ce_loss_12": 3.1828793048858643, "ce_loss_17": 3.0734182953834535, "ce_loss_23": 3.0279962182044984, "ce_loss_3": 3.8866792559623717, "ce_loss_6": 3.491861045360565, "epoch": 0.847, "grad_norm": 1136.0, "kl_loss_12": 334.7801971435547, "kl_loss_17": 85.28386154174805, "kl_loss_3": 1885.297296142578, "kl_loss_6": 1077.3123596191406, "learning_rate": 5.778353450109286e-05, "loss": 852.2954, "step": 8470 }, { "ce_loss_12": 3.213039755821228, "ce_loss_17": 3.0986974835395813, "ce_loss_23": 3.0505094647407534, "ce_loss_3": 3.9459834694862366, "ce_loss_6": 3.535405671596527, "epoch": 0.848, "grad_norm": 1544.0, "kl_loss_12": 343.61567840576174, "kl_loss_17": 88.51277122497558, "kl_loss_3": 1930.9443481445312, "kl_loss_6": 1099.9347137451173, "learning_rate": 5.7045318546547206e-05, "loss": 858.4686, "step": 8480 }, { "ce_loss_12": 3.1145872831344605, "ce_loss_17": 3.0008885622024537, "ce_loss_23": 2.952608811855316, "ce_loss_3": 3.8506309747695924, "ce_loss_6": 3.4415759801864625, "epoch": 0.849, "grad_norm": 1224.0, "kl_loss_12": 338.5645751953125, "kl_loss_17": 87.2589542388916, "kl_loss_3": 1946.0604858398438, "kl_loss_6": 1108.356982421875, "learning_rate": 5.631156314072605e-05, "loss": 860.1698, "step": 8490 }, { "ce_loss_12": 3.143726873397827, "ce_loss_17": 3.026803493499756, "ce_loss_23": 2.979511725902557, "ce_loss_3": 3.8262677907943727, "ce_loss_6": 3.4486370205879213, "epoch": 0.85, "grad_norm": 1008.0, "kl_loss_12": 337.11222686767576, "kl_loss_17": 87.5474349975586, "kl_loss_3": 1863.6341430664063, "kl_loss_6": 1077.056936645508, "learning_rate": 5.5582275672538315e-05, "loss": 845.0238, "step": 8500 }, { "ce_loss_12": 3.062679886817932, "ce_loss_17": 2.9370684027671814, "ce_loss_23": 2.889982986450195, "ce_loss_3": 3.8353065729141234, "ce_loss_6": 3.4114139795303347, "epoch": 0.851, "grad_norm": 920.0, "kl_loss_12": 352.9176330566406, "kl_loss_17": 89.93073768615723, "kl_loss_3": 2035.2821105957032, "kl_loss_6": 1160.1542114257813, "learning_rate": 5.4857463485900484e-05, "loss": 884.4208, "step": 8510 }, { "ce_loss_12": 3.120178520679474, "ce_loss_17": 3.0025755405426025, "ce_loss_23": 2.9565329909324647, "ce_loss_3": 3.817841875553131, "ce_loss_6": 3.431487536430359, "epoch": 0.852, "grad_norm": 992.0, "kl_loss_12": 336.0100524902344, "kl_loss_17": 86.01943740844726, "kl_loss_3": 1894.4754455566406, "kl_loss_6": 1090.1786346435547, "learning_rate": 5.413713387966329e-05, "loss": 850.633, "step": 8520 }, { "ce_loss_12": 3.125340521335602, "ce_loss_17": 3.01132732629776, "ce_loss_23": 2.9644988775253296, "ce_loss_3": 3.854294538497925, "ce_loss_6": 3.4520197987556456, "epoch": 0.853, "grad_norm": 1208.0, "kl_loss_12": 339.7386764526367, "kl_loss_17": 87.40383911132812, "kl_loss_3": 1942.8307739257812, "kl_loss_6": 1113.2786926269532, "learning_rate": 5.34212941075381e-05, "loss": 864.8773, "step": 8530 }, { "ce_loss_12": 3.128991258144379, "ce_loss_17": 3.0231241464614866, "ce_loss_23": 2.978782618045807, "ce_loss_3": 3.830638825893402, "ce_loss_6": 3.4327388763427735, "epoch": 0.854, "grad_norm": 976.0, "kl_loss_12": 323.4237808227539, "kl_loss_17": 84.3223777770996, "kl_loss_3": 1873.2142211914063, "kl_loss_6": 1054.9510284423827, "learning_rate": 5.270995137802315e-05, "loss": 846.1405, "step": 8540 }, { "ce_loss_12": 3.0760897994041443, "ce_loss_17": 2.964383137226105, "ce_loss_23": 2.918411982059479, "ce_loss_3": 3.7972556352615356, "ce_loss_6": 3.3881014347076417, "epoch": 0.855, "grad_norm": 836.0, "kl_loss_12": 334.1419174194336, "kl_loss_17": 84.58750801086425, "kl_loss_3": 1927.474542236328, "kl_loss_6": 1087.4206146240235, "learning_rate": 5.2003112854332125e-05, "loss": 864.5331, "step": 8550 }, { "ce_loss_12": 3.072225642204285, "ce_loss_17": 2.964117980003357, "ce_loss_23": 2.919295918941498, "ce_loss_3": 3.7820751905441283, "ce_loss_6": 3.3831193804740907, "epoch": 0.856, "grad_norm": 1192.0, "kl_loss_12": 330.09010925292966, "kl_loss_17": 84.22143287658692, "kl_loss_3": 1912.9076110839844, "kl_loss_6": 1084.7628662109375, "learning_rate": 5.130078565432089e-05, "loss": 839.2182, "step": 8560 }, { "ce_loss_12": 3.1370887756347656, "ce_loss_17": 3.0283565163612365, "ce_loss_23": 2.982879114151001, "ce_loss_3": 3.831531059741974, "ce_loss_6": 3.444535326957703, "epoch": 0.857, "grad_norm": 996.0, "kl_loss_12": 329.6750778198242, "kl_loss_17": 83.78762245178223, "kl_loss_3": 1881.2487365722657, "kl_loss_6": 1080.3342071533202, "learning_rate": 5.060297685041659e-05, "loss": 838.5309, "step": 8570 }, { "ce_loss_12": 3.0776275753974915, "ce_loss_17": 2.956244874000549, "ce_loss_23": 2.9072667360305786, "ce_loss_3": 3.807856500148773, "ce_loss_6": 3.3985143542289733, "epoch": 0.858, "grad_norm": 1056.0, "kl_loss_12": 342.1207244873047, "kl_loss_17": 88.82546920776367, "kl_loss_3": 1940.8138427734375, "kl_loss_6": 1105.3937622070312, "learning_rate": 4.99096934695461e-05, "loss": 870.5254, "step": 8580 }, { "ce_loss_12": 3.134142804145813, "ce_loss_17": 3.0192827582359314, "ce_loss_23": 2.9710862755775453, "ce_loss_3": 3.853245532512665, "ce_loss_6": 3.4556553840637205, "epoch": 0.859, "grad_norm": 832.0, "kl_loss_12": 333.8627334594727, "kl_loss_17": 85.99855766296386, "kl_loss_3": 1913.0985778808595, "kl_loss_6": 1089.306918334961, "learning_rate": 4.922094249306558e-05, "loss": 848.9322, "step": 8590 }, { "ce_loss_12": 3.166194033622742, "ce_loss_17": 3.0500768542289736, "ce_loss_23": 3.0017306447029113, "ce_loss_3": 3.883514702320099, "ce_loss_6": 3.481926476955414, "epoch": 0.86, "grad_norm": 1020.0, "kl_loss_12": 343.3954132080078, "kl_loss_17": 88.88944129943847, "kl_loss_3": 1925.6196594238281, "kl_loss_6": 1095.9616302490235, "learning_rate": 4.853673085668947e-05, "loss": 845.7038, "step": 8600 }, { "ce_loss_12": 3.173807406425476, "ce_loss_17": 3.0630053758621214, "ce_loss_23": 3.0165204882621763, "ce_loss_3": 3.893686830997467, "ce_loss_6": 3.4856738567352297, "epoch": 0.861, "grad_norm": 900.0, "kl_loss_12": 332.5966766357422, "kl_loss_17": 85.91055793762207, "kl_loss_3": 1913.8761779785157, "kl_loss_6": 1079.535546875, "learning_rate": 4.78570654504214e-05, "loss": 856.4615, "step": 8610 }, { "ce_loss_12": 3.1255746483802795, "ce_loss_17": 3.014578711986542, "ce_loss_23": 2.9669725060462953, "ce_loss_3": 3.8446498036384584, "ce_loss_6": 3.4471297383308412, "epoch": 0.862, "grad_norm": 1072.0, "kl_loss_12": 335.4307342529297, "kl_loss_17": 85.39923210144043, "kl_loss_3": 1918.1002502441406, "kl_loss_6": 1101.1010650634767, "learning_rate": 4.7181953118484556e-05, "loss": 859.0185, "step": 8620 }, { "ce_loss_12": 3.149523913860321, "ce_loss_17": 3.0387833952903747, "ce_loss_23": 2.993229532241821, "ce_loss_3": 3.855341386795044, "ce_loss_6": 3.460942280292511, "epoch": 0.863, "grad_norm": 944.0, "kl_loss_12": 330.67168884277345, "kl_loss_17": 84.8177474975586, "kl_loss_3": 1874.0156555175781, "kl_loss_6": 1074.759991455078, "learning_rate": 4.651140065925269e-05, "loss": 864.2121, "step": 8630 }, { "ce_loss_12": 3.087773311138153, "ce_loss_17": 2.975247311592102, "ce_loss_23": 2.9271497249603273, "ce_loss_3": 3.8027710556983947, "ce_loss_6": 3.4052886724472047, "epoch": 0.864, "grad_norm": 1136.0, "kl_loss_12": 336.3831130981445, "kl_loss_17": 87.06652030944824, "kl_loss_3": 1922.2005981445313, "kl_loss_6": 1094.44453125, "learning_rate": 4.58454148251814e-05, "loss": 865.4342, "step": 8640 }, { "ce_loss_12": 3.099566066265106, "ce_loss_17": 2.982946813106537, "ce_loss_23": 2.9340903520584107, "ce_loss_3": 3.8434081077575684, "ce_loss_6": 3.429073524475098, "epoch": 0.865, "grad_norm": 988.0, "kl_loss_12": 337.62081756591795, "kl_loss_17": 86.17353515625, "kl_loss_3": 1954.7680358886719, "kl_loss_6": 1113.4993255615234, "learning_rate": 4.518400232274078e-05, "loss": 859.0479, "step": 8650 }, { "ce_loss_12": 3.131185245513916, "ce_loss_17": 3.014595639705658, "ce_loss_23": 2.9649352073669433, "ce_loss_3": 3.838270437717438, "ce_loss_6": 3.4395124316215515, "epoch": 0.866, "grad_norm": 960.0, "kl_loss_12": 338.11429901123046, "kl_loss_17": 87.64392280578613, "kl_loss_3": 1888.8003845214844, "kl_loss_6": 1081.4321197509767, "learning_rate": 4.452716981234745e-05, "loss": 832.7505, "step": 8660 }, { "ce_loss_12": 3.094347560405731, "ce_loss_17": 2.985401213169098, "ce_loss_23": 2.9405669450759886, "ce_loss_3": 3.801581919193268, "ce_loss_6": 3.4098233222961425, "epoch": 0.867, "grad_norm": 856.0, "kl_loss_12": 329.37064361572266, "kl_loss_17": 84.24899635314941, "kl_loss_3": 1886.796435546875, "kl_loss_6": 1086.4415130615234, "learning_rate": 4.3874923908297335e-05, "loss": 836.1057, "step": 8670 }, { "ce_loss_12": 3.149175775051117, "ce_loss_17": 3.0373104572296143, "ce_loss_23": 2.988961696624756, "ce_loss_3": 3.8777933359146117, "ce_loss_6": 3.474110317230225, "epoch": 0.868, "grad_norm": 884.0, "kl_loss_12": 339.9120407104492, "kl_loss_17": 87.37800178527831, "kl_loss_3": 1943.5669311523438, "kl_loss_6": 1111.9998474121094, "learning_rate": 4.322727117869951e-05, "loss": 854.1123, "step": 8680 }, { "ce_loss_12": 3.159498453140259, "ce_loss_17": 3.0466421127319334, "ce_loss_23": 2.998374545574188, "ce_loss_3": 3.880115878582001, "ce_loss_6": 3.4746498703956603, "epoch": 0.869, "grad_norm": 1448.0, "kl_loss_12": 339.47459411621094, "kl_loss_17": 86.88564872741699, "kl_loss_3": 1935.815850830078, "kl_loss_6": 1101.9241333007812, "learning_rate": 4.2584218145409916e-05, "loss": 854.732, "step": 8690 }, { "ce_loss_12": 3.1883333921432495, "ce_loss_17": 3.0860480070114136, "ce_loss_23": 3.0403281450271606, "ce_loss_3": 3.8731122970581056, "ce_loss_6": 3.490442156791687, "epoch": 0.87, "grad_norm": 944.0, "kl_loss_12": 325.2259918212891, "kl_loss_17": 83.94673805236816, "kl_loss_3": 1845.4943603515626, "kl_loss_6": 1057.3880157470703, "learning_rate": 4.194577128396521e-05, "loss": 833.2588, "step": 8700 }, { "ce_loss_12": 3.083304762840271, "ce_loss_17": 2.973324239253998, "ce_loss_23": 2.927659106254578, "ce_loss_3": 3.8043991804122923, "ce_loss_6": 3.397646701335907, "epoch": 0.871, "grad_norm": 976.0, "kl_loss_12": 327.7694839477539, "kl_loss_17": 84.01599769592285, "kl_loss_3": 1917.2221069335938, "kl_loss_6": 1079.2329345703124, "learning_rate": 4.1311937023518264e-05, "loss": 862.6654, "step": 8710 }, { "ce_loss_12": 3.0983502507209777, "ce_loss_17": 2.993625044822693, "ce_loss_23": 2.9482877254486084, "ce_loss_3": 3.86238557100296, "ce_loss_6": 3.4176156878471375, "epoch": 0.872, "grad_norm": 1248.0, "kl_loss_12": 319.80799255371096, "kl_loss_17": 83.03072204589844, "kl_loss_3": 1995.8182006835937, "kl_loss_6": 1084.0275299072266, "learning_rate": 4.0682721746773344e-05, "loss": 855.5651, "step": 8720 }, { "ce_loss_12": 2.974739396572113, "ce_loss_17": 2.862412619590759, "ce_loss_23": 2.817744278907776, "ce_loss_3": 3.7292062759399416, "ce_loss_6": 3.310219419002533, "epoch": 0.873, "grad_norm": 984.0, "kl_loss_12": 333.8487045288086, "kl_loss_17": 84.45549850463867, "kl_loss_3": 1967.0060729980469, "kl_loss_6": 1117.6822021484375, "learning_rate": 4.0058131789920904e-05, "loss": 850.5296, "step": 8730 }, { "ce_loss_12": 3.123417413234711, "ce_loss_17": 3.0125887989997864, "ce_loss_23": 2.965476393699646, "ce_loss_3": 3.8315968990325926, "ce_loss_6": 3.4407531023025513, "epoch": 0.874, "grad_norm": 1296.0, "kl_loss_12": 329.74097442626953, "kl_loss_17": 83.98312149047851, "kl_loss_3": 1910.9992614746093, "kl_loss_6": 1092.8551208496094, "learning_rate": 3.9438173442575e-05, "loss": 877.51, "step": 8740 }, { "ce_loss_12": 3.14895213842392, "ce_loss_17": 3.033963847160339, "ce_loss_23": 2.9877347469329836, "ce_loss_3": 3.853372836112976, "ce_loss_6": 3.4575639367103577, "epoch": 0.875, "grad_norm": 1248.0, "kl_loss_12": 331.0361068725586, "kl_loss_17": 84.8048080444336, "kl_loss_3": 1875.0467468261718, "kl_loss_6": 1067.8612365722656, "learning_rate": 3.882285294770937e-05, "loss": 846.115, "step": 8750 }, { "ce_loss_12": 3.103008270263672, "ce_loss_17": 2.992861568927765, "ce_loss_23": 2.947165012359619, "ce_loss_3": 3.8003880858421324, "ce_loss_6": 3.410064435005188, "epoch": 0.876, "grad_norm": 1040.0, "kl_loss_12": 327.96388244628906, "kl_loss_17": 85.37591094970703, "kl_loss_3": 1864.6184143066407, "kl_loss_6": 1062.9603637695313, "learning_rate": 3.821217650159453e-05, "loss": 855.1959, "step": 8760 }, { "ce_loss_12": 3.0010708808898925, "ce_loss_17": 2.883316230773926, "ce_loss_23": 2.8356757760047913, "ce_loss_3": 3.7577064514160154, "ce_loss_6": 3.331834352016449, "epoch": 0.877, "grad_norm": 1184.0, "kl_loss_12": 340.2316040039062, "kl_loss_17": 85.8433349609375, "kl_loss_3": 1977.02939453125, "kl_loss_6": 1125.2041931152344, "learning_rate": 3.760615025373543e-05, "loss": 867.5315, "step": 8770 }, { "ce_loss_12": 3.161791479587555, "ce_loss_17": 3.0443769335746764, "ce_loss_23": 2.9945632100105284, "ce_loss_3": 3.8910972476005554, "ce_loss_6": 3.4862359285354616, "epoch": 0.878, "grad_norm": 1288.0, "kl_loss_12": 343.78154907226565, "kl_loss_17": 89.50541648864746, "kl_loss_3": 1940.6373046875, "kl_loss_6": 1113.4889892578126, "learning_rate": 3.700478030680987e-05, "loss": 873.9607, "step": 8780 }, { "ce_loss_12": 3.1520805835723875, "ce_loss_17": 3.0417134046554564, "ce_loss_23": 2.995950186252594, "ce_loss_3": 3.8786579847335814, "ce_loss_6": 3.4690412402153017, "epoch": 0.879, "grad_norm": 1312.0, "kl_loss_12": 330.60288696289064, "kl_loss_17": 85.94220123291015, "kl_loss_3": 1907.498956298828, "kl_loss_6": 1094.785562133789, "learning_rate": 3.6408072716606344e-05, "loss": 851.4786, "step": 8790 }, { "ce_loss_12": 3.0897973656654356, "ce_loss_17": 2.9741249084472656, "ce_loss_23": 2.927089250087738, "ce_loss_3": 3.836412763595581, "ce_loss_6": 3.4170141220092773, "epoch": 0.88, "grad_norm": 936.0, "kl_loss_12": 338.46410064697267, "kl_loss_17": 86.6291488647461, "kl_loss_3": 1971.1416748046875, "kl_loss_6": 1117.8702606201173, "learning_rate": 3.5816033491963716e-05, "loss": 883.7929, "step": 8800 }, { "ce_loss_12": 2.9557029008865356, "ce_loss_17": 2.840587717294693, "ce_loss_23": 2.7953685760498046, "ce_loss_3": 3.7151981711387636, "ce_loss_6": 3.287673497200012, "epoch": 0.881, "grad_norm": 1264.0, "kl_loss_12": 334.52431030273436, "kl_loss_17": 85.75007247924805, "kl_loss_3": 1989.5989318847655, "kl_loss_6": 1118.5227844238282, "learning_rate": 3.522866859471047e-05, "loss": 864.8224, "step": 8810 }, { "ce_loss_12": 3.173339104652405, "ce_loss_17": 3.067618262767792, "ce_loss_23": 3.021925222873688, "ce_loss_3": 3.8509360671043398, "ce_loss_6": 3.4695744156837462, "epoch": 0.882, "grad_norm": 1256.0, "kl_loss_12": 323.1707489013672, "kl_loss_17": 82.98987579345703, "kl_loss_3": 1828.076483154297, "kl_loss_6": 1041.479315185547, "learning_rate": 3.46459839396045e-05, "loss": 838.6922, "step": 8820 }, { "ce_loss_12": 3.0913305282592773, "ce_loss_17": 2.9775137305259705, "ce_loss_23": 2.9278905749320985, "ce_loss_3": 3.8238929867744447, "ce_loss_6": 3.4145618438720704, "epoch": 0.883, "grad_norm": 980.0, "kl_loss_12": 334.30641021728513, "kl_loss_17": 85.81579704284668, "kl_loss_3": 1927.4597717285155, "kl_loss_6": 1095.1721893310546, "learning_rate": 3.406798539427386e-05, "loss": 874.5561, "step": 8830 }, { "ce_loss_12": 3.1545315861701964, "ce_loss_17": 3.044299101829529, "ce_loss_23": 2.998510789871216, "ce_loss_3": 3.8693243622779847, "ce_loss_6": 3.469870162010193, "epoch": 0.884, "grad_norm": 1208.0, "kl_loss_12": 332.1228424072266, "kl_loss_17": 84.90392875671387, "kl_loss_3": 1923.0780151367187, "kl_loss_6": 1091.9577911376953, "learning_rate": 3.349467877915746e-05, "loss": 859.1238, "step": 8840 }, { "ce_loss_12": 3.1225974440574644, "ce_loss_17": 3.009087359905243, "ce_loss_23": 2.962168884277344, "ce_loss_3": 3.8547245621681214, "ce_loss_6": 3.4457091689109802, "epoch": 0.885, "grad_norm": 876.0, "kl_loss_12": 338.3793884277344, "kl_loss_17": 86.19623184204102, "kl_loss_3": 1967.9500549316406, "kl_loss_6": 1116.0619323730468, "learning_rate": 3.292606986744667e-05, "loss": 886.2995, "step": 8850 }, { "ce_loss_12": 3.0737382054328917, "ce_loss_17": 2.9652695178985597, "ce_loss_23": 2.9208232402801513, "ce_loss_3": 3.804908299446106, "ce_loss_6": 3.3878675580024717, "epoch": 0.886, "grad_norm": 1064.0, "kl_loss_12": 330.1391632080078, "kl_loss_17": 83.49464492797851, "kl_loss_3": 1942.0145629882813, "kl_loss_6": 1098.344384765625, "learning_rate": 3.23621643850267e-05, "loss": 862.1052, "step": 8860 }, { "ce_loss_12": 3.1435423254966737, "ce_loss_17": 3.031849682331085, "ce_loss_23": 2.9862906098365785, "ce_loss_3": 3.855881190299988, "ce_loss_6": 3.462362253665924, "epoch": 0.887, "grad_norm": 1072.0, "kl_loss_12": 340.6441955566406, "kl_loss_17": 87.4849063873291, "kl_loss_3": 1918.8952880859374, "kl_loss_6": 1104.3206878662108, "learning_rate": 3.180296801041971e-05, "loss": 846.5516, "step": 8870 }, { "ce_loss_12": 3.1657923221588136, "ce_loss_17": 3.0589999556541443, "ce_loss_23": 3.0135733246803285, "ce_loss_3": 3.878432369232178, "ce_loss_6": 3.475983202457428, "epoch": 0.888, "grad_norm": 1012.0, "kl_loss_12": 329.00604553222655, "kl_loss_17": 84.90734176635742, "kl_loss_3": 1914.929833984375, "kl_loss_6": 1079.0008392333984, "learning_rate": 3.124848637472688e-05, "loss": 837.532, "step": 8880 }, { "ce_loss_12": 2.998358190059662, "ce_loss_17": 2.8849174857139586, "ce_loss_23": 2.8381486892700196, "ce_loss_3": 3.734885573387146, "ce_loss_6": 3.3218790531158446, "epoch": 0.889, "grad_norm": 1296.0, "kl_loss_12": 326.6248016357422, "kl_loss_17": 83.15086822509765, "kl_loss_3": 1941.88310546875, "kl_loss_6": 1098.8846435546875, "learning_rate": 3.069872506157212e-05, "loss": 853.1146, "step": 8890 }, { "ce_loss_12": 3.093616080284119, "ce_loss_17": 2.982220458984375, "ce_loss_23": 2.9373262763023376, "ce_loss_3": 3.803342950344086, "ce_loss_6": 3.406595754623413, "epoch": 0.89, "grad_norm": 1152.0, "kl_loss_12": 331.7616683959961, "kl_loss_17": 84.35318107604981, "kl_loss_3": 1904.2030456542968, "kl_loss_6": 1078.6033813476563, "learning_rate": 3.0153689607045842e-05, "loss": 844.5693, "step": 8900 }, { "ce_loss_12": 3.0073063373565674, "ce_loss_17": 2.889033830165863, "ce_loss_23": 2.841313922405243, "ce_loss_3": 3.780926537513733, "ce_loss_6": 3.350403296947479, "epoch": 0.891, "grad_norm": 1064.0, "kl_loss_12": 345.90849609375, "kl_loss_17": 87.65661125183105, "kl_loss_3": 2050.542547607422, "kl_loss_6": 1161.115560913086, "learning_rate": 2.9613385499648926e-05, "loss": 870.6569, "step": 8910 }, { "ce_loss_12": 3.0530014753341677, "ce_loss_17": 2.9412503004074098, "ce_loss_23": 2.8958068609237673, "ce_loss_3": 3.7602118015289308, "ce_loss_6": 3.3697606444358827, "epoch": 0.892, "grad_norm": 1112.0, "kl_loss_12": 328.8154098510742, "kl_loss_17": 84.93235816955567, "kl_loss_3": 1880.8406860351563, "kl_loss_6": 1080.6198760986329, "learning_rate": 2.9077818180237692e-05, "loss": 852.0275, "step": 8920 }, { "ce_loss_12": 3.0982262253761292, "ce_loss_17": 2.9827243089675903, "ce_loss_23": 2.9341540217399595, "ce_loss_3": 3.8311649203300475, "ce_loss_6": 3.420704758167267, "epoch": 0.893, "grad_norm": 1400.0, "kl_loss_12": 332.6533721923828, "kl_loss_17": 85.156884765625, "kl_loss_3": 1921.9191345214845, "kl_loss_6": 1094.7849243164062, "learning_rate": 2.8546993041969172e-05, "loss": 856.9773, "step": 8930 }, { "ce_loss_12": 3.1285369038581847, "ce_loss_17": 3.0208025217056274, "ce_loss_23": 2.9757667064666746, "ce_loss_3": 3.8146532654762266, "ce_loss_6": 3.428730511665344, "epoch": 0.894, "grad_norm": 892.0, "kl_loss_12": 324.5311508178711, "kl_loss_17": 83.1994400024414, "kl_loss_3": 1870.1867980957031, "kl_loss_6": 1062.970556640625, "learning_rate": 2.802091543024671e-05, "loss": 847.6863, "step": 8940 }, { "ce_loss_12": 3.1246097564697264, "ce_loss_17": 3.013726496696472, "ce_loss_23": 2.965451240539551, "ce_loss_3": 3.8655041098594665, "ce_loss_6": 3.450291562080383, "epoch": 0.895, "grad_norm": 1168.0, "kl_loss_12": 333.080322265625, "kl_loss_17": 85.56056671142578, "kl_loss_3": 1969.126446533203, "kl_loss_6": 1106.444076538086, "learning_rate": 2.7499590642665774e-05, "loss": 880.2635, "step": 8950 }, { "ce_loss_12": 3.143812417984009, "ce_loss_17": 3.028483211994171, "ce_loss_23": 2.985080027580261, "ce_loss_3": 3.8509507298469545, "ce_loss_6": 3.441773521900177, "epoch": 0.896, "grad_norm": 1224.0, "kl_loss_12": 349.53120574951174, "kl_loss_17": 86.6789764404297, "kl_loss_3": 1895.8159729003905, "kl_loss_6": 1070.7532867431642, "learning_rate": 2.6983023928961405e-05, "loss": 845.1069, "step": 8960 }, { "ce_loss_12": 3.106435704231262, "ce_loss_17": 2.9932427644729613, "ce_loss_23": 2.946383273601532, "ce_loss_3": 3.8312835454940797, "ce_loss_6": 3.4233412861824037, "epoch": 0.897, "grad_norm": 1144.0, "kl_loss_12": 333.72619476318357, "kl_loss_17": 85.79256935119629, "kl_loss_3": 1905.6740844726562, "kl_loss_6": 1080.7652252197265, "learning_rate": 2.6471220490954628e-05, "loss": 860.4385, "step": 8970 }, { "ce_loss_12": 3.0967092156410216, "ce_loss_17": 2.991311955451965, "ce_loss_23": 2.946649193763733, "ce_loss_3": 3.8147808074951173, "ce_loss_6": 3.400782382488251, "epoch": 0.898, "grad_norm": 1088.0, "kl_loss_12": 326.99341583251953, "kl_loss_17": 83.46413383483886, "kl_loss_3": 1898.6377075195312, "kl_loss_6": 1065.3228820800782, "learning_rate": 2.596418548250029e-05, "loss": 852.1803, "step": 8980 }, { "ce_loss_12": 3.1327340722084047, "ce_loss_17": 3.021799325942993, "ce_loss_23": 2.975428318977356, "ce_loss_3": 3.8464512825012207, "ce_loss_6": 3.4454381346702574, "epoch": 0.899, "grad_norm": 972.0, "kl_loss_12": 335.007633972168, "kl_loss_17": 87.43348655700683, "kl_loss_3": 1915.6479431152343, "kl_loss_6": 1085.7200561523437, "learning_rate": 2.5461924009435368e-05, "loss": 844.5648, "step": 8990 }, { "ce_loss_12": 3.129016411304474, "ce_loss_17": 3.016228425502777, "ce_loss_23": 2.971251893043518, "ce_loss_3": 3.837902855873108, "ce_loss_6": 3.4397639632225037, "epoch": 0.9, "grad_norm": 1072.0, "kl_loss_12": 336.40973510742185, "kl_loss_17": 86.76480216979981, "kl_loss_3": 1904.9064086914063, "kl_loss_6": 1085.8543060302734, "learning_rate": 2.4964441129527336e-05, "loss": 869.592, "step": 9000 }, { "ce_loss_12": 3.125925374031067, "ce_loss_17": 3.0190950870513915, "ce_loss_23": 2.9737316727638246, "ce_loss_3": 3.812189018726349, "ce_loss_6": 3.430092179775238, "epoch": 0.901, "grad_norm": 1128.0, "kl_loss_12": 324.31306915283204, "kl_loss_17": 83.48861198425293, "kl_loss_3": 1863.2838806152345, "kl_loss_6": 1058.821725463867, "learning_rate": 2.4471741852423235e-05, "loss": 838.7601, "step": 9010 }, { "ce_loss_12": 3.1805315017700195, "ce_loss_17": 3.064588391780853, "ce_loss_23": 3.018610644340515, "ce_loss_3": 3.8909000754356384, "ce_loss_6": 3.4915917038917543, "epoch": 0.902, "grad_norm": 904.0, "kl_loss_12": 332.6283920288086, "kl_loss_17": 86.11167373657227, "kl_loss_3": 1886.189385986328, "kl_loss_6": 1075.392333984375, "learning_rate": 2.3983831139599287e-05, "loss": 849.1871, "step": 9020 }, { "ce_loss_12": 3.09921315908432, "ce_loss_17": 2.9914977669715883, "ce_loss_23": 2.945024645328522, "ce_loss_3": 3.813475775718689, "ce_loss_6": 3.4056436419487, "epoch": 0.903, "grad_norm": 972.0, "kl_loss_12": 320.38296661376955, "kl_loss_17": 83.96120872497559, "kl_loss_3": 1880.3851745605468, "kl_loss_6": 1054.7029083251953, "learning_rate": 2.3500713904311022e-05, "loss": 828.3093, "step": 9030 }, { "ce_loss_12": 3.129648447036743, "ce_loss_17": 3.0214686155319215, "ce_loss_23": 2.977446722984314, "ce_loss_3": 3.816241002082825, "ce_loss_6": 3.427567720413208, "epoch": 0.904, "grad_norm": 1312.0, "kl_loss_12": 320.1314926147461, "kl_loss_17": 82.71114997863769, "kl_loss_3": 1835.7401123046875, "kl_loss_6": 1046.6744812011718, "learning_rate": 2.3022395011543685e-05, "loss": 827.5029, "step": 9040 }, { "ce_loss_12": 3.1652731776237486, "ce_loss_17": 3.04982990026474, "ce_loss_23": 3.0022929906845093, "ce_loss_3": 3.8810678482055665, "ce_loss_6": 3.483680856227875, "epoch": 0.905, "grad_norm": 1128.0, "kl_loss_12": 341.3849838256836, "kl_loss_17": 87.51144218444824, "kl_loss_3": 1925.983953857422, "kl_loss_6": 1097.1318237304688, "learning_rate": 2.2548879277963063e-05, "loss": 873.3892, "step": 9050 }, { "ce_loss_12": 3.081729865074158, "ce_loss_17": 2.9729421496391297, "ce_loss_23": 2.928011405467987, "ce_loss_3": 3.7899386644363404, "ce_loss_6": 3.3915660858154295, "epoch": 0.906, "grad_norm": 912.0, "kl_loss_12": 327.93106536865236, "kl_loss_17": 84.1146312713623, "kl_loss_3": 1879.2633178710937, "kl_loss_6": 1068.782211303711, "learning_rate": 2.208017147186736e-05, "loss": 826.4768, "step": 9060 }, { "ce_loss_12": 3.0779837131500245, "ce_loss_17": 2.967180871963501, "ce_loss_23": 2.920120894908905, "ce_loss_3": 3.788610649108887, "ce_loss_6": 3.3945882081985475, "epoch": 0.907, "grad_norm": 800.0, "kl_loss_12": 328.81197052001954, "kl_loss_17": 84.20720748901367, "kl_loss_3": 1906.1780151367188, "kl_loss_6": 1087.1554595947266, "learning_rate": 2.1616276313139227e-05, "loss": 847.2346, "step": 9070 }, { "ce_loss_12": 3.1198357820510862, "ce_loss_17": 3.0084909439086913, "ce_loss_23": 2.9599400043487547, "ce_loss_3": 3.832070028781891, "ce_loss_6": 3.4358290672302245, "epoch": 0.908, "grad_norm": 1224.0, "kl_loss_12": 331.44759979248045, "kl_loss_17": 85.02637100219727, "kl_loss_3": 1910.9259521484375, "kl_loss_6": 1087.1692504882812, "learning_rate": 2.1157198473197415e-05, "loss": 859.7826, "step": 9080 }, { "ce_loss_12": 3.170629847049713, "ce_loss_17": 3.056658124923706, "ce_loss_23": 3.0103936195373535, "ce_loss_3": 3.8856051087379457, "ce_loss_6": 3.4874077081680297, "epoch": 0.909, "grad_norm": 980.0, "kl_loss_12": 339.3678176879883, "kl_loss_17": 87.31046714782715, "kl_loss_3": 1910.169189453125, "kl_loss_6": 1094.2420532226563, "learning_rate": 2.0702942574950812e-05, "loss": 854.4599, "step": 9090 }, { "ce_loss_12": 3.1087508678436278, "ce_loss_17": 2.9925315380096436, "ce_loss_23": 2.944721484184265, "ce_loss_3": 3.84104905128479, "ce_loss_6": 3.4278690695762633, "epoch": 0.91, "grad_norm": 940.0, "kl_loss_12": 340.95262756347654, "kl_loss_17": 87.44319190979004, "kl_loss_3": 1935.718896484375, "kl_loss_6": 1098.7049499511718, "learning_rate": 2.025351319275137e-05, "loss": 857.4225, "step": 9100 }, { "ce_loss_12": 3.224490487575531, "ce_loss_17": 3.110004448890686, "ce_loss_23": 3.060851216316223, "ce_loss_3": 3.9367117643356324, "ce_loss_6": 3.543748676776886, "epoch": 0.911, "grad_norm": 1064.0, "kl_loss_12": 344.0806182861328, "kl_loss_17": 87.59589996337891, "kl_loss_3": 1927.4166748046875, "kl_loss_6": 1112.0118194580077, "learning_rate": 1.9808914852347816e-05, "loss": 878.4797, "step": 9110 }, { "ce_loss_12": 3.0762908935546873, "ce_loss_17": 2.9619147539138795, "ce_loss_23": 2.9152398824691774, "ce_loss_3": 3.789369535446167, "ce_loss_6": 3.395467746257782, "epoch": 0.912, "grad_norm": 784.0, "kl_loss_12": 330.6476486206055, "kl_loss_17": 85.57843284606933, "kl_loss_3": 1885.4349487304687, "kl_loss_6": 1078.397769165039, "learning_rate": 1.9369152030840554e-05, "loss": 846.1178, "step": 9120 }, { "ce_loss_12": 3.154605543613434, "ce_loss_17": 3.046464478969574, "ce_loss_23": 3.0007370710372925, "ce_loss_3": 3.867127311229706, "ce_loss_6": 3.4716737866401672, "epoch": 0.913, "grad_norm": 812.0, "kl_loss_12": 333.79958190917966, "kl_loss_17": 85.41837768554687, "kl_loss_3": 1922.9818054199218, "kl_loss_6": 1094.322232055664, "learning_rate": 1.893422915663645e-05, "loss": 857.3375, "step": 9130 }, { "ce_loss_12": 3.0396557331085203, "ce_loss_17": 2.9230321049690247, "ce_loss_23": 2.8751866221427917, "ce_loss_3": 3.7878071665763855, "ce_loss_6": 3.3714339256286623, "epoch": 0.914, "grad_norm": 1032.0, "kl_loss_12": 337.8765075683594, "kl_loss_17": 86.10670356750488, "kl_loss_3": 1972.9100341796875, "kl_loss_6": 1125.4221252441407, "learning_rate": 1.850415060940386e-05, "loss": 871.0162, "step": 9140 }, { "ce_loss_12": 3.1497557282447817, "ce_loss_17": 3.0385369658470154, "ce_loss_23": 2.9938205003738405, "ce_loss_3": 3.8452399015426635, "ce_loss_6": 3.45322071313858, "epoch": 0.915, "grad_norm": 1008.0, "kl_loss_12": 332.32138671875, "kl_loss_17": 84.9898525238037, "kl_loss_3": 1874.5397216796875, "kl_loss_6": 1070.864324951172, "learning_rate": 1.8078920720028978e-05, "loss": 848.5287, "step": 9150 }, { "ce_loss_12": 3.076209282875061, "ce_loss_17": 2.9700810432434084, "ce_loss_23": 2.926768720149994, "ce_loss_3": 3.778885042667389, "ce_loss_6": 3.385262680053711, "epoch": 0.916, "grad_norm": 1056.0, "kl_loss_12": 321.25645141601564, "kl_loss_17": 82.72433166503906, "kl_loss_3": 1865.1188659667969, "kl_loss_6": 1059.3348358154296, "learning_rate": 1.765854377057219e-05, "loss": 851.4663, "step": 9160 }, { "ce_loss_12": 3.051760160923004, "ce_loss_17": 2.9470738410949706, "ce_loss_23": 2.9018150329589845, "ce_loss_3": 3.762616181373596, "ce_loss_6": 3.360524618625641, "epoch": 0.917, "grad_norm": 1152.0, "kl_loss_12": 322.6339859008789, "kl_loss_17": 82.14870758056641, "kl_loss_3": 1894.1626953125, "kl_loss_6": 1062.8217315673828, "learning_rate": 1.724302399422456e-05, "loss": 849.0074, "step": 9170 }, { "ce_loss_12": 3.025525665283203, "ce_loss_17": 2.9144426941871644, "ce_loss_23": 2.8663503766059875, "ce_loss_3": 3.747741138935089, "ce_loss_6": 3.3477275729179383, "epoch": 0.918, "grad_norm": 1104.0, "kl_loss_12": 337.24256134033203, "kl_loss_17": 87.08244514465332, "kl_loss_3": 1926.6942932128907, "kl_loss_6": 1100.351480102539, "learning_rate": 1.683236557526574e-05, "loss": 858.5575, "step": 9180 }, { "ce_loss_12": 3.1238354563713076, "ce_loss_17": 3.018564236164093, "ce_loss_23": 2.9754155397415163, "ce_loss_3": 3.81089985370636, "ce_loss_6": 3.4238336086273193, "epoch": 0.919, "grad_norm": 832.0, "kl_loss_12": 317.4216339111328, "kl_loss_17": 82.28768424987793, "kl_loss_3": 1827.3025207519531, "kl_loss_6": 1032.5061828613282, "learning_rate": 1.6426572649021475e-05, "loss": 837.4054, "step": 9190 }, { "ce_loss_12": 3.159148633480072, "ce_loss_17": 3.0522322416305543, "ce_loss_23": 3.006185734272003, "ce_loss_3": 3.827952432632446, "ce_loss_6": 3.44946551322937, "epoch": 0.92, "grad_norm": 1440.0, "kl_loss_12": 324.6236053466797, "kl_loss_17": 85.08680648803711, "kl_loss_3": 1835.1791931152343, "kl_loss_6": 1047.3831878662108, "learning_rate": 1.6025649301821876e-05, "loss": 837.1028, "step": 9200 }, { "ce_loss_12": 3.1503032088279723, "ce_loss_17": 3.0398178339004516, "ce_loss_23": 2.9948899507522584, "ce_loss_3": 3.8383944511413572, "ce_loss_6": 3.4516194224357606, "epoch": 0.921, "grad_norm": 1096.0, "kl_loss_12": 331.9905776977539, "kl_loss_17": 85.62064247131347, "kl_loss_3": 1859.518487548828, "kl_loss_6": 1065.316110229492, "learning_rate": 1.5629599570960716e-05, "loss": 836.26, "step": 9210 }, { "ce_loss_12": 3.064479887485504, "ce_loss_17": 2.9572617769241334, "ce_loss_23": 2.9113405346870422, "ce_loss_3": 3.78310467004776, "ce_loss_6": 3.3747425436973573, "epoch": 0.922, "grad_norm": 976.0, "kl_loss_12": 329.91296234130857, "kl_loss_17": 84.50942268371583, "kl_loss_3": 1919.332208251953, "kl_loss_6": 1082.223892211914, "learning_rate": 1.5238427444654367e-05, "loss": 846.9709, "step": 9220 }, { "ce_loss_12": 3.1175356030464174, "ce_loss_17": 3.004572319984436, "ce_loss_23": 2.960529851913452, "ce_loss_3": 3.8180633306503298, "ce_loss_6": 3.421934986114502, "epoch": 0.923, "grad_norm": 940.0, "kl_loss_12": 326.40318756103517, "kl_loss_17": 84.79353713989258, "kl_loss_3": 1871.1626403808593, "kl_loss_6": 1054.7036376953124, "learning_rate": 1.4852136862001764e-05, "loss": 840.9615, "step": 9230 }, { "ce_loss_12": 3.087792432308197, "ce_loss_17": 2.9745679259300233, "ce_loss_23": 2.929926407337189, "ce_loss_3": 3.7816343307495117, "ce_loss_6": 3.394350898265839, "epoch": 0.924, "grad_norm": 884.0, "kl_loss_12": 323.3223403930664, "kl_loss_17": 82.07476539611817, "kl_loss_3": 1860.5654235839843, "kl_loss_6": 1062.8414367675782, "learning_rate": 1.4470731712944884e-05, "loss": 846.7811, "step": 9240 }, { "ce_loss_12": 3.1074899077415465, "ce_loss_17": 2.995665502548218, "ce_loss_23": 2.9461994886398317, "ce_loss_3": 3.8176616072654723, "ce_loss_6": 3.4217876195907593, "epoch": 0.925, "grad_norm": 948.0, "kl_loss_12": 333.7071365356445, "kl_loss_17": 85.56305122375488, "kl_loss_3": 1878.6777465820312, "kl_loss_6": 1069.3385070800782, "learning_rate": 1.4094215838229174e-05, "loss": 861.8266, "step": 9250 }, { "ce_loss_12": 3.085869753360748, "ce_loss_17": 2.9727274417877196, "ce_loss_23": 2.925786566734314, "ce_loss_3": 3.807579004764557, "ce_loss_6": 3.399830675125122, "epoch": 0.926, "grad_norm": 1136.0, "kl_loss_12": 335.0611145019531, "kl_loss_17": 85.76180076599121, "kl_loss_3": 1930.875244140625, "kl_loss_6": 1097.8640533447265, "learning_rate": 1.372259302936546e-05, "loss": 887.4711, "step": 9260 }, { "ce_loss_12": 3.185081458091736, "ce_loss_17": 3.069750726222992, "ce_loss_23": 3.0205004453659057, "ce_loss_3": 3.8892650842666625, "ce_loss_6": 3.492465448379517, "epoch": 0.927, "grad_norm": 792.0, "kl_loss_12": 341.22171783447266, "kl_loss_17": 89.73458824157714, "kl_loss_3": 1897.4500854492187, "kl_loss_6": 1078.6844818115235, "learning_rate": 1.3355867028591206e-05, "loss": 842.859, "step": 9270 }, { "ce_loss_12": 3.0859570026397707, "ce_loss_17": 2.977510154247284, "ce_loss_23": 2.934697890281677, "ce_loss_3": 3.7636232137680055, "ce_loss_6": 3.385506844520569, "epoch": 0.928, "grad_norm": 1400.0, "kl_loss_12": 324.6181671142578, "kl_loss_17": 83.21072921752929, "kl_loss_3": 1841.8223083496093, "kl_loss_6": 1056.7283020019531, "learning_rate": 1.2994041528833267e-05, "loss": 835.5336, "step": 9280 }, { "ce_loss_12": 3.0869906425476072, "ce_loss_17": 2.977507305145264, "ce_loss_23": 2.931162786483765, "ce_loss_3": 3.7968642473220826, "ce_loss_6": 3.3973227381706237, "epoch": 0.929, "grad_norm": 1504.0, "kl_loss_12": 326.4042282104492, "kl_loss_17": 82.84918251037598, "kl_loss_3": 1895.9616149902345, "kl_loss_6": 1075.722134399414, "learning_rate": 1.2637120173670358e-05, "loss": 839.5449, "step": 9290 }, { "ce_loss_12": 3.1100995540618896, "ce_loss_17": 2.9969801187515257, "ce_loss_23": 2.9494405508041384, "ce_loss_3": 3.835551941394806, "ce_loss_6": 3.431933581829071, "epoch": 0.93, "grad_norm": 1176.0, "kl_loss_12": 332.53186492919923, "kl_loss_17": 85.74410667419434, "kl_loss_3": 1908.8519104003906, "kl_loss_6": 1083.8726348876953, "learning_rate": 1.2285106557296478e-05, "loss": 844.5967, "step": 9300 }, { "ce_loss_12": 3.0027957916259767, "ce_loss_17": 2.8903992056846617, "ce_loss_23": 2.8444465160369874, "ce_loss_3": 3.7717514753341677, "ce_loss_6": 3.340221178531647, "epoch": 0.931, "grad_norm": 1008.0, "kl_loss_12": 334.00975036621094, "kl_loss_17": 84.80331535339356, "kl_loss_3": 2001.4889343261718, "kl_loss_6": 1126.0714202880858, "learning_rate": 1.1938004224484989e-05, "loss": 866.8414, "step": 9310 }, { "ce_loss_12": 3.2222105979919435, "ce_loss_17": 3.111147093772888, "ce_loss_23": 3.063632643222809, "ce_loss_3": 3.917266309261322, "ce_loss_6": 3.5249305725097657, "epoch": 0.932, "grad_norm": 1012.0, "kl_loss_12": 334.32122497558595, "kl_loss_17": 86.87853698730468, "kl_loss_3": 1899.5992370605468, "kl_loss_6": 1080.962240600586, "learning_rate": 1.1595816670552429e-05, "loss": 865.2732, "step": 9320 }, { "ce_loss_12": 3.1430585265159605, "ce_loss_17": 3.0352956652641296, "ce_loss_23": 2.9885062336921693, "ce_loss_3": 3.8439278960227967, "ce_loss_6": 3.447865056991577, "epoch": 0.933, "grad_norm": 980.0, "kl_loss_12": 327.3254165649414, "kl_loss_17": 86.49773216247559, "kl_loss_3": 1866.330029296875, "kl_loss_6": 1056.9034118652344, "learning_rate": 1.1258547341323699e-05, "loss": 834.5617, "step": 9330 }, { "ce_loss_12": 3.176737833023071, "ce_loss_17": 3.0673033356666566, "ce_loss_23": 3.018969011306763, "ce_loss_3": 3.8674515962600706, "ce_loss_6": 3.4846546173095705, "epoch": 0.934, "grad_norm": 884.0, "kl_loss_12": 334.75340576171874, "kl_loss_17": 85.98496704101562, "kl_loss_3": 1884.177197265625, "kl_loss_6": 1081.2376861572266, "learning_rate": 1.0926199633097156e-05, "loss": 845.4896, "step": 9340 }, { "ce_loss_12": 3.179938495159149, "ce_loss_17": 3.074588692188263, "ce_loss_23": 3.0303378343582152, "ce_loss_3": 3.8442990660667418, "ce_loss_6": 3.471651017665863, "epoch": 0.935, "grad_norm": 956.0, "kl_loss_12": 319.90943603515626, "kl_loss_17": 83.01764106750488, "kl_loss_3": 1814.667059326172, "kl_loss_6": 1041.4213287353516, "learning_rate": 1.0598776892610684e-05, "loss": 848.2934, "step": 9350 }, { "ce_loss_12": 3.009808611869812, "ce_loss_17": 2.898372745513916, "ce_loss_23": 2.854316532611847, "ce_loss_3": 3.726544368267059, "ce_loss_6": 3.32262647151947, "epoch": 0.936, "grad_norm": 916.0, "kl_loss_12": 326.60680999755857, "kl_loss_17": 83.0755630493164, "kl_loss_3": 1924.3721801757813, "kl_loss_6": 1085.0824829101562, "learning_rate": 1.0276282417007399e-05, "loss": 842.6063, "step": 9360 }, { "ce_loss_12": 3.1516371846199034, "ce_loss_17": 3.0410276889801025, "ce_loss_23": 2.996881353855133, "ce_loss_3": 3.834710657596588, "ce_loss_6": 3.450020658969879, "epoch": 0.937, "grad_norm": 1168.0, "kl_loss_12": 323.99993896484375, "kl_loss_17": 82.53963088989258, "kl_loss_3": 1854.7433349609375, "kl_loss_6": 1059.0154510498046, "learning_rate": 9.958719453803277e-06, "loss": 840.2551, "step": 9370 }, { "ce_loss_12": 3.1496444582939147, "ce_loss_17": 3.0366398692131042, "ce_loss_23": 2.989711654186249, "ce_loss_3": 3.8697296500205995, "ce_loss_6": 3.471941578388214, "epoch": 0.938, "grad_norm": 952.0, "kl_loss_12": 331.8085479736328, "kl_loss_17": 85.06403846740723, "kl_loss_3": 1910.690576171875, "kl_loss_6": 1096.4352661132812, "learning_rate": 9.646091200853802e-06, "loss": 847.9618, "step": 9380 }, { "ce_loss_12": 3.1037580490112306, "ce_loss_17": 2.995933508872986, "ce_loss_23": 2.9508240699768065, "ce_loss_3": 3.8010664582252502, "ce_loss_6": 3.4100754141807554, "epoch": 0.939, "grad_norm": 1088.0, "kl_loss_12": 324.2424377441406, "kl_loss_17": 83.03877677917481, "kl_loss_3": 1846.7161865234375, "kl_loss_6": 1062.0440032958984, "learning_rate": 9.338400806321978e-06, "loss": 819.034, "step": 9390 }, { "ce_loss_12": 3.1476006865501405, "ce_loss_17": 3.033066248893738, "ce_loss_23": 2.9848249673843386, "ce_loss_3": 3.8408823251724242, "ce_loss_6": 3.4523388862609865, "epoch": 0.94, "grad_norm": 908.0, "kl_loss_12": 334.59171295166016, "kl_loss_17": 86.80456924438477, "kl_loss_3": 1869.5985900878907, "kl_loss_6": 1072.3442169189452, "learning_rate": 9.035651368646646e-06, "loss": 836.1096, "step": 9400 }, { "ce_loss_12": 3.145501697063446, "ce_loss_17": 3.03811719417572, "ce_loss_23": 2.994671952724457, "ce_loss_3": 3.834343147277832, "ce_loss_6": 3.452866768836975, "epoch": 0.941, "grad_norm": 1072.0, "kl_loss_12": 323.7934066772461, "kl_loss_17": 83.24091529846191, "kl_loss_3": 1857.227685546875, "kl_loss_6": 1066.3185974121093, "learning_rate": 8.737845936511335e-06, "loss": 844.8186, "step": 9410 }, { "ce_loss_12": 3.098772418498993, "ce_loss_17": 2.9861217737197876, "ce_loss_23": 2.9370655179023744, "ce_loss_3": 3.816584086418152, "ce_loss_6": 3.4150121927261354, "epoch": 0.942, "grad_norm": 884.0, "kl_loss_12": 331.8512329101562, "kl_loss_17": 85.79718551635742, "kl_loss_3": 1910.79208984375, "kl_loss_6": 1083.8923706054688, "learning_rate": 8.444987508813451e-06, "loss": 842.9543, "step": 9420 }, { "ce_loss_12": 3.06053763628006, "ce_loss_17": 2.9460514068603514, "ce_loss_23": 2.9014501929283143, "ce_loss_3": 3.802388143539429, "ce_loss_6": 3.39177383184433, "epoch": 0.943, "grad_norm": 1256.0, "kl_loss_12": 337.7682632446289, "kl_loss_17": 85.87634010314942, "kl_loss_3": 1981.4451171875, "kl_loss_6": 1125.2985046386718, "learning_rate": 8.157079034633974e-06, "loss": 865.9455, "step": 9430 }, { "ce_loss_12": 3.049058198928833, "ce_loss_17": 2.938900911808014, "ce_loss_23": 2.8931046962738036, "ce_loss_3": 3.7696901321411134, "ce_loss_6": 3.3683017015457155, "epoch": 0.944, "grad_norm": 1248.0, "kl_loss_12": 329.84632568359376, "kl_loss_17": 83.97733726501465, "kl_loss_3": 1931.3895690917968, "kl_loss_6": 1095.4517517089844, "learning_rate": 7.874123413208145e-06, "loss": 849.1218, "step": 9440 }, { "ce_loss_12": 3.0315654158592222, "ce_loss_17": 2.9158872246742247, "ce_loss_23": 2.8700754761695864, "ce_loss_3": 3.767153263092041, "ce_loss_6": 3.3529996633529664, "epoch": 0.945, "grad_norm": 1056.0, "kl_loss_12": 328.3330795288086, "kl_loss_17": 83.33571510314941, "kl_loss_3": 1934.7627197265624, "kl_loss_6": 1086.8557250976562, "learning_rate": 7.59612349389599e-06, "loss": 859.2561, "step": 9450 }, { "ce_loss_12": 3.108149600028992, "ce_loss_17": 3.000080704689026, "ce_loss_23": 2.956249988079071, "ce_loss_3": 3.7995969891548156, "ce_loss_6": 3.4120611190795898, "epoch": 0.946, "grad_norm": 960.0, "kl_loss_12": 320.4307144165039, "kl_loss_17": 82.51799201965332, "kl_loss_3": 1834.1530395507812, "kl_loss_6": 1044.237774658203, "learning_rate": 7.323082076153509e-06, "loss": 834.5738, "step": 9460 }, { "ce_loss_12": 3.151083838939667, "ce_loss_17": 3.0425574660301207, "ce_loss_23": 2.9955819845199585, "ce_loss_3": 3.8394291162490846, "ce_loss_6": 3.4562087774276735, "epoch": 0.947, "grad_norm": 936.0, "kl_loss_12": 333.69583892822266, "kl_loss_17": 86.60095405578613, "kl_loss_3": 1847.519189453125, "kl_loss_6": 1059.3768951416016, "learning_rate": 7.055001909504755e-06, "loss": 850.0775, "step": 9470 }, { "ce_loss_12": 3.189653384685516, "ce_loss_17": 3.0824284672737123, "ce_loss_23": 3.0342490315437316, "ce_loss_3": 3.8908013105392456, "ce_loss_6": 3.494161105155945, "epoch": 0.948, "grad_norm": 1168.0, "kl_loss_12": 329.65736083984376, "kl_loss_17": 85.09084739685059, "kl_loss_3": 1888.2302124023438, "kl_loss_6": 1074.1681365966797, "learning_rate": 6.791885693514133e-06, "loss": 849.4602, "step": 9480 }, { "ce_loss_12": 3.09908903837204, "ce_loss_17": 2.9901888966560364, "ce_loss_23": 2.9434477210044863, "ce_loss_3": 3.8163867354393006, "ce_loss_6": 3.4153868079185488, "epoch": 0.949, "grad_norm": 1384.0, "kl_loss_12": 330.6887634277344, "kl_loss_17": 85.88697090148926, "kl_loss_3": 1924.2292724609374, "kl_loss_6": 1094.1362182617188, "learning_rate": 6.533736077758867e-06, "loss": 859.2564, "step": 9490 }, { "ce_loss_12": 3.0679742455482484, "ce_loss_17": 2.953362250328064, "ce_loss_23": 2.903950023651123, "ce_loss_3": 3.8092184901237487, "ce_loss_6": 3.3871954679489136, "epoch": 0.95, "grad_norm": 1224.0, "kl_loss_12": 338.8248016357422, "kl_loss_17": 86.66958503723144, "kl_loss_3": 1991.7789733886718, "kl_loss_6": 1120.8234252929688, "learning_rate": 6.2805556618028556e-06, "loss": 859.4271, "step": 9500 }, { "ce_loss_12": 3.1421627521514894, "ce_loss_17": 3.037944483757019, "ce_loss_23": 2.9918121695518494, "ce_loss_3": 3.8194591641426086, "ce_loss_6": 3.4336347937583924, "epoch": 0.951, "grad_norm": 1072.0, "kl_loss_12": 315.54112854003904, "kl_loss_17": 82.4297420501709, "kl_loss_3": 1820.9282653808593, "kl_loss_6": 1024.5725524902343, "learning_rate": 6.032346995169968e-06, "loss": 808.4711, "step": 9510 }, { "ce_loss_12": 3.1413091897964476, "ce_loss_17": 3.034634304046631, "ce_loss_23": 2.9904202222824097, "ce_loss_3": 3.851827323436737, "ce_loss_6": 3.446918821334839, "epoch": 0.952, "grad_norm": 1336.0, "kl_loss_12": 327.0206527709961, "kl_loss_17": 84.0272274017334, "kl_loss_3": 1891.8768371582032, "kl_loss_6": 1072.3389282226562, "learning_rate": 5.789112577318789e-06, "loss": 837.899, "step": 9520 }, { "ce_loss_12": 3.132501184940338, "ce_loss_17": 3.020099472999573, "ce_loss_23": 2.9733578681945803, "ce_loss_3": 3.8476534247398377, "ce_loss_6": 3.4466808438301086, "epoch": 0.953, "grad_norm": 808.0, "kl_loss_12": 332.8140274047852, "kl_loss_17": 86.41438598632813, "kl_loss_3": 1920.221014404297, "kl_loss_6": 1092.9197998046875, "learning_rate": 5.550854857617194e-06, "loss": 838.1862, "step": 9530 }, { "ce_loss_12": 3.1130447149276734, "ce_loss_17": 3.000280570983887, "ce_loss_23": 2.9520017981529234, "ce_loss_3": 3.850728416442871, "ce_loss_6": 3.4371416687965395, "epoch": 0.954, "grad_norm": 1072.0, "kl_loss_12": 340.49346618652345, "kl_loss_17": 87.64574966430663, "kl_loss_3": 1954.3390808105469, "kl_loss_6": 1107.7118560791016, "learning_rate": 5.317576235317756e-06, "loss": 863.4284, "step": 9540 }, { "ce_loss_12": 3.1355828166007997, "ce_loss_17": 3.029031991958618, "ce_loss_23": 2.9835096120834352, "ce_loss_3": 3.8195814967155455, "ce_loss_6": 3.436410915851593, "epoch": 0.955, "grad_norm": 1012.0, "kl_loss_12": 320.99874114990234, "kl_loss_17": 84.38487701416015, "kl_loss_3": 1820.7837524414062, "kl_loss_6": 1038.4941101074219, "learning_rate": 5.089279059533658e-06, "loss": 839.8436, "step": 9550 }, { "ce_loss_12": 3.1969587922096254, "ce_loss_17": 3.0810975074768066, "ce_loss_23": 3.030389428138733, "ce_loss_3": 3.8857970118522642, "ce_loss_6": 3.496463358402252, "epoch": 0.956, "grad_norm": 1160.0, "kl_loss_12": 340.56368255615234, "kl_loss_17": 88.42728080749512, "kl_loss_3": 1877.9746032714843, "kl_loss_6": 1079.9369049072266, "learning_rate": 4.865965629214819e-06, "loss": 840.1536, "step": 9560 }, { "ce_loss_12": 3.1409691333770753, "ce_loss_17": 3.029647672176361, "ce_loss_23": 2.9838614463806152, "ce_loss_3": 3.8524129986763, "ce_loss_6": 3.45097074508667, "epoch": 0.957, "grad_norm": 884.0, "kl_loss_12": 334.6656768798828, "kl_loss_17": 85.21070098876953, "kl_loss_3": 1923.1624877929687, "kl_loss_6": 1092.950177001953, "learning_rate": 4.6476381931251366e-06, "loss": 841.2354, "step": 9570 }, { "ce_loss_12": 3.126495099067688, "ce_loss_17": 3.0162636756896974, "ce_loss_23": 2.9702324748039244, "ce_loss_3": 3.824518692493439, "ce_loss_6": 3.4368972539901734, "epoch": 0.958, "grad_norm": 824.0, "kl_loss_12": 323.58738861083987, "kl_loss_17": 83.89055442810059, "kl_loss_3": 1857.772998046875, "kl_loss_6": 1060.3517059326173, "learning_rate": 4.434298949819449e-06, "loss": 835.967, "step": 9580 }, { "ce_loss_12": 3.0928184270858763, "ce_loss_17": 2.9766878366470335, "ce_loss_23": 2.929574990272522, "ce_loss_3": 3.835130035877228, "ce_loss_6": 3.425615167617798, "epoch": 0.959, "grad_norm": 872.0, "kl_loss_12": 344.17464141845704, "kl_loss_17": 90.14529151916504, "kl_loss_3": 1993.1905578613282, "kl_loss_6": 1141.2810089111329, "learning_rate": 4.2259500476214406e-06, "loss": 866.5919, "step": 9590 }, { "ce_loss_12": 3.07260400056839, "ce_loss_17": 2.9639098167419435, "ce_loss_23": 2.915282893180847, "ce_loss_3": 3.7865395665168764, "ce_loss_6": 3.3866448998451233, "epoch": 0.96, "grad_norm": 1208.0, "kl_loss_12": 330.17433776855466, "kl_loss_17": 85.21898574829102, "kl_loss_3": 1919.545556640625, "kl_loss_6": 1089.7303771972656, "learning_rate": 4.02259358460233e-06, "loss": 844.8156, "step": 9600 }, { "ce_loss_12": 3.134173059463501, "ce_loss_17": 3.02146919965744, "ce_loss_23": 2.9746540904045107, "ce_loss_3": 3.835234320163727, "ce_loss_6": 3.444248282909393, "epoch": 0.961, "grad_norm": 896.0, "kl_loss_12": 329.57266693115236, "kl_loss_17": 85.9197624206543, "kl_loss_3": 1850.4071044921875, "kl_loss_6": 1055.000308227539, "learning_rate": 3.8242316085594916e-06, "loss": 833.8258, "step": 9610 }, { "ce_loss_12": 3.028651475906372, "ce_loss_17": 2.9132359266281127, "ce_loss_23": 2.864736044406891, "ce_loss_3": 3.7799143195152283, "ce_loss_6": 3.3598650455474854, "epoch": 0.962, "grad_norm": 1112.0, "kl_loss_12": 335.8763122558594, "kl_loss_17": 87.01072463989257, "kl_loss_3": 1995.7976135253907, "kl_loss_6": 1126.1893524169923, "learning_rate": 3.630866116995757e-06, "loss": 873.6197, "step": 9620 }, { "ce_loss_12": 3.167985463142395, "ce_loss_17": 3.0596530079841613, "ce_loss_23": 3.0135764598846437, "ce_loss_3": 3.8555119752883913, "ce_loss_6": 3.4640034198760987, "epoch": 0.963, "grad_norm": 972.0, "kl_loss_12": 323.5938186645508, "kl_loss_17": 83.68088264465332, "kl_loss_3": 1853.1992553710938, "kl_loss_6": 1047.2032257080077, "learning_rate": 3.4424990570994797e-06, "loss": 848.4127, "step": 9630 }, { "ce_loss_12": 3.158530128002167, "ce_loss_17": 3.0523101449012757, "ce_loss_23": 3.005872070789337, "ce_loss_3": 3.8581893920898436, "ce_loss_6": 3.4666857361793517, "epoch": 0.964, "grad_norm": 1056.0, "kl_loss_12": 325.6300765991211, "kl_loss_17": 84.23550529479981, "kl_loss_3": 1884.3258483886718, "kl_loss_6": 1070.839437866211, "learning_rate": 3.2591323257248896e-06, "loss": 842.3455, "step": 9640 }, { "ce_loss_12": 3.0171950697898864, "ce_loss_17": 2.907667005062103, "ce_loss_23": 2.860476005077362, "ce_loss_3": 3.7420357704162597, "ce_loss_6": 3.3383049249649046, "epoch": 0.965, "grad_norm": 1168.0, "kl_loss_12": 326.80652008056643, "kl_loss_17": 84.05732383728028, "kl_loss_3": 1914.0955505371094, "kl_loss_6": 1086.868325805664, "learning_rate": 3.0807677693729385e-06, "loss": 858.0629, "step": 9650 }, { "ce_loss_12": 3.1908772110939028, "ce_loss_17": 3.0809871912002564, "ce_loss_23": 3.0360643744468687, "ce_loss_3": 3.8822084307670592, "ce_loss_6": 3.501388466358185, "epoch": 0.966, "grad_norm": 1008.0, "kl_loss_12": 325.91085662841795, "kl_loss_17": 84.52700004577636, "kl_loss_3": 1853.4956420898438, "kl_loss_6": 1064.3986785888671, "learning_rate": 2.9074071841727055e-06, "loss": 828.0664, "step": 9660 }, { "ce_loss_12": 3.12662855386734, "ce_loss_17": 3.0138697266578673, "ce_loss_23": 2.968185830116272, "ce_loss_3": 3.82765930891037, "ce_loss_6": 3.4405104398727415, "epoch": 0.967, "grad_norm": 1624.0, "kl_loss_12": 328.47566375732424, "kl_loss_17": 84.35643615722657, "kl_loss_3": 1888.4803894042968, "kl_loss_6": 1080.2502044677735, "learning_rate": 2.739052315863355e-06, "loss": 826.3218, "step": 9670 }, { "ce_loss_12": 3.0976916670799257, "ce_loss_17": 2.990750730037689, "ce_loss_23": 2.9455241322517396, "ce_loss_3": 3.8131043910980225, "ce_loss_6": 3.404400360584259, "epoch": 0.968, "grad_norm": 1272.0, "kl_loss_12": 327.33734436035155, "kl_loss_17": 84.63534660339356, "kl_loss_3": 1917.4770935058593, "kl_loss_6": 1077.3879791259765, "learning_rate": 2.5757048597765396e-06, "loss": 839.4121, "step": 9680 }, { "ce_loss_12": 3.1200202107429504, "ce_loss_17": 3.008747708797455, "ce_loss_23": 2.9609153032302857, "ce_loss_3": 3.8308356046676635, "ce_loss_6": 3.435415732860565, "epoch": 0.969, "grad_norm": 1096.0, "kl_loss_12": 331.40294494628904, "kl_loss_17": 84.86789970397949, "kl_loss_3": 1902.2723022460937, "kl_loss_6": 1090.882376098633, "learning_rate": 2.417366460819359e-06, "loss": 848.7949, "step": 9690 }, { "ce_loss_12": 3.134627604484558, "ce_loss_17": 3.0217684507369995, "ce_loss_23": 2.9727313041687013, "ce_loss_3": 3.8631083250045775, "ce_loss_6": 3.451991927623749, "epoch": 0.97, "grad_norm": 1104.0, "kl_loss_12": 336.93389892578125, "kl_loss_17": 87.41482429504394, "kl_loss_3": 1944.5840698242187, "kl_loss_6": 1095.5060607910157, "learning_rate": 2.2640387134577057e-06, "loss": 846.1895, "step": 9700 }, { "ce_loss_12": 3.0515653133392333, "ce_loss_17": 2.9476856708526613, "ce_loss_23": 2.9044357180595397, "ce_loss_3": 3.7333238244056703, "ce_loss_6": 3.3488406419754027, "epoch": 0.971, "grad_norm": 884.0, "kl_loss_12": 313.5850357055664, "kl_loss_17": 80.26754531860351, "kl_loss_3": 1804.5651916503907, "kl_loss_6": 1024.8025848388672, "learning_rate": 2.115723161700278e-06, "loss": 822.8891, "step": 9710 }, { "ce_loss_12": 3.0469194531440733, "ce_loss_17": 2.931409239768982, "ce_loss_23": 2.8847802877426147, "ce_loss_3": 3.774531900882721, "ce_loss_6": 3.364378833770752, "epoch": 0.972, "grad_norm": 1032.0, "kl_loss_12": 338.0134704589844, "kl_loss_17": 87.07059669494629, "kl_loss_3": 1949.6646728515625, "kl_loss_6": 1105.4224090576172, "learning_rate": 1.9724212990830937e-06, "loss": 862.8866, "step": 9720 }, { "ce_loss_12": 3.176584577560425, "ce_loss_17": 3.0638570547103883, "ce_loss_23": 3.0177934408187865, "ce_loss_3": 3.896821069717407, "ce_loss_6": 3.4962318897247315, "epoch": 0.973, "grad_norm": 1392.0, "kl_loss_12": 336.9397857666016, "kl_loss_17": 85.75758895874023, "kl_loss_3": 1931.8845520019531, "kl_loss_6": 1098.299368286133, "learning_rate": 1.8341345686543331e-06, "loss": 854.241, "step": 9730 }, { "ce_loss_12": 3.153524124622345, "ce_loss_17": 3.0473845958709718, "ce_loss_23": 3.0015310406684876, "ce_loss_3": 3.8341501355171204, "ce_loss_6": 3.457658517360687, "epoch": 0.974, "grad_norm": 1400.0, "kl_loss_12": 324.8598434448242, "kl_loss_17": 84.20447654724121, "kl_loss_3": 1832.1616271972657, "kl_loss_6": 1051.3619445800782, "learning_rate": 1.7008643629596864e-06, "loss": 851.5814, "step": 9740 }, { "ce_loss_12": 3.1431527733802795, "ce_loss_17": 3.0320016860961916, "ce_loss_23": 2.9844090819358824, "ce_loss_3": 3.8373767614364622, "ce_loss_6": 3.4459742307662964, "epoch": 0.975, "grad_norm": 860.0, "kl_loss_12": 328.33545074462893, "kl_loss_17": 84.7749095916748, "kl_loss_3": 1882.492529296875, "kl_loss_6": 1063.1302673339844, "learning_rate": 1.5726120240288633e-06, "loss": 852.7666, "step": 9750 }, { "ce_loss_12": 3.0448407530784607, "ce_loss_17": 2.940678894519806, "ce_loss_23": 2.8972470283508303, "ce_loss_3": 3.7465156078338624, "ce_loss_6": 3.3575788140296936, "epoch": 0.976, "grad_norm": 972.0, "kl_loss_12": 327.9582092285156, "kl_loss_17": 83.42790260314942, "kl_loss_3": 1890.5948608398437, "kl_loss_6": 1073.0351593017579, "learning_rate": 1.4493788433612708e-06, "loss": 840.3595, "step": 9760 }, { "ce_loss_12": 3.1668558955192565, "ce_loss_17": 3.05363472700119, "ce_loss_23": 3.0063628435134886, "ce_loss_3": 3.8779948472976686, "ce_loss_6": 3.4830434560775756, "epoch": 0.977, "grad_norm": 792.0, "kl_loss_12": 333.98754425048827, "kl_loss_17": 85.3881763458252, "kl_loss_3": 1917.9533081054688, "kl_loss_6": 1089.0577514648437, "learning_rate": 1.3311660619138578e-06, "loss": 855.6904, "step": 9770 }, { "ce_loss_12": 3.160209369659424, "ce_loss_17": 3.051060974597931, "ce_loss_23": 3.005156457424164, "ce_loss_3": 3.82720787525177, "ce_loss_6": 3.4543066024780273, "epoch": 0.978, "grad_norm": 776.0, "kl_loss_12": 326.8641891479492, "kl_loss_17": 84.59445114135742, "kl_loss_3": 1810.7693969726563, "kl_loss_6": 1046.252163696289, "learning_rate": 1.2179748700879012e-06, "loss": 837.7814, "step": 9780 }, { "ce_loss_12": 3.093959939479828, "ce_loss_17": 2.9830276489257814, "ce_loss_23": 2.9352121472358705, "ce_loss_3": 3.794635844230652, "ce_loss_6": 3.4019384741783143, "epoch": 0.979, "grad_norm": 1352.0, "kl_loss_12": 324.03418731689453, "kl_loss_17": 83.56771278381348, "kl_loss_3": 1865.2319885253905, "kl_loss_6": 1053.4732147216796, "learning_rate": 1.1098064077174619e-06, "loss": 840.0145, "step": 9790 }, { "ce_loss_12": 3.1171369314193726, "ce_loss_17": 3.0069626808166503, "ce_loss_23": 2.960393154621124, "ce_loss_3": 3.846533477306366, "ce_loss_6": 3.4357937932014466, "epoch": 0.98, "grad_norm": 992.0, "kl_loss_12": 327.81168365478516, "kl_loss_17": 84.16520195007324, "kl_loss_3": 1931.3702514648437, "kl_loss_6": 1085.5306610107423, "learning_rate": 1.006661764057837e-06, "loss": 848.5051, "step": 9800 }, { "ce_loss_12": 3.124183714389801, "ce_loss_17": 3.016022801399231, "ce_loss_23": 2.9706171631813048, "ce_loss_3": 3.83089302778244, "ce_loss_6": 3.4350167989730833, "epoch": 0.981, "grad_norm": 1024.0, "kl_loss_12": 325.91871337890626, "kl_loss_17": 84.2316764831543, "kl_loss_3": 1892.4080688476563, "kl_loss_6": 1070.907534790039, "learning_rate": 9.085419777743465e-07, "loss": 837.4656, "step": 9810 }, { "ce_loss_12": 3.0744644165039063, "ce_loss_17": 2.969471788406372, "ce_loss_23": 2.9252761244773864, "ce_loss_3": 3.7865662932395936, "ce_loss_6": 3.387745177745819, "epoch": 0.982, "grad_norm": 956.0, "kl_loss_12": 323.53853302001954, "kl_loss_17": 81.95206718444824, "kl_loss_3": 1891.0988708496093, "kl_loss_6": 1076.6217407226563, "learning_rate": 8.15448036932176e-07, "loss": 828.8999, "step": 9820 }, { "ce_loss_12": 3.1218610644340514, "ce_loss_17": 3.0108360290527343, "ce_loss_23": 2.9639970779418947, "ce_loss_3": 3.821909260749817, "ce_loss_6": 3.4302916169166564, "epoch": 0.983, "grad_norm": 968.0, "kl_loss_12": 328.03336334228516, "kl_loss_17": 83.96088600158691, "kl_loss_3": 1896.133447265625, "kl_loss_6": 1085.7091247558594, "learning_rate": 7.273808789862724e-07, "loss": 854.9318, "step": 9830 }, { "ce_loss_12": 3.188025426864624, "ce_loss_17": 3.0801629781723023, "ce_loss_23": 3.0319138884544374, "ce_loss_3": 3.8802616000175476, "ce_loss_6": 3.491058576107025, "epoch": 0.984, "grad_norm": 812.0, "kl_loss_12": 330.5854736328125, "kl_loss_17": 85.91607627868652, "kl_loss_3": 1877.5823303222655, "kl_loss_6": 1070.721142578125, "learning_rate": 6.443413907720186e-07, "loss": 838.1486, "step": 9840 }, { "ce_loss_12": 3.128363573551178, "ce_loss_17": 3.0203511357307433, "ce_loss_23": 2.975878059864044, "ce_loss_3": 3.831292140483856, "ce_loss_6": 3.431589329242706, "epoch": 0.985, "grad_norm": 1096.0, "kl_loss_12": 326.2767532348633, "kl_loss_17": 84.12903938293456, "kl_loss_3": 1856.6162231445312, "kl_loss_6": 1048.189892578125, "learning_rate": 5.663304084960185e-07, "loss": 833.4552, "step": 9850 }, { "ce_loss_12": 3.0610152006149294, "ce_loss_17": 2.955245614051819, "ce_loss_23": 2.9065255403518675, "ce_loss_3": 3.7786110639572144, "ce_loss_6": 3.3778342723846437, "epoch": 0.986, "grad_norm": 1424.0, "kl_loss_12": 330.9642135620117, "kl_loss_17": 87.89242706298828, "kl_loss_3": 1924.0705871582031, "kl_loss_6": 1085.759259033203, "learning_rate": 4.933487177280482e-07, "loss": 835.5607, "step": 9860 }, { "ce_loss_12": 3.1539594650268556, "ce_loss_17": 3.048413944244385, "ce_loss_23": 3.004293072223663, "ce_loss_3": 3.8455660343170166, "ce_loss_6": 3.4557252764701842, "epoch": 0.987, "grad_norm": 1072.0, "kl_loss_12": 320.58961791992186, "kl_loss_17": 82.36095275878907, "kl_loss_3": 1859.45830078125, "kl_loss_6": 1056.7829376220702, "learning_rate": 4.2539705339295075e-07, "loss": 828.8172, "step": 9870 }, { "ce_loss_12": 3.0203701496124267, "ce_loss_17": 2.909262204170227, "ce_loss_23": 2.8623374581336973, "ce_loss_3": 3.741202163696289, "ce_loss_6": 3.3373022198677065, "epoch": 0.988, "grad_norm": 904.0, "kl_loss_12": 334.73585510253906, "kl_loss_17": 83.74605751037598, "kl_loss_3": 1923.2291320800782, "kl_loss_6": 1087.5317352294921, "learning_rate": 3.6247609976319816e-07, "loss": 841.1823, "step": 9880 }, { "ce_loss_12": 3.1118029594421386, "ce_loss_17": 2.9975714921951293, "ce_loss_23": 2.9505231618881225, "ce_loss_3": 3.838503098487854, "ce_loss_6": 3.4238746047019957, "epoch": 0.989, "grad_norm": 1248.0, "kl_loss_12": 330.36513671875, "kl_loss_17": 85.91090202331543, "kl_loss_3": 1926.5442749023437, "kl_loss_6": 1087.4236846923827, "learning_rate": 3.0458649045211895e-07, "loss": 863.6676, "step": 9890 }, { "ce_loss_12": 3.079368329048157, "ce_loss_17": 2.9630495429039003, "ce_loss_23": 2.914914536476135, "ce_loss_3": 3.8031314492225645, "ce_loss_6": 3.4066123366355896, "epoch": 0.99, "grad_norm": 1020.0, "kl_loss_12": 335.53031463623046, "kl_loss_17": 86.8194580078125, "kl_loss_3": 1906.4886291503906, "kl_loss_6": 1089.162533569336, "learning_rate": 2.517288084074587e-07, "loss": 859.5178, "step": 9900 }, { "ce_loss_12": 3.1262808203697205, "ce_loss_17": 3.0067719340324404, "ce_loss_23": 2.9572256088256834, "ce_loss_3": 3.8658326506614684, "ce_loss_6": 3.4577017426490784, "epoch": 0.991, "grad_norm": 1020.0, "kl_loss_12": 345.738835144043, "kl_loss_17": 87.76178359985352, "kl_loss_3": 1967.6563049316405, "kl_loss_6": 1125.1915313720704, "learning_rate": 2.0390358590538505e-07, "loss": 862.3332, "step": 9910 }, { "ce_loss_12": 3.121526575088501, "ce_loss_17": 3.0088306665420532, "ce_loss_23": 2.961964523792267, "ce_loss_3": 3.8342851638793944, "ce_loss_6": 3.434750163555145, "epoch": 0.992, "grad_norm": 1128.0, "kl_loss_12": 335.33690185546874, "kl_loss_17": 85.0649803161621, "kl_loss_3": 1908.5850769042968, "kl_loss_6": 1088.622216796875, "learning_rate": 1.61111304545436e-07, "loss": 841.5836, "step": 9920 }, { "ce_loss_12": 3.093986284732819, "ce_loss_17": 2.98314254283905, "ce_loss_23": 2.9363115191459657, "ce_loss_3": 3.7975124955177306, "ce_loss_6": 3.4003641247749328, "epoch": 0.993, "grad_norm": 972.0, "kl_loss_12": 327.34204864501953, "kl_loss_17": 84.11604232788086, "kl_loss_3": 1882.315252685547, "kl_loss_6": 1068.8870910644532, "learning_rate": 1.2335239524541298e-07, "loss": 831.5901, "step": 9930 }, { "ce_loss_12": 3.062349224090576, "ce_loss_17": 2.949763739109039, "ce_loss_23": 2.9034113764762877, "ce_loss_3": 3.7702368974685667, "ce_loss_6": 3.3707385420799256, "epoch": 0.994, "grad_norm": 944.0, "kl_loss_12": 328.5026824951172, "kl_loss_17": 84.36543121337891, "kl_loss_3": 1878.0137756347656, "kl_loss_6": 1060.1397888183594, "learning_rate": 9.06272382371065e-08, "loss": 841.7564, "step": 9940 }, { "ce_loss_12": 3.1279003024101257, "ce_loss_17": 3.017787003517151, "ce_loss_23": 2.973386991024017, "ce_loss_3": 3.852622413635254, "ce_loss_6": 3.445349323749542, "epoch": 0.995, "grad_norm": 996.0, "kl_loss_12": 335.48767242431643, "kl_loss_17": 85.91932830810546, "kl_loss_3": 1929.320849609375, "kl_loss_6": 1100.2056854248046, "learning_rate": 6.293616306246586e-08, "loss": 850.2181, "step": 9950 }, { "ce_loss_12": 3.115555453300476, "ce_loss_17": 3.0102423310279844, "ce_loss_23": 2.9651247382164003, "ce_loss_3": 3.7982796311378477, "ce_loss_6": 3.4162571668624877, "epoch": 0.996, "grad_norm": 1224.0, "kl_loss_12": 320.39947814941405, "kl_loss_17": 81.87819061279296, "kl_loss_3": 1844.3757263183593, "kl_loss_6": 1054.789862060547, "learning_rate": 4.027944857032395e-08, "loss": 816.7188, "step": 9960 }, { "ce_loss_12": 3.1120410203933715, "ce_loss_17": 3.010934889316559, "ce_loss_23": 2.9690916180610656, "ce_loss_3": 3.7765786409378053, "ce_loss_6": 3.4030073404312136, "epoch": 0.997, "grad_norm": 972.0, "kl_loss_12": 308.8306060791016, "kl_loss_17": 80.754195022583, "kl_loss_3": 1784.6604797363282, "kl_loss_6": 1010.75185546875, "learning_rate": 2.265732291356626e-08, "loss": 806.2186, "step": 9970 }, { "ce_loss_12": 3.1557921767234802, "ce_loss_17": 3.050547778606415, "ce_loss_23": 3.00582869052887, "ce_loss_3": 3.840213453769684, "ce_loss_6": 3.453501784801483, "epoch": 0.998, "grad_norm": 776.0, "kl_loss_12": 326.1690475463867, "kl_loss_17": 83.65138893127441, "kl_loss_3": 1832.3079162597655, "kl_loss_6": 1048.4389709472657, "learning_rate": 1.0069963546743833e-08, "loss": 843.8186, "step": 9980 }, { "ce_loss_12": 3.1398675203323365, "ce_loss_17": 3.0271052956581115, "ce_loss_23": 2.9797831535339356, "ce_loss_3": 3.8401438117027284, "ce_loss_6": 3.447658562660217, "epoch": 0.999, "grad_norm": 1192.0, "kl_loss_12": 331.73143615722654, "kl_loss_17": 85.19338836669922, "kl_loss_3": 1888.2928466796875, "kl_loss_6": 1073.5358764648438, "learning_rate": 2.517497224463483e-09, "loss": 838.4053, "step": 9990 }, { "ce_loss_12": 3.0968006253242493, "ce_loss_17": 2.980567467212677, "ce_loss_23": 2.933296859264374, "ce_loss_3": 3.8451858282089235, "ce_loss_6": 3.42595511674881, "epoch": 1.0, "grad_norm": 1136.0, "kl_loss_12": 338.3313674926758, "kl_loss_17": 86.15568351745605, "kl_loss_3": 1987.994921875, "kl_loss_6": 1112.111392211914, "learning_rate": 0.0, "loss": 866.3291, "step": 10000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.502582338838856e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }