diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16049 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "ce_loss_12": 11.027713775634766, + "ce_loss_17": 8.15991497039795, + "ce_loss_23": 2.965851306915283, + "ce_loss_3": 10.737602233886719, + "ce_loss_6": 11.190012454986572, + "epoch": 0.0001, + "grad_norm": 45824.0, + "kl_loss_12": 17517.9384765625, + "kl_loss_17": 11834.82177734375, + "kl_loss_3": 16256.49951171875, + "kl_loss_6": 17406.0986328125, + "learning_rate": 1e-05, + "loss": 16093.7969, + "step": 1 + }, + { + "ce_loss_12": 8.402964644961887, + "ce_loss_17": 6.732524315516154, + "ce_loss_23": 3.0286831193500094, + "ce_loss_3": 9.210747030046251, + "ce_loss_6": 9.099789328045315, + "epoch": 0.001, + "grad_norm": 13056.0, + "kl_loss_12": 11511.38658311632, + "kl_loss_17": 8410.016059027777, + "kl_loss_3": 12613.807562934027, + "kl_loss_6": 12666.355251736111, + "learning_rate": 0.0001, + "loss": 11336.5642, + "step": 10 + }, + { + "ce_loss_12": 5.460555911064148, + "ce_loss_17": 4.456392741203308, + "ce_loss_23": 3.0322046637535096, + "ce_loss_3": 7.06470832824707, + "ce_loss_6": 6.68152768611908, + "epoch": 0.002, + "grad_norm": 3008.0, + "kl_loss_12": 4608.932666015625, + "kl_loss_17": 2811.935650634766, + "kl_loss_3": 7636.5174560546875, + "kl_loss_6": 6889.9828125, + "learning_rate": 0.0002, + "loss": 5498.3336, + "step": 20 + }, + { + "ce_loss_12": 4.363038396835327, + "ce_loss_17": 3.561879241466522, + "ce_loss_23": 2.842976653575897, + "ce_loss_3": 6.049652767181397, + "ce_loss_6": 5.463876032829285, + "epoch": 0.003, + "grad_norm": 2864.0, + "kl_loss_12": 2808.842004394531, + "kl_loss_17": 1373.155780029297, + "kl_loss_3": 6175.716845703125, + "kl_loss_6": 5020.55859375, + "learning_rate": 0.0003, + "loss": 3815.8828, + "step": 30 + }, + { + "ce_loss_12": 4.206813776493073, + "ce_loss_17": 3.54139963388443, + "ce_loss_23": 3.007910418510437, + "ce_loss_3": 5.702505111694336, + "ce_loss_6": 5.18826208114624, + "epoch": 0.004, + "grad_norm": 1472.0, + "kl_loss_12": 2237.032061767578, + "kl_loss_17": 974.1219360351563, + "kl_loss_3": 5216.508642578125, + "kl_loss_6": 4193.483740234375, + "learning_rate": 0.0004, + "loss": 3168.891, + "step": 40 + }, + { + "ce_loss_12": 3.999967801570892, + "ce_loss_17": 3.3918786883354186, + "ce_loss_23": 2.9720388770103456, + "ce_loss_3": 5.477003407478333, + "ce_loss_6": 4.948132801055908, + "epoch": 0.005, + "grad_norm": 2352.0, + "kl_loss_12": 1895.5821166992187, + "kl_loss_17": 762.8674591064453, + "kl_loss_3": 4879.4290771484375, + "kl_loss_6": 3856.8288696289064, + "learning_rate": 0.0005, + "loss": 2838.7225, + "step": 50 + }, + { + "ce_loss_12": 3.9077433347702026, + "ce_loss_17": 3.329902720451355, + "ce_loss_23": 2.9878324031829835, + "ce_loss_3": 5.317534351348877, + "ce_loss_6": 4.759640908241272, + "epoch": 0.006, + "grad_norm": 1752.0, + "kl_loss_12": 1724.694140625, + "kl_loss_17": 611.9558654785156, + "kl_loss_3": 4550.454077148437, + "kl_loss_6": 3437.15751953125, + "learning_rate": 0.0006, + "loss": 2595.8715, + "step": 60 + }, + { + "ce_loss_12": 3.709681046009064, + "ce_loss_17": 3.2019996762275698, + "ce_loss_23": 2.9102911412715913, + "ce_loss_3": 5.179440522193909, + "ce_loss_6": 4.576542472839355, + "epoch": 0.007, + "grad_norm": 1256.0, + "kl_loss_12": 1498.8394775390625, + "kl_loss_17": 528.5001907348633, + "kl_loss_3": 4437.479809570313, + "kl_loss_6": 3269.5957275390624, + "learning_rate": 0.0007, + "loss": 2441.3959, + "step": 70 + }, + { + "ce_loss_12": 3.632861626148224, + "ce_loss_17": 3.1742521047592165, + "ce_loss_23": 2.9080315828323364, + "ce_loss_3": 5.128901815414428, + "ce_loss_6": 4.467587268352508, + "epoch": 0.008, + "grad_norm": 1616.0, + "kl_loss_12": 1368.2988586425781, + "kl_loss_17": 472.89087371826173, + "kl_loss_3": 4385.906018066406, + "kl_loss_6": 3072.7216918945314, + "learning_rate": 0.0008, + "loss": 2346.1152, + "step": 80 + }, + { + "ce_loss_12": 3.569024109840393, + "ce_loss_17": 3.1099191904067993, + "ce_loss_23": 2.872076082229614, + "ce_loss_3": 5.064512729644775, + "ce_loss_6": 4.37619469165802, + "epoch": 0.009, + "grad_norm": 1744.0, + "kl_loss_12": 1336.7516662597657, + "kl_loss_17": 441.59217834472656, + "kl_loss_3": 4326.397912597657, + "kl_loss_6": 2987.296838378906, + "learning_rate": 0.0009000000000000001, + "loss": 2263.0539, + "step": 90 + }, + { + "ce_loss_12": 3.666916310787201, + "ce_loss_17": 3.244144332408905, + "ce_loss_23": 2.976801860332489, + "ce_loss_3": 5.146626138687134, + "ce_loss_6": 4.481168866157532, + "epoch": 0.01, + "grad_norm": 1872.0, + "kl_loss_12": 1313.0016296386718, + "kl_loss_17": 508.8273468017578, + "kl_loss_3": 4289.265026855469, + "kl_loss_6": 2982.880078125, + "learning_rate": 0.001, + "loss": 2259.932, + "step": 100 + }, + { + "ce_loss_12": 3.6309967994689942, + "ce_loss_17": 3.192095232009888, + "ce_loss_23": 2.941200518608093, + "ce_loss_3": 5.068640613555909, + "ce_loss_6": 4.384789574146271, + "epoch": 0.011, + "grad_norm": 1440.0, + "kl_loss_12": 1322.830255126953, + "kl_loss_17": 458.69312591552733, + "kl_loss_3": 4218.377465820313, + "kl_loss_6": 2874.042236328125, + "learning_rate": 0.0009999974825027757, + "loss": 2212.0162, + "step": 110 + }, + { + "ce_loss_12": 3.648383581638336, + "ce_loss_17": 3.2187762379646303, + "ce_loss_23": 3.0047112464904786, + "ce_loss_3": 4.9990413904190065, + "ce_loss_6": 4.37911046743393, + "epoch": 0.012, + "grad_norm": 1168.0, + "kl_loss_12": 1230.2259948730468, + "kl_loss_17": 396.9843719482422, + "kl_loss_3": 4003.4096557617186, + "kl_loss_6": 2775.913610839844, + "learning_rate": 0.0009999899300364532, + "loss": 2079.7395, + "step": 120 + }, + { + "ce_loss_12": 3.5785240173339843, + "ce_loss_17": 3.1897997498512267, + "ce_loss_23": 2.9631885051727296, + "ce_loss_3": 5.012343430519104, + "ce_loss_6": 4.35903844833374, + "epoch": 0.013, + "grad_norm": 1000.0, + "kl_loss_12": 1155.5515258789062, + "kl_loss_17": 420.0738891601562, + "kl_loss_3": 4048.8492919921873, + "kl_loss_6": 2753.6798706054688, + "learning_rate": 0.0009999773426770863, + "loss": 2102.8152, + "step": 130 + }, + { + "ce_loss_12": 3.590196990966797, + "ce_loss_17": 3.2399143815040587, + "ce_loss_23": 3.006642484664917, + "ce_loss_3": 4.972180771827698, + "ce_loss_6": 4.330150175094604, + "epoch": 0.014, + "grad_norm": 1040.0, + "kl_loss_12": 1109.9426330566407, + "kl_loss_17": 424.5212860107422, + "kl_loss_3": 3912.0217895507812, + "kl_loss_6": 2661.2727783203127, + "learning_rate": 0.0009999597205514296, + "loss": 2044.4775, + "step": 140 + }, + { + "ce_loss_12": 3.520315718650818, + "ce_loss_17": 3.170581007003784, + "ce_loss_23": 2.962330865859985, + "ce_loss_3": 4.890919160842896, + "ce_loss_6": 4.267568159103393, + "epoch": 0.015, + "grad_norm": 964.0, + "kl_loss_12": 1056.9698181152344, + "kl_loss_17": 380.9711288452148, + "kl_loss_3": 3821.2818237304687, + "kl_loss_6": 2600.436962890625, + "learning_rate": 0.0009999370638369377, + "loss": 1974.6367, + "step": 150 + }, + { + "ce_loss_12": 3.5510578632354735, + "ce_loss_17": 3.1863274335861207, + "ce_loss_23": 3.0006508588790894, + "ce_loss_3": 4.916771864891052, + "ce_loss_6": 4.249840044975281, + "epoch": 0.016, + "grad_norm": 1168.0, + "kl_loss_12": 1066.2155700683593, + "kl_loss_17": 342.7077606201172, + "kl_loss_3": 3826.1323852539062, + "kl_loss_6": 2510.92431640625, + "learning_rate": 0.000999909372761763, + "loss": 1937.6813, + "step": 160 + }, + { + "ce_loss_12": 3.492412841320038, + "ce_loss_17": 3.1263466000556948, + "ce_loss_23": 2.9409194469451903, + "ce_loss_3": 4.879445147514343, + "ce_loss_6": 4.203813099861145, + "epoch": 0.017, + "grad_norm": 880.0, + "kl_loss_12": 1060.1210083007813, + "kl_loss_17": 335.2873870849609, + "kl_loss_3": 3886.2561767578127, + "kl_loss_6": 2559.63173828125, + "learning_rate": 0.0009998766476047546, + "loss": 1974.0215, + "step": 170 + }, + { + "ce_loss_12": 3.5242559790611265, + "ce_loss_17": 3.1614002346992494, + "ce_loss_23": 2.975456976890564, + "ce_loss_3": 4.904916667938233, + "ce_loss_6": 4.241925597190857, + "epoch": 0.018, + "grad_norm": 788.0, + "kl_loss_12": 1035.621078491211, + "kl_loss_17": 333.09861907958987, + "kl_loss_3": 3837.9483276367187, + "kl_loss_6": 2543.54970703125, + "learning_rate": 0.0009998388886954545, + "loss": 1955.3641, + "step": 180 + }, + { + "ce_loss_12": 3.4557549357414246, + "ce_loss_17": 3.115843212604523, + "ce_loss_23": 2.939819598197937, + "ce_loss_3": 4.865855956077576, + "ce_loss_6": 4.206141090393066, + "epoch": 0.019, + "grad_norm": 848.0, + "kl_loss_12": 983.1588684082031, + "kl_loss_17": 315.833642578125, + "kl_loss_3": 3826.3384033203124, + "kl_loss_6": 2540.1477905273437, + "learning_rate": 0.0009997960964140947, + "loss": 1911.9391, + "step": 190 + }, + { + "ce_loss_12": 3.4466380953788756, + "ce_loss_17": 3.1169066548347475, + "ce_loss_23": 2.940021049976349, + "ce_loss_3": 4.852689170837403, + "ce_loss_6": 4.1931051969528195, + "epoch": 0.02, + "grad_norm": 816.0, + "kl_loss_12": 982.9840148925781, + "kl_loss_17": 335.00098419189453, + "kl_loss_3": 3858.1364135742188, + "kl_loss_6": 2551.3482177734377, + "learning_rate": 0.0009997482711915926, + "loss": 1915.3297, + "step": 200 + }, + { + "ce_loss_12": 3.3939271569252014, + "ce_loss_17": 3.0816601395606993, + "ce_loss_23": 2.9188946008682253, + "ce_loss_3": 4.788400340080261, + "ce_loss_6": 4.114619171619415, + "epoch": 0.021, + "grad_norm": 784.0, + "kl_loss_12": 928.4573394775391, + "kl_loss_17": 305.8069580078125, + "kl_loss_3": 3751.7100341796877, + "kl_loss_6": 2432.4765625, + "learning_rate": 0.0009996954135095479, + "loss": 1853.0043, + "step": 210 + }, + { + "ce_loss_12": 3.4684497714042664, + "ce_loss_17": 3.1626001238822936, + "ce_loss_23": 2.989655339717865, + "ce_loss_3": 4.786516451835633, + "ce_loss_6": 4.147206258773804, + "epoch": 0.022, + "grad_norm": 716.0, + "kl_loss_12": 920.043212890625, + "kl_loss_17": 303.1504531860352, + "kl_loss_3": 3612.4007934570313, + "kl_loss_6": 2348.7169921875, + "learning_rate": 0.0009996375239002368, + "loss": 1794.6467, + "step": 220 + }, + { + "ce_loss_12": 3.5193947434425352, + "ce_loss_17": 3.214660811424255, + "ce_loss_23": 3.05567672252655, + "ce_loss_3": 4.798125290870667, + "ce_loss_6": 4.18331127166748, + "epoch": 0.023, + "grad_norm": 836.0, + "kl_loss_12": 896.357308959961, + "kl_loss_17": 297.10578002929685, + "kl_loss_3": 3529.04033203125, + "kl_loss_6": 2312.949139404297, + "learning_rate": 0.0009995746029466072, + "loss": 1758.3887, + "step": 230 + }, + { + "ce_loss_12": 3.3271968960762024, + "ce_loss_17": 3.020535373687744, + "ce_loss_23": 2.8603257536888123, + "ce_loss_3": 4.72745258808136, + "ce_loss_6": 4.0547412753105165, + "epoch": 0.024, + "grad_norm": 704.0, + "kl_loss_12": 917.2084899902344, + "kl_loss_17": 300.5008575439453, + "kl_loss_3": 3771.9653076171876, + "kl_loss_6": 2428.7272094726563, + "learning_rate": 0.0009995066512822719, + "loss": 1795.3527, + "step": 240 + }, + { + "ce_loss_12": 3.4247890949249267, + "ce_loss_17": 3.1120498538017274, + "ce_loss_23": 2.954189193248749, + "ce_loss_3": 4.8511217594146725, + "ce_loss_6": 4.178792726993561, + "epoch": 0.025, + "grad_norm": 872.0, + "kl_loss_12": 906.7326904296875, + "kl_loss_17": 287.6933708190918, + "kl_loss_3": 3802.542761230469, + "kl_loss_6": 2477.5896545410155, + "learning_rate": 0.000999433669591504, + "loss": 1791.8957, + "step": 250 + }, + { + "ce_loss_12": 3.3140273809432985, + "ce_loss_17": 3.0202324271202086, + "ce_loss_23": 2.8615636587142945, + "ce_loss_3": 4.698658227920532, + "ce_loss_6": 4.027096796035766, + "epoch": 0.026, + "grad_norm": 936.0, + "kl_loss_12": 875.9568481445312, + "kl_loss_17": 295.5630393981934, + "kl_loss_3": 3720.644665527344, + "kl_loss_6": 2388.5712341308595, + "learning_rate": 0.000999355658609228, + "loss": 1786.2816, + "step": 260 + }, + { + "ce_loss_12": 3.3641026854515075, + "ce_loss_17": 3.0714613795280457, + "ce_loss_23": 2.888243865966797, + "ce_loss_3": 4.784202647209168, + "ce_loss_6": 4.083048665523529, + "epoch": 0.027, + "grad_norm": 780.0, + "kl_loss_12": 890.0199035644531, + "kl_loss_17": 320.20955657958984, + "kl_loss_3": 3786.4882202148438, + "kl_loss_6": 2406.1595092773437, + "learning_rate": 0.0009992726191210138, + "loss": 1818.2453, + "step": 270 + }, + { + "ce_loss_12": 3.380402755737305, + "ce_loss_17": 3.0927987456321717, + "ce_loss_23": 2.9245058417320253, + "ce_loss_3": 4.725374484062195, + "ce_loss_6": 4.070786094665527, + "epoch": 0.028, + "grad_norm": 864.0, + "kl_loss_12": 873.3745178222656, + "kl_loss_17": 311.5400817871094, + "kl_loss_3": 3613.2262939453126, + "kl_loss_6": 2304.922711181641, + "learning_rate": 0.0009991845519630679, + "loss": 1757.8809, + "step": 280 + }, + { + "ce_loss_12": 3.2682459712028504, + "ce_loss_17": 2.973331022262573, + "ce_loss_23": 2.8167771458625794, + "ce_loss_3": 4.632596039772034, + "ce_loss_6": 3.960002934932709, + "epoch": 0.029, + "grad_norm": 708.0, + "kl_loss_12": 862.7825988769531, + "kl_loss_17": 287.25118255615234, + "kl_loss_3": 3642.8630859375, + "kl_loss_6": 2318.68984375, + "learning_rate": 0.0009990914580222257, + "loss": 1770.6906, + "step": 290 + }, + { + "ce_loss_12": 3.3801196575164796, + "ce_loss_17": 3.098010754585266, + "ce_loss_23": 2.9501577854156493, + "ce_loss_3": 4.652040600776672, + "ce_loss_6": 4.022914433479309, + "epoch": 0.03, + "grad_norm": 720.0, + "kl_loss_12": 830.7660217285156, + "kl_loss_17": 263.96018981933594, + "kl_loss_3": 3458.6974487304688, + "kl_loss_6": 2199.962927246094, + "learning_rate": 0.0009989933382359422, + "loss": 1727.1951, + "step": 300 + }, + { + "ce_loss_12": 3.380411183834076, + "ce_loss_17": 3.1013339519500733, + "ce_loss_23": 2.9682934045791627, + "ce_loss_3": 4.667791080474854, + "ce_loss_6": 4.0293840050697325, + "epoch": 0.031, + "grad_norm": 772.0, + "kl_loss_12": 811.3957733154297, + "kl_loss_17": 246.9788589477539, + "kl_loss_3": 3442.3103881835937, + "kl_loss_6": 2183.353387451172, + "learning_rate": 0.0009988901935922825, + "loss": 1692.7398, + "step": 310 + }, + { + "ce_loss_12": 3.2473236680030824, + "ce_loss_17": 2.9506585717201235, + "ce_loss_23": 2.8136382699012756, + "ce_loss_3": 4.609115219116211, + "ce_loss_6": 3.9401437640190125, + "epoch": 0.032, + "grad_norm": 804.0, + "kl_loss_12": 848.0315948486328, + "kl_loss_17": 251.90018463134766, + "kl_loss_3": 3642.8263916015626, + "kl_loss_6": 2307.7186950683595, + "learning_rate": 0.0009987820251299122, + "loss": 1714.9273, + "step": 320 + }, + { + "ce_loss_12": 3.3609359502792358, + "ce_loss_17": 3.068274986743927, + "ce_loss_23": 2.9343497514724732, + "ce_loss_3": 4.620841813087464, + "ce_loss_6": 3.973675036430359, + "epoch": 0.033, + "grad_norm": 772.0, + "kl_loss_12": 812.5999633789063, + "kl_loss_17": 241.0101318359375, + "kl_loss_3": 3420.3722534179688, + "kl_loss_6": 2130.2982543945313, + "learning_rate": 0.0009986688339380862, + "loss": 1652.8191, + "step": 330 + }, + { + "ce_loss_12": 3.2960716605186464, + "ce_loss_17": 3.024462711811066, + "ce_loss_23": 2.894329881668091, + "ce_loss_3": 4.547412347793579, + "ce_loss_6": 3.915596878528595, + "epoch": 0.034, + "grad_norm": 616.0, + "kl_loss_12": 790.2532562255859, + "kl_loss_17": 232.27188873291016, + "kl_loss_3": 3347.380505371094, + "kl_loss_6": 2109.0370239257813, + "learning_rate": 0.0009985506211566387, + "loss": 1637.9498, + "step": 340 + }, + { + "ce_loss_12": 3.3186362624168395, + "ce_loss_17": 3.0482547879219055, + "ce_loss_23": 2.921323823928833, + "ce_loss_3": 4.5384005308151245, + "ce_loss_6": 3.9204172134399413, + "epoch": 0.035, + "grad_norm": 728.0, + "kl_loss_12": 766.0471405029297, + "kl_loss_17": 230.49155197143554, + "kl_loss_3": 3281.216650390625, + "kl_loss_6": 2051.8226928710938, + "learning_rate": 0.0009984273879759713, + "loss": 1596.1112, + "step": 350 + }, + { + "ce_loss_12": 3.3629719853401183, + "ce_loss_17": 3.0898334860801695, + "ce_loss_23": 2.949311065673828, + "ce_loss_3": 4.616341137886048, + "ce_loss_6": 3.9870093703269958, + "epoch": 0.036, + "grad_norm": 788.0, + "kl_loss_12": 796.4837310791015, + "kl_loss_17": 254.82774658203124, + "kl_loss_3": 3361.358996582031, + "kl_loss_6": 2109.398333740234, + "learning_rate": 0.0009982991356370402, + "loss": 1651.3512, + "step": 360 + }, + { + "ce_loss_12": 3.325030303001404, + "ce_loss_17": 3.0661404371261596, + "ce_loss_23": 2.927092969417572, + "ce_loss_3": 4.571960759162903, + "ce_loss_6": 3.94881272315979, + "epoch": 0.037, + "grad_norm": 672.0, + "kl_loss_12": 778.3520477294921, + "kl_loss_17": 251.09753952026367, + "kl_loss_3": 3342.199499511719, + "kl_loss_6": 2113.2349060058596, + "learning_rate": 0.0009981658654313456, + "loss": 1634.6564, + "step": 370 + }, + { + "ce_loss_12": 3.384810674190521, + "ce_loss_17": 3.1309564113616943, + "ce_loss_23": 2.999415564537048, + "ce_loss_3": 4.60067994594574, + "ce_loss_6": 3.989968252182007, + "epoch": 0.038, + "grad_norm": 712.0, + "kl_loss_12": 750.3938720703125, + "kl_loss_17": 242.80421447753906, + "kl_loss_3": 3255.1913208007813, + "kl_loss_6": 2043.9017211914063, + "learning_rate": 0.000998027578700917, + "loss": 1605.898, + "step": 380 + }, + { + "ce_loss_12": 3.339533841609955, + "ce_loss_17": 3.0749796628952026, + "ce_loss_23": 2.9452226638793944, + "ce_loss_3": 4.574568462371826, + "ce_loss_6": 3.956436836719513, + "epoch": 0.039, + "grad_norm": 604.0, + "kl_loss_12": 756.8385131835937, + "kl_loss_17": 238.58657760620116, + "kl_loss_3": 3296.7185913085937, + "kl_loss_6": 2071.6643005371093, + "learning_rate": 0.0009978842768382998, + "loss": 1609.5673, + "step": 390 + }, + { + "ce_loss_12": 3.3271549105644227, + "ce_loss_17": 3.083773648738861, + "ce_loss_23": 2.9559371709823608, + "ce_loss_3": 4.531307077407837, + "ce_loss_6": 3.919724702835083, + "epoch": 0.04, + "grad_norm": 648.0, + "kl_loss_12": 712.7042816162109, + "kl_loss_17": 227.77520217895508, + "kl_loss_3": 3185.6382080078124, + "kl_loss_6": 1979.2585815429688, + "learning_rate": 0.0009977359612865424, + "loss": 1547.6145, + "step": 400 + }, + { + "ce_loss_12": 3.353544569015503, + "ce_loss_17": 3.1033652782440186, + "ce_loss_23": 2.9658072710037233, + "ce_loss_3": 4.566384196281433, + "ce_loss_6": 3.953342044353485, + "epoch": 0.041, + "grad_norm": 708.0, + "kl_loss_12": 749.1101745605469, + "kl_loss_17": 260.4892097473145, + "kl_loss_3": 3245.3873901367188, + "kl_loss_6": 2035.0272338867187, + "learning_rate": 0.0009975826335391806, + "loss": 1560.9627, + "step": 410 + }, + { + "ce_loss_12": 3.3578636288642882, + "ce_loss_17": 3.122957742214203, + "ce_loss_23": 2.988275396823883, + "ce_loss_3": 4.5664080619812015, + "ce_loss_6": 3.9467087745666505, + "epoch": 0.042, + "grad_norm": 704.0, + "kl_loss_12": 722.9946594238281, + "kl_loss_17": 240.11188812255858, + "kl_loss_3": 3212.6345458984374, + "kl_loss_6": 1997.101141357422, + "learning_rate": 0.0009974242951402235, + "loss": 1549.8442, + "step": 420 + }, + { + "ce_loss_12": 3.370286190509796, + "ce_loss_17": 3.122636592388153, + "ce_loss_23": 2.9911253452301025, + "ce_loss_3": 4.592541313171386, + "ce_loss_6": 3.959289622306824, + "epoch": 0.043, + "grad_norm": 656.0, + "kl_loss_12": 739.4854614257813, + "kl_loss_17": 239.04573822021484, + "kl_loss_3": 3266.663781738281, + "kl_loss_6": 2007.7690063476562, + "learning_rate": 0.0009972609476841367, + "loss": 1543.5379, + "step": 430 + }, + { + "ce_loss_12": 3.2838348269462587, + "ce_loss_17": 3.0468017935752867, + "ce_loss_23": 2.9064618349075317, + "ce_loss_3": 4.530054998397827, + "ce_loss_6": 3.89483824968338, + "epoch": 0.044, + "grad_norm": 744.0, + "kl_loss_12": 728.234994506836, + "kl_loss_17": 247.46401443481446, + "kl_loss_3": 3284.870471191406, + "kl_loss_6": 2036.9016906738282, + "learning_rate": 0.0009970925928158272, + "loss": 1577.4176, + "step": 440 + }, + { + "ce_loss_12": 3.2323830366134643, + "ce_loss_17": 2.9942973256111145, + "ce_loss_23": 2.8556951880455017, + "ce_loss_3": 4.490892148017883, + "ce_loss_6": 3.8565974712371824, + "epoch": 0.045, + "grad_norm": 620.0, + "kl_loss_12": 736.3843292236328, + "kl_loss_17": 255.33787460327147, + "kl_loss_3": 3333.7939208984376, + "kl_loss_6": 2070.1477600097655, + "learning_rate": 0.000996919232230627, + "loss": 1588.0778, + "step": 450 + }, + { + "ce_loss_12": 3.2914091110229493, + "ce_loss_17": 3.0677689790725706, + "ce_loss_23": 2.941704607009888, + "ce_loss_3": 4.490966200828552, + "ce_loss_6": 3.880615258216858, + "epoch": 0.046, + "grad_norm": 748.0, + "kl_loss_12": 692.5785369873047, + "kl_loss_17": 234.25014266967773, + "kl_loss_3": 3176.8756591796873, + "kl_loss_6": 1969.9073364257813, + "learning_rate": 0.0009967408676742752, + "loss": 1487.7033, + "step": 460 + }, + { + "ce_loss_12": 3.4376975774765013, + "ce_loss_17": 3.204993689060211, + "ce_loss_23": 3.0785672545433043, + "ce_loss_3": 4.589035892486573, + "ce_loss_6": 3.9928988814353943, + "epoch": 0.047, + "grad_norm": 836.0, + "kl_loss_12": 709.9034606933594, + "kl_loss_17": 228.83046264648436, + "kl_loss_3": 3101.909216308594, + "kl_loss_6": 1918.1236389160156, + "learning_rate": 0.0009965575009429006, + "loss": 1532.509, + "step": 470 + }, + { + "ce_loss_12": 3.2385229229927064, + "ce_loss_17": 2.984599435329437, + "ce_loss_23": 2.8635784864425657, + "ce_loss_3": 4.449902439117432, + "ce_loss_6": 3.831362020969391, + "epoch": 0.048, + "grad_norm": 724.0, + "kl_loss_12": 722.7018585205078, + "kl_loss_17": 220.34202346801757, + "kl_loss_3": 3245.0903442382814, + "kl_loss_6": 2010.9374084472656, + "learning_rate": 0.0009963691338830043, + "loss": 1531.1553, + "step": 480 + }, + { + "ce_loss_12": 3.305763602256775, + "ce_loss_17": 3.0736878156661986, + "ce_loss_23": 2.957307231426239, + "ce_loss_3": 4.49764199256897, + "ce_loss_6": 3.8927690625190734, + "epoch": 0.049, + "grad_norm": 844.0, + "kl_loss_12": 691.3308135986329, + "kl_loss_17": 214.79136962890624, + "kl_loss_3": 3176.404748535156, + "kl_loss_6": 1954.2135070800782, + "learning_rate": 0.0009961757683914405, + "loss": 1495.7828, + "step": 490 + }, + { + "ce_loss_12": 3.3125686287879943, + "ce_loss_17": 3.0608574748039246, + "ce_loss_23": 2.941720414161682, + "ce_loss_3": 4.449414944648742, + "ce_loss_6": 3.8704373121261595, + "epoch": 0.05, + "grad_norm": 724.0, + "kl_loss_12": 713.7994873046875, + "kl_loss_17": 219.01981506347656, + "kl_loss_3": 3097.2513061523437, + "kl_loss_6": 1938.0861755371093, + "learning_rate": 0.0009959774064153978, + "loss": 1505.5656, + "step": 500 + }, + { + "ce_loss_12": 3.299354112148285, + "ce_loss_17": 3.0693908810615538, + "ce_loss_23": 2.959504556655884, + "ce_loss_3": 4.431204664707184, + "ce_loss_6": 3.84701007604599, + "epoch": 0.051, + "grad_norm": 624.0, + "kl_loss_12": 670.3522155761718, + "kl_loss_17": 201.11935424804688, + "kl_loss_3": 3017.0482177734375, + "kl_loss_6": 1854.9740051269532, + "learning_rate": 0.0009957740499523787, + "loss": 1465.6556, + "step": 510 + }, + { + "ce_loss_12": 3.3268982768058777, + "ce_loss_17": 3.08267103433609, + "ce_loss_23": 2.9721622109413146, + "ce_loss_3": 4.457084774971008, + "ce_loss_6": 3.8652746081352234, + "epoch": 0.052, + "grad_norm": 696.0, + "kl_loss_12": 694.7547119140625, + "kl_loss_17": 200.21757049560546, + "kl_loss_3": 3047.4221069335936, + "kl_loss_6": 1886.7686340332032, + "learning_rate": 0.0009955657010501807, + "loss": 1464.9039, + "step": 520 + }, + { + "ce_loss_12": 3.2911906957626345, + "ce_loss_17": 3.047533321380615, + "ce_loss_23": 2.930856394767761, + "ce_loss_3": 4.451212811470032, + "ce_loss_6": 3.8664758801460266, + "epoch": 0.053, + "grad_norm": 896.0, + "kl_loss_12": 701.0859619140625, + "kl_loss_17": 205.53631591796875, + "kl_loss_3": 3102.8578247070313, + "kl_loss_6": 1935.0799438476563, + "learning_rate": 0.000995352361806875, + "loss": 1471.9518, + "step": 530 + }, + { + "ce_loss_12": 3.3293033957481386, + "ce_loss_17": 3.0855902075767516, + "ce_loss_23": 2.9691104531288146, + "ce_loss_3": 4.465985608100891, + "ce_loss_6": 3.877950441837311, + "epoch": 0.054, + "grad_norm": 768.0, + "kl_loss_12": 701.2718719482422, + "kl_loss_17": 213.6372833251953, + "kl_loss_3": 3087.834912109375, + "kl_loss_6": 1914.8329772949219, + "learning_rate": 0.0009951340343707852, + "loss": 1498.7925, + "step": 540 + }, + { + "ce_loss_12": 3.3613967180252073, + "ce_loss_17": 3.127401924133301, + "ce_loss_23": 3.0171509265899656, + "ce_loss_3": 4.524386692047119, + "ce_loss_6": 3.936571645736694, + "epoch": 0.055, + "grad_norm": 604.0, + "kl_loss_12": 668.1853912353515, + "kl_loss_17": 200.08008728027343, + "kl_loss_3": 3067.0293701171877, + "kl_loss_6": 1924.4457397460938, + "learning_rate": 0.0009949107209404665, + "loss": 1476.3092, + "step": 550 + }, + { + "ce_loss_12": 3.27703515291214, + "ce_loss_17": 3.0475626826286315, + "ce_loss_23": 2.936045062541962, + "ce_loss_3": 4.412865042686462, + "ce_loss_6": 3.847214663028717, + "epoch": 0.056, + "grad_norm": 640.0, + "kl_loss_12": 660.5824188232422, + "kl_loss_17": 194.43747024536134, + "kl_loss_3": 3025.404443359375, + "kl_loss_6": 1895.8758911132813, + "learning_rate": 0.0009946824237646824, + "loss": 1449.3217, + "step": 560 + }, + { + "ce_loss_12": 3.2381665825843813, + "ce_loss_17": 3.0001315474510193, + "ce_loss_23": 2.8903649926185606, + "ce_loss_3": 4.40073938369751, + "ce_loss_6": 3.808553636074066, + "epoch": 0.057, + "grad_norm": 724.0, + "kl_loss_12": 684.8827117919922, + "kl_loss_17": 200.08551101684571, + "kl_loss_3": 3098.5067626953123, + "kl_loss_6": 1913.1522094726563, + "learning_rate": 0.0009944491451423828, + "loss": 1496.888, + "step": 570 + }, + { + "ce_loss_12": 3.2463485479354857, + "ce_loss_17": 2.9977986097335814, + "ce_loss_23": 2.8845686078071595, + "ce_loss_3": 4.425404500961304, + "ce_loss_6": 3.823835778236389, + "epoch": 0.058, + "grad_norm": 828.0, + "kl_loss_12": 700.3368286132812, + "kl_loss_17": 203.70085220336915, + "kl_loss_3": 3160.6127807617186, + "kl_loss_6": 1947.4135192871095, + "learning_rate": 0.0009942108874226813, + "loss": 1473.7351, + "step": 580 + }, + { + "ce_loss_12": 3.329897570610046, + "ce_loss_17": 3.096381163597107, + "ce_loss_23": 2.9894603371620176, + "ce_loss_3": 4.442056608200073, + "ce_loss_6": 3.874366784095764, + "epoch": 0.059, + "grad_norm": 784.0, + "kl_loss_12": 667.4164886474609, + "kl_loss_17": 197.30725631713867, + "kl_loss_3": 2966.1633422851564, + "kl_loss_6": 1833.4690368652343, + "learning_rate": 0.00099396765300483, + "loss": 1414.7325, + "step": 590 + }, + { + "ce_loss_12": 3.3210636496543886, + "ce_loss_17": 3.081927680969238, + "ce_loss_23": 2.971800923347473, + "ce_loss_3": 4.430813646316528, + "ce_loss_6": 3.8526097893714906, + "epoch": 0.06, + "grad_norm": 912.0, + "kl_loss_12": 680.6019836425781, + "kl_loss_17": 197.30396728515626, + "kl_loss_3": 2995.541357421875, + "kl_loss_6": 1854.753192138672, + "learning_rate": 0.0009937194443381972, + "loss": 1433.0809, + "step": 600 + }, + { + "ce_loss_12": 3.3337490797042846, + "ce_loss_17": 3.1106279253959657, + "ce_loss_23": 3.005161905288696, + "ce_loss_3": 4.419551348686218, + "ce_loss_6": 3.858152377605438, + "epoch": 0.061, + "grad_norm": 672.0, + "kl_loss_12": 648.7204620361329, + "kl_loss_17": 196.67949371337892, + "kl_loss_3": 2915.5722045898438, + "kl_loss_6": 1795.9505493164063, + "learning_rate": 0.0009934662639222412, + "loss": 1425.4387, + "step": 610 + }, + { + "ce_loss_12": 3.2973732352256775, + "ce_loss_17": 3.084644055366516, + "ce_loss_23": 2.959382653236389, + "ce_loss_3": 4.439628386497498, + "ce_loss_6": 3.8573474287986755, + "epoch": 0.062, + "grad_norm": 828.0, + "kl_loss_12": 668.3834808349609, + "kl_loss_17": 226.08399047851563, + "kl_loss_3": 3057.9033325195314, + "kl_loss_6": 1894.1350402832031, + "learning_rate": 0.000993208114306486, + "loss": 1444.5806, + "step": 620 + }, + { + "ce_loss_12": 3.211961007118225, + "ce_loss_17": 3.0096256613731383, + "ce_loss_23": 2.88716698884964, + "ce_loss_3": 4.374472832679748, + "ce_loss_6": 3.7805599689483644, + "epoch": 0.063, + "grad_norm": 752.0, + "kl_loss_12": 649.2993743896484, + "kl_loss_17": 221.76539993286133, + "kl_loss_3": 3058.0645141601562, + "kl_loss_6": 1874.6616943359375, + "learning_rate": 0.0009929449980904952, + "loss": 1417.4091, + "step": 630 + }, + { + "ce_loss_12": 3.2723576068878173, + "ce_loss_17": 3.0581412434577944, + "ce_loss_23": 2.9464841961860655, + "ce_loss_3": 4.395217108726501, + "ce_loss_6": 3.811108076572418, + "epoch": 0.064, + "grad_norm": 808.0, + "kl_loss_12": 639.2544738769532, + "kl_loss_17": 227.6035385131836, + "kl_loss_3": 3003.7718872070313, + "kl_loss_6": 1840.5796264648438, + "learning_rate": 0.0009926769179238466, + "loss": 1412.6059, + "step": 640 + }, + { + "ce_loss_12": 3.3222896218299867, + "ce_loss_17": 3.108461344242096, + "ce_loss_23": 2.9796595931053163, + "ce_loss_3": 4.43104920387268, + "ce_loss_6": 3.864783561229706, + "epoch": 0.065, + "grad_norm": 716.0, + "kl_loss_12": 667.6861999511718, + "kl_loss_17": 230.67943572998047, + "kl_loss_3": 3005.815673828125, + "kl_loss_6": 1862.8342651367188, + "learning_rate": 0.000992403876506104, + "loss": 1427.4293, + "step": 650 + }, + { + "ce_loss_12": 3.2592855095863342, + "ce_loss_17": 3.032492387294769, + "ce_loss_23": 2.921385622024536, + "ce_loss_3": 4.381369090080261, + "ce_loss_6": 3.7872144103050234, + "epoch": 0.066, + "grad_norm": 688.0, + "kl_loss_12": 671.0291412353515, + "kl_loss_17": 206.38212890625, + "kl_loss_3": 2998.441552734375, + "kl_loss_6": 1809.7071350097656, + "learning_rate": 0.0009921258765867918, + "loss": 1421.205, + "step": 660 + }, + { + "ce_loss_12": 3.2370525360107423, + "ce_loss_17": 3.00315066576004, + "ce_loss_23": 2.895841455459595, + "ce_loss_3": 4.379059171676635, + "ce_loss_6": 3.783574569225311, + "epoch": 0.067, + "grad_norm": 832.0, + "kl_loss_12": 679.0885528564453, + "kl_loss_17": 194.57723999023438, + "kl_loss_3": 3073.783825683594, + "kl_loss_6": 1871.8182250976563, + "learning_rate": 0.0009918429209653662, + "loss": 1431.0873, + "step": 670 + }, + { + "ce_loss_12": 3.299939227104187, + "ce_loss_17": 3.046256399154663, + "ce_loss_23": 2.94521769285202, + "ce_loss_3": 4.396662664413452, + "ce_loss_6": 3.8042745709419252, + "epoch": 0.068, + "grad_norm": 700.0, + "kl_loss_12": 693.7759826660156, + "kl_loss_17": 188.6014419555664, + "kl_loss_3": 3028.060095214844, + "kl_loss_6": 1840.6216918945313, + "learning_rate": 0.0009915550124911866, + "loss": 1401.4508, + "step": 680 + }, + { + "ce_loss_12": 3.28766872882843, + "ce_loss_17": 3.0496862173080443, + "ce_loss_23": 2.9469610452651978, + "ce_loss_3": 4.373085260391235, + "ce_loss_6": 3.796851623058319, + "epoch": 0.069, + "grad_norm": 808.0, + "kl_loss_12": 666.584228515625, + "kl_loss_17": 185.1880355834961, + "kl_loss_3": 2915.1931274414064, + "kl_loss_6": 1776.2470642089843, + "learning_rate": 0.0009912621540634887, + "loss": 1391.1077, + "step": 690 + }, + { + "ce_loss_12": 3.3050572991371157, + "ce_loss_17": 3.0859864115715028, + "ce_loss_23": 2.9898800373077394, + "ce_loss_3": 4.364710736274719, + "ce_loss_6": 3.7870045065879823, + "epoch": 0.07, + "grad_norm": 920.0, + "kl_loss_12": 627.817578125, + "kl_loss_17": 173.69211196899414, + "kl_loss_3": 2867.003369140625, + "kl_loss_6": 1710.6666137695313, + "learning_rate": 0.0009909643486313534, + "loss": 1356.2304, + "step": 700 + }, + { + "ce_loss_12": 3.207731914520264, + "ce_loss_17": 2.9801932334899903, + "ce_loss_23": 2.8819775700569155, + "ce_loss_3": 4.3406788349151615, + "ce_loss_6": 3.742799186706543, + "epoch": 0.071, + "grad_norm": 744.0, + "kl_loss_12": 636.164306640625, + "kl_loss_17": 175.40868377685547, + "kl_loss_3": 3009.928515625, + "kl_loss_6": 1818.6133911132813, + "learning_rate": 0.000990661599193678, + "loss": 1430.1488, + "step": 710 + }, + { + "ce_loss_12": 3.2974260449409485, + "ce_loss_17": 3.0879198789596556, + "ce_loss_23": 2.991551196575165, + "ce_loss_3": 4.391224360466003, + "ce_loss_6": 3.8298116087913514, + "epoch": 0.072, + "grad_norm": 936.0, + "kl_loss_12": 606.6379913330078, + "kl_loss_17": 175.6688980102539, + "kl_loss_3": 2895.5029418945314, + "kl_loss_6": 1770.1393676757812, + "learning_rate": 0.0009903539087991462, + "loss": 1362.8802, + "step": 720 + }, + { + "ce_loss_12": 3.277816581726074, + "ce_loss_17": 3.0691084623336793, + "ce_loss_23": 2.974670672416687, + "ce_loss_3": 4.373213529586792, + "ce_loss_6": 3.8061108589172363, + "epoch": 0.073, + "grad_norm": 688.0, + "kl_loss_12": 607.6536895751954, + "kl_loss_17": 177.0436553955078, + "kl_loss_3": 2902.8962158203126, + "kl_loss_6": 1772.4432495117187, + "learning_rate": 0.0009900412805461966, + "loss": 1368.357, + "step": 730 + }, + { + "ce_loss_12": 3.3488402724266053, + "ce_loss_17": 3.1375231623649595, + "ce_loss_23": 3.03947719335556, + "ce_loss_3": 4.412138867378235, + "ce_loss_6": 3.8542447447776795, + "epoch": 0.074, + "grad_norm": 732.0, + "kl_loss_12": 595.930207824707, + "kl_loss_17": 170.8621597290039, + "kl_loss_3": 2857.8486572265624, + "kl_loss_6": 1720.0050415039063, + "learning_rate": 0.0009897237175829927, + "loss": 1358.9611, + "step": 740 + }, + { + "ce_loss_12": 3.242159843444824, + "ce_loss_17": 3.0247748136520385, + "ce_loss_23": 2.9276575922966, + "ce_loss_3": 4.355337357521057, + "ce_loss_6": 3.778389298915863, + "epoch": 0.075, + "grad_norm": 788.0, + "kl_loss_12": 619.531216430664, + "kl_loss_17": 176.01265029907228, + "kl_loss_3": 2965.2418701171873, + "kl_loss_6": 1809.105938720703, + "learning_rate": 0.0009894012231073895, + "loss": 1378.8805, + "step": 750 + }, + { + "ce_loss_12": 3.2793312668800354, + "ce_loss_17": 3.071231472492218, + "ce_loss_23": 2.9780132591724398, + "ce_loss_3": 4.380762314796447, + "ce_loss_6": 3.808058273792267, + "epoch": 0.076, + "grad_norm": 880.0, + "kl_loss_12": 596.9268798828125, + "kl_loss_17": 170.39969711303712, + "kl_loss_3": 2894.7660400390623, + "kl_loss_6": 1755.1642150878906, + "learning_rate": 0.0009890738003669028, + "loss": 1372.3736, + "step": 760 + }, + { + "ce_loss_12": 3.2576115489006043, + "ce_loss_17": 3.042385685443878, + "ce_loss_23": 2.9484377264976502, + "ce_loss_3": 4.388651013374329, + "ce_loss_6": 3.7962119698524477, + "epoch": 0.077, + "grad_norm": 1760.0, + "kl_loss_12": 609.9247650146484, + "kl_loss_17": 170.56164016723633, + "kl_loss_3": 2988.757531738281, + "kl_loss_6": 1815.9149169921875, + "learning_rate": 0.0009887414526586764, + "loss": 1355.8361, + "step": 770 + }, + { + "ce_loss_12": 3.298181438446045, + "ce_loss_17": 3.0905930161476136, + "ce_loss_23": 2.998029613494873, + "ce_loss_3": 4.381036710739136, + "ce_loss_6": 3.8145140647888183, + "epoch": 0.078, + "grad_norm": 760.0, + "kl_loss_12": 582.8910247802735, + "kl_loss_17": 163.08836669921874, + "kl_loss_3": 2856.5729858398436, + "kl_loss_6": 1727.3598388671876, + "learning_rate": 0.0009884041833294476, + "loss": 1319.3967, + "step": 780 + }, + { + "ce_loss_12": 3.3041907072067263, + "ce_loss_17": 3.0935172557830812, + "ce_loss_23": 3.0023789405822754, + "ce_loss_3": 4.366575372219086, + "ce_loss_6": 3.803239977359772, + "epoch": 0.079, + "grad_norm": 740.0, + "kl_loss_12": 592.9966247558593, + "kl_loss_17": 163.48230056762696, + "kl_loss_3": 2841.9972534179688, + "kl_loss_6": 1702.5080444335938, + "learning_rate": 0.000988061995775515, + "loss": 1359.5953, + "step": 790 + }, + { + "ce_loss_12": 3.2399455189704893, + "ce_loss_17": 3.037798821926117, + "ce_loss_23": 2.9372796416282654, + "ce_loss_3": 4.303165769577026, + "ce_loss_6": 3.750383937358856, + "epoch": 0.08, + "grad_norm": 836.0, + "kl_loss_12": 608.0829467773438, + "kl_loss_17": 181.16675491333007, + "kl_loss_3": 2844.9753662109374, + "kl_loss_6": 1732.1241088867187, + "learning_rate": 0.0009877148934427035, + "loss": 1341.4348, + "step": 800 + }, + { + "ce_loss_12": 3.2861321926116944, + "ce_loss_17": 3.0791449308395387, + "ce_loss_23": 2.976716470718384, + "ce_loss_3": 4.362315344810486, + "ce_loss_6": 3.7868611097335814, + "epoch": 0.081, + "grad_norm": 1080.0, + "kl_loss_12": 604.6169494628906, + "kl_loss_17": 182.59001693725585, + "kl_loss_3": 2893.0469970703125, + "kl_loss_6": 1735.8979614257812, + "learning_rate": 0.0009873628798263297, + "loss": 1338.7411, + "step": 810 + }, + { + "ce_loss_12": 3.2293206453323364, + "ce_loss_17": 3.034642231464386, + "ce_loss_23": 2.939870834350586, + "ce_loss_3": 4.28888818025589, + "ce_loss_6": 3.739218699932098, + "epoch": 0.082, + "grad_norm": 792.0, + "kl_loss_12": 585.2830001831055, + "kl_loss_17": 175.42455139160157, + "kl_loss_3": 2814.965295410156, + "kl_loss_6": 1707.1840454101562, + "learning_rate": 0.0009870059584711668, + "loss": 1357.3789, + "step": 820 + }, + { + "ce_loss_12": 3.2427746772766115, + "ce_loss_17": 3.0388410449028016, + "ce_loss_23": 2.9463847517967223, + "ce_loss_3": 4.313164341449737, + "ce_loss_6": 3.7601605892181396, + "epoch": 0.083, + "grad_norm": 784.0, + "kl_loss_12": 577.6069213867188, + "kl_loss_17": 166.45866165161132, + "kl_loss_3": 2826.7182006835938, + "kl_loss_6": 1720.5235717773437, + "learning_rate": 0.000986644132971409, + "loss": 1323.6652, + "step": 830 + }, + { + "ce_loss_12": 3.2438090324401854, + "ce_loss_17": 3.0304963111877443, + "ce_loss_23": 2.932339680194855, + "ce_loss_3": 4.340520238876342, + "ce_loss_6": 3.7706413149833677, + "epoch": 0.084, + "grad_norm": 852.0, + "kl_loss_12": 600.9248870849609, + "kl_loss_17": 173.94204864501953, + "kl_loss_3": 2898.236535644531, + "kl_loss_6": 1753.0972961425782, + "learning_rate": 0.0009862774069706345, + "loss": 1342.0187, + "step": 840 + }, + { + "ce_loss_12": 3.3441239953041078, + "ce_loss_17": 3.1514394879341125, + "ce_loss_23": 3.0594014644622805, + "ce_loss_3": 4.38064968585968, + "ce_loss_6": 3.8304377436637878, + "epoch": 0.085, + "grad_norm": 756.0, + "kl_loss_12": 577.1782684326172, + "kl_loss_17": 166.6529998779297, + "kl_loss_3": 2765.473034667969, + "kl_loss_6": 1668.087255859375, + "learning_rate": 0.000985905784161771, + "loss": 1310.3972, + "step": 850 + }, + { + "ce_loss_12": 3.2722967624664308, + "ce_loss_17": 3.0797245502471924, + "ce_loss_23": 2.988293194770813, + "ce_loss_3": 4.321183085441589, + "ce_loss_6": 3.767487609386444, + "epoch": 0.086, + "grad_norm": 688.0, + "kl_loss_12": 565.3048660278321, + "kl_loss_17": 163.8540298461914, + "kl_loss_3": 2786.437707519531, + "kl_loss_6": 1674.3542053222657, + "learning_rate": 0.000985529268287055, + "loss": 1295.72, + "step": 860 + }, + { + "ce_loss_12": 3.210437059402466, + "ce_loss_17": 3.014290964603424, + "ce_loss_23": 2.920425224304199, + "ce_loss_3": 4.306206333637237, + "ce_loss_6": 3.734451103210449, + "epoch": 0.087, + "grad_norm": 1020.0, + "kl_loss_12": 582.5995712280273, + "kl_loss_17": 169.39782028198243, + "kl_loss_3": 2866.389929199219, + "kl_loss_6": 1722.0148559570312, + "learning_rate": 0.0009851478631379982, + "loss": 1335.3591, + "step": 870 + }, + { + "ce_loss_12": 3.2780421018600463, + "ce_loss_17": 3.0773184299468994, + "ce_loss_23": 2.981848442554474, + "ce_loss_3": 4.3347776412963865, + "ce_loss_6": 3.7697909474372864, + "epoch": 0.088, + "grad_norm": 880.0, + "kl_loss_12": 581.4047927856445, + "kl_loss_17": 169.22406311035155, + "kl_loss_3": 2813.3952758789064, + "kl_loss_6": 1681.3613159179688, + "learning_rate": 0.0009847615725553456, + "loss": 1317.7922, + "step": 880 + }, + { + "ce_loss_12": 3.308706593513489, + "ce_loss_17": 3.1193915247917174, + "ce_loss_23": 3.0320719122886657, + "ce_loss_3": 4.325762486457824, + "ce_loss_6": 3.7881689071655273, + "epoch": 0.089, + "grad_norm": 728.0, + "kl_loss_12": 551.782502746582, + "kl_loss_17": 155.5468780517578, + "kl_loss_3": 2698.84833984375, + "kl_loss_6": 1623.6959045410156, + "learning_rate": 0.0009843704004290394, + "loss": 1306.5696, + "step": 890 + }, + { + "ce_loss_12": 3.2304977893829347, + "ce_loss_17": 3.029300880432129, + "ce_loss_23": 2.9373438477516176, + "ce_loss_3": 4.290966820716858, + "ce_loss_6": 3.7355265498161314, + "epoch": 0.09, + "grad_norm": 652.0, + "kl_loss_12": 590.5244323730469, + "kl_loss_17": 170.3102066040039, + "kl_loss_3": 2834.1925659179688, + "kl_loss_6": 1716.8873474121094, + "learning_rate": 0.0009839743506981783, + "loss": 1320.3553, + "step": 900 + }, + { + "ce_loss_12": 3.172462725639343, + "ce_loss_17": 2.96781644821167, + "ce_loss_23": 2.8684237480163572, + "ce_loss_3": 4.273809564113617, + "ce_loss_6": 3.6973857641220094, + "epoch": 0.091, + "grad_norm": 808.0, + "kl_loss_12": 602.5951446533203, + "kl_loss_17": 176.92565536499023, + "kl_loss_3": 2918.4544311523437, + "kl_loss_6": 1768.0981994628905, + "learning_rate": 0.0009835734273509786, + "loss": 1341.5896, + "step": 910 + }, + { + "ce_loss_12": 3.2609114408493043, + "ce_loss_17": 3.0531147003173826, + "ce_loss_23": 2.9561579704284666, + "ce_loss_3": 4.325370657444, + "ce_loss_6": 3.763854217529297, + "epoch": 0.092, + "grad_norm": 856.0, + "kl_loss_12": 580.0848983764648, + "kl_loss_17": 170.60181579589843, + "kl_loss_3": 2800.7559448242187, + "kl_loss_6": 1674.3670715332032, + "learning_rate": 0.0009831676344247342, + "loss": 1309.1984, + "step": 920 + }, + { + "ce_loss_12": 3.25858074426651, + "ce_loss_17": 3.066873002052307, + "ce_loss_23": 2.98137309551239, + "ce_loss_3": 4.282044243812561, + "ce_loss_6": 3.7491400599479676, + "epoch": 0.093, + "grad_norm": 892.0, + "kl_loss_12": 545.7637374877929, + "kl_loss_17": 160.86735000610352, + "kl_loss_3": 2735.9965576171876, + "kl_loss_6": 1650.1052307128907, + "learning_rate": 0.0009827569760057755, + "loss": 1291.6329, + "step": 930 + }, + { + "ce_loss_12": 3.2043312907218935, + "ce_loss_17": 2.993998634815216, + "ce_loss_23": 2.8952491641044618, + "ce_loss_3": 4.315342664718628, + "ce_loss_6": 3.740140736103058, + "epoch": 0.094, + "grad_norm": 1000.0, + "kl_loss_12": 590.2115966796875, + "kl_loss_17": 173.1332695007324, + "kl_loss_3": 2927.032568359375, + "kl_loss_6": 1768.2796447753906, + "learning_rate": 0.000982341456229428, + "loss": 1320.8015, + "step": 940 + }, + { + "ce_loss_12": 3.2777076601982116, + "ce_loss_17": 3.0761674523353575, + "ce_loss_23": 2.986429727077484, + "ce_loss_3": 4.346337795257568, + "ce_loss_6": 3.786615800857544, + "epoch": 0.095, + "grad_norm": 748.0, + "kl_loss_12": 579.9668441772461, + "kl_loss_17": 164.05571441650392, + "kl_loss_3": 2845.5396240234377, + "kl_loss_6": 1712.9433532714843, + "learning_rate": 0.000981921079279971, + "loss": 1292.4729, + "step": 950 + }, + { + "ce_loss_12": 3.2670510172843934, + "ce_loss_17": 3.0814021468162536, + "ce_loss_23": 2.997356629371643, + "ce_loss_3": 4.266339981555939, + "ce_loss_6": 3.732987666130066, + "epoch": 0.096, + "grad_norm": 716.0, + "kl_loss_12": 551.3224319458008, + "kl_loss_17": 157.1545051574707, + "kl_loss_3": 2686.10380859375, + "kl_loss_6": 1609.8591064453126, + "learning_rate": 0.0009814958493905962, + "loss": 1267.0563, + "step": 960 + }, + { + "ce_loss_12": 3.243378257751465, + "ce_loss_17": 3.0477044582366943, + "ce_loss_23": 2.957053375244141, + "ce_loss_3": 4.320212554931641, + "ce_loss_6": 3.7490431308746337, + "epoch": 0.097, + "grad_norm": 772.0, + "kl_loss_12": 571.2222473144532, + "kl_loss_17": 167.3255302429199, + "kl_loss_3": 2832.0980590820313, + "kl_loss_6": 1694.2712707519531, + "learning_rate": 0.0009810657708433637, + "loss": 1335.8684, + "step": 970 + }, + { + "ce_loss_12": 3.314019334316254, + "ce_loss_17": 3.123671591281891, + "ce_loss_23": 3.034430181980133, + "ce_loss_3": 4.311093711853028, + "ce_loss_6": 3.77311372756958, + "epoch": 0.098, + "grad_norm": 820.0, + "kl_loss_12": 552.7862976074218, + "kl_loss_17": 168.19173049926758, + "kl_loss_3": 2656.2674072265627, + "kl_loss_6": 1591.3033081054687, + "learning_rate": 0.0009806308479691594, + "loss": 1254.4201, + "step": 980 + }, + { + "ce_loss_12": 3.3261463403701783, + "ce_loss_17": 3.1423306703567504, + "ce_loss_23": 3.032912051677704, + "ce_loss_3": 4.357937359809876, + "ce_loss_6": 3.80484082698822, + "epoch": 0.099, + "grad_norm": 804.0, + "kl_loss_12": 580.5806060791016, + "kl_loss_17": 184.8185920715332, + "kl_loss_3": 2754.346044921875, + "kl_loss_6": 1644.7821472167968, + "learning_rate": 0.0009801910851476522, + "loss": 1282.2547, + "step": 990 + }, + { + "ce_loss_12": 3.2512516736984254, + "ce_loss_17": 3.0605135440826414, + "ce_loss_23": 2.9637475967407227, + "ce_loss_3": 4.329551267623901, + "ce_loss_6": 3.743285083770752, + "epoch": 0.1, + "grad_norm": 748.0, + "kl_loss_12": 582.9849090576172, + "kl_loss_17": 174.53399505615235, + "kl_loss_3": 2859.1509521484377, + "kl_loss_6": 1687.5102966308593, + "learning_rate": 0.0009797464868072487, + "loss": 1294.724, + "step": 1000 + }, + { + "ce_loss_12": 3.2312548875808718, + "ce_loss_17": 3.0431445121765135, + "ce_loss_23": 2.9497431397438048, + "ce_loss_3": 4.286968004703522, + "ce_loss_6": 3.7301648378372194, + "epoch": 0.101, + "grad_norm": 780.0, + "kl_loss_12": 578.5622329711914, + "kl_loss_17": 171.74994735717775, + "kl_loss_3": 2811.974072265625, + "kl_loss_6": 1686.0221252441406, + "learning_rate": 0.0009792970574250492, + "loss": 1304.7269, + "step": 1010 + }, + { + "ce_loss_12": 3.2564730048179626, + "ce_loss_17": 3.0567847013473513, + "ce_loss_23": 2.968216300010681, + "ce_loss_3": 4.288906526565552, + "ce_loss_6": 3.7319556951522825, + "epoch": 0.102, + "grad_norm": 924.0, + "kl_loss_12": 564.9530090332031, + "kl_loss_17": 164.64181747436524, + "kl_loss_3": 2745.9426513671874, + "kl_loss_6": 1627.5541931152343, + "learning_rate": 0.0009788428015268028, + "loss": 1261.1484, + "step": 1020 + }, + { + "ce_loss_12": 3.261544132232666, + "ce_loss_17": 3.06712828874588, + "ce_loss_23": 2.9740715503692625, + "ce_loss_3": 4.265662479400635, + "ce_loss_6": 3.726946938037872, + "epoch": 0.103, + "grad_norm": 788.0, + "kl_loss_12": 576.264567565918, + "kl_loss_17": 166.91285705566406, + "kl_loss_3": 2714.088330078125, + "kl_loss_6": 1614.7607482910157, + "learning_rate": 0.0009783837236868609, + "loss": 1267.8549, + "step": 1030 + }, + { + "ce_loss_12": 3.2258948802948, + "ce_loss_17": 3.0288470864295958, + "ce_loss_23": 2.936104118824005, + "ce_loss_3": 4.249889886379242, + "ce_loss_6": 3.698464000225067, + "epoch": 0.104, + "grad_norm": 760.0, + "kl_loss_12": 571.986279296875, + "kl_loss_17": 165.31406631469727, + "kl_loss_3": 2715.9388916015623, + "kl_loss_6": 1618.140185546875, + "learning_rate": 0.0009779198285281327, + "loss": 1259.2237, + "step": 1040 + }, + { + "ce_loss_12": 3.2193004369735716, + "ce_loss_17": 3.0228930950164794, + "ce_loss_23": 2.9384063124656676, + "ce_loss_3": 4.265962791442871, + "ce_loss_6": 3.7088080763816835, + "epoch": 0.105, + "grad_norm": 752.0, + "kl_loss_12": 558.9580703735352, + "kl_loss_17": 154.56547775268555, + "kl_loss_3": 2765.327038574219, + "kl_loss_6": 1636.0435363769532, + "learning_rate": 0.0009774511207220368, + "loss": 1274.2527, + "step": 1050 + }, + { + "ce_loss_12": 3.2610196471214294, + "ce_loss_17": 3.0684242367744448, + "ce_loss_23": 2.978561782836914, + "ce_loss_3": 4.308524203300476, + "ce_loss_6": 3.7470775604248048, + "epoch": 0.106, + "grad_norm": 880.0, + "kl_loss_12": 561.0937957763672, + "kl_loss_17": 159.2811538696289, + "kl_loss_3": 2772.8382202148437, + "kl_loss_6": 1638.9705444335937, + "learning_rate": 0.0009769776049884564, + "loss": 1274.5569, + "step": 1060 + }, + { + "ce_loss_12": 3.1786951303482054, + "ce_loss_17": 2.9803019642829893, + "ce_loss_23": 2.8939341425895693, + "ce_loss_3": 4.241933369636536, + "ce_loss_6": 3.679634761810303, + "epoch": 0.107, + "grad_norm": 940.0, + "kl_loss_12": 566.5046829223633, + "kl_loss_17": 157.10613098144532, + "kl_loss_3": 2836.979296875, + "kl_loss_6": 1695.5368774414062, + "learning_rate": 0.0009764992860956889, + "loss": 1316.3168, + "step": 1070 + }, + { + "ce_loss_12": 3.296342170238495, + "ce_loss_17": 3.116221308708191, + "ce_loss_23": 3.0379060387611387, + "ce_loss_3": 4.2762956619262695, + "ce_loss_6": 3.746916079521179, + "epoch": 0.108, + "grad_norm": 800.0, + "kl_loss_12": 527.0881530761719, + "kl_loss_17": 147.49216232299804, + "kl_loss_3": 2606.6685546875, + "kl_loss_6": 1547.9802795410155, + "learning_rate": 0.0009760161688604008, + "loss": 1232.1656, + "step": 1080 + }, + { + "ce_loss_12": 3.315100681781769, + "ce_loss_17": 3.1226739048957826, + "ce_loss_23": 3.03648464679718, + "ce_loss_3": 4.320034456253052, + "ce_loss_6": 3.788403558731079, + "epoch": 0.109, + "grad_norm": 760.0, + "kl_loss_12": 553.3834716796875, + "kl_loss_17": 152.19630699157716, + "kl_loss_3": 2681.25322265625, + "kl_loss_6": 1610.068292236328, + "learning_rate": 0.0009755282581475768, + "loss": 1261.8289, + "step": 1090 + }, + { + "ce_loss_12": 3.3554514765739443, + "ce_loss_17": 3.1595420360565187, + "ce_loss_23": 3.0734885215759276, + "ce_loss_3": 4.345818209648132, + "ce_loss_6": 3.8123062372207643, + "epoch": 0.11, + "grad_norm": 744.0, + "kl_loss_12": 568.856837463379, + "kl_loss_17": 159.01618118286132, + "kl_loss_3": 2657.6038818359375, + "kl_loss_6": 1583.0754455566407, + "learning_rate": 0.0009750355588704727, + "loss": 1234.9309, + "step": 1100 + }, + { + "ce_loss_12": 3.2044736742973328, + "ce_loss_17": 3.0105460405349733, + "ce_loss_23": 2.9257432579994203, + "ce_loss_3": 4.223796212673188, + "ce_loss_6": 3.675619602203369, + "epoch": 0.111, + "grad_norm": 796.0, + "kl_loss_12": 555.5249206542969, + "kl_loss_17": 152.13012924194337, + "kl_loss_3": 2700.157360839844, + "kl_loss_6": 1599.4874572753906, + "learning_rate": 0.0009745380759905647, + "loss": 1272.1023, + "step": 1110 + }, + { + "ce_loss_12": 3.165294623374939, + "ce_loss_17": 2.971400058269501, + "ce_loss_23": 2.884852170944214, + "ce_loss_3": 4.2120347142219545, + "ce_loss_6": 3.659716248512268, + "epoch": 0.112, + "grad_norm": 768.0, + "kl_loss_12": 552.1841415405273, + "kl_loss_17": 155.7645294189453, + "kl_loss_3": 2763.0667358398437, + "kl_loss_6": 1642.3040100097655, + "learning_rate": 0.0009740358145174998, + "loss": 1311.7117, + "step": 1120 + }, + { + "ce_loss_12": 3.2912906646728515, + "ce_loss_17": 3.1120285749435426, + "ce_loss_23": 3.0224124908447267, + "ce_loss_3": 4.2669930934906, + "ce_loss_6": 3.743947458267212, + "epoch": 0.113, + "grad_norm": 724.0, + "kl_loss_12": 537.3440704345703, + "kl_loss_17": 165.69821243286134, + "kl_loss_3": 2629.3704833984375, + "kl_loss_6": 1580.140069580078, + "learning_rate": 0.0009735287795090455, + "loss": 1245.9326, + "step": 1130 + }, + { + "ce_loss_12": 3.199613857269287, + "ce_loss_17": 3.0266300678253173, + "ce_loss_23": 2.9283379316329956, + "ce_loss_3": 4.228734850883484, + "ce_loss_6": 3.688489925861359, + "epoch": 0.114, + "grad_norm": 852.0, + "kl_loss_12": 534.8260620117187, + "kl_loss_17": 182.36328735351563, + "kl_loss_3": 2714.242395019531, + "kl_loss_6": 1629.0295959472655, + "learning_rate": 0.0009730169760710386, + "loss": 1258.2001, + "step": 1140 + }, + { + "ce_loss_12": 3.261527454853058, + "ce_loss_17": 3.089632534980774, + "ce_loss_23": 2.9942903399467466, + "ce_loss_3": 4.275948905944825, + "ce_loss_6": 3.733642745018005, + "epoch": 0.115, + "grad_norm": 1032.0, + "kl_loss_12": 527.5316513061523, + "kl_loss_17": 171.56800003051757, + "kl_loss_3": 2639.7792846679686, + "kl_loss_6": 1569.7343627929688, + "learning_rate": 0.0009725004093573342, + "loss": 1245.8943, + "step": 1150 + }, + { + "ce_loss_12": 3.22711728811264, + "ce_loss_17": 3.045984649658203, + "ce_loss_23": 2.946725940704346, + "ce_loss_3": 4.235481929779053, + "ce_loss_6": 3.7019497632980345, + "epoch": 0.116, + "grad_norm": 792.0, + "kl_loss_12": 534.4117095947265, + "kl_loss_17": 173.247119140625, + "kl_loss_3": 2655.5172485351563, + "kl_loss_6": 1577.8178833007812, + "learning_rate": 0.0009719790845697534, + "loss": 1229.9186, + "step": 1160 + }, + { + "ce_loss_12": 3.1724898338317873, + "ce_loss_17": 2.9979549288749694, + "ce_loss_23": 2.912010908126831, + "ce_loss_3": 4.142260408401489, + "ce_loss_6": 3.6257847785949706, + "epoch": 0.117, + "grad_norm": 996.0, + "kl_loss_12": 512.0660125732422, + "kl_loss_17": 154.08805084228516, + "kl_loss_3": 2589.680029296875, + "kl_loss_6": 1544.665350341797, + "learning_rate": 0.0009714530069580309, + "loss": 1208.7752, + "step": 1170 + }, + { + "ce_loss_12": 3.2630844950675963, + "ce_loss_17": 3.078612267971039, + "ce_loss_23": 2.991488981246948, + "ce_loss_3": 4.273677432537079, + "ce_loss_6": 3.739665389060974, + "epoch": 0.118, + "grad_norm": 916.0, + "kl_loss_12": 547.9355621337891, + "kl_loss_17": 164.50029220581055, + "kl_loss_3": 2683.0645141601562, + "kl_loss_6": 1613.4322387695313, + "learning_rate": 0.0009709221818197624, + "loss": 1239.5195, + "step": 1180 + }, + { + "ce_loss_12": 3.306522285938263, + "ce_loss_17": 3.119382691383362, + "ce_loss_23": 3.0303967595100403, + "ce_loss_3": 4.328462171554565, + "ce_loss_6": 3.783062732219696, + "epoch": 0.119, + "grad_norm": 1360.0, + "kl_loss_12": 539.4908432006836, + "kl_loss_17": 154.80245361328124, + "kl_loss_3": 2709.159716796875, + "kl_loss_6": 1613.389471435547, + "learning_rate": 0.0009703866145003512, + "loss": 1249.8813, + "step": 1190 + }, + { + "ce_loss_12": 3.2699382424354555, + "ce_loss_17": 3.0863998413085936, + "ce_loss_23": 3.0056095004081724, + "ce_loss_3": 4.255171620845795, + "ce_loss_6": 3.7445420384407044, + "epoch": 0.12, + "grad_norm": 916.0, + "kl_loss_12": 530.1376663208008, + "kl_loss_17": 145.03771438598633, + "kl_loss_3": 2644.3298095703126, + "kl_loss_6": 1602.0392944335938, + "learning_rate": 0.0009698463103929542, + "loss": 1255.4442, + "step": 1200 + }, + { + "ce_loss_12": 3.2317134141921997, + "ce_loss_17": 3.0455020546913145, + "ce_loss_23": 2.96392343044281, + "ce_loss_3": 4.257777070999145, + "ce_loss_6": 3.7172797322273254, + "epoch": 0.121, + "grad_norm": 652.0, + "kl_loss_12": 538.7726196289062, + "kl_loss_17": 150.16256446838378, + "kl_loss_3": 2692.4408935546876, + "kl_loss_6": 1617.0947509765624, + "learning_rate": 0.0009693012749384279, + "loss": 1256.565, + "step": 1210 + }, + { + "ce_loss_12": 3.2393490433692933, + "ce_loss_17": 3.055326855182648, + "ce_loss_23": 2.9717833280563353, + "ce_loss_3": 4.23775063753128, + "ce_loss_6": 3.708564019203186, + "epoch": 0.122, + "grad_norm": 812.0, + "kl_loss_12": 533.0684432983398, + "kl_loss_17": 148.2549591064453, + "kl_loss_3": 2648.3533569335937, + "kl_loss_6": 1581.6532165527344, + "learning_rate": 0.0009687515136252732, + "loss": 1221.4145, + "step": 1220 + }, + { + "ce_loss_12": 3.204907751083374, + "ce_loss_17": 3.0170387744903566, + "ce_loss_23": 2.9366545915603637, + "ce_loss_3": 4.259589290618896, + "ce_loss_6": 3.697655403614044, + "epoch": 0.123, + "grad_norm": 836.0, + "kl_loss_12": 535.4695510864258, + "kl_loss_17": 145.64403610229493, + "kl_loss_3": 2768.4408935546876, + "kl_loss_6": 1633.621044921875, + "learning_rate": 0.0009681970319895803, + "loss": 1285.5104, + "step": 1230 + }, + { + "ce_loss_12": 3.283095097541809, + "ce_loss_17": 3.0973464846611023, + "ce_loss_23": 3.020170819759369, + "ce_loss_3": 4.27948043346405, + "ce_loss_6": 3.7493759989738464, + "epoch": 0.124, + "grad_norm": 724.0, + "kl_loss_12": 532.69248046875, + "kl_loss_17": 142.17120971679688, + "kl_loss_3": 2643.1325561523436, + "kl_loss_6": 1572.2886047363281, + "learning_rate": 0.0009676378356149733, + "loss": 1218.1225, + "step": 1240 + }, + { + "ce_loss_12": 3.2427470684051514, + "ce_loss_17": 3.0617971658706664, + "ce_loss_23": 2.986854386329651, + "ce_loss_3": 4.217474734783172, + "ce_loss_6": 3.691105401515961, + "epoch": 0.125, + "grad_norm": 1032.0, + "kl_loss_12": 514.074739074707, + "kl_loss_17": 137.4885368347168, + "kl_loss_3": 2594.8048583984373, + "kl_loss_6": 1531.531268310547, + "learning_rate": 0.0009670739301325534, + "loss": 1206.076, + "step": 1250 + }, + { + "ce_loss_12": 3.2147034883499144, + "ce_loss_17": 3.027522599697113, + "ce_loss_23": 2.9497153043746946, + "ce_loss_3": 4.191295731067657, + "ce_loss_6": 3.679839813709259, + "epoch": 0.126, + "grad_norm": 720.0, + "kl_loss_12": 527.9758728027343, + "kl_loss_17": 144.51774787902832, + "kl_loss_3": 2607.5860107421877, + "kl_loss_6": 1569.477960205078, + "learning_rate": 0.0009665053212208426, + "loss": 1229.3042, + "step": 1260 + }, + { + "ce_loss_12": 3.246464729309082, + "ce_loss_17": 3.0629101634025573, + "ce_loss_23": 2.9833993911743164, + "ce_loss_3": 4.251738870143891, + "ce_loss_6": 3.7133025169372558, + "epoch": 0.127, + "grad_norm": 864.0, + "kl_loss_12": 533.1231460571289, + "kl_loss_17": 148.98831024169922, + "kl_loss_3": 2664.428918457031, + "kl_loss_6": 1587.7116638183593, + "learning_rate": 0.0009659320146057262, + "loss": 1228.6906, + "step": 1270 + }, + { + "ce_loss_12": 3.2591238498687742, + "ce_loss_17": 3.0797009110450744, + "ce_loss_23": 2.998037600517273, + "ce_loss_3": 4.238789916038513, + "ce_loss_6": 3.7171790480613707, + "epoch": 0.128, + "grad_norm": 992.0, + "kl_loss_12": 518.5297225952148, + "kl_loss_17": 148.35939559936523, + "kl_loss_3": 2606.920068359375, + "kl_loss_6": 1548.8865661621094, + "learning_rate": 0.0009653540160603955, + "loss": 1210.3549, + "step": 1280 + }, + { + "ce_loss_12": 3.2535346031188963, + "ce_loss_17": 3.084386110305786, + "ce_loss_23": 3.00300794839859, + "ce_loss_3": 4.23352427482605, + "ce_loss_6": 3.7167670369148254, + "epoch": 0.129, + "grad_norm": 852.0, + "kl_loss_12": 512.0488311767579, + "kl_loss_17": 148.47010040283203, + "kl_loss_3": 2611.6455810546877, + "kl_loss_6": 1550.3183898925781, + "learning_rate": 0.0009647713314052896, + "loss": 1197.2949, + "step": 1290 + }, + { + "ce_loss_12": 3.2205206751823425, + "ce_loss_17": 3.0326789259910583, + "ce_loss_23": 2.9471697568893434, + "ce_loss_3": 4.25264720916748, + "ce_loss_6": 3.7139246821403504, + "epoch": 0.13, + "grad_norm": 768.0, + "kl_loss_12": 535.0598526000977, + "kl_loss_17": 152.8258155822754, + "kl_loss_3": 2725.5468872070314, + "kl_loss_6": 1630.4482482910157, + "learning_rate": 0.0009641839665080363, + "loss": 1248.7602, + "step": 1300 + }, + { + "ce_loss_12": 3.184041130542755, + "ce_loss_17": 3.0028507471084596, + "ce_loss_23": 2.921945583820343, + "ce_loss_3": 4.199795937538147, + "ce_loss_6": 3.655711257457733, + "epoch": 0.131, + "grad_norm": 956.0, + "kl_loss_12": 509.8348114013672, + "kl_loss_17": 144.7429271697998, + "kl_loss_3": 2653.9624633789062, + "kl_loss_6": 1567.1138244628905, + "learning_rate": 0.0009635919272833937, + "loss": 1206.475, + "step": 1310 + }, + { + "ce_loss_12": 3.212551248073578, + "ce_loss_17": 3.026778721809387, + "ce_loss_23": 2.946667730808258, + "ce_loss_3": 4.211032819747925, + "ce_loss_6": 3.672435760498047, + "epoch": 0.132, + "grad_norm": 852.0, + "kl_loss_12": 520.5912582397461, + "kl_loss_17": 145.87120895385743, + "kl_loss_3": 2639.656628417969, + "kl_loss_6": 1553.8392272949218, + "learning_rate": 0.0009629952196931902, + "loss": 1193.8047, + "step": 1320 + }, + { + "ce_loss_12": 3.186399018764496, + "ce_loss_17": 3.0172454714775085, + "ce_loss_23": 2.9395448088645937, + "ce_loss_3": 4.195619773864746, + "ce_loss_6": 3.6446786284446717, + "epoch": 0.133, + "grad_norm": 952.0, + "kl_loss_12": 501.36443481445315, + "kl_loss_17": 144.396586227417, + "kl_loss_3": 2638.032116699219, + "kl_loss_6": 1531.3646911621095, + "learning_rate": 0.0009623938497462645, + "loss": 1198.4854, + "step": 1330 + }, + { + "ce_loss_12": 3.1858184933662415, + "ce_loss_17": 3.0126922845840456, + "ce_loss_23": 2.9323938727378844, + "ce_loss_3": 4.1889008522033695, + "ce_loss_6": 3.6592318177223206, + "epoch": 0.134, + "grad_norm": 924.0, + "kl_loss_12": 511.55785064697267, + "kl_loss_17": 145.21801109313964, + "kl_loss_3": 2631.326708984375, + "kl_loss_6": 1557.732861328125, + "learning_rate": 0.0009617878234984055, + "loss": 1217.4046, + "step": 1340 + }, + { + "ce_loss_12": 3.2636426448822022, + "ce_loss_17": 3.0926445841789247, + "ce_loss_23": 3.0167927503585816, + "ce_loss_3": 4.219664669036865, + "ce_loss_6": 3.698105537891388, + "epoch": 0.135, + "grad_norm": 916.0, + "kl_loss_12": 497.62325286865234, + "kl_loss_17": 138.0944309234619, + "kl_loss_3": 2526.13603515625, + "kl_loss_6": 1484.2228942871093, + "learning_rate": 0.0009611771470522907, + "loss": 1180.7393, + "step": 1350 + }, + { + "ce_loss_12": 3.2092088937759398, + "ce_loss_17": 3.0275454759597777, + "ce_loss_23": 2.9461109161376955, + "ce_loss_3": 4.19175910949707, + "ce_loss_6": 3.6639052033424377, + "epoch": 0.136, + "grad_norm": 856.0, + "kl_loss_12": 504.0848876953125, + "kl_loss_17": 143.1431613922119, + "kl_loss_3": 2579.740380859375, + "kl_loss_6": 1511.1883422851563, + "learning_rate": 0.0009605618265574251, + "loss": 1178.6502, + "step": 1360 + }, + { + "ce_loss_12": 3.182911384105682, + "ce_loss_17": 3.001040053367615, + "ce_loss_23": 2.9205902814865112, + "ce_loss_3": 4.194320964813232, + "ce_loss_6": 3.6565115213394166, + "epoch": 0.137, + "grad_norm": 928.0, + "kl_loss_12": 532.6784011840821, + "kl_loss_17": 147.71124191284179, + "kl_loss_3": 2669.027685546875, + "kl_loss_6": 1589.4699401855469, + "learning_rate": 0.0009599418682100792, + "loss": 1215.4029, + "step": 1370 + }, + { + "ce_loss_12": 3.2150518894195557, + "ce_loss_17": 3.035642147064209, + "ce_loss_23": 2.9590429425239564, + "ce_loss_3": 4.2062974095344545, + "ce_loss_6": 3.6585775136947634, + "epoch": 0.138, + "grad_norm": 888.0, + "kl_loss_12": 514.3810455322266, + "kl_loss_17": 138.8052433013916, + "kl_loss_3": 2626.3319091796875, + "kl_loss_6": 1532.4898864746094, + "learning_rate": 0.0009593172782532268, + "loss": 1202.498, + "step": 1380 + }, + { + "ce_loss_12": 3.2492050409317015, + "ce_loss_17": 3.0701080203056335, + "ce_loss_23": 2.9916153073310854, + "ce_loss_3": 4.2146389245986935, + "ce_loss_6": 3.693696665763855, + "epoch": 0.139, + "grad_norm": 792.0, + "kl_loss_12": 509.50305023193357, + "kl_loss_17": 140.96539154052735, + "kl_loss_3": 2552.8525512695314, + "kl_loss_6": 1512.970068359375, + "learning_rate": 0.0009586880629764817, + "loss": 1181.7294, + "step": 1390 + }, + { + "ce_loss_12": 3.1832247495651247, + "ce_loss_17": 3.0081828832626343, + "ce_loss_23": 2.9266636252403258, + "ce_loss_3": 4.176916980743409, + "ce_loss_6": 3.6512787103652955, + "epoch": 0.14, + "grad_norm": 772.0, + "kl_loss_12": 509.731379699707, + "kl_loss_17": 143.64842796325684, + "kl_loss_3": 2588.364465332031, + "kl_loss_6": 1531.91630859375, + "learning_rate": 0.0009580542287160348, + "loss": 1176.7396, + "step": 1400 + }, + { + "ce_loss_12": 3.144865083694458, + "ce_loss_17": 2.97108371257782, + "ce_loss_23": 2.891060936450958, + "ce_loss_3": 4.136532425880432, + "ce_loss_6": 3.6031069278717043, + "epoch": 0.141, + "grad_norm": 844.0, + "kl_loss_12": 505.5958480834961, + "kl_loss_17": 146.13398323059081, + "kl_loss_3": 2601.420520019531, + "kl_loss_6": 1520.9350463867188, + "learning_rate": 0.0009574157818545901, + "loss": 1178.8346, + "step": 1410 + }, + { + "ce_loss_12": 3.209973669052124, + "ce_loss_17": 3.0407453656196592, + "ce_loss_23": 2.963426387310028, + "ce_loss_3": 4.165764462947846, + "ce_loss_6": 3.6472143650054933, + "epoch": 0.142, + "grad_norm": 908.0, + "kl_loss_12": 491.97606353759767, + "kl_loss_17": 135.57668418884276, + "kl_loss_3": 2528.8818115234376, + "kl_loss_6": 1487.7853210449218, + "learning_rate": 0.0009567727288213005, + "loss": 1190.5035, + "step": 1420 + }, + { + "ce_loss_12": 3.197652304172516, + "ce_loss_17": 3.0179499268531798, + "ce_loss_23": 2.936987841129303, + "ce_loss_3": 4.188513946533203, + "ce_loss_6": 3.655670428276062, + "epoch": 0.143, + "grad_norm": 812.0, + "kl_loss_12": 511.9153167724609, + "kl_loss_17": 145.0369529724121, + "kl_loss_3": 2610.760852050781, + "kl_loss_6": 1542.4522644042968, + "learning_rate": 0.0009561250760917027, + "loss": 1185.8725, + "step": 1430 + }, + { + "ce_loss_12": 3.207899260520935, + "ce_loss_17": 3.0319709539413453, + "ce_loss_23": 2.9539456725120545, + "ce_loss_3": 4.179163765907288, + "ce_loss_6": 3.656933867931366, + "epoch": 0.144, + "grad_norm": 1064.0, + "kl_loss_12": 517.2545806884766, + "kl_loss_17": 146.9861888885498, + "kl_loss_3": 2600.315710449219, + "kl_loss_6": 1545.5985473632813, + "learning_rate": 0.0009554728301876525, + "loss": 1171.7954, + "step": 1440 + }, + { + "ce_loss_12": 3.2509345412254333, + "ce_loss_17": 3.0787034749984743, + "ce_loss_23": 2.995332622528076, + "ce_loss_3": 4.211844718456268, + "ce_loss_6": 3.694181203842163, + "epoch": 0.145, + "grad_norm": 744.0, + "kl_loss_12": 508.04298706054686, + "kl_loss_17": 149.49794082641603, + "kl_loss_3": 2553.605187988281, + "kl_loss_6": 1520.3743835449218, + "learning_rate": 0.0009548159976772592, + "loss": 1214.258, + "step": 1450 + }, + { + "ce_loss_12": 3.202357566356659, + "ce_loss_17": 3.030918502807617, + "ce_loss_23": 2.9502019762992857, + "ce_loss_3": 4.206415390968322, + "ce_loss_6": 3.6663927674293517, + "epoch": 0.146, + "grad_norm": 1232.0, + "kl_loss_12": 511.38385620117185, + "kl_loss_17": 147.039298248291, + "kl_loss_3": 2633.431530761719, + "kl_loss_6": 1543.5936157226563, + "learning_rate": 0.0009541545851748186, + "loss": 1194.2137, + "step": 1460 + }, + { + "ce_loss_12": 3.083567941188812, + "ce_loss_17": 2.910180139541626, + "ce_loss_23": 2.829286205768585, + "ce_loss_3": 4.110313820838928, + "ce_loss_6": 3.5584028244018553, + "epoch": 0.147, + "grad_norm": 948.0, + "kl_loss_12": 504.0245788574219, + "kl_loss_17": 143.67748527526857, + "kl_loss_3": 2656.521923828125, + "kl_loss_6": 1554.0283996582032, + "learning_rate": 0.0009534885993407473, + "loss": 1201.5422, + "step": 1470 + }, + { + "ce_loss_12": 3.228755247592926, + "ce_loss_17": 3.060680961608887, + "ce_loss_23": 2.9807514429092405, + "ce_loss_3": 4.232686340808868, + "ce_loss_6": 3.706300401687622, + "epoch": 0.148, + "grad_norm": 852.0, + "kl_loss_12": 493.7154907226562, + "kl_loss_17": 142.14012565612794, + "kl_loss_3": 2623.7803466796877, + "kl_loss_6": 1554.9101318359376, + "learning_rate": 0.0009528180468815154, + "loss": 1201.7234, + "step": 1480 + }, + { + "ce_loss_12": 3.2848423957824706, + "ce_loss_17": 3.110364580154419, + "ce_loss_23": 3.0347512125968934, + "ce_loss_3": 4.23952442407608, + "ce_loss_6": 3.733375918865204, + "epoch": 0.149, + "grad_norm": 1004.0, + "kl_loss_12": 506.6558837890625, + "kl_loss_17": 142.2900062561035, + "kl_loss_3": 2538.5910400390626, + "kl_loss_6": 1513.7965209960937, + "learning_rate": 0.0009521429345495787, + "loss": 1177.942, + "step": 1490 + }, + { + "ce_loss_12": 3.254639232158661, + "ce_loss_17": 3.0876608014106752, + "ce_loss_23": 3.0133098363876343, + "ce_loss_3": 4.207061076164246, + "ce_loss_6": 3.694426679611206, + "epoch": 0.15, + "grad_norm": 900.0, + "kl_loss_12": 485.9706497192383, + "kl_loss_17": 134.57788467407227, + "kl_loss_3": 2534.81689453125, + "kl_loss_6": 1497.9059631347657, + "learning_rate": 0.0009514632691433108, + "loss": 1177.4586, + "step": 1500 + }, + { + "ce_loss_12": 3.2227351903915404, + "ce_loss_17": 3.052746081352234, + "ce_loss_23": 2.976072347164154, + "ce_loss_3": 4.198372399806976, + "ce_loss_6": 3.6763845801353456, + "epoch": 0.151, + "grad_norm": 904.0, + "kl_loss_12": 504.3296157836914, + "kl_loss_17": 141.9261100769043, + "kl_loss_3": 2578.879296875, + "kl_loss_6": 1531.7337280273437, + "learning_rate": 0.0009507790575069346, + "loss": 1180.6392, + "step": 1510 + }, + { + "ce_loss_12": 3.2037697315216063, + "ce_loss_17": 3.018774151802063, + "ce_loss_23": 2.937272822856903, + "ce_loss_3": 4.196621096134185, + "ce_loss_6": 3.666115713119507, + "epoch": 0.152, + "grad_norm": 768.0, + "kl_loss_12": 513.6897033691406, + "kl_loss_17": 142.41551475524903, + "kl_loss_3": 2613.3070068359375, + "kl_loss_6": 1544.65146484375, + "learning_rate": 0.0009500903065304539, + "loss": 1210.4954, + "step": 1520 + }, + { + "ce_loss_12": 3.2276593685150146, + "ce_loss_17": 3.0604055762290954, + "ce_loss_23": 2.9863309025764466, + "ce_loss_3": 4.167424845695495, + "ce_loss_6": 3.653580594062805, + "epoch": 0.153, + "grad_norm": 884.0, + "kl_loss_12": 483.8401397705078, + "kl_loss_17": 136.10087394714355, + "kl_loss_3": 2484.0209106445313, + "kl_loss_6": 1456.9175537109375, + "learning_rate": 0.0009493970231495835, + "loss": 1168.7832, + "step": 1530 + }, + { + "ce_loss_12": 3.1678776383399962, + "ce_loss_17": 3.011521375179291, + "ce_loss_23": 2.9375467181205748, + "ce_loss_3": 4.114212560653686, + "ce_loss_6": 3.602833318710327, + "epoch": 0.154, + "grad_norm": 908.0, + "kl_loss_12": 477.70878448486326, + "kl_loss_17": 139.93613510131837, + "kl_loss_3": 2506.1269653320314, + "kl_loss_6": 1474.152099609375, + "learning_rate": 0.0009486992143456792, + "loss": 1153.3016, + "step": 1540 + }, + { + "ce_loss_12": 3.2115009784698487, + "ce_loss_17": 3.0346609592437743, + "ce_loss_23": 2.94760525226593, + "ce_loss_3": 4.2460575819015505, + "ce_loss_6": 3.6979887962341307, + "epoch": 0.155, + "grad_norm": 860.0, + "kl_loss_12": 519.8610046386718, + "kl_loss_17": 157.5887466430664, + "kl_loss_3": 2708.2066040039062, + "kl_loss_6": 1593.1645080566407, + "learning_rate": 0.0009479968871456679, + "loss": 1207.1498, + "step": 1550 + }, + { + "ce_loss_12": 3.174054169654846, + "ce_loss_17": 3.0042254090309144, + "ce_loss_23": 2.9235622763633726, + "ce_loss_3": 4.172900831699371, + "ce_loss_6": 3.6299402713775635, + "epoch": 0.156, + "grad_norm": 952.0, + "kl_loss_12": 495.34519653320314, + "kl_loss_17": 147.48748016357422, + "kl_loss_3": 2616.0154663085937, + "kl_loss_6": 1526.7384216308594, + "learning_rate": 0.0009472900486219768, + "loss": 1173.4233, + "step": 1560 + }, + { + "ce_loss_12": 3.166046977043152, + "ce_loss_17": 2.994255745410919, + "ce_loss_23": 2.9190826416015625, + "ce_loss_3": 4.122887992858887, + "ce_loss_6": 3.6104302167892457, + "epoch": 0.157, + "grad_norm": 856.0, + "kl_loss_12": 491.6877075195313, + "kl_loss_17": 139.1889949798584, + "kl_loss_3": 2547.5237548828127, + "kl_loss_6": 1506.4313537597657, + "learning_rate": 0.000946578705892462, + "loss": 1174.5682, + "step": 1570 + }, + { + "ce_loss_12": 3.1944627285003664, + "ce_loss_17": 3.0258451461791993, + "ce_loss_23": 2.951625919342041, + "ce_loss_3": 4.150630140304566, + "ce_loss_6": 3.6336331605911254, + "epoch": 0.158, + "grad_norm": 964.0, + "kl_loss_12": 479.3685333251953, + "kl_loss_17": 133.69121551513672, + "kl_loss_3": 2496.5006469726563, + "kl_loss_6": 1450.4319641113282, + "learning_rate": 0.0009458628661203367, + "loss": 1163.8299, + "step": 1580 + }, + { + "ce_loss_12": 3.199857234954834, + "ce_loss_17": 3.0304608583450316, + "ce_loss_23": 2.9556132555007935, + "ce_loss_3": 4.197483813762664, + "ce_loss_6": 3.6625853896141054, + "epoch": 0.159, + "grad_norm": 1048.0, + "kl_loss_12": 500.5685180664062, + "kl_loss_17": 134.92954444885254, + "kl_loss_3": 2611.937060546875, + "kl_loss_6": 1533.4802795410155, + "learning_rate": 0.0009451425365140996, + "loss": 1155.3817, + "step": 1590 + }, + { + "ce_loss_12": 3.2707769751548765, + "ce_loss_17": 3.097273528575897, + "ce_loss_23": 3.021830976009369, + "ce_loss_3": 4.201311850547791, + "ce_loss_6": 3.6886289954185485, + "epoch": 0.16, + "grad_norm": 876.0, + "kl_loss_12": 494.73425140380857, + "kl_loss_17": 134.58801956176757, + "kl_loss_3": 2474.3725708007814, + "kl_loss_6": 1450.2538146972656, + "learning_rate": 0.0009444177243274617, + "loss": 1134.3242, + "step": 1600 + }, + { + "ce_loss_12": 3.145673596858978, + "ce_loss_17": 2.964465343952179, + "ce_loss_23": 2.886202132701874, + "ce_loss_3": 4.129389917850494, + "ce_loss_6": 3.5948860883712768, + "epoch": 0.161, + "grad_norm": 856.0, + "kl_loss_12": 515.6139999389649, + "kl_loss_17": 142.84942321777345, + "kl_loss_3": 2606.3384521484377, + "kl_loss_6": 1527.6505615234375, + "learning_rate": 0.0009436884368592739, + "loss": 1183.754, + "step": 1610 + }, + { + "ce_loss_12": 3.186392605304718, + "ce_loss_17": 3.013194441795349, + "ce_loss_23": 2.93712317943573, + "ce_loss_3": 4.130860531330109, + "ce_loss_6": 3.6174773812294005, + "epoch": 0.162, + "grad_norm": 872.0, + "kl_loss_12": 492.7932693481445, + "kl_loss_17": 142.2027275085449, + "kl_loss_3": 2515.2781982421875, + "kl_loss_6": 1469.6235412597657, + "learning_rate": 0.0009429546814534529, + "loss": 1176.3047, + "step": 1620 + }, + { + "ce_loss_12": 3.184974658489227, + "ce_loss_17": 3.0268144249916076, + "ce_loss_23": 2.9497244000434875, + "ce_loss_3": 4.144134628772735, + "ce_loss_6": 3.611461842060089, + "epoch": 0.163, + "grad_norm": 848.0, + "kl_loss_12": 488.8755386352539, + "kl_loss_17": 142.62328567504883, + "kl_loss_3": 2515.289831542969, + "kl_loss_6": 1463.6363830566406, + "learning_rate": 0.0009422164654989072, + "loss": 1137.5589, + "step": 1630 + }, + { + "ce_loss_12": 3.29748820066452, + "ce_loss_17": 3.1312389373779297, + "ce_loss_23": 3.053789258003235, + "ce_loss_3": 4.228594875335693, + "ce_loss_6": 3.7183935761451723, + "epoch": 0.164, + "grad_norm": 848.0, + "kl_loss_12": 488.71146697998046, + "kl_loss_17": 141.67050704956054, + "kl_loss_3": 2480.7939575195314, + "kl_loss_6": 1452.5811706542968, + "learning_rate": 0.0009414737964294635, + "loss": 1149.8963, + "step": 1640 + }, + { + "ce_loss_12": 3.2155895709991453, + "ce_loss_17": 3.062555527687073, + "ce_loss_23": 2.989735221862793, + "ce_loss_3": 4.130454444885254, + "ce_loss_6": 3.6353893160820006, + "epoch": 0.165, + "grad_norm": 828.0, + "kl_loss_12": 461.0352813720703, + "kl_loss_17": 136.16140022277833, + "kl_loss_3": 2398.9843627929686, + "kl_loss_6": 1397.462957763672, + "learning_rate": 0.000940726681723791, + "loss": 1135.6822, + "step": 1650 + }, + { + "ce_loss_12": 3.0923858523368835, + "ce_loss_17": 2.9247390151023867, + "ce_loss_23": 2.8481791257858275, + "ce_loss_3": 4.089363622665405, + "ce_loss_6": 3.5547229528427122, + "epoch": 0.166, + "grad_norm": 892.0, + "kl_loss_12": 491.5706314086914, + "kl_loss_17": 140.37573165893554, + "kl_loss_3": 2631.875341796875, + "kl_loss_6": 1530.0634033203125, + "learning_rate": 0.0009399751289053266, + "loss": 1147.4444, + "step": 1660 + }, + { + "ce_loss_12": 3.2702404379844667, + "ce_loss_17": 3.105391335487366, + "ce_loss_23": 3.0320743322372437, + "ce_loss_3": 4.202740812301636, + "ce_loss_6": 3.697451651096344, + "epoch": 0.167, + "grad_norm": 1032.0, + "kl_loss_12": 474.04954681396487, + "kl_loss_17": 133.29354972839354, + "kl_loss_3": 2468.889294433594, + "kl_loss_6": 1437.5356201171876, + "learning_rate": 0.0009392191455421988, + "loss": 1149.0023, + "step": 1670 + }, + { + "ce_loss_12": 3.2619558930397035, + "ce_loss_17": 3.0974825978279115, + "ce_loss_23": 3.0210219264030456, + "ce_loss_3": 4.192395412921906, + "ce_loss_6": 3.6811763405799867, + "epoch": 0.168, + "grad_norm": 796.0, + "kl_loss_12": 493.43628845214846, + "kl_loss_17": 139.0162052154541, + "kl_loss_3": 2491.118200683594, + "kl_loss_6": 1461.1446044921875, + "learning_rate": 0.0009384587392471515, + "loss": 1121.5728, + "step": 1680 + }, + { + "ce_loss_12": 3.237948274612427, + "ce_loss_17": 3.0769503355026244, + "ce_loss_23": 3.007404637336731, + "ce_loss_3": 4.150859999656677, + "ce_loss_6": 3.659588062763214, + "epoch": 0.169, + "grad_norm": 988.0, + "kl_loss_12": 471.13790740966795, + "kl_loss_17": 131.52924728393555, + "kl_loss_3": 2426.4301513671876, + "kl_loss_6": 1429.9769104003906, + "learning_rate": 0.0009376939176774678, + "loss": 1119.1035, + "step": 1690 + }, + { + "ce_loss_12": 3.221153450012207, + "ce_loss_17": 3.0557522654533384, + "ce_loss_23": 2.978453016281128, + "ce_loss_3": 4.167772364616394, + "ce_loss_6": 3.648601543903351, + "epoch": 0.17, + "grad_norm": 904.0, + "kl_loss_12": 479.8827072143555, + "kl_loss_17": 138.62282676696776, + "kl_loss_3": 2484.6701049804688, + "kl_loss_6": 1445.5995483398438, + "learning_rate": 0.0009369246885348925, + "loss": 1158.6926, + "step": 1700 + }, + { + "ce_loss_12": 3.211071527004242, + "ce_loss_17": 3.0465246558189394, + "ce_loss_23": 2.9701019644737245, + "ce_loss_3": 4.180959832668305, + "ce_loss_6": 3.659099280834198, + "epoch": 0.171, + "grad_norm": 960.0, + "kl_loss_12": 484.2249557495117, + "kl_loss_17": 137.59749946594238, + "kl_loss_3": 2560.4348388671874, + "kl_loss_6": 1507.995050048828, + "learning_rate": 0.0009361510595655545, + "loss": 1161.0425, + "step": 1710 + }, + { + "ce_loss_12": 3.175062394142151, + "ce_loss_17": 3.0082626700401307, + "ce_loss_23": 2.929028642177582, + "ce_loss_3": 4.122467339038849, + "ce_loss_6": 3.617999482154846, + "epoch": 0.172, + "grad_norm": 1056.0, + "kl_loss_12": 486.29063415527344, + "kl_loss_17": 137.6948272705078, + "kl_loss_3": 2509.7924682617186, + "kl_loss_6": 1487.8664428710938, + "learning_rate": 0.0009353730385598887, + "loss": 1146.3372, + "step": 1720 + }, + { + "ce_loss_12": 3.106047534942627, + "ce_loss_17": 2.9398711800575255, + "ce_loss_23": 2.868954026699066, + "ce_loss_3": 4.094696950912476, + "ce_loss_6": 3.555660057067871, + "epoch": 0.173, + "grad_norm": 1008.0, + "kl_loss_12": 480.9398559570312, + "kl_loss_17": 131.35497131347657, + "kl_loss_3": 2565.51640625, + "kl_loss_6": 1486.1780517578125, + "learning_rate": 0.0009345906333525581, + "loss": 1160.8581, + "step": 1730 + }, + { + "ce_loss_12": 3.145271563529968, + "ce_loss_17": 2.98147029876709, + "ce_loss_23": 2.9027063608169557, + "ce_loss_3": 4.109023904800415, + "ce_loss_6": 3.5946555733680725, + "epoch": 0.174, + "grad_norm": 1072.0, + "kl_loss_12": 481.08248443603514, + "kl_loss_17": 140.1177764892578, + "kl_loss_3": 2530.2619873046874, + "kl_loss_6": 1495.8839233398437, + "learning_rate": 0.0009338038518223745, + "loss": 1142.776, + "step": 1740 + }, + { + "ce_loss_12": 3.209577441215515, + "ce_loss_17": 3.0408326625823974, + "ce_loss_23": 2.9612845182418823, + "ce_loss_3": 4.166906929016113, + "ce_loss_6": 3.65167076587677, + "epoch": 0.175, + "grad_norm": 788.0, + "kl_loss_12": 500.57960510253906, + "kl_loss_17": 143.46262283325194, + "kl_loss_3": 2559.1065063476562, + "kl_loss_6": 1501.0489196777344, + "learning_rate": 0.0009330127018922195, + "loss": 1190.3428, + "step": 1750 + }, + { + "ce_loss_12": 3.1649666905403135, + "ce_loss_17": 3.0007057666778563, + "ce_loss_23": 2.928002047538757, + "ce_loss_3": 4.12604067325592, + "ce_loss_6": 3.60684871673584, + "epoch": 0.176, + "grad_norm": 908.0, + "kl_loss_12": 485.37786254882815, + "kl_loss_17": 137.02273483276366, + "kl_loss_3": 2536.665869140625, + "kl_loss_6": 1478.797186279297, + "learning_rate": 0.0009322171915289634, + "loss": 1156.0584, + "step": 1760 + }, + { + "ce_loss_12": 3.197109591960907, + "ce_loss_17": 3.0425989389419557, + "ce_loss_23": 2.9691351413726808, + "ce_loss_3": 4.1132927775383, + "ce_loss_6": 3.619373655319214, + "epoch": 0.177, + "grad_norm": 852.0, + "kl_loss_12": 477.05029296875, + "kl_loss_17": 136.0932559967041, + "kl_loss_3": 2461.841271972656, + "kl_loss_6": 1455.2650573730468, + "learning_rate": 0.0009314173287433873, + "loss": 1123.975, + "step": 1770 + }, + { + "ce_loss_12": 3.188371479511261, + "ce_loss_17": 3.0140843629837035, + "ce_loss_23": 2.940100073814392, + "ce_loss_3": 4.126374185085297, + "ce_loss_6": 3.613701510429382, + "epoch": 0.178, + "grad_norm": 996.0, + "kl_loss_12": 499.4425277709961, + "kl_loss_17": 136.04979972839357, + "kl_loss_3": 2514.0261108398436, + "kl_loss_6": 1475.0163818359374, + "learning_rate": 0.0009306131215901003, + "loss": 1132.1109, + "step": 1780 + }, + { + "ce_loss_12": 3.2126179456710817, + "ce_loss_17": 3.0473007202148437, + "ce_loss_23": 2.9747347474098205, + "ce_loss_3": 4.1394585251808165, + "ce_loss_6": 3.638213038444519, + "epoch": 0.179, + "grad_norm": 932.0, + "kl_loss_12": 494.14884796142576, + "kl_loss_17": 131.85014839172362, + "kl_loss_3": 2461.104040527344, + "kl_loss_6": 1451.3593811035157, + "learning_rate": 0.0009298045781674596, + "loss": 1116.6383, + "step": 1790 + }, + { + "ce_loss_12": 3.1938929080963137, + "ce_loss_17": 3.0310275554656982, + "ce_loss_23": 2.958340787887573, + "ce_loss_3": 4.108129847049713, + "ce_loss_6": 3.6047824025154114, + "epoch": 0.18, + "grad_norm": 916.0, + "kl_loss_12": 475.7731002807617, + "kl_loss_17": 129.48459243774414, + "kl_loss_3": 2430.8353759765623, + "kl_loss_6": 1416.467462158203, + "learning_rate": 0.0009289917066174886, + "loss": 1130.1948, + "step": 1800 + }, + { + "ce_loss_12": 3.1844147086143493, + "ce_loss_17": 3.0295247793197633, + "ce_loss_23": 2.9598302364349367, + "ce_loss_3": 4.082317817211151, + "ce_loss_6": 3.588042676448822, + "epoch": 0.181, + "grad_norm": 1008.0, + "kl_loss_12": 455.54750061035156, + "kl_loss_17": 125.22672424316406, + "kl_loss_3": 2396.4680908203127, + "kl_loss_6": 1387.836181640625, + "learning_rate": 0.0009281745151257945, + "loss": 1099.9523, + "step": 1810 + }, + { + "ce_loss_12": 3.2134677290916445, + "ce_loss_17": 3.048130762577057, + "ce_loss_23": 2.9771452069282534, + "ce_loss_3": 4.141134595870971, + "ce_loss_6": 3.630683124065399, + "epoch": 0.182, + "grad_norm": 856.0, + "kl_loss_12": 470.7882507324219, + "kl_loss_17": 128.9965015411377, + "kl_loss_3": 2445.9111450195314, + "kl_loss_6": 1418.3800537109375, + "learning_rate": 0.0009273530119214868, + "loss": 1128.6573, + "step": 1820 + }, + { + "ce_loss_12": 3.2906126141548158, + "ce_loss_17": 3.1292755126953127, + "ce_loss_23": 3.0643419146537783, + "ce_loss_3": 4.215589344501495, + "ce_loss_6": 3.7132033705711365, + "epoch": 0.183, + "grad_norm": 876.0, + "kl_loss_12": 464.4059265136719, + "kl_loss_17": 125.72085189819336, + "kl_loss_3": 2454.9328735351564, + "kl_loss_6": 1432.4993469238282, + "learning_rate": 0.0009265272052770935, + "loss": 1104.2849, + "step": 1830 + }, + { + "ce_loss_12": 3.1393822193145753, + "ce_loss_17": 2.973209011554718, + "ce_loss_23": 2.9009034633636475, + "ce_loss_3": 4.102531969547272, + "ce_loss_6": 3.575673007965088, + "epoch": 0.184, + "grad_norm": 1096.0, + "kl_loss_12": 468.7420257568359, + "kl_loss_17": 125.73902816772461, + "kl_loss_3": 2499.4821166992188, + "kl_loss_6": 1452.368017578125, + "learning_rate": 0.0009256971035084784, + "loss": 1135.5918, + "step": 1840 + }, + { + "ce_loss_12": 3.088402235507965, + "ce_loss_17": 2.912094330787659, + "ce_loss_23": 2.8388179183006286, + "ce_loss_3": 4.067012321949005, + "ce_loss_6": 3.541650187969208, + "epoch": 0.185, + "grad_norm": 868.0, + "kl_loss_12": 492.3648712158203, + "kl_loss_17": 129.6828712463379, + "kl_loss_3": 2577.0054931640625, + "kl_loss_6": 1514.3743774414063, + "learning_rate": 0.0009248627149747573, + "loss": 1154.1738, + "step": 1850 + }, + { + "ce_loss_12": 3.2601833939552307, + "ce_loss_17": 3.0977912306785584, + "ce_loss_23": 3.030880701541901, + "ce_loss_3": 4.1803746342659, + "ce_loss_6": 3.678359270095825, + "epoch": 0.186, + "grad_norm": 884.0, + "kl_loss_12": 469.87674102783205, + "kl_loss_17": 125.56081008911133, + "kl_loss_3": 2448.435095214844, + "kl_loss_6": 1429.397119140625, + "learning_rate": 0.0009240240480782129, + "loss": 1120.9773, + "step": 1860 + }, + { + "ce_loss_12": 3.169607436656952, + "ce_loss_17": 3.004458963871002, + "ce_loss_23": 2.9340516090393067, + "ce_loss_3": 4.123180067539215, + "ce_loss_6": 3.614936649799347, + "epoch": 0.187, + "grad_norm": 804.0, + "kl_loss_12": 471.2584976196289, + "kl_loss_17": 126.98157424926758, + "kl_loss_3": 2493.7326477050783, + "kl_loss_6": 1472.6423034667969, + "learning_rate": 0.0009231811112642122, + "loss": 1127.0547, + "step": 1870 + }, + { + "ce_loss_12": 3.2120662569999694, + "ce_loss_17": 3.0467937111854555, + "ce_loss_23": 2.978515994548798, + "ce_loss_3": 4.115826559066773, + "ce_loss_6": 3.6161744952201844, + "epoch": 0.188, + "grad_norm": 900.0, + "kl_loss_12": 468.25464324951173, + "kl_loss_17": 126.36864395141602, + "kl_loss_3": 2423.8376586914064, + "kl_loss_6": 1419.7891235351562, + "learning_rate": 0.0009223339130211192, + "loss": 1110.4283, + "step": 1880 + }, + { + "ce_loss_12": 3.0797258496284483, + "ce_loss_17": 2.9191071927547454, + "ce_loss_23": 2.8500725448131563, + "ce_loss_3": 4.0366758584976195, + "ce_loss_6": 3.508550250530243, + "epoch": 0.189, + "grad_norm": 800.0, + "kl_loss_12": 460.5701538085938, + "kl_loss_17": 122.64981727600097, + "kl_loss_3": 2504.1417602539063, + "kl_loss_6": 1442.12265625, + "learning_rate": 0.0009214824618802108, + "loss": 1126.9061, + "step": 1890 + }, + { + "ce_loss_12": 3.2441630601882934, + "ce_loss_17": 3.0844648480415344, + "ce_loss_23": 3.0122289299964904, + "ce_loss_3": 4.170050525665284, + "ce_loss_6": 3.666511356830597, + "epoch": 0.19, + "grad_norm": 1016.0, + "kl_loss_12": 471.48859252929685, + "kl_loss_17": 129.77956199645996, + "kl_loss_3": 2428.87451171875, + "kl_loss_6": 1423.535089111328, + "learning_rate": 0.0009206267664155906, + "loss": 1142.1124, + "step": 1900 + }, + { + "ce_loss_12": 3.181778848171234, + "ce_loss_17": 3.0219834446907043, + "ce_loss_23": 2.9443792939186095, + "ce_loss_3": 4.116687369346619, + "ce_loss_6": 3.6083747029304503, + "epoch": 0.191, + "grad_norm": 960.0, + "kl_loss_12": 467.8783721923828, + "kl_loss_17": 134.10535469055176, + "kl_loss_3": 2458.371057128906, + "kl_loss_6": 1428.5719116210937, + "learning_rate": 0.0009197668352441024, + "loss": 1127.5221, + "step": 1910 + }, + { + "ce_loss_12": 3.2185313224792482, + "ce_loss_17": 3.059431421756744, + "ce_loss_23": 2.990352964401245, + "ce_loss_3": 4.132525777816772, + "ce_loss_6": 3.630076193809509, + "epoch": 0.192, + "grad_norm": 1160.0, + "kl_loss_12": 462.2439666748047, + "kl_loss_17": 130.64373245239258, + "kl_loss_3": 2414.4199462890624, + "kl_loss_6": 1409.8724487304687, + "learning_rate": 0.0009189026770252437, + "loss": 1114.2314, + "step": 1920 + }, + { + "ce_loss_12": 3.2457741260528565, + "ce_loss_17": 3.0870991349220276, + "ce_loss_23": 3.0153459668159486, + "ce_loss_3": 4.164921474456787, + "ce_loss_6": 3.6658156156539916, + "epoch": 0.193, + "grad_norm": 1048.0, + "kl_loss_12": 464.60990142822266, + "kl_loss_17": 132.1713623046875, + "kl_loss_3": 2423.4216552734374, + "kl_loss_6": 1410.253729248047, + "learning_rate": 0.000918034300461078, + "loss": 1145.4147, + "step": 1930 + }, + { + "ce_loss_12": 3.2675069332122804, + "ce_loss_17": 3.1138991475105287, + "ce_loss_23": 3.0397408604621887, + "ce_loss_3": 4.165933740139008, + "ce_loss_6": 3.682149875164032, + "epoch": 0.194, + "grad_norm": 1400.0, + "kl_loss_12": 462.9583801269531, + "kl_loss_17": 134.64812736511232, + "kl_loss_3": 2387.39130859375, + "kl_loss_6": 1404.232928466797, + "learning_rate": 0.0009171617142961477, + "loss": 1102.495, + "step": 1940 + }, + { + "ce_loss_12": 3.232120132446289, + "ce_loss_17": 3.074093961715698, + "ce_loss_23": 3.004840636253357, + "ce_loss_3": 4.141224360466003, + "ce_loss_6": 3.6394020080566407, + "epoch": 0.195, + "grad_norm": 888.0, + "kl_loss_12": 454.7131286621094, + "kl_loss_17": 126.47698211669922, + "kl_loss_3": 2401.7656982421877, + "kl_loss_6": 1396.0390380859376, + "learning_rate": 0.0009162849273173857, + "loss": 1100.0949, + "step": 1950 + }, + { + "ce_loss_12": 3.176328718662262, + "ce_loss_17": 3.018619990348816, + "ce_loss_23": 2.9515690922737123, + "ce_loss_3": 4.0918882369995115, + "ce_loss_6": 3.5899390459060667, + "epoch": 0.196, + "grad_norm": 1176.0, + "kl_loss_12": 454.46217041015626, + "kl_loss_17": 123.6001823425293, + "kl_loss_3": 2400.2062561035154, + "kl_loss_6": 1389.1139221191406, + "learning_rate": 0.0009154039483540273, + "loss": 1104.5887, + "step": 1960 + }, + { + "ce_loss_12": 3.1507588386535645, + "ce_loss_17": 2.997128760814667, + "ce_loss_23": 2.92929230928421, + "ce_loss_3": 4.0766734957695006, + "ce_loss_6": 3.5648243904113768, + "epoch": 0.197, + "grad_norm": 1016.0, + "kl_loss_12": 456.2421676635742, + "kl_loss_17": 125.6520278930664, + "kl_loss_3": 2431.879040527344, + "kl_loss_6": 1407.830712890625, + "learning_rate": 0.0009145187862775209, + "loss": 1102.6521, + "step": 1970 + }, + { + "ce_loss_12": 3.1837910532951357, + "ce_loss_17": 3.0291913986206054, + "ce_loss_23": 2.9613188028335573, + "ce_loss_3": 4.097797703742981, + "ce_loss_6": 3.601459336280823, + "epoch": 0.198, + "grad_norm": 1168.0, + "kl_loss_12": 460.4912338256836, + "kl_loss_17": 123.57428398132325, + "kl_loss_3": 2414.3682495117187, + "kl_loss_6": 1414.0907043457032, + "learning_rate": 0.0009136294500014386, + "loss": 1100.5844, + "step": 1980 + }, + { + "ce_loss_12": 3.150496172904968, + "ce_loss_17": 2.980268383026123, + "ce_loss_23": 2.9135174036026, + "ce_loss_3": 4.127268588542938, + "ce_loss_6": 3.5919087529182434, + "epoch": 0.199, + "grad_norm": 1224.0, + "kl_loss_12": 480.1836532592773, + "kl_loss_17": 124.83656845092773, + "kl_loss_3": 2533.9186401367188, + "kl_loss_6": 1465.3629028320313, + "learning_rate": 0.000912735948481387, + "loss": 1143.2436, + "step": 1990 + }, + { + "ce_loss_12": 3.1754995465278624, + "ce_loss_17": 3.009310555458069, + "ce_loss_23": 2.9422292590141295, + "ce_loss_3": 4.082965612411499, + "ce_loss_6": 3.591770112514496, + "epoch": 0.2, + "grad_norm": 1056.0, + "kl_loss_12": 480.6165740966797, + "kl_loss_17": 125.33241271972656, + "kl_loss_3": 2438.861468505859, + "kl_loss_6": 1432.2340881347657, + "learning_rate": 0.0009118382907149164, + "loss": 1097.0717, + "step": 2000 + }, + { + "ce_loss_12": 3.196602463722229, + "ce_loss_17": 3.0341225266456604, + "ce_loss_23": 2.967613184452057, + "ce_loss_3": 4.100062417984009, + "ce_loss_6": 3.6124598026275634, + "epoch": 0.201, + "grad_norm": 844.0, + "kl_loss_12": 474.27958526611326, + "kl_loss_17": 124.60223999023438, + "kl_loss_3": 2397.216931152344, + "kl_loss_6": 1413.5718933105468, + "learning_rate": 0.0009109364857414306, + "loss": 1089.634, + "step": 2010 + }, + { + "ce_loss_12": 3.1681824326515198, + "ce_loss_17": 3.0062640428543093, + "ce_loss_23": 2.936304271221161, + "ce_loss_3": 4.072107815742493, + "ce_loss_6": 3.5693987131118776, + "epoch": 0.202, + "grad_norm": 1080.0, + "kl_loss_12": 464.28491668701173, + "kl_loss_17": 123.07480049133301, + "kl_loss_3": 2408.75986328125, + "kl_loss_6": 1396.0414733886719, + "learning_rate": 0.0009100305426420956, + "loss": 1123.4555, + "step": 2020 + }, + { + "ce_loss_12": 3.1313969492912292, + "ce_loss_17": 2.9723105907440184, + "ce_loss_23": 2.9064919352531433, + "ce_loss_3": 4.095436239242554, + "ce_loss_6": 3.5720829606056212, + "epoch": 0.203, + "grad_norm": 992.0, + "kl_loss_12": 464.30756225585935, + "kl_loss_17": 121.7504940032959, + "kl_loss_3": 2525.1046875, + "kl_loss_6": 1458.3286682128905, + "learning_rate": 0.0009091204705397484, + "loss": 1120.6155, + "step": 2030 + }, + { + "ce_loss_12": 3.115929698944092, + "ce_loss_17": 2.956736671924591, + "ce_loss_23": 2.887710380554199, + "ce_loss_3": 4.086121809482575, + "ce_loss_6": 3.560148632526398, + "epoch": 0.204, + "grad_norm": 1376.0, + "kl_loss_12": 464.2669448852539, + "kl_loss_17": 129.4686809539795, + "kl_loss_3": 2529.653125, + "kl_loss_6": 1470.6206970214844, + "learning_rate": 0.0009082062785988049, + "loss": 1131.6215, + "step": 2040 + }, + { + "ce_loss_12": 3.245707297325134, + "ce_loss_17": 3.098770356178284, + "ce_loss_23": 3.0233798742294313, + "ce_loss_3": 4.126246273517609, + "ce_loss_6": 3.6379145860671995, + "epoch": 0.205, + "grad_norm": 960.0, + "kl_loss_12": 453.8625259399414, + "kl_loss_17": 140.02084579467774, + "kl_loss_3": 2379.939306640625, + "kl_loss_6": 1388.1914367675781, + "learning_rate": 0.0009072879760251679, + "loss": 1105.3567, + "step": 2050 + }, + { + "ce_loss_12": 3.199712705612183, + "ce_loss_17": 3.0443119049072265, + "ce_loss_23": 2.9701660871505737, + "ce_loss_3": 4.1467338562011715, + "ce_loss_6": 3.630892109870911, + "epoch": 0.206, + "grad_norm": 1064.0, + "kl_loss_12": 460.1679229736328, + "kl_loss_17": 135.61270484924316, + "kl_loss_3": 2507.5966186523438, + "kl_loss_6": 1452.1191162109376, + "learning_rate": 0.0009063655720661341, + "loss": 1115.2066, + "step": 2060 + }, + { + "ce_loss_12": 3.2333725929260253, + "ce_loss_17": 3.0829896688461305, + "ce_loss_23": 3.0119826197624207, + "ce_loss_3": 4.128749263286591, + "ce_loss_6": 3.6334719181060793, + "epoch": 0.207, + "grad_norm": 976.0, + "kl_loss_12": 454.76455841064455, + "kl_loss_17": 131.2639488220215, + "kl_loss_3": 2376.0222778320312, + "kl_loss_6": 1381.4890625, + "learning_rate": 0.000905439076010301, + "loss": 1093.0187, + "step": 2070 + }, + { + "ce_loss_12": 3.1996248722076417, + "ce_loss_17": 3.0418588161468505, + "ce_loss_23": 2.9662808060646055, + "ce_loss_3": 4.1204675197601315, + "ce_loss_6": 3.622704017162323, + "epoch": 0.208, + "grad_norm": 1136.0, + "kl_loss_12": 462.43045196533205, + "kl_loss_17": 133.24030723571778, + "kl_loss_3": 2422.5058837890624, + "kl_loss_6": 1413.2706298828125, + "learning_rate": 0.0009045084971874737, + "loss": 1085.4412, + "step": 2080 + }, + { + "ce_loss_12": 3.181397581100464, + "ce_loss_17": 3.0283178806304933, + "ce_loss_23": 2.9539447903633116, + "ce_loss_3": 4.090775012969971, + "ce_loss_6": 3.594604218006134, + "epoch": 0.209, + "grad_norm": 940.0, + "kl_loss_12": 458.8741989135742, + "kl_loss_17": 135.37420806884765, + "kl_loss_3": 2409.614776611328, + "kl_loss_6": 1406.745947265625, + "learning_rate": 0.0009035738449685707, + "loss": 1119.5962, + "step": 2090 + }, + { + "ce_loss_12": 3.1231150388717652, + "ce_loss_17": 2.9623324155807493, + "ce_loss_23": 2.8883072257041933, + "ce_loss_3": 4.075538074970245, + "ce_loss_6": 3.5636983036994936, + "epoch": 0.21, + "grad_norm": 880.0, + "kl_loss_12": 464.61413421630857, + "kl_loss_17": 129.5325019836426, + "kl_loss_3": 2488.8315490722657, + "kl_loss_6": 1451.3852172851562, + "learning_rate": 0.0009026351287655293, + "loss": 1103.3046, + "step": 2100 + }, + { + "ce_loss_12": 3.2977046012878417, + "ce_loss_17": 3.1490105032920837, + "ce_loss_23": 3.085929501056671, + "ce_loss_3": 4.141742074489594, + "ce_loss_6": 3.6752121686935424, + "epoch": 0.211, + "grad_norm": 1000.0, + "kl_loss_12": 438.23717041015624, + "kl_loss_17": 120.08209075927735, + "kl_loss_3": 2275.33037109375, + "kl_loss_6": 1330.7261840820313, + "learning_rate": 0.0009016923580312113, + "loss": 1046.0168, + "step": 2110 + }, + { + "ce_loss_12": 3.162338900566101, + "ce_loss_17": 3.011285150051117, + "ce_loss_23": 2.9455466270446777, + "ce_loss_3": 4.060781872272491, + "ce_loss_6": 3.5741167187690737, + "epoch": 0.212, + "grad_norm": 964.0, + "kl_loss_12": 454.36209259033205, + "kl_loss_17": 122.61332359313965, + "kl_loss_3": 2368.1356811523438, + "kl_loss_6": 1377.1678588867187, + "learning_rate": 0.0009007455422593077, + "loss": 1105.0621, + "step": 2120 + }, + { + "ce_loss_12": 3.191146957874298, + "ce_loss_17": 3.0287877798080443, + "ce_loss_23": 2.960277593135834, + "ce_loss_3": 4.120349848270417, + "ce_loss_6": 3.6093490719795227, + "epoch": 0.213, + "grad_norm": 1096.0, + "kl_loss_12": 475.2200164794922, + "kl_loss_17": 124.79928970336914, + "kl_loss_3": 2481.0699829101563, + "kl_loss_6": 1431.2489135742187, + "learning_rate": 0.0008997946909842425, + "loss": 1116.5895, + "step": 2130 + }, + { + "ce_loss_12": 3.215824568271637, + "ce_loss_17": 3.037301540374756, + "ce_loss_23": 2.9644772887229918, + "ce_loss_3": 4.186346101760864, + "ce_loss_6": 3.6540252685546877, + "epoch": 0.214, + "grad_norm": 988.0, + "kl_loss_12": 500.931103515625, + "kl_loss_17": 131.5737678527832, + "kl_loss_3": 2556.2944519042967, + "kl_loss_6": 1484.178729248047, + "learning_rate": 0.0008988398137810777, + "loss": 1118.9062, + "step": 2140 + }, + { + "ce_loss_12": 3.234931528568268, + "ce_loss_17": 3.0760345458984375, + "ce_loss_23": 3.0077293157577514, + "ce_loss_3": 4.124112021923065, + "ce_loss_6": 3.6427318930625914, + "epoch": 0.215, + "grad_norm": 984.0, + "kl_loss_12": 464.1569320678711, + "kl_loss_17": 121.77989540100097, + "kl_loss_3": 2378.131787109375, + "kl_loss_6": 1391.8884033203126, + "learning_rate": 0.0008978809202654162, + "loss": 1079.9229, + "step": 2150 + }, + { + "ce_loss_12": 3.212024140357971, + "ce_loss_17": 3.052365016937256, + "ce_loss_23": 2.9833517551422117, + "ce_loss_3": 4.109187686443329, + "ce_loss_6": 3.6157562851905825, + "epoch": 0.216, + "grad_norm": 1176.0, + "kl_loss_12": 460.35668487548827, + "kl_loss_17": 122.84857292175293, + "kl_loss_3": 2358.084600830078, + "kl_loss_6": 1370.2854736328125, + "learning_rate": 0.0008969180200933046, + "loss": 1091.8207, + "step": 2160 + }, + { + "ce_loss_12": 3.179626762866974, + "ce_loss_17": 3.0116968154907227, + "ce_loss_23": 2.9410921812057493, + "ce_loss_3": 4.11946210861206, + "ce_loss_6": 3.608268916606903, + "epoch": 0.217, + "grad_norm": 1184.0, + "kl_loss_12": 475.6806076049805, + "kl_loss_17": 127.56976089477538, + "kl_loss_3": 2447.6637939453126, + "kl_loss_6": 1422.2298706054687, + "learning_rate": 0.0008959511229611376, + "loss": 1118.1624, + "step": 2170 + }, + { + "ce_loss_12": 3.2447680592536927, + "ce_loss_17": 3.0842313051223753, + "ce_loss_23": 3.0126760601997375, + "ce_loss_3": 4.148605918884277, + "ce_loss_6": 3.6437505006790163, + "epoch": 0.218, + "grad_norm": 1096.0, + "kl_loss_12": 463.01764221191405, + "kl_loss_17": 129.428275680542, + "kl_loss_3": 2418.147393798828, + "kl_loss_6": 1401.0833679199218, + "learning_rate": 0.0008949802386055581, + "loss": 1098.8416, + "step": 2180 + }, + { + "ce_loss_12": 3.114654839038849, + "ce_loss_17": 2.9563396692276003, + "ce_loss_23": 2.88834308385849, + "ce_loss_3": 4.020838689804077, + "ce_loss_6": 3.529152476787567, + "epoch": 0.219, + "grad_norm": 792.0, + "kl_loss_12": 456.43510284423826, + "kl_loss_17": 126.8063793182373, + "kl_loss_3": 2362.814306640625, + "kl_loss_6": 1375.901593017578, + "learning_rate": 0.0008940053768033609, + "loss": 1109.1713, + "step": 2190 + }, + { + "ce_loss_12": 3.1917166471481324, + "ce_loss_17": 3.041363799571991, + "ce_loss_23": 2.971977376937866, + "ce_loss_3": 4.0816752195358275, + "ce_loss_6": 3.5965646266937257, + "epoch": 0.22, + "grad_norm": 820.0, + "kl_loss_12": 450.95191802978513, + "kl_loss_17": 124.02299880981445, + "kl_loss_3": 2371.4427490234375, + "kl_loss_6": 1382.6896728515626, + "learning_rate": 0.0008930265473713938, + "loss": 1082.2387, + "step": 2200 + }, + { + "ce_loss_12": 3.1572428226470945, + "ce_loss_17": 3.003165936470032, + "ce_loss_23": 2.9327398777008056, + "ce_loss_3": 4.064587521553039, + "ce_loss_6": 3.566171145439148, + "epoch": 0.221, + "grad_norm": 860.0, + "kl_loss_12": 453.63380737304686, + "kl_loss_17": 127.90091209411621, + "kl_loss_3": 2376.099816894531, + "kl_loss_6": 1378.780255126953, + "learning_rate": 0.0008920437601664579, + "loss": 1066.1129, + "step": 2210 + }, + { + "ce_loss_12": 3.160594344139099, + "ce_loss_17": 3.002773475646973, + "ce_loss_23": 2.9288374304771425, + "ce_loss_3": 4.075528597831726, + "ce_loss_6": 3.578319561481476, + "epoch": 0.222, + "grad_norm": 1072.0, + "kl_loss_12": 465.2222396850586, + "kl_loss_17": 133.5637550354004, + "kl_loss_3": 2424.0534545898436, + "kl_loss_6": 1415.0123779296875, + "learning_rate": 0.0008910570250852097, + "loss": 1082.2885, + "step": 2220 + }, + { + "ce_loss_12": 3.236863040924072, + "ce_loss_17": 3.0909160494804384, + "ce_loss_23": 3.020583617687225, + "ce_loss_3": 4.092880666255951, + "ce_loss_6": 3.612149167060852, + "epoch": 0.223, + "grad_norm": 944.0, + "kl_loss_12": 444.67178802490236, + "kl_loss_17": 127.97508087158204, + "kl_loss_3": 2295.9597778320312, + "kl_loss_6": 1331.6259887695312, + "learning_rate": 0.0008900663520640604, + "loss": 1054.1037, + "step": 2230 + }, + { + "ce_loss_12": 3.201057267189026, + "ce_loss_17": 3.048276162147522, + "ce_loss_23": 2.9798381328582764, + "ce_loss_3": 4.093599379062653, + "ce_loss_6": 3.5987416982650755, + "epoch": 0.224, + "grad_norm": 1032.0, + "kl_loss_12": 452.34375, + "kl_loss_17": 126.09738502502441, + "kl_loss_3": 2378.9890625, + "kl_loss_6": 1368.7392456054688, + "learning_rate": 0.0008890717510790764, + "loss": 1085.1502, + "step": 2240 + }, + { + "ce_loss_12": 3.1611531615257262, + "ce_loss_17": 3.0081787943840026, + "ce_loss_23": 2.941867744922638, + "ce_loss_3": 4.073286545276642, + "ce_loss_6": 3.5713559865951536, + "epoch": 0.225, + "grad_norm": 980.0, + "kl_loss_12": 449.97607879638673, + "kl_loss_17": 120.34653587341309, + "kl_loss_3": 2397.9912719726562, + "kl_loss_6": 1380.2239501953125, + "learning_rate": 0.0008880732321458784, + "loss": 1093.934, + "step": 2250 + }, + { + "ce_loss_12": 3.194337809085846, + "ce_loss_17": 3.0360872030258177, + "ce_loss_23": 2.968069648742676, + "ce_loss_3": 4.076019847393036, + "ce_loss_6": 3.5874541282653807, + "epoch": 0.226, + "grad_norm": 772.0, + "kl_loss_12": 453.4205032348633, + "kl_loss_17": 121.36964416503906, + "kl_loss_3": 2351.7071899414063, + "kl_loss_6": 1369.5370178222656, + "learning_rate": 0.0008870708053195413, + "loss": 1092.1357, + "step": 2260 + }, + { + "ce_loss_12": 3.211677670478821, + "ce_loss_17": 3.060397219657898, + "ce_loss_23": 2.994785559177399, + "ce_loss_3": 4.074770200252533, + "ce_loss_6": 3.60477374792099, + "epoch": 0.227, + "grad_norm": 1016.0, + "kl_loss_12": 438.2804779052734, + "kl_loss_17": 117.40936431884765, + "kl_loss_3": 2312.314312744141, + "kl_loss_6": 1352.8179443359375, + "learning_rate": 0.0008860644806944918, + "loss": 1065.2646, + "step": 2270 + }, + { + "ce_loss_12": 3.1623907327651977, + "ce_loss_17": 3.0015741109848024, + "ce_loss_23": 2.9344356179237367, + "ce_loss_3": 4.077270436286926, + "ce_loss_6": 3.569185900688171, + "epoch": 0.228, + "grad_norm": 884.0, + "kl_loss_12": 458.0107681274414, + "kl_loss_17": 121.36178131103516, + "kl_loss_3": 2420.191510009766, + "kl_loss_6": 1396.1072570800782, + "learning_rate": 0.0008850542684044079, + "loss": 1068.0572, + "step": 2280 + }, + { + "ce_loss_12": 3.132009541988373, + "ce_loss_17": 2.966815006732941, + "ce_loss_23": 2.8952775597572327, + "ce_loss_3": 4.08059937953949, + "ce_loss_6": 3.5586902499198914, + "epoch": 0.229, + "grad_norm": 1012.0, + "kl_loss_12": 469.1373596191406, + "kl_loss_17": 125.32361526489258, + "kl_loss_3": 2500.100225830078, + "kl_loss_6": 1446.2523620605468, + "learning_rate": 0.0008840401786221159, + "loss": 1094.5689, + "step": 2290 + }, + { + "ce_loss_12": 3.2482141852378845, + "ce_loss_17": 3.0996639013290403, + "ce_loss_23": 3.0369733691215517, + "ce_loss_3": 4.129230093955994, + "ce_loss_6": 3.6413492441177366, + "epoch": 0.23, + "grad_norm": 884.0, + "kl_loss_12": 434.2909759521484, + "kl_loss_17": 113.66693344116212, + "kl_loss_3": 2317.9691650390623, + "kl_loss_6": 1341.140283203125, + "learning_rate": 0.000883022221559489, + "loss": 1051.0217, + "step": 2300 + }, + { + "ce_loss_12": 3.2230896234512327, + "ce_loss_17": 3.068192791938782, + "ce_loss_23": 3.0025680303573608, + "ce_loss_3": 4.128422045707703, + "ce_loss_6": 3.628089118003845, + "epoch": 0.231, + "grad_norm": 1000.0, + "kl_loss_12": 443.489567565918, + "kl_loss_17": 118.53048629760742, + "kl_loss_3": 2399.0051025390626, + "kl_loss_6": 1382.7301330566406, + "learning_rate": 0.0008820004074673434, + "loss": 1112.2879, + "step": 2310 + }, + { + "ce_loss_12": 3.1339518427848816, + "ce_loss_17": 2.9779538989067076, + "ce_loss_23": 2.9133885741233825, + "ce_loss_3": 4.032837772369385, + "ce_loss_6": 3.536777412891388, + "epoch": 0.232, + "grad_norm": 852.0, + "kl_loss_12": 444.94561157226565, + "kl_loss_17": 117.30743064880372, + "kl_loss_3": 2404.906109619141, + "kl_loss_6": 1389.1506958007812, + "learning_rate": 0.0008809747466353355, + "loss": 1071.7189, + "step": 2320 + }, + { + "ce_loss_12": 3.1373749494552614, + "ce_loss_17": 2.982857036590576, + "ce_loss_23": 2.9156289100646973, + "ce_loss_3": 4.049996149539948, + "ce_loss_6": 3.54087575674057, + "epoch": 0.233, + "grad_norm": 984.0, + "kl_loss_12": 440.93540344238284, + "kl_loss_17": 120.87570838928222, + "kl_loss_3": 2391.51513671875, + "kl_loss_6": 1362.7610961914063, + "learning_rate": 0.0008799452493918585, + "loss": 1085.2658, + "step": 2330 + }, + { + "ce_loss_12": 3.2044550657272337, + "ce_loss_17": 3.0486762285232545, + "ce_loss_23": 2.9857989072799684, + "ce_loss_3": 4.096554934978485, + "ce_loss_6": 3.6082839727401734, + "epoch": 0.234, + "grad_norm": 884.0, + "kl_loss_12": 438.3496551513672, + "kl_loss_17": 117.50934257507325, + "kl_loss_3": 2366.7732543945312, + "kl_loss_6": 1372.0487976074219, + "learning_rate": 0.0008789119261039385, + "loss": 1108.4551, + "step": 2340 + }, + { + "ce_loss_12": 3.1221569895744326, + "ce_loss_17": 2.9757789373397827, + "ce_loss_23": 2.905420696735382, + "ce_loss_3": 4.026392912864685, + "ce_loss_6": 3.5348900079727175, + "epoch": 0.235, + "grad_norm": 1012.0, + "kl_loss_12": 442.50169982910154, + "kl_loss_17": 124.05238304138183, + "kl_loss_3": 2375.810498046875, + "kl_loss_6": 1379.4071655273438, + "learning_rate": 0.0008778747871771292, + "loss": 1059.9826, + "step": 2350 + }, + { + "ce_loss_12": 3.166475248336792, + "ce_loss_17": 3.0180999040603638, + "ce_loss_23": 2.9576680302619933, + "ce_loss_3": 4.037302458286286, + "ce_loss_6": 3.55890007019043, + "epoch": 0.236, + "grad_norm": 1008.0, + "kl_loss_12": 424.8075119018555, + "kl_loss_17": 121.03117790222169, + "kl_loss_3": 2302.770233154297, + "kl_loss_6": 1329.6474670410157, + "learning_rate": 0.0008768338430554083, + "loss": 1043.8572, + "step": 2360 + }, + { + "ce_loss_12": 3.1851816058158873, + "ce_loss_17": 3.042919707298279, + "ce_loss_23": 2.9646944642066955, + "ce_loss_3": 4.070168995857239, + "ce_loss_6": 3.5822435140609743, + "epoch": 0.237, + "grad_norm": 1004.0, + "kl_loss_12": 445.9438781738281, + "kl_loss_17": 137.5145179748535, + "kl_loss_3": 2323.822509765625, + "kl_loss_6": 1348.6503540039062, + "learning_rate": 0.0008757891042210713, + "loss": 1073.7238, + "step": 2370 + }, + { + "ce_loss_12": 3.195702016353607, + "ce_loss_17": 3.0519804120063783, + "ce_loss_23": 2.9797383666038515, + "ce_loss_3": 4.07917971611023, + "ce_loss_6": 3.590975046157837, + "epoch": 0.238, + "grad_norm": 1032.0, + "kl_loss_12": 435.39076232910156, + "kl_loss_17": 136.03977317810057, + "kl_loss_3": 2320.868212890625, + "kl_loss_6": 1343.4944396972655, + "learning_rate": 0.0008747405811946271, + "loss": 1066.0041, + "step": 2380 + }, + { + "ce_loss_12": 3.106317698955536, + "ce_loss_17": 2.969082570075989, + "ce_loss_23": 2.8858501434326174, + "ce_loss_3": 4.052463281154632, + "ce_loss_6": 3.5339264392852785, + "epoch": 0.239, + "grad_norm": 936.0, + "kl_loss_12": 450.8729217529297, + "kl_loss_17": 147.4965450286865, + "kl_loss_3": 2465.299914550781, + "kl_loss_6": 1418.1998718261718, + "learning_rate": 0.0008736882845346905, + "loss": 1078.8531, + "step": 2390 + }, + { + "ce_loss_12": 3.189059865474701, + "ce_loss_17": 3.0437240958213807, + "ce_loss_23": 2.967745578289032, + "ce_loss_3": 4.093241429328918, + "ce_loss_6": 3.5936091780662536, + "epoch": 0.24, + "grad_norm": 932.0, + "kl_loss_12": 443.7699142456055, + "kl_loss_17": 135.94134826660155, + "kl_loss_3": 2352.109338378906, + "kl_loss_6": 1349.3434997558593, + "learning_rate": 0.0008726322248378774, + "loss": 1059.4339, + "step": 2400 + }, + { + "ce_loss_12": 3.1886565566062925, + "ce_loss_17": 3.0453128695487974, + "ce_loss_23": 2.97672917842865, + "ce_loss_3": 4.110389256477356, + "ce_loss_6": 3.5999670028686523, + "epoch": 0.241, + "grad_norm": 904.0, + "kl_loss_12": 431.85757598876955, + "kl_loss_17": 126.28680725097657, + "kl_loss_3": 2421.1709594726562, + "kl_loss_6": 1384.5716857910156, + "learning_rate": 0.0008715724127386971, + "loss": 1099.6822, + "step": 2410 + }, + { + "ce_loss_12": 3.2481358289718627, + "ce_loss_17": 3.1088537335395814, + "ce_loss_23": 3.042147898674011, + "ce_loss_3": 4.126234018802643, + "ce_loss_6": 3.647673797607422, + "epoch": 0.242, + "grad_norm": 836.0, + "kl_loss_12": 427.24434967041014, + "kl_loss_17": 123.11380424499512, + "kl_loss_3": 2319.6755432128907, + "kl_loss_6": 1339.1436645507813, + "learning_rate": 0.0008705088589094458, + "loss": 1064.8498, + "step": 2420 + }, + { + "ce_loss_12": 3.265853762626648, + "ce_loss_17": 3.1238319754600523, + "ce_loss_23": 3.055406653881073, + "ce_loss_3": 4.153356218338013, + "ce_loss_6": 3.6705095887184145, + "epoch": 0.243, + "grad_norm": 896.0, + "kl_loss_12": 431.32056427001953, + "kl_loss_17": 122.70526237487793, + "kl_loss_3": 2335.2805053710936, + "kl_loss_6": 1359.439990234375, + "learning_rate": 0.0008694415740600988, + "loss": 1067.6548, + "step": 2430 + }, + { + "ce_loss_12": 3.1318012833595277, + "ce_loss_17": 2.9968308091163633, + "ce_loss_23": 2.9192454338073732, + "ce_loss_3": 4.06026520729065, + "ce_loss_6": 3.544374239444733, + "epoch": 0.244, + "grad_norm": 1024.0, + "kl_loss_12": 437.3179306030273, + "kl_loss_17": 133.74676437377929, + "kl_loss_3": 2428.2571411132812, + "kl_loss_6": 1388.4118103027345, + "learning_rate": 0.0008683705689382025, + "loss": 1077.0226, + "step": 2440 + }, + { + "ce_loss_12": 3.203168284893036, + "ce_loss_17": 3.0576744079589844, + "ce_loss_23": 2.9963975667953493, + "ce_loss_3": 4.066239619255066, + "ce_loss_6": 3.5964288830757143, + "epoch": 0.245, + "grad_norm": 960.0, + "kl_loss_12": 425.4798583984375, + "kl_loss_17": 120.27335968017579, + "kl_loss_3": 2308.929821777344, + "kl_loss_6": 1336.3246276855468, + "learning_rate": 0.0008672958543287666, + "loss": 1071.6658, + "step": 2450 + }, + { + "ce_loss_12": 3.2136458277702333, + "ce_loss_17": 3.070457136631012, + "ce_loss_23": 3.0044744491577147, + "ce_loss_3": 4.076639878749847, + "ce_loss_6": 3.60375052690506, + "epoch": 0.246, + "grad_norm": 1096.0, + "kl_loss_12": 434.4812072753906, + "kl_loss_17": 120.4621150970459, + "kl_loss_3": 2290.6869506835938, + "kl_loss_6": 1332.1749755859375, + "learning_rate": 0.0008662174410541554, + "loss": 1044.3947, + "step": 2460 + }, + { + "ce_loss_12": 3.176317536830902, + "ce_loss_17": 3.0331140756607056, + "ce_loss_23": 2.97104926109314, + "ce_loss_3": 4.0398347735404965, + "ce_loss_6": 3.5594708561897277, + "epoch": 0.247, + "grad_norm": 1048.0, + "kl_loss_12": 426.33206329345705, + "kl_loss_17": 114.9592658996582, + "kl_loss_3": 2284.038134765625, + "kl_loss_6": 1314.4482360839843, + "learning_rate": 0.0008651353399739787, + "loss": 1069.7281, + "step": 2470 + }, + { + "ce_loss_12": 3.209623324871063, + "ce_loss_17": 3.057310092449188, + "ce_loss_23": 2.994291377067566, + "ce_loss_3": 4.081614565849304, + "ce_loss_6": 3.5966670632362367, + "epoch": 0.248, + "grad_norm": 988.0, + "kl_loss_12": 431.0287292480469, + "kl_loss_17": 114.07895278930664, + "kl_loss_3": 2297.882537841797, + "kl_loss_6": 1327.6509521484375, + "learning_rate": 0.0008640495619849821, + "loss": 1048.9321, + "step": 2480 + }, + { + "ce_loss_12": 3.1695863127708437, + "ce_loss_17": 3.0206604480743406, + "ce_loss_23": 2.9585460782051087, + "ce_loss_3": 4.034311413764954, + "ce_loss_6": 3.5579578995704653, + "epoch": 0.249, + "grad_norm": 1240.0, + "kl_loss_12": 427.37953186035156, + "kl_loss_17": 114.5454231262207, + "kl_loss_3": 2294.754217529297, + "kl_loss_6": 1322.4585266113281, + "learning_rate": 0.0008629601180209381, + "loss": 1039.6441, + "step": 2490 + }, + { + "ce_loss_12": 3.1601105690002442, + "ce_loss_17": 3.0158230662345886, + "ce_loss_23": 2.9512540102005005, + "ce_loss_3": 4.029484963417053, + "ce_loss_6": 3.548910307884216, + "epoch": 0.25, + "grad_norm": 1072.0, + "kl_loss_12": 424.3945587158203, + "kl_loss_17": 114.85591850280761, + "kl_loss_3": 2280.948974609375, + "kl_loss_6": 1314.254864501953, + "learning_rate": 0.000861867019052535, + "loss": 1051.2812, + "step": 2500 + }, + { + "ce_loss_12": 3.0931338906288146, + "ce_loss_17": 2.941186046600342, + "ce_loss_23": 2.8769421696662905, + "ce_loss_3": 4.009210073947907, + "ce_loss_6": 3.507812964916229, + "epoch": 0.251, + "grad_norm": 940.0, + "kl_loss_12": 436.55210571289064, + "kl_loss_17": 114.65191955566407, + "kl_loss_3": 2379.95205078125, + "kl_loss_6": 1371.4806030273437, + "learning_rate": 0.0008607702760872678, + "loss": 1074.0451, + "step": 2510 + }, + { + "ce_loss_12": 3.188943064212799, + "ce_loss_17": 3.040559208393097, + "ce_loss_23": 2.979034185409546, + "ce_loss_3": 4.04866658449173, + "ce_loss_6": 3.579585540294647, + "epoch": 0.252, + "grad_norm": 1144.0, + "kl_loss_12": 425.6948501586914, + "kl_loss_17": 112.35841369628906, + "kl_loss_3": 2257.3027099609376, + "kl_loss_6": 1316.080010986328, + "learning_rate": 0.0008596699001693256, + "loss": 1060.0463, + "step": 2520 + }, + { + "ce_loss_12": 3.199018669128418, + "ce_loss_17": 3.056893455982208, + "ce_loss_23": 2.996263098716736, + "ce_loss_3": 4.052443480491638, + "ce_loss_6": 3.5751360654830933, + "epoch": 0.253, + "grad_norm": 996.0, + "kl_loss_12": 428.9387924194336, + "kl_loss_17": 111.91889839172363, + "kl_loss_3": 2277.7624450683593, + "kl_loss_6": 1303.7885803222657, + "learning_rate": 0.0008585659023794818, + "loss": 1060.6101, + "step": 2530 + }, + { + "ce_loss_12": 3.175670659542084, + "ce_loss_17": 3.021572208404541, + "ce_loss_23": 2.9581612825393675, + "ce_loss_3": 4.091776359081268, + "ce_loss_6": 3.5962265133857727, + "epoch": 0.254, + "grad_norm": 1424.0, + "kl_loss_12": 441.34724884033204, + "kl_loss_17": 116.35274925231934, + "kl_loss_3": 2388.9944702148437, + "kl_loss_6": 1388.9312561035156, + "learning_rate": 0.0008574582938349817, + "loss": 1065.3421, + "step": 2540 + }, + { + "ce_loss_12": 3.1620734691619874, + "ce_loss_17": 3.002826249599457, + "ce_loss_23": 2.9333348870277405, + "ce_loss_3": 4.068507182598114, + "ce_loss_6": 3.5735328078269957, + "epoch": 0.255, + "grad_norm": 900.0, + "kl_loss_12": 454.44627380371094, + "kl_loss_17": 121.72102546691895, + "kl_loss_3": 2384.1946350097655, + "kl_loss_6": 1385.3286499023438, + "learning_rate": 0.0008563470856894315, + "loss": 1052.7078, + "step": 2550 + }, + { + "ce_loss_12": 3.148736262321472, + "ce_loss_17": 2.999965786933899, + "ce_loss_23": 2.9350846290588377, + "ce_loss_3": 4.050228786468506, + "ce_loss_6": 3.5579028367996215, + "epoch": 0.256, + "grad_norm": 1012.0, + "kl_loss_12": 432.3045715332031, + "kl_loss_17": 114.02141952514648, + "kl_loss_3": 2349.2908142089846, + "kl_loss_6": 1367.2147155761718, + "learning_rate": 0.0008552322891326845, + "loss": 1056.7793, + "step": 2560 + }, + { + "ce_loss_12": 3.1203397512435913, + "ce_loss_17": 2.9703470706939696, + "ce_loss_23": 2.9060755014419555, + "ce_loss_3": 4.019045674800873, + "ce_loss_6": 3.527764308452606, + "epoch": 0.257, + "grad_norm": 952.0, + "kl_loss_12": 431.4568145751953, + "kl_loss_17": 113.82027130126953, + "kl_loss_3": 2349.3852966308596, + "kl_loss_6": 1351.4473205566405, + "learning_rate": 0.0008541139153907296, + "loss": 1045.5965, + "step": 2570 + }, + { + "ce_loss_12": 3.0819831013679506, + "ce_loss_17": 2.9337033152580263, + "ce_loss_23": 2.87243572473526, + "ce_loss_3": 3.977305507659912, + "ce_loss_6": 3.48627450466156, + "epoch": 0.258, + "grad_norm": 884.0, + "kl_loss_12": 425.45489501953125, + "kl_loss_17": 110.58322982788086, + "kl_loss_3": 2335.633154296875, + "kl_loss_6": 1345.2675415039062, + "learning_rate": 0.0008529919757255782, + "loss": 1064.3127, + "step": 2580 + }, + { + "ce_loss_12": 3.107268238067627, + "ce_loss_17": 2.967454993724823, + "ce_loss_23": 2.909093964099884, + "ce_loss_3": 3.9463349342346192, + "ce_loss_6": 3.485192346572876, + "epoch": 0.259, + "grad_norm": 988.0, + "kl_loss_12": 414.2021087646484, + "kl_loss_17": 111.3341178894043, + "kl_loss_3": 2240.2564697265625, + "kl_loss_6": 1291.5567199707032, + "learning_rate": 0.0008518664814351503, + "loss": 1027.255, + "step": 2590 + }, + { + "ce_loss_12": 3.0857400417327883, + "ce_loss_17": 2.9320381045341493, + "ce_loss_23": 2.8670764684677126, + "ce_loss_3": 3.991806662082672, + "ce_loss_6": 3.48983781337738, + "epoch": 0.26, + "grad_norm": 1064.0, + "kl_loss_12": 441.3775390625, + "kl_loss_17": 120.65605735778809, + "kl_loss_3": 2370.8674560546874, + "kl_loss_6": 1358.5048706054688, + "learning_rate": 0.0008507374438531607, + "loss": 1100.9932, + "step": 2600 + }, + { + "ce_loss_12": 3.0624029636383057, + "ce_loss_17": 2.9160149931907653, + "ce_loss_23": 2.855610120296478, + "ce_loss_3": 3.9515944004058836, + "ce_loss_6": 3.459726858139038, + "epoch": 0.261, + "grad_norm": 1072.0, + "kl_loss_12": 426.2549850463867, + "kl_loss_17": 115.3815818786621, + "kl_loss_3": 2323.017303466797, + "kl_loss_6": 1336.0733459472656, + "learning_rate": 0.0008496048743490053, + "loss": 1046.5982, + "step": 2610 + }, + { + "ce_loss_12": 3.2101902842521666, + "ce_loss_17": 3.0653091192245485, + "ce_loss_23": 3.000552237033844, + "ce_loss_3": 4.061716663837433, + "ce_loss_6": 3.59234459400177, + "epoch": 0.262, + "grad_norm": 824.0, + "kl_loss_12": 426.0384750366211, + "kl_loss_17": 115.67604293823243, + "kl_loss_3": 2261.2772827148438, + "kl_loss_6": 1312.9195495605468, + "learning_rate": 0.0008484687843276469, + "loss": 1040.8359, + "step": 2620 + }, + { + "ce_loss_12": 3.14266951084137, + "ce_loss_17": 2.99924293756485, + "ce_loss_23": 2.9332231998443605, + "ce_loss_3": 4.01922596693039, + "ce_loss_6": 3.536488151550293, + "epoch": 0.263, + "grad_norm": 1296.0, + "kl_loss_12": 436.6288177490234, + "kl_loss_17": 118.81113357543946, + "kl_loss_3": 2318.0344421386717, + "kl_loss_6": 1339.8821228027343, + "learning_rate": 0.0008473291852294987, + "loss": 1064.5347, + "step": 2630 + }, + { + "ce_loss_12": 3.1522043466567995, + "ce_loss_17": 3.0062794923782348, + "ce_loss_23": 2.938192343711853, + "ce_loss_3": 4.026766860485077, + "ce_loss_6": 3.545170855522156, + "epoch": 0.264, + "grad_norm": 1208.0, + "kl_loss_12": 429.8892822265625, + "kl_loss_17": 119.1082977294922, + "kl_loss_3": 2327.79921875, + "kl_loss_6": 1337.9722473144532, + "learning_rate": 0.0008461860885303114, + "loss": 1042.7791, + "step": 2640 + }, + { + "ce_loss_12": 3.1773787260055544, + "ce_loss_17": 3.032958376407623, + "ce_loss_23": 2.9696513175964356, + "ce_loss_3": 4.03357914686203, + "ce_loss_6": 3.5610399723052977, + "epoch": 0.265, + "grad_norm": 1304.0, + "kl_loss_12": 421.92885131835936, + "kl_loss_17": 116.15681076049805, + "kl_loss_3": 2260.550634765625, + "kl_loss_6": 1304.3487976074218, + "learning_rate": 0.000845039505741056, + "loss": 1044.8391, + "step": 2650 + }, + { + "ce_loss_12": 3.1606823325157167, + "ce_loss_17": 3.0121636748313905, + "ce_loss_23": 2.9492172598838806, + "ce_loss_3": 4.040087330341339, + "ce_loss_6": 3.5578425526618958, + "epoch": 0.266, + "grad_norm": 1168.0, + "kl_loss_12": 441.35157623291013, + "kl_loss_17": 117.9492961883545, + "kl_loss_3": 2351.2228820800783, + "kl_loss_6": 1366.394580078125, + "learning_rate": 0.0008438894484078086, + "loss": 1089.5979, + "step": 2660 + }, + { + "ce_loss_12": 3.1689205169677734, + "ce_loss_17": 3.0273826241493227, + "ce_loss_23": 2.9663585782051087, + "ce_loss_3": 4.033535480499268, + "ce_loss_6": 3.5576565623283387, + "epoch": 0.267, + "grad_norm": 968.0, + "kl_loss_12": 421.36059722900393, + "kl_loss_17": 116.55185966491699, + "kl_loss_3": 2282.9838684082033, + "kl_loss_6": 1319.715594482422, + "learning_rate": 0.0008427359281116334, + "loss": 1043.9596, + "step": 2670 + }, + { + "ce_loss_12": 3.0684268474578857, + "ce_loss_17": 2.9236847519874574, + "ce_loss_23": 2.860447108745575, + "ce_loss_3": 3.9806047320365905, + "ce_loss_6": 3.484599268436432, + "epoch": 0.268, + "grad_norm": 1128.0, + "kl_loss_12": 426.485041809082, + "kl_loss_17": 115.8799388885498, + "kl_loss_3": 2370.3961486816406, + "kl_loss_6": 1367.2152770996095, + "learning_rate": 0.0008415789564684673, + "loss": 1059.0766, + "step": 2680 + }, + { + "ce_loss_12": 3.29848473072052, + "ce_loss_17": 3.1556638956069945, + "ce_loss_23": 3.0902445912361145, + "ce_loss_3": 4.151888036727906, + "ce_loss_6": 3.6841147541999817, + "epoch": 0.269, + "grad_norm": 876.0, + "kl_loss_12": 427.7517349243164, + "kl_loss_17": 117.3643009185791, + "kl_loss_3": 2245.7098876953123, + "kl_loss_6": 1308.4502197265624, + "learning_rate": 0.0008404185451290017, + "loss": 1024.183, + "step": 2690 + }, + { + "ce_loss_12": 3.178904449939728, + "ce_loss_17": 3.034889113903046, + "ce_loss_23": 2.9719478368759153, + "ce_loss_3": 4.050006449222565, + "ce_loss_6": 3.565938127040863, + "epoch": 0.27, + "grad_norm": 940.0, + "kl_loss_12": 420.5887252807617, + "kl_loss_17": 113.05890274047852, + "kl_loss_3": 2283.9570190429686, + "kl_loss_6": 1309.367413330078, + "learning_rate": 0.0008392547057785661, + "loss": 1031.9732, + "step": 2700 + }, + { + "ce_loss_12": 3.1153995633125304, + "ce_loss_17": 2.967165696620941, + "ce_loss_23": 2.9075494527816774, + "ce_loss_3": 4.030291855335236, + "ce_loss_6": 3.5235714077949525, + "epoch": 0.271, + "grad_norm": 944.0, + "kl_loss_12": 431.7582672119141, + "kl_loss_17": 113.72574844360352, + "kl_loss_3": 2407.028039550781, + "kl_loss_6": 1371.627508544922, + "learning_rate": 0.0008380874501370098, + "loss": 1038.0814, + "step": 2710 + }, + { + "ce_loss_12": 3.107207548618317, + "ce_loss_17": 2.9570419549942017, + "ce_loss_23": 2.892813432216644, + "ce_loss_3": 4.019529783725739, + "ce_loss_6": 3.521112060546875, + "epoch": 0.272, + "grad_norm": 1032.0, + "kl_loss_12": 437.33069915771483, + "kl_loss_17": 114.94045181274414, + "kl_loss_3": 2386.276776123047, + "kl_loss_6": 1373.3397644042968, + "learning_rate": 0.0008369167899585841, + "loss": 1059.7808, + "step": 2720 + }, + { + "ce_loss_12": 3.21390700340271, + "ce_loss_17": 3.072514033317566, + "ce_loss_23": 3.011313247680664, + "ce_loss_3": 4.039605271816254, + "ce_loss_6": 3.5885566592216493, + "epoch": 0.273, + "grad_norm": 824.0, + "kl_loss_12": 416.95458068847654, + "kl_loss_17": 111.16670646667481, + "kl_loss_3": 2206.9629821777344, + "kl_loss_6": 1289.4851623535155, + "learning_rate": 0.0008357427370318238, + "loss": 1043.6662, + "step": 2730 + }, + { + "ce_loss_12": 3.170186769962311, + "ce_loss_17": 3.0258415222167967, + "ce_loss_23": 2.9628575444221497, + "ce_loss_3": 4.049800419807434, + "ce_loss_6": 3.5638309955596923, + "epoch": 0.274, + "grad_norm": 1020.0, + "kl_loss_12": 423.51794586181643, + "kl_loss_17": 110.81505393981934, + "kl_loss_3": 2313.176416015625, + "kl_loss_6": 1318.6596801757812, + "learning_rate": 0.0008345653031794292, + "loss": 1050.7198, + "step": 2740 + }, + { + "ce_loss_12": 3.1771156549453736, + "ce_loss_17": 3.0280372500419617, + "ce_loss_23": 2.965675616264343, + "ce_loss_3": 4.041014420986175, + "ce_loss_6": 3.56474187374115, + "epoch": 0.275, + "grad_norm": 1272.0, + "kl_loss_12": 423.6371154785156, + "kl_loss_17": 111.59429397583008, + "kl_loss_3": 2275.3608947753905, + "kl_loss_6": 1306.164373779297, + "learning_rate": 0.0008333845002581458, + "loss": 1033.1295, + "step": 2750 + }, + { + "ce_loss_12": 3.1093812108039858, + "ce_loss_17": 2.958666718006134, + "ce_loss_23": 2.8973594903945923, + "ce_loss_3": 4.003350925445557, + "ce_loss_6": 3.521108365058899, + "epoch": 0.276, + "grad_norm": 868.0, + "kl_loss_12": 435.3954574584961, + "kl_loss_17": 113.91388969421386, + "kl_loss_3": 2364.9137329101563, + "kl_loss_6": 1371.7270202636719, + "learning_rate": 0.0008322003401586462, + "loss": 1063.1645, + "step": 2760 + }, + { + "ce_loss_12": 3.1297270178794863, + "ce_loss_17": 2.990075874328613, + "ce_loss_23": 2.9308247327804566, + "ce_loss_3": 3.977475678920746, + "ce_loss_6": 3.5078927755355833, + "epoch": 0.277, + "grad_norm": 992.0, + "kl_loss_12": 408.00756683349607, + "kl_loss_17": 106.93959312438965, + "kl_loss_3": 2226.0518310546877, + "kl_loss_6": 1278.1108642578124, + "learning_rate": 0.0008310128348054094, + "loss": 995.3268, + "step": 2770 + }, + { + "ce_loss_12": 3.109842586517334, + "ce_loss_17": 2.965323638916016, + "ce_loss_23": 2.905602777004242, + "ce_loss_3": 3.9729790091514587, + "ce_loss_6": 3.4934093952178955, + "epoch": 0.278, + "grad_norm": 1012.0, + "kl_loss_12": 418.95628051757814, + "kl_loss_17": 108.78140983581542, + "kl_loss_3": 2277.008166503906, + "kl_loss_6": 1308.7492614746093, + "learning_rate": 0.0008298219961566008, + "loss": 1030.3977, + "step": 2780 + }, + { + "ce_loss_12": 3.0712546944618224, + "ce_loss_17": 2.926204741001129, + "ce_loss_23": 2.8665321707725524, + "ce_loss_3": 3.9693758845329286, + "ce_loss_6": 3.480190944671631, + "epoch": 0.279, + "grad_norm": 1304.0, + "kl_loss_12": 432.71239166259767, + "kl_loss_17": 110.66368026733399, + "kl_loss_3": 2384.9975463867186, + "kl_loss_6": 1379.5857421875, + "learning_rate": 0.0008286278362039527, + "loss": 1040.0875, + "step": 2790 + }, + { + "ce_loss_12": 3.099072754383087, + "ce_loss_17": 2.9543874859809875, + "ce_loss_23": 2.8932689309120176, + "ce_loss_3": 4.00441962480545, + "ce_loss_6": 3.516203057765961, + "epoch": 0.28, + "grad_norm": 1024.0, + "kl_loss_12": 423.47198181152345, + "kl_loss_17": 112.17585258483886, + "kl_loss_3": 2383.3101196289062, + "kl_loss_6": 1379.0210693359375, + "learning_rate": 0.0008274303669726426, + "loss": 1041.7575, + "step": 2800 + }, + { + "ce_loss_12": 3.018801951408386, + "ce_loss_17": 2.870440900325775, + "ce_loss_23": 2.8090245962142943, + "ce_loss_3": 3.937907099723816, + "ce_loss_6": 3.436682391166687, + "epoch": 0.281, + "grad_norm": 848.0, + "kl_loss_12": 422.47137451171875, + "kl_loss_17": 111.5949893951416, + "kl_loss_3": 2397.2886169433596, + "kl_loss_6": 1368.81572265625, + "learning_rate": 0.0008262296005211721, + "loss": 1038.3342, + "step": 2810 + }, + { + "ce_loss_12": 3.1327635169029238, + "ce_loss_17": 2.9874435663223267, + "ce_loss_23": 2.925761067867279, + "ce_loss_3": 4.023141646385193, + "ce_loss_6": 3.5383160591125487, + "epoch": 0.282, + "grad_norm": 1064.0, + "kl_loss_12": 425.43332824707034, + "kl_loss_17": 112.3005428314209, + "kl_loss_3": 2330.1076599121093, + "kl_loss_6": 1347.2308227539063, + "learning_rate": 0.0008250255489412463, + "loss": 1035.3857, + "step": 2820 + }, + { + "ce_loss_12": 3.2296959280967714, + "ce_loss_17": 3.0806389093399047, + "ce_loss_23": 3.019387435913086, + "ce_loss_3": 4.097268283367157, + "ce_loss_6": 3.613387179374695, + "epoch": 0.283, + "grad_norm": 952.0, + "kl_loss_12": 422.14527893066406, + "kl_loss_17": 112.8663158416748, + "kl_loss_3": 2287.187646484375, + "kl_loss_6": 1308.6017761230469, + "learning_rate": 0.0008238182243576511, + "loss": 1037.0014, + "step": 2830 + }, + { + "ce_loss_12": 3.187533068656921, + "ce_loss_17": 3.0509860515594482, + "ce_loss_23": 2.9890125155448914, + "ce_loss_3": 3.9914371371269226, + "ce_loss_6": 3.5471837282180787, + "epoch": 0.284, + "grad_norm": 1072.0, + "kl_loss_12": 410.6913238525391, + "kl_loss_17": 115.82479705810547, + "kl_loss_3": 2155.456982421875, + "kl_loss_6": 1247.588641357422, + "learning_rate": 0.0008226076389281315, + "loss": 1002.2791, + "step": 2840 + }, + { + "ce_loss_12": 3.2360986948013304, + "ce_loss_17": 3.095022213459015, + "ce_loss_23": 3.0317818760871886, + "ce_loss_3": 4.068842041492462, + "ce_loss_6": 3.6044496297836304, + "epoch": 0.285, + "grad_norm": 1072.0, + "kl_loss_12": 420.0049194335937, + "kl_loss_17": 118.48114700317383, + "kl_loss_3": 2255.8744262695313, + "kl_loss_6": 1300.3296691894532, + "learning_rate": 0.0008213938048432696, + "loss": 1004.6877, + "step": 2850 + }, + { + "ce_loss_12": 3.1623266458511354, + "ce_loss_17": 3.021411645412445, + "ce_loss_23": 2.9557711124420165, + "ce_loss_3": 4.008575391769409, + "ce_loss_6": 3.5391808271408083, + "epoch": 0.286, + "grad_norm": 964.0, + "kl_loss_12": 424.7983062744141, + "kl_loss_17": 118.95002555847168, + "kl_loss_3": 2251.5058654785157, + "kl_loss_6": 1290.7272216796875, + "learning_rate": 0.0008201767343263612, + "loss": 1030.3887, + "step": 2860 + }, + { + "ce_loss_12": 3.106607210636139, + "ce_loss_17": 2.961516261100769, + "ce_loss_23": 2.8994472503662108, + "ce_loss_3": 3.9940276861190798, + "ce_loss_6": 3.5045013189315797, + "epoch": 0.287, + "grad_norm": 1040.0, + "kl_loss_12": 418.3986389160156, + "kl_loss_17": 114.02130393981933, + "kl_loss_3": 2321.1509399414062, + "kl_loss_6": 1338.0280334472657, + "learning_rate": 0.0008189564396332927, + "loss": 1011.8268, + "step": 2870 + }, + { + "ce_loss_12": 3.0934667229652404, + "ce_loss_17": 2.956806683540344, + "ce_loss_23": 2.8943321824073793, + "ce_loss_3": 3.9781769156455993, + "ce_loss_6": 3.4973119139671325, + "epoch": 0.288, + "grad_norm": 1064.0, + "kl_loss_12": 411.86312408447264, + "kl_loss_17": 113.3759391784668, + "kl_loss_3": 2299.268927001953, + "kl_loss_6": 1324.3412292480468, + "learning_rate": 0.0008177329330524181, + "loss": 1036.632, + "step": 2880 + }, + { + "ce_loss_12": 3.1487688541412355, + "ce_loss_17": 3.0031996607780456, + "ce_loss_23": 2.940968418121338, + "ce_loss_3": 3.9927698016166686, + "ce_loss_6": 3.5189738869667053, + "epoch": 0.289, + "grad_norm": 904.0, + "kl_loss_12": 412.4456451416016, + "kl_loss_17": 113.02369689941406, + "kl_loss_3": 2214.7519104003904, + "kl_loss_6": 1272.6784545898438, + "learning_rate": 0.0008165062269044352, + "loss": 1015.1071, + "step": 2890 + }, + { + "ce_loss_12": 3.1071110010147094, + "ce_loss_17": 2.9628792285919188, + "ce_loss_23": 2.9007930755615234, + "ce_loss_3": 3.983273184299469, + "ce_loss_6": 3.494569385051727, + "epoch": 0.29, + "grad_norm": 956.0, + "kl_loss_12": 430.44505767822267, + "kl_loss_17": 114.5558017730713, + "kl_loss_3": 2310.420068359375, + "kl_loss_6": 1315.7907348632812, + "learning_rate": 0.0008152763335422613, + "loss": 1043.0416, + "step": 2900 + }, + { + "ce_loss_12": 3.0977216720581056, + "ce_loss_17": 2.9489872455596924, + "ce_loss_23": 2.8858672261238096, + "ce_loss_3": 3.9666316509246826, + "ce_loss_6": 3.4889026165008543, + "epoch": 0.291, + "grad_norm": 1208.0, + "kl_loss_12": 426.3074234008789, + "kl_loss_17": 115.46015243530273, + "kl_loss_3": 2301.8380798339845, + "kl_loss_6": 1327.6948974609375, + "learning_rate": 0.0008140432653509088, + "loss": 1028.4336, + "step": 2910 + }, + { + "ce_loss_12": 3.1420980095863342, + "ce_loss_17": 2.9944337725639345, + "ce_loss_23": 2.9315978407859804, + "ce_loss_3": 3.993865358829498, + "ce_loss_6": 3.530861973762512, + "epoch": 0.292, + "grad_norm": 1232.0, + "kl_loss_12": 428.00020904541014, + "kl_loss_17": 113.76680603027344, + "kl_loss_3": 2282.623883056641, + "kl_loss_6": 1311.62802734375, + "learning_rate": 0.0008128070347473608, + "loss": 1026.46, + "step": 2920 + }, + { + "ce_loss_12": 3.1560147762298585, + "ce_loss_17": 3.0118520140647886, + "ce_loss_23": 2.948830020427704, + "ce_loss_3": 4.042136788368225, + "ce_loss_6": 3.5510075807571413, + "epoch": 0.293, + "grad_norm": 1064.0, + "kl_loss_12": 429.2475128173828, + "kl_loss_17": 113.55529251098633, + "kl_loss_3": 2338.4506896972657, + "kl_loss_6": 1337.592578125, + "learning_rate": 0.0008115676541804455, + "loss": 1029.9361, + "step": 2930 + }, + { + "ce_loss_12": 3.1561770677566527, + "ce_loss_17": 3.0117817163467406, + "ce_loss_23": 2.950511121749878, + "ce_loss_3": 3.995435917377472, + "ce_loss_6": 3.5341237783432007, + "epoch": 0.294, + "grad_norm": 932.0, + "kl_loss_12": 416.59341583251955, + "kl_loss_17": 111.01059761047364, + "kl_loss_3": 2253.631555175781, + "kl_loss_6": 1306.9321044921876, + "learning_rate": 0.0008103251361307119, + "loss": 1031.5532, + "step": 2940 + }, + { + "ce_loss_12": 3.1843759298324583, + "ce_loss_17": 3.038155961036682, + "ce_loss_23": 2.9773050785064696, + "ce_loss_3": 4.043914568424225, + "ce_loss_6": 3.5680703282356263, + "epoch": 0.295, + "grad_norm": 1224.0, + "kl_loss_12": 420.00162506103516, + "kl_loss_17": 111.82075843811035, + "kl_loss_3": 2276.9667785644533, + "kl_loss_6": 1320.3095397949219, + "learning_rate": 0.0008090794931103026, + "loss": 1022.3766, + "step": 2950 + }, + { + "ce_loss_12": 3.154700481891632, + "ce_loss_17": 3.0144895553588866, + "ce_loss_23": 2.9569777607917787, + "ce_loss_3": 4.001096451282502, + "ce_loss_6": 3.536224627494812, + "epoch": 0.296, + "grad_norm": 1128.0, + "kl_loss_12": 408.6188430786133, + "kl_loss_17": 106.91668930053712, + "kl_loss_3": 2230.6013305664064, + "kl_loss_6": 1284.477362060547, + "learning_rate": 0.0008078307376628291, + "loss": 1015.5539, + "step": 2960 + }, + { + "ce_loss_12": 3.216846990585327, + "ce_loss_17": 3.07712641954422, + "ce_loss_23": 3.020001995563507, + "ce_loss_3": 4.0298261642456055, + "ce_loss_6": 3.5871922373771667, + "epoch": 0.297, + "grad_norm": 1056.0, + "kl_loss_12": 403.0998001098633, + "kl_loss_17": 105.29769783020019, + "kl_loss_3": 2154.7777099609375, + "kl_loss_6": 1254.5455261230468, + "learning_rate": 0.000806578882363245, + "loss": 982.7731, + "step": 2970 + }, + { + "ce_loss_12": 3.1353084087371825, + "ce_loss_17": 2.9982158064842226, + "ce_loss_23": 2.9405212998390198, + "ce_loss_3": 3.9612733006477354, + "ce_loss_6": 3.5113443732261658, + "epoch": 0.298, + "grad_norm": 1040.0, + "kl_loss_12": 405.961865234375, + "kl_loss_17": 105.14496688842773, + "kl_loss_3": 2208.7371337890627, + "kl_loss_6": 1287.3426086425782, + "learning_rate": 0.0008053239398177191, + "loss": 1026.2143, + "step": 2980 + }, + { + "ce_loss_12": 3.1283560276031492, + "ce_loss_17": 2.9837308287620545, + "ce_loss_23": 2.9255617380142214, + "ce_loss_3": 3.985158312320709, + "ce_loss_6": 3.516181480884552, + "epoch": 0.299, + "grad_norm": 1184.0, + "kl_loss_12": 409.7952484130859, + "kl_loss_17": 106.93139038085937, + "kl_loss_3": 2250.543109130859, + "kl_loss_6": 1297.1369995117188, + "learning_rate": 0.0008040659226635089, + "loss": 1039.9756, + "step": 2990 + }, + { + "ce_loss_12": 3.2462770462036135, + "ce_loss_17": 3.0983072400093077, + "ce_loss_23": 3.034535789489746, + "ce_loss_3": 4.075495111942291, + "ce_loss_6": 3.623253059387207, + "epoch": 0.3, + "grad_norm": 1208.0, + "kl_loss_12": 430.03038177490237, + "kl_loss_17": 115.25655326843261, + "kl_loss_3": 2238.4564697265623, + "kl_loss_6": 1295.3731628417968, + "learning_rate": 0.0008028048435688333, + "loss": 1009.2599, + "step": 3000 + }, + { + "ce_loss_12": 3.1294857978820803, + "ce_loss_17": 2.9839457511901855, + "ce_loss_23": 2.9239335656166077, + "ce_loss_3": 4.003237676620484, + "ce_loss_6": 3.5273142576217653, + "epoch": 0.301, + "grad_norm": 1088.0, + "kl_loss_12": 417.4100051879883, + "kl_loss_17": 108.43363456726074, + "kl_loss_3": 2304.9576171875, + "kl_loss_6": 1328.046453857422, + "learning_rate": 0.0008015407152327448, + "loss": 1026.3252, + "step": 3010 + }, + { + "ce_loss_12": 3.164993369579315, + "ce_loss_17": 3.02325359582901, + "ce_loss_23": 2.9631327986717224, + "ce_loss_3": 4.018820178508759, + "ce_loss_6": 3.544352960586548, + "epoch": 0.302, + "grad_norm": 1016.0, + "kl_loss_12": 413.7855987548828, + "kl_loss_17": 110.33978080749512, + "kl_loss_3": 2281.8075317382813, + "kl_loss_6": 1308.32470703125, + "learning_rate": 0.0008002735503850016, + "loss": 1024.3284, + "step": 3020 + }, + { + "ce_loss_12": 3.0686097145080566, + "ce_loss_17": 2.923471248149872, + "ce_loss_23": 2.862740170955658, + "ce_loss_3": 3.9519243001937867, + "ce_loss_6": 3.4707289814949034, + "epoch": 0.303, + "grad_norm": 1176.0, + "kl_loss_12": 420.33300933837893, + "kl_loss_17": 111.22417945861817, + "kl_loss_3": 2320.516729736328, + "kl_loss_6": 1341.770770263672, + "learning_rate": 0.0007990033617859396, + "loss": 1043.5717, + "step": 3030 + }, + { + "ce_loss_12": 3.1193864822387694, + "ce_loss_17": 2.9834821939468386, + "ce_loss_23": 2.918761098384857, + "ce_loss_3": 3.9739678502082825, + "ce_loss_6": 3.4987839579582216, + "epoch": 0.304, + "grad_norm": 1024.0, + "kl_loss_12": 407.8907638549805, + "kl_loss_17": 115.28406105041503, + "kl_loss_3": 2240.3181579589846, + "kl_loss_6": 1293.2356872558594, + "learning_rate": 0.000797730162226344, + "loss": 991.3312, + "step": 3040 + }, + { + "ce_loss_12": 3.1359513640403747, + "ce_loss_17": 2.9954004645347596, + "ce_loss_23": 2.9337677240371702, + "ce_loss_3": 3.989997386932373, + "ce_loss_6": 3.5289689540863036, + "epoch": 0.305, + "grad_norm": 908.0, + "kl_loss_12": 412.1574142456055, + "kl_loss_17": 116.06979484558106, + "kl_loss_3": 2259.043005371094, + "kl_loss_6": 1315.9446960449218, + "learning_rate": 0.0007964539645273203, + "loss": 1015.7215, + "step": 3050 + }, + { + "ce_loss_12": 3.1464040160179136, + "ce_loss_17": 3.0107370972633363, + "ce_loss_23": 2.9521604895591738, + "ce_loss_3": 3.9734048008918763, + "ce_loss_6": 3.518921947479248, + "epoch": 0.306, + "grad_norm": 1192.0, + "kl_loss_12": 399.71173400878905, + "kl_loss_17": 107.98187713623047, + "kl_loss_3": 2186.635870361328, + "kl_loss_6": 1261.9838256835938, + "learning_rate": 0.000795174781540165, + "loss": 1006.6209, + "step": 3060 + }, + { + "ce_loss_12": 3.2190658688545226, + "ce_loss_17": 3.0822181344032287, + "ce_loss_23": 3.0252692461013795, + "ce_loss_3": 4.028954362869262, + "ce_loss_6": 3.583014357089996, + "epoch": 0.307, + "grad_norm": 1520.0, + "kl_loss_12": 402.89805297851564, + "kl_loss_17": 109.53241271972657, + "kl_loss_3": 2168.199774169922, + "kl_loss_6": 1259.1837097167968, + "learning_rate": 0.0007938926261462366, + "loss": 1009.859, + "step": 3070 + }, + { + "ce_loss_12": 3.1720401525497435, + "ce_loss_17": 3.0377658724784853, + "ce_loss_23": 2.975878930091858, + "ce_loss_3": 3.985373318195343, + "ce_loss_6": 3.5269261479377745, + "epoch": 0.308, + "grad_norm": 1296.0, + "kl_loss_12": 408.71085662841796, + "kl_loss_17": 110.68863906860352, + "kl_loss_3": 2193.952880859375, + "kl_loss_6": 1263.3701293945312, + "learning_rate": 0.0007926075112568258, + "loss": 1018.659, + "step": 3080 + }, + { + "ce_loss_12": 3.166626286506653, + "ce_loss_17": 3.029184174537659, + "ce_loss_23": 2.9667676091194153, + "ce_loss_3": 4.016194677352905, + "ce_loss_6": 3.544252622127533, + "epoch": 0.309, + "grad_norm": 908.0, + "kl_loss_12": 411.1470260620117, + "kl_loss_17": 113.14122123718262, + "kl_loss_3": 2241.7541931152346, + "kl_loss_6": 1290.6083190917968, + "learning_rate": 0.0007913194498130252, + "loss": 998.1091, + "step": 3090 + }, + { + "ce_loss_12": 3.1077103972435, + "ce_loss_17": 2.9612428307533265, + "ce_loss_23": 2.899013900756836, + "ce_loss_3": 3.96414829492569, + "ce_loss_6": 3.490557086467743, + "epoch": 0.31, + "grad_norm": 1176.0, + "kl_loss_12": 415.7887680053711, + "kl_loss_17": 116.05563888549804, + "kl_loss_3": 2242.3578491210938, + "kl_loss_6": 1294.021533203125, + "learning_rate": 0.0007900284547855992, + "loss": 1022.7701, + "step": 3100 + }, + { + "ce_loss_12": 3.1130651116371153, + "ce_loss_17": 2.9763598322868345, + "ce_loss_23": 2.9147239446640016, + "ce_loss_3": 3.942672574520111, + "ce_loss_6": 3.481655478477478, + "epoch": 0.311, + "grad_norm": 1136.0, + "kl_loss_12": 406.75770263671876, + "kl_loss_17": 111.87238311767578, + "kl_loss_3": 2207.929296875, + "kl_loss_6": 1275.2953369140625, + "learning_rate": 0.0007887345391748532, + "loss": 1026.1137, + "step": 3110 + }, + { + "ce_loss_12": 3.2158465147018434, + "ce_loss_17": 3.081123912334442, + "ce_loss_23": 3.02043753862381, + "ce_loss_3": 4.023183381557464, + "ce_loss_6": 3.572895038127899, + "epoch": 0.312, + "grad_norm": 1136.0, + "kl_loss_12": 398.6086669921875, + "kl_loss_17": 112.47258186340332, + "kl_loss_3": 2146.368359375, + "kl_loss_6": 1239.7026672363281, + "learning_rate": 0.0007874377160105036, + "loss": 972.8377, + "step": 3120 + }, + { + "ce_loss_12": 3.127926063537598, + "ce_loss_17": 2.9957722187042237, + "ce_loss_23": 2.934756410121918, + "ce_loss_3": 4.002281177043915, + "ce_loss_6": 3.5005795121192933, + "epoch": 0.313, + "grad_norm": 912.0, + "kl_loss_12": 396.9777374267578, + "kl_loss_17": 111.40082015991212, + "kl_loss_3": 2294.8144287109376, + "kl_loss_6": 1272.8683288574218, + "learning_rate": 0.0007861379983515449, + "loss": 1040.7336, + "step": 3130 + }, + { + "ce_loss_12": 3.2019168853759767, + "ce_loss_17": 3.067536175251007, + "ce_loss_23": 3.005921757221222, + "ce_loss_3": 4.045781588554382, + "ce_loss_6": 3.590694487094879, + "epoch": 0.314, + "grad_norm": 1004.0, + "kl_loss_12": 407.66050109863284, + "kl_loss_17": 112.10332107543945, + "kl_loss_3": 2232.9395751953125, + "kl_loss_6": 1292.6397766113282, + "learning_rate": 0.0007848353992861195, + "loss": 1001.023, + "step": 3140 + }, + { + "ce_loss_12": 3.2831211566925047, + "ce_loss_17": 3.1478240966796873, + "ce_loss_23": 3.074808359146118, + "ce_loss_3": 4.130173242092132, + "ce_loss_6": 3.669738221168518, + "epoch": 0.315, + "grad_norm": 1056.0, + "kl_loss_12": 420.8632064819336, + "kl_loss_17": 134.64935264587402, + "kl_loss_3": 2235.048028564453, + "kl_loss_6": 1306.2391418457032, + "learning_rate": 0.0007835299319313853, + "loss": 1029.2203, + "step": 3150 + }, + { + "ce_loss_12": 3.1660799860954283, + "ce_loss_17": 3.03582626581192, + "ce_loss_23": 2.974584233760834, + "ce_loss_3": 3.981108212471008, + "ce_loss_6": 3.528446316719055, + "epoch": 0.316, + "grad_norm": 1144.0, + "kl_loss_12": 403.32799835205077, + "kl_loss_17": 116.29181365966797, + "kl_loss_3": 2173.312121582031, + "kl_loss_6": 1250.9722595214844, + "learning_rate": 0.0007822216094333848, + "loss": 1024.1848, + "step": 3160 + }, + { + "ce_loss_12": 3.18140105009079, + "ce_loss_17": 3.042064738273621, + "ce_loss_23": 2.979595422744751, + "ce_loss_3": 4.035410952568054, + "ce_loss_6": 3.564517879486084, + "epoch": 0.317, + "grad_norm": 848.0, + "kl_loss_12": 412.0792602539062, + "kl_loss_17": 119.08126792907714, + "kl_loss_3": 2249.047265625, + "kl_loss_6": 1287.4177551269531, + "learning_rate": 0.0007809104449669101, + "loss": 1006.7281, + "step": 3170 + }, + { + "ce_loss_12": 3.1243847727775576, + "ce_loss_17": 2.987361800670624, + "ce_loss_23": 2.92877117395401, + "ce_loss_3": 3.943815493583679, + "ce_loss_6": 3.4851326823234556, + "epoch": 0.318, + "grad_norm": 1008.0, + "kl_loss_12": 405.4247024536133, + "kl_loss_17": 112.87823867797852, + "kl_loss_3": 2179.9481811523438, + "kl_loss_6": 1255.689697265625, + "learning_rate": 0.0007795964517353734, + "loss": 994.6248, + "step": 3180 + }, + { + "ce_loss_12": 3.126452827453613, + "ce_loss_17": 2.989318549633026, + "ce_loss_23": 2.9306680083274843, + "ce_loss_3": 3.9847754001617433, + "ce_loss_6": 3.5115119576454163, + "epoch": 0.319, + "grad_norm": 1104.0, + "kl_loss_12": 416.9341049194336, + "kl_loss_17": 116.0937271118164, + "kl_loss_3": 2259.7885925292967, + "kl_loss_6": 1294.2023132324218, + "learning_rate": 0.000778279642970672, + "loss": 991.9023, + "step": 3190 + }, + { + "ce_loss_12": 3.130779039859772, + "ce_loss_17": 2.993109381198883, + "ce_loss_23": 2.9349839210510256, + "ce_loss_3": 3.9459033608436584, + "ce_loss_6": 3.4922481060028074, + "epoch": 0.32, + "grad_norm": 1144.0, + "kl_loss_12": 412.4380615234375, + "kl_loss_17": 111.97399520874023, + "kl_loss_3": 2188.5126892089843, + "kl_loss_6": 1262.0484252929687, + "learning_rate": 0.0007769600319330552, + "loss": 982.9381, + "step": 3200 + }, + { + "ce_loss_12": 3.1565820813179015, + "ce_loss_17": 3.0179872632026674, + "ce_loss_23": 2.9545223474502564, + "ce_loss_3": 4.0231255173683165, + "ce_loss_6": 3.5447113275527955, + "epoch": 0.321, + "grad_norm": 1384.0, + "kl_loss_12": 412.7830841064453, + "kl_loss_17": 114.25502891540528, + "kl_loss_3": 2265.739636230469, + "kl_loss_6": 1294.0114990234374, + "learning_rate": 0.0007756376319109917, + "loss": 1008.1775, + "step": 3210 + }, + { + "ce_loss_12": 3.1978549718856812, + "ce_loss_17": 3.056639587879181, + "ce_loss_23": 2.9975166797637938, + "ce_loss_3": 4.011419260501862, + "ce_loss_6": 3.5697445034980775, + "epoch": 0.322, + "grad_norm": 980.0, + "kl_loss_12": 413.4813034057617, + "kl_loss_17": 111.76018180847169, + "kl_loss_3": 2179.5991760253905, + "kl_loss_6": 1270.4315612792968, + "learning_rate": 0.0007743124562210351, + "loss": 977.0854, + "step": 3220 + }, + { + "ce_loss_12": 3.214629316329956, + "ce_loss_17": 3.075866627693176, + "ce_loss_23": 3.0133888483047486, + "ce_loss_3": 4.0257228255271915, + "ce_loss_6": 3.5786707997322083, + "epoch": 0.323, + "grad_norm": 1160.0, + "kl_loss_12": 407.91126861572263, + "kl_loss_17": 111.01350746154785, + "kl_loss_3": 2191.774792480469, + "kl_loss_6": 1270.8741333007813, + "learning_rate": 0.0007729845182076895, + "loss": 997.8597, + "step": 3230 + }, + { + "ce_loss_12": 3.139386999607086, + "ce_loss_17": 3.0055894374847414, + "ce_loss_23": 2.949425721168518, + "ce_loss_3": 3.9491932153701783, + "ce_loss_6": 3.50347980260849, + "epoch": 0.324, + "grad_norm": 1216.0, + "kl_loss_12": 401.8107543945313, + "kl_loss_17": 105.48669738769532, + "kl_loss_3": 2156.247454833984, + "kl_loss_6": 1246.0010620117187, + "learning_rate": 0.0007716538312432765, + "loss": 1005.3651, + "step": 3240 + }, + { + "ce_loss_12": 3.1097494840621946, + "ce_loss_17": 2.9684781432151794, + "ce_loss_23": 2.9068726897239685, + "ce_loss_3": 3.9634464383125305, + "ce_loss_6": 3.4876855969429017, + "epoch": 0.325, + "grad_norm": 944.0, + "kl_loss_12": 412.408186340332, + "kl_loss_17": 113.86479721069335, + "kl_loss_3": 2249.631207275391, + "kl_loss_6": 1285.4653747558593, + "learning_rate": 0.0007703204087277988, + "loss": 1014.0202, + "step": 3250 + }, + { + "ce_loss_12": 3.195712912082672, + "ce_loss_17": 3.066792869567871, + "ce_loss_23": 3.0043872356414796, + "ce_loss_3": 3.9889792919158937, + "ce_loss_6": 3.5458029985427855, + "epoch": 0.326, + "grad_norm": 1232.0, + "kl_loss_12": 389.23892669677736, + "kl_loss_17": 119.9925926208496, + "kl_loss_3": 2122.7849975585937, + "kl_loss_6": 1216.2618896484375, + "learning_rate": 0.0007689842640888063, + "loss": 977.3232, + "step": 3260 + }, + { + "ce_loss_12": 3.1939032673835754, + "ce_loss_17": 3.067763018608093, + "ce_loss_23": 2.9996448755264282, + "ce_loss_3": 4.016518783569336, + "ce_loss_6": 3.563250422477722, + "epoch": 0.327, + "grad_norm": 1096.0, + "kl_loss_12": 402.7244369506836, + "kl_loss_17": 128.78552703857423, + "kl_loss_3": 2164.721325683594, + "kl_loss_6": 1253.0573181152345, + "learning_rate": 0.0007676454107812607, + "loss": 996.1429, + "step": 3270 + }, + { + "ce_loss_12": 3.135709273815155, + "ce_loss_17": 3.0018914699554444, + "ce_loss_23": 2.9393561482429504, + "ce_loss_3": 3.9782461881637574, + "ce_loss_6": 3.510867726802826, + "epoch": 0.328, + "grad_norm": 1224.0, + "kl_loss_12": 409.4145446777344, + "kl_loss_17": 121.70905494689941, + "kl_loss_3": 2225.8669372558593, + "kl_loss_6": 1269.7090148925781, + "learning_rate": 0.0007663038622873999, + "loss": 993.0643, + "step": 3280 + }, + { + "ce_loss_12": 3.175209379196167, + "ce_loss_17": 3.0521249651908873, + "ce_loss_23": 2.9861778616905212, + "ce_loss_3": 4.006668412685395, + "ce_loss_6": 3.546959364414215, + "epoch": 0.329, + "grad_norm": 1040.0, + "kl_loss_12": 399.50186309814455, + "kl_loss_17": 125.75395736694335, + "kl_loss_3": 2192.2456604003905, + "kl_loss_6": 1260.66640625, + "learning_rate": 0.0007649596321166025, + "loss": 980.2414, + "step": 3290 + }, + { + "ce_loss_12": 3.0792182087898254, + "ce_loss_17": 2.95730984210968, + "ce_loss_23": 2.8904613494873046, + "ce_loss_3": 3.8960200428962706, + "ce_loss_6": 3.4497509241104125, + "epoch": 0.33, + "grad_norm": 1128.0, + "kl_loss_12": 392.9219116210937, + "kl_loss_17": 117.4855842590332, + "kl_loss_3": 2138.7407165527343, + "kl_loss_6": 1245.0063598632812, + "learning_rate": 0.0007636127338052513, + "loss": 985.1443, + "step": 3300 + }, + { + "ce_loss_12": 3.1815504312515257, + "ce_loss_17": 3.0464391469955445, + "ce_loss_23": 2.982821536064148, + "ce_loss_3": 4.0343057513237, + "ce_loss_6": 3.563375544548035, + "epoch": 0.331, + "grad_norm": 848.0, + "kl_loss_12": 407.66025695800784, + "kl_loss_17": 122.59229011535645, + "kl_loss_3": 2252.943640136719, + "kl_loss_6": 1288.318182373047, + "learning_rate": 0.0007622631809165971, + "loss": 996.1619, + "step": 3310 + }, + { + "ce_loss_12": 3.1675572395324707, + "ce_loss_17": 3.047197496891022, + "ce_loss_23": 2.9831395506858827, + "ce_loss_3": 3.9600903511047365, + "ce_loss_6": 3.5189494490623474, + "epoch": 0.332, + "grad_norm": 1160.0, + "kl_loss_12": 377.3573394775391, + "kl_loss_17": 111.5566593170166, + "kl_loss_3": 2082.12412109375, + "kl_loss_6": 1190.3214294433594, + "learning_rate": 0.000760910987040623, + "loss": 962.3131, + "step": 3320 + }, + { + "ce_loss_12": 3.166386032104492, + "ce_loss_17": 3.025036633014679, + "ce_loss_23": 2.9654740929603576, + "ce_loss_3": 4.026007843017578, + "ce_loss_6": 3.5486589670181274, + "epoch": 0.333, + "grad_norm": 908.0, + "kl_loss_12": 409.37377166748047, + "kl_loss_17": 114.46134300231934, + "kl_loss_3": 2264.3786010742188, + "kl_loss_6": 1299.5189331054687, + "learning_rate": 0.000759556165793906, + "loss": 991.0748, + "step": 3330 + }, + { + "ce_loss_12": 3.175990545749664, + "ce_loss_17": 3.0406521558761597, + "ce_loss_23": 2.9793988823890687, + "ce_loss_3": 4.003675639629364, + "ce_loss_6": 3.5533421277999877, + "epoch": 0.334, + "grad_norm": 728.0, + "kl_loss_12": 401.5468505859375, + "kl_loss_17": 109.53638153076172, + "kl_loss_3": 2201.6816833496096, + "kl_loss_6": 1274.5880859375, + "learning_rate": 0.000758198730819481, + "loss": 1004.2113, + "step": 3340 + }, + { + "ce_loss_12": 3.136075222492218, + "ce_loss_17": 3.002777171134949, + "ce_loss_23": 2.947541129589081, + "ce_loss_3": 3.9716083288192747, + "ce_loss_6": 3.5095011830329894, + "epoch": 0.335, + "grad_norm": 1032.0, + "kl_loss_12": 391.4615737915039, + "kl_loss_17": 104.82349853515625, + "kl_loss_3": 2192.2221618652343, + "kl_loss_6": 1263.1376037597656, + "learning_rate": 0.0007568386957867032, + "loss": 988.185, + "step": 3350 + }, + { + "ce_loss_12": 3.1859456062316895, + "ce_loss_17": 3.051733136177063, + "ce_loss_23": 2.990495729446411, + "ce_loss_3": 4.007290017604828, + "ce_loss_6": 3.5615636348724364, + "epoch": 0.336, + "grad_norm": 1152.0, + "kl_loss_12": 401.7861862182617, + "kl_loss_17": 107.89192390441895, + "kl_loss_3": 2179.39140625, + "kl_loss_6": 1257.9797607421874, + "learning_rate": 0.0007554760743911103, + "loss": 996.4357, + "step": 3360 + }, + { + "ce_loss_12": 3.113254189491272, + "ce_loss_17": 2.98225884437561, + "ce_loss_23": 2.9265295028686524, + "ce_loss_3": 3.9415154337882994, + "ce_loss_6": 3.487048614025116, + "epoch": 0.337, + "grad_norm": 1056.0, + "kl_loss_12": 390.0379333496094, + "kl_loss_17": 104.20697250366212, + "kl_loss_3": 2187.164660644531, + "kl_loss_6": 1248.6148010253905, + "learning_rate": 0.0007541108803542846, + "loss": 1012.4851, + "step": 3370 + }, + { + "ce_loss_12": 3.148558294773102, + "ce_loss_17": 3.0167269706726074, + "ce_loss_23": 2.960436737537384, + "ce_loss_3": 3.965547299385071, + "ce_loss_6": 3.5091021418571473, + "epoch": 0.338, + "grad_norm": 1216.0, + "kl_loss_12": 395.17091369628906, + "kl_loss_17": 104.0645637512207, + "kl_loss_3": 2196.504290771484, + "kl_loss_6": 1249.5260681152345, + "learning_rate": 0.0007527431274237149, + "loss": 1036.4824, + "step": 3380 + }, + { + "ce_loss_12": 3.1180466532707216, + "ce_loss_17": 2.9879327178001405, + "ce_loss_23": 2.933281934261322, + "ce_loss_3": 3.93823961019516, + "ce_loss_6": 3.4857409834861754, + "epoch": 0.339, + "grad_norm": 1216.0, + "kl_loss_12": 389.51560363769534, + "kl_loss_17": 103.63929252624511, + "kl_loss_3": 2158.0156433105467, + "kl_loss_6": 1235.6661071777344, + "learning_rate": 0.0007513728293726579, + "loss": 980.855, + "step": 3390 + }, + { + "ce_loss_12": 3.2241431713104247, + "ce_loss_17": 3.0904944062232973, + "ce_loss_23": 3.03415367603302, + "ce_loss_3": 4.0267979264259335, + "ce_loss_6": 3.5803756713867188, + "epoch": 0.34, + "grad_norm": 1032.0, + "kl_loss_12": 397.006591796875, + "kl_loss_17": 105.09075622558593, + "kl_loss_3": 2149.0771911621096, + "kl_loss_6": 1240.8625610351562, + "learning_rate": 0.00075, + "loss": 970.1844, + "step": 3400 + }, + { + "ce_loss_12": 3.221427297592163, + "ce_loss_17": 3.0799491167068482, + "ce_loss_23": 3.0230050921440124, + "ce_loss_3": 4.0491295337677, + "ce_loss_6": 3.5963475704193115, + "epoch": 0.341, + "grad_norm": 1104.0, + "kl_loss_12": 401.5384887695312, + "kl_loss_17": 105.64668006896973, + "kl_loss_3": 2198.474645996094, + "kl_loss_6": 1263.7601989746095, + "learning_rate": 0.0007486246531301177, + "loss": 983.0185, + "step": 3410 + }, + { + "ce_loss_12": 3.0349387645721437, + "ce_loss_17": 2.9008808970451354, + "ce_loss_23": 2.846748244762421, + "ce_loss_3": 3.8713913917541505, + "ce_loss_6": 3.411065125465393, + "epoch": 0.342, + "grad_norm": 892.0, + "kl_loss_12": 390.5525131225586, + "kl_loss_17": 102.45408782958984, + "kl_loss_3": 2174.7390075683593, + "kl_loss_6": 1251.728338623047, + "learning_rate": 0.0007472468026127384, + "loss": 966.3946, + "step": 3420 + }, + { + "ce_loss_12": 3.18413405418396, + "ce_loss_17": 3.043762445449829, + "ce_loss_23": 2.980748248100281, + "ce_loss_3": 4.039393293857574, + "ce_loss_6": 3.5582253456115724, + "epoch": 0.343, + "grad_norm": 1200.0, + "kl_loss_12": 418.48243255615233, + "kl_loss_17": 110.33060188293457, + "kl_loss_3": 2272.084197998047, + "kl_loss_6": 1294.8700378417968, + "learning_rate": 0.000745866462322802, + "loss": 1008.09, + "step": 3430 + }, + { + "ce_loss_12": 3.154288816452026, + "ce_loss_17": 3.0218498349189757, + "ce_loss_23": 2.9685943603515623, + "ce_loss_3": 3.9574212312698362, + "ce_loss_6": 3.511477017402649, + "epoch": 0.344, + "grad_norm": 868.0, + "kl_loss_12": 389.3679931640625, + "kl_loss_17": 101.62167663574219, + "kl_loss_3": 2132.1400817871095, + "kl_loss_6": 1221.7820739746094, + "learning_rate": 0.0007444836461603195, + "loss": 972.9533, + "step": 3440 + }, + { + "ce_loss_12": 3.223675560951233, + "ce_loss_17": 3.084799551963806, + "ce_loss_23": 3.0252248644828796, + "ce_loss_3": 4.05693109035492, + "ce_loss_6": 3.6015345811843873, + "epoch": 0.345, + "grad_norm": 1020.0, + "kl_loss_12": 418.050358581543, + "kl_loss_17": 111.50422401428223, + "kl_loss_3": 2230.067041015625, + "kl_loss_6": 1304.375439453125, + "learning_rate": 0.0007430983680502344, + "loss": 1014.8229, + "step": 3450 + }, + { + "ce_loss_12": 3.069409763813019, + "ce_loss_17": 2.930784726142883, + "ce_loss_23": 2.873788070678711, + "ce_loss_3": 3.9135555267333983, + "ce_loss_6": 3.4463586807250977, + "epoch": 0.346, + "grad_norm": 972.0, + "kl_loss_12": 400.5516082763672, + "kl_loss_17": 104.2048439025879, + "kl_loss_3": 2217.041046142578, + "kl_loss_6": 1268.8075622558595, + "learning_rate": 0.0007417106419422819, + "loss": 995.1109, + "step": 3460 + }, + { + "ce_loss_12": 3.158154022693634, + "ce_loss_17": 3.0228779196739195, + "ce_loss_23": 2.966097557544708, + "ce_loss_3": 3.970812749862671, + "ce_loss_6": 3.5208982706069945, + "epoch": 0.347, + "grad_norm": 880.0, + "kl_loss_12": 398.12904663085936, + "kl_loss_17": 104.80950889587402, + "kl_loss_3": 2142.826318359375, + "kl_loss_6": 1236.2609802246093, + "learning_rate": 0.0007403204818108486, + "loss": 988.4357, + "step": 3470 + }, + { + "ce_loss_12": 3.1484235644340517, + "ce_loss_17": 3.0128443241119385, + "ce_loss_23": 2.954823875427246, + "ce_loss_3": 3.9678571701049803, + "ce_loss_6": 3.507073760032654, + "epoch": 0.348, + "grad_norm": 1136.0, + "kl_loss_12": 403.04648742675784, + "kl_loss_17": 105.83292236328126, + "kl_loss_3": 2219.6487243652346, + "kl_loss_6": 1265.0550537109375, + "learning_rate": 0.0007389279016548316, + "loss": 964.9225, + "step": 3480 + }, + { + "ce_loss_12": 3.15326087474823, + "ce_loss_17": 3.01158367395401, + "ce_loss_23": 2.9500399947166445, + "ce_loss_3": 4.019190549850464, + "ce_loss_6": 3.534512388706207, + "epoch": 0.349, + "grad_norm": 1216.0, + "kl_loss_12": 410.7277206420898, + "kl_loss_17": 107.77377014160156, + "kl_loss_3": 2273.4454956054688, + "kl_loss_6": 1284.2046264648438, + "learning_rate": 0.0007375329154974975, + "loss": 1005.2963, + "step": 3490 + }, + { + "ce_loss_12": 3.1056708455085755, + "ce_loss_17": 2.9722748041152953, + "ce_loss_23": 2.9173484206199647, + "ce_loss_3": 3.9198298692703246, + "ce_loss_6": 3.4674972891807556, + "epoch": 0.35, + "grad_norm": 1192.0, + "kl_loss_12": 387.7169982910156, + "kl_loss_17": 102.49424819946289, + "kl_loss_3": 2145.902227783203, + "kl_loss_6": 1230.2499633789062, + "learning_rate": 0.0007361355373863414, + "loss": 993.6381, + "step": 3500 + }, + { + "ce_loss_12": 3.152825403213501, + "ce_loss_17": 3.017298102378845, + "ce_loss_23": 2.9622105717658997, + "ce_loss_3": 3.956553077697754, + "ce_loss_6": 3.513141703605652, + "epoch": 0.351, + "grad_norm": 1144.0, + "kl_loss_12": 388.43240814208986, + "kl_loss_17": 102.02892303466797, + "kl_loss_3": 2127.050341796875, + "kl_loss_6": 1218.60732421875, + "learning_rate": 0.0007347357813929454, + "loss": 989.0248, + "step": 3510 + }, + { + "ce_loss_12": 3.1057863593101502, + "ce_loss_17": 2.977063238620758, + "ce_loss_23": 2.92062748670578, + "ce_loss_3": 3.9094584822654723, + "ce_loss_6": 3.465732955932617, + "epoch": 0.352, + "grad_norm": 1472.0, + "kl_loss_12": 384.3619110107422, + "kl_loss_17": 102.11101379394532, + "kl_loss_3": 2122.9948669433593, + "kl_loss_6": 1216.6124328613282, + "learning_rate": 0.0007333336616128369, + "loss": 986.3984, + "step": 3520 + }, + { + "ce_loss_12": 3.0813703656196596, + "ce_loss_17": 2.9437156200408934, + "ce_loss_23": 2.886933445930481, + "ce_loss_3": 3.9318979501724245, + "ce_loss_6": 3.4582404851913453, + "epoch": 0.353, + "grad_norm": 1184.0, + "kl_loss_12": 403.52349243164065, + "kl_loss_17": 104.87230758666992, + "kl_loss_3": 2217.4830932617188, + "kl_loss_6": 1260.3309204101563, + "learning_rate": 0.0007319291921653463, + "loss": 993.8051, + "step": 3530 + }, + { + "ce_loss_12": 3.168711709976196, + "ce_loss_17": 3.026899981498718, + "ce_loss_23": 2.9669880628585816, + "ce_loss_3": 4.009432983398438, + "ce_loss_6": 3.540644121170044, + "epoch": 0.354, + "grad_norm": 1264.0, + "kl_loss_12": 407.08264617919923, + "kl_loss_17": 106.93651161193847, + "kl_loss_3": 2216.938299560547, + "kl_loss_6": 1263.7690002441407, + "learning_rate": 0.0007305223871934656, + "loss": 982.6785, + "step": 3540 + }, + { + "ce_loss_12": 3.130829155445099, + "ce_loss_17": 2.995125424861908, + "ce_loss_23": 2.9383193969726564, + "ce_loss_3": 3.943132388591766, + "ce_loss_6": 3.4956005454063415, + "epoch": 0.355, + "grad_norm": 1040.0, + "kl_loss_12": 396.400700378418, + "kl_loss_17": 103.8346160888672, + "kl_loss_3": 2159.5034057617186, + "kl_loss_6": 1242.3389953613282, + "learning_rate": 0.0007291132608637052, + "loss": 979.3562, + "step": 3550 + }, + { + "ce_loss_12": 3.1063939929008484, + "ce_loss_17": 2.9719778656959535, + "ce_loss_23": 2.917952299118042, + "ce_loss_3": 4.00426549911499, + "ce_loss_6": 3.5009910464286804, + "epoch": 0.356, + "grad_norm": 964.0, + "kl_loss_12": 387.28465576171874, + "kl_loss_17": 100.82154541015625, + "kl_loss_3": 2316.0281677246094, + "kl_loss_6": 1301.2512268066407, + "learning_rate": 0.0007277018273659516, + "loss": 1012.7887, + "step": 3560 + }, + { + "ce_loss_12": 3.2234113931655886, + "ce_loss_17": 3.083226215839386, + "ce_loss_23": 3.0215823650360107, + "ce_loss_3": 4.039709770679474, + "ce_loss_6": 3.591693603992462, + "epoch": 0.357, + "grad_norm": 1408.0, + "kl_loss_12": 412.0844360351563, + "kl_loss_17": 109.77905807495117, + "kl_loss_3": 2195.4359375, + "kl_loss_6": 1277.4073791503906, + "learning_rate": 0.0007262881009133242, + "loss": 989.1793, + "step": 3570 + }, + { + "ce_loss_12": 3.1356869101524354, + "ce_loss_17": 3.0047407031059263, + "ce_loss_23": 2.951461207866669, + "ce_loss_3": 3.9500752568244932, + "ce_loss_6": 3.49868620634079, + "epoch": 0.358, + "grad_norm": 1384.0, + "kl_loss_12": 387.04153594970705, + "kl_loss_17": 100.57007904052735, + "kl_loss_3": 2136.6033447265627, + "kl_loss_6": 1228.8325775146484, + "learning_rate": 0.0007248720957420329, + "loss": 961.4793, + "step": 3580 + }, + { + "ce_loss_12": 3.1339301228523255, + "ce_loss_17": 3.0050941705703735, + "ce_loss_23": 2.952645552158356, + "ce_loss_3": 3.929820716381073, + "ce_loss_6": 3.487267017364502, + "epoch": 0.359, + "grad_norm": 1048.0, + "kl_loss_12": 381.19686431884764, + "kl_loss_17": 100.02181625366211, + "kl_loss_3": 2124.4488525390625, + "kl_loss_6": 1210.181591796875, + "learning_rate": 0.0007234538261112341, + "loss": 985.0872, + "step": 3590 + }, + { + "ce_loss_12": 3.1808794498443604, + "ce_loss_17": 3.0438300013542174, + "ce_loss_23": 2.9860937118530275, + "ce_loss_3": 4.011133432388306, + "ce_loss_6": 3.5475757718086243, + "epoch": 0.36, + "grad_norm": 804.0, + "kl_loss_12": 395.88465728759763, + "kl_loss_17": 103.27263717651367, + "kl_loss_3": 2172.810369873047, + "kl_loss_6": 1242.1477661132812, + "learning_rate": 0.0007220333063028871, + "loss": 967.4588, + "step": 3600 + }, + { + "ce_loss_12": 3.221625530719757, + "ce_loss_17": 3.082091951370239, + "ce_loss_23": 3.0237446427345276, + "ce_loss_3": 4.1428028583526615, + "ce_loss_6": 3.6395716786384584, + "epoch": 0.361, + "grad_norm": 956.0, + "kl_loss_12": 410.17210083007814, + "kl_loss_17": 107.04776420593262, + "kl_loss_3": 2412.4030822753907, + "kl_loss_6": 1372.7015441894532, + "learning_rate": 0.0007206105506216106, + "loss": 1036.6594, + "step": 3610 + }, + { + "ce_loss_12": 3.0932498931884767, + "ce_loss_17": 2.9613340497016907, + "ce_loss_23": 2.9070735931396485, + "ce_loss_3": 3.895069921016693, + "ce_loss_6": 3.4558199644088745, + "epoch": 0.362, + "grad_norm": 1104.0, + "kl_loss_12": 383.33282775878905, + "kl_loss_17": 100.10506706237793, + "kl_loss_3": 2114.540673828125, + "kl_loss_6": 1220.074530029297, + "learning_rate": 0.0007191855733945387, + "loss": 949.5305, + "step": 3620 + }, + { + "ce_loss_12": 3.182743549346924, + "ce_loss_17": 3.0513375759124757, + "ce_loss_23": 2.9913695096969604, + "ce_loss_3": 3.9969239115715025, + "ce_loss_6": 3.5442299962043764, + "epoch": 0.363, + "grad_norm": 1248.0, + "kl_loss_12": 384.91429290771487, + "kl_loss_17": 101.22749214172363, + "kl_loss_3": 2147.6256896972654, + "kl_loss_6": 1226.1741882324218, + "learning_rate": 0.0007177583889711762, + "loss": 964.0354, + "step": 3630 + }, + { + "ce_loss_12": 3.1005003809928895, + "ce_loss_17": 2.966002714633942, + "ce_loss_23": 2.9100066781044007, + "ce_loss_3": 3.9228270649909973, + "ce_loss_6": 3.4714234709739684, + "epoch": 0.364, + "grad_norm": 972.0, + "kl_loss_12": 394.1941940307617, + "kl_loss_17": 102.08047103881836, + "kl_loss_3": 2186.592639160156, + "kl_loss_6": 1261.8254516601562, + "learning_rate": 0.0007163290117232541, + "loss": 981.3645, + "step": 3640 + }, + { + "ce_loss_12": 3.1968397974967955, + "ce_loss_17": 3.0696872591972353, + "ce_loss_23": 3.0185195207595825, + "ce_loss_3": 3.967725968360901, + "ce_loss_6": 3.5456802606582642, + "epoch": 0.365, + "grad_norm": 1320.0, + "kl_loss_12": 380.22156524658203, + "kl_loss_17": 99.77042198181152, + "kl_loss_3": 2085.5939819335936, + "kl_loss_6": 1208.4053955078125, + "learning_rate": 0.0007148974560445859, + "loss": 959.1965, + "step": 3650 + }, + { + "ce_loss_12": 3.129593849182129, + "ce_loss_17": 2.9993977189064025, + "ce_loss_23": 2.9464277625083923, + "ce_loss_3": 3.928598666191101, + "ce_loss_6": 3.4893518686294556, + "epoch": 0.366, + "grad_norm": 1016.0, + "kl_loss_12": 382.19481658935547, + "kl_loss_17": 98.97349815368652, + "kl_loss_3": 2099.6594787597655, + "kl_loss_6": 1206.849383544922, + "learning_rate": 0.0007134637363509209, + "loss": 948.0132, + "step": 3660 + }, + { + "ce_loss_12": 3.2401368737220766, + "ce_loss_17": 3.1115305542945864, + "ce_loss_23": 3.0598105430603026, + "ce_loss_3": 4.030146300792694, + "ce_loss_6": 3.593411421775818, + "epoch": 0.367, + "grad_norm": 1024.0, + "kl_loss_12": 381.5959442138672, + "kl_loss_17": 98.28689727783203, + "kl_loss_3": 2094.195349121094, + "kl_loss_6": 1206.679653930664, + "learning_rate": 0.0007120278670798009, + "loss": 964.0335, + "step": 3670 + }, + { + "ce_loss_12": 3.0648820638656615, + "ce_loss_17": 2.91995313167572, + "ce_loss_23": 2.863781201839447, + "ce_loss_3": 3.9347721934318542, + "ce_loss_6": 3.457567012310028, + "epoch": 0.368, + "grad_norm": 1280.0, + "kl_loss_12": 407.0378677368164, + "kl_loss_17": 102.9833999633789, + "kl_loss_3": 2293.2660949707033, + "kl_loss_6": 1310.9484741210938, + "learning_rate": 0.0007105898626904133, + "loss": 1019.8857, + "step": 3680 + }, + { + "ce_loss_12": 3.148142862319946, + "ce_loss_17": 3.014118826389313, + "ce_loss_23": 2.95814106464386, + "ce_loss_3": 3.96722731590271, + "ce_loss_6": 3.5096484780311585, + "epoch": 0.369, + "grad_norm": 1072.0, + "kl_loss_12": 391.33440551757815, + "kl_loss_17": 101.28652992248536, + "kl_loss_3": 2147.1658264160155, + "kl_loss_6": 1225.1833374023438, + "learning_rate": 0.0007091497376634463, + "loss": 960.4797, + "step": 3690 + }, + { + "ce_loss_12": 3.094625687599182, + "ce_loss_17": 2.96570303440094, + "ce_loss_23": 2.9103384971618653, + "ce_loss_3": 3.9048052191734315, + "ce_loss_6": 3.4554564237594603, + "epoch": 0.37, + "grad_norm": 1032.0, + "kl_loss_12": 387.18639221191404, + "kl_loss_17": 104.48425025939942, + "kl_loss_3": 2125.8362426757812, + "kl_loss_6": 1220.0202087402345, + "learning_rate": 0.0007077075065009433, + "loss": 983.7176, + "step": 3700 + }, + { + "ce_loss_12": 3.194843280315399, + "ce_loss_17": 3.0574195265769957, + "ce_loss_23": 2.999555218219757, + "ce_loss_3": 4.023444163799286, + "ce_loss_6": 3.5645259976387025, + "epoch": 0.371, + "grad_norm": 936.0, + "kl_loss_12": 398.9264434814453, + "kl_loss_17": 108.23813056945801, + "kl_loss_3": 2180.979901123047, + "kl_loss_6": 1252.6944702148437, + "learning_rate": 0.0007062631837261557, + "loss": 977.1026, + "step": 3710 + }, + { + "ce_loss_12": 3.0720986843109133, + "ce_loss_17": 2.9433573603630068, + "ce_loss_23": 2.8886937618255617, + "ce_loss_3": 3.883027505874634, + "ce_loss_6": 3.4352410793304444, + "epoch": 0.372, + "grad_norm": 1184.0, + "kl_loss_12": 382.10267028808596, + "kl_loss_17": 100.58302421569825, + "kl_loss_3": 2137.9196533203126, + "kl_loss_6": 1225.8107116699218, + "learning_rate": 0.0007048167838833977, + "loss": 986.0539, + "step": 3720 + }, + { + "ce_loss_12": 3.1614155411720275, + "ce_loss_17": 3.027226221561432, + "ce_loss_23": 2.9710015296936034, + "ce_loss_3": 3.949637258052826, + "ce_loss_6": 3.511720824241638, + "epoch": 0.373, + "grad_norm": 1344.0, + "kl_loss_12": 385.9076919555664, + "kl_loss_17": 103.10694885253906, + "kl_loss_3": 2117.4567626953126, + "kl_loss_6": 1209.1029052734375, + "learning_rate": 0.0007033683215379002, + "loss": 961.2854, + "step": 3730 + }, + { + "ce_loss_12": 3.1443222165107727, + "ce_loss_17": 3.0109015107154846, + "ce_loss_23": 2.957130324840546, + "ce_loss_3": 3.9426589131355287, + "ce_loss_6": 3.500744605064392, + "epoch": 0.374, + "grad_norm": 952.0, + "kl_loss_12": 381.2915771484375, + "kl_loss_17": 99.72256965637207, + "kl_loss_3": 2122.5682678222656, + "kl_loss_6": 1217.1044494628907, + "learning_rate": 0.0007019178112756625, + "loss": 974.4352, + "step": 3740 + }, + { + "ce_loss_12": 3.1132125854492188, + "ce_loss_17": 2.9844059467315676, + "ce_loss_23": 2.930648481845856, + "ce_loss_3": 3.9250273585319517, + "ce_loss_6": 3.4753824830055238, + "epoch": 0.375, + "grad_norm": 916.0, + "kl_loss_12": 382.26414642333987, + "kl_loss_17": 99.75811614990235, + "kl_loss_3": 2113.0847900390627, + "kl_loss_6": 1220.0052917480468, + "learning_rate": 0.0007004652677033068, + "loss": 969.0986, + "step": 3750 + }, + { + "ce_loss_12": 3.177477180957794, + "ce_loss_17": 3.0574225068092344, + "ce_loss_23": 3.007947289943695, + "ce_loss_3": 3.964920425415039, + "ce_loss_6": 3.5296141505241394, + "epoch": 0.376, + "grad_norm": 908.0, + "kl_loss_12": 369.6541061401367, + "kl_loss_17": 97.30980415344239, + "kl_loss_3": 2076.9691162109375, + "kl_loss_6": 1183.470895385742, + "learning_rate": 0.0006990107054479312, + "loss": 948.6415, + "step": 3760 + }, + { + "ce_loss_12": 3.164940822124481, + "ce_loss_17": 3.033370578289032, + "ce_loss_23": 2.9760269999504088, + "ce_loss_3": 3.9709452509880068, + "ce_loss_6": 3.528740930557251, + "epoch": 0.377, + "grad_norm": 1296.0, + "kl_loss_12": 389.17967681884767, + "kl_loss_17": 102.7863712310791, + "kl_loss_3": 2143.7166748046875, + "kl_loss_6": 1238.860577392578, + "learning_rate": 0.000697554139156961, + "loss": 972.5478, + "step": 3770 + }, + { + "ce_loss_12": 3.1600608825683594, + "ce_loss_17": 3.0301866054534914, + "ce_loss_23": 2.974429488182068, + "ce_loss_3": 3.9714036107063295, + "ce_loss_6": 3.5234702229499817, + "epoch": 0.378, + "grad_norm": 1096.0, + "kl_loss_12": 394.4216369628906, + "kl_loss_17": 103.2165542602539, + "kl_loss_3": 2147.6901123046873, + "kl_loss_6": 1238.7892700195312, + "learning_rate": 0.0006960955834980027, + "loss": 952.6748, + "step": 3780 + }, + { + "ce_loss_12": 3.127192723751068, + "ce_loss_17": 2.997998797893524, + "ce_loss_23": 2.9416239619255067, + "ce_loss_3": 3.928993654251099, + "ce_loss_6": 3.4925607323646544, + "epoch": 0.379, + "grad_norm": 1072.0, + "kl_loss_12": 382.92139739990233, + "kl_loss_17": 99.97526016235352, + "kl_loss_3": 2108.2826599121095, + "kl_loss_6": 1218.4925964355468, + "learning_rate": 0.0006946350531586958, + "loss": 957.6444, + "step": 3790 + }, + { + "ce_loss_12": 3.1599705696105955, + "ce_loss_17": 3.0228681206703185, + "ce_loss_23": 2.9705450177192687, + "ce_loss_3": 3.9659302115440367, + "ce_loss_6": 3.5193197250366213, + "epoch": 0.38, + "grad_norm": 900.0, + "kl_loss_12": 384.08446350097654, + "kl_loss_17": 98.96566047668458, + "kl_loss_3": 2138.1342041015623, + "kl_loss_6": 1222.6171569824219, + "learning_rate": 0.0006931725628465643, + "loss": 978.7352, + "step": 3800 + }, + { + "ce_loss_12": 3.164839673042297, + "ce_loss_17": 3.0324612498283385, + "ce_loss_23": 2.974766230583191, + "ce_loss_3": 3.9797155141830443, + "ce_loss_6": 3.528802454471588, + "epoch": 0.381, + "grad_norm": 1012.0, + "kl_loss_12": 384.5140380859375, + "kl_loss_17": 100.59394874572754, + "kl_loss_3": 2130.6960021972654, + "kl_loss_6": 1219.2356811523437, + "learning_rate": 0.0006917081272888696, + "loss": 963.9451, + "step": 3810 + }, + { + "ce_loss_12": 3.0861921548843383, + "ce_loss_17": 2.952903139591217, + "ce_loss_23": 2.897795629501343, + "ce_loss_3": 3.943486976623535, + "ce_loss_6": 3.455733525753021, + "epoch": 0.382, + "grad_norm": 1048.0, + "kl_loss_12": 392.62345428466796, + "kl_loss_17": 102.91732864379883, + "kl_loss_3": 2244.1899475097657, + "kl_loss_6": 1248.053887939453, + "learning_rate": 0.0006902417612324615, + "loss": 974.233, + "step": 3820 + }, + { + "ce_loss_12": 3.211138737201691, + "ce_loss_17": 3.065601098537445, + "ce_loss_23": 3.0089004755020143, + "ce_loss_3": 4.046250092983246, + "ce_loss_6": 3.582586634159088, + "epoch": 0.383, + "grad_norm": 968.0, + "kl_loss_12": 410.6596969604492, + "kl_loss_17": 106.09553604125976, + "kl_loss_3": 2204.2128967285157, + "kl_loss_6": 1265.9451049804688, + "learning_rate": 0.00068877347944363, + "loss": 985.0914, + "step": 3830 + }, + { + "ce_loss_12": 3.1957462310791014, + "ce_loss_17": 3.0704895853996277, + "ce_loss_23": 3.0156718850135804, + "ce_loss_3": 3.9832470297813414, + "ce_loss_6": 3.5446235060691835, + "epoch": 0.384, + "grad_norm": 1112.0, + "kl_loss_12": 388.15425567626954, + "kl_loss_17": 101.4664520263672, + "kl_loss_3": 2101.9537353515625, + "kl_loss_6": 1207.7802978515624, + "learning_rate": 0.0006873032967079561, + "loss": 970.1563, + "step": 3840 + }, + { + "ce_loss_12": 3.1780274152755736, + "ce_loss_17": 3.0505093932151794, + "ce_loss_23": 2.996346962451935, + "ce_loss_3": 3.950271713733673, + "ce_loss_6": 3.5170023918151854, + "epoch": 0.385, + "grad_norm": 952.0, + "kl_loss_12": 382.70263824462893, + "kl_loss_17": 100.49216232299804, + "kl_loss_3": 2076.2669921875, + "kl_loss_6": 1188.8818450927733, + "learning_rate": 0.0006858312278301637, + "loss": 942.3533, + "step": 3850 + }, + { + "ce_loss_12": 3.2134106755256653, + "ce_loss_17": 3.085762679576874, + "ce_loss_23": 3.035565769672394, + "ce_loss_3": 3.979679024219513, + "ce_loss_6": 3.5507980227470397, + "epoch": 0.386, + "grad_norm": 1208.0, + "kl_loss_12": 382.0628173828125, + "kl_loss_17": 100.69847984313965, + "kl_loss_3": 2070.21513671875, + "kl_loss_6": 1192.517266845703, + "learning_rate": 0.0006843572876339704, + "loss": 945.9352, + "step": 3860 + }, + { + "ce_loss_12": 3.1366180539131165, + "ce_loss_17": 3.0096086144447325, + "ce_loss_23": 2.9578014850616454, + "ce_loss_3": 3.8984736084938048, + "ce_loss_6": 3.4832229256629943, + "epoch": 0.387, + "grad_norm": 1048.0, + "kl_loss_12": 374.30377197265625, + "kl_loss_17": 97.24090919494628, + "kl_loss_3": 2047.601544189453, + "kl_loss_6": 1186.3757263183593, + "learning_rate": 0.0006828814909619373, + "loss": 972.7648, + "step": 3870 + }, + { + "ce_loss_12": 3.257560980319977, + "ce_loss_17": 3.126918005943298, + "ce_loss_23": 3.0692105889320374, + "ce_loss_3": 4.041906023025513, + "ce_loss_6": 3.5994970917701723, + "epoch": 0.388, + "grad_norm": 980.0, + "kl_loss_12": 389.04183197021484, + "kl_loss_17": 102.8961841583252, + "kl_loss_3": 2084.578723144531, + "kl_loss_6": 1191.1536590576172, + "learning_rate": 0.0006814038526753205, + "loss": 936.5445, + "step": 3880 + }, + { + "ce_loss_12": 3.1633546233177183, + "ce_loss_17": 3.030035877227783, + "ce_loss_23": 2.975773298740387, + "ce_loss_3": 3.945274126529694, + "ce_loss_6": 3.508275043964386, + "epoch": 0.389, + "grad_norm": 992.0, + "kl_loss_12": 383.5569595336914, + "kl_loss_17": 99.5365421295166, + "kl_loss_3": 2082.189862060547, + "kl_loss_6": 1198.423110961914, + "learning_rate": 0.0006799243876539213, + "loss": 948.3916, + "step": 3890 + }, + { + "ce_loss_12": 3.090026044845581, + "ce_loss_17": 2.960541236400604, + "ce_loss_23": 2.9071210980415345, + "ce_loss_3": 3.9189464807510377, + "ce_loss_6": 3.452379596233368, + "epoch": 0.39, + "grad_norm": 1568.0, + "kl_loss_12": 384.93935241699216, + "kl_loss_17": 99.81928863525391, + "kl_loss_3": 2171.3757263183593, + "kl_loss_6": 1219.7862731933594, + "learning_rate": 0.0006784431107959359, + "loss": 972.2284, + "step": 3900 + }, + { + "ce_loss_12": 3.150925076007843, + "ce_loss_17": 3.0121824502944947, + "ce_loss_23": 2.9566686391830443, + "ce_loss_3": 3.9797996401786806, + "ce_loss_6": 3.520882248878479, + "epoch": 0.391, + "grad_norm": 1112.0, + "kl_loss_12": 393.58673706054685, + "kl_loss_17": 101.93662071228027, + "kl_loss_3": 2192.713427734375, + "kl_loss_6": 1248.4283935546875, + "learning_rate": 0.0006769600370178059, + "loss": 965.526, + "step": 3910 + }, + { + "ce_loss_12": 3.111063516139984, + "ce_loss_17": 2.980947160720825, + "ce_loss_23": 2.925340807437897, + "ce_loss_3": 3.9266233801841737, + "ce_loss_6": 3.473023760318756, + "epoch": 0.392, + "grad_norm": 880.0, + "kl_loss_12": 384.3804428100586, + "kl_loss_17": 97.82368049621581, + "kl_loss_3": 2136.1855529785157, + "kl_loss_6": 1223.2701232910156, + "learning_rate": 0.0006754751812540679, + "loss": 943.424, + "step": 3920 + }, + { + "ce_loss_12": 3.162268328666687, + "ce_loss_17": 3.02713885307312, + "ce_loss_23": 2.971815264225006, + "ce_loss_3": 3.9698534488677977, + "ce_loss_6": 3.5212791562080383, + "epoch": 0.393, + "grad_norm": 1120.0, + "kl_loss_12": 389.7220794677734, + "kl_loss_17": 102.36843185424804, + "kl_loss_3": 2160.457275390625, + "kl_loss_6": 1239.3633728027344, + "learning_rate": 0.0006739885584572025, + "loss": 975.3045, + "step": 3930 + }, + { + "ce_loss_12": 3.1774211168289184, + "ce_loss_17": 3.0457595705986025, + "ce_loss_23": 2.989675688743591, + "ce_loss_3": 4.002368414402008, + "ce_loss_6": 3.538395082950592, + "epoch": 0.394, + "grad_norm": 1208.0, + "kl_loss_12": 387.89739379882815, + "kl_loss_17": 101.88522567749024, + "kl_loss_3": 2200.5525756835937, + "kl_loss_6": 1235.5010864257813, + "learning_rate": 0.0006725001835974853, + "loss": 958.2978, + "step": 3940 + }, + { + "ce_loss_12": 3.176982545852661, + "ce_loss_17": 3.0421844482421876, + "ce_loss_23": 2.986680102348328, + "ce_loss_3": 3.984161686897278, + "ce_loss_6": 3.5369669675827025, + "epoch": 0.395, + "grad_norm": 1256.0, + "kl_loss_12": 395.7901412963867, + "kl_loss_17": 102.56606636047363, + "kl_loss_3": 2153.0941528320313, + "kl_loss_6": 1235.6717407226563, + "learning_rate": 0.0006710100716628344, + "loss": 949.3996, + "step": 3950 + }, + { + "ce_loss_12": 3.163419556617737, + "ce_loss_17": 3.0290326714515685, + "ce_loss_23": 2.972023379802704, + "ce_loss_3": 3.9705038547515867, + "ce_loss_6": 3.530128002166748, + "epoch": 0.396, + "grad_norm": 1032.0, + "kl_loss_12": 388.55382232666017, + "kl_loss_17": 100.28390426635742, + "kl_loss_3": 2136.8793762207033, + "kl_loss_6": 1231.7751403808593, + "learning_rate": 0.0006695182376586602, + "loss": 969.2206, + "step": 3960 + }, + { + "ce_loss_12": 3.177538585662842, + "ce_loss_17": 3.0523517608642576, + "ce_loss_23": 2.9978424191474913, + "ce_loss_3": 3.936390829086304, + "ce_loss_6": 3.5100929021835325, + "epoch": 0.397, + "grad_norm": 1168.0, + "kl_loss_12": 365.78756256103514, + "kl_loss_17": 95.89293823242187, + "kl_loss_3": 2016.74169921875, + "kl_loss_6": 1147.0362548828125, + "learning_rate": 0.000668024696607715, + "loss": 951.3905, + "step": 3970 + }, + { + "ce_loss_12": 3.143988335132599, + "ce_loss_17": 3.01651873588562, + "ce_loss_23": 2.96545991897583, + "ce_loss_3": 3.9358643174171446, + "ce_loss_6": 3.496285951137543, + "epoch": 0.398, + "grad_norm": 1064.0, + "kl_loss_12": 378.8801467895508, + "kl_loss_17": 98.12645530700684, + "kl_loss_3": 2110.0565856933595, + "kl_loss_6": 1205.5675537109375, + "learning_rate": 0.0006665294635499404, + "loss": 949.7793, + "step": 3980 + }, + { + "ce_loss_12": 3.163826417922974, + "ce_loss_17": 3.029632306098938, + "ce_loss_23": 2.9687330961227416, + "ce_loss_3": 3.9991076827049254, + "ce_loss_6": 3.538382375240326, + "epoch": 0.399, + "grad_norm": 984.0, + "kl_loss_12": 401.962060546875, + "kl_loss_17": 106.9761344909668, + "kl_loss_3": 2226.279400634766, + "kl_loss_6": 1279.4245971679688, + "learning_rate": 0.0006650325535423167, + "loss": 979.8545, + "step": 3990 + }, + { + "ce_loss_12": 3.168172299861908, + "ce_loss_17": 3.0481725573539733, + "ce_loss_23": 2.990405547618866, + "ce_loss_3": 3.929426848888397, + "ce_loss_6": 3.5102165937423706, + "epoch": 0.4, + "grad_norm": 972.0, + "kl_loss_12": 368.17052001953124, + "kl_loss_17": 111.31200065612794, + "kl_loss_3": 2006.9190979003906, + "kl_loss_6": 1151.369775390625, + "learning_rate": 0.0006635339816587109, + "loss": 940.1266, + "step": 4000 + }, + { + "ce_loss_12": 3.114549422264099, + "ce_loss_17": 2.989661431312561, + "ce_loss_23": 2.9311137080192564, + "ce_loss_3": 3.935007870197296, + "ce_loss_6": 3.475666677951813, + "epoch": 0.401, + "grad_norm": 976.0, + "kl_loss_12": 387.16961059570315, + "kl_loss_17": 109.62789192199708, + "kl_loss_3": 2164.3400817871093, + "kl_loss_6": 1229.5659454345703, + "learning_rate": 0.0006620337629897252, + "loss": 960.3354, + "step": 4010 + }, + { + "ce_loss_12": 3.126537036895752, + "ce_loss_17": 2.9946191906929016, + "ce_loss_23": 2.936546230316162, + "ce_loss_3": 3.928071904182434, + "ce_loss_6": 3.478977394104004, + "epoch": 0.402, + "grad_norm": 916.0, + "kl_loss_12": 387.9364669799805, + "kl_loss_17": 106.62837104797363, + "kl_loss_3": 2123.9172973632812, + "kl_loss_6": 1210.9631713867188, + "learning_rate": 0.0006605319126425454, + "loss": 973.7484, + "step": 4020 + }, + { + "ce_loss_12": 3.041149842739105, + "ce_loss_17": 2.909222424030304, + "ce_loss_23": 2.8535015285015106, + "ce_loss_3": 3.876503324508667, + "ce_loss_6": 3.4090627312660216, + "epoch": 0.403, + "grad_norm": 1088.0, + "kl_loss_12": 385.33301696777346, + "kl_loss_17": 103.06710166931153, + "kl_loss_3": 2199.996661376953, + "kl_loss_6": 1249.3460327148437, + "learning_rate": 0.0006590284457407876, + "loss": 975.139, + "step": 4030 + }, + { + "ce_loss_12": 3.128358006477356, + "ce_loss_17": 2.9978256225585938, + "ce_loss_23": 2.9422669768333436, + "ce_loss_3": 3.930923879146576, + "ce_loss_6": 3.486605441570282, + "epoch": 0.404, + "grad_norm": 1216.0, + "kl_loss_12": 383.995783996582, + "kl_loss_17": 102.69088897705078, + "kl_loss_3": 2117.4116149902343, + "kl_loss_6": 1210.769482421875, + "learning_rate": 0.0006575233774243465, + "loss": 953.6563, + "step": 4040 + }, + { + "ce_loss_12": 3.1257184743881226, + "ce_loss_17": 2.9924083948135376, + "ce_loss_23": 2.936680054664612, + "ce_loss_3": 3.937123966217041, + "ce_loss_6": 3.488031816482544, + "epoch": 0.405, + "grad_norm": 1232.0, + "kl_loss_12": 389.3181213378906, + "kl_loss_17": 101.56726264953613, + "kl_loss_3": 2144.496929931641, + "kl_loss_6": 1222.655938720703, + "learning_rate": 0.0006560167228492435, + "loss": 958.7957, + "step": 4050 + }, + { + "ce_loss_12": 3.1630433559417725, + "ce_loss_17": 3.0304596543312075, + "ce_loss_23": 2.980211853981018, + "ce_loss_3": 3.93538019657135, + "ce_loss_6": 3.5052495121955873, + "epoch": 0.406, + "grad_norm": 1168.0, + "kl_loss_12": 370.5975006103516, + "kl_loss_17": 96.3271987915039, + "kl_loss_3": 2065.861767578125, + "kl_loss_6": 1190.795327758789, + "learning_rate": 0.0006545084971874737, + "loss": 951.2504, + "step": 4060 + }, + { + "ce_loss_12": 3.1312262296676634, + "ce_loss_17": 2.9931742548942566, + "ce_loss_23": 2.9370701909065247, + "ce_loss_3": 3.95162593126297, + "ce_loss_6": 3.501697599887848, + "epoch": 0.407, + "grad_norm": 884.0, + "kl_loss_12": 396.94751892089846, + "kl_loss_17": 103.59187126159668, + "kl_loss_3": 2175.806359863281, + "kl_loss_6": 1249.0946655273438, + "learning_rate": 0.0006529987156268526, + "loss": 953.4985, + "step": 4070 + }, + { + "ce_loss_12": 3.0533276081085203, + "ce_loss_17": 2.915659213066101, + "ce_loss_23": 2.8591691732406614, + "ce_loss_3": 3.8715276956558227, + "ce_loss_6": 3.4168421745300295, + "epoch": 0.408, + "grad_norm": 968.0, + "kl_loss_12": 383.2756820678711, + "kl_loss_17": 99.68055114746093, + "kl_loss_3": 2138.8466552734376, + "kl_loss_6": 1219.7853637695312, + "learning_rate": 0.0006514873933708637, + "loss": 980.2502, + "step": 4080 + }, + { + "ce_loss_12": 3.1608598113059996, + "ce_loss_17": 3.0313393831253053, + "ce_loss_23": 2.977697968482971, + "ce_loss_3": 3.956602382659912, + "ce_loss_6": 3.5155387043952944, + "epoch": 0.409, + "grad_norm": 1120.0, + "kl_loss_12": 375.5689437866211, + "kl_loss_17": 96.74662399291992, + "kl_loss_3": 2107.760119628906, + "kl_loss_6": 1192.8095153808595, + "learning_rate": 0.0006499745456385053, + "loss": 942.5886, + "step": 4090 + }, + { + "ce_loss_12": 3.1293053030967712, + "ce_loss_17": 2.9983129143714904, + "ce_loss_23": 2.941512739658356, + "ce_loss_3": 3.9319061040878296, + "ce_loss_6": 3.484344244003296, + "epoch": 0.41, + "grad_norm": 1144.0, + "kl_loss_12": 382.61876373291017, + "kl_loss_17": 99.34859237670898, + "kl_loss_3": 2114.9715087890627, + "kl_loss_6": 1213.1263122558594, + "learning_rate": 0.0006484601876641375, + "loss": 961.2418, + "step": 4100 + }, + { + "ce_loss_12": 3.1114142775535583, + "ce_loss_17": 2.9875324964523315, + "ce_loss_23": 2.935147023200989, + "ce_loss_3": 3.874464249610901, + "ce_loss_6": 3.456193673610687, + "epoch": 0.411, + "grad_norm": 988.0, + "kl_loss_12": 372.14844207763673, + "kl_loss_17": 97.36947631835938, + "kl_loss_3": 2045.9664733886718, + "kl_loss_6": 1174.0520263671874, + "learning_rate": 0.000646944334697328, + "loss": 932.9962, + "step": 4110 + }, + { + "ce_loss_12": 3.220192623138428, + "ce_loss_17": 3.0896286606788634, + "ce_loss_23": 3.037607753276825, + "ce_loss_3": 3.9769805669784546, + "ce_loss_6": 3.554104244709015, + "epoch": 0.412, + "grad_norm": 1160.0, + "kl_loss_12": 372.82581939697263, + "kl_loss_17": 97.23840293884277, + "kl_loss_3": 2013.3982482910155, + "kl_loss_6": 1167.341049194336, + "learning_rate": 0.0006454270020026995, + "loss": 917.0743, + "step": 4120 + }, + { + "ce_loss_12": 3.185134708881378, + "ce_loss_17": 3.0607090830802917, + "ce_loss_23": 3.0103032112121584, + "ce_loss_3": 3.940109670162201, + "ce_loss_6": 3.519068014621735, + "epoch": 0.413, + "grad_norm": 784.0, + "kl_loss_12": 364.3710479736328, + "kl_loss_17": 93.57289505004883, + "kl_loss_3": 2003.3096008300781, + "kl_loss_6": 1152.3987396240234, + "learning_rate": 0.0006439082048597755, + "loss": 914.7779, + "step": 4130 + }, + { + "ce_loss_12": 3.177386391162872, + "ce_loss_17": 3.0462804079055785, + "ce_loss_23": 2.994649851322174, + "ce_loss_3": 3.9677168011665342, + "ce_loss_6": 3.5374234557151794, + "epoch": 0.414, + "grad_norm": 1064.0, + "kl_loss_12": 381.20457153320314, + "kl_loss_17": 96.80270919799804, + "kl_loss_3": 2106.9815979003906, + "kl_loss_6": 1217.2957641601563, + "learning_rate": 0.0006423879585628261, + "loss": 950.1129, + "step": 4140 + }, + { + "ce_loss_12": 3.1385213136672974, + "ce_loss_17": 3.006602096557617, + "ce_loss_23": 2.9507169365882873, + "ce_loss_3": 3.9623284220695494, + "ce_loss_6": 3.5014745593070984, + "epoch": 0.415, + "grad_norm": 968.0, + "kl_loss_12": 391.1500961303711, + "kl_loss_17": 100.56959648132325, + "kl_loss_3": 2171.917236328125, + "kl_loss_6": 1231.1592224121093, + "learning_rate": 0.0006408662784207149, + "loss": 967.7506, + "step": 4150 + }, + { + "ce_loss_12": 3.1071489095687865, + "ce_loss_17": 2.980406606197357, + "ce_loss_23": 2.9294055104255676, + "ce_loss_3": 3.9036468267440796, + "ce_loss_6": 3.4644244074821473, + "epoch": 0.416, + "grad_norm": 1200.0, + "kl_loss_12": 374.96314697265626, + "kl_loss_17": 96.86763343811035, + "kl_loss_3": 2105.97939453125, + "kl_loss_6": 1211.642266845703, + "learning_rate": 0.0006393431797567439, + "loss": 948.6432, + "step": 4160 + }, + { + "ce_loss_12": 3.1838504314422607, + "ce_loss_17": 3.0583144307136534, + "ce_loss_23": 3.005660831928253, + "ce_loss_3": 3.9278244495391847, + "ce_loss_6": 3.5054065704345705, + "epoch": 0.417, + "grad_norm": 972.0, + "kl_loss_12": 371.84607696533203, + "kl_loss_17": 97.47408409118653, + "kl_loss_3": 2020.6477111816407, + "kl_loss_6": 1155.8765777587892, + "learning_rate": 0.0006378186779084996, + "loss": 905.9456, + "step": 4170 + }, + { + "ce_loss_12": 3.020306038856506, + "ce_loss_17": 2.8910472273826597, + "ce_loss_23": 2.8401397347450255, + "ce_loss_3": 3.8421530604362486, + "ce_loss_6": 3.3880983710289003, + "epoch": 0.418, + "grad_norm": 1040.0, + "kl_loss_12": 381.0573959350586, + "kl_loss_17": 98.93289833068847, + "kl_loss_3": 2131.9426147460936, + "kl_loss_6": 1223.2824462890626, + "learning_rate": 0.0006362927882276989, + "loss": 962.801, + "step": 4180 + }, + { + "ce_loss_12": 3.1948235034942627, + "ce_loss_17": 3.0719444274902346, + "ce_loss_23": 3.0190091133117676, + "ce_loss_3": 3.9587900042533875, + "ce_loss_6": 3.534806561470032, + "epoch": 0.419, + "grad_norm": 1072.0, + "kl_loss_12": 362.573291015625, + "kl_loss_17": 95.47424354553223, + "kl_loss_3": 2034.4496154785156, + "kl_loss_6": 1160.2906799316406, + "learning_rate": 0.000634765526080034, + "loss": 907.3845, + "step": 4190 + }, + { + "ce_loss_12": 3.209235632419586, + "ce_loss_17": 3.0822045683860777, + "ce_loss_23": 3.0283665299415587, + "ce_loss_3": 3.9763604760169984, + "ce_loss_6": 3.5511908173561095, + "epoch": 0.42, + "grad_norm": 900.0, + "kl_loss_12": 375.2816680908203, + "kl_loss_17": 98.58403778076172, + "kl_loss_3": 2044.1361572265625, + "kl_loss_6": 1181.4364013671875, + "learning_rate": 0.0006332369068450174, + "loss": 924.5047, + "step": 4200 + }, + { + "ce_loss_12": 3.148201882839203, + "ce_loss_17": 3.0171813011169433, + "ce_loss_23": 2.965887463092804, + "ce_loss_3": 3.9352025747299195, + "ce_loss_6": 3.504900002479553, + "epoch": 0.421, + "grad_norm": 1168.0, + "kl_loss_12": 375.26795501708983, + "kl_loss_17": 98.06200065612794, + "kl_loss_3": 2075.0803649902346, + "kl_loss_6": 1197.796453857422, + "learning_rate": 0.0006317069459158283, + "loss": 932.6061, + "step": 4210 + }, + { + "ce_loss_12": 3.2403309464454653, + "ce_loss_17": 3.1184786796569823, + "ce_loss_23": 3.0642859816551207, + "ce_loss_3": 3.9955074787139893, + "ce_loss_6": 3.5689878702163695, + "epoch": 0.422, + "grad_norm": 1232.0, + "kl_loss_12": 370.9367172241211, + "kl_loss_17": 98.52574310302734, + "kl_loss_3": 2026.5843505859375, + "kl_loss_6": 1160.4238342285157, + "learning_rate": 0.0006301756586991561, + "loss": 921.3623, + "step": 4220 + }, + { + "ce_loss_12": 3.0353715300559996, + "ce_loss_17": 2.9087610244750977, + "ce_loss_23": 2.85482771396637, + "ce_loss_3": 3.8594297885894777, + "ce_loss_6": 3.405829298496246, + "epoch": 0.423, + "grad_norm": 1408.0, + "kl_loss_12": 382.88744964599607, + "kl_loss_17": 98.91964225769043, + "kl_loss_3": 2180.594561767578, + "kl_loss_6": 1240.1722412109375, + "learning_rate": 0.0006286430606150459, + "loss": 959.4756, + "step": 4230 + }, + { + "ce_loss_12": 3.2323502779006956, + "ce_loss_17": 3.105207896232605, + "ce_loss_23": 3.053818738460541, + "ce_loss_3": 4.012285470962524, + "ce_loss_6": 3.5798202991485595, + "epoch": 0.424, + "grad_norm": 852.0, + "kl_loss_12": 379.60814971923827, + "kl_loss_17": 98.71002159118652, + "kl_loss_3": 2080.068084716797, + "kl_loss_6": 1186.7841796875, + "learning_rate": 0.0006271091670967436, + "loss": 936.2564, + "step": 4240 + }, + { + "ce_loss_12": 3.1621394395828246, + "ce_loss_17": 3.0232860922813414, + "ce_loss_23": 2.9645561575889587, + "ce_loss_3": 3.9861019134521483, + "ce_loss_6": 3.53287935256958, + "epoch": 0.425, + "grad_norm": 1272.0, + "kl_loss_12": 398.60731048583983, + "kl_loss_17": 102.87269020080566, + "kl_loss_3": 2185.2963439941404, + "kl_loss_6": 1259.7050170898438, + "learning_rate": 0.0006255739935905395, + "loss": 960.5764, + "step": 4250 + }, + { + "ce_loss_12": 3.186166834831238, + "ce_loss_17": 3.057102692127228, + "ce_loss_23": 3.004942548274994, + "ce_loss_3": 3.951607370376587, + "ce_loss_6": 3.523983860015869, + "epoch": 0.426, + "grad_norm": 1020.0, + "kl_loss_12": 375.788801574707, + "kl_loss_17": 98.5140495300293, + "kl_loss_3": 2047.4385620117187, + "kl_loss_6": 1168.862127685547, + "learning_rate": 0.0006240375555556145, + "loss": 959.7219, + "step": 4260 + }, + { + "ce_loss_12": 3.1884969115257262, + "ce_loss_17": 3.055912899971008, + "ce_loss_23": 3.0020667195320128, + "ce_loss_3": 4.003851306438446, + "ce_loss_6": 3.5518761396408083, + "epoch": 0.427, + "grad_norm": 1040.0, + "kl_loss_12": 385.05956878662107, + "kl_loss_17": 99.31076583862304, + "kl_loss_3": 2143.3416748046875, + "kl_loss_6": 1220.8311340332032, + "learning_rate": 0.000622499868463882, + "loss": 952.8904, + "step": 4270 + }, + { + "ce_loss_12": 3.1552950739860535, + "ce_loss_17": 3.0333962559700014, + "ce_loss_23": 2.982784461975098, + "ce_loss_3": 3.8959374904632567, + "ce_loss_6": 3.484722626209259, + "epoch": 0.428, + "grad_norm": 1096.0, + "kl_loss_12": 364.5569732666016, + "kl_loss_17": 97.09578018188476, + "kl_loss_3": 2005.030322265625, + "kl_loss_6": 1149.582730102539, + "learning_rate": 0.0006209609477998338, + "loss": 922.2639, + "step": 4280 + }, + { + "ce_loss_12": 3.213744878768921, + "ce_loss_17": 3.0866466403007506, + "ce_loss_23": 3.0286855936050414, + "ce_loss_3": 3.989298403263092, + "ce_loss_6": 3.5574489712715147, + "epoch": 0.429, + "grad_norm": 1008.0, + "kl_loss_12": 376.99034729003904, + "kl_loss_17": 100.63436355590821, + "kl_loss_3": 2063.216760253906, + "kl_loss_6": 1188.7937561035155, + "learning_rate": 0.0006194208090603844, + "loss": 947.5668, + "step": 4290 + }, + { + "ce_loss_12": 3.1322065472602842, + "ce_loss_17": 3.008976089954376, + "ce_loss_23": 2.955915665626526, + "ce_loss_3": 3.9148309588432313, + "ce_loss_6": 3.4751729369163513, + "epoch": 0.43, + "grad_norm": 1360.0, + "kl_loss_12": 365.5053421020508, + "kl_loss_17": 96.05026359558106, + "kl_loss_3": 2053.0044128417967, + "kl_loss_6": 1161.1391235351562, + "learning_rate": 0.0006178794677547138, + "loss": 916.7221, + "step": 4300 + }, + { + "ce_loss_12": 3.1605747818946837, + "ce_loss_17": 3.0332695603370667, + "ce_loss_23": 2.981340205669403, + "ce_loss_3": 3.9655585885047913, + "ce_loss_6": 3.5191497564315797, + "epoch": 0.431, + "grad_norm": 928.0, + "kl_loss_12": 384.50793609619143, + "kl_loss_17": 100.78701591491699, + "kl_loss_3": 2113.747119140625, + "kl_loss_6": 1213.9062072753907, + "learning_rate": 0.0006163369394041111, + "loss": 947.5874, + "step": 4310 + }, + { + "ce_loss_12": 3.1003188490867615, + "ce_loss_17": 2.9701900720596313, + "ce_loss_23": 2.913443350791931, + "ce_loss_3": 3.9075787544250487, + "ce_loss_6": 3.458979332447052, + "epoch": 0.432, + "grad_norm": 1120.0, + "kl_loss_12": 377.0905960083008, + "kl_loss_17": 99.02350425720215, + "kl_loss_3": 2128.7301391601563, + "kl_loss_6": 1217.1617401123046, + "learning_rate": 0.0006147932395418205, + "loss": 971.5845, + "step": 4320 + }, + { + "ce_loss_12": 3.140204668045044, + "ce_loss_17": 3.011886489391327, + "ce_loss_23": 2.9580222249031065, + "ce_loss_3": 3.903342294692993, + "ce_loss_6": 3.4775962233543396, + "epoch": 0.433, + "grad_norm": 1040.0, + "kl_loss_12": 377.0015670776367, + "kl_loss_17": 98.69484672546386, + "kl_loss_3": 2057.992724609375, + "kl_loss_6": 1186.7297576904298, + "learning_rate": 0.0006132483837128823, + "loss": 925.6322, + "step": 4330 + }, + { + "ce_loss_12": 3.114229154586792, + "ce_loss_17": 2.9869271516799927, + "ce_loss_23": 2.935202646255493, + "ce_loss_3": 3.9059092879295347, + "ce_loss_6": 3.4597684383392333, + "epoch": 0.434, + "grad_norm": 1232.0, + "kl_loss_12": 375.26554107666016, + "kl_loss_17": 97.48203544616699, + "kl_loss_3": 2104.536700439453, + "kl_loss_6": 1186.8380126953125, + "learning_rate": 0.0006117023874739772, + "loss": 939.7988, + "step": 4340 + }, + { + "ce_loss_12": 3.1105947494506836, + "ce_loss_17": 2.98136168718338, + "ce_loss_23": 2.929484283924103, + "ce_loss_3": 3.9065537333488463, + "ce_loss_6": 3.461872565746307, + "epoch": 0.435, + "grad_norm": 1312.0, + "kl_loss_12": 379.8814270019531, + "kl_loss_17": 97.82222518920898, + "kl_loss_3": 2112.0099365234373, + "kl_loss_6": 1206.9251831054687, + "learning_rate": 0.0006101552663932703, + "loss": 959.5463, + "step": 4350 + }, + { + "ce_loss_12": 3.1442917227745055, + "ce_loss_17": 3.0152896285057067, + "ce_loss_23": 2.9616350173950194, + "ce_loss_3": 3.9271872639656067, + "ce_loss_6": 3.4907243251800537, + "epoch": 0.436, + "grad_norm": 1056.0, + "kl_loss_12": 380.0096099853516, + "kl_loss_17": 99.47969856262208, + "kl_loss_3": 2090.22939453125, + "kl_loss_6": 1197.7749267578124, + "learning_rate": 0.0006086070360502539, + "loss": 938.3344, + "step": 4360 + }, + { + "ce_loss_12": 3.14714058637619, + "ce_loss_17": 3.0191622018814086, + "ce_loss_23": 2.9650089621543883, + "ce_loss_3": 3.93569530248642, + "ce_loss_6": 3.4921825528144836, + "epoch": 0.437, + "grad_norm": 1104.0, + "kl_loss_12": 375.81161346435545, + "kl_loss_17": 96.95657653808594, + "kl_loss_3": 2095.228955078125, + "kl_loss_6": 1190.774935913086, + "learning_rate": 0.0006070577120355903, + "loss": 940.5141, + "step": 4370 + }, + { + "ce_loss_12": 3.1431150674819945, + "ce_loss_17": 3.0185340642929077, + "ce_loss_23": 2.963430368900299, + "ce_loss_3": 3.907380211353302, + "ce_loss_6": 3.4939981341362, + "epoch": 0.438, + "grad_norm": 1064.0, + "kl_loss_12": 364.2071594238281, + "kl_loss_17": 94.97382392883301, + "kl_loss_3": 2012.2494140625, + "kl_loss_6": 1164.0711059570312, + "learning_rate": 0.0006055073099509549, + "loss": 926.577, + "step": 4380 + }, + { + "ce_loss_12": 3.2006898403167723, + "ce_loss_17": 3.081389880180359, + "ce_loss_23": 3.027166259288788, + "ce_loss_3": 3.9555254697799684, + "ce_loss_6": 3.5358532547950743, + "epoch": 0.439, + "grad_norm": 980.0, + "kl_loss_12": 371.9804412841797, + "kl_loss_17": 97.23721199035644, + "kl_loss_3": 2038.4843444824219, + "kl_loss_6": 1173.850146484375, + "learning_rate": 0.0006039558454088796, + "loss": 939.1463, + "step": 4390 + }, + { + "ce_loss_12": 3.1692385196685793, + "ce_loss_17": 3.0419707775115965, + "ce_loss_23": 2.985661518573761, + "ce_loss_3": 3.955595600605011, + "ce_loss_6": 3.5197404265403747, + "epoch": 0.44, + "grad_norm": 1064.0, + "kl_loss_12": 377.35308837890625, + "kl_loss_17": 99.0156379699707, + "kl_loss_3": 2082.0146545410157, + "kl_loss_6": 1190.1516021728517, + "learning_rate": 0.0006024033340325954, + "loss": 919.5555, + "step": 4400 + }, + { + "ce_loss_12": 3.235099709033966, + "ce_loss_17": 3.113989222049713, + "ce_loss_23": 3.060835361480713, + "ce_loss_3": 3.9762577414512634, + "ce_loss_6": 3.561458909511566, + "epoch": 0.441, + "grad_norm": 980.0, + "kl_loss_12": 361.0665008544922, + "kl_loss_17": 93.74198760986329, + "kl_loss_3": 1977.7686767578125, + "kl_loss_6": 1137.2376220703125, + "learning_rate": 0.0006008497914558743, + "loss": 909.0549, + "step": 4410 + }, + { + "ce_loss_12": 3.1947230458259583, + "ce_loss_17": 3.059620177745819, + "ce_loss_23": 3.004351055622101, + "ce_loss_3": 3.9716932654380797, + "ce_loss_6": 3.537883198261261, + "epoch": 0.442, + "grad_norm": 980.0, + "kl_loss_12": 386.25874786376954, + "kl_loss_17": 101.76474266052246, + "kl_loss_3": 2093.4546142578124, + "kl_loss_6": 1198.059033203125, + "learning_rate": 0.0005992952333228728, + "loss": 941.9635, + "step": 4420 + }, + { + "ce_loss_12": 3.125914978981018, + "ce_loss_17": 3.0019068121910095, + "ce_loss_23": 2.9484395027160644, + "ce_loss_3": 3.9122184991836546, + "ce_loss_6": 3.4705653548240663, + "epoch": 0.443, + "grad_norm": 1096.0, + "kl_loss_12": 370.42543334960936, + "kl_loss_17": 95.97517318725586, + "kl_loss_3": 2097.5520629882812, + "kl_loss_6": 1187.7123474121095, + "learning_rate": 0.0005977396752879741, + "loss": 933.6965, + "step": 4430 + }, + { + "ce_loss_12": 3.059049963951111, + "ce_loss_17": 2.9283310770988464, + "ce_loss_23": 2.877157747745514, + "ce_loss_3": 3.843676042556763, + "ce_loss_6": 3.412899672985077, + "epoch": 0.444, + "grad_norm": 1344.0, + "kl_loss_12": 375.6527328491211, + "kl_loss_17": 94.45379600524902, + "kl_loss_3": 2104.6698791503904, + "kl_loss_6": 1208.5965942382813, + "learning_rate": 0.0005961831330156305, + "loss": 929.7277, + "step": 4440 + }, + { + "ce_loss_12": 3.1946892142295837, + "ce_loss_17": 3.0648535251617433, + "ce_loss_23": 3.0128404259681703, + "ce_loss_3": 3.9888625264167787, + "ce_loss_6": 3.547437810897827, + "epoch": 0.445, + "grad_norm": 992.0, + "kl_loss_12": 371.147119140625, + "kl_loss_17": 95.90944900512696, + "kl_loss_3": 2109.418914794922, + "kl_loss_6": 1197.6739135742187, + "learning_rate": 0.0005946256221802051, + "loss": 956.1494, + "step": 4450 + }, + { + "ce_loss_12": 3.1555089116096497, + "ce_loss_17": 3.0345204710960387, + "ce_loss_23": 2.986414337158203, + "ce_loss_3": 3.897111940383911, + "ce_loss_6": 3.481855571269989, + "epoch": 0.446, + "grad_norm": 1160.0, + "kl_loss_12": 356.09037628173826, + "kl_loss_17": 93.4267032623291, + "kl_loss_3": 1978.97578125, + "kl_loss_6": 1134.439404296875, + "learning_rate": 0.0005930671584658151, + "loss": 946.3023, + "step": 4460 + }, + { + "ce_loss_12": 3.1738540887832642, + "ce_loss_17": 3.047953486442566, + "ce_loss_23": 2.9957111835479737, + "ce_loss_3": 3.9466116666793822, + "ce_loss_6": 3.5165825486183167, + "epoch": 0.447, + "grad_norm": 912.0, + "kl_loss_12": 373.43541717529297, + "kl_loss_17": 96.29221115112304, + "kl_loss_3": 2065.424639892578, + "kl_loss_6": 1179.5342651367187, + "learning_rate": 0.0005915077575661722, + "loss": 939.1451, + "step": 4470 + }, + { + "ce_loss_12": 3.187889552116394, + "ce_loss_17": 3.0592567563056945, + "ce_loss_23": 3.0062554359436033, + "ce_loss_3": 3.9646561622619627, + "ce_loss_6": 3.530262494087219, + "epoch": 0.448, + "grad_norm": 792.0, + "kl_loss_12": 382.846728515625, + "kl_loss_17": 99.91208915710449, + "kl_loss_3": 2096.3197631835938, + "kl_loss_6": 1201.9329803466796, + "learning_rate": 0.000589947435184427, + "loss": 929.466, + "step": 4480 + }, + { + "ce_loss_12": 3.2426429748535157, + "ce_loss_17": 3.1219165563583373, + "ce_loss_23": 3.069086766242981, + "ce_loss_3": 3.97622035741806, + "ce_loss_6": 3.5719561100006105, + "epoch": 0.449, + "grad_norm": 956.0, + "kl_loss_12": 369.9361053466797, + "kl_loss_17": 97.55784950256347, + "kl_loss_3": 2008.1300842285157, + "kl_loss_6": 1160.9046325683594, + "learning_rate": 0.0005883862070330078, + "loss": 922.6117, + "step": 4490 + }, + { + "ce_loss_12": 3.188134014606476, + "ce_loss_17": 3.0585672497749328, + "ce_loss_23": 3.007031726837158, + "ce_loss_3": 3.9629687309265136, + "ce_loss_6": 3.5271342396736145, + "epoch": 0.45, + "grad_norm": 1264.0, + "kl_loss_12": 377.42894134521487, + "kl_loss_17": 97.14705848693848, + "kl_loss_3": 2071.5281982421875, + "kl_loss_6": 1186.3728515625, + "learning_rate": 0.0005868240888334653, + "loss": 927.5315, + "step": 4500 + }, + { + "ce_loss_12": 3.079320514202118, + "ce_loss_17": 2.9505565047264097, + "ce_loss_23": 2.894635808467865, + "ce_loss_3": 3.8832791805267335, + "ce_loss_6": 3.433999705314636, + "epoch": 0.451, + "grad_norm": 1040.0, + "kl_loss_12": 379.57346801757814, + "kl_loss_17": 97.66094398498535, + "kl_loss_3": 2117.039208984375, + "kl_loss_6": 1208.289532470703, + "learning_rate": 0.0005852610963163119, + "loss": 945.0416, + "step": 4510 + }, + { + "ce_loss_12": 3.098237121105194, + "ce_loss_17": 2.971955049037933, + "ce_loss_23": 2.921879529953003, + "ce_loss_3": 3.8654279828071596, + "ce_loss_6": 3.443196380138397, + "epoch": 0.452, + "grad_norm": 1200.0, + "kl_loss_12": 367.5508056640625, + "kl_loss_17": 95.00853729248047, + "kl_loss_3": 2048.9790588378905, + "kl_loss_6": 1173.7472778320312, + "learning_rate": 0.0005836972452208654, + "loss": 917.1092, + "step": 4520 + }, + { + "ce_loss_12": 3.0992449522018433, + "ce_loss_17": 2.9783245265483855, + "ce_loss_23": 2.92637876868248, + "ce_loss_3": 3.888488268852234, + "ce_loss_6": 3.4487679481506346, + "epoch": 0.453, + "grad_norm": 1216.0, + "kl_loss_12": 372.78917388916017, + "kl_loss_17": 97.17004623413087, + "kl_loss_3": 2088.7503173828127, + "kl_loss_6": 1194.8785522460937, + "learning_rate": 0.0005821325512950885, + "loss": 936.7268, + "step": 4530 + }, + { + "ce_loss_12": 3.124701368808746, + "ce_loss_17": 2.9975485682487486, + "ce_loss_23": 2.947983968257904, + "ce_loss_3": 3.8898465156555178, + "ce_loss_6": 3.468051314353943, + "epoch": 0.454, + "grad_norm": 1004.0, + "kl_loss_12": 361.3929946899414, + "kl_loss_17": 94.27552947998046, + "kl_loss_3": 2016.76611328125, + "kl_loss_6": 1158.698434448242, + "learning_rate": 0.0005805670302954321, + "loss": 926.6719, + "step": 4540 + }, + { + "ce_loss_12": 3.1272517681121825, + "ce_loss_17": 3.0075201272964476, + "ce_loss_23": 2.954355263710022, + "ce_loss_3": 3.896501433849335, + "ce_loss_6": 3.466426062583923, + "epoch": 0.455, + "grad_norm": 1032.0, + "kl_loss_12": 365.66592864990236, + "kl_loss_17": 94.69521942138672, + "kl_loss_3": 2050.3198303222657, + "kl_loss_6": 1168.731463623047, + "learning_rate": 0.000579000697986675, + "loss": 918.8904, + "step": 4550 + }, + { + "ce_loss_12": 3.0973682522773744, + "ce_loss_17": 2.9664054989814757, + "ce_loss_23": 2.9097727060317995, + "ce_loss_3": 3.914142680168152, + "ce_loss_6": 3.4645751833915712, + "epoch": 0.456, + "grad_norm": 836.0, + "kl_loss_12": 383.7973129272461, + "kl_loss_17": 98.52790298461915, + "kl_loss_3": 2142.2695434570314, + "kl_loss_6": 1232.745263671875, + "learning_rate": 0.0005774335701417662, + "loss": 941.8937, + "step": 4560 + }, + { + "ce_loss_12": 3.0880584955215453, + "ce_loss_17": 2.960580062866211, + "ce_loss_23": 2.908521521091461, + "ce_loss_3": 3.900552272796631, + "ce_loss_6": 3.445142912864685, + "epoch": 0.457, + "grad_norm": 936.0, + "kl_loss_12": 373.11179809570314, + "kl_loss_17": 94.61213645935058, + "kl_loss_3": 2147.471405029297, + "kl_loss_6": 1208.840509033203, + "learning_rate": 0.0005758656625416658, + "loss": 940.7441, + "step": 4570 + }, + { + "ce_loss_12": 3.1407280206680297, + "ce_loss_17": 3.015048694610596, + "ce_loss_23": 2.9619325399398804, + "ce_loss_3": 3.9227320909500123, + "ce_loss_6": 3.4882035851478577, + "epoch": 0.458, + "grad_norm": 960.0, + "kl_loss_12": 373.65845947265626, + "kl_loss_17": 97.12631492614746, + "kl_loss_3": 2067.6907592773437, + "kl_loss_6": 1185.5317321777343, + "learning_rate": 0.0005742969909751859, + "loss": 918.8994, + "step": 4580 + }, + { + "ce_loss_12": 3.1578575372695923, + "ce_loss_17": 3.030971610546112, + "ce_loss_23": 2.9798396110534666, + "ce_loss_3": 3.932333254814148, + "ce_loss_6": 3.497673225402832, + "epoch": 0.459, + "grad_norm": 996.0, + "kl_loss_12": 373.4551177978516, + "kl_loss_17": 96.02978591918945, + "kl_loss_3": 2071.560614013672, + "kl_loss_6": 1177.2343200683595, + "learning_rate": 0.0005727275712388318, + "loss": 936.2139, + "step": 4590 + }, + { + "ce_loss_12": 3.166626787185669, + "ce_loss_17": 3.0451488375663756, + "ce_loss_23": 2.9955615639686584, + "ce_loss_3": 3.912241852283478, + "ce_loss_6": 3.4970428705215455, + "epoch": 0.46, + "grad_norm": 1120.0, + "kl_loss_12": 359.3360260009766, + "kl_loss_17": 92.84842300415039, + "kl_loss_3": 2001.1885681152344, + "kl_loss_6": 1146.0368438720702, + "learning_rate": 0.0005711574191366427, + "loss": 913.1519, + "step": 4600 + }, + { + "ce_loss_12": 3.1245293021202087, + "ce_loss_17": 3.0007786989212035, + "ce_loss_23": 2.950817012786865, + "ce_loss_3": 3.8915412545204164, + "ce_loss_6": 3.461117720603943, + "epoch": 0.461, + "grad_norm": 944.0, + "kl_loss_12": 364.3477020263672, + "kl_loss_17": 93.56401329040527, + "kl_loss_3": 2047.8031494140625, + "kl_loss_6": 1158.8289428710937, + "learning_rate": 0.0005695865504800327, + "loss": 914.1635, + "step": 4610 + }, + { + "ce_loss_12": 3.0745707035064695, + "ce_loss_17": 2.939027392864227, + "ce_loss_23": 2.8857049703598023, + "ce_loss_3": 3.928879237174988, + "ce_loss_6": 3.456984746456146, + "epoch": 0.462, + "grad_norm": 1104.0, + "kl_loss_12": 392.5043655395508, + "kl_loss_17": 99.27418518066406, + "kl_loss_3": 2230.4755615234376, + "kl_loss_6": 1273.105810546875, + "learning_rate": 0.0005680149810876322, + "loss": 958.7232, + "step": 4620 + }, + { + "ce_loss_12": 3.1221439361572267, + "ce_loss_17": 2.9936205983161925, + "ce_loss_23": 2.9434250235557555, + "ce_loss_3": 3.906174600124359, + "ce_loss_6": 3.469303858280182, + "epoch": 0.463, + "grad_norm": 968.0, + "kl_loss_12": 367.0161819458008, + "kl_loss_17": 94.79076499938965, + "kl_loss_3": 2075.8826171875, + "kl_loss_6": 1182.3226745605468, + "learning_rate": 0.0005664427267851271, + "loss": 926.7291, + "step": 4630 + }, + { + "ce_loss_12": 3.0404475688934327, + "ce_loss_17": 2.914381778240204, + "ce_loss_23": 2.863242745399475, + "ce_loss_3": 3.8283162117004395, + "ce_loss_6": 3.390611732006073, + "epoch": 0.464, + "grad_norm": 1048.0, + "kl_loss_12": 364.2582321166992, + "kl_loss_17": 93.55969924926758, + "kl_loss_3": 2070.353057861328, + "kl_loss_6": 1176.0620391845703, + "learning_rate": 0.0005648698034051009, + "loss": 921.2185, + "step": 4640 + }, + { + "ce_loss_12": 3.1544418692588807, + "ce_loss_17": 3.0264967560768126, + "ce_loss_23": 2.973117482662201, + "ce_loss_3": 3.952759790420532, + "ce_loss_6": 3.5050989270210264, + "epoch": 0.465, + "grad_norm": 980.0, + "kl_loss_12": 368.0677200317383, + "kl_loss_17": 95.34860954284667, + "kl_loss_3": 2107.058544921875, + "kl_loss_6": 1191.1350402832031, + "learning_rate": 0.0005632962267868747, + "loss": 922.8924, + "step": 4650 + }, + { + "ce_loss_12": 3.086225914955139, + "ce_loss_17": 2.9620978832244873, + "ce_loss_23": 2.912720251083374, + "ce_loss_3": 3.851222813129425, + "ce_loss_6": 3.42860985994339, + "epoch": 0.466, + "grad_norm": 1136.0, + "kl_loss_12": 358.4297149658203, + "kl_loss_17": 91.42180213928222, + "kl_loss_3": 2045.6289489746093, + "kl_loss_6": 1169.5674255371093, + "learning_rate": 0.0005617220127763474, + "loss": 924.683, + "step": 4660 + }, + { + "ce_loss_12": 3.1667036652565, + "ce_loss_17": 3.041495943069458, + "ce_loss_23": 2.991810345649719, + "ce_loss_3": 3.9326154589653015, + "ce_loss_6": 3.5035176277160645, + "epoch": 0.467, + "grad_norm": 1024.0, + "kl_loss_12": 368.75465240478513, + "kl_loss_17": 94.89134712219239, + "kl_loss_3": 2039.9618713378907, + "kl_loss_6": 1157.8766632080078, + "learning_rate": 0.0005601471772258368, + "loss": 925.9684, + "step": 4670 + }, + { + "ce_loss_12": 3.148939609527588, + "ce_loss_17": 3.0280992150306703, + "ce_loss_23": 2.9784193873405456, + "ce_loss_3": 3.905656802654266, + "ce_loss_6": 3.4869115591049193, + "epoch": 0.468, + "grad_norm": 876.0, + "kl_loss_12": 363.73346557617185, + "kl_loss_17": 94.9500545501709, + "kl_loss_3": 1996.96025390625, + "kl_loss_6": 1144.792901611328, + "learning_rate": 0.0005585717359939192, + "loss": 927.9094, + "step": 4680 + }, + { + "ce_loss_12": 3.0635921597480773, + "ce_loss_17": 2.9403783679008484, + "ce_loss_23": 2.890295124053955, + "ce_loss_3": 3.8274804949760437, + "ce_loss_6": 3.4014335751533507, + "epoch": 0.469, + "grad_norm": 1032.0, + "kl_loss_12": 362.45594482421876, + "kl_loss_17": 93.71986045837403, + "kl_loss_3": 2028.2100341796875, + "kl_loss_6": 1157.830874633789, + "learning_rate": 0.0005569957049452703, + "loss": 932.6887, + "step": 4690 + }, + { + "ce_loss_12": 3.1199838757514953, + "ce_loss_17": 2.9960068225860597, + "ce_loss_23": 2.9443670392036436, + "ce_loss_3": 3.908629536628723, + "ce_loss_6": 3.465496504306793, + "epoch": 0.47, + "grad_norm": 1392.0, + "kl_loss_12": 372.039079284668, + "kl_loss_17": 97.96028518676758, + "kl_loss_3": 2081.887371826172, + "kl_loss_6": 1184.3350677490234, + "learning_rate": 0.0005554190999505056, + "loss": 934.4084, + "step": 4700 + }, + { + "ce_loss_12": 3.241987943649292, + "ce_loss_17": 3.112707030773163, + "ce_loss_23": 3.0603973388671877, + "ce_loss_3": 4.018198192119598, + "ce_loss_6": 3.590498960018158, + "epoch": 0.471, + "grad_norm": 1112.0, + "kl_loss_12": 382.9134002685547, + "kl_loss_17": 98.71521873474121, + "kl_loss_3": 2076.810894775391, + "kl_loss_6": 1192.9006103515626, + "learning_rate": 0.0005538419368860196, + "loss": 902.3693, + "step": 4710 + }, + { + "ce_loss_12": 3.163465344905853, + "ce_loss_17": 3.040043067932129, + "ce_loss_23": 2.9864439606666564, + "ce_loss_3": 3.9220748066902162, + "ce_loss_6": 3.4972716927528382, + "epoch": 0.472, + "grad_norm": 952.0, + "kl_loss_12": 371.4324691772461, + "kl_loss_17": 96.35890159606933, + "kl_loss_3": 2035.002734375, + "kl_loss_6": 1170.274884033203, + "learning_rate": 0.0005522642316338268, + "loss": 940.209, + "step": 4720 + }, + { + "ce_loss_12": 3.1730767607688906, + "ce_loss_17": 3.0522518277168276, + "ce_loss_23": 3.00358167886734, + "ce_loss_3": 3.935346007347107, + "ce_loss_6": 3.5060906767845155, + "epoch": 0.473, + "grad_norm": 1056.0, + "kl_loss_12": 368.5505630493164, + "kl_loss_17": 95.56421852111816, + "kl_loss_3": 2031.9708740234375, + "kl_loss_6": 1157.9430053710937, + "learning_rate": 0.0005506860000814017, + "loss": 940.0187, + "step": 4730 + }, + { + "ce_loss_12": 3.19408575296402, + "ce_loss_17": 3.076188790798187, + "ce_loss_23": 3.028001570701599, + "ce_loss_3": 3.931719958782196, + "ce_loss_6": 3.524571084976196, + "epoch": 0.474, + "grad_norm": 992.0, + "kl_loss_12": 359.36453399658205, + "kl_loss_17": 91.8622055053711, + "kl_loss_3": 1987.8832092285156, + "kl_loss_6": 1147.102392578125, + "learning_rate": 0.0005491072581215186, + "loss": 917.3689, + "step": 4740 + }, + { + "ce_loss_12": 3.195579099655151, + "ce_loss_17": 3.068587398529053, + "ce_loss_23": 3.0141397595405577, + "ce_loss_3": 3.949463403224945, + "ce_loss_6": 3.5331989526748657, + "epoch": 0.475, + "grad_norm": 1120.0, + "kl_loss_12": 373.22421112060545, + "kl_loss_17": 98.32062301635742, + "kl_loss_3": 2049.5393127441407, + "kl_loss_6": 1178.9929168701171, + "learning_rate": 0.0005475280216520913, + "loss": 907.7835, + "step": 4750 + }, + { + "ce_loss_12": 3.116903507709503, + "ce_loss_17": 2.995867669582367, + "ce_loss_23": 2.945862424373627, + "ce_loss_3": 3.8652034997940063, + "ce_loss_6": 3.447264242172241, + "epoch": 0.476, + "grad_norm": 1240.0, + "kl_loss_12": 358.89072570800784, + "kl_loss_17": 92.7595603942871, + "kl_loss_3": 1993.6135681152343, + "kl_loss_6": 1138.1050079345703, + "learning_rate": 0.0005459483065760138, + "loss": 924.2568, + "step": 4760 + }, + { + "ce_loss_12": 3.064214551448822, + "ce_loss_17": 2.9375820994377135, + "ce_loss_23": 2.889118230342865, + "ce_loss_3": 3.888459062576294, + "ce_loss_6": 3.4316441893577574, + "epoch": 0.477, + "grad_norm": 1072.0, + "kl_loss_12": 369.9542205810547, + "kl_loss_17": 94.08586616516114, + "kl_loss_3": 2152.941357421875, + "kl_loss_6": 1222.8913269042969, + "learning_rate": 0.0005443681288009991, + "loss": 939.2311, + "step": 4770 + }, + { + "ce_loss_12": 3.1037795662879946, + "ce_loss_17": 2.985719680786133, + "ce_loss_23": 2.9347479224205015, + "ce_loss_3": 3.8800550937652587, + "ce_loss_6": 3.45303328037262, + "epoch": 0.478, + "grad_norm": 952.0, + "kl_loss_12": 362.2948623657227, + "kl_loss_17": 93.37007637023926, + "kl_loss_3": 2054.378765869141, + "kl_loss_6": 1174.0726287841796, + "learning_rate": 0.0005427875042394199, + "loss": 924.9251, + "step": 4780 + }, + { + "ce_loss_12": 3.148194468021393, + "ce_loss_17": 3.0246357798576353, + "ce_loss_23": 2.9726595759391783, + "ce_loss_3": 3.9054253578186033, + "ce_loss_6": 3.4833449959754943, + "epoch": 0.479, + "grad_norm": 952.0, + "kl_loss_12": 369.14268493652344, + "kl_loss_17": 97.6407917022705, + "kl_loss_3": 2021.9542907714845, + "kl_loss_6": 1162.2347930908204, + "learning_rate": 0.0005412064488081482, + "loss": 930.3105, + "step": 4790 + }, + { + "ce_loss_12": 3.137005865573883, + "ce_loss_17": 3.0189573764801025, + "ce_loss_23": 2.967451441287994, + "ce_loss_3": 3.8918606281280517, + "ce_loss_6": 3.468139088153839, + "epoch": 0.48, + "grad_norm": 1128.0, + "kl_loss_12": 352.2945220947266, + "kl_loss_17": 92.58756942749024, + "kl_loss_3": 1984.3945434570312, + "kl_loss_6": 1131.745068359375, + "learning_rate": 0.0005396249784283942, + "loss": 899.6717, + "step": 4800 + }, + { + "ce_loss_12": 3.1608723282814024, + "ce_loss_17": 3.037445676326752, + "ce_loss_23": 2.983047103881836, + "ce_loss_3": 3.955673432350159, + "ce_loss_6": 3.5141472697257994, + "epoch": 0.481, + "grad_norm": 1160.0, + "kl_loss_12": 373.91689453125, + "kl_loss_17": 97.65786895751953, + "kl_loss_3": 2108.4563049316407, + "kl_loss_6": 1214.4124877929687, + "learning_rate": 0.0005380431090255476, + "loss": 939.9953, + "step": 4810 + }, + { + "ce_loss_12": 3.1526288866996763, + "ce_loss_17": 3.036070692539215, + "ce_loss_23": 2.987703096866608, + "ce_loss_3": 3.9051117300987244, + "ce_loss_6": 3.4906025767326354, + "epoch": 0.482, + "grad_norm": 992.0, + "kl_loss_12": 350.09937438964846, + "kl_loss_17": 90.7152587890625, + "kl_loss_3": 1995.6268737792968, + "kl_loss_6": 1147.331491088867, + "learning_rate": 0.0005364608565290155, + "loss": 901.2539, + "step": 4820 + }, + { + "ce_loss_12": 3.163371217250824, + "ce_loss_17": 3.04144287109375, + "ce_loss_23": 2.988657009601593, + "ce_loss_3": 3.927390766143799, + "ce_loss_6": 3.5088889718055727, + "epoch": 0.483, + "grad_norm": 1080.0, + "kl_loss_12": 367.5927398681641, + "kl_loss_17": 95.43120765686035, + "kl_loss_3": 2036.2733276367187, + "kl_loss_6": 1174.790771484375, + "learning_rate": 0.0005348782368720626, + "loss": 919.0738, + "step": 4830 + }, + { + "ce_loss_12": 3.1053726196289064, + "ce_loss_17": 2.9821720719337463, + "ce_loss_23": 2.932705020904541, + "ce_loss_3": 3.8632062315940856, + "ce_loss_6": 3.4345524430274965, + "epoch": 0.484, + "grad_norm": 1296.0, + "kl_loss_12": 355.072282409668, + "kl_loss_17": 91.17291488647462, + "kl_loss_3": 2003.0337890625, + "kl_loss_6": 1133.6176971435548, + "learning_rate": 0.000533295265991652, + "loss": 911.5024, + "step": 4840 + }, + { + "ce_loss_12": 3.170707905292511, + "ce_loss_17": 3.0478097796440125, + "ce_loss_23": 2.995159554481506, + "ce_loss_3": 3.919725406169891, + "ce_loss_6": 3.5044790148735045, + "epoch": 0.485, + "grad_norm": 1000.0, + "kl_loss_12": 361.09333953857424, + "kl_loss_17": 92.84121818542481, + "kl_loss_3": 1998.759344482422, + "kl_loss_6": 1147.2231689453124, + "learning_rate": 0.0005317119598282822, + "loss": 902.2107, + "step": 4850 + }, + { + "ce_loss_12": 3.171194279193878, + "ce_loss_17": 3.047281098365784, + "ce_loss_23": 2.99584538936615, + "ce_loss_3": 3.935044729709625, + "ce_loss_6": 3.5136531472206114, + "epoch": 0.486, + "grad_norm": 1264.0, + "kl_loss_12": 368.4178924560547, + "kl_loss_17": 94.40206680297851, + "kl_loss_3": 2022.4554504394532, + "kl_loss_6": 1157.0905395507812, + "learning_rate": 0.0005301283343258293, + "loss": 910.2678, + "step": 4860 + }, + { + "ce_loss_12": 3.225717294216156, + "ce_loss_17": 3.102791738510132, + "ce_loss_23": 3.052597498893738, + "ce_loss_3": 3.970711553096771, + "ce_loss_6": 3.5527042150497437, + "epoch": 0.487, + "grad_norm": 1240.0, + "kl_loss_12": 364.56507263183596, + "kl_loss_17": 97.28554267883301, + "kl_loss_3": 1994.2457397460937, + "kl_loss_6": 1140.4625762939454, + "learning_rate": 0.000528544405431384, + "loss": 898.1916, + "step": 4870 + }, + { + "ce_loss_12": 3.1127439618110655, + "ce_loss_17": 2.9833473324775697, + "ce_loss_23": 2.9293392419815065, + "ce_loss_3": 3.8848477840423583, + "ce_loss_6": 3.4620235204696654, + "epoch": 0.488, + "grad_norm": 936.0, + "kl_loss_12": 375.3154983520508, + "kl_loss_17": 95.93160438537598, + "kl_loss_3": 2063.953057861328, + "kl_loss_6": 1193.917742919922, + "learning_rate": 0.000526960189095093, + "loss": 928.2098, + "step": 4880 + }, + { + "ce_loss_12": 3.0926320910453797, + "ce_loss_17": 2.9746638536453247, + "ce_loss_23": 2.9252131819725036, + "ce_loss_3": 3.852759027481079, + "ce_loss_6": 3.428392505645752, + "epoch": 0.489, + "grad_norm": 1224.0, + "kl_loss_12": 355.71148986816405, + "kl_loss_17": 92.43701934814453, + "kl_loss_3": 1994.89287109375, + "kl_loss_6": 1139.5907104492187, + "learning_rate": 0.0005253757012699972, + "loss": 904.277, + "step": 4890 + }, + { + "ce_loss_12": 3.1681107759475706, + "ce_loss_17": 3.044089150428772, + "ce_loss_23": 2.995683467388153, + "ce_loss_3": 3.917013776302338, + "ce_loss_6": 3.4973212122917174, + "epoch": 0.49, + "grad_norm": 1256.0, + "kl_loss_12": 363.9573379516602, + "kl_loss_17": 93.36282806396484, + "kl_loss_3": 2007.33369140625, + "kl_loss_6": 1150.3894775390625, + "learning_rate": 0.0005237909579118712, + "loss": 922.1821, + "step": 4900 + }, + { + "ce_loss_12": 3.143178117275238, + "ce_loss_17": 3.012323999404907, + "ce_loss_23": 2.957292902469635, + "ce_loss_3": 3.92088303565979, + "ce_loss_6": 3.4861607670784, + "epoch": 0.491, + "grad_norm": 1112.0, + "kl_loss_12": 376.0688873291016, + "kl_loss_17": 97.68312149047851, + "kl_loss_3": 2079.636608886719, + "kl_loss_6": 1186.3177856445313, + "learning_rate": 0.0005222059749790631, + "loss": 928.2946, + "step": 4910 + }, + { + "ce_loss_12": 3.1950509667396547, + "ce_loss_17": 3.075741231441498, + "ce_loss_23": 3.024643123149872, + "ce_loss_3": 3.918723690509796, + "ce_loss_6": 3.5106842637062075, + "epoch": 0.492, + "grad_norm": 900.0, + "kl_loss_12": 355.10656890869143, + "kl_loss_17": 92.66862754821777, + "kl_loss_3": 1956.1288940429688, + "kl_loss_6": 1122.1956909179687, + "learning_rate": 0.0005206207684323337, + "loss": 884.5479, + "step": 4920 + }, + { + "ce_loss_12": 3.177818274497986, + "ce_loss_17": 3.055434787273407, + "ce_loss_23": 3.0034319281578066, + "ce_loss_3": 3.9366918325424196, + "ce_loss_6": 3.518252801895142, + "epoch": 0.493, + "grad_norm": 1192.0, + "kl_loss_12": 367.2648208618164, + "kl_loss_17": 95.72131729125977, + "kl_loss_3": 2022.9251403808594, + "kl_loss_6": 1167.5449096679688, + "learning_rate": 0.000519035354234695, + "loss": 928.926, + "step": 4930 + }, + { + "ce_loss_12": 3.1576064467430114, + "ce_loss_17": 3.0290244936943056, + "ce_loss_23": 2.9728461861610413, + "ce_loss_3": 3.9173558950424194, + "ce_loss_6": 3.4981216549873353, + "epoch": 0.494, + "grad_norm": 1224.0, + "kl_loss_12": 372.4341445922852, + "kl_loss_17": 98.32537727355957, + "kl_loss_3": 2013.0077697753907, + "kl_loss_6": 1171.3818298339843, + "learning_rate": 0.0005174497483512506, + "loss": 901.1791, + "step": 4940 + }, + { + "ce_loss_12": 3.193759799003601, + "ce_loss_17": 3.075028383731842, + "ce_loss_23": 3.0255154371261597, + "ce_loss_3": 3.9381660342216493, + "ce_loss_6": 3.526125121116638, + "epoch": 0.495, + "grad_norm": 944.0, + "kl_loss_12": 356.26817474365237, + "kl_loss_17": 92.91269378662109, + "kl_loss_3": 2008.5030517578125, + "kl_loss_6": 1151.6687255859374, + "learning_rate": 0.0005158639667490339, + "loss": 917.3115, + "step": 4950 + }, + { + "ce_loss_12": 3.112211060523987, + "ce_loss_17": 2.986400294303894, + "ce_loss_23": 2.936023163795471, + "ce_loss_3": 3.8713951110839844, + "ce_loss_6": 3.450876069068909, + "epoch": 0.496, + "grad_norm": 1016.0, + "kl_loss_12": 363.6869583129883, + "kl_loss_17": 93.26083564758301, + "kl_loss_3": 2023.2762573242187, + "kl_loss_6": 1160.9851287841798, + "learning_rate": 0.0005142780253968481, + "loss": 911.3936, + "step": 4960 + }, + { + "ce_loss_12": 3.0601794362068175, + "ce_loss_17": 2.942017126083374, + "ce_loss_23": 2.8925352931022643, + "ce_loss_3": 3.8043387174606322, + "ce_loss_6": 3.383818244934082, + "epoch": 0.497, + "grad_norm": 1032.0, + "kl_loss_12": 351.1629837036133, + "kl_loss_17": 91.15731086730958, + "kl_loss_3": 1988.4179992675781, + "kl_loss_6": 1128.8858367919922, + "learning_rate": 0.0005126919402651053, + "loss": 883.4854, + "step": 4970 + }, + { + "ce_loss_12": 3.1314974069595336, + "ce_loss_17": 3.004086196422577, + "ce_loss_23": 2.949925649166107, + "ce_loss_3": 3.9035497426986696, + "ce_loss_6": 3.4774454474449157, + "epoch": 0.498, + "grad_norm": 1232.0, + "kl_loss_12": 369.9680694580078, + "kl_loss_17": 97.28400802612305, + "kl_loss_3": 2040.5506774902344, + "kl_loss_6": 1173.4661041259765, + "learning_rate": 0.0005111057273256647, + "loss": 918.4297, + "step": 4980 + }, + { + "ce_loss_12": 3.2182634353637694, + "ce_loss_17": 3.1030057668685913, + "ce_loss_23": 3.053433656692505, + "ce_loss_3": 3.9287408590316772, + "ce_loss_6": 3.5249813079833983, + "epoch": 0.499, + "grad_norm": 968.0, + "kl_loss_12": 346.4434982299805, + "kl_loss_17": 89.57510108947754, + "kl_loss_3": 1916.7842163085938, + "kl_loss_6": 1099.401171875, + "learning_rate": 0.0005095194025516733, + "loss": 873.9033, + "step": 4990 + }, + { + "ce_loss_12": 3.149612510204315, + "ce_loss_17": 3.0297858476638795, + "ce_loss_23": 2.983256685733795, + "ce_loss_3": 3.8866697907447816, + "ce_loss_6": 3.4731158018112183, + "epoch": 0.5, + "grad_norm": 1184.0, + "kl_loss_12": 351.9287445068359, + "kl_loss_17": 89.64690284729004, + "kl_loss_3": 1978.0472778320313, + "kl_loss_6": 1126.4499816894531, + "learning_rate": 0.000507932981917404, + "loss": 919.7188, + "step": 5000 + }, + { + "ce_loss_12": 3.1033830046653748, + "ce_loss_17": 2.9746859550476072, + "ce_loss_23": 2.9226545691490173, + "ce_loss_3": 3.9122170090675352, + "ce_loss_6": 3.4662346839904785, + "epoch": 0.501, + "grad_norm": 1096.0, + "kl_loss_12": 374.91630401611326, + "kl_loss_17": 97.60987014770508, + "kl_loss_3": 2124.3409912109373, + "kl_loss_6": 1212.289617919922, + "learning_rate": 0.0005063464813980949, + "loss": 944.7277, + "step": 5010 + }, + { + "ce_loss_12": 3.0833741784095765, + "ce_loss_17": 2.9644308209419252, + "ce_loss_23": 2.91454701423645, + "ce_loss_3": 3.8580423831939696, + "ce_loss_6": 3.424077236652374, + "epoch": 0.502, + "grad_norm": 956.0, + "kl_loss_12": 361.62098236083983, + "kl_loss_17": 93.23923530578614, + "kl_loss_3": 2055.8266784667967, + "kl_loss_6": 1173.4408447265625, + "learning_rate": 0.0005047599169697884, + "loss": 910.5002, + "step": 5020 + }, + { + "ce_loss_12": 3.037166953086853, + "ce_loss_17": 2.9079468607902528, + "ce_loss_23": 2.855610752105713, + "ce_loss_3": 3.8071164965629576, + "ce_loss_6": 3.3815191626548766, + "epoch": 0.503, + "grad_norm": 1192.0, + "kl_loss_12": 360.40843505859374, + "kl_loss_17": 93.68807334899903, + "kl_loss_3": 2028.0902526855468, + "kl_loss_6": 1152.7849853515625, + "learning_rate": 0.000503173304609171, + "loss": 890.5323, + "step": 5030 + }, + { + "ce_loss_12": 3.143783712387085, + "ce_loss_17": 3.0220723867416384, + "ce_loss_23": 2.97117475271225, + "ce_loss_3": 3.904027593135834, + "ce_loss_6": 3.487496542930603, + "epoch": 0.504, + "grad_norm": 1232.0, + "kl_loss_12": 358.4159912109375, + "kl_loss_17": 92.68148765563964, + "kl_loss_3": 2011.387628173828, + "kl_loss_6": 1162.4315704345704, + "learning_rate": 0.0005015866602934111, + "loss": 893.0471, + "step": 5040 + }, + { + "ce_loss_12": 3.1251502990722657, + "ce_loss_17": 2.9951428532600404, + "ce_loss_23": 2.943488895893097, + "ce_loss_3": 3.918997848033905, + "ce_loss_6": 3.479409730434418, + "epoch": 0.505, + "grad_norm": 1344.0, + "kl_loss_12": 379.06178131103513, + "kl_loss_17": 98.6830223083496, + "kl_loss_3": 2080.636865234375, + "kl_loss_6": 1195.0086212158203, + "learning_rate": 0.0005, + "loss": 921.599, + "step": 5050 + }, + { + "ce_loss_12": 3.1191251039505006, + "ce_loss_17": 2.9923450350761414, + "ce_loss_23": 2.94129501581192, + "ce_loss_3": 3.8779847741127016, + "ce_loss_6": 3.4552345752716063, + "epoch": 0.506, + "grad_norm": 1328.0, + "kl_loss_12": 367.75157928466797, + "kl_loss_17": 96.9900463104248, + "kl_loss_3": 2014.9070922851563, + "kl_loss_6": 1160.6669158935547, + "learning_rate": 0.0004984133397065889, + "loss": 897.3158, + "step": 5060 + }, + { + "ce_loss_12": 3.1227738738059996, + "ce_loss_17": 2.994934821128845, + "ce_loss_23": 2.941101610660553, + "ce_loss_3": 3.906764030456543, + "ce_loss_6": 3.4776443004608155, + "epoch": 0.507, + "grad_norm": 900.0, + "kl_loss_12": 364.47747497558595, + "kl_loss_17": 94.02899131774902, + "kl_loss_3": 2045.6871948242188, + "kl_loss_6": 1174.2436950683593, + "learning_rate": 0.0004968266953908291, + "loss": 902.484, + "step": 5070 + }, + { + "ce_loss_12": 3.1509108543395996, + "ce_loss_17": 3.033639430999756, + "ce_loss_23": 2.9840845227241517, + "ce_loss_3": 3.928315889835358, + "ce_loss_6": 3.496352505683899, + "epoch": 0.508, + "grad_norm": 1320.0, + "kl_loss_12": 357.41788177490236, + "kl_loss_17": 91.3696117401123, + "kl_loss_3": 2039.4884704589845, + "kl_loss_6": 1157.7149505615234, + "learning_rate": 0.0004952400830302117, + "loss": 910.76, + "step": 5080 + }, + { + "ce_loss_12": 3.091415858268738, + "ce_loss_17": 2.965611445903778, + "ce_loss_23": 2.913990008831024, + "ce_loss_3": 3.8812857627868653, + "ce_loss_6": 3.446223962306976, + "epoch": 0.509, + "grad_norm": 952.0, + "kl_loss_12": 371.3483428955078, + "kl_loss_17": 94.62365684509277, + "kl_loss_3": 2075.6438903808594, + "kl_loss_6": 1190.573193359375, + "learning_rate": 0.0004936535186019053, + "loss": 913.7413, + "step": 5090 + }, + { + "ce_loss_12": 3.1742780089378355, + "ce_loss_17": 3.0596349954605104, + "ce_loss_23": 3.0131367444992065, + "ce_loss_3": 3.909999895095825, + "ce_loss_6": 3.497714567184448, + "epoch": 0.51, + "grad_norm": 680.0, + "kl_loss_12": 346.65772705078126, + "kl_loss_17": 89.82437286376953, + "kl_loss_3": 1948.2659973144532, + "kl_loss_6": 1110.781576538086, + "learning_rate": 0.000492067018082596, + "loss": 888.6368, + "step": 5100 + }, + { + "ce_loss_12": 3.125352656841278, + "ce_loss_17": 2.9971254229545594, + "ce_loss_23": 2.9447287559509276, + "ce_loss_3": 3.9266711831092835, + "ce_loss_6": 3.488107669353485, + "epoch": 0.511, + "grad_norm": 1128.0, + "kl_loss_12": 373.9530364990234, + "kl_loss_17": 95.72911529541015, + "kl_loss_3": 2099.6183837890626, + "kl_loss_6": 1202.3802612304687, + "learning_rate": 0.0004904805974483267, + "loss": 945.6771, + "step": 5110 + }, + { + "ce_loss_12": 3.238801729679108, + "ce_loss_17": 3.1054012417793273, + "ce_loss_23": 3.050589954853058, + "ce_loss_3": 4.021122086048126, + "ce_loss_6": 3.5947796225547792, + "epoch": 0.512, + "grad_norm": 1176.0, + "kl_loss_12": 387.00370330810546, + "kl_loss_17": 100.9587459564209, + "kl_loss_3": 2088.327673339844, + "kl_loss_6": 1220.1336486816406, + "learning_rate": 0.0004888942726743353, + "loss": 958.7537, + "step": 5120 + }, + { + "ce_loss_12": 3.107498383522034, + "ce_loss_17": 2.981675314903259, + "ce_loss_23": 2.9318625807762144, + "ce_loss_3": 3.886816442012787, + "ce_loss_6": 3.460335433483124, + "epoch": 0.513, + "grad_norm": 1008.0, + "kl_loss_12": 368.3828353881836, + "kl_loss_17": 94.39456787109376, + "kl_loss_3": 2080.064324951172, + "kl_loss_6": 1183.9714385986329, + "learning_rate": 0.0004873080597348947, + "loss": 928.1082, + "step": 5130 + }, + { + "ce_loss_12": 3.0038116216659545, + "ce_loss_17": 2.8782567024230956, + "ce_loss_23": 2.8273938059806825, + "ce_loss_3": 3.823837125301361, + "ce_loss_6": 3.3672744631767273, + "epoch": 0.514, + "grad_norm": 1128.0, + "kl_loss_12": 364.96567687988284, + "kl_loss_17": 92.57419853210449, + "kl_loss_3": 2139.435168457031, + "kl_loss_6": 1213.2633575439454, + "learning_rate": 0.0004857219746031519, + "loss": 928.2652, + "step": 5140 + }, + { + "ce_loss_12": 3.161403167247772, + "ce_loss_17": 3.0452697038650514, + "ce_loss_23": 2.992450773715973, + "ce_loss_3": 3.908959686756134, + "ce_loss_6": 3.492176127433777, + "epoch": 0.515, + "grad_norm": 920.0, + "kl_loss_12": 359.31848602294923, + "kl_loss_17": 93.00241546630859, + "kl_loss_3": 1990.7133911132812, + "kl_loss_6": 1138.233218383789, + "learning_rate": 0.0004841360332509663, + "loss": 903.4719, + "step": 5150 + }, + { + "ce_loss_12": 3.1129663228988647, + "ce_loss_17": 2.9991484761238096, + "ce_loss_23": 2.9499995946884154, + "ce_loss_3": 3.863976776599884, + "ce_loss_6": 3.447406566143036, + "epoch": 0.516, + "grad_norm": 936.0, + "kl_loss_12": 350.9371063232422, + "kl_loss_17": 89.56887741088867, + "kl_loss_3": 1980.1753356933593, + "kl_loss_6": 1128.3441619873047, + "learning_rate": 0.0004825502516487497, + "loss": 868.0496, + "step": 5160 + }, + { + "ce_loss_12": 3.082901954650879, + "ce_loss_17": 2.963200068473816, + "ce_loss_23": 2.9151252150535583, + "ce_loss_3": 3.8651347398757934, + "ce_loss_6": 3.434497356414795, + "epoch": 0.517, + "grad_norm": 1264.0, + "kl_loss_12": 364.8398162841797, + "kl_loss_17": 92.54983940124512, + "kl_loss_3": 2064.647216796875, + "kl_loss_6": 1177.4377349853517, + "learning_rate": 0.00048096464576530507, + "loss": 924.965, + "step": 5170 + }, + { + "ce_loss_12": 3.1856318593025206, + "ce_loss_17": 3.0665177941322326, + "ce_loss_23": 3.0153189420700075, + "ce_loss_3": 3.9054733991622923, + "ce_loss_6": 3.5001368641853334, + "epoch": 0.518, + "grad_norm": 872.0, + "kl_loss_12": 352.8712661743164, + "kl_loss_17": 93.02919006347656, + "kl_loss_3": 1937.8787841796875, + "kl_loss_6": 1115.0434326171876, + "learning_rate": 0.00047937923156766646, + "loss": 884.1963, + "step": 5180 + }, + { + "ce_loss_12": 3.2206114172935485, + "ce_loss_17": 3.107780420780182, + "ce_loss_23": 3.057665026187897, + "ce_loss_3": 3.9358922600746156, + "ce_loss_6": 3.5398075461387633, + "epoch": 0.519, + "grad_norm": 928.0, + "kl_loss_12": 355.00037841796876, + "kl_loss_17": 91.45009002685546, + "kl_loss_3": 1950.4388427734375, + "kl_loss_6": 1120.3299621582032, + "learning_rate": 0.00047779402502093696, + "loss": 888.757, + "step": 5190 + }, + { + "ce_loss_12": 3.19130482673645, + "ce_loss_17": 3.0732568383216856, + "ce_loss_23": 3.023844051361084, + "ce_loss_3": 3.9327818512916566, + "ce_loss_6": 3.529018151760101, + "epoch": 0.52, + "grad_norm": 976.0, + "kl_loss_12": 357.42832336425784, + "kl_loss_17": 92.59470291137696, + "kl_loss_3": 1972.9936462402343, + "kl_loss_6": 1136.743600463867, + "learning_rate": 0.0004762090420881289, + "loss": 900.1426, + "step": 5200 + }, + { + "ce_loss_12": 3.107156443595886, + "ce_loss_17": 2.9926238656044006, + "ce_loss_23": 2.942973482608795, + "ce_loss_3": 3.8481027603149416, + "ce_loss_6": 3.4392590045928957, + "epoch": 0.521, + "grad_norm": 904.0, + "kl_loss_12": 353.58110961914065, + "kl_loss_17": 92.02993736267089, + "kl_loss_3": 1987.946844482422, + "kl_loss_6": 1144.1151733398438, + "learning_rate": 0.00047462429873000296, + "loss": 883.161, + "step": 5210 + }, + { + "ce_loss_12": 3.1907013177871706, + "ce_loss_17": 3.072627866268158, + "ce_loss_23": 3.019580602645874, + "ce_loss_3": 3.920751178264618, + "ce_loss_6": 3.5119782209396364, + "epoch": 0.522, + "grad_norm": 1280.0, + "kl_loss_12": 359.06115875244143, + "kl_loss_17": 94.30262756347656, + "kl_loss_3": 1971.3868774414063, + "kl_loss_6": 1127.5974426269531, + "learning_rate": 0.0004730398109049071, + "loss": 888.6526, + "step": 5220 + }, + { + "ce_loss_12": 3.1245632767677307, + "ce_loss_17": 2.9969809293746947, + "ce_loss_23": 2.9452990651130677, + "ce_loss_3": 3.9162464261054994, + "ce_loss_6": 3.4832543969154357, + "epoch": 0.523, + "grad_norm": 1104.0, + "kl_loss_12": 372.46168823242186, + "kl_loss_17": 95.79502906799317, + "kl_loss_3": 2096.8451416015623, + "kl_loss_6": 1210.0787658691406, + "learning_rate": 0.000471455594568616, + "loss": 919.6003, + "step": 5230 + }, + { + "ce_loss_12": 3.182738959789276, + "ce_loss_17": 3.0637787580490112, + "ce_loss_23": 3.0149332642555238, + "ce_loss_3": 3.9052466630935667, + "ce_loss_6": 3.50392689704895, + "epoch": 0.524, + "grad_norm": 960.0, + "kl_loss_12": 356.23197479248046, + "kl_loss_17": 92.7229564666748, + "kl_loss_3": 1947.691229248047, + "kl_loss_6": 1117.3941131591796, + "learning_rate": 0.00046987166567417086, + "loss": 896.7503, + "step": 5240 + }, + { + "ce_loss_12": 3.1106405377388002, + "ce_loss_17": 2.9953038454055787, + "ce_loss_23": 2.945481073856354, + "ce_loss_3": 3.8631661891937257, + "ce_loss_6": 3.451376569271088, + "epoch": 0.525, + "grad_norm": 924.0, + "kl_loss_12": 354.53553771972656, + "kl_loss_17": 90.6540023803711, + "kl_loss_3": 1989.9297302246093, + "kl_loss_6": 1144.197021484375, + "learning_rate": 0.00046828804017171776, + "loss": 872.9939, + "step": 5250 + }, + { + "ce_loss_12": 3.1602378487586975, + "ce_loss_17": 3.0360376477241515, + "ce_loss_23": 2.982585108280182, + "ce_loss_3": 3.9386395692825316, + "ce_loss_6": 3.5046775221824644, + "epoch": 0.526, + "grad_norm": 896.0, + "kl_loss_12": 362.33819274902345, + "kl_loss_17": 94.27711219787598, + "kl_loss_3": 2038.4729858398437, + "kl_loss_6": 1152.3221740722656, + "learning_rate": 0.00046670473400834805, + "loss": 916.7203, + "step": 5260 + }, + { + "ce_loss_12": 3.089023697376251, + "ce_loss_17": 2.9738704323768617, + "ce_loss_23": 2.9252676248550413, + "ce_loss_3": 3.8307597994804383, + "ce_loss_6": 3.415070629119873, + "epoch": 0.527, + "grad_norm": 1144.0, + "kl_loss_12": 348.49229125976564, + "kl_loss_17": 89.61092948913574, + "kl_loss_3": 1966.6946228027343, + "kl_loss_6": 1126.227001953125, + "learning_rate": 0.00046512176312793734, + "loss": 918.152, + "step": 5270 + }, + { + "ce_loss_12": 3.0861546993255615, + "ce_loss_17": 2.9649991750717164, + "ce_loss_23": 2.9146908640861513, + "ce_loss_3": 3.834387981891632, + "ce_loss_6": 3.420784020423889, + "epoch": 0.528, + "grad_norm": 968.0, + "kl_loss_12": 354.7429397583008, + "kl_loss_17": 91.66234436035157, + "kl_loss_3": 1994.0042297363282, + "kl_loss_6": 1143.562939453125, + "learning_rate": 0.00046353914347098467, + "loss": 904.6952, + "step": 5280 + }, + { + "ce_loss_12": 3.188432586193085, + "ce_loss_17": 3.0671284794807434, + "ce_loss_23": 3.0156295895576477, + "ce_loss_3": 3.933156156539917, + "ce_loss_6": 3.518996202945709, + "epoch": 0.529, + "grad_norm": 1272.0, + "kl_loss_12": 351.3106292724609, + "kl_loss_17": 91.73496208190917, + "kl_loss_3": 1971.3292175292968, + "kl_loss_6": 1125.4321075439452, + "learning_rate": 0.0004619568909744524, + "loss": 900.5209, + "step": 5290 + }, + { + "ce_loss_12": 3.18534996509552, + "ce_loss_17": 3.066155529022217, + "ce_loss_23": 3.015329587459564, + "ce_loss_3": 3.9257039070129394, + "ce_loss_6": 3.5112979888916014, + "epoch": 0.53, + "grad_norm": 1120.0, + "kl_loss_12": 355.6421432495117, + "kl_loss_17": 91.74192085266114, + "kl_loss_3": 1977.2301391601563, + "kl_loss_6": 1131.9021057128907, + "learning_rate": 0.00046037502157160573, + "loss": 901.8163, + "step": 5300 + }, + { + "ce_loss_12": 3.0694101095199584, + "ce_loss_17": 2.9440566062927247, + "ce_loss_23": 2.893705356121063, + "ce_loss_3": 3.827756106853485, + "ce_loss_6": 3.4043239831924437, + "epoch": 0.531, + "grad_norm": 1104.0, + "kl_loss_12": 363.43805236816405, + "kl_loss_17": 93.20171585083008, + "kl_loss_3": 2020.3906494140624, + "kl_loss_6": 1151.7597961425781, + "learning_rate": 0.00045879355119185207, + "loss": 907.058, + "step": 5310 + }, + { + "ce_loss_12": 3.1483601093292237, + "ce_loss_17": 3.0286761045455934, + "ce_loss_23": 2.9779433488845823, + "ce_loss_3": 3.91595743894577, + "ce_loss_6": 3.4881858229637146, + "epoch": 0.532, + "grad_norm": 1760.0, + "kl_loss_12": 366.58548431396486, + "kl_loss_17": 95.05085411071778, + "kl_loss_3": 2042.227117919922, + "kl_loss_6": 1169.8006713867187, + "learning_rate": 0.0004572124957605803, + "loss": 922.2746, + "step": 5320 + }, + { + "ce_loss_12": 3.1587769985198975, + "ce_loss_17": 3.033687078952789, + "ce_loss_23": 2.9832841396331786, + "ce_loss_3": 3.9054378986358644, + "ce_loss_6": 3.4926193952560425, + "epoch": 0.533, + "grad_norm": 936.0, + "kl_loss_12": 361.675846862793, + "kl_loss_17": 92.17733879089356, + "kl_loss_3": 1999.2652709960937, + "kl_loss_6": 1142.4123748779298, + "learning_rate": 0.00045563187119900103, + "loss": 892.2507, + "step": 5330 + }, + { + "ce_loss_12": 3.011014699935913, + "ce_loss_17": 2.887872743606567, + "ce_loss_23": 2.8407442569732666, + "ce_loss_3": 3.7862492918968202, + "ce_loss_6": 3.3594083547592164, + "epoch": 0.534, + "grad_norm": 1136.0, + "kl_loss_12": 360.4061248779297, + "kl_loss_17": 91.58968658447266, + "kl_loss_3": 2058.1018188476564, + "kl_loss_6": 1166.63935546875, + "learning_rate": 0.00045405169342398633, + "loss": 914.1059, + "step": 5340 + }, + { + "ce_loss_12": 3.1034294724464417, + "ce_loss_17": 2.979544150829315, + "ce_loss_23": 2.928696060180664, + "ce_loss_3": 3.8748332500457763, + "ce_loss_6": 3.445829749107361, + "epoch": 0.535, + "grad_norm": 1136.0, + "kl_loss_12": 363.433837890625, + "kl_loss_17": 94.02782135009765, + "kl_loss_3": 2045.0534240722657, + "kl_loss_6": 1166.5263275146485, + "learning_rate": 0.0004524719783479088, + "loss": 894.2564, + "step": 5350 + }, + { + "ce_loss_12": 3.063448226451874, + "ce_loss_17": 2.9326701164245605, + "ce_loss_23": 2.881749415397644, + "ce_loss_3": 3.8512691020965577, + "ce_loss_6": 3.412831723690033, + "epoch": 0.536, + "grad_norm": 1336.0, + "kl_loss_12": 367.49913635253904, + "kl_loss_17": 93.98178405761719, + "kl_loss_3": 2083.023223876953, + "kl_loss_6": 1178.4531219482421, + "learning_rate": 0.00045089274187848144, + "loss": 901.6736, + "step": 5360 + }, + { + "ce_loss_12": 3.166870927810669, + "ce_loss_17": 3.049614202976227, + "ce_loss_23": 3.0012126564979553, + "ce_loss_3": 3.898930561542511, + "ce_loss_6": 3.487534236907959, + "epoch": 0.537, + "grad_norm": 1352.0, + "kl_loss_12": 352.57630462646483, + "kl_loss_17": 91.50114479064942, + "kl_loss_3": 1975.2602416992188, + "kl_loss_6": 1119.994873046875, + "learning_rate": 0.00044931399991859835, + "loss": 886.5867, + "step": 5370 + }, + { + "ce_loss_12": 3.029424798488617, + "ce_loss_17": 2.9093127846717834, + "ce_loss_23": 2.859842073917389, + "ce_loss_3": 3.7813164949417115, + "ce_loss_6": 3.365421676635742, + "epoch": 0.538, + "grad_norm": 1224.0, + "kl_loss_12": 357.1093246459961, + "kl_loss_17": 90.75511360168457, + "kl_loss_3": 2008.6002685546875, + "kl_loss_6": 1151.0897399902344, + "learning_rate": 0.00044773576836617336, + "loss": 891.3827, + "step": 5380 + }, + { + "ce_loss_12": 3.126391017436981, + "ce_loss_17": 3.002385115623474, + "ce_loss_23": 2.950945734977722, + "ce_loss_3": 3.9017954230308534, + "ce_loss_6": 3.4763889789581297, + "epoch": 0.539, + "grad_norm": 1128.0, + "kl_loss_12": 366.2182907104492, + "kl_loss_17": 93.43424530029297, + "kl_loss_3": 2048.6008544921874, + "kl_loss_6": 1176.9113891601562, + "learning_rate": 0.00044615806311398056, + "loss": 925.7355, + "step": 5390 + }, + { + "ce_loss_12": 3.190409755706787, + "ce_loss_17": 3.0776893973350523, + "ce_loss_23": 3.0306317090988157, + "ce_loss_3": 3.882905352115631, + "ce_loss_6": 3.4969664454460143, + "epoch": 0.54, + "grad_norm": 852.0, + "kl_loss_12": 345.8917999267578, + "kl_loss_17": 88.83585586547852, + "kl_loss_3": 1907.6739685058594, + "kl_loss_6": 1096.0950744628906, + "learning_rate": 0.00044458090004949454, + "loss": 895.4913, + "step": 5400 + }, + { + "ce_loss_12": 3.075972247123718, + "ce_loss_17": 2.9414425611495973, + "ce_loss_23": 2.8867051243782043, + "ce_loss_3": 3.8747236490249635, + "ce_loss_6": 3.4382914423942568, + "epoch": 0.541, + "grad_norm": 1032.0, + "kl_loss_12": 375.124333190918, + "kl_loss_17": 95.7294204711914, + "kl_loss_3": 2125.295440673828, + "kl_loss_6": 1218.7462677001954, + "learning_rate": 0.0004430042950547297, + "loss": 912.583, + "step": 5410 + }, + { + "ce_loss_12": 3.1478776812553404, + "ce_loss_17": 3.0218735575675963, + "ce_loss_23": 2.9717148423194883, + "ce_loss_3": 3.9084969997406005, + "ce_loss_6": 3.4848543286323546, + "epoch": 0.542, + "grad_norm": 1008.0, + "kl_loss_12": 369.2810485839844, + "kl_loss_17": 96.3768123626709, + "kl_loss_3": 2033.286114501953, + "kl_loss_6": 1161.711181640625, + "learning_rate": 0.0004414282640060809, + "loss": 906.5489, + "step": 5420 + }, + { + "ce_loss_12": 3.229024577140808, + "ce_loss_17": 3.109649395942688, + "ce_loss_23": 3.057064139842987, + "ce_loss_3": 3.9660748839378357, + "ce_loss_6": 3.563118839263916, + "epoch": 0.543, + "grad_norm": 1216.0, + "kl_loss_12": 354.3711135864258, + "kl_loss_17": 92.91755104064941, + "kl_loss_3": 1950.7111328125, + "kl_loss_6": 1131.4530395507813, + "learning_rate": 0.0004398528227741633, + "loss": 886.8607, + "step": 5430 + }, + { + "ce_loss_12": 3.112036645412445, + "ce_loss_17": 2.9836097359657288, + "ce_loss_23": 2.9331039428710937, + "ce_loss_3": 3.8796893119812013, + "ce_loss_6": 3.4567083358764648, + "epoch": 0.544, + "grad_norm": 1360.0, + "kl_loss_12": 360.694091796875, + "kl_loss_17": 94.16159896850586, + "kl_loss_3": 2016.5906494140625, + "kl_loss_6": 1165.4428131103516, + "learning_rate": 0.00043827798722365264, + "loss": 913.6246, + "step": 5440 + }, + { + "ce_loss_12": 3.2171663522720335, + "ce_loss_17": 3.099006462097168, + "ce_loss_23": 3.050920236110687, + "ce_loss_3": 3.942546272277832, + "ce_loss_6": 3.540075421333313, + "epoch": 0.545, + "grad_norm": 1320.0, + "kl_loss_12": 358.2791030883789, + "kl_loss_17": 93.43666496276856, + "kl_loss_3": 1943.8274780273437, + "kl_loss_6": 1128.9095367431642, + "learning_rate": 0.00043670377321312535, + "loss": 880.9171, + "step": 5450 + }, + { + "ce_loss_12": 3.2231846690177917, + "ce_loss_17": 3.108734202384949, + "ce_loss_23": 3.0587079524993896, + "ce_loss_3": 3.938978838920593, + "ce_loss_6": 3.542694401741028, + "epoch": 0.546, + "grad_norm": 1184.0, + "kl_loss_12": 357.0698211669922, + "kl_loss_17": 92.49217147827149, + "kl_loss_3": 1930.8526916503906, + "kl_loss_6": 1119.0527770996093, + "learning_rate": 0.0004351301965948991, + "loss": 899.8764, + "step": 5460 + }, + { + "ce_loss_12": 3.1383449912071226, + "ce_loss_17": 3.0184853315353393, + "ce_loss_23": 2.969007098674774, + "ce_loss_3": 3.85242840051651, + "ce_loss_6": 3.4543243527412413, + "epoch": 0.547, + "grad_norm": 944.0, + "kl_loss_12": 350.8236831665039, + "kl_loss_17": 90.69459686279296, + "kl_loss_3": 1917.282891845703, + "kl_loss_6": 1102.7481719970704, + "learning_rate": 0.000433557273214873, + "loss": 882.44, + "step": 5470 + }, + { + "ce_loss_12": 3.1253939270973206, + "ce_loss_17": 3.000547182559967, + "ce_loss_23": 2.9495556354522705, + "ce_loss_3": 3.8606873035430906, + "ce_loss_6": 3.4494663953781126, + "epoch": 0.548, + "grad_norm": 1256.0, + "kl_loss_12": 356.8171081542969, + "kl_loss_17": 92.74531326293945, + "kl_loss_3": 1953.0103088378905, + "kl_loss_6": 1111.0361450195312, + "learning_rate": 0.000431985018912368, + "loss": 875.3377, + "step": 5480 + }, + { + "ce_loss_12": 3.1010571360588073, + "ce_loss_17": 2.97558354139328, + "ce_loss_23": 2.925135314464569, + "ce_loss_3": 3.869184398651123, + "ce_loss_6": 3.4456772685050963, + "epoch": 0.549, + "grad_norm": 1128.0, + "kl_loss_12": 365.4259338378906, + "kl_loss_17": 93.24345054626465, + "kl_loss_3": 2039.9272033691407, + "kl_loss_6": 1174.1738891601562, + "learning_rate": 0.0004304134495199674, + "loss": 888.229, + "step": 5490 + }, + { + "ce_loss_12": 3.135417914390564, + "ce_loss_17": 3.0078897953033445, + "ce_loss_23": 2.957447588443756, + "ce_loss_3": 3.89321106672287, + "ce_loss_6": 3.4768668532371523, + "epoch": 0.55, + "grad_norm": 1112.0, + "kl_loss_12": 370.8565933227539, + "kl_loss_17": 93.26599197387695, + "kl_loss_3": 2049.9021118164064, + "kl_loss_6": 1183.0644409179688, + "learning_rate": 0.0004288425808633575, + "loss": 902.8559, + "step": 5500 + }, + { + "ce_loss_12": 3.1100064396858214, + "ce_loss_17": 2.9872559189796446, + "ce_loss_23": 2.9383681416511536, + "ce_loss_3": 3.861984574794769, + "ce_loss_6": 3.4384658455848696, + "epoch": 0.551, + "grad_norm": 1352.0, + "kl_loss_12": 354.1197265625, + "kl_loss_17": 90.87067832946778, + "kl_loss_3": 2005.800714111328, + "kl_loss_6": 1139.4069519042969, + "learning_rate": 0.0004272724287611684, + "loss": 899.1554, + "step": 5510 + }, + { + "ce_loss_12": 3.0869866490364073, + "ce_loss_17": 2.9635907649993896, + "ce_loss_23": 2.911494505405426, + "ce_loss_3": 3.8518809914588927, + "ce_loss_6": 3.418315315246582, + "epoch": 0.552, + "grad_norm": 980.0, + "kl_loss_12": 359.54785308837893, + "kl_loss_17": 92.89949645996094, + "kl_loss_3": 2041.656689453125, + "kl_loss_6": 1154.5992065429687, + "learning_rate": 0.00042570300902481425, + "loss": 903.5547, + "step": 5520 + }, + { + "ce_loss_12": 3.111093783378601, + "ce_loss_17": 2.9967490673065185, + "ce_loss_23": 2.9479554176330565, + "ce_loss_3": 3.8493823170661927, + "ce_loss_6": 3.4331010580062866, + "epoch": 0.553, + "grad_norm": 1176.0, + "kl_loss_12": 350.27253875732424, + "kl_loss_17": 90.20486984252929, + "kl_loss_3": 1977.1471374511718, + "kl_loss_6": 1123.4446014404298, + "learning_rate": 0.00042413433745833423, + "loss": 887.315, + "step": 5530 + }, + { + "ce_loss_12": 3.1101133584976197, + "ce_loss_17": 2.987442672252655, + "ce_loss_23": 2.9393104672431947, + "ce_loss_3": 3.866848182678223, + "ce_loss_6": 3.4454284429550173, + "epoch": 0.554, + "grad_norm": 1040.0, + "kl_loss_12": 356.58823547363284, + "kl_loss_17": 91.02802352905273, + "kl_loss_3": 2003.3032836914062, + "kl_loss_6": 1143.326953125, + "learning_rate": 0.0004225664298582339, + "loss": 874.8443, + "step": 5540 + }, + { + "ce_loss_12": 3.1919366121292114, + "ce_loss_17": 3.0718690514564515, + "ce_loss_23": 3.021695065498352, + "ce_loss_3": 3.918123209476471, + "ce_loss_6": 3.508436751365662, + "epoch": 0.555, + "grad_norm": 1176.0, + "kl_loss_12": 349.13452911376953, + "kl_loss_17": 90.69793014526367, + "kl_loss_3": 1941.2161254882812, + "kl_loss_6": 1109.2159545898437, + "learning_rate": 0.000420999302013325, + "loss": 875.6727, + "step": 5550 + }, + { + "ce_loss_12": 3.0942121744155884, + "ce_loss_17": 2.96941956281662, + "ce_loss_23": 2.9174832582473753, + "ce_loss_3": 3.8882731795310974, + "ce_loss_6": 3.4454726815223693, + "epoch": 0.556, + "grad_norm": 1352.0, + "kl_loss_12": 368.905517578125, + "kl_loss_17": 95.6106086730957, + "kl_loss_3": 2071.4515686035156, + "kl_loss_6": 1175.006658935547, + "learning_rate": 0.000419432969704568, + "loss": 896.6355, + "step": 5560 + }, + { + "ce_loss_12": 3.1318360328674317, + "ce_loss_17": 3.011006569862366, + "ce_loss_23": 2.9601503133773805, + "ce_loss_3": 3.866017699241638, + "ce_loss_6": 3.4546066999435423, + "epoch": 0.557, + "grad_norm": 852.0, + "kl_loss_12": 353.10416107177736, + "kl_loss_17": 91.07056617736816, + "kl_loss_3": 1960.4804931640624, + "kl_loss_6": 1120.8563903808595, + "learning_rate": 0.00041786744870491154, + "loss": 909.5014, + "step": 5570 + }, + { + "ce_loss_12": 3.0762211441993714, + "ce_loss_17": 2.9487279295921325, + "ce_loss_23": 2.897049582004547, + "ce_loss_3": 3.8321069359779356, + "ce_loss_6": 3.4129802227020263, + "epoch": 0.558, + "grad_norm": 972.0, + "kl_loss_12": 362.3402847290039, + "kl_loss_17": 93.73015022277832, + "kl_loss_3": 2016.0342102050781, + "kl_loss_6": 1164.3839385986328, + "learning_rate": 0.0004163027547791347, + "loss": 899.6561, + "step": 5580 + }, + { + "ce_loss_12": 3.063253676891327, + "ce_loss_17": 2.9407611727714538, + "ce_loss_23": 2.8906123280525207, + "ce_loss_3": 3.8545337080955506, + "ce_loss_6": 3.412693679332733, + "epoch": 0.559, + "grad_norm": 1240.0, + "kl_loss_12": 360.7710876464844, + "kl_loss_17": 92.40084419250488, + "kl_loss_3": 2056.2872314453125, + "kl_loss_6": 1158.0092742919921, + "learning_rate": 0.0004147389036836881, + "loss": 903.7966, + "step": 5590 + }, + { + "ce_loss_12": 3.1042871475219727, + "ce_loss_17": 2.980177104473114, + "ce_loss_23": 2.93089816570282, + "ce_loss_3": 3.870959210395813, + "ce_loss_6": 3.4439244508743285, + "epoch": 0.56, + "grad_norm": 1208.0, + "kl_loss_12": 358.4499801635742, + "kl_loss_17": 92.21180305480956, + "kl_loss_3": 2018.989453125, + "kl_loss_6": 1156.2593170166015, + "learning_rate": 0.00041317591116653486, + "loss": 913.3369, + "step": 5600 + }, + { + "ce_loss_12": 3.1313674330711363, + "ce_loss_17": 3.0127056002616883, + "ce_loss_23": 2.9640170097351075, + "ce_loss_3": 3.897299957275391, + "ce_loss_6": 3.473569095134735, + "epoch": 0.561, + "grad_norm": 912.0, + "kl_loss_12": 366.00707244873047, + "kl_loss_17": 93.89572296142578, + "kl_loss_3": 2028.6972534179688, + "kl_loss_6": 1155.3702209472656, + "learning_rate": 0.0004116137929669921, + "loss": 891.6586, + "step": 5610 + }, + { + "ce_loss_12": 3.136381483078003, + "ce_loss_17": 3.0133532643318177, + "ce_loss_23": 2.963248634338379, + "ce_loss_3": 3.884948122501373, + "ce_loss_6": 3.463275647163391, + "epoch": 0.562, + "grad_norm": 1320.0, + "kl_loss_12": 352.8741882324219, + "kl_loss_17": 89.78136978149413, + "kl_loss_3": 1996.60400390625, + "kl_loss_6": 1136.3984741210938, + "learning_rate": 0.00041005256481557305, + "loss": 880.7673, + "step": 5620 + }, + { + "ce_loss_12": 3.2142569780349732, + "ce_loss_17": 3.0997274041175844, + "ce_loss_23": 3.051528239250183, + "ce_loss_3": 3.918693208694458, + "ce_loss_6": 3.5266610264778135, + "epoch": 0.563, + "grad_norm": 1012.0, + "kl_loss_12": 343.21154327392577, + "kl_loss_17": 88.66844177246094, + "kl_loss_3": 1894.733349609375, + "kl_loss_6": 1093.9771453857422, + "learning_rate": 0.00040849224243382767, + "loss": 871.3256, + "step": 5630 + }, + { + "ce_loss_12": 3.0870473980903625, + "ce_loss_17": 2.965904676914215, + "ce_loss_23": 2.9143540382385256, + "ce_loss_3": 3.8411280512809753, + "ce_loss_6": 3.418475079536438, + "epoch": 0.564, + "grad_norm": 1216.0, + "kl_loss_12": 355.9902709960937, + "kl_loss_17": 91.11196594238281, + "kl_loss_3": 2005.0414489746095, + "kl_loss_6": 1145.2955535888673, + "learning_rate": 0.000406932841534185, + "loss": 880.5096, + "step": 5640 + }, + { + "ce_loss_12": 3.051490914821625, + "ce_loss_17": 2.9329665184020994, + "ce_loss_23": 2.8812998056411745, + "ce_loss_3": 3.816966009140015, + "ce_loss_6": 3.391273760795593, + "epoch": 0.565, + "grad_norm": 1120.0, + "kl_loss_12": 360.7217727661133, + "kl_loss_17": 93.10445556640624, + "kl_loss_3": 2024.0443725585938, + "kl_loss_6": 1157.3559936523438, + "learning_rate": 0.0004053743778197951, + "loss": 920.442, + "step": 5650 + }, + { + "ce_loss_12": 3.1533095955848696, + "ce_loss_17": 3.0284717679023743, + "ce_loss_23": 2.9783113837242126, + "ce_loss_3": 3.901489055156708, + "ce_loss_6": 3.4922434449195863, + "epoch": 0.566, + "grad_norm": 1152.0, + "kl_loss_12": 365.6111724853516, + "kl_loss_17": 94.32415237426758, + "kl_loss_3": 1988.0268127441407, + "kl_loss_6": 1152.2390167236329, + "learning_rate": 0.0004038168669843697, + "loss": 908.1027, + "step": 5660 + }, + { + "ce_loss_12": 3.1055307507514955, + "ce_loss_17": 2.988659369945526, + "ce_loss_23": 2.9381643414497374, + "ce_loss_3": 3.8319229960441588, + "ce_loss_6": 3.4249443411827087, + "epoch": 0.567, + "grad_norm": 1088.0, + "kl_loss_12": 349.71607971191406, + "kl_loss_17": 90.56960487365723, + "kl_loss_3": 1944.890283203125, + "kl_loss_6": 1114.1278930664062, + "learning_rate": 0.000402260324712026, + "loss": 898.1495, + "step": 5670 + }, + { + "ce_loss_12": 3.14718611240387, + "ce_loss_17": 3.0279541492462156, + "ce_loss_23": 2.979733180999756, + "ce_loss_3": 3.918551576137543, + "ce_loss_6": 3.4888805150985718, + "epoch": 0.568, + "grad_norm": 1144.0, + "kl_loss_12": 354.7816497802734, + "kl_loss_17": 89.66108932495118, + "kl_loss_3": 2034.6115295410157, + "kl_loss_6": 1157.8374237060548, + "learning_rate": 0.00040070476667712743, + "loss": 888.1705, + "step": 5680 + }, + { + "ce_loss_12": 3.1779404878616333, + "ce_loss_17": 3.058204233646393, + "ce_loss_23": 3.006940519809723, + "ce_loss_3": 3.9126431226730345, + "ce_loss_6": 3.4999541878700255, + "epoch": 0.569, + "grad_norm": 868.0, + "kl_loss_12": 354.8095184326172, + "kl_loss_17": 91.35061798095703, + "kl_loss_3": 1974.644305419922, + "kl_loss_6": 1131.2801147460937, + "learning_rate": 0.0003991502085441259, + "loss": 894.1547, + "step": 5690 + }, + { + "ce_loss_12": 3.2082398533821106, + "ce_loss_17": 3.09513920545578, + "ce_loss_23": 3.046538519859314, + "ce_loss_3": 3.9162412881851196, + "ce_loss_6": 3.5189447522163393, + "epoch": 0.57, + "grad_norm": 1004.0, + "kl_loss_12": 345.07321014404295, + "kl_loss_17": 89.30585174560547, + "kl_loss_3": 1900.429345703125, + "kl_loss_6": 1088.3466583251952, + "learning_rate": 0.0003975966659674047, + "loss": 882.0586, + "step": 5700 + }, + { + "ce_loss_12": 3.188878560066223, + "ce_loss_17": 3.067433404922485, + "ce_loss_23": 3.0170228600502016, + "ce_loss_3": 3.924565625190735, + "ce_loss_6": 3.514917719364166, + "epoch": 0.571, + "grad_norm": 1376.0, + "kl_loss_12": 353.6061096191406, + "kl_loss_17": 91.92852973937988, + "kl_loss_3": 1958.1591064453125, + "kl_loss_6": 1125.2135955810547, + "learning_rate": 0.0003960441545911204, + "loss": 877.4781, + "step": 5710 + }, + { + "ce_loss_12": 3.1661776185035704, + "ce_loss_17": 3.0493478536605836, + "ce_loss_23": 2.999690568447113, + "ce_loss_3": 3.904485309123993, + "ce_loss_6": 3.4924548625946046, + "epoch": 0.572, + "grad_norm": 1344.0, + "kl_loss_12": 354.6435806274414, + "kl_loss_17": 90.59515380859375, + "kl_loss_3": 1981.4042236328125, + "kl_loss_6": 1136.697817993164, + "learning_rate": 0.0003944926900490452, + "loss": 883.3597, + "step": 5720 + }, + { + "ce_loss_12": 3.1009980201721192, + "ce_loss_17": 2.973039472103119, + "ce_loss_23": 2.9208882093429565, + "ce_loss_3": 3.871054542064667, + "ce_loss_6": 3.4438048243522643, + "epoch": 0.573, + "grad_norm": 1192.0, + "kl_loss_12": 362.32491302490234, + "kl_loss_17": 92.63758544921875, + "kl_loss_3": 2032.8689392089843, + "kl_loss_6": 1159.4469604492188, + "learning_rate": 0.0003929422879644099, + "loss": 889.1459, + "step": 5730 + }, + { + "ce_loss_12": 3.0974637866020203, + "ce_loss_17": 2.983030641078949, + "ce_loss_23": 2.9348677277565, + "ce_loss_3": 3.8249285221099854, + "ce_loss_6": 3.4108102679252625, + "epoch": 0.574, + "grad_norm": 1192.0, + "kl_loss_12": 344.463232421875, + "kl_loss_17": 89.30107688903809, + "kl_loss_3": 1950.4291198730468, + "kl_loss_6": 1106.0761596679688, + "learning_rate": 0.0003913929639497462, + "loss": 860.5516, + "step": 5740 + }, + { + "ce_loss_12": 3.0569748997688295, + "ce_loss_17": 2.934243309497833, + "ce_loss_23": 2.8869590044021605, + "ce_loss_3": 3.823574161529541, + "ce_loss_6": 3.3891812562942505, + "epoch": 0.575, + "grad_norm": 1128.0, + "kl_loss_12": 351.39185638427733, + "kl_loss_17": 89.34204635620117, + "kl_loss_3": 2018.800634765625, + "kl_loss_6": 1142.7618347167968, + "learning_rate": 0.00038984473360672965, + "loss": 879.9574, + "step": 5750 + }, + { + "ce_loss_12": 3.064702033996582, + "ce_loss_17": 2.9443649649620056, + "ce_loss_23": 2.8942911982536317, + "ce_loss_3": 3.836893093585968, + "ce_loss_6": 3.4024426221847532, + "epoch": 0.576, + "grad_norm": 1200.0, + "kl_loss_12": 351.22936859130857, + "kl_loss_17": 89.25144119262696, + "kl_loss_3": 2022.8355651855468, + "kl_loss_6": 1151.775244140625, + "learning_rate": 0.0003882976125260229, + "loss": 880.2629, + "step": 5760 + }, + { + "ce_loss_12": 3.129497694969177, + "ce_loss_17": 3.010597312450409, + "ce_loss_23": 2.9593307852745054, + "ce_loss_3": 3.8632505893707276, + "ce_loss_6": 3.454318583011627, + "epoch": 0.577, + "grad_norm": 1000.0, + "kl_loss_12": 351.70952758789065, + "kl_loss_17": 90.51013984680176, + "kl_loss_3": 1959.9725219726563, + "kl_loss_6": 1122.036703491211, + "learning_rate": 0.00038675161628711776, + "loss": 886.1361, + "step": 5770 + }, + { + "ce_loss_12": 3.1580193042755127, + "ce_loss_17": 3.042417895793915, + "ce_loss_23": 2.9932226300239564, + "ce_loss_3": 3.894358551502228, + "ce_loss_6": 3.482797610759735, + "epoch": 0.578, + "grad_norm": 960.0, + "kl_loss_12": 350.21394805908204, + "kl_loss_17": 91.14322891235352, + "kl_loss_3": 1943.130877685547, + "kl_loss_6": 1110.1543884277344, + "learning_rate": 0.0003852067604581794, + "loss": 901.2336, + "step": 5780 + }, + { + "ce_loss_12": 3.109062159061432, + "ce_loss_17": 2.992610514163971, + "ce_loss_23": 2.943720591068268, + "ce_loss_3": 3.8661123752593993, + "ce_loss_6": 3.4403353691101075, + "epoch": 0.579, + "grad_norm": 1096.0, + "kl_loss_12": 351.67835693359376, + "kl_loss_17": 89.08604125976562, + "kl_loss_3": 2006.901873779297, + "kl_loss_6": 1143.3357177734374, + "learning_rate": 0.0003836630605958888, + "loss": 884.7488, + "step": 5790 + }, + { + "ce_loss_12": 3.161776900291443, + "ce_loss_17": 3.0456367135047913, + "ce_loss_23": 2.998029279708862, + "ce_loss_3": 3.9008978247642516, + "ce_loss_6": 3.494714057445526, + "epoch": 0.58, + "grad_norm": 1288.0, + "kl_loss_12": 353.23523712158203, + "kl_loss_17": 91.2992992401123, + "kl_loss_3": 1989.3494140625, + "kl_loss_6": 1139.3370666503906, + "learning_rate": 0.0003821205322452863, + "loss": 922.5059, + "step": 5800 + }, + { + "ce_loss_12": 3.1459306120872497, + "ce_loss_17": 3.0311925053596496, + "ce_loss_23": 2.9824758887290956, + "ce_loss_3": 3.872205066680908, + "ce_loss_6": 3.459067690372467, + "epoch": 0.581, + "grad_norm": 1784.0, + "kl_loss_12": 347.05757904052734, + "kl_loss_17": 89.70176620483399, + "kl_loss_3": 1963.4107666015625, + "kl_loss_6": 1114.5962188720703, + "learning_rate": 0.0003805791909396155, + "loss": 884.1924, + "step": 5810 + }, + { + "ce_loss_12": 3.1018191814422607, + "ce_loss_17": 2.9827213287353516, + "ce_loss_23": 2.9357261180877687, + "ce_loss_3": 3.847515308856964, + "ce_loss_6": 3.431482744216919, + "epoch": 0.582, + "grad_norm": 1004.0, + "kl_loss_12": 345.5039291381836, + "kl_loss_17": 88.27181510925293, + "kl_loss_3": 1973.841162109375, + "kl_loss_6": 1116.8025634765625, + "learning_rate": 0.0003790390522001662, + "loss": 890.882, + "step": 5820 + }, + { + "ce_loss_12": 3.045158123970032, + "ce_loss_17": 2.93050411939621, + "ce_loss_23": 2.881169056892395, + "ce_loss_3": 3.7953080892562867, + "ce_loss_6": 3.3693145155906676, + "epoch": 0.583, + "grad_norm": 984.0, + "kl_loss_12": 346.33858489990234, + "kl_loss_17": 88.57795143127441, + "kl_loss_3": 1998.8186462402343, + "kl_loss_6": 1127.1401947021484, + "learning_rate": 0.0003775001315361183, + "loss": 877.8606, + "step": 5830 + }, + { + "ce_loss_12": 3.143208909034729, + "ce_loss_17": 3.019788587093353, + "ce_loss_23": 2.968058371543884, + "ce_loss_3": 3.902362859249115, + "ce_loss_6": 3.47598592042923, + "epoch": 0.584, + "grad_norm": 852.0, + "kl_loss_12": 352.2950210571289, + "kl_loss_17": 90.95420379638672, + "kl_loss_3": 1999.0019226074219, + "kl_loss_6": 1129.328665161133, + "learning_rate": 0.0003759624444443858, + "loss": 892.2592, + "step": 5840 + }, + { + "ce_loss_12": 3.1722511053085327, + "ce_loss_17": 3.0544722199440004, + "ce_loss_23": 3.0061080932617186, + "ce_loss_3": 3.895999026298523, + "ce_loss_6": 3.4860278964042664, + "epoch": 0.585, + "grad_norm": 996.0, + "kl_loss_12": 348.66165618896486, + "kl_loss_17": 89.14413490295411, + "kl_loss_3": 1957.5979736328125, + "kl_loss_6": 1110.2203674316406, + "learning_rate": 0.00037442600640946044, + "loss": 871.4254, + "step": 5850 + }, + { + "ce_loss_12": 3.134334754943848, + "ce_loss_17": 3.0207717776298524, + "ce_loss_23": 2.9735180854797365, + "ce_loss_3": 3.863844406604767, + "ce_loss_6": 3.453572678565979, + "epoch": 0.586, + "grad_norm": 1160.0, + "kl_loss_12": 350.11250457763674, + "kl_loss_17": 88.69996490478516, + "kl_loss_3": 1947.98056640625, + "kl_loss_6": 1116.696987915039, + "learning_rate": 0.00037289083290325663, + "loss": 862.4559, + "step": 5860 + }, + { + "ce_loss_12": 3.1132630348205566, + "ce_loss_17": 2.996178901195526, + "ce_loss_23": 2.9467347741127012, + "ce_loss_3": 3.843463182449341, + "ce_loss_6": 3.4376041889190674, + "epoch": 0.587, + "grad_norm": 996.0, + "kl_loss_12": 342.12928619384763, + "kl_loss_17": 89.54325790405274, + "kl_loss_3": 1928.3241943359376, + "kl_loss_6": 1100.103323364258, + "learning_rate": 0.0003713569393849543, + "loss": 870.1561, + "step": 5870 + }, + { + "ce_loss_12": 3.1706372618675234, + "ce_loss_17": 3.0503373622894285, + "ce_loss_23": 3.0024590611457826, + "ce_loss_3": 3.9031503558158875, + "ce_loss_6": 3.4925827860832213, + "epoch": 0.588, + "grad_norm": 1192.0, + "kl_loss_12": 354.43677673339846, + "kl_loss_17": 91.15572128295898, + "kl_loss_3": 1967.8707458496094, + "kl_loss_6": 1126.3148254394532, + "learning_rate": 0.00036982434130084397, + "loss": 883.001, + "step": 5880 + }, + { + "ce_loss_12": 3.0874892592430117, + "ce_loss_17": 2.966780185699463, + "ce_loss_23": 2.9160982012748717, + "ce_loss_3": 3.820379447937012, + "ce_loss_6": 3.4098856925964354, + "epoch": 0.589, + "grad_norm": 1224.0, + "kl_loss_12": 354.72835235595704, + "kl_loss_17": 91.79812850952149, + "kl_loss_3": 1963.2278564453125, + "kl_loss_6": 1119.7965881347657, + "learning_rate": 0.00036829305408417166, + "loss": 889.332, + "step": 5890 + }, + { + "ce_loss_12": 3.077235984802246, + "ce_loss_17": 2.9538180470466613, + "ce_loss_23": 2.9021143913269043, + "ce_loss_3": 3.838461446762085, + "ce_loss_6": 3.4159860849380492, + "epoch": 0.59, + "grad_norm": 1012.0, + "kl_loss_12": 355.88343353271483, + "kl_loss_17": 91.19095001220703, + "kl_loss_3": 2016.5178649902343, + "kl_loss_6": 1152.0101684570313, + "learning_rate": 0.0003667630931549826, + "loss": 892.2932, + "step": 5900 + }, + { + "ce_loss_12": 3.045335614681244, + "ce_loss_17": 2.9218624353408815, + "ce_loss_23": 2.872580373287201, + "ce_loss_3": 3.8366833329200745, + "ce_loss_6": 3.394924187660217, + "epoch": 0.591, + "grad_norm": 1144.0, + "kl_loss_12": 360.2607162475586, + "kl_loss_17": 91.28438262939453, + "kl_loss_3": 2077.546520996094, + "kl_loss_6": 1174.0156555175781, + "learning_rate": 0.00036523447391996613, + "loss": 908.0979, + "step": 5910 + }, + { + "ce_loss_12": 3.126784014701843, + "ce_loss_17": 3.012990856170654, + "ce_loss_23": 2.967097759246826, + "ce_loss_3": 3.8611212372779846, + "ce_loss_6": 3.4536351919174195, + "epoch": 0.592, + "grad_norm": 984.0, + "kl_loss_12": 345.7523956298828, + "kl_loss_17": 88.32137145996094, + "kl_loss_3": 1947.22578125, + "kl_loss_6": 1109.581494140625, + "learning_rate": 0.00036370721177230114, + "loss": 871.7516, + "step": 5920 + }, + { + "ce_loss_12": 3.1307328581809997, + "ce_loss_17": 3.011709153652191, + "ce_loss_23": 2.962051284313202, + "ce_loss_3": 3.8877512216567993, + "ce_loss_6": 3.470838952064514, + "epoch": 0.593, + "grad_norm": 1216.0, + "kl_loss_12": 358.9762557983398, + "kl_loss_17": 92.42395324707032, + "kl_loss_3": 2001.0065490722657, + "kl_loss_6": 1151.392578125, + "learning_rate": 0.00036218132209150044, + "loss": 890.91, + "step": 5930 + }, + { + "ce_loss_12": 3.0935922861099243, + "ce_loss_17": 2.9630477666854858, + "ce_loss_23": 2.908335840702057, + "ce_loss_3": 3.875299608707428, + "ce_loss_6": 3.4436821937561035, + "epoch": 0.594, + "grad_norm": 1064.0, + "kl_loss_12": 368.01280364990237, + "kl_loss_17": 95.11238098144531, + "kl_loss_3": 2086.326556396484, + "kl_loss_6": 1186.2103179931642, + "learning_rate": 0.0003606568202432562, + "loss": 908.7041, + "step": 5940 + }, + { + "ce_loss_12": 3.1535122275352476, + "ce_loss_17": 3.0376246333122254, + "ce_loss_23": 2.988776755332947, + "ce_loss_3": 3.9159383296966555, + "ce_loss_6": 3.4887673020362855, + "epoch": 0.595, + "grad_norm": 1352.0, + "kl_loss_12": 354.77533264160155, + "kl_loss_17": 91.6845832824707, + "kl_loss_3": 2039.5684753417968, + "kl_loss_6": 1158.846859741211, + "learning_rate": 0.0003591337215792851, + "loss": 885.2172, + "step": 5950 + }, + { + "ce_loss_12": 3.1813810586929323, + "ce_loss_17": 3.066852557659149, + "ce_loss_23": 3.0204002499580382, + "ce_loss_3": 3.9011024475097655, + "ce_loss_6": 3.5039899230003355, + "epoch": 0.596, + "grad_norm": 1232.0, + "kl_loss_12": 344.7813095092773, + "kl_loss_17": 88.03176994323731, + "kl_loss_3": 1934.431787109375, + "kl_loss_6": 1117.7573455810548, + "learning_rate": 0.00035761204143717383, + "loss": 882.9086, + "step": 5960 + }, + { + "ce_loss_12": 3.1446377277374267, + "ce_loss_17": 3.025981771945953, + "ce_loss_23": 2.9761717557907104, + "ce_loss_3": 3.878723645210266, + "ce_loss_6": 3.468522787094116, + "epoch": 0.597, + "grad_norm": 1256.0, + "kl_loss_12": 353.09497833251953, + "kl_loss_17": 91.71636924743652, + "kl_loss_3": 1978.5257202148437, + "kl_loss_6": 1137.846499633789, + "learning_rate": 0.0003560917951402245, + "loss": 906.2232, + "step": 5970 + }, + { + "ce_loss_12": 3.1228631258010866, + "ce_loss_17": 3.007214438915253, + "ce_loss_23": 2.9603391289711, + "ce_loss_3": 3.8574273109436037, + "ce_loss_6": 3.4532262802124025, + "epoch": 0.598, + "grad_norm": 1360.0, + "kl_loss_12": 346.21971282958987, + "kl_loss_17": 89.21910247802734, + "kl_loss_3": 1960.0101745605468, + "kl_loss_6": 1125.2134948730468, + "learning_rate": 0.00035457299799730046, + "loss": 879.1287, + "step": 5980 + }, + { + "ce_loss_12": 3.1803099870681764, + "ce_loss_17": 3.0637570142745973, + "ce_loss_23": 3.0157534003257753, + "ce_loss_3": 3.9152540445327757, + "ce_loss_6": 3.5102675199508666, + "epoch": 0.599, + "grad_norm": 1432.0, + "kl_loss_12": 349.6127532958984, + "kl_loss_17": 89.91424903869628, + "kl_loss_3": 1961.0085571289062, + "kl_loss_6": 1123.6217254638673, + "learning_rate": 0.0003530556653026721, + "loss": 887.6521, + "step": 5990 + }, + { + "ce_loss_12": 3.1075904726982118, + "ce_loss_17": 2.9898393511772157, + "ce_loss_23": 2.9403223633766173, + "ce_loss_3": 3.8573669672012327, + "ce_loss_6": 3.430279052257538, + "epoch": 0.6, + "grad_norm": 1856.0, + "kl_loss_12": 342.2793685913086, + "kl_loss_17": 89.26798706054687, + "kl_loss_3": 1981.4362060546875, + "kl_loss_6": 1109.620657348633, + "learning_rate": 0.00035153981233586274, + "loss": 887.7717, + "step": 6000 + }, + { + "ce_loss_12": 3.074912130832672, + "ce_loss_17": 2.9589345335960386, + "ce_loss_23": 2.9115482330322267, + "ce_loss_3": 3.8282949447631838, + "ce_loss_6": 3.4091708064079285, + "epoch": 0.601, + "grad_norm": 1168.0, + "kl_loss_12": 343.47901458740233, + "kl_loss_17": 87.6121109008789, + "kl_loss_3": 1983.5056091308593, + "kl_loss_6": 1129.1260070800781, + "learning_rate": 0.00035002545436149473, + "loss": 916.9419, + "step": 6010 + }, + { + "ce_loss_12": 3.092540967464447, + "ce_loss_17": 2.97232928276062, + "ce_loss_23": 2.9233181953430174, + "ce_loss_3": 3.8545306801795958, + "ce_loss_6": 3.435434603691101, + "epoch": 0.602, + "grad_norm": 1048.0, + "kl_loss_12": 360.00843353271483, + "kl_loss_17": 92.88990707397461, + "kl_loss_3": 2020.6975036621093, + "kl_loss_6": 1152.5501831054687, + "learning_rate": 0.0003485126066291364, + "loss": 883.2751, + "step": 6020 + }, + { + "ce_loss_12": 3.124794828891754, + "ce_loss_17": 3.0092039346694945, + "ce_loss_23": 2.9585169315338136, + "ce_loss_3": 3.880570352077484, + "ce_loss_6": 3.4638235807418822, + "epoch": 0.603, + "grad_norm": 1184.0, + "kl_loss_12": 346.15485229492185, + "kl_loss_17": 88.88919639587402, + "kl_loss_3": 1991.0694091796875, + "kl_loss_6": 1138.7969970703125, + "learning_rate": 0.0003470012843731476, + "loss": 891.7182, + "step": 6030 + }, + { + "ce_loss_12": 3.0740342855453493, + "ce_loss_17": 2.957772696018219, + "ce_loss_23": 2.9076455354690554, + "ce_loss_3": 3.8349327206611634, + "ce_loss_6": 3.418463349342346, + "epoch": 0.604, + "grad_norm": 1040.0, + "kl_loss_12": 349.4450286865234, + "kl_loss_17": 89.32358436584472, + "kl_loss_3": 2008.395458984375, + "kl_loss_6": 1149.2588104248048, + "learning_rate": 0.00034549150281252633, + "loss": 911.2477, + "step": 6040 + }, + { + "ce_loss_12": 3.063339948654175, + "ce_loss_17": 2.9427681088447573, + "ce_loss_23": 2.8925299048423767, + "ce_loss_3": 3.7877164006233217, + "ce_loss_6": 3.3812638998031614, + "epoch": 0.605, + "grad_norm": 1120.0, + "kl_loss_12": 349.8405731201172, + "kl_loss_17": 90.2714687347412, + "kl_loss_3": 1932.9745483398438, + "kl_loss_6": 1105.6616638183593, + "learning_rate": 0.0003439832771507565, + "loss": 873.6602, + "step": 6050 + }, + { + "ce_loss_12": 3.0673495173454284, + "ce_loss_17": 2.9466548919677735, + "ce_loss_23": 2.8978442192077636, + "ce_loss_3": 3.8202001094818114, + "ce_loss_6": 3.41012419462204, + "epoch": 0.606, + "grad_norm": 904.0, + "kl_loss_12": 350.63096923828124, + "kl_loss_17": 89.38655128479004, + "kl_loss_3": 2003.3099365234375, + "kl_loss_6": 1148.8616790771484, + "learning_rate": 0.0003424766225756537, + "loss": 880.6595, + "step": 6060 + }, + { + "ce_loss_12": 3.1238632917404177, + "ce_loss_17": 3.0024963498115538, + "ce_loss_23": 2.953491246700287, + "ce_loss_3": 3.8675854563713075, + "ce_loss_6": 3.450518810749054, + "epoch": 0.607, + "grad_norm": 1168.0, + "kl_loss_12": 352.34055786132814, + "kl_loss_17": 89.9672866821289, + "kl_loss_3": 1970.7400146484374, + "kl_loss_6": 1118.7429931640625, + "learning_rate": 0.00034097155425921255, + "loss": 871.059, + "step": 6070 + }, + { + "ce_loss_12": 3.0279074907302856, + "ce_loss_17": 2.9091128587722777, + "ce_loss_23": 2.8589402437210083, + "ce_loss_3": 3.785420262813568, + "ce_loss_6": 3.3633985996246336, + "epoch": 0.608, + "grad_norm": 960.0, + "kl_loss_12": 353.95676879882814, + "kl_loss_17": 90.35716094970704, + "kl_loss_3": 2011.920391845703, + "kl_loss_6": 1137.7992767333985, + "learning_rate": 0.0003394680873574546, + "loss": 886.3698, + "step": 6080 + }, + { + "ce_loss_12": 3.125973951816559, + "ce_loss_17": 3.0037576794624328, + "ce_loss_23": 2.953854131698608, + "ce_loss_3": 3.8894197821617125, + "ce_loss_6": 3.4591413021087645, + "epoch": 0.609, + "grad_norm": 928.0, + "kl_loss_12": 353.62838592529295, + "kl_loss_17": 91.09500427246094, + "kl_loss_3": 2018.036395263672, + "kl_loss_6": 1138.0875274658204, + "learning_rate": 0.0003379662370102747, + "loss": 882.4516, + "step": 6090 + }, + { + "ce_loss_12": 3.1325398445129395, + "ce_loss_17": 3.0157789826393127, + "ce_loss_23": 2.9670695900917052, + "ce_loss_3": 3.8562929153442385, + "ce_loss_6": 3.4493812918663025, + "epoch": 0.61, + "grad_norm": 1216.0, + "kl_loss_12": 348.48892974853516, + "kl_loss_17": 88.45648956298828, + "kl_loss_3": 1969.1524963378906, + "kl_loss_6": 1122.2777770996095, + "learning_rate": 0.0003364660183412892, + "loss": 886.7492, + "step": 6100 + }, + { + "ce_loss_12": 3.119068741798401, + "ce_loss_17": 3.000202751159668, + "ce_loss_23": 2.9513747334480285, + "ce_loss_3": 3.848421800136566, + "ce_loss_6": 3.448009467124939, + "epoch": 0.611, + "grad_norm": 1176.0, + "kl_loss_12": 352.0840560913086, + "kl_loss_17": 90.15714569091797, + "kl_loss_3": 1971.662255859375, + "kl_loss_6": 1134.4475860595703, + "learning_rate": 0.0003349674464576834, + "loss": 894.9472, + "step": 6110 + }, + { + "ce_loss_12": 3.074628984928131, + "ce_loss_17": 2.9548948645591735, + "ce_loss_23": 2.9055315613746644, + "ce_loss_3": 3.8279956340789796, + "ce_loss_6": 3.4047624468803406, + "epoch": 0.612, + "grad_norm": 868.0, + "kl_loss_12": 350.29241943359375, + "kl_loss_17": 90.4896282196045, + "kl_loss_3": 2006.4168212890625, + "kl_loss_6": 1133.1514801025392, + "learning_rate": 0.00033347053645005966, + "loss": 869.4762, + "step": 6120 + }, + { + "ce_loss_12": 3.1577037930488587, + "ce_loss_17": 3.0422616958618165, + "ce_loss_23": 2.9943039655685424, + "ce_loss_3": 3.8772876262664795, + "ce_loss_6": 3.4845842838287355, + "epoch": 0.613, + "grad_norm": 1528.0, + "kl_loss_12": 342.38953399658203, + "kl_loss_17": 88.46079177856446, + "kl_loss_3": 1917.6266235351563, + "kl_loss_6": 1107.1890045166015, + "learning_rate": 0.00033197530339228485, + "loss": 876.5715, + "step": 6130 + }, + { + "ce_loss_12": 3.1251278638839723, + "ce_loss_17": 3.0028708815574645, + "ce_loss_23": 2.9523780941963196, + "ce_loss_3": 3.8694166779518127, + "ce_loss_6": 3.4621509075164796, + "epoch": 0.614, + "grad_norm": 1040.0, + "kl_loss_12": 354.7750839233398, + "kl_loss_17": 92.17405433654785, + "kl_loss_3": 1964.5522583007812, + "kl_loss_6": 1132.422689819336, + "learning_rate": 0.00033048176234133967, + "loss": 878.5187, + "step": 6140 + }, + { + "ce_loss_12": 3.112536537647247, + "ce_loss_17": 2.9995103001594545, + "ce_loss_23": 2.952008378505707, + "ce_loss_3": 3.850481855869293, + "ce_loss_6": 3.4409647226333617, + "epoch": 0.615, + "grad_norm": 1160.0, + "kl_loss_12": 352.7617446899414, + "kl_loss_17": 89.95292663574219, + "kl_loss_3": 1976.0133422851563, + "kl_loss_6": 1132.9945495605468, + "learning_rate": 0.0003289899283371657, + "loss": 888.2209, + "step": 6150 + }, + { + "ce_loss_12": 3.129842388629913, + "ce_loss_17": 3.0115858197212217, + "ce_loss_23": 2.9639342188835145, + "ce_loss_3": 3.8740663528442383, + "ce_loss_6": 3.4587475419044496, + "epoch": 0.616, + "grad_norm": 1216.0, + "kl_loss_12": 342.78956146240233, + "kl_loss_17": 88.99765205383301, + "kl_loss_3": 1968.557421875, + "kl_loss_6": 1118.0649200439452, + "learning_rate": 0.0003274998164025148, + "loss": 890.9506, + "step": 6160 + }, + { + "ce_loss_12": 3.1652899265289305, + "ce_loss_17": 3.044243943691254, + "ce_loss_23": 2.994454729557037, + "ce_loss_3": 3.891256046295166, + "ce_loss_6": 3.4863043069839477, + "epoch": 0.617, + "grad_norm": 1152.0, + "kl_loss_12": 351.40711822509763, + "kl_loss_17": 91.19853973388672, + "kl_loss_3": 1949.5838562011718, + "kl_loss_6": 1121.7305847167968, + "learning_rate": 0.0003260114415427975, + "loss": 901.9518, + "step": 6170 + }, + { + "ce_loss_12": 3.083340036869049, + "ce_loss_17": 2.966404664516449, + "ce_loss_23": 2.9196917533874513, + "ce_loss_3": 3.8479635953903197, + "ce_loss_6": 3.4319477438926698, + "epoch": 0.618, + "grad_norm": 1056.0, + "kl_loss_12": 348.9749816894531, + "kl_loss_17": 89.25509567260742, + "kl_loss_3": 2011.7194213867188, + "kl_loss_6": 1152.0232543945312, + "learning_rate": 0.0003245248187459323, + "loss": 905.0213, + "step": 6180 + }, + { + "ce_loss_12": 3.0779054880142214, + "ce_loss_17": 2.9654456257820128, + "ce_loss_23": 2.9188602209091186, + "ce_loss_3": 3.7875309348106385, + "ce_loss_6": 3.390846574306488, + "epoch": 0.619, + "grad_norm": 920.0, + "kl_loss_12": 337.221875, + "kl_loss_17": 86.04836540222168, + "kl_loss_3": 1908.584539794922, + "kl_loss_6": 1083.1012786865235, + "learning_rate": 0.00032303996298219416, + "loss": 861.1268, + "step": 6190 + }, + { + "ce_loss_12": 3.1569186449050903, + "ce_loss_17": 3.0399226546287537, + "ce_loss_23": 2.9923227071762084, + "ce_loss_3": 3.869111704826355, + "ce_loss_6": 3.474336361885071, + "epoch": 0.62, + "grad_norm": 1088.0, + "kl_loss_12": 340.1462677001953, + "kl_loss_17": 87.56219062805175, + "kl_loss_3": 1890.5591369628905, + "kl_loss_6": 1085.7106719970702, + "learning_rate": 0.00032155688920406414, + "loss": 857.2023, + "step": 6200 + }, + { + "ce_loss_12": 3.066182005405426, + "ce_loss_17": 2.9488109350204468, + "ce_loss_23": 2.9000027894973757, + "ce_loss_3": 3.8342225193977355, + "ce_loss_6": 3.4069128751754763, + "epoch": 0.621, + "grad_norm": 972.0, + "kl_loss_12": 350.70646514892576, + "kl_loss_17": 90.66474761962891, + "kl_loss_3": 2010.2277587890626, + "kl_loss_6": 1134.7549102783203, + "learning_rate": 0.0003200756123460788, + "loss": 904.0088, + "step": 6210 + }, + { + "ce_loss_12": 3.1090978145599366, + "ce_loss_17": 2.9894903898239136, + "ce_loss_23": 2.93879611492157, + "ce_loss_3": 3.8707842230796814, + "ce_loss_6": 3.4465802669525147, + "epoch": 0.622, + "grad_norm": 1368.0, + "kl_loss_12": 359.2712600708008, + "kl_loss_17": 92.23049201965333, + "kl_loss_3": 2029.2839599609374, + "kl_loss_6": 1159.1133087158203, + "learning_rate": 0.00031859614732467957, + "loss": 903.4491, + "step": 6220 + }, + { + "ce_loss_12": 3.156878316402435, + "ce_loss_17": 3.0391077756881715, + "ce_loss_23": 2.9916378617286683, + "ce_loss_3": 3.8727733850479127, + "ce_loss_6": 3.472846496105194, + "epoch": 0.623, + "grad_norm": 1216.0, + "kl_loss_12": 341.9202194213867, + "kl_loss_17": 88.34065742492676, + "kl_loss_3": 1915.2870239257813, + "kl_loss_6": 1094.0911712646484, + "learning_rate": 0.00031711850903806275, + "loss": 864.2437, + "step": 6230 + }, + { + "ce_loss_12": 3.071133351325989, + "ce_loss_17": 2.9489038586616516, + "ce_loss_23": 2.898697924613953, + "ce_loss_3": 3.8311947345733643, + "ce_loss_6": 3.404621160030365, + "epoch": 0.624, + "grad_norm": 928.0, + "kl_loss_12": 356.1224136352539, + "kl_loss_17": 91.2601390838623, + "kl_loss_3": 2014.823046875, + "kl_loss_6": 1142.6964630126954, + "learning_rate": 0.0003156427123660297, + "loss": 883.211, + "step": 6240 + }, + { + "ce_loss_12": 3.1468286871910096, + "ce_loss_17": 3.0284937620162964, + "ce_loss_23": 2.9805765390396117, + "ce_loss_3": 3.8659072875976563, + "ce_loss_6": 3.4674535155296327, + "epoch": 0.625, + "grad_norm": 932.0, + "kl_loss_12": 346.74879150390626, + "kl_loss_17": 88.12692947387696, + "kl_loss_3": 1929.708056640625, + "kl_loss_6": 1108.6400970458985, + "learning_rate": 0.0003141687721698363, + "loss": 882.9596, + "step": 6250 + }, + { + "ce_loss_12": 3.1200728535652162, + "ce_loss_17": 3.0045360565185546, + "ce_loss_23": 2.9588016986846926, + "ce_loss_3": 3.823894214630127, + "ce_loss_6": 3.420210433006287, + "epoch": 0.626, + "grad_norm": 1056.0, + "kl_loss_12": 330.5125015258789, + "kl_loss_17": 85.87870597839355, + "kl_loss_3": 1873.3631591796875, + "kl_loss_6": 1061.2933288574218, + "learning_rate": 0.00031269670329204396, + "loss": 861.3563, + "step": 6260 + }, + { + "ce_loss_12": 3.1616135358810427, + "ce_loss_17": 3.044730508327484, + "ce_loss_23": 2.998891019821167, + "ce_loss_3": 3.868626630306244, + "ce_loss_6": 3.478573274612427, + "epoch": 0.627, + "grad_norm": 1104.0, + "kl_loss_12": 343.72743530273436, + "kl_loss_17": 88.55654296875, + "kl_loss_3": 1918.8240783691406, + "kl_loss_6": 1103.6495300292968, + "learning_rate": 0.00031122652055637015, + "loss": 874.6357, + "step": 6270 + }, + { + "ce_loss_12": 3.117260479927063, + "ce_loss_17": 3.0006085872650146, + "ce_loss_23": 2.9568544030189514, + "ce_loss_3": 3.8618600845336912, + "ce_loss_6": 3.44553599357605, + "epoch": 0.628, + "grad_norm": 924.0, + "kl_loss_12": 351.38975219726564, + "kl_loss_17": 89.64115447998047, + "kl_loss_3": 1987.9572021484375, + "kl_loss_6": 1137.688525390625, + "learning_rate": 0.0003097582387675385, + "loss": 872.7838, + "step": 6280 + }, + { + "ce_loss_12": 3.1549283385276796, + "ce_loss_17": 3.0403485178947447, + "ce_loss_23": 2.9937935709953307, + "ce_loss_3": 3.8901216626167296, + "ce_loss_6": 3.482912063598633, + "epoch": 0.629, + "grad_norm": 1064.0, + "kl_loss_12": 348.3187423706055, + "kl_loss_17": 89.56165046691895, + "kl_loss_3": 1971.6549133300782, + "kl_loss_6": 1121.3032287597657, + "learning_rate": 0.00030829187271113034, + "loss": 875.4443, + "step": 6290 + }, + { + "ce_loss_12": 3.1473076701164246, + "ce_loss_17": 3.0363417267799377, + "ce_loss_23": 2.9871679186820983, + "ce_loss_3": 3.8660532355308534, + "ce_loss_6": 3.465396547317505, + "epoch": 0.63, + "grad_norm": 1120.0, + "kl_loss_12": 336.1268035888672, + "kl_loss_17": 87.23834419250488, + "kl_loss_3": 1905.2057373046875, + "kl_loss_6": 1086.9825592041016, + "learning_rate": 0.00030682743715343565, + "loss": 874.1156, + "step": 6300 + }, + { + "ce_loss_12": 3.099404048919678, + "ce_loss_17": 2.978138637542725, + "ce_loss_23": 2.928191804885864, + "ce_loss_3": 3.8511648416519164, + "ce_loss_6": 3.4375685691833495, + "epoch": 0.631, + "grad_norm": 1184.0, + "kl_loss_12": 354.819580078125, + "kl_loss_17": 91.24221687316894, + "kl_loss_3": 1976.64677734375, + "kl_loss_6": 1133.5737030029297, + "learning_rate": 0.0003053649468413043, + "loss": 895.8477, + "step": 6310 + }, + { + "ce_loss_12": 3.2084802389144897, + "ce_loss_17": 3.089839148521423, + "ce_loss_23": 3.039863979816437, + "ce_loss_3": 3.9332666873931883, + "ce_loss_6": 3.529804158210754, + "epoch": 0.632, + "grad_norm": 1352.0, + "kl_loss_12": 349.0574127197266, + "kl_loss_17": 90.16268653869629, + "kl_loss_3": 1947.8064453125, + "kl_loss_6": 1119.1037353515626, + "learning_rate": 0.00030390441650199725, + "loss": 874.6631, + "step": 6320 + }, + { + "ce_loss_12": 3.109989643096924, + "ce_loss_17": 2.993617391586304, + "ce_loss_23": 2.9476993680000305, + "ce_loss_3": 3.8355260372161863, + "ce_loss_6": 3.4362170815467836, + "epoch": 0.633, + "grad_norm": 932.0, + "kl_loss_12": 341.5677261352539, + "kl_loss_17": 87.88154983520508, + "kl_loss_3": 1922.833917236328, + "kl_loss_6": 1101.4575500488281, + "learning_rate": 0.00030244586084303903, + "loss": 865.0869, + "step": 6330 + }, + { + "ce_loss_12": 3.092487156391144, + "ce_loss_17": 2.970741927623749, + "ce_loss_23": 2.922047734260559, + "ce_loss_3": 3.845520186424255, + "ce_loss_6": 3.432330071926117, + "epoch": 0.634, + "grad_norm": 856.0, + "kl_loss_12": 354.30458679199216, + "kl_loss_17": 90.62174263000489, + "kl_loss_3": 2000.5931579589844, + "kl_loss_6": 1151.717950439453, + "learning_rate": 0.00030098929455206903, + "loss": 877.8803, + "step": 6340 + }, + { + "ce_loss_12": 3.0834580183029177, + "ce_loss_17": 2.968330907821655, + "ce_loss_23": 2.9212151169776917, + "ce_loss_3": 3.8257362484931945, + "ce_loss_6": 3.4121026039123534, + "epoch": 0.635, + "grad_norm": 1232.0, + "kl_loss_12": 341.7967956542969, + "kl_loss_17": 87.78205146789551, + "kl_loss_3": 1968.1894470214843, + "kl_loss_6": 1118.4420776367188, + "learning_rate": 0.00029953473229669324, + "loss": 897.0741, + "step": 6350 + }, + { + "ce_loss_12": 3.1118922233581543, + "ce_loss_17": 2.9967432618141174, + "ce_loss_23": 2.949505364894867, + "ce_loss_3": 3.8615681648254396, + "ce_loss_6": 3.448829710483551, + "epoch": 0.636, + "grad_norm": 1088.0, + "kl_loss_12": 349.56189880371096, + "kl_loss_17": 88.25876350402832, + "kl_loss_3": 1980.3615417480469, + "kl_loss_6": 1137.783837890625, + "learning_rate": 0.00029808218872433767, + "loss": 875.4202, + "step": 6360 + }, + { + "ce_loss_12": 3.1684186816215516, + "ce_loss_17": 3.054055690765381, + "ce_loss_23": 3.0063771963119508, + "ce_loss_3": 3.8962082982063295, + "ce_loss_6": 3.487860989570618, + "epoch": 0.637, + "grad_norm": 900.0, + "kl_loss_12": 340.72945098876954, + "kl_loss_17": 87.79001235961914, + "kl_loss_3": 1942.4515502929687, + "kl_loss_6": 1100.2603485107422, + "learning_rate": 0.0002966316784621, + "loss": 862.3001, + "step": 6370 + }, + { + "ce_loss_12": 3.0944795846939086, + "ce_loss_17": 2.9716971039772035, + "ce_loss_23": 2.9208733558654787, + "ce_loss_3": 3.8506593465805055, + "ce_loss_6": 3.4324075102806093, + "epoch": 0.638, + "grad_norm": 876.0, + "kl_loss_12": 353.5914505004883, + "kl_loss_17": 89.74720306396485, + "kl_loss_3": 1994.5528137207032, + "kl_loss_6": 1142.38095703125, + "learning_rate": 0.0002951832161166024, + "loss": 872.7703, + "step": 6380 + }, + { + "ce_loss_12": 3.1604527473449706, + "ce_loss_17": 3.0406524538993835, + "ce_loss_23": 2.988952672481537, + "ce_loss_3": 3.8952385902404787, + "ce_loss_6": 3.4893474698066713, + "epoch": 0.639, + "grad_norm": 960.0, + "kl_loss_12": 353.39493865966796, + "kl_loss_17": 90.45221481323242, + "kl_loss_3": 1959.9931701660157, + "kl_loss_6": 1130.186865234375, + "learning_rate": 0.0002937368162738445, + "loss": 868.2771, + "step": 6390 + }, + { + "ce_loss_12": 3.1031800866127015, + "ce_loss_17": 2.991485595703125, + "ce_loss_23": 2.9482226371765137, + "ce_loss_3": 3.8349567890167235, + "ce_loss_6": 3.4268946051597595, + "epoch": 0.64, + "grad_norm": 1376.0, + "kl_loss_12": 335.4634048461914, + "kl_loss_17": 85.38014450073243, + "kl_loss_3": 1946.2797729492188, + "kl_loss_6": 1106.7311584472657, + "learning_rate": 0.0002922924934990568, + "loss": 884.1594, + "step": 6400 + }, + { + "ce_loss_12": 3.0481751799583434, + "ce_loss_17": 2.928518009185791, + "ce_loss_23": 2.8782082557678224, + "ce_loss_3": 3.8162668228149412, + "ce_loss_6": 3.392408585548401, + "epoch": 0.641, + "grad_norm": 888.0, + "kl_loss_12": 347.2042831420898, + "kl_loss_17": 88.85644226074218, + "kl_loss_3": 2026.7457641601563, + "kl_loss_6": 1156.6705841064454, + "learning_rate": 0.0002908502623365536, + "loss": 888.1911, + "step": 6410 + }, + { + "ce_loss_12": 2.985366642475128, + "ce_loss_17": 2.8656646013259888, + "ce_loss_23": 2.8177972435951233, + "ce_loss_3": 3.7540844678878784, + "ce_loss_6": 3.326254391670227, + "epoch": 0.642, + "grad_norm": 1072.0, + "kl_loss_12": 346.08390350341796, + "kl_loss_17": 87.24323692321778, + "kl_loss_3": 2033.0718933105468, + "kl_loss_6": 1149.7964965820313, + "learning_rate": 0.0002894101373095867, + "loss": 888.3158, + "step": 6420 + }, + { + "ce_loss_12": 3.192607414722443, + "ce_loss_17": 3.0764593362808226, + "ce_loss_23": 3.0285913348197937, + "ce_loss_3": 3.91159108877182, + "ce_loss_6": 3.5078271865844726, + "epoch": 0.643, + "grad_norm": 1144.0, + "kl_loss_12": 346.41454162597654, + "kl_loss_17": 90.39042320251465, + "kl_loss_3": 1931.5027893066406, + "kl_loss_6": 1098.9398071289063, + "learning_rate": 0.00028797213292019926, + "loss": 868.6021, + "step": 6430 + }, + { + "ce_loss_12": 3.1715727925300596, + "ce_loss_17": 3.0562381982803344, + "ce_loss_23": 3.00568106174469, + "ce_loss_3": 3.901808428764343, + "ce_loss_6": 3.4961312413215637, + "epoch": 0.644, + "grad_norm": 1080.0, + "kl_loss_12": 351.4957702636719, + "kl_loss_17": 90.0567756652832, + "kl_loss_3": 1957.8979187011719, + "kl_loss_6": 1124.9115966796876, + "learning_rate": 0.0002865362636490791, + "loss": 894.7577, + "step": 6440 + }, + { + "ce_loss_12": 3.181258165836334, + "ce_loss_17": 3.0656901359558106, + "ce_loss_23": 3.017771327495575, + "ce_loss_3": 3.9070634961128237, + "ce_loss_6": 3.498952269554138, + "epoch": 0.645, + "grad_norm": 800.0, + "kl_loss_12": 344.0481979370117, + "kl_loss_17": 87.99182357788087, + "kl_loss_3": 1944.129833984375, + "kl_loss_6": 1109.7974243164062, + "learning_rate": 0.0002851025439554142, + "loss": 868.3958, + "step": 6450 + }, + { + "ce_loss_12": 3.163985526561737, + "ce_loss_17": 3.0450597405433655, + "ce_loss_23": 2.9969645500183106, + "ce_loss_3": 3.8787487506866456, + "ce_loss_6": 3.4883544564247133, + "epoch": 0.646, + "grad_norm": 1024.0, + "kl_loss_12": 343.316162109375, + "kl_loss_17": 88.75982627868652, + "kl_loss_3": 1901.1232666015626, + "kl_loss_6": 1103.8402252197266, + "learning_rate": 0.00028367098827674573, + "loss": 861.943, + "step": 6460 + }, + { + "ce_loss_12": 3.100857067108154, + "ce_loss_17": 2.9854223847389223, + "ce_loss_23": 2.9373781800270082, + "ce_loss_3": 3.8324575304985045, + "ce_loss_6": 3.418183147907257, + "epoch": 0.647, + "grad_norm": 1012.0, + "kl_loss_12": 340.87472381591795, + "kl_loss_17": 87.60878677368164, + "kl_loss_3": 1938.8650695800782, + "kl_loss_6": 1095.55888671875, + "learning_rate": 0.00028224161102882397, + "loss": 876.8292, + "step": 6470 + }, + { + "ce_loss_12": 3.0689453721046447, + "ce_loss_17": 2.957372415065765, + "ce_loss_23": 2.912537622451782, + "ce_loss_3": 3.794622015953064, + "ce_loss_6": 3.3898186564445494, + "epoch": 0.648, + "grad_norm": 1064.0, + "kl_loss_12": 338.6136016845703, + "kl_loss_17": 86.14839630126953, + "kl_loss_3": 1925.9338500976562, + "kl_loss_6": 1106.3623138427733, + "learning_rate": 0.00028081442660546124, + "loss": 873.1646, + "step": 6480 + }, + { + "ce_loss_12": 3.135506010055542, + "ce_loss_17": 3.0200687527656553, + "ce_loss_23": 2.9733571290969847, + "ce_loss_3": 3.8508325934410097, + "ce_loss_6": 3.450616705417633, + "epoch": 0.649, + "grad_norm": 944.0, + "kl_loss_12": 346.6316909790039, + "kl_loss_17": 88.86284599304199, + "kl_loss_3": 1919.2559936523437, + "kl_loss_6": 1095.6363037109375, + "learning_rate": 0.0002793894493783892, + "loss": 873.3639, + "step": 6490 + }, + { + "ce_loss_12": 3.148543083667755, + "ce_loss_17": 3.0385451436042787, + "ce_loss_23": 2.9913148522377013, + "ce_loss_3": 3.8768768310546875, + "ce_loss_6": 3.472771918773651, + "epoch": 0.65, + "grad_norm": 944.0, + "kl_loss_12": 337.93117370605466, + "kl_loss_17": 86.5095645904541, + "kl_loss_3": 1913.8453186035156, + "kl_loss_6": 1097.8296691894532, + "learning_rate": 0.0002779666936971129, + "loss": 864.3172, + "step": 6500 + }, + { + "ce_loss_12": 3.1691304445266724, + "ce_loss_17": 3.050932431221008, + "ce_loss_23": 3.0028540015220644, + "ce_loss_3": 3.907598543167114, + "ce_loss_6": 3.50301855802536, + "epoch": 0.651, + "grad_norm": 1048.0, + "kl_loss_12": 347.24015655517576, + "kl_loss_17": 87.80660743713379, + "kl_loss_3": 1963.5269592285156, + "kl_loss_6": 1127.437063598633, + "learning_rate": 0.00027654617388876614, + "loss": 881.9879, + "step": 6510 + }, + { + "ce_loss_12": 3.1799694776535032, + "ce_loss_17": 3.067358613014221, + "ce_loss_23": 3.017275059223175, + "ce_loss_3": 3.9080140829086303, + "ce_loss_6": 3.501030957698822, + "epoch": 0.652, + "grad_norm": 868.0, + "kl_loss_12": 345.9472351074219, + "kl_loss_17": 89.5890552520752, + "kl_loss_3": 1949.9092651367187, + "kl_loss_6": 1110.8510864257812, + "learning_rate": 0.0002751279042579672, + "loss": 876.3406, + "step": 6520 + }, + { + "ce_loss_12": 3.1285813450813293, + "ce_loss_17": 3.014330744743347, + "ce_loss_23": 2.9665272355079653, + "ce_loss_3": 3.847626578807831, + "ce_loss_6": 3.4419679284095763, + "epoch": 0.653, + "grad_norm": 1288.0, + "kl_loss_12": 337.32386627197263, + "kl_loss_17": 86.3776023864746, + "kl_loss_3": 1921.6126403808594, + "kl_loss_6": 1091.7433349609375, + "learning_rate": 0.00027371189908667604, + "loss": 878.335, + "step": 6530 + }, + { + "ce_loss_12": 3.188789117336273, + "ce_loss_17": 3.0666088819503785, + "ce_loss_23": 3.015404152870178, + "ce_loss_3": 3.9438202381134033, + "ce_loss_6": 3.523683476448059, + "epoch": 0.654, + "grad_norm": 1192.0, + "kl_loss_12": 356.82942962646484, + "kl_loss_17": 91.85579376220703, + "kl_loss_3": 1994.953546142578, + "kl_loss_6": 1134.5701141357422, + "learning_rate": 0.00027229817263404863, + "loss": 899.9428, + "step": 6540 + }, + { + "ce_loss_12": 3.1566753506660463, + "ce_loss_17": 3.047822904586792, + "ce_loss_23": 3.000984263420105, + "ce_loss_3": 3.8460690855979918, + "ce_loss_6": 3.467942750453949, + "epoch": 0.655, + "grad_norm": 1232.0, + "kl_loss_12": 336.4552856445313, + "kl_loss_17": 86.81032180786133, + "kl_loss_3": 1874.8006896972656, + "kl_loss_6": 1083.900930786133, + "learning_rate": 0.0002708867391362948, + "loss": 859.1926, + "step": 6550 + }, + { + "ce_loss_12": 3.1401477456092834, + "ce_loss_17": 3.0290218114852907, + "ce_loss_23": 2.9835850477218626, + "ce_loss_3": 3.841712236404419, + "ce_loss_6": 3.4439943075180053, + "epoch": 0.656, + "grad_norm": 1136.0, + "kl_loss_12": 329.05927124023435, + "kl_loss_17": 85.45395736694336, + "kl_loss_3": 1860.962646484375, + "kl_loss_6": 1059.7574798583985, + "learning_rate": 0.0002694776128065345, + "loss": 856.4867, + "step": 6560 + }, + { + "ce_loss_12": 3.0816213965415953, + "ce_loss_17": 2.9665581941604615, + "ce_loss_23": 2.9174054503440856, + "ce_loss_3": 3.8168569803237915, + "ce_loss_6": 3.410453128814697, + "epoch": 0.657, + "grad_norm": 1112.0, + "kl_loss_12": 349.16869201660154, + "kl_loss_17": 88.75262565612793, + "kl_loss_3": 1967.9637023925782, + "kl_loss_6": 1130.642840576172, + "learning_rate": 0.00026807080783465374, + "loss": 865.674, + "step": 6570 + }, + { + "ce_loss_12": 3.185134494304657, + "ce_loss_17": 3.0686640977859496, + "ce_loss_23": 3.022062087059021, + "ce_loss_3": 3.925693726539612, + "ce_loss_6": 3.520680642127991, + "epoch": 0.658, + "grad_norm": 996.0, + "kl_loss_12": 348.81055450439453, + "kl_loss_17": 89.27261466979981, + "kl_loss_3": 1972.7971313476562, + "kl_loss_6": 1136.3583374023438, + "learning_rate": 0.00026666633838716316, + "loss": 888.1977, + "step": 6580 + }, + { + "ce_loss_12": 3.094407868385315, + "ce_loss_17": 2.976101267337799, + "ce_loss_23": 2.924073362350464, + "ce_loss_3": 3.838667631149292, + "ce_loss_6": 3.427040421962738, + "epoch": 0.659, + "grad_norm": 1408.0, + "kl_loss_12": 355.3506591796875, + "kl_loss_17": 90.87181396484375, + "kl_loss_3": 1980.0582946777345, + "kl_loss_6": 1138.5891876220703, + "learning_rate": 0.00026526421860705474, + "loss": 891.5605, + "step": 6590 + }, + { + "ce_loss_12": 3.112147605419159, + "ce_loss_17": 2.9922482490539553, + "ce_loss_23": 2.943576455116272, + "ce_loss_3": 3.8509011030197144, + "ce_loss_6": 3.446483051776886, + "epoch": 0.66, + "grad_norm": 980.0, + "kl_loss_12": 350.5442840576172, + "kl_loss_17": 90.05998497009277, + "kl_loss_3": 1955.520330810547, + "kl_loss_6": 1126.2021484375, + "learning_rate": 0.0002638644626136587, + "loss": 870.4355, + "step": 6600 + }, + { + "ce_loss_12": 3.125984454154968, + "ce_loss_17": 3.0119236409664154, + "ce_loss_23": 2.965866357088089, + "ce_loss_3": 3.8583549737930296, + "ce_loss_6": 3.4497647523880004, + "epoch": 0.661, + "grad_norm": 864.0, + "kl_loss_12": 341.7684814453125, + "kl_loss_17": 86.04758720397949, + "kl_loss_3": 1936.7129333496093, + "kl_loss_6": 1113.954165649414, + "learning_rate": 0.00026246708450250255, + "loss": 873.6549, + "step": 6610 + }, + { + "ce_loss_12": 3.111504018306732, + "ce_loss_17": 2.9966780304908753, + "ce_loss_23": 2.9501025676727295, + "ce_loss_3": 3.827069544792175, + "ce_loss_6": 3.4274869561195374, + "epoch": 0.662, + "grad_norm": 1272.0, + "kl_loss_12": 341.32257080078125, + "kl_loss_17": 88.25752258300781, + "kl_loss_3": 1922.753759765625, + "kl_loss_6": 1097.9517578125, + "learning_rate": 0.00026107209834516854, + "loss": 864.8915, + "step": 6620 + }, + { + "ce_loss_12": 3.0738446593284605, + "ce_loss_17": 2.9582096576690673, + "ce_loss_23": 2.9104017496109007, + "ce_loss_3": 3.843344843387604, + "ce_loss_6": 3.418946015834808, + "epoch": 0.663, + "grad_norm": 1232.0, + "kl_loss_12": 348.487255859375, + "kl_loss_17": 88.84328269958496, + "kl_loss_3": 2021.6365051269531, + "kl_loss_6": 1148.4022888183595, + "learning_rate": 0.0002596795181891514, + "loss": 896.4338, + "step": 6630 + }, + { + "ce_loss_12": 3.082929515838623, + "ce_loss_17": 2.9603469371795654, + "ce_loss_23": 2.910764241218567, + "ce_loss_3": 3.821289229393005, + "ce_loss_6": 3.41152765750885, + "epoch": 0.664, + "grad_norm": 1280.0, + "kl_loss_12": 355.551774597168, + "kl_loss_17": 92.20106925964356, + "kl_loss_3": 1979.1425537109376, + "kl_loss_6": 1138.566717529297, + "learning_rate": 0.000258289358057718, + "loss": 917.6799, + "step": 6640 + }, + { + "ce_loss_12": 3.148861360549927, + "ce_loss_17": 3.025576043128967, + "ce_loss_23": 2.977057957649231, + "ce_loss_3": 3.89022433757782, + "ce_loss_6": 3.4762032985687257, + "epoch": 0.665, + "grad_norm": 940.0, + "kl_loss_12": 359.8484146118164, + "kl_loss_17": 93.93804359436035, + "kl_loss_3": 1993.680645751953, + "kl_loss_6": 1136.3337860107422, + "learning_rate": 0.0002569016319497657, + "loss": 894.0275, + "step": 6650 + }, + { + "ce_loss_12": 3.139300358295441, + "ce_loss_17": 3.019014894962311, + "ce_loss_23": 2.9669580936431883, + "ce_loss_3": 3.8815082907676697, + "ce_loss_6": 3.470949947834015, + "epoch": 0.666, + "grad_norm": 1016.0, + "kl_loss_12": 359.97652435302734, + "kl_loss_17": 93.55386581420899, + "kl_loss_3": 1994.8902160644532, + "kl_loss_6": 1142.2246398925781, + "learning_rate": 0.00025551635383968066, + "loss": 903.9031, + "step": 6660 + }, + { + "ce_loss_12": 3.047380030155182, + "ce_loss_17": 2.9349704504013063, + "ce_loss_23": 2.8862363934516906, + "ce_loss_3": 3.791304576396942, + "ce_loss_6": 3.378193163871765, + "epoch": 0.667, + "grad_norm": 1304.0, + "kl_loss_12": 349.3817199707031, + "kl_loss_17": 89.86996917724609, + "kl_loss_3": 1976.2346069335938, + "kl_loss_6": 1131.2955963134766, + "learning_rate": 0.00025413353767719804, + "loss": 888.0734, + "step": 6670 + }, + { + "ce_loss_12": 3.107878494262695, + "ce_loss_17": 2.9940646171569822, + "ce_loss_23": 2.948066794872284, + "ce_loss_3": 3.8429067969322204, + "ce_loss_6": 3.4393661618232727, + "epoch": 0.668, + "grad_norm": 1536.0, + "kl_loss_12": 344.29654083251955, + "kl_loss_17": 86.87348518371581, + "kl_loss_3": 1965.0328430175782, + "kl_loss_6": 1133.0926055908203, + "learning_rate": 0.0002527531973872617, + "loss": 882.2666, + "step": 6680 + }, + { + "ce_loss_12": 3.1252907514572144, + "ce_loss_17": 3.0092085719108583, + "ce_loss_23": 2.9613227605819703, + "ce_loss_3": 3.836043107509613, + "ce_loss_6": 3.437214457988739, + "epoch": 0.669, + "grad_norm": 1088.0, + "kl_loss_12": 342.63561553955077, + "kl_loss_17": 87.7429012298584, + "kl_loss_3": 1920.3994262695312, + "kl_loss_6": 1103.2438995361329, + "learning_rate": 0.0002513753468698826, + "loss": 866.9679, + "step": 6690 + }, + { + "ce_loss_12": 3.090426170825958, + "ce_loss_17": 2.971407890319824, + "ce_loss_23": 2.9234967947006227, + "ce_loss_3": 3.8281372785568237, + "ce_loss_6": 3.412463891506195, + "epoch": 0.67, + "grad_norm": 1232.0, + "kl_loss_12": 352.12214813232424, + "kl_loss_17": 89.99090042114258, + "kl_loss_3": 1990.1664123535156, + "kl_loss_6": 1134.3204223632813, + "learning_rate": 0.0002500000000000001, + "loss": 886.1559, + "step": 6700 + }, + { + "ce_loss_12": 3.1942543387413025, + "ce_loss_17": 3.0864752650260927, + "ce_loss_23": 3.0406001448631286, + "ce_loss_3": 3.8843395829200746, + "ce_loss_6": 3.4964444279670714, + "epoch": 0.671, + "grad_norm": 980.0, + "kl_loss_12": 336.6088241577148, + "kl_loss_17": 86.05256309509278, + "kl_loss_3": 1867.6397338867187, + "kl_loss_6": 1070.227133178711, + "learning_rate": 0.0002486271706273421, + "loss": 887.4312, + "step": 6710 + }, + { + "ce_loss_12": 3.1309102058410643, + "ce_loss_17": 3.022629773616791, + "ce_loss_23": 2.977077305316925, + "ce_loss_3": 3.8247514367103577, + "ce_loss_6": 3.442046856880188, + "epoch": 0.672, + "grad_norm": 1032.0, + "kl_loss_12": 332.57128143310547, + "kl_loss_17": 85.41795845031739, + "kl_loss_3": 1874.3389892578125, + "kl_loss_6": 1078.250180053711, + "learning_rate": 0.0002472568725762853, + "loss": 869.0738, + "step": 6720 + }, + { + "ce_loss_12": 3.1272271275520325, + "ce_loss_17": 3.0170477271080016, + "ce_loss_23": 2.9705397963523863, + "ce_loss_3": 3.8198402643203737, + "ce_loss_6": 3.4317238688468934, + "epoch": 0.673, + "grad_norm": 1408.0, + "kl_loss_12": 330.74756927490233, + "kl_loss_17": 84.99472198486328, + "kl_loss_3": 1873.458514404297, + "kl_loss_6": 1063.2597229003907, + "learning_rate": 0.00024588911964571554, + "loss": 855.2377, + "step": 6730 + }, + { + "ce_loss_12": 3.143399214744568, + "ce_loss_17": 3.024151027202606, + "ce_loss_23": 2.9710415840148925, + "ce_loss_3": 3.900429677963257, + "ce_loss_6": 3.483956551551819, + "epoch": 0.674, + "grad_norm": 904.0, + "kl_loss_12": 359.8742080688477, + "kl_loss_17": 94.40918464660645, + "kl_loss_3": 1998.6011291503905, + "kl_loss_6": 1151.3028686523437, + "learning_rate": 0.00024452392560888974, + "loss": 879.9549, + "step": 6740 + }, + { + "ce_loss_12": 3.037467384338379, + "ce_loss_17": 2.920524501800537, + "ce_loss_23": 2.872658061981201, + "ce_loss_3": 3.7702141404151917, + "ce_loss_6": 3.3598833203315737, + "epoch": 0.675, + "grad_norm": 824.0, + "kl_loss_12": 333.89322967529296, + "kl_loss_17": 85.252885055542, + "kl_loss_3": 1944.8146240234375, + "kl_loss_6": 1102.9042541503907, + "learning_rate": 0.00024316130421329695, + "loss": 859.0686, + "step": 6750 + }, + { + "ce_loss_12": 3.110549819469452, + "ce_loss_17": 2.9969101905822755, + "ce_loss_23": 2.9517931759357454, + "ce_loss_3": 3.8281806349754333, + "ce_loss_6": 3.4278959035873413, + "epoch": 0.676, + "grad_norm": 980.0, + "kl_loss_12": 339.3283462524414, + "kl_loss_17": 87.01463661193847, + "kl_loss_3": 1923.8144470214843, + "kl_loss_6": 1103.3825225830078, + "learning_rate": 0.00024180126918051909, + "loss": 870.1158, + "step": 6760 + }, + { + "ce_loss_12": 3.157505512237549, + "ce_loss_17": 3.043172836303711, + "ce_loss_23": 2.996259617805481, + "ce_loss_3": 3.8729994893074036, + "ce_loss_6": 3.469846773147583, + "epoch": 0.677, + "grad_norm": 1144.0, + "kl_loss_12": 343.28330078125, + "kl_loss_17": 87.83675193786621, + "kl_loss_3": 1921.9919799804688, + "kl_loss_6": 1093.9075073242188, + "learning_rate": 0.00024044383420609406, + "loss": 859.0797, + "step": 6770 + }, + { + "ce_loss_12": 3.167113757133484, + "ce_loss_17": 3.0563209533691404, + "ce_loss_23": 3.0103363037109374, + "ce_loss_3": 3.8601980209350586, + "ce_loss_6": 3.4720909357070924, + "epoch": 0.678, + "grad_norm": 1184.0, + "kl_loss_12": 333.5920715332031, + "kl_loss_17": 86.2124641418457, + "kl_loss_3": 1889.3303771972655, + "kl_loss_6": 1086.4960693359376, + "learning_rate": 0.00023908901295937712, + "loss": 873.1142, + "step": 6780 + }, + { + "ce_loss_12": 3.1507126450538636, + "ce_loss_17": 3.0366512298583985, + "ce_loss_23": 2.989604079723358, + "ce_loss_3": 3.8650084137916565, + "ce_loss_6": 3.472484755516052, + "epoch": 0.679, + "grad_norm": 1456.0, + "kl_loss_12": 336.9345169067383, + "kl_loss_17": 87.20873107910157, + "kl_loss_3": 1891.5245971679688, + "kl_loss_6": 1082.5831726074218, + "learning_rate": 0.00023773681908340283, + "loss": 879.5051, + "step": 6790 + }, + { + "ce_loss_12": 3.135229218006134, + "ce_loss_17": 3.01503472328186, + "ce_loss_23": 2.96337548494339, + "ce_loss_3": 3.88761568069458, + "ce_loss_6": 3.474038767814636, + "epoch": 0.68, + "grad_norm": 1512.0, + "kl_loss_12": 359.9027587890625, + "kl_loss_17": 93.6316131591797, + "kl_loss_3": 2011.5318969726563, + "kl_loss_6": 1154.1799377441407, + "learning_rate": 0.00023638726619474876, + "loss": 906.6357, + "step": 6800 + }, + { + "ce_loss_12": 3.1319125294685364, + "ce_loss_17": 3.0071893095970155, + "ce_loss_23": 2.956976556777954, + "ce_loss_3": 3.8915478587150574, + "ce_loss_6": 3.468007779121399, + "epoch": 0.681, + "grad_norm": 1208.0, + "kl_loss_12": 354.7691177368164, + "kl_loss_17": 90.4473373413086, + "kl_loss_3": 1999.4680297851562, + "kl_loss_6": 1150.2789642333985, + "learning_rate": 0.0002350403678833976, + "loss": 885.597, + "step": 6810 + }, + { + "ce_loss_12": 3.0541595101356505, + "ce_loss_17": 2.937679874897003, + "ce_loss_23": 2.8889075756072997, + "ce_loss_3": 3.793595790863037, + "ce_loss_6": 3.3809202313423157, + "epoch": 0.682, + "grad_norm": 888.0, + "kl_loss_12": 339.1263671875, + "kl_loss_17": 86.2660758972168, + "kl_loss_3": 1964.4327758789063, + "kl_loss_6": 1122.2195220947265, + "learning_rate": 0.00023369613771260007, + "loss": 871.1181, + "step": 6820 + }, + { + "ce_loss_12": 3.1654935002326967, + "ce_loss_17": 3.0478216648101806, + "ce_loss_23": 3.000415086746216, + "ce_loss_3": 3.908292329311371, + "ce_loss_6": 3.494389367103577, + "epoch": 0.683, + "grad_norm": 992.0, + "kl_loss_12": 349.88165588378905, + "kl_loss_17": 89.48608589172363, + "kl_loss_3": 1978.239434814453, + "kl_loss_6": 1126.9151611328125, + "learning_rate": 0.00023235458921873925, + "loss": 888.1568, + "step": 6830 + }, + { + "ce_loss_12": 3.133337688446045, + "ce_loss_17": 3.009847807884216, + "ce_loss_23": 2.9573482275009155, + "ce_loss_3": 3.9090792179107665, + "ce_loss_6": 3.488210666179657, + "epoch": 0.684, + "grad_norm": 1216.0, + "kl_loss_12": 364.1305847167969, + "kl_loss_17": 93.46566314697266, + "kl_loss_3": 2065.647784423828, + "kl_loss_6": 1187.9217010498046, + "learning_rate": 0.0002310157359111938, + "loss": 916.889, + "step": 6840 + }, + { + "ce_loss_12": 3.0303258895874023, + "ce_loss_17": 2.9021595239639284, + "ce_loss_23": 2.8519881129264832, + "ce_loss_3": 3.840841066837311, + "ce_loss_6": 3.3887616753578187, + "epoch": 0.685, + "grad_norm": 1696.0, + "kl_loss_12": 356.69389190673826, + "kl_loss_17": 90.0155418395996, + "kl_loss_3": 2106.9754943847656, + "kl_loss_6": 1189.3093841552734, + "learning_rate": 0.0002296795912722014, + "loss": 915.0975, + "step": 6850 + }, + { + "ce_loss_12": 3.1557368278503417, + "ce_loss_17": 3.0395566701889036, + "ce_loss_23": 2.9924458980560305, + "ce_loss_3": 3.868706929683685, + "ce_loss_6": 3.4693487644195558, + "epoch": 0.686, + "grad_norm": 904.0, + "kl_loss_12": 342.0593780517578, + "kl_loss_17": 88.2308448791504, + "kl_loss_3": 1928.0559936523437, + "kl_loss_6": 1095.3077056884765, + "learning_rate": 0.0002283461687567236, + "loss": 855.7789, + "step": 6860 + }, + { + "ce_loss_12": 3.2024538993835447, + "ce_loss_17": 3.0915324687957764, + "ce_loss_23": 3.044475185871124, + "ce_loss_3": 3.900763988494873, + "ce_loss_6": 3.5116928100585936, + "epoch": 0.687, + "grad_norm": 748.0, + "kl_loss_12": 335.2772933959961, + "kl_loss_17": 87.18923454284668, + "kl_loss_3": 1872.83974609375, + "kl_loss_6": 1073.4641540527343, + "learning_rate": 0.00022701548179231045, + "loss": 868.1582, + "step": 6870 + }, + { + "ce_loss_12": 3.164746177196503, + "ce_loss_17": 3.047294557094574, + "ce_loss_23": 2.99811338186264, + "ce_loss_3": 3.9032017946243287, + "ce_loss_6": 3.4921378016471865, + "epoch": 0.688, + "grad_norm": 1008.0, + "kl_loss_12": 348.18983459472656, + "kl_loss_17": 90.73311271667481, + "kl_loss_3": 1962.2305603027344, + "kl_loss_6": 1121.4248413085938, + "learning_rate": 0.00022568754377896516, + "loss": 866.5005, + "step": 6880 + }, + { + "ce_loss_12": 3.156641185283661, + "ce_loss_17": 3.0423587560653687, + "ce_loss_23": 2.9942126631736756, + "ce_loss_3": 3.866535186767578, + "ce_loss_6": 3.4736101984977723, + "epoch": 0.689, + "grad_norm": 1080.0, + "kl_loss_12": 345.99285278320315, + "kl_loss_17": 87.87312545776368, + "kl_loss_3": 1916.1283081054687, + "kl_loss_6": 1102.4018157958985, + "learning_rate": 0.00022436236808900844, + "loss": 861.774, + "step": 6890 + }, + { + "ce_loss_12": 3.0551820278167723, + "ce_loss_17": 2.9351372838020326, + "ce_loss_23": 2.885222589969635, + "ce_loss_3": 3.7950286746025084, + "ce_loss_6": 3.3820699572563173, + "epoch": 0.69, + "grad_norm": 1224.0, + "kl_loss_12": 347.7524154663086, + "kl_loss_17": 88.94291534423829, + "kl_loss_3": 1973.8519226074218, + "kl_loss_6": 1127.2658599853517, + "learning_rate": 0.00022303996806694487, + "loss": 875.0857, + "step": 6900 + }, + { + "ce_loss_12": 3.1233952760696413, + "ce_loss_17": 3.0083462238311767, + "ce_loss_23": 2.962642455101013, + "ce_loss_3": 3.8505759239196777, + "ce_loss_6": 3.449055218696594, + "epoch": 0.691, + "grad_norm": 1004.0, + "kl_loss_12": 338.82100067138674, + "kl_loss_17": 85.58787803649902, + "kl_loss_3": 1945.1169677734374, + "kl_loss_6": 1111.6448516845703, + "learning_rate": 0.00022172035702932823, + "loss": 868.8034, + "step": 6910 + }, + { + "ce_loss_12": 3.1731364607810972, + "ce_loss_17": 3.061260771751404, + "ce_loss_23": 3.012500262260437, + "ce_loss_3": 3.884373688697815, + "ce_loss_6": 3.4877657175064085, + "epoch": 0.692, + "grad_norm": 1240.0, + "kl_loss_12": 338.7787292480469, + "kl_loss_17": 88.33567352294922, + "kl_loss_3": 1898.0742065429688, + "kl_loss_6": 1094.025244140625, + "learning_rate": 0.00022040354826462666, + "loss": 861.8279, + "step": 6920 + }, + { + "ce_loss_12": 3.098303437232971, + "ce_loss_17": 2.9883047938346863, + "ce_loss_23": 2.9403274059295654, + "ce_loss_3": 3.8265650987625124, + "ce_loss_6": 3.4196027517318726, + "epoch": 0.693, + "grad_norm": 1128.0, + "kl_loss_12": 334.6059051513672, + "kl_loss_17": 86.24344863891602, + "kl_loss_3": 1942.0025817871094, + "kl_loss_6": 1100.2935638427734, + "learning_rate": 0.0002190895550330899, + "loss": 879.4563, + "step": 6930 + }, + { + "ce_loss_12": 3.0402795314788817, + "ce_loss_17": 2.9218369483947755, + "ce_loss_23": 2.870386278629303, + "ce_loss_3": 3.793871819972992, + "ce_loss_6": 3.3839752197265627, + "epoch": 0.694, + "grad_norm": 1048.0, + "kl_loss_12": 351.19903564453125, + "kl_loss_17": 90.00217399597167, + "kl_loss_3": 1991.8451843261719, + "kl_loss_6": 1148.4777099609375, + "learning_rate": 0.00021777839056661552, + "loss": 872.2974, + "step": 6940 + }, + { + "ce_loss_12": 3.1183370471000673, + "ce_loss_17": 3.0035004377365113, + "ce_loss_23": 2.95679976940155, + "ce_loss_3": 3.8357904314994813, + "ce_loss_6": 3.437212681770325, + "epoch": 0.695, + "grad_norm": 852.0, + "kl_loss_12": 337.89208068847654, + "kl_loss_17": 87.83383026123047, + "kl_loss_3": 1916.7237243652344, + "kl_loss_6": 1096.3088836669922, + "learning_rate": 0.0002164700680686147, + "loss": 853.3035, + "step": 6950 + }, + { + "ce_loss_12": 3.163424181938171, + "ce_loss_17": 3.0453892946243286, + "ce_loss_23": 2.998319923877716, + "ce_loss_3": 3.864614713191986, + "ce_loss_6": 3.4790354490280153, + "epoch": 0.696, + "grad_norm": 964.0, + "kl_loss_12": 342.2417327880859, + "kl_loss_17": 88.33109970092774, + "kl_loss_3": 1880.5371337890624, + "kl_loss_6": 1082.1784576416017, + "learning_rate": 0.0002151646007138806, + "loss": 855.9359, + "step": 6960 + }, + { + "ce_loss_12": 3.052464473247528, + "ce_loss_17": 2.939312756061554, + "ce_loss_23": 2.888992202281952, + "ce_loss_3": 3.794123411178589, + "ce_loss_6": 3.389705038070679, + "epoch": 0.697, + "grad_norm": 940.0, + "kl_loss_12": 350.82405548095704, + "kl_loss_17": 90.2900203704834, + "kl_loss_3": 1985.181329345703, + "kl_loss_6": 1139.6411743164062, + "learning_rate": 0.00021386200164845526, + "loss": 878.1575, + "step": 6970 + }, + { + "ce_loss_12": 3.2136444211006165, + "ce_loss_17": 3.10131151676178, + "ce_loss_23": 3.0563345670700075, + "ce_loss_3": 3.9051543354988096, + "ce_loss_6": 3.5153585076332092, + "epoch": 0.698, + "grad_norm": 832.0, + "kl_loss_12": 337.0234146118164, + "kl_loss_17": 86.35291862487793, + "kl_loss_3": 1872.8874267578126, + "kl_loss_6": 1080.3144561767579, + "learning_rate": 0.0002125622839894964, + "loss": 852.6143, + "step": 6980 + }, + { + "ce_loss_12": 3.1607075691223145, + "ce_loss_17": 3.0520373225212096, + "ce_loss_23": 3.0057042121887205, + "ce_loss_3": 3.8714554071426392, + "ce_loss_6": 3.475553238391876, + "epoch": 0.699, + "grad_norm": 1064.0, + "kl_loss_12": 332.19777221679686, + "kl_loss_17": 86.07902908325195, + "kl_loss_3": 1877.6947509765625, + "kl_loss_6": 1072.3549865722657, + "learning_rate": 0.00021126546082514663, + "loss": 851.5051, + "step": 6990 + }, + { + "ce_loss_12": 3.1810610413551332, + "ce_loss_17": 3.069001257419586, + "ce_loss_23": 3.023350381851196, + "ce_loss_3": 3.877126181125641, + "ce_loss_6": 3.493105447292328, + "epoch": 0.7, + "grad_norm": 1320.0, + "kl_loss_12": 338.05188903808596, + "kl_loss_17": 87.35322494506836, + "kl_loss_3": 1883.8476501464843, + "kl_loss_6": 1090.8296142578124, + "learning_rate": 0.00020997154521440098, + "loss": 851.0469, + "step": 7000 + }, + { + "ce_loss_12": 3.1302197575569153, + "ce_loss_17": 3.016958808898926, + "ce_loss_23": 2.9736286282539366, + "ce_loss_3": 3.8471718311309813, + "ce_loss_6": 3.4467532515525816, + "epoch": 0.701, + "grad_norm": 804.0, + "kl_loss_12": 336.49234161376955, + "kl_loss_17": 85.4226303100586, + "kl_loss_3": 1913.783056640625, + "kl_loss_6": 1095.0326568603516, + "learning_rate": 0.0002086805501869749, + "loss": 852.4146, + "step": 7010 + }, + { + "ce_loss_12": 3.109818232059479, + "ce_loss_17": 2.988493525981903, + "ce_loss_23": 2.9379398584365846, + "ce_loss_3": 3.852206456661224, + "ce_loss_6": 3.4425936937332153, + "epoch": 0.702, + "grad_norm": 1096.0, + "kl_loss_12": 353.8260467529297, + "kl_loss_17": 90.05978813171387, + "kl_loss_3": 1995.5918395996093, + "kl_loss_6": 1145.5393829345703, + "learning_rate": 0.0002073924887431744, + "loss": 878.7024, + "step": 7020 + }, + { + "ce_loss_12": 3.1103074669837953, + "ce_loss_17": 2.996567392349243, + "ce_loss_23": 2.9493033409118654, + "ce_loss_3": 3.8395477652549745, + "ce_loss_6": 3.4339751482009886, + "epoch": 0.703, + "grad_norm": 1160.0, + "kl_loss_12": 344.3564910888672, + "kl_loss_17": 87.64359970092774, + "kl_loss_3": 1950.38740234375, + "kl_loss_6": 1118.0618469238282, + "learning_rate": 0.00020610737385376348, + "loss": 890.4451, + "step": 7030 + }, + { + "ce_loss_12": 3.1601242661476134, + "ce_loss_17": 3.0453317403793334, + "ce_loss_23": 2.998131000995636, + "ce_loss_3": 3.851083850860596, + "ce_loss_6": 3.4640984535217285, + "epoch": 0.704, + "grad_norm": 1064.0, + "kl_loss_12": 337.7087921142578, + "kl_loss_17": 86.78579254150391, + "kl_loss_3": 1862.4344177246094, + "kl_loss_6": 1071.2653228759766, + "learning_rate": 0.00020482521845983521, + "loss": 868.2957, + "step": 7040 + }, + { + "ce_loss_12": 3.1671868681907656, + "ce_loss_17": 3.0499786496162415, + "ce_loss_23": 2.9989811420440673, + "ce_loss_3": 3.8885140657424926, + "ce_loss_6": 3.4780256271362306, + "epoch": 0.705, + "grad_norm": 1272.0, + "kl_loss_12": 346.44360504150393, + "kl_loss_17": 90.91349830627442, + "kl_loss_3": 1948.5189453125, + "kl_loss_6": 1106.0490844726562, + "learning_rate": 0.00020354603547267987, + "loss": 880.9363, + "step": 7050 + }, + { + "ce_loss_12": 3.1477226972579957, + "ce_loss_17": 3.0317694783210754, + "ce_loss_23": 2.9822145223617555, + "ce_loss_3": 3.893599569797516, + "ce_loss_6": 3.4841063261032104, + "epoch": 0.706, + "grad_norm": 948.0, + "kl_loss_12": 349.3151168823242, + "kl_loss_17": 89.96737899780274, + "kl_loss_3": 1960.026806640625, + "kl_loss_6": 1124.279348754883, + "learning_rate": 0.00020226983777365604, + "loss": 898.4166, + "step": 7060 + }, + { + "ce_loss_12": 3.055716669559479, + "ce_loss_17": 2.9457828521728517, + "ce_loss_23": 2.8991126537323, + "ce_loss_3": 3.8222302317619326, + "ce_loss_6": 3.3887375950813294, + "epoch": 0.707, + "grad_norm": 1064.0, + "kl_loss_12": 335.7860305786133, + "kl_loss_17": 85.3084270477295, + "kl_loss_3": 1999.919091796875, + "kl_loss_6": 1111.3100799560548, + "learning_rate": 0.00020099663821406056, + "loss": 872.2579, + "step": 7070 + }, + { + "ce_loss_12": 3.149730110168457, + "ce_loss_17": 3.0381961107254027, + "ce_loss_23": 2.9923747301101686, + "ce_loss_3": 3.8555148601531983, + "ce_loss_6": 3.462322497367859, + "epoch": 0.708, + "grad_norm": 1336.0, + "kl_loss_12": 332.4214065551758, + "kl_loss_17": 86.18175811767578, + "kl_loss_3": 1885.1385131835937, + "kl_loss_6": 1081.348455810547, + "learning_rate": 0.00019972644961499853, + "loss": 866.4621, + "step": 7080 + }, + { + "ce_loss_12": 3.128303360939026, + "ce_loss_17": 3.0102851152420045, + "ce_loss_23": 2.9599443554878233, + "ce_loss_3": 3.876959776878357, + "ce_loss_6": 3.456597423553467, + "epoch": 0.709, + "grad_norm": 1020.0, + "kl_loss_12": 352.72449493408203, + "kl_loss_17": 90.64090385437012, + "kl_loss_3": 1994.6687866210937, + "kl_loss_6": 1131.3768310546875, + "learning_rate": 0.00019845928476725522, + "loss": 882.0244, + "step": 7090 + }, + { + "ce_loss_12": 3.2001837849617005, + "ce_loss_17": 3.0850595355033876, + "ce_loss_23": 3.034242641925812, + "ce_loss_3": 3.914669561386108, + "ce_loss_6": 3.519736313819885, + "epoch": 0.71, + "grad_norm": 728.0, + "kl_loss_12": 345.6641387939453, + "kl_loss_17": 89.23448104858399, + "kl_loss_3": 1916.251934814453, + "kl_loss_6": 1110.915444946289, + "learning_rate": 0.00019719515643116677, + "loss": 895.3077, + "step": 7100 + }, + { + "ce_loss_12": 3.1400701642036437, + "ce_loss_17": 3.025814187526703, + "ce_loss_23": 2.9760770320892336, + "ce_loss_3": 3.8354206919670104, + "ce_loss_6": 3.4461462020874025, + "epoch": 0.711, + "grad_norm": 1048.0, + "kl_loss_12": 337.6152114868164, + "kl_loss_17": 87.09186248779297, + "kl_loss_3": 1894.4208740234376, + "kl_loss_6": 1079.6950958251953, + "learning_rate": 0.0001959340773364911, + "loss": 868.769, + "step": 7110 + }, + { + "ce_loss_12": 3.1553873419761658, + "ce_loss_17": 3.0396679520606993, + "ce_loss_23": 2.991921770572662, + "ce_loss_3": 3.885389029979706, + "ce_loss_6": 3.4764964818954467, + "epoch": 0.712, + "grad_norm": 840.0, + "kl_loss_12": 343.460107421875, + "kl_loss_17": 87.91554718017578, + "kl_loss_3": 1938.1870910644532, + "kl_loss_6": 1113.4761871337892, + "learning_rate": 0.0001946760601822809, + "loss": 854.4359, + "step": 7120 + }, + { + "ce_loss_12": 3.2049408555030823, + "ce_loss_17": 3.0892166614532472, + "ce_loss_23": 3.0430971145629884, + "ce_loss_3": 3.9063705444335937, + "ce_loss_6": 3.5126201152801513, + "epoch": 0.713, + "grad_norm": 1056.0, + "kl_loss_12": 338.95025939941405, + "kl_loss_17": 86.48576545715332, + "kl_loss_3": 1890.3039306640626, + "kl_loss_6": 1079.1358520507813, + "learning_rate": 0.00019342111763675512, + "loss": 841.8441, + "step": 7130 + }, + { + "ce_loss_12": 3.2013929009437563, + "ce_loss_17": 3.090296244621277, + "ce_loss_23": 3.039723289012909, + "ce_loss_3": 3.8926509261131286, + "ce_loss_6": 3.5071531534194946, + "epoch": 0.714, + "grad_norm": 1880.0, + "kl_loss_12": 339.79481201171876, + "kl_loss_17": 88.69997673034668, + "kl_loss_3": 1879.4584411621095, + "kl_loss_6": 1082.5133972167969, + "learning_rate": 0.00019216926233717085, + "loss": 849.3725, + "step": 7140 + }, + { + "ce_loss_12": 3.095425474643707, + "ce_loss_17": 2.9842472314834594, + "ce_loss_23": 2.9373265862464906, + "ce_loss_3": 3.8718192100524904, + "ce_loss_6": 3.442226326465607, + "epoch": 0.715, + "grad_norm": 1048.0, + "kl_loss_12": 337.72288665771487, + "kl_loss_17": 87.01406669616699, + "kl_loss_3": 2025.1176879882812, + "kl_loss_6": 1144.7917572021483, + "learning_rate": 0.00019092050688969737, + "loss": 886.4156, + "step": 7150 + }, + { + "ce_loss_12": 3.160278844833374, + "ce_loss_17": 3.0518559575080872, + "ce_loss_23": 3.003028392791748, + "ce_loss_3": 3.862757754325867, + "ce_loss_6": 3.4689655780792235, + "epoch": 0.716, + "grad_norm": 1024.0, + "kl_loss_12": 334.71154327392577, + "kl_loss_17": 86.16739501953126, + "kl_loss_3": 1914.939208984375, + "kl_loss_6": 1089.51484375, + "learning_rate": 0.00018967486386928817, + "loss": 852.5919, + "step": 7160 + }, + { + "ce_loss_12": 3.0488394856452943, + "ce_loss_17": 2.930952322483063, + "ce_loss_23": 2.881786513328552, + "ce_loss_3": 3.789450442790985, + "ce_loss_6": 3.3785236358642576, + "epoch": 0.717, + "grad_norm": 1168.0, + "kl_loss_12": 345.0021209716797, + "kl_loss_17": 87.61870803833008, + "kl_loss_3": 1971.9166870117188, + "kl_loss_6": 1127.8095123291016, + "learning_rate": 0.00018843234581955443, + "loss": 904.8341, + "step": 7170 + }, + { + "ce_loss_12": 3.054139864444733, + "ce_loss_17": 2.93440922498703, + "ce_loss_23": 2.8850677728652956, + "ce_loss_3": 3.8047616720199584, + "ce_loss_6": 3.3870360374450685, + "epoch": 0.718, + "grad_norm": 1392.0, + "kl_loss_12": 345.5686233520508, + "kl_loss_17": 88.29729309082032, + "kl_loss_3": 1980.608221435547, + "kl_loss_6": 1131.4310211181642, + "learning_rate": 0.00018719296525263924, + "loss": 881.9613, + "step": 7180 + }, + { + "ce_loss_12": 3.1456296682357787, + "ce_loss_17": 3.035650098323822, + "ce_loss_23": 2.9887816667556764, + "ce_loss_3": 3.8410556077957154, + "ce_loss_6": 3.4524468541145326, + "epoch": 0.719, + "grad_norm": 1120.0, + "kl_loss_12": 334.7310394287109, + "kl_loss_17": 86.69911155700683, + "kl_loss_3": 1863.3901123046876, + "kl_loss_6": 1063.2905212402343, + "learning_rate": 0.0001859567346490913, + "loss": 844.3992, + "step": 7190 + }, + { + "ce_loss_12": 3.1302160382270814, + "ce_loss_17": 3.0147815585136413, + "ce_loss_23": 2.9667599201202393, + "ce_loss_3": 3.867290210723877, + "ce_loss_6": 3.4623439788818358, + "epoch": 0.72, + "grad_norm": 1176.0, + "kl_loss_12": 349.16916809082034, + "kl_loss_17": 90.70890274047852, + "kl_loss_3": 1963.2574035644532, + "kl_loss_6": 1129.784686279297, + "learning_rate": 0.0001847236664577389, + "loss": 867.5368, + "step": 7200 + }, + { + "ce_loss_12": 3.1433603167533875, + "ce_loss_17": 3.035125803947449, + "ce_loss_23": 2.9883386731147765, + "ce_loss_3": 3.834554946422577, + "ce_loss_6": 3.444643223285675, + "epoch": 0.721, + "grad_norm": 1048.0, + "kl_loss_12": 332.42013092041014, + "kl_loss_17": 86.5483169555664, + "kl_loss_3": 1854.5558959960938, + "kl_loss_6": 1068.553970336914, + "learning_rate": 0.00018349377309556487, + "loss": 837.7305, + "step": 7210 + }, + { + "ce_loss_12": 3.0995816230773925, + "ce_loss_17": 2.9839762926101683, + "ce_loss_23": 2.936460256576538, + "ce_loss_3": 3.8673662662506105, + "ce_loss_6": 3.4319155335426332, + "epoch": 0.722, + "grad_norm": 1360.0, + "kl_loss_12": 350.46849212646487, + "kl_loss_17": 88.42042655944825, + "kl_loss_3": 2043.6094055175781, + "kl_loss_6": 1153.0741485595704, + "learning_rate": 0.00018226706694758193, + "loss": 891.0816, + "step": 7220 + }, + { + "ce_loss_12": 3.170437383651733, + "ce_loss_17": 3.0576380372047423, + "ce_loss_23": 3.010854959487915, + "ce_loss_3": 3.889014732837677, + "ce_loss_6": 3.4898629069328306, + "epoch": 0.723, + "grad_norm": 968.0, + "kl_loss_12": 346.222639465332, + "kl_loss_17": 87.71672706604004, + "kl_loss_3": 1940.7088317871094, + "kl_loss_6": 1117.3578521728516, + "learning_rate": 0.0001810435603667075, + "loss": 890.0791, + "step": 7230 + }, + { + "ce_loss_12": 3.0272300839424133, + "ce_loss_17": 2.91408885717392, + "ce_loss_23": 2.866550886631012, + "ce_loss_3": 3.7645853042602537, + "ce_loss_6": 3.3573010325431825, + "epoch": 0.724, + "grad_norm": 1200.0, + "kl_loss_12": 335.91829986572264, + "kl_loss_17": 85.53136253356934, + "kl_loss_3": 1942.4877502441407, + "kl_loss_6": 1104.8149505615233, + "learning_rate": 0.0001798232656736389, + "loss": 885.0737, + "step": 7240 + }, + { + "ce_loss_12": 3.1877684593200684, + "ce_loss_17": 3.0757036089897154, + "ce_loss_23": 3.0268973112106323, + "ce_loss_3": 3.8765776991844176, + "ce_loss_6": 3.4895286560058594, + "epoch": 0.725, + "grad_norm": 1008.0, + "kl_loss_12": 334.43251953125, + "kl_loss_17": 87.21447486877442, + "kl_loss_3": 1844.7488708496094, + "kl_loss_6": 1054.6623291015626, + "learning_rate": 0.0001786061951567303, + "loss": 851.126, + "step": 7250 + }, + { + "ce_loss_12": 3.1058161854743958, + "ce_loss_17": 2.985197162628174, + "ce_loss_23": 2.93839852809906, + "ce_loss_3": 3.833547604084015, + "ce_loss_6": 3.426920509338379, + "epoch": 0.726, + "grad_norm": 1168.0, + "kl_loss_12": 345.34449768066406, + "kl_loss_17": 88.04669914245605, + "kl_loss_3": 1946.1626953125, + "kl_loss_6": 1114.6494232177733, + "learning_rate": 0.00017739236107186857, + "loss": 881.4555, + "step": 7260 + }, + { + "ce_loss_12": 3.193452262878418, + "ce_loss_17": 3.0852574586868284, + "ce_loss_23": 3.0407148003578186, + "ce_loss_3": 3.8763112664222716, + "ce_loss_6": 3.491565001010895, + "epoch": 0.727, + "grad_norm": 1048.0, + "kl_loss_12": 330.26270599365233, + "kl_loss_17": 85.11148948669434, + "kl_loss_3": 1837.509814453125, + "kl_loss_6": 1056.963153076172, + "learning_rate": 0.00017618177564234904, + "loss": 842.5387, + "step": 7270 + }, + { + "ce_loss_12": 3.164004623889923, + "ce_loss_17": 3.053889298439026, + "ce_loss_23": 3.007025444507599, + "ce_loss_3": 3.852389669418335, + "ce_loss_6": 3.4695786118507383, + "epoch": 0.728, + "grad_norm": 832.0, + "kl_loss_12": 326.8455780029297, + "kl_loss_17": 84.78852577209473, + "kl_loss_3": 1833.0364624023437, + "kl_loss_6": 1058.3569793701172, + "learning_rate": 0.00017497445105875377, + "loss": 844.2723, + "step": 7280 + }, + { + "ce_loss_12": 3.0891406416893004, + "ce_loss_17": 2.967900788784027, + "ce_loss_23": 2.9209889650344847, + "ce_loss_3": 3.8261526226997375, + "ce_loss_6": 3.4110962629318236, + "epoch": 0.729, + "grad_norm": 1232.0, + "kl_loss_12": 348.3873992919922, + "kl_loss_17": 88.26057052612305, + "kl_loss_3": 1980.3658569335937, + "kl_loss_6": 1120.3544342041016, + "learning_rate": 0.000173770399478828, + "loss": 873.916, + "step": 7290 + }, + { + "ce_loss_12": 3.0114962816238404, + "ce_loss_17": 2.900802159309387, + "ce_loss_23": 2.8546555995941163, + "ce_loss_3": 3.731195402145386, + "ce_loss_6": 3.3254734754562376, + "epoch": 0.73, + "grad_norm": 1344.0, + "kl_loss_12": 330.6724685668945, + "kl_loss_17": 84.6192008972168, + "kl_loss_3": 1910.844775390625, + "kl_loss_6": 1080.101806640625, + "learning_rate": 0.0001725696330273575, + "loss": 879.4715, + "step": 7300 + }, + { + "ce_loss_12": 3.1867899537086486, + "ce_loss_17": 3.0713447093963624, + "ce_loss_23": 3.0244084358215333, + "ce_loss_3": 3.8816495537757874, + "ce_loss_6": 3.4958046793937685, + "epoch": 0.731, + "grad_norm": 1128.0, + "kl_loss_12": 329.4728759765625, + "kl_loss_17": 85.06861877441406, + "kl_loss_3": 1856.8272705078125, + "kl_loss_6": 1061.183837890625, + "learning_rate": 0.00017137216379604724, + "loss": 838.392, + "step": 7310 + }, + { + "ce_loss_12": 3.0677610874176025, + "ce_loss_17": 2.9559975385665895, + "ce_loss_23": 2.9073064804077147, + "ce_loss_3": 3.7928367018699647, + "ce_loss_6": 3.3871406435966493, + "epoch": 0.732, + "grad_norm": 1056.0, + "kl_loss_12": 333.28870391845703, + "kl_loss_17": 86.70836715698242, + "kl_loss_3": 1921.3812133789063, + "kl_loss_6": 1090.7506713867188, + "learning_rate": 0.00017017800384339925, + "loss": 863.3447, + "step": 7320 + }, + { + "ce_loss_12": 3.0337472200393676, + "ce_loss_17": 2.9114068865776064, + "ce_loss_23": 2.862027609348297, + "ce_loss_3": 3.785917317867279, + "ce_loss_6": 3.3646507382392885, + "epoch": 0.733, + "grad_norm": 1120.0, + "kl_loss_12": 343.82637176513674, + "kl_loss_17": 86.75051422119141, + "kl_loss_3": 1992.6470275878905, + "kl_loss_6": 1134.6840881347657, + "learning_rate": 0.00016898716519459073, + "loss": 862.7164, + "step": 7330 + }, + { + "ce_loss_12": 3.1520249962806703, + "ce_loss_17": 3.0307239174842833, + "ce_loss_23": 2.9793155193328857, + "ce_loss_3": 3.9048298597335815, + "ce_loss_6": 3.485947072505951, + "epoch": 0.734, + "grad_norm": 960.0, + "kl_loss_12": 353.37522888183594, + "kl_loss_17": 90.54953956604004, + "kl_loss_3": 1989.7527282714843, + "kl_loss_6": 1132.9010803222657, + "learning_rate": 0.00016779965984135375, + "loss": 875.7061, + "step": 7340 + }, + { + "ce_loss_12": 3.0715187668800352, + "ce_loss_17": 2.9542425632476808, + "ce_loss_23": 2.907315993309021, + "ce_loss_3": 3.7954859375953673, + "ce_loss_6": 3.3883111596107485, + "epoch": 0.735, + "grad_norm": 1184.0, + "kl_loss_12": 332.02828521728514, + "kl_loss_17": 85.16774787902833, + "kl_loss_3": 1912.3802062988282, + "kl_loss_6": 1084.7641662597657, + "learning_rate": 0.00016661549974185424, + "loss": 856.5238, + "step": 7350 + }, + { + "ce_loss_12": 3.0979945421218873, + "ce_loss_17": 2.9852666735649107, + "ce_loss_23": 2.9374090909957884, + "ce_loss_3": 3.814216876029968, + "ce_loss_6": 3.4164140701293944, + "epoch": 0.736, + "grad_norm": 1032.0, + "kl_loss_12": 340.04807434082034, + "kl_loss_17": 88.28477745056152, + "kl_loss_3": 1913.0336975097657, + "kl_loss_6": 1095.8302642822266, + "learning_rate": 0.00016543469682057105, + "loss": 852.7598, + "step": 7360 + }, + { + "ce_loss_12": 3.1197781205177306, + "ce_loss_17": 3.0080280542373656, + "ce_loss_23": 2.958756446838379, + "ce_loss_3": 3.840847039222717, + "ce_loss_6": 3.441115641593933, + "epoch": 0.737, + "grad_norm": 924.0, + "kl_loss_12": 341.9563293457031, + "kl_loss_17": 88.53944358825683, + "kl_loss_3": 1920.9719665527343, + "kl_loss_6": 1102.9890533447265, + "learning_rate": 0.00016425726296817632, + "loss": 859.2916, + "step": 7370 + }, + { + "ce_loss_12": 3.1310214161872865, + "ce_loss_17": 3.0205642342567445, + "ce_loss_23": 2.972715675830841, + "ce_loss_3": 3.8499599575996397, + "ce_loss_6": 3.447257339954376, + "epoch": 0.738, + "grad_norm": 1072.0, + "kl_loss_12": 334.7860809326172, + "kl_loss_17": 86.60757408142089, + "kl_loss_3": 1892.2875061035156, + "kl_loss_6": 1087.2785888671874, + "learning_rate": 0.00016308321004141607, + "loss": 857.2181, + "step": 7380 + }, + { + "ce_loss_12": 3.0963312149047852, + "ce_loss_17": 2.9749263405799864, + "ce_loss_23": 2.9255574226379393, + "ce_loss_3": 3.832499289512634, + "ce_loss_6": 3.4212956547737123, + "epoch": 0.739, + "grad_norm": 948.0, + "kl_loss_12": 348.71044921875, + "kl_loss_17": 90.2692081451416, + "kl_loss_3": 1962.9808471679687, + "kl_loss_6": 1123.849609375, + "learning_rate": 0.00016191254986299043, + "loss": 862.7074, + "step": 7390 + }, + { + "ce_loss_12": 3.1230055689811707, + "ce_loss_17": 3.017664575576782, + "ce_loss_23": 2.9723396420478823, + "ce_loss_3": 3.840611970424652, + "ce_loss_6": 3.440969097614288, + "epoch": 0.74, + "grad_norm": 1488.0, + "kl_loss_12": 331.56529541015624, + "kl_loss_17": 84.85293769836426, + "kl_loss_3": 1911.277490234375, + "kl_loss_6": 1093.4230743408202, + "learning_rate": 0.00016074529422143398, + "loss": 870.5579, + "step": 7400 + }, + { + "ce_loss_12": 3.0909118056297302, + "ce_loss_17": 2.977470803260803, + "ce_loss_23": 2.930717635154724, + "ce_loss_3": 3.830814278125763, + "ce_loss_6": 3.4192137718200684, + "epoch": 0.741, + "grad_norm": 3680.0, + "kl_loss_12": 342.2099548339844, + "kl_loss_17": 88.08484802246093, + "kl_loss_3": 1951.8666381835938, + "kl_loss_6": 1113.004623413086, + "learning_rate": 0.0001595814548709983, + "loss": 878.5271, + "step": 7410 + }, + { + "ce_loss_12": 3.160782778263092, + "ce_loss_17": 3.0384599566459656, + "ce_loss_23": 2.989315950870514, + "ce_loss_3": 3.892155075073242, + "ce_loss_6": 3.485103762149811, + "epoch": 0.742, + "grad_norm": 1264.0, + "kl_loss_12": 353.0607345581055, + "kl_loss_17": 90.35011444091796, + "kl_loss_3": 1965.493115234375, + "kl_loss_6": 1121.4259765625, + "learning_rate": 0.00015842104353153285, + "loss": 877.4688, + "step": 7420 + }, + { + "ce_loss_12": 3.1701106309890745, + "ce_loss_17": 3.052255642414093, + "ce_loss_23": 3.003518533706665, + "ce_loss_3": 3.886456573009491, + "ce_loss_6": 3.490762734413147, + "epoch": 0.743, + "grad_norm": 980.0, + "kl_loss_12": 343.64434204101565, + "kl_loss_17": 88.13621711730957, + "kl_loss_3": 1922.4880676269531, + "kl_loss_6": 1111.2306365966797, + "learning_rate": 0.0001572640718883667, + "loss": 884.9933, + "step": 7430 + }, + { + "ce_loss_12": 3.1077826380729676, + "ce_loss_17": 2.9971907138824463, + "ce_loss_23": 2.952656555175781, + "ce_loss_3": 3.8170455574989317, + "ce_loss_6": 3.420013201236725, + "epoch": 0.744, + "grad_norm": 1032.0, + "kl_loss_12": 331.71873168945314, + "kl_loss_17": 84.61126899719238, + "kl_loss_3": 1899.858135986328, + "kl_loss_6": 1076.7710723876953, + "learning_rate": 0.0001561105515921915, + "loss": 873.8502, + "step": 7440 + }, + { + "ce_loss_12": 2.9765826940536497, + "ce_loss_17": 2.858693015575409, + "ce_loss_23": 2.815770745277405, + "ce_loss_3": 3.734588646888733, + "ce_loss_6": 3.314176118373871, + "epoch": 0.745, + "grad_norm": 1072.0, + "kl_loss_12": 336.96295471191405, + "kl_loss_17": 84.19045066833496, + "kl_loss_3": 2006.8513732910155, + "kl_loss_6": 1138.3247772216796, + "learning_rate": 0.0001549604942589441, + "loss": 870.7094, + "step": 7450 + }, + { + "ce_loss_12": 3.143989932537079, + "ce_loss_17": 3.035001349449158, + "ce_loss_23": 2.9899677276611327, + "ce_loss_3": 3.828385293483734, + "ce_loss_6": 3.4482077240943907, + "epoch": 0.746, + "grad_norm": 876.0, + "kl_loss_12": 324.0212677001953, + "kl_loss_17": 84.12775382995605, + "kl_loss_3": 1829.0653442382813, + "kl_loss_6": 1048.3529968261719, + "learning_rate": 0.00015381391146968864, + "loss": 841.7859, + "step": 7460 + }, + { + "ce_loss_12": 3.1158527016639708, + "ce_loss_17": 3.0048242568969727, + "ce_loss_23": 2.9606425762176514, + "ce_loss_3": 3.841553473472595, + "ce_loss_6": 3.434686744213104, + "epoch": 0.747, + "grad_norm": 876.0, + "kl_loss_12": 329.89722595214846, + "kl_loss_17": 83.96846466064453, + "kl_loss_3": 1906.8902648925782, + "kl_loss_6": 1084.9622039794922, + "learning_rate": 0.00015267081477050133, + "loss": 861.179, + "step": 7470 + }, + { + "ce_loss_12": 3.213250148296356, + "ce_loss_17": 3.096405303478241, + "ce_loss_23": 3.0469411969184876, + "ce_loss_3": 3.907569968700409, + "ce_loss_6": 3.520617687702179, + "epoch": 0.748, + "grad_norm": 700.0, + "kl_loss_12": 343.44225158691404, + "kl_loss_17": 89.3300106048584, + "kl_loss_3": 1879.0430603027344, + "kl_loss_6": 1088.609292602539, + "learning_rate": 0.00015153121567235335, + "loss": 846.3131, + "step": 7480 + }, + { + "ce_loss_12": 3.1084712862968447, + "ce_loss_17": 2.9978137254714965, + "ce_loss_23": 2.9507338762283326, + "ce_loss_3": 3.8373326897621154, + "ce_loss_6": 3.4332410097122192, + "epoch": 0.749, + "grad_norm": 884.0, + "kl_loss_12": 339.2166152954102, + "kl_loss_17": 86.84514045715332, + "kl_loss_3": 1947.7872985839845, + "kl_loss_6": 1114.5451385498047, + "learning_rate": 0.00015039512565099468, + "loss": 846.359, + "step": 7490 + }, + { + "ce_loss_12": 3.1697430849075316, + "ce_loss_17": 3.0568503975868224, + "ce_loss_23": 3.0112765431404114, + "ce_loss_3": 3.877428340911865, + "ce_loss_6": 3.4780817866325378, + "epoch": 0.75, + "grad_norm": 1216.0, + "kl_loss_12": 334.6671508789062, + "kl_loss_17": 86.38764419555665, + "kl_loss_3": 1901.6724182128905, + "kl_loss_6": 1086.621160888672, + "learning_rate": 0.00014926255614683932, + "loss": 888.325, + "step": 7500 + }, + { + "ce_loss_12": 3.1050979256629945, + "ce_loss_17": 2.9946041226387026, + "ce_loss_23": 2.9476001858711243, + "ce_loss_3": 3.813271498680115, + "ce_loss_6": 3.409119975566864, + "epoch": 0.751, + "grad_norm": 944.0, + "kl_loss_12": 335.94759063720704, + "kl_loss_17": 85.60544509887696, + "kl_loss_3": 1916.4962829589845, + "kl_loss_6": 1076.0567474365234, + "learning_rate": 0.0001481335185648498, + "loss": 862.1184, + "step": 7510 + }, + { + "ce_loss_12": 3.128680205345154, + "ce_loss_17": 3.0160374522209166, + "ce_loss_23": 2.9687310814857484, + "ce_loss_3": 3.846818244457245, + "ce_loss_6": 3.4491943359375, + "epoch": 0.752, + "grad_norm": 876.0, + "kl_loss_12": 336.658674621582, + "kl_loss_17": 85.4277328491211, + "kl_loss_3": 1906.6753662109375, + "kl_loss_6": 1090.7926818847657, + "learning_rate": 0.0001470080242744218, + "loss": 851.4889, + "step": 7520 + }, + { + "ce_loss_12": 3.1233017206192017, + "ce_loss_17": 3.0149004459381104, + "ce_loss_23": 2.9699440360069276, + "ce_loss_3": 3.8468761801719666, + "ce_loss_6": 3.4403663873672485, + "epoch": 0.753, + "grad_norm": 1112.0, + "kl_loss_12": 329.28695526123045, + "kl_loss_17": 84.30590324401855, + "kl_loss_3": 1924.2193359375, + "kl_loss_6": 1089.3094604492187, + "learning_rate": 0.0001458860846092705, + "loss": 864.8695, + "step": 7530 + }, + { + "ce_loss_12": 3.1665912747383116, + "ce_loss_17": 3.0538085103034973, + "ce_loss_23": 3.006639873981476, + "ce_loss_3": 3.859456181526184, + "ce_loss_6": 3.472790026664734, + "epoch": 0.754, + "grad_norm": 1088.0, + "kl_loss_12": 331.51774444580076, + "kl_loss_17": 86.2380859375, + "kl_loss_3": 1861.3445129394531, + "kl_loss_6": 1068.8755920410156, + "learning_rate": 0.00014476771086731566, + "loss": 836.1793, + "step": 7540 + }, + { + "ce_loss_12": 3.250907337665558, + "ce_loss_17": 3.1357580065727233, + "ce_loss_23": 3.0857962131500245, + "ce_loss_3": 3.951405715942383, + "ce_loss_6": 3.560814690589905, + "epoch": 0.755, + "grad_norm": 984.0, + "kl_loss_12": 341.4471206665039, + "kl_loss_17": 91.6925064086914, + "kl_loss_3": 1883.063543701172, + "kl_loss_6": 1079.9726928710938, + "learning_rate": 0.00014365291431056872, + "loss": 877.0996, + "step": 7550 + }, + { + "ce_loss_12": 3.0967212200164793, + "ce_loss_17": 2.9790724515914917, + "ce_loss_23": 2.9304062604904173, + "ce_loss_3": 3.826195013523102, + "ce_loss_6": 3.4224355697631834, + "epoch": 0.756, + "grad_norm": 1112.0, + "kl_loss_12": 350.49027557373046, + "kl_loss_17": 89.87589607238769, + "kl_loss_3": 1960.891094970703, + "kl_loss_6": 1128.4765869140624, + "learning_rate": 0.00014254170616501827, + "loss": 871.8258, + "step": 7560 + }, + { + "ce_loss_12": 3.0440022349357605, + "ce_loss_17": 2.922041583061218, + "ce_loss_23": 2.870465707778931, + "ce_loss_3": 3.807706618309021, + "ce_loss_6": 3.387653279304504, + "epoch": 0.757, + "grad_norm": 1232.0, + "kl_loss_12": 355.4050354003906, + "kl_loss_17": 90.11079940795898, + "kl_loss_3": 2026.7626831054688, + "kl_loss_6": 1164.5902770996095, + "learning_rate": 0.0001414340976205183, + "loss": 902.8793, + "step": 7570 + }, + { + "ce_loss_12": 3.054443883895874, + "ce_loss_17": 2.935328185558319, + "ce_loss_23": 2.888164484500885, + "ce_loss_3": 3.7937758922576905, + "ce_loss_6": 3.3804110169410704, + "epoch": 0.758, + "grad_norm": 1248.0, + "kl_loss_12": 341.5797866821289, + "kl_loss_17": 86.31509780883789, + "kl_loss_3": 1966.69814453125, + "kl_loss_6": 1117.404315185547, + "learning_rate": 0.00014033009983067452, + "loss": 866.4357, + "step": 7580 + }, + { + "ce_loss_12": 3.192322778701782, + "ce_loss_17": 3.083972525596619, + "ce_loss_23": 3.0374709248542784, + "ce_loss_3": 3.8759530782699585, + "ce_loss_6": 3.4928683996200562, + "epoch": 0.759, + "grad_norm": 1184.0, + "kl_loss_12": 327.23913726806643, + "kl_loss_17": 84.71457977294922, + "kl_loss_3": 1846.6355773925782, + "kl_loss_6": 1052.3153900146485, + "learning_rate": 0.00013922972391273224, + "loss": 846.8314, + "step": 7590 + }, + { + "ce_loss_12": 3.1961251497268677, + "ce_loss_17": 3.0865646481513975, + "ce_loss_23": 3.0396140217781067, + "ce_loss_3": 3.923987865447998, + "ce_loss_6": 3.4996038913726806, + "epoch": 0.76, + "grad_norm": 1368.0, + "kl_loss_12": 335.3072052001953, + "kl_loss_17": 87.41547546386718, + "kl_loss_3": 1925.2472839355469, + "kl_loss_6": 1060.9364440917968, + "learning_rate": 0.0001381329809474649, + "loss": 860.7051, + "step": 7600 + }, + { + "ce_loss_12": 3.120570731163025, + "ce_loss_17": 2.9973181962966917, + "ce_loss_23": 2.946481966972351, + "ce_loss_3": 3.8685464024543763, + "ce_loss_6": 3.451310873031616, + "epoch": 0.761, + "grad_norm": 1256.0, + "kl_loss_12": 350.7531967163086, + "kl_loss_17": 89.52051429748535, + "kl_loss_3": 1992.3995178222656, + "kl_loss_6": 1138.2024780273437, + "learning_rate": 0.0001370398819790621, + "loss": 880.6619, + "step": 7610 + }, + { + "ce_loss_12": 3.2387707591056825, + "ce_loss_17": 3.125054621696472, + "ce_loss_23": 3.0756537199020384, + "ce_loss_3": 3.933832108974457, + "ce_loss_6": 3.54216411113739, + "epoch": 0.762, + "grad_norm": 1536.0, + "kl_loss_12": 336.23710479736326, + "kl_loss_17": 87.17769012451171, + "kl_loss_3": 1866.5935974121094, + "kl_loss_6": 1064.9635650634766, + "learning_rate": 0.00013595043801501794, + "loss": 841.4134, + "step": 7620 + }, + { + "ce_loss_12": 3.0535167813301087, + "ce_loss_17": 2.9316890597343446, + "ce_loss_23": 2.8828993320465086, + "ce_loss_3": 3.835411822795868, + "ce_loss_6": 3.4029742956161497, + "epoch": 0.763, + "grad_norm": 1216.0, + "kl_loss_12": 349.1750427246094, + "kl_loss_17": 88.26301383972168, + "kl_loss_3": 2051.720281982422, + "kl_loss_6": 1157.8002868652343, + "learning_rate": 0.00013486466002602133, + "loss": 884.99, + "step": 7630 + }, + { + "ce_loss_12": 3.1495963454246523, + "ce_loss_17": 3.039739465713501, + "ce_loss_23": 2.994281256198883, + "ce_loss_3": 3.836189568042755, + "ce_loss_6": 3.449015259742737, + "epoch": 0.764, + "grad_norm": 936.0, + "kl_loss_12": 333.25688018798826, + "kl_loss_17": 85.82595405578613, + "kl_loss_3": 1866.9323425292969, + "kl_loss_6": 1069.9278411865234, + "learning_rate": 0.00013378255894584462, + "loss": 871.2354, + "step": 7640 + }, + { + "ce_loss_12": 3.094685697555542, + "ce_loss_17": 2.97695597410202, + "ce_loss_23": 2.9276471614837645, + "ce_loss_3": 3.837893486022949, + "ce_loss_6": 3.421032023429871, + "epoch": 0.765, + "grad_norm": 1072.0, + "kl_loss_12": 346.69294891357424, + "kl_loss_17": 89.06358261108399, + "kl_loss_3": 1963.3975646972656, + "kl_loss_6": 1115.4526763916015, + "learning_rate": 0.0001327041456712334, + "loss": 876.7345, + "step": 7650 + }, + { + "ce_loss_12": 3.131850790977478, + "ce_loss_17": 3.0180388569831846, + "ce_loss_23": 2.969780373573303, + "ce_loss_3": 3.849341535568237, + "ce_loss_6": 3.4498900294303896, + "epoch": 0.766, + "grad_norm": 1232.0, + "kl_loss_12": 344.56713409423827, + "kl_loss_17": 88.26175918579102, + "kl_loss_3": 1933.8005432128907, + "kl_loss_6": 1108.826953125, + "learning_rate": 0.00013162943106179747, + "loss": 874.494, + "step": 7660 + }, + { + "ce_loss_12": 3.1083985328674317, + "ce_loss_17": 2.994685435295105, + "ce_loss_23": 2.948492395877838, + "ce_loss_3": 3.8132495284080505, + "ce_loss_6": 3.419460141658783, + "epoch": 0.767, + "grad_norm": 1168.0, + "kl_loss_12": 333.51930847167966, + "kl_loss_17": 86.19657096862792, + "kl_loss_3": 1899.5893188476562, + "kl_loss_6": 1087.605291748047, + "learning_rate": 0.00013055842593990132, + "loss": 856.2435, + "step": 7670 + }, + { + "ce_loss_12": 3.061744201183319, + "ce_loss_17": 2.94765499830246, + "ce_loss_23": 2.902495539188385, + "ce_loss_3": 3.770342934131622, + "ce_loss_6": 3.374074196815491, + "epoch": 0.768, + "grad_norm": 864.0, + "kl_loss_12": 333.0192306518555, + "kl_loss_17": 84.84715881347657, + "kl_loss_3": 1887.3901062011719, + "kl_loss_6": 1079.4616302490235, + "learning_rate": 0.00012949114109055414, + "loss": 870.6168, + "step": 7680 + }, + { + "ce_loss_12": 3.1028016567230225, + "ce_loss_17": 2.9852315068244932, + "ce_loss_23": 2.938108468055725, + "ce_loss_3": 3.8279983043670653, + "ce_loss_6": 3.423141694068909, + "epoch": 0.769, + "grad_norm": 988.0, + "kl_loss_12": 343.0029541015625, + "kl_loss_17": 87.91576690673828, + "kl_loss_3": 1937.405712890625, + "kl_loss_6": 1109.458090209961, + "learning_rate": 0.00012842758726130281, + "loss": 876.5707, + "step": 7690 + }, + { + "ce_loss_12": 3.149825596809387, + "ce_loss_17": 3.0305551290512085, + "ce_loss_23": 2.9809163212776184, + "ce_loss_3": 3.894960582256317, + "ce_loss_6": 3.4808794498443603, + "epoch": 0.77, + "grad_norm": 1496.0, + "kl_loss_12": 347.4535766601563, + "kl_loss_17": 89.44935035705566, + "kl_loss_3": 1967.200506591797, + "kl_loss_6": 1123.7082885742188, + "learning_rate": 0.00012736777516212267, + "loss": 862.7064, + "step": 7700 + }, + { + "ce_loss_12": 3.1452438831329346, + "ce_loss_17": 3.028602695465088, + "ce_loss_23": 2.9798243761062624, + "ce_loss_3": 3.8623616099357605, + "ce_loss_6": 3.4660856246948244, + "epoch": 0.771, + "grad_norm": 1016.0, + "kl_loss_12": 346.66766967773435, + "kl_loss_17": 88.37929077148438, + "kl_loss_3": 1928.3756225585937, + "kl_loss_6": 1109.7884368896484, + "learning_rate": 0.00012631171546530968, + "loss": 855.5857, + "step": 7710 + }, + { + "ce_loss_12": 3.151771867275238, + "ce_loss_17": 3.0342516899108887, + "ce_loss_23": 2.983285999298096, + "ce_loss_3": 3.868927764892578, + "ce_loss_6": 3.4712912917137144, + "epoch": 0.772, + "grad_norm": 952.0, + "kl_loss_12": 347.64147186279297, + "kl_loss_17": 90.16236190795898, + "kl_loss_3": 1936.895196533203, + "kl_loss_6": 1113.5256469726562, + "learning_rate": 0.00012525941880537307, + "loss": 878.1221, + "step": 7720 + }, + { + "ce_loss_12": 3.181893002986908, + "ce_loss_17": 3.068693768978119, + "ce_loss_23": 3.0194692611694336, + "ce_loss_3": 3.8950560331344604, + "ce_loss_6": 3.498482322692871, + "epoch": 0.773, + "grad_norm": 1004.0, + "kl_loss_12": 336.63256683349607, + "kl_loss_17": 86.24755783081055, + "kl_loss_3": 1900.8433837890625, + "kl_loss_6": 1094.9439544677734, + "learning_rate": 0.00012421089577892869, + "loss": 859.998, + "step": 7730 + }, + { + "ce_loss_12": 3.137857365608215, + "ce_loss_17": 3.022036409378052, + "ce_loss_23": 2.974807071685791, + "ce_loss_3": 3.8699662685394287, + "ce_loss_6": 3.463915300369263, + "epoch": 0.774, + "grad_norm": 1280.0, + "kl_loss_12": 343.84002380371095, + "kl_loss_17": 86.91533393859864, + "kl_loss_3": 1949.8188903808593, + "kl_loss_6": 1111.0187286376954, + "learning_rate": 0.0001231661569445919, + "loss": 872.7768, + "step": 7740 + }, + { + "ce_loss_12": 3.00217719078064, + "ce_loss_17": 2.8886006474494934, + "ce_loss_23": 2.842159616947174, + "ce_loss_3": 3.737683582305908, + "ce_loss_6": 3.327630090713501, + "epoch": 0.775, + "grad_norm": 1088.0, + "kl_loss_12": 338.36918182373046, + "kl_loss_17": 85.87432556152343, + "kl_loss_3": 1941.089581298828, + "kl_loss_6": 1107.751934814453, + "learning_rate": 0.00012212521282287093, + "loss": 881.309, + "step": 7750 + }, + { + "ce_loss_12": 3.144416570663452, + "ce_loss_17": 3.0234966397285463, + "ce_loss_23": 2.975559389591217, + "ce_loss_3": 3.850447392463684, + "ce_loss_6": 3.460440826416016, + "epoch": 0.776, + "grad_norm": 1232.0, + "kl_loss_12": 345.28298034667966, + "kl_loss_17": 89.02879981994629, + "kl_loss_3": 1899.7856140136719, + "kl_loss_6": 1097.8369415283203, + "learning_rate": 0.00012108807389606158, + "loss": 873.9021, + "step": 7760 + }, + { + "ce_loss_12": 3.1395771861076356, + "ce_loss_17": 3.029573917388916, + "ce_loss_23": 2.9835257172584533, + "ce_loss_3": 3.853815996646881, + "ce_loss_6": 3.4518202900886537, + "epoch": 0.777, + "grad_norm": 976.0, + "kl_loss_12": 333.39319763183596, + "kl_loss_17": 85.62209701538086, + "kl_loss_3": 1905.5265747070312, + "kl_loss_6": 1087.9601959228517, + "learning_rate": 0.00012005475060814159, + "loss": 858.8643, + "step": 7770 + }, + { + "ce_loss_12": 3.0747068762779235, + "ce_loss_17": 2.959312844276428, + "ce_loss_23": 2.9127468824386598, + "ce_loss_3": 3.827551472187042, + "ce_loss_6": 3.4109951853752136, + "epoch": 0.778, + "grad_norm": 1104.0, + "kl_loss_12": 343.21605529785154, + "kl_loss_17": 87.49714851379395, + "kl_loss_3": 1972.1259399414062, + "kl_loss_6": 1126.4033752441405, + "learning_rate": 0.00011902525336466464, + "loss": 873.4488, + "step": 7780 + }, + { + "ce_loss_12": 3.0730289697647093, + "ce_loss_17": 2.9537671327590944, + "ce_loss_23": 2.904254841804504, + "ce_loss_3": 3.8248645424842835, + "ce_loss_6": 3.4150359272956847, + "epoch": 0.779, + "grad_norm": 952.0, + "kl_loss_12": 349.9987197875977, + "kl_loss_17": 89.52234764099121, + "kl_loss_3": 1999.2891967773437, + "kl_loss_6": 1142.511083984375, + "learning_rate": 0.00011799959253265668, + "loss": 875.4477, + "step": 7790 + }, + { + "ce_loss_12": 3.123792839050293, + "ce_loss_17": 3.0112548232078553, + "ce_loss_23": 2.9621198654174803, + "ce_loss_3": 3.8543333888053892, + "ce_loss_6": 3.442795205116272, + "epoch": 0.78, + "grad_norm": 1176.0, + "kl_loss_12": 345.0049163818359, + "kl_loss_17": 89.56121673583985, + "kl_loss_3": 1956.1147583007812, + "kl_loss_6": 1116.1721801757812, + "learning_rate": 0.00011697777844051105, + "loss": 874.4314, + "step": 7800 + }, + { + "ce_loss_12": 3.1173406839370728, + "ce_loss_17": 3.001301908493042, + "ce_loss_23": 2.9508003115653993, + "ce_loss_3": 3.874517786502838, + "ce_loss_6": 3.4531596899032593, + "epoch": 0.781, + "grad_norm": 1240.0, + "kl_loss_12": 345.39916534423827, + "kl_loss_17": 89.18724632263184, + "kl_loss_3": 2014.996337890625, + "kl_loss_6": 1148.1400665283204, + "learning_rate": 0.00011595982137788402, + "loss": 883.8523, + "step": 7810 + }, + { + "ce_loss_12": 3.0881298422813415, + "ce_loss_17": 2.9777971744537353, + "ce_loss_23": 2.9313037514686586, + "ce_loss_3": 3.7849818110466003, + "ce_loss_6": 3.3961219668388365, + "epoch": 0.782, + "grad_norm": 1096.0, + "kl_loss_12": 333.4578491210938, + "kl_loss_17": 85.8090145111084, + "kl_loss_3": 1869.9925354003906, + "kl_loss_6": 1074.4398956298828, + "learning_rate": 0.00011494573159559212, + "loss": 856.6957, + "step": 7820 + }, + { + "ce_loss_12": 3.081245505809784, + "ce_loss_17": 2.9624974012374876, + "ce_loss_23": 2.9150551199913024, + "ce_loss_3": 3.804460048675537, + "ce_loss_6": 3.403433883190155, + "epoch": 0.783, + "grad_norm": 1128.0, + "kl_loss_12": 341.8298904418945, + "kl_loss_17": 88.31861038208008, + "kl_loss_3": 1931.8560729980468, + "kl_loss_6": 1109.651821899414, + "learning_rate": 0.00011393551930550828, + "loss": 881.5422, + "step": 7830 + }, + { + "ce_loss_12": 3.1982470750808716, + "ce_loss_17": 3.0831907868385313, + "ce_loss_23": 3.036144268512726, + "ce_loss_3": 3.900965392589569, + "ce_loss_6": 3.509566235542297, + "epoch": 0.784, + "grad_norm": 976.0, + "kl_loss_12": 338.9253555297852, + "kl_loss_17": 88.68736457824707, + "kl_loss_3": 1885.1073425292968, + "kl_loss_6": 1077.3505157470704, + "learning_rate": 0.00011292919468045875, + "loss": 855.8523, + "step": 7840 + }, + { + "ce_loss_12": 3.1600067257881164, + "ce_loss_17": 3.0468934297561647, + "ce_loss_23": 2.9999232292175293, + "ce_loss_3": 3.8751365542411804, + "ce_loss_6": 3.4719273805618287, + "epoch": 0.785, + "grad_norm": 740.0, + "kl_loss_12": 340.442431640625, + "kl_loss_17": 86.88551063537598, + "kl_loss_3": 1915.2488525390625, + "kl_loss_6": 1095.1250030517579, + "learning_rate": 0.00011192676785412154, + "loss": 854.8797, + "step": 7850 + }, + { + "ce_loss_12": 3.0997593998908997, + "ce_loss_17": 2.983069372177124, + "ce_loss_23": 2.9338279128074647, + "ce_loss_3": 3.8563276171684264, + "ce_loss_6": 3.438832497596741, + "epoch": 0.786, + "grad_norm": 1096.0, + "kl_loss_12": 342.4455062866211, + "kl_loss_17": 88.51511421203614, + "kl_loss_3": 1975.3369506835938, + "kl_loss_6": 1121.7781005859374, + "learning_rate": 0.00011092824892092374, + "loss": 877.8574, + "step": 7860 + }, + { + "ce_loss_12": 3.0392884254455566, + "ce_loss_17": 2.924478280544281, + "ce_loss_23": 2.878358209133148, + "ce_loss_3": 3.789666223526001, + "ce_loss_6": 3.373346519470215, + "epoch": 0.787, + "grad_norm": 944.0, + "kl_loss_12": 343.36658477783203, + "kl_loss_17": 86.76849899291992, + "kl_loss_3": 1991.3924743652344, + "kl_loss_6": 1129.0875396728516, + "learning_rate": 0.0001099336479359398, + "loss": 869.0258, + "step": 7870 + }, + { + "ce_loss_12": 3.153367805480957, + "ce_loss_17": 3.041387641429901, + "ce_loss_23": 2.9948896527290345, + "ce_loss_3": 3.8533716440200805, + "ce_loss_6": 3.459426212310791, + "epoch": 0.788, + "grad_norm": 868.0, + "kl_loss_12": 333.95757446289065, + "kl_loss_17": 86.1301658630371, + "kl_loss_3": 1893.079901123047, + "kl_loss_6": 1074.507797241211, + "learning_rate": 0.00010894297491479043, + "loss": 861.5433, + "step": 7880 + }, + { + "ce_loss_12": 3.138637971878052, + "ce_loss_17": 3.026522934436798, + "ce_loss_23": 2.980404424667358, + "ce_loss_3": 3.85728714466095, + "ce_loss_6": 3.4558101177215574, + "epoch": 0.789, + "grad_norm": 1048.0, + "kl_loss_12": 336.65147552490237, + "kl_loss_17": 86.43539237976074, + "kl_loss_3": 1911.0948852539063, + "kl_loss_6": 1093.9242156982423, + "learning_rate": 0.00010795623983354214, + "loss": 856.4976, + "step": 7890 + }, + { + "ce_loss_12": 3.048935604095459, + "ce_loss_17": 2.9298244953155517, + "ce_loss_23": 2.8829049706459045, + "ce_loss_3": 3.783765459060669, + "ce_loss_6": 3.3757433891296387, + "epoch": 0.79, + "grad_norm": 1064.0, + "kl_loss_12": 348.3014938354492, + "kl_loss_17": 89.91097488403321, + "kl_loss_3": 1957.4156311035156, + "kl_loss_6": 1125.400372314453, + "learning_rate": 0.00010697345262860636, + "loss": 867.2932, + "step": 7900 + }, + { + "ce_loss_12": 3.1738720655441286, + "ce_loss_17": 3.0629943370819093, + "ce_loss_23": 3.017685425281525, + "ce_loss_3": 3.871600615978241, + "ce_loss_6": 3.4759154319763184, + "epoch": 0.791, + "grad_norm": 964.0, + "kl_loss_12": 334.6929733276367, + "kl_loss_17": 86.53432846069336, + "kl_loss_3": 1884.543878173828, + "kl_loss_6": 1070.356005859375, + "learning_rate": 0.00010599462319663906, + "loss": 846.766, + "step": 7910 + }, + { + "ce_loss_12": 3.1430269718170165, + "ce_loss_17": 3.0329277157783507, + "ce_loss_23": 2.9859153389930726, + "ce_loss_3": 3.834847366809845, + "ce_loss_6": 3.4506487131118773, + "epoch": 0.792, + "grad_norm": 904.0, + "kl_loss_12": 331.15181121826174, + "kl_loss_17": 85.92568435668946, + "kl_loss_3": 1852.7552917480468, + "kl_loss_6": 1062.5147399902344, + "learning_rate": 0.00010501976139444191, + "loss": 840.7529, + "step": 7920 + }, + { + "ce_loss_12": 3.1669753551483155, + "ce_loss_17": 3.0577998042106627, + "ce_loss_23": 3.0104619026184083, + "ce_loss_3": 3.8746066093444824, + "ce_loss_6": 3.4819621324539183, + "epoch": 0.793, + "grad_norm": 1296.0, + "kl_loss_12": 332.4377197265625, + "kl_loss_17": 87.50224800109864, + "kl_loss_3": 1898.9986511230468, + "kl_loss_6": 1078.7464477539063, + "learning_rate": 0.0001040488770388625, + "loss": 866.2617, + "step": 7930 + }, + { + "ce_loss_12": 3.1284486651420593, + "ce_loss_17": 3.0125857830047607, + "ce_loss_23": 2.9665513753890993, + "ce_loss_3": 3.844206702709198, + "ce_loss_6": 3.443767809867859, + "epoch": 0.794, + "grad_norm": 1160.0, + "kl_loss_12": 340.0913330078125, + "kl_loss_17": 86.53030662536621, + "kl_loss_3": 1935.131103515625, + "kl_loss_6": 1105.029623413086, + "learning_rate": 0.00010308197990669538, + "loss": 862.9814, + "step": 7940 + }, + { + "ce_loss_12": 3.2380952715873716, + "ce_loss_17": 3.122182333469391, + "ce_loss_23": 3.070917046070099, + "ce_loss_3": 3.947544777393341, + "ce_loss_6": 3.5485092639923095, + "epoch": 0.795, + "grad_norm": 840.0, + "kl_loss_12": 347.7717346191406, + "kl_loss_17": 90.15585250854492, + "kl_loss_3": 1925.0963745117188, + "kl_loss_6": 1109.9462158203125, + "learning_rate": 0.0001021190797345839, + "loss": 859.2271, + "step": 7950 + }, + { + "ce_loss_12": 2.982831633090973, + "ce_loss_17": 2.862782156467438, + "ce_loss_23": 2.813780164718628, + "ce_loss_3": 3.741881549358368, + "ce_loss_6": 3.326892077922821, + "epoch": 0.796, + "grad_norm": 948.0, + "kl_loss_12": 356.4972442626953, + "kl_loss_17": 90.53372077941894, + "kl_loss_3": 2013.8299743652344, + "kl_loss_6": 1158.7713195800782, + "learning_rate": 0.00010116018621892236, + "loss": 880.7428, + "step": 7960 + }, + { + "ce_loss_12": 3.1805360555648803, + "ce_loss_17": 3.0628061056137086, + "ce_loss_23": 3.014175093173981, + "ce_loss_3": 3.913769769668579, + "ce_loss_6": 3.50421302318573, + "epoch": 0.797, + "grad_norm": 1168.0, + "kl_loss_12": 355.2409729003906, + "kl_loss_17": 92.6179973602295, + "kl_loss_3": 1972.5615600585938, + "kl_loss_6": 1139.47099609375, + "learning_rate": 0.00010020530901575753, + "loss": 861.009, + "step": 7970 + }, + { + "ce_loss_12": 3.196563351154327, + "ce_loss_17": 3.083208155632019, + "ce_loss_23": 3.0344853520393373, + "ce_loss_3": 3.9116249561309813, + "ce_loss_6": 3.5146140575408937, + "epoch": 0.798, + "grad_norm": 1020.0, + "kl_loss_12": 343.97747192382815, + "kl_loss_17": 88.96430740356445, + "kl_loss_3": 1921.1753173828124, + "kl_loss_6": 1103.5718902587892, + "learning_rate": 9.925445774069231e-05, + "loss": 848.7297, + "step": 7980 + }, + { + "ce_loss_12": 3.1499200463294983, + "ce_loss_17": 3.034737801551819, + "ce_loss_23": 2.986804115772247, + "ce_loss_3": 3.862005889415741, + "ce_loss_6": 3.4680118560791016, + "epoch": 0.799, + "grad_norm": 1004.0, + "kl_loss_12": 335.8933639526367, + "kl_loss_17": 87.28639183044433, + "kl_loss_3": 1894.1492004394531, + "kl_loss_6": 1091.5393951416015, + "learning_rate": 9.830764196878872e-05, + "loss": 839.0573, + "step": 7990 + }, + { + "ce_loss_12": 3.100402855873108, + "ce_loss_17": 2.9863922595977783, + "ce_loss_23": 2.9403823256492614, + "ce_loss_3": 3.817837917804718, + "ce_loss_6": 3.4157135605812075, + "epoch": 0.8, + "grad_norm": 868.0, + "kl_loss_12": 337.6185836791992, + "kl_loss_17": 86.11444549560547, + "kl_loss_3": 1953.2614868164062, + "kl_loss_6": 1107.330581665039, + "learning_rate": 9.736487123447069e-05, + "loss": 864.8945, + "step": 8000 + }, + { + "ce_loss_12": 3.0528087258338927, + "ce_loss_17": 2.940442717075348, + "ce_loss_23": 2.8930461049079894, + "ce_loss_3": 3.8335362315177917, + "ce_loss_6": 3.400530993938446, + "epoch": 0.801, + "grad_norm": 1272.0, + "kl_loss_12": 343.6899673461914, + "kl_loss_17": 87.17022972106933, + "kl_loss_3": 2055.389855957031, + "kl_loss_6": 1163.5688232421876, + "learning_rate": 9.642615503142926e-05, + "loss": 892.4299, + "step": 8010 + }, + { + "ce_loss_12": 3.114699602127075, + "ce_loss_17": 2.99845632314682, + "ce_loss_23": 2.949676924943924, + "ce_loss_3": 3.836538827419281, + "ce_loss_6": 3.439238655567169, + "epoch": 0.802, + "grad_norm": 888.0, + "kl_loss_12": 334.9020050048828, + "kl_loss_17": 86.2359634399414, + "kl_loss_3": 1921.4569213867187, + "kl_loss_6": 1090.2377807617188, + "learning_rate": 9.549150281252633e-05, + "loss": 850.6668, + "step": 8020 + }, + { + "ce_loss_12": 3.1417679309844972, + "ce_loss_17": 3.025522565841675, + "ce_loss_23": 2.9773662090301514, + "ce_loss_3": 3.8609896898269653, + "ce_loss_6": 3.461854100227356, + "epoch": 0.803, + "grad_norm": 1048.0, + "kl_loss_12": 341.2946090698242, + "kl_loss_17": 87.30495986938476, + "kl_loss_3": 1931.938671875, + "kl_loss_6": 1098.1311889648437, + "learning_rate": 9.4560923989699e-05, + "loss": 874.5707, + "step": 8030 + }, + { + "ce_loss_12": 3.132170486450195, + "ce_loss_17": 3.016941010951996, + "ce_loss_23": 2.9699743747711183, + "ce_loss_3": 3.850451099872589, + "ce_loss_6": 3.449750316143036, + "epoch": 0.804, + "grad_norm": 900.0, + "kl_loss_12": 342.9051147460938, + "kl_loss_17": 87.89902000427246, + "kl_loss_3": 1910.6194458007812, + "kl_loss_6": 1091.275827026367, + "learning_rate": 9.363442793386607e-05, + "loss": 880.9614, + "step": 8040 + }, + { + "ce_loss_12": 3.1098117351531984, + "ce_loss_17": 2.9859721541404722, + "ce_loss_23": 2.9361562609672545, + "ce_loss_3": 3.861330676078796, + "ce_loss_6": 3.4467449426651, + "epoch": 0.805, + "grad_norm": 1416.0, + "kl_loss_12": 350.2294982910156, + "kl_loss_17": 89.05590744018555, + "kl_loss_3": 1981.568341064453, + "kl_loss_6": 1135.2284088134766, + "learning_rate": 9.271202397483213e-05, + "loss": 859.5157, + "step": 8050 + }, + { + "ce_loss_12": 3.13518203496933, + "ce_loss_17": 3.0245556592941285, + "ce_loss_23": 2.97843199968338, + "ce_loss_3": 3.836519110202789, + "ce_loss_6": 3.4399279713630677, + "epoch": 0.806, + "grad_norm": 912.0, + "kl_loss_12": 335.34840240478513, + "kl_loss_17": 86.3288730621338, + "kl_loss_3": 1890.5824157714844, + "kl_loss_6": 1081.4476745605468, + "learning_rate": 9.179372140119524e-05, + "loss": 867.2266, + "step": 8060 + }, + { + "ce_loss_12": 3.0809193730354307, + "ce_loss_17": 2.9720325112342834, + "ce_loss_23": 2.925378918647766, + "ce_loss_3": 3.795903038978577, + "ce_loss_6": 3.3943702578544617, + "epoch": 0.807, + "grad_norm": 1048.0, + "kl_loss_12": 333.8464828491211, + "kl_loss_17": 85.5449089050293, + "kl_loss_3": 1908.2876770019532, + "kl_loss_6": 1088.5864166259767, + "learning_rate": 9.087952946025175e-05, + "loss": 870.8111, + "step": 8070 + }, + { + "ce_loss_12": 3.1861281633377074, + "ce_loss_17": 3.076117146015167, + "ce_loss_23": 3.0290566802024843, + "ce_loss_3": 3.8652841329574583, + "ce_loss_6": 3.4856042742729185, + "epoch": 0.808, + "grad_norm": 1000.0, + "kl_loss_12": 327.95980072021484, + "kl_loss_17": 85.12698631286621, + "kl_loss_3": 1835.6515197753906, + "kl_loss_6": 1050.8891906738281, + "learning_rate": 8.996945735790446e-05, + "loss": 856.2945, + "step": 8080 + }, + { + "ce_loss_12": 3.0848318457603456, + "ce_loss_17": 2.9725335121154783, + "ce_loss_23": 2.9265166401863096, + "ce_loss_3": 3.7977205395698546, + "ce_loss_6": 3.4063352584838866, + "epoch": 0.809, + "grad_norm": 1136.0, + "kl_loss_12": 336.2959716796875, + "kl_loss_17": 86.1030330657959, + "kl_loss_3": 1917.192578125, + "kl_loss_6": 1102.2066314697265, + "learning_rate": 8.906351425856951e-05, + "loss": 870.3629, + "step": 8090 + }, + { + "ce_loss_12": 3.071573090553284, + "ce_loss_17": 2.9595313549041746, + "ce_loss_23": 2.9116446137428285, + "ce_loss_3": 3.8096127271652223, + "ce_loss_6": 3.4040393114089964, + "epoch": 0.81, + "grad_norm": 1112.0, + "kl_loss_12": 340.50690307617185, + "kl_loss_17": 87.05605773925781, + "kl_loss_3": 1975.457861328125, + "kl_loss_6": 1130.7320648193358, + "learning_rate": 8.816170928508365e-05, + "loss": 881.4886, + "step": 8100 + }, + { + "ce_loss_12": 3.04113427400589, + "ce_loss_17": 2.9222721576690676, + "ce_loss_23": 2.875444030761719, + "ce_loss_3": 3.8006600379943847, + "ce_loss_6": 3.37364467382431, + "epoch": 0.811, + "grad_norm": 952.0, + "kl_loss_12": 346.2675354003906, + "kl_loss_17": 87.38955802917481, + "kl_loss_3": 2010.8611755371094, + "kl_loss_6": 1136.984552001953, + "learning_rate": 8.7264051518613e-05, + "loss": 873.9125, + "step": 8110 + }, + { + "ce_loss_12": 3.1181398153305055, + "ce_loss_17": 3.007498395442963, + "ce_loss_23": 2.962209713459015, + "ce_loss_3": 3.821540927886963, + "ce_loss_6": 3.4283391356468202, + "epoch": 0.812, + "grad_norm": 1064.0, + "kl_loss_12": 328.8416946411133, + "kl_loss_17": 84.78668899536133, + "kl_loss_3": 1879.4911193847656, + "kl_loss_6": 1073.913980102539, + "learning_rate": 8.637054999856148e-05, + "loss": 858.3178, + "step": 8120 + }, + { + "ce_loss_12": 3.1156737446784972, + "ce_loss_17": 2.999300456047058, + "ce_loss_23": 2.9509395956993103, + "ce_loss_3": 3.8430278182029722, + "ce_loss_6": 3.4434648156166077, + "epoch": 0.813, + "grad_norm": 932.0, + "kl_loss_12": 343.0796173095703, + "kl_loss_17": 87.72957420349121, + "kl_loss_3": 1928.7910522460938, + "kl_loss_6": 1103.8207946777343, + "learning_rate": 8.548121372247918e-05, + "loss": 875.3312, + "step": 8130 + }, + { + "ce_loss_12": 3.1797402143478393, + "ce_loss_17": 3.072401428222656, + "ce_loss_23": 3.0259174704551697, + "ce_loss_3": 3.880035877227783, + "ce_loss_6": 3.4850649356842043, + "epoch": 0.814, + "grad_norm": 1360.0, + "kl_loss_12": 333.69579162597654, + "kl_loss_17": 86.78376846313476, + "kl_loss_3": 1905.3100463867188, + "kl_loss_6": 1079.9478851318358, + "learning_rate": 8.459605164597267e-05, + "loss": 853.8594, + "step": 8140 + }, + { + "ce_loss_12": 3.0721118688583373, + "ce_loss_17": 2.96068993806839, + "ce_loss_23": 2.9150487184524536, + "ce_loss_3": 3.8026944279670714, + "ce_loss_6": 3.4011393070220945, + "epoch": 0.815, + "grad_norm": 796.0, + "kl_loss_12": 336.3142486572266, + "kl_loss_17": 86.4559268951416, + "kl_loss_3": 1935.2485229492188, + "kl_loss_6": 1104.572802734375, + "learning_rate": 8.371507268261436e-05, + "loss": 873.4855, + "step": 8150 + }, + { + "ce_loss_12": 3.1365590333938598, + "ce_loss_17": 3.0222877860069275, + "ce_loss_23": 2.97467303276062, + "ce_loss_3": 3.8597262263298036, + "ce_loss_6": 3.45579069852829, + "epoch": 0.816, + "grad_norm": 932.0, + "kl_loss_12": 338.98358764648435, + "kl_loss_17": 86.73196105957031, + "kl_loss_3": 1914.698223876953, + "kl_loss_6": 1087.3596771240234, + "learning_rate": 8.283828570385238e-05, + "loss": 840.1645, + "step": 8160 + }, + { + "ce_loss_12": 3.1392388939857483, + "ce_loss_17": 3.0261239886283873, + "ce_loss_23": 2.978219211101532, + "ce_loss_3": 3.853709506988525, + "ce_loss_6": 3.455215263366699, + "epoch": 0.817, + "grad_norm": 864.0, + "kl_loss_12": 335.7754180908203, + "kl_loss_17": 86.84048919677734, + "kl_loss_3": 1893.3304809570313, + "kl_loss_6": 1083.5698669433593, + "learning_rate": 8.196569953892202e-05, + "loss": 855.8519, + "step": 8170 + }, + { + "ce_loss_12": 3.071793007850647, + "ce_loss_17": 2.9521210312843325, + "ce_loss_23": 2.90672482252121, + "ce_loss_3": 3.79334899187088, + "ce_loss_6": 3.3872650504112243, + "epoch": 0.818, + "grad_norm": 1264.0, + "kl_loss_12": 343.29752960205076, + "kl_loss_17": 87.74607048034667, + "kl_loss_3": 1917.1433898925782, + "kl_loss_6": 1101.1959533691406, + "learning_rate": 8.109732297475635e-05, + "loss": 856.9976, + "step": 8180 + }, + { + "ce_loss_12": 3.044912302494049, + "ce_loss_17": 2.9229483723640444, + "ce_loss_23": 2.8729085326194763, + "ce_loss_3": 3.828094220161438, + "ce_loss_6": 3.4056942462921143, + "epoch": 0.819, + "grad_norm": 1032.0, + "kl_loss_12": 352.4162796020508, + "kl_loss_17": 89.06830749511718, + "kl_loss_3": 2030.4641357421874, + "kl_loss_6": 1173.2847595214844, + "learning_rate": 8.023316475589754e-05, + "loss": 893.1128, + "step": 8190 + }, + { + "ce_loss_12": 3.016770374774933, + "ce_loss_17": 2.8914675116539, + "ce_loss_23": 2.839850068092346, + "ce_loss_3": 3.8171961426734926, + "ce_loss_6": 3.369654881954193, + "epoch": 0.82, + "grad_norm": 1616.0, + "kl_loss_12": 363.68381652832034, + "kl_loss_17": 93.56408424377442, + "kl_loss_3": 2083.283099365234, + "kl_loss_6": 1186.300814819336, + "learning_rate": 7.937323358440934e-05, + "loss": 905.9598, + "step": 8200 + }, + { + "ce_loss_12": 3.12202787399292, + "ce_loss_17": 3.014573073387146, + "ce_loss_23": 2.9696925044059754, + "ce_loss_3": 3.817815911769867, + "ce_loss_6": 3.4261608719825745, + "epoch": 0.821, + "grad_norm": 1040.0, + "kl_loss_12": 329.6168685913086, + "kl_loss_17": 85.38657341003417, + "kl_loss_3": 1873.8861389160156, + "kl_loss_6": 1069.3458221435546, + "learning_rate": 7.851753811978923e-05, + "loss": 855.5345, + "step": 8210 + }, + { + "ce_loss_12": 3.140878129005432, + "ce_loss_17": 3.0264705181121827, + "ce_loss_23": 2.9791894435882567, + "ce_loss_3": 3.8736305713653563, + "ce_loss_6": 3.462307298183441, + "epoch": 0.822, + "grad_norm": 984.0, + "kl_loss_12": 339.41978149414064, + "kl_loss_17": 88.28028259277343, + "kl_loss_3": 1955.4935302734375, + "kl_loss_6": 1115.261944580078, + "learning_rate": 7.766608697888095e-05, + "loss": 862.4045, + "step": 8220 + }, + { + "ce_loss_12": 3.1515311360359193, + "ce_loss_17": 3.036611783504486, + "ce_loss_23": 2.9878187775611877, + "ce_loss_3": 3.8725401878356935, + "ce_loss_6": 3.470382332801819, + "epoch": 0.823, + "grad_norm": 952.0, + "kl_loss_12": 342.2221923828125, + "kl_loss_17": 89.53544616699219, + "kl_loss_3": 1944.193017578125, + "kl_loss_6": 1110.061944580078, + "learning_rate": 7.681888873578785e-05, + "loss": 875.9349, + "step": 8230 + }, + { + "ce_loss_12": 3.082021200656891, + "ce_loss_17": 2.9599790930747987, + "ce_loss_23": 2.9113461136817933, + "ce_loss_3": 3.8219818115234374, + "ce_loss_6": 3.4070132970809937, + "epoch": 0.824, + "grad_norm": 1024.0, + "kl_loss_12": 350.0909225463867, + "kl_loss_17": 90.64294090270997, + "kl_loss_3": 1968.9792602539062, + "kl_loss_6": 1120.007046508789, + "learning_rate": 7.597595192178702e-05, + "loss": 868.1928, + "step": 8240 + }, + { + "ce_loss_12": 3.085583007335663, + "ce_loss_17": 2.968785548210144, + "ce_loss_23": 2.9183531761169434, + "ce_loss_3": 3.834295225143433, + "ce_loss_6": 3.4201825857162476, + "epoch": 0.825, + "grad_norm": 1272.0, + "kl_loss_12": 349.5534942626953, + "kl_loss_17": 89.26756324768067, + "kl_loss_3": 2006.1392211914062, + "kl_loss_6": 1147.5581146240233, + "learning_rate": 7.513728502524286e-05, + "loss": 885.6963, + "step": 8250 + }, + { + "ce_loss_12": 3.0719595193862914, + "ce_loss_17": 2.9636343240737917, + "ce_loss_23": 2.9201133489608764, + "ce_loss_3": 3.7877281069755555, + "ce_loss_6": 3.389252471923828, + "epoch": 0.826, + "grad_norm": 1176.0, + "kl_loss_12": 328.5639221191406, + "kl_loss_17": 83.89287757873535, + "kl_loss_3": 1886.0435302734375, + "kl_loss_6": 1071.7314575195312, + "learning_rate": 7.430289649152156e-05, + "loss": 863.2703, + "step": 8260 + }, + { + "ce_loss_12": 2.9939164757728576, + "ce_loss_17": 2.8743756294250487, + "ce_loss_23": 2.8279815316200256, + "ce_loss_3": 3.760581076145172, + "ce_loss_6": 3.338099813461304, + "epoch": 0.827, + "grad_norm": 1216.0, + "kl_loss_12": 348.948991394043, + "kl_loss_17": 87.2121368408203, + "kl_loss_3": 2031.262713623047, + "kl_loss_6": 1162.1164459228517, + "learning_rate": 7.347279472290646e-05, + "loss": 875.545, + "step": 8270 + }, + { + "ce_loss_12": 3.126525890827179, + "ce_loss_17": 3.014357566833496, + "ce_loss_23": 2.966294693946838, + "ce_loss_3": 3.8614994049072267, + "ce_loss_6": 3.4501480221748353, + "epoch": 0.828, + "grad_norm": 1208.0, + "kl_loss_12": 340.5355484008789, + "kl_loss_17": 87.52486991882324, + "kl_loss_3": 1956.3447204589843, + "kl_loss_6": 1108.7800689697265, + "learning_rate": 7.264698807851328e-05, + "loss": 876.5871, + "step": 8280 + }, + { + "ce_loss_12": 3.0982959866523743, + "ce_loss_17": 2.9892932772636414, + "ce_loss_23": 2.9442530274391174, + "ce_loss_3": 3.8029705762863157, + "ce_loss_6": 3.4126698970794678, + "epoch": 0.829, + "grad_norm": 908.0, + "kl_loss_12": 334.2535629272461, + "kl_loss_17": 85.19641418457032, + "kl_loss_3": 1887.4463623046875, + "kl_loss_6": 1079.3946502685546, + "learning_rate": 7.182548487420554e-05, + "loss": 855.0638, + "step": 8290 + }, + { + "ce_loss_12": 3.1432425498962404, + "ce_loss_17": 3.0315305590629578, + "ce_loss_23": 2.9838999629020693, + "ce_loss_3": 3.854983127117157, + "ce_loss_6": 3.4639780282974244, + "epoch": 0.83, + "grad_norm": 1064.0, + "kl_loss_12": 340.45317993164065, + "kl_loss_17": 88.31193199157715, + "kl_loss_3": 1916.8724243164063, + "kl_loss_6": 1103.293978881836, + "learning_rate": 7.100829338251146e-05, + "loss": 858.6983, + "step": 8300 + }, + { + "ce_loss_12": 3.0850770592689516, + "ce_loss_17": 2.9665639281272886, + "ce_loss_23": 2.9138702034950255, + "ce_loss_3": 3.8345587968826296, + "ce_loss_6": 3.4193907499313356, + "epoch": 0.831, + "grad_norm": 1344.0, + "kl_loss_12": 347.6676956176758, + "kl_loss_17": 89.90985565185547, + "kl_loss_3": 1973.8903259277345, + "kl_loss_6": 1123.459619140625, + "learning_rate": 7.019542183254046e-05, + "loss": 864.3824, + "step": 8310 + }, + { + "ce_loss_12": 3.1197735190391542, + "ce_loss_17": 3.005459153652191, + "ce_loss_23": 2.953192150592804, + "ce_loss_3": 3.8333282589912416, + "ce_loss_6": 3.435945522785187, + "epoch": 0.832, + "grad_norm": 1080.0, + "kl_loss_12": 348.20618896484376, + "kl_loss_17": 93.78602905273438, + "kl_loss_3": 1924.5072937011719, + "kl_loss_6": 1107.3147766113282, + "learning_rate": 6.938687840989971e-05, + "loss": 864.4219, + "step": 8320 + }, + { + "ce_loss_12": 3.06420384645462, + "ce_loss_17": 2.948847937583923, + "ce_loss_23": 2.901505208015442, + "ce_loss_3": 3.7880621790885924, + "ce_loss_6": 3.3873242259025576, + "epoch": 0.833, + "grad_norm": 1080.0, + "kl_loss_12": 345.3905944824219, + "kl_loss_17": 89.02677841186524, + "kl_loss_3": 1930.5537353515624, + "kl_loss_6": 1102.0173278808593, + "learning_rate": 6.858267125661271e-05, + "loss": 875.7271, + "step": 8330 + }, + { + "ce_loss_12": 3.117763566970825, + "ce_loss_17": 3.0019028782844543, + "ce_loss_23": 2.955692636966705, + "ce_loss_3": 3.8445546865463256, + "ce_loss_6": 3.442584776878357, + "epoch": 0.834, + "grad_norm": 1080.0, + "kl_loss_12": 338.07284088134764, + "kl_loss_17": 86.10532608032227, + "kl_loss_3": 1933.6854675292968, + "kl_loss_6": 1102.285546875, + "learning_rate": 6.778280847103668e-05, + "loss": 881.1859, + "step": 8340 + }, + { + "ce_loss_12": 3.1274876832962035, + "ce_loss_17": 3.0086837649345397, + "ce_loss_23": 2.9614779591560363, + "ce_loss_3": 3.8465988516807554, + "ce_loss_6": 3.445386898517609, + "epoch": 0.835, + "grad_norm": 896.0, + "kl_loss_12": 348.3533416748047, + "kl_loss_17": 89.17362174987792, + "kl_loss_3": 1939.704248046875, + "kl_loss_6": 1107.0311492919923, + "learning_rate": 6.698729810778065e-05, + "loss": 860.6041, + "step": 8350 + }, + { + "ce_loss_12": 3.040190947055817, + "ce_loss_17": 2.92340030670166, + "ce_loss_23": 2.877941405773163, + "ce_loss_3": 3.7735044956207275, + "ce_loss_6": 3.3638616800308228, + "epoch": 0.836, + "grad_norm": 1176.0, + "kl_loss_12": 334.4034454345703, + "kl_loss_17": 85.51121520996094, + "kl_loss_3": 1925.1980590820312, + "kl_loss_6": 1095.8139862060548, + "learning_rate": 6.619614817762538e-05, + "loss": 867.955, + "step": 8360 + }, + { + "ce_loss_12": 3.0219133734703063, + "ce_loss_17": 2.900725555419922, + "ce_loss_23": 2.8514190554618835, + "ce_loss_3": 3.804880404472351, + "ce_loss_6": 3.36659722328186, + "epoch": 0.837, + "grad_norm": 996.0, + "kl_loss_12": 348.86095428466797, + "kl_loss_17": 86.93252220153809, + "kl_loss_3": 2054.0288818359377, + "kl_loss_6": 1159.649105834961, + "learning_rate": 6.540936664744196e-05, + "loss": 893.201, + "step": 8370 + }, + { + "ce_loss_12": 3.1456708312034607, + "ce_loss_17": 3.0280561089515685, + "ce_loss_23": 2.9808369278907776, + "ce_loss_3": 3.8817422866821287, + "ce_loss_6": 3.4683672308921816, + "epoch": 0.838, + "grad_norm": 892.0, + "kl_loss_12": 344.3450592041016, + "kl_loss_17": 87.86822891235352, + "kl_loss_3": 1957.4243774414062, + "kl_loss_6": 1115.648455810547, + "learning_rate": 6.462696144011149e-05, + "loss": 858.9438, + "step": 8380 + }, + { + "ce_loss_12": 3.107890796661377, + "ce_loss_17": 2.9926645755767822, + "ce_loss_23": 2.9458924531936646, + "ce_loss_3": 3.8240071654319765, + "ce_loss_6": 3.4266958951950075, + "epoch": 0.839, + "grad_norm": 1072.0, + "kl_loss_12": 346.38677673339845, + "kl_loss_17": 88.94830360412598, + "kl_loss_3": 1925.6028625488282, + "kl_loss_6": 1109.2246826171875, + "learning_rate": 6.384894043444567e-05, + "loss": 856.969, + "step": 8390 + }, + { + "ce_loss_12": 3.1257767200469972, + "ce_loss_17": 3.009605872631073, + "ce_loss_23": 2.9617841124534605, + "ce_loss_3": 3.8602431058883666, + "ce_loss_6": 3.455742073059082, + "epoch": 0.84, + "grad_norm": 1096.0, + "kl_loss_12": 343.82138977050784, + "kl_loss_17": 88.10531845092774, + "kl_loss_3": 1947.715557861328, + "kl_loss_6": 1115.6705200195313, + "learning_rate": 6.307531146510753e-05, + "loss": 867.9402, + "step": 8400 + }, + { + "ce_loss_12": 3.106555771827698, + "ce_loss_17": 2.9946197390556337, + "ce_loss_23": 2.945484471321106, + "ce_loss_3": 3.8135352969169616, + "ce_loss_6": 3.4177019596099854, + "epoch": 0.841, + "grad_norm": 1144.0, + "kl_loss_12": 339.7760940551758, + "kl_loss_17": 87.89406509399414, + "kl_loss_3": 1883.2806030273437, + "kl_loss_6": 1087.097219848633, + "learning_rate": 6.230608232253226e-05, + "loss": 845.5771, + "step": 8410 + }, + { + "ce_loss_12": 3.071239709854126, + "ce_loss_17": 2.951303815841675, + "ce_loss_23": 2.901365566253662, + "ce_loss_3": 3.838588201999664, + "ce_loss_6": 3.4165117025375364, + "epoch": 0.842, + "grad_norm": 1168.0, + "kl_loss_12": 347.6035659790039, + "kl_loss_17": 88.24365577697753, + "kl_loss_3": 2011.380517578125, + "kl_loss_6": 1143.0382446289063, + "learning_rate": 6.154126075284855e-05, + "loss": 871.0586, + "step": 8420 + }, + { + "ce_loss_12": 3.1536173582077027, + "ce_loss_17": 3.040555489063263, + "ce_loss_23": 2.9942591905593874, + "ce_loss_3": 3.85747230052948, + "ce_loss_6": 3.466206908226013, + "epoch": 0.843, + "grad_norm": 948.0, + "kl_loss_12": 331.64387664794924, + "kl_loss_17": 85.18689041137695, + "kl_loss_3": 1886.2900390625, + "kl_loss_6": 1079.935122680664, + "learning_rate": 6.078085445780129e-05, + "loss": 841.4134, + "step": 8430 + }, + { + "ce_loss_12": 3.157128167152405, + "ce_loss_17": 3.0442301392555238, + "ce_loss_23": 2.998222291469574, + "ce_loss_3": 3.8884846687316896, + "ce_loss_6": 3.4774367213249207, + "epoch": 0.844, + "grad_norm": 1112.0, + "kl_loss_12": 338.92323608398436, + "kl_loss_17": 86.8118854522705, + "kl_loss_3": 1954.67890625, + "kl_loss_6": 1101.4988372802734, + "learning_rate": 6.002487109467347e-05, + "loss": 854.1086, + "step": 8440 + }, + { + "ce_loss_12": 3.163812208175659, + "ce_loss_17": 3.0485057711601256, + "ce_loss_23": 3.0002379298210142, + "ce_loss_3": 3.879017102718353, + "ce_loss_6": 3.482664239406586, + "epoch": 0.845, + "grad_norm": 976.0, + "kl_loss_12": 347.79635467529295, + "kl_loss_17": 89.86136512756347, + "kl_loss_3": 1939.941583251953, + "kl_loss_6": 1113.0343994140626, + "learning_rate": 5.927331827620902e-05, + "loss": 858.4156, + "step": 8450 + }, + { + "ce_loss_12": 3.149532949924469, + "ce_loss_17": 3.0329127073287965, + "ce_loss_23": 2.9848304629325866, + "ce_loss_3": 3.836488330364227, + "ce_loss_6": 3.4556496262550356, + "epoch": 0.846, + "grad_norm": 984.0, + "kl_loss_12": 333.4063385009766, + "kl_loss_17": 85.9187614440918, + "kl_loss_3": 1856.5706665039063, + "kl_loss_6": 1064.6796936035157, + "learning_rate": 5.852620357053651e-05, + "loss": 851.8359, + "step": 8460 + }, + { + "ce_loss_12": 3.1828793048858643, + "ce_loss_17": 3.0734182953834535, + "ce_loss_23": 3.0279962182044984, + "ce_loss_3": 3.8866792559623717, + "ce_loss_6": 3.491861045360565, + "epoch": 0.847, + "grad_norm": 1136.0, + "kl_loss_12": 334.7801971435547, + "kl_loss_17": 85.28386154174805, + "kl_loss_3": 1885.297296142578, + "kl_loss_6": 1077.3123596191406, + "learning_rate": 5.778353450109286e-05, + "loss": 852.2954, + "step": 8470 + }, + { + "ce_loss_12": 3.213039755821228, + "ce_loss_17": 3.0986974835395813, + "ce_loss_23": 3.0505094647407534, + "ce_loss_3": 3.9459834694862366, + "ce_loss_6": 3.535405671596527, + "epoch": 0.848, + "grad_norm": 1544.0, + "kl_loss_12": 343.61567840576174, + "kl_loss_17": 88.51277122497558, + "kl_loss_3": 1930.9443481445312, + "kl_loss_6": 1099.9347137451173, + "learning_rate": 5.7045318546547206e-05, + "loss": 858.4686, + "step": 8480 + }, + { + "ce_loss_12": 3.1145872831344605, + "ce_loss_17": 3.0008885622024537, + "ce_loss_23": 2.952608811855316, + "ce_loss_3": 3.8506309747695924, + "ce_loss_6": 3.4415759801864625, + "epoch": 0.849, + "grad_norm": 1224.0, + "kl_loss_12": 338.5645751953125, + "kl_loss_17": 87.2589542388916, + "kl_loss_3": 1946.0604858398438, + "kl_loss_6": 1108.356982421875, + "learning_rate": 5.631156314072605e-05, + "loss": 860.1698, + "step": 8490 + }, + { + "ce_loss_12": 3.143726873397827, + "ce_loss_17": 3.026803493499756, + "ce_loss_23": 2.979511725902557, + "ce_loss_3": 3.8262677907943727, + "ce_loss_6": 3.4486370205879213, + "epoch": 0.85, + "grad_norm": 1008.0, + "kl_loss_12": 337.11222686767576, + "kl_loss_17": 87.5474349975586, + "kl_loss_3": 1863.6341430664063, + "kl_loss_6": 1077.056936645508, + "learning_rate": 5.5582275672538315e-05, + "loss": 845.0238, + "step": 8500 + }, + { + "ce_loss_12": 3.062679886817932, + "ce_loss_17": 2.9370684027671814, + "ce_loss_23": 2.889982986450195, + "ce_loss_3": 3.8353065729141234, + "ce_loss_6": 3.4114139795303347, + "epoch": 0.851, + "grad_norm": 920.0, + "kl_loss_12": 352.9176330566406, + "kl_loss_17": 89.93073768615723, + "kl_loss_3": 2035.2821105957032, + "kl_loss_6": 1160.1542114257813, + "learning_rate": 5.4857463485900484e-05, + "loss": 884.4208, + "step": 8510 + }, + { + "ce_loss_12": 3.120178520679474, + "ce_loss_17": 3.0025755405426025, + "ce_loss_23": 2.9565329909324647, + "ce_loss_3": 3.817841875553131, + "ce_loss_6": 3.431487536430359, + "epoch": 0.852, + "grad_norm": 992.0, + "kl_loss_12": 336.0100524902344, + "kl_loss_17": 86.01943740844726, + "kl_loss_3": 1894.4754455566406, + "kl_loss_6": 1090.1786346435547, + "learning_rate": 5.413713387966329e-05, + "loss": 850.633, + "step": 8520 + }, + { + "ce_loss_12": 3.125340521335602, + "ce_loss_17": 3.01132732629776, + "ce_loss_23": 2.9644988775253296, + "ce_loss_3": 3.854294538497925, + "ce_loss_6": 3.4520197987556456, + "epoch": 0.853, + "grad_norm": 1208.0, + "kl_loss_12": 339.7386764526367, + "kl_loss_17": 87.40383911132812, + "kl_loss_3": 1942.8307739257812, + "kl_loss_6": 1113.2786926269532, + "learning_rate": 5.34212941075381e-05, + "loss": 864.8773, + "step": 8530 + }, + { + "ce_loss_12": 3.128991258144379, + "ce_loss_17": 3.0231241464614866, + "ce_loss_23": 2.978782618045807, + "ce_loss_3": 3.830638825893402, + "ce_loss_6": 3.4327388763427735, + "epoch": 0.854, + "grad_norm": 976.0, + "kl_loss_12": 323.4237808227539, + "kl_loss_17": 84.3223777770996, + "kl_loss_3": 1873.2142211914063, + "kl_loss_6": 1054.9510284423827, + "learning_rate": 5.270995137802315e-05, + "loss": 846.1405, + "step": 8540 + }, + { + "ce_loss_12": 3.0760897994041443, + "ce_loss_17": 2.964383137226105, + "ce_loss_23": 2.918411982059479, + "ce_loss_3": 3.7972556352615356, + "ce_loss_6": 3.3881014347076417, + "epoch": 0.855, + "grad_norm": 836.0, + "kl_loss_12": 334.1419174194336, + "kl_loss_17": 84.58750801086425, + "kl_loss_3": 1927.474542236328, + "kl_loss_6": 1087.4206146240235, + "learning_rate": 5.2003112854332125e-05, + "loss": 864.5331, + "step": 8550 + }, + { + "ce_loss_12": 3.072225642204285, + "ce_loss_17": 2.964117980003357, + "ce_loss_23": 2.919295918941498, + "ce_loss_3": 3.7820751905441283, + "ce_loss_6": 3.3831193804740907, + "epoch": 0.856, + "grad_norm": 1192.0, + "kl_loss_12": 330.09010925292966, + "kl_loss_17": 84.22143287658692, + "kl_loss_3": 1912.9076110839844, + "kl_loss_6": 1084.7628662109375, + "learning_rate": 5.130078565432089e-05, + "loss": 839.2182, + "step": 8560 + }, + { + "ce_loss_12": 3.1370887756347656, + "ce_loss_17": 3.0283565163612365, + "ce_loss_23": 2.982879114151001, + "ce_loss_3": 3.831531059741974, + "ce_loss_6": 3.444535326957703, + "epoch": 0.857, + "grad_norm": 996.0, + "kl_loss_12": 329.6750778198242, + "kl_loss_17": 83.78762245178223, + "kl_loss_3": 1881.2487365722657, + "kl_loss_6": 1080.3342071533202, + "learning_rate": 5.060297685041659e-05, + "loss": 838.5309, + "step": 8570 + }, + { + "ce_loss_12": 3.0776275753974915, + "ce_loss_17": 2.956244874000549, + "ce_loss_23": 2.9072667360305786, + "ce_loss_3": 3.807856500148773, + "ce_loss_6": 3.3985143542289733, + "epoch": 0.858, + "grad_norm": 1056.0, + "kl_loss_12": 342.1207244873047, + "kl_loss_17": 88.82546920776367, + "kl_loss_3": 1940.8138427734375, + "kl_loss_6": 1105.3937622070312, + "learning_rate": 4.99096934695461e-05, + "loss": 870.5254, + "step": 8580 + }, + { + "ce_loss_12": 3.134142804145813, + "ce_loss_17": 3.0192827582359314, + "ce_loss_23": 2.9710862755775453, + "ce_loss_3": 3.853245532512665, + "ce_loss_6": 3.4556553840637205, + "epoch": 0.859, + "grad_norm": 832.0, + "kl_loss_12": 333.8627334594727, + "kl_loss_17": 85.99855766296386, + "kl_loss_3": 1913.0985778808595, + "kl_loss_6": 1089.306918334961, + "learning_rate": 4.922094249306558e-05, + "loss": 848.9322, + "step": 8590 + }, + { + "ce_loss_12": 3.166194033622742, + "ce_loss_17": 3.0500768542289736, + "ce_loss_23": 3.0017306447029113, + "ce_loss_3": 3.883514702320099, + "ce_loss_6": 3.481926476955414, + "epoch": 0.86, + "grad_norm": 1020.0, + "kl_loss_12": 343.3954132080078, + "kl_loss_17": 88.88944129943847, + "kl_loss_3": 1925.6196594238281, + "kl_loss_6": 1095.9616302490235, + "learning_rate": 4.853673085668947e-05, + "loss": 845.7038, + "step": 8600 + }, + { + "ce_loss_12": 3.173807406425476, + "ce_loss_17": 3.0630053758621214, + "ce_loss_23": 3.0165204882621763, + "ce_loss_3": 3.893686830997467, + "ce_loss_6": 3.4856738567352297, + "epoch": 0.861, + "grad_norm": 900.0, + "kl_loss_12": 332.5966766357422, + "kl_loss_17": 85.91055793762207, + "kl_loss_3": 1913.8761779785157, + "kl_loss_6": 1079.535546875, + "learning_rate": 4.78570654504214e-05, + "loss": 856.4615, + "step": 8610 + }, + { + "ce_loss_12": 3.1255746483802795, + "ce_loss_17": 3.014578711986542, + "ce_loss_23": 2.9669725060462953, + "ce_loss_3": 3.8446498036384584, + "ce_loss_6": 3.4471297383308412, + "epoch": 0.862, + "grad_norm": 1072.0, + "kl_loss_12": 335.4307342529297, + "kl_loss_17": 85.39923210144043, + "kl_loss_3": 1918.1002502441406, + "kl_loss_6": 1101.1010650634767, + "learning_rate": 4.7181953118484556e-05, + "loss": 859.0185, + "step": 8620 + }, + { + "ce_loss_12": 3.149523913860321, + "ce_loss_17": 3.0387833952903747, + "ce_loss_23": 2.993229532241821, + "ce_loss_3": 3.855341386795044, + "ce_loss_6": 3.460942280292511, + "epoch": 0.863, + "grad_norm": 944.0, + "kl_loss_12": 330.67168884277345, + "kl_loss_17": 84.8177474975586, + "kl_loss_3": 1874.0156555175781, + "kl_loss_6": 1074.759991455078, + "learning_rate": 4.651140065925269e-05, + "loss": 864.2121, + "step": 8630 + }, + { + "ce_loss_12": 3.087773311138153, + "ce_loss_17": 2.975247311592102, + "ce_loss_23": 2.9271497249603273, + "ce_loss_3": 3.8027710556983947, + "ce_loss_6": 3.4052886724472047, + "epoch": 0.864, + "grad_norm": 1136.0, + "kl_loss_12": 336.3831130981445, + "kl_loss_17": 87.06652030944824, + "kl_loss_3": 1922.2005981445313, + "kl_loss_6": 1094.44453125, + "learning_rate": 4.58454148251814e-05, + "loss": 865.4342, + "step": 8640 + }, + { + "ce_loss_12": 3.099566066265106, + "ce_loss_17": 2.982946813106537, + "ce_loss_23": 2.9340903520584107, + "ce_loss_3": 3.8434081077575684, + "ce_loss_6": 3.429073524475098, + "epoch": 0.865, + "grad_norm": 988.0, + "kl_loss_12": 337.62081756591795, + "kl_loss_17": 86.17353515625, + "kl_loss_3": 1954.7680358886719, + "kl_loss_6": 1113.4993255615234, + "learning_rate": 4.518400232274078e-05, + "loss": 859.0479, + "step": 8650 + }, + { + "ce_loss_12": 3.131185245513916, + "ce_loss_17": 3.014595639705658, + "ce_loss_23": 2.9649352073669433, + "ce_loss_3": 3.838270437717438, + "ce_loss_6": 3.4395124316215515, + "epoch": 0.866, + "grad_norm": 960.0, + "kl_loss_12": 338.11429901123046, + "kl_loss_17": 87.64392280578613, + "kl_loss_3": 1888.8003845214844, + "kl_loss_6": 1081.4321197509767, + "learning_rate": 4.452716981234745e-05, + "loss": 832.7505, + "step": 8660 + }, + { + "ce_loss_12": 3.094347560405731, + "ce_loss_17": 2.985401213169098, + "ce_loss_23": 2.9405669450759886, + "ce_loss_3": 3.801581919193268, + "ce_loss_6": 3.4098233222961425, + "epoch": 0.867, + "grad_norm": 856.0, + "kl_loss_12": 329.37064361572266, + "kl_loss_17": 84.24899635314941, + "kl_loss_3": 1886.796435546875, + "kl_loss_6": 1086.4415130615234, + "learning_rate": 4.3874923908297335e-05, + "loss": 836.1057, + "step": 8670 + }, + { + "ce_loss_12": 3.149175775051117, + "ce_loss_17": 3.0373104572296143, + "ce_loss_23": 2.988961696624756, + "ce_loss_3": 3.8777933359146117, + "ce_loss_6": 3.474110317230225, + "epoch": 0.868, + "grad_norm": 884.0, + "kl_loss_12": 339.9120407104492, + "kl_loss_17": 87.37800178527831, + "kl_loss_3": 1943.5669311523438, + "kl_loss_6": 1111.9998474121094, + "learning_rate": 4.322727117869951e-05, + "loss": 854.1123, + "step": 8680 + }, + { + "ce_loss_12": 3.159498453140259, + "ce_loss_17": 3.0466421127319334, + "ce_loss_23": 2.998374545574188, + "ce_loss_3": 3.880115878582001, + "ce_loss_6": 3.4746498703956603, + "epoch": 0.869, + "grad_norm": 1448.0, + "kl_loss_12": 339.47459411621094, + "kl_loss_17": 86.88564872741699, + "kl_loss_3": 1935.815850830078, + "kl_loss_6": 1101.9241333007812, + "learning_rate": 4.2584218145409916e-05, + "loss": 854.732, + "step": 8690 + }, + { + "ce_loss_12": 3.1883333921432495, + "ce_loss_17": 3.0860480070114136, + "ce_loss_23": 3.0403281450271606, + "ce_loss_3": 3.8731122970581056, + "ce_loss_6": 3.490442156791687, + "epoch": 0.87, + "grad_norm": 944.0, + "kl_loss_12": 325.2259918212891, + "kl_loss_17": 83.94673805236816, + "kl_loss_3": 1845.4943603515626, + "kl_loss_6": 1057.3880157470703, + "learning_rate": 4.194577128396521e-05, + "loss": 833.2588, + "step": 8700 + }, + { + "ce_loss_12": 3.083304762840271, + "ce_loss_17": 2.973324239253998, + "ce_loss_23": 2.927659106254578, + "ce_loss_3": 3.8043991804122923, + "ce_loss_6": 3.397646701335907, + "epoch": 0.871, + "grad_norm": 976.0, + "kl_loss_12": 327.7694839477539, + "kl_loss_17": 84.01599769592285, + "kl_loss_3": 1917.2221069335938, + "kl_loss_6": 1079.2329345703124, + "learning_rate": 4.1311937023518264e-05, + "loss": 862.6654, + "step": 8710 + }, + { + "ce_loss_12": 3.0983502507209777, + "ce_loss_17": 2.993625044822693, + "ce_loss_23": 2.9482877254486084, + "ce_loss_3": 3.86238557100296, + "ce_loss_6": 3.4176156878471375, + "epoch": 0.872, + "grad_norm": 1248.0, + "kl_loss_12": 319.80799255371096, + "kl_loss_17": 83.03072204589844, + "kl_loss_3": 1995.8182006835937, + "kl_loss_6": 1084.0275299072266, + "learning_rate": 4.0682721746773344e-05, + "loss": 855.5651, + "step": 8720 + }, + { + "ce_loss_12": 2.974739396572113, + "ce_loss_17": 2.862412619590759, + "ce_loss_23": 2.817744278907776, + "ce_loss_3": 3.7292062759399416, + "ce_loss_6": 3.310219419002533, + "epoch": 0.873, + "grad_norm": 984.0, + "kl_loss_12": 333.8487045288086, + "kl_loss_17": 84.45549850463867, + "kl_loss_3": 1967.0060729980469, + "kl_loss_6": 1117.6822021484375, + "learning_rate": 4.0058131789920904e-05, + "loss": 850.5296, + "step": 8730 + }, + { + "ce_loss_12": 3.123417413234711, + "ce_loss_17": 3.0125887989997864, + "ce_loss_23": 2.965476393699646, + "ce_loss_3": 3.8315968990325926, + "ce_loss_6": 3.4407531023025513, + "epoch": 0.874, + "grad_norm": 1296.0, + "kl_loss_12": 329.74097442626953, + "kl_loss_17": 83.98312149047851, + "kl_loss_3": 1910.9992614746093, + "kl_loss_6": 1092.8551208496094, + "learning_rate": 3.9438173442575e-05, + "loss": 877.51, + "step": 8740 + }, + { + "ce_loss_12": 3.14895213842392, + "ce_loss_17": 3.033963847160339, + "ce_loss_23": 2.9877347469329836, + "ce_loss_3": 3.853372836112976, + "ce_loss_6": 3.4575639367103577, + "epoch": 0.875, + "grad_norm": 1248.0, + "kl_loss_12": 331.0361068725586, + "kl_loss_17": 84.8048080444336, + "kl_loss_3": 1875.0467468261718, + "kl_loss_6": 1067.8612365722656, + "learning_rate": 3.882285294770937e-05, + "loss": 846.115, + "step": 8750 + }, + { + "ce_loss_12": 3.103008270263672, + "ce_loss_17": 2.992861568927765, + "ce_loss_23": 2.947165012359619, + "ce_loss_3": 3.8003880858421324, + "ce_loss_6": 3.410064435005188, + "epoch": 0.876, + "grad_norm": 1040.0, + "kl_loss_12": 327.96388244628906, + "kl_loss_17": 85.37591094970703, + "kl_loss_3": 1864.6184143066407, + "kl_loss_6": 1062.9603637695313, + "learning_rate": 3.821217650159453e-05, + "loss": 855.1959, + "step": 8760 + }, + { + "ce_loss_12": 3.0010708808898925, + "ce_loss_17": 2.883316230773926, + "ce_loss_23": 2.8356757760047913, + "ce_loss_3": 3.7577064514160154, + "ce_loss_6": 3.331834352016449, + "epoch": 0.877, + "grad_norm": 1184.0, + "kl_loss_12": 340.2316040039062, + "kl_loss_17": 85.8433349609375, + "kl_loss_3": 1977.02939453125, + "kl_loss_6": 1125.2041931152344, + "learning_rate": 3.760615025373543e-05, + "loss": 867.5315, + "step": 8770 + }, + { + "ce_loss_12": 3.161791479587555, + "ce_loss_17": 3.0443769335746764, + "ce_loss_23": 2.9945632100105284, + "ce_loss_3": 3.8910972476005554, + "ce_loss_6": 3.4862359285354616, + "epoch": 0.878, + "grad_norm": 1288.0, + "kl_loss_12": 343.78154907226565, + "kl_loss_17": 89.50541648864746, + "kl_loss_3": 1940.6373046875, + "kl_loss_6": 1113.4889892578126, + "learning_rate": 3.700478030680987e-05, + "loss": 873.9607, + "step": 8780 + }, + { + "ce_loss_12": 3.1520805835723875, + "ce_loss_17": 3.0417134046554564, + "ce_loss_23": 2.995950186252594, + "ce_loss_3": 3.8786579847335814, + "ce_loss_6": 3.4690412402153017, + "epoch": 0.879, + "grad_norm": 1312.0, + "kl_loss_12": 330.60288696289064, + "kl_loss_17": 85.94220123291015, + "kl_loss_3": 1907.498956298828, + "kl_loss_6": 1094.785562133789, + "learning_rate": 3.6408072716606344e-05, + "loss": 851.4786, + "step": 8790 + }, + { + "ce_loss_12": 3.0897973656654356, + "ce_loss_17": 2.9741249084472656, + "ce_loss_23": 2.927089250087738, + "ce_loss_3": 3.836412763595581, + "ce_loss_6": 3.4170141220092773, + "epoch": 0.88, + "grad_norm": 936.0, + "kl_loss_12": 338.46410064697267, + "kl_loss_17": 86.6291488647461, + "kl_loss_3": 1971.1416748046875, + "kl_loss_6": 1117.8702606201173, + "learning_rate": 3.5816033491963716e-05, + "loss": 883.7929, + "step": 8800 + }, + { + "ce_loss_12": 2.9557029008865356, + "ce_loss_17": 2.840587717294693, + "ce_loss_23": 2.7953685760498046, + "ce_loss_3": 3.7151981711387636, + "ce_loss_6": 3.287673497200012, + "epoch": 0.881, + "grad_norm": 1264.0, + "kl_loss_12": 334.52431030273436, + "kl_loss_17": 85.75007247924805, + "kl_loss_3": 1989.5989318847655, + "kl_loss_6": 1118.5227844238282, + "learning_rate": 3.522866859471047e-05, + "loss": 864.8224, + "step": 8810 + }, + { + "ce_loss_12": 3.173339104652405, + "ce_loss_17": 3.067618262767792, + "ce_loss_23": 3.021925222873688, + "ce_loss_3": 3.8509360671043398, + "ce_loss_6": 3.4695744156837462, + "epoch": 0.882, + "grad_norm": 1256.0, + "kl_loss_12": 323.1707489013672, + "kl_loss_17": 82.98987579345703, + "kl_loss_3": 1828.076483154297, + "kl_loss_6": 1041.479315185547, + "learning_rate": 3.46459839396045e-05, + "loss": 838.6922, + "step": 8820 + }, + { + "ce_loss_12": 3.0913305282592773, + "ce_loss_17": 2.9775137305259705, + "ce_loss_23": 2.9278905749320985, + "ce_loss_3": 3.8238929867744447, + "ce_loss_6": 3.4145618438720704, + "epoch": 0.883, + "grad_norm": 980.0, + "kl_loss_12": 334.30641021728513, + "kl_loss_17": 85.81579704284668, + "kl_loss_3": 1927.4597717285155, + "kl_loss_6": 1095.1721893310546, + "learning_rate": 3.406798539427386e-05, + "loss": 874.5561, + "step": 8830 + }, + { + "ce_loss_12": 3.1545315861701964, + "ce_loss_17": 3.044299101829529, + "ce_loss_23": 2.998510789871216, + "ce_loss_3": 3.8693243622779847, + "ce_loss_6": 3.469870162010193, + "epoch": 0.884, + "grad_norm": 1208.0, + "kl_loss_12": 332.1228424072266, + "kl_loss_17": 84.90392875671387, + "kl_loss_3": 1923.0780151367187, + "kl_loss_6": 1091.9577911376953, + "learning_rate": 3.349467877915746e-05, + "loss": 859.1238, + "step": 8840 + }, + { + "ce_loss_12": 3.1225974440574644, + "ce_loss_17": 3.009087359905243, + "ce_loss_23": 2.962168884277344, + "ce_loss_3": 3.8547245621681214, + "ce_loss_6": 3.4457091689109802, + "epoch": 0.885, + "grad_norm": 876.0, + "kl_loss_12": 338.3793884277344, + "kl_loss_17": 86.19623184204102, + "kl_loss_3": 1967.9500549316406, + "kl_loss_6": 1116.0619323730468, + "learning_rate": 3.292606986744667e-05, + "loss": 886.2995, + "step": 8850 + }, + { + "ce_loss_12": 3.0737382054328917, + "ce_loss_17": 2.9652695178985597, + "ce_loss_23": 2.9208232402801513, + "ce_loss_3": 3.804908299446106, + "ce_loss_6": 3.3878675580024717, + "epoch": 0.886, + "grad_norm": 1064.0, + "kl_loss_12": 330.1391632080078, + "kl_loss_17": 83.49464492797851, + "kl_loss_3": 1942.0145629882813, + "kl_loss_6": 1098.344384765625, + "learning_rate": 3.23621643850267e-05, + "loss": 862.1052, + "step": 8860 + }, + { + "ce_loss_12": 3.1435423254966737, + "ce_loss_17": 3.031849682331085, + "ce_loss_23": 2.9862906098365785, + "ce_loss_3": 3.855881190299988, + "ce_loss_6": 3.462362253665924, + "epoch": 0.887, + "grad_norm": 1072.0, + "kl_loss_12": 340.6441955566406, + "kl_loss_17": 87.4849063873291, + "kl_loss_3": 1918.8952880859374, + "kl_loss_6": 1104.3206878662108, + "learning_rate": 3.180296801041971e-05, + "loss": 846.5516, + "step": 8870 + }, + { + "ce_loss_12": 3.1657923221588136, + "ce_loss_17": 3.0589999556541443, + "ce_loss_23": 3.0135733246803285, + "ce_loss_3": 3.878432369232178, + "ce_loss_6": 3.475983202457428, + "epoch": 0.888, + "grad_norm": 1012.0, + "kl_loss_12": 329.00604553222655, + "kl_loss_17": 84.90734176635742, + "kl_loss_3": 1914.929833984375, + "kl_loss_6": 1079.0008392333984, + "learning_rate": 3.124848637472688e-05, + "loss": 837.532, + "step": 8880 + }, + { + "ce_loss_12": 2.998358190059662, + "ce_loss_17": 2.8849174857139586, + "ce_loss_23": 2.8381486892700196, + "ce_loss_3": 3.734885573387146, + "ce_loss_6": 3.3218790531158446, + "epoch": 0.889, + "grad_norm": 1296.0, + "kl_loss_12": 326.6248016357422, + "kl_loss_17": 83.15086822509765, + "kl_loss_3": 1941.88310546875, + "kl_loss_6": 1098.8846435546875, + "learning_rate": 3.069872506157212e-05, + "loss": 853.1146, + "step": 8890 + }, + { + "ce_loss_12": 3.093616080284119, + "ce_loss_17": 2.982220458984375, + "ce_loss_23": 2.9373262763023376, + "ce_loss_3": 3.803342950344086, + "ce_loss_6": 3.406595754623413, + "epoch": 0.89, + "grad_norm": 1152.0, + "kl_loss_12": 331.7616683959961, + "kl_loss_17": 84.35318107604981, + "kl_loss_3": 1904.2030456542968, + "kl_loss_6": 1078.6033813476563, + "learning_rate": 3.0153689607045842e-05, + "loss": 844.5693, + "step": 8900 + }, + { + "ce_loss_12": 3.0073063373565674, + "ce_loss_17": 2.889033830165863, + "ce_loss_23": 2.841313922405243, + "ce_loss_3": 3.780926537513733, + "ce_loss_6": 3.350403296947479, + "epoch": 0.891, + "grad_norm": 1064.0, + "kl_loss_12": 345.90849609375, + "kl_loss_17": 87.65661125183105, + "kl_loss_3": 2050.542547607422, + "kl_loss_6": 1161.115560913086, + "learning_rate": 2.9613385499648926e-05, + "loss": 870.6569, + "step": 8910 + }, + { + "ce_loss_12": 3.0530014753341677, + "ce_loss_17": 2.9412503004074098, + "ce_loss_23": 2.8958068609237673, + "ce_loss_3": 3.7602118015289308, + "ce_loss_6": 3.3697606444358827, + "epoch": 0.892, + "grad_norm": 1112.0, + "kl_loss_12": 328.8154098510742, + "kl_loss_17": 84.93235816955567, + "kl_loss_3": 1880.8406860351563, + "kl_loss_6": 1080.6198760986329, + "learning_rate": 2.9077818180237692e-05, + "loss": 852.0275, + "step": 8920 + }, + { + "ce_loss_12": 3.0982262253761292, + "ce_loss_17": 2.9827243089675903, + "ce_loss_23": 2.9341540217399595, + "ce_loss_3": 3.8311649203300475, + "ce_loss_6": 3.420704758167267, + "epoch": 0.893, + "grad_norm": 1400.0, + "kl_loss_12": 332.6533721923828, + "kl_loss_17": 85.156884765625, + "kl_loss_3": 1921.9191345214845, + "kl_loss_6": 1094.7849243164062, + "learning_rate": 2.8546993041969172e-05, + "loss": 856.9773, + "step": 8930 + }, + { + "ce_loss_12": 3.1285369038581847, + "ce_loss_17": 3.0208025217056274, + "ce_loss_23": 2.9757667064666746, + "ce_loss_3": 3.8146532654762266, + "ce_loss_6": 3.428730511665344, + "epoch": 0.894, + "grad_norm": 892.0, + "kl_loss_12": 324.5311508178711, + "kl_loss_17": 83.1994400024414, + "kl_loss_3": 1870.1867980957031, + "kl_loss_6": 1062.970556640625, + "learning_rate": 2.802091543024671e-05, + "loss": 847.6863, + "step": 8940 + }, + { + "ce_loss_12": 3.1246097564697264, + "ce_loss_17": 3.013726496696472, + "ce_loss_23": 2.965451240539551, + "ce_loss_3": 3.8655041098594665, + "ce_loss_6": 3.450291562080383, + "epoch": 0.895, + "grad_norm": 1168.0, + "kl_loss_12": 333.080322265625, + "kl_loss_17": 85.56056671142578, + "kl_loss_3": 1969.126446533203, + "kl_loss_6": 1106.444076538086, + "learning_rate": 2.7499590642665774e-05, + "loss": 880.2635, + "step": 8950 + }, + { + "ce_loss_12": 3.143812417984009, + "ce_loss_17": 3.028483211994171, + "ce_loss_23": 2.985080027580261, + "ce_loss_3": 3.8509507298469545, + "ce_loss_6": 3.441773521900177, + "epoch": 0.896, + "grad_norm": 1224.0, + "kl_loss_12": 349.53120574951174, + "kl_loss_17": 86.6789764404297, + "kl_loss_3": 1895.8159729003905, + "kl_loss_6": 1070.7532867431642, + "learning_rate": 2.6983023928961405e-05, + "loss": 845.1069, + "step": 8960 + }, + { + "ce_loss_12": 3.106435704231262, + "ce_loss_17": 2.9932427644729613, + "ce_loss_23": 2.946383273601532, + "ce_loss_3": 3.8312835454940797, + "ce_loss_6": 3.4233412861824037, + "epoch": 0.897, + "grad_norm": 1144.0, + "kl_loss_12": 333.72619476318357, + "kl_loss_17": 85.79256935119629, + "kl_loss_3": 1905.6740844726562, + "kl_loss_6": 1080.7652252197265, + "learning_rate": 2.6471220490954628e-05, + "loss": 860.4385, + "step": 8970 + }, + { + "ce_loss_12": 3.0967092156410216, + "ce_loss_17": 2.991311955451965, + "ce_loss_23": 2.946649193763733, + "ce_loss_3": 3.8147808074951173, + "ce_loss_6": 3.400782382488251, + "epoch": 0.898, + "grad_norm": 1088.0, + "kl_loss_12": 326.99341583251953, + "kl_loss_17": 83.46413383483886, + "kl_loss_3": 1898.6377075195312, + "kl_loss_6": 1065.3228820800782, + "learning_rate": 2.596418548250029e-05, + "loss": 852.1803, + "step": 8980 + }, + { + "ce_loss_12": 3.1327340722084047, + "ce_loss_17": 3.021799325942993, + "ce_loss_23": 2.975428318977356, + "ce_loss_3": 3.8464512825012207, + "ce_loss_6": 3.4454381346702574, + "epoch": 0.899, + "grad_norm": 972.0, + "kl_loss_12": 335.007633972168, + "kl_loss_17": 87.43348655700683, + "kl_loss_3": 1915.6479431152343, + "kl_loss_6": 1085.7200561523437, + "learning_rate": 2.5461924009435368e-05, + "loss": 844.5648, + "step": 8990 + }, + { + "ce_loss_12": 3.129016411304474, + "ce_loss_17": 3.016228425502777, + "ce_loss_23": 2.971251893043518, + "ce_loss_3": 3.837902855873108, + "ce_loss_6": 3.4397639632225037, + "epoch": 0.9, + "grad_norm": 1072.0, + "kl_loss_12": 336.40973510742185, + "kl_loss_17": 86.76480216979981, + "kl_loss_3": 1904.9064086914063, + "kl_loss_6": 1085.8543060302734, + "learning_rate": 2.4964441129527336e-05, + "loss": 869.592, + "step": 9000 + }, + { + "ce_loss_12": 3.125925374031067, + "ce_loss_17": 3.0190950870513915, + "ce_loss_23": 2.9737316727638246, + "ce_loss_3": 3.812189018726349, + "ce_loss_6": 3.430092179775238, + "epoch": 0.901, + "grad_norm": 1128.0, + "kl_loss_12": 324.31306915283204, + "kl_loss_17": 83.48861198425293, + "kl_loss_3": 1863.2838806152345, + "kl_loss_6": 1058.821725463867, + "learning_rate": 2.4471741852423235e-05, + "loss": 838.7601, + "step": 9010 + }, + { + "ce_loss_12": 3.1805315017700195, + "ce_loss_17": 3.064588391780853, + "ce_loss_23": 3.018610644340515, + "ce_loss_3": 3.8909000754356384, + "ce_loss_6": 3.4915917038917543, + "epoch": 0.902, + "grad_norm": 904.0, + "kl_loss_12": 332.6283920288086, + "kl_loss_17": 86.11167373657227, + "kl_loss_3": 1886.189385986328, + "kl_loss_6": 1075.392333984375, + "learning_rate": 2.3983831139599287e-05, + "loss": 849.1871, + "step": 9020 + }, + { + "ce_loss_12": 3.09921315908432, + "ce_loss_17": 2.9914977669715883, + "ce_loss_23": 2.945024645328522, + "ce_loss_3": 3.813475775718689, + "ce_loss_6": 3.4056436419487, + "epoch": 0.903, + "grad_norm": 972.0, + "kl_loss_12": 320.38296661376955, + "kl_loss_17": 83.96120872497559, + "kl_loss_3": 1880.3851745605468, + "kl_loss_6": 1054.7029083251953, + "learning_rate": 2.3500713904311022e-05, + "loss": 828.3093, + "step": 9030 + }, + { + "ce_loss_12": 3.129648447036743, + "ce_loss_17": 3.0214686155319215, + "ce_loss_23": 2.977446722984314, + "ce_loss_3": 3.816241002082825, + "ce_loss_6": 3.427567720413208, + "epoch": 0.904, + "grad_norm": 1312.0, + "kl_loss_12": 320.1314926147461, + "kl_loss_17": 82.71114997863769, + "kl_loss_3": 1835.7401123046875, + "kl_loss_6": 1046.6744812011718, + "learning_rate": 2.3022395011543685e-05, + "loss": 827.5029, + "step": 9040 + }, + { + "ce_loss_12": 3.1652731776237486, + "ce_loss_17": 3.04982990026474, + "ce_loss_23": 3.0022929906845093, + "ce_loss_3": 3.8810678482055665, + "ce_loss_6": 3.483680856227875, + "epoch": 0.905, + "grad_norm": 1128.0, + "kl_loss_12": 341.3849838256836, + "kl_loss_17": 87.51144218444824, + "kl_loss_3": 1925.983953857422, + "kl_loss_6": 1097.1318237304688, + "learning_rate": 2.2548879277963063e-05, + "loss": 873.3892, + "step": 9050 + }, + { + "ce_loss_12": 3.081729865074158, + "ce_loss_17": 2.9729421496391297, + "ce_loss_23": 2.928011405467987, + "ce_loss_3": 3.7899386644363404, + "ce_loss_6": 3.3915660858154295, + "epoch": 0.906, + "grad_norm": 912.0, + "kl_loss_12": 327.93106536865236, + "kl_loss_17": 84.1146312713623, + "kl_loss_3": 1879.2633178710937, + "kl_loss_6": 1068.782211303711, + "learning_rate": 2.208017147186736e-05, + "loss": 826.4768, + "step": 9060 + }, + { + "ce_loss_12": 3.0779837131500245, + "ce_loss_17": 2.967180871963501, + "ce_loss_23": 2.920120894908905, + "ce_loss_3": 3.788610649108887, + "ce_loss_6": 3.3945882081985475, + "epoch": 0.907, + "grad_norm": 800.0, + "kl_loss_12": 328.81197052001954, + "kl_loss_17": 84.20720748901367, + "kl_loss_3": 1906.1780151367188, + "kl_loss_6": 1087.1554595947266, + "learning_rate": 2.1616276313139227e-05, + "loss": 847.2346, + "step": 9070 + }, + { + "ce_loss_12": 3.1198357820510862, + "ce_loss_17": 3.0084909439086913, + "ce_loss_23": 2.9599400043487547, + "ce_loss_3": 3.832070028781891, + "ce_loss_6": 3.4358290672302245, + "epoch": 0.908, + "grad_norm": 1224.0, + "kl_loss_12": 331.44759979248045, + "kl_loss_17": 85.02637100219727, + "kl_loss_3": 1910.9259521484375, + "kl_loss_6": 1087.1692504882812, + "learning_rate": 2.1157198473197415e-05, + "loss": 859.7826, + "step": 9080 + }, + { + "ce_loss_12": 3.170629847049713, + "ce_loss_17": 3.056658124923706, + "ce_loss_23": 3.0103936195373535, + "ce_loss_3": 3.8856051087379457, + "ce_loss_6": 3.4874077081680297, + "epoch": 0.909, + "grad_norm": 980.0, + "kl_loss_12": 339.3678176879883, + "kl_loss_17": 87.31046714782715, + "kl_loss_3": 1910.169189453125, + "kl_loss_6": 1094.2420532226563, + "learning_rate": 2.0702942574950812e-05, + "loss": 854.4599, + "step": 9090 + }, + { + "ce_loss_12": 3.1087508678436278, + "ce_loss_17": 2.9925315380096436, + "ce_loss_23": 2.944721484184265, + "ce_loss_3": 3.84104905128479, + "ce_loss_6": 3.4278690695762633, + "epoch": 0.91, + "grad_norm": 940.0, + "kl_loss_12": 340.95262756347654, + "kl_loss_17": 87.44319190979004, + "kl_loss_3": 1935.718896484375, + "kl_loss_6": 1098.7049499511718, + "learning_rate": 2.025351319275137e-05, + "loss": 857.4225, + "step": 9100 + }, + { + "ce_loss_12": 3.224490487575531, + "ce_loss_17": 3.110004448890686, + "ce_loss_23": 3.060851216316223, + "ce_loss_3": 3.9367117643356324, + "ce_loss_6": 3.543748676776886, + "epoch": 0.911, + "grad_norm": 1064.0, + "kl_loss_12": 344.0806182861328, + "kl_loss_17": 87.59589996337891, + "kl_loss_3": 1927.4166748046875, + "kl_loss_6": 1112.0118194580077, + "learning_rate": 1.9808914852347816e-05, + "loss": 878.4797, + "step": 9110 + }, + { + "ce_loss_12": 3.0762908935546873, + "ce_loss_17": 2.9619147539138795, + "ce_loss_23": 2.9152398824691774, + "ce_loss_3": 3.789369535446167, + "ce_loss_6": 3.395467746257782, + "epoch": 0.912, + "grad_norm": 784.0, + "kl_loss_12": 330.6476486206055, + "kl_loss_17": 85.57843284606933, + "kl_loss_3": 1885.4349487304687, + "kl_loss_6": 1078.397769165039, + "learning_rate": 1.9369152030840554e-05, + "loss": 846.1178, + "step": 9120 + }, + { + "ce_loss_12": 3.154605543613434, + "ce_loss_17": 3.046464478969574, + "ce_loss_23": 3.0007370710372925, + "ce_loss_3": 3.867127311229706, + "ce_loss_6": 3.4716737866401672, + "epoch": 0.913, + "grad_norm": 812.0, + "kl_loss_12": 333.79958190917966, + "kl_loss_17": 85.41837768554687, + "kl_loss_3": 1922.9818054199218, + "kl_loss_6": 1094.322232055664, + "learning_rate": 1.893422915663645e-05, + "loss": 857.3375, + "step": 9130 + }, + { + "ce_loss_12": 3.0396557331085203, + "ce_loss_17": 2.9230321049690247, + "ce_loss_23": 2.8751866221427917, + "ce_loss_3": 3.7878071665763855, + "ce_loss_6": 3.3714339256286623, + "epoch": 0.914, + "grad_norm": 1032.0, + "kl_loss_12": 337.8765075683594, + "kl_loss_17": 86.10670356750488, + "kl_loss_3": 1972.9100341796875, + "kl_loss_6": 1125.4221252441407, + "learning_rate": 1.850415060940386e-05, + "loss": 871.0162, + "step": 9140 + }, + { + "ce_loss_12": 3.1497557282447817, + "ce_loss_17": 3.0385369658470154, + "ce_loss_23": 2.9938205003738405, + "ce_loss_3": 3.8452399015426635, + "ce_loss_6": 3.45322071313858, + "epoch": 0.915, + "grad_norm": 1008.0, + "kl_loss_12": 332.32138671875, + "kl_loss_17": 84.9898525238037, + "kl_loss_3": 1874.5397216796875, + "kl_loss_6": 1070.864324951172, + "learning_rate": 1.8078920720028978e-05, + "loss": 848.5287, + "step": 9150 + }, + { + "ce_loss_12": 3.076209282875061, + "ce_loss_17": 2.9700810432434084, + "ce_loss_23": 2.926768720149994, + "ce_loss_3": 3.778885042667389, + "ce_loss_6": 3.385262680053711, + "epoch": 0.916, + "grad_norm": 1056.0, + "kl_loss_12": 321.25645141601564, + "kl_loss_17": 82.72433166503906, + "kl_loss_3": 1865.1188659667969, + "kl_loss_6": 1059.3348358154296, + "learning_rate": 1.765854377057219e-05, + "loss": 851.4663, + "step": 9160 + }, + { + "ce_loss_12": 3.051760160923004, + "ce_loss_17": 2.9470738410949706, + "ce_loss_23": 2.9018150329589845, + "ce_loss_3": 3.762616181373596, + "ce_loss_6": 3.360524618625641, + "epoch": 0.917, + "grad_norm": 1152.0, + "kl_loss_12": 322.6339859008789, + "kl_loss_17": 82.14870758056641, + "kl_loss_3": 1894.1626953125, + "kl_loss_6": 1062.8217315673828, + "learning_rate": 1.724302399422456e-05, + "loss": 849.0074, + "step": 9170 + }, + { + "ce_loss_12": 3.025525665283203, + "ce_loss_17": 2.9144426941871644, + "ce_loss_23": 2.8663503766059875, + "ce_loss_3": 3.747741138935089, + "ce_loss_6": 3.3477275729179383, + "epoch": 0.918, + "grad_norm": 1104.0, + "kl_loss_12": 337.24256134033203, + "kl_loss_17": 87.08244514465332, + "kl_loss_3": 1926.6942932128907, + "kl_loss_6": 1100.351480102539, + "learning_rate": 1.683236557526574e-05, + "loss": 858.5575, + "step": 9180 + }, + { + "ce_loss_12": 3.1238354563713076, + "ce_loss_17": 3.018564236164093, + "ce_loss_23": 2.9754155397415163, + "ce_loss_3": 3.81089985370636, + "ce_loss_6": 3.4238336086273193, + "epoch": 0.919, + "grad_norm": 832.0, + "kl_loss_12": 317.4216339111328, + "kl_loss_17": 82.28768424987793, + "kl_loss_3": 1827.3025207519531, + "kl_loss_6": 1032.5061828613282, + "learning_rate": 1.6426572649021475e-05, + "loss": 837.4054, + "step": 9190 + }, + { + "ce_loss_12": 3.159148633480072, + "ce_loss_17": 3.0522322416305543, + "ce_loss_23": 3.006185734272003, + "ce_loss_3": 3.827952432632446, + "ce_loss_6": 3.44946551322937, + "epoch": 0.92, + "grad_norm": 1440.0, + "kl_loss_12": 324.6236053466797, + "kl_loss_17": 85.08680648803711, + "kl_loss_3": 1835.1791931152343, + "kl_loss_6": 1047.3831878662108, + "learning_rate": 1.6025649301821876e-05, + "loss": 837.1028, + "step": 9200 + }, + { + "ce_loss_12": 3.1503032088279723, + "ce_loss_17": 3.0398178339004516, + "ce_loss_23": 2.9948899507522584, + "ce_loss_3": 3.8383944511413572, + "ce_loss_6": 3.4516194224357606, + "epoch": 0.921, + "grad_norm": 1096.0, + "kl_loss_12": 331.9905776977539, + "kl_loss_17": 85.62064247131347, + "kl_loss_3": 1859.518487548828, + "kl_loss_6": 1065.316110229492, + "learning_rate": 1.5629599570960716e-05, + "loss": 836.26, + "step": 9210 + }, + { + "ce_loss_12": 3.064479887485504, + "ce_loss_17": 2.9572617769241334, + "ce_loss_23": 2.9113405346870422, + "ce_loss_3": 3.78310467004776, + "ce_loss_6": 3.3747425436973573, + "epoch": 0.922, + "grad_norm": 976.0, + "kl_loss_12": 329.91296234130857, + "kl_loss_17": 84.50942268371583, + "kl_loss_3": 1919.332208251953, + "kl_loss_6": 1082.223892211914, + "learning_rate": 1.5238427444654367e-05, + "loss": 846.9709, + "step": 9220 + }, + { + "ce_loss_12": 3.1175356030464174, + "ce_loss_17": 3.004572319984436, + "ce_loss_23": 2.960529851913452, + "ce_loss_3": 3.8180633306503298, + "ce_loss_6": 3.421934986114502, + "epoch": 0.923, + "grad_norm": 940.0, + "kl_loss_12": 326.40318756103517, + "kl_loss_17": 84.79353713989258, + "kl_loss_3": 1871.1626403808593, + "kl_loss_6": 1054.7036376953124, + "learning_rate": 1.4852136862001764e-05, + "loss": 840.9615, + "step": 9230 + }, + { + "ce_loss_12": 3.087792432308197, + "ce_loss_17": 2.9745679259300233, + "ce_loss_23": 2.929926407337189, + "ce_loss_3": 3.7816343307495117, + "ce_loss_6": 3.394350898265839, + "epoch": 0.924, + "grad_norm": 884.0, + "kl_loss_12": 323.3223403930664, + "kl_loss_17": 82.07476539611817, + "kl_loss_3": 1860.5654235839843, + "kl_loss_6": 1062.8414367675782, + "learning_rate": 1.4470731712944884e-05, + "loss": 846.7811, + "step": 9240 + }, + { + "ce_loss_12": 3.1074899077415465, + "ce_loss_17": 2.995665502548218, + "ce_loss_23": 2.9461994886398317, + "ce_loss_3": 3.8176616072654723, + "ce_loss_6": 3.4217876195907593, + "epoch": 0.925, + "grad_norm": 948.0, + "kl_loss_12": 333.7071365356445, + "kl_loss_17": 85.56305122375488, + "kl_loss_3": 1878.6777465820312, + "kl_loss_6": 1069.3385070800782, + "learning_rate": 1.4094215838229174e-05, + "loss": 861.8266, + "step": 9250 + }, + { + "ce_loss_12": 3.085869753360748, + "ce_loss_17": 2.9727274417877196, + "ce_loss_23": 2.925786566734314, + "ce_loss_3": 3.807579004764557, + "ce_loss_6": 3.399830675125122, + "epoch": 0.926, + "grad_norm": 1136.0, + "kl_loss_12": 335.0611145019531, + "kl_loss_17": 85.76180076599121, + "kl_loss_3": 1930.875244140625, + "kl_loss_6": 1097.8640533447265, + "learning_rate": 1.372259302936546e-05, + "loss": 887.4711, + "step": 9260 + }, + { + "ce_loss_12": 3.185081458091736, + "ce_loss_17": 3.069750726222992, + "ce_loss_23": 3.0205004453659057, + "ce_loss_3": 3.8892650842666625, + "ce_loss_6": 3.492465448379517, + "epoch": 0.927, + "grad_norm": 792.0, + "kl_loss_12": 341.22171783447266, + "kl_loss_17": 89.73458824157714, + "kl_loss_3": 1897.4500854492187, + "kl_loss_6": 1078.6844818115235, + "learning_rate": 1.3355867028591206e-05, + "loss": 842.859, + "step": 9270 + }, + { + "ce_loss_12": 3.0859570026397707, + "ce_loss_17": 2.977510154247284, + "ce_loss_23": 2.934697890281677, + "ce_loss_3": 3.7636232137680055, + "ce_loss_6": 3.385506844520569, + "epoch": 0.928, + "grad_norm": 1400.0, + "kl_loss_12": 324.6181671142578, + "kl_loss_17": 83.21072921752929, + "kl_loss_3": 1841.8223083496093, + "kl_loss_6": 1056.7283020019531, + "learning_rate": 1.2994041528833267e-05, + "loss": 835.5336, + "step": 9280 + }, + { + "ce_loss_12": 3.0869906425476072, + "ce_loss_17": 2.977507305145264, + "ce_loss_23": 2.931162786483765, + "ce_loss_3": 3.7968642473220826, + "ce_loss_6": 3.3973227381706237, + "epoch": 0.929, + "grad_norm": 1504.0, + "kl_loss_12": 326.4042282104492, + "kl_loss_17": 82.84918251037598, + "kl_loss_3": 1895.9616149902345, + "kl_loss_6": 1075.722134399414, + "learning_rate": 1.2637120173670358e-05, + "loss": 839.5449, + "step": 9290 + }, + { + "ce_loss_12": 3.1100995540618896, + "ce_loss_17": 2.9969801187515257, + "ce_loss_23": 2.9494405508041384, + "ce_loss_3": 3.835551941394806, + "ce_loss_6": 3.431933581829071, + "epoch": 0.93, + "grad_norm": 1176.0, + "kl_loss_12": 332.53186492919923, + "kl_loss_17": 85.74410667419434, + "kl_loss_3": 1908.8519104003906, + "kl_loss_6": 1083.8726348876953, + "learning_rate": 1.2285106557296478e-05, + "loss": 844.5967, + "step": 9300 + }, + { + "ce_loss_12": 3.0027957916259767, + "ce_loss_17": 2.8903992056846617, + "ce_loss_23": 2.8444465160369874, + "ce_loss_3": 3.7717514753341677, + "ce_loss_6": 3.340221178531647, + "epoch": 0.931, + "grad_norm": 1008.0, + "kl_loss_12": 334.00975036621094, + "kl_loss_17": 84.80331535339356, + "kl_loss_3": 2001.4889343261718, + "kl_loss_6": 1126.0714202880858, + "learning_rate": 1.1938004224484989e-05, + "loss": 866.8414, + "step": 9310 + }, + { + "ce_loss_12": 3.2222105979919435, + "ce_loss_17": 3.111147093772888, + "ce_loss_23": 3.063632643222809, + "ce_loss_3": 3.917266309261322, + "ce_loss_6": 3.5249305725097657, + "epoch": 0.932, + "grad_norm": 1012.0, + "kl_loss_12": 334.32122497558595, + "kl_loss_17": 86.87853698730468, + "kl_loss_3": 1899.5992370605468, + "kl_loss_6": 1080.962240600586, + "learning_rate": 1.1595816670552429e-05, + "loss": 865.2732, + "step": 9320 + }, + { + "ce_loss_12": 3.1430585265159605, + "ce_loss_17": 3.0352956652641296, + "ce_loss_23": 2.9885062336921693, + "ce_loss_3": 3.8439278960227967, + "ce_loss_6": 3.447865056991577, + "epoch": 0.933, + "grad_norm": 980.0, + "kl_loss_12": 327.3254165649414, + "kl_loss_17": 86.49773216247559, + "kl_loss_3": 1866.330029296875, + "kl_loss_6": 1056.9034118652344, + "learning_rate": 1.1258547341323699e-05, + "loss": 834.5617, + "step": 9330 + }, + { + "ce_loss_12": 3.176737833023071, + "ce_loss_17": 3.0673033356666566, + "ce_loss_23": 3.018969011306763, + "ce_loss_3": 3.8674515962600706, + "ce_loss_6": 3.4846546173095705, + "epoch": 0.934, + "grad_norm": 884.0, + "kl_loss_12": 334.75340576171874, + "kl_loss_17": 85.98496704101562, + "kl_loss_3": 1884.177197265625, + "kl_loss_6": 1081.2376861572266, + "learning_rate": 1.0926199633097156e-05, + "loss": 845.4896, + "step": 9340 + }, + { + "ce_loss_12": 3.179938495159149, + "ce_loss_17": 3.074588692188263, + "ce_loss_23": 3.0303378343582152, + "ce_loss_3": 3.8442990660667418, + "ce_loss_6": 3.471651017665863, + "epoch": 0.935, + "grad_norm": 956.0, + "kl_loss_12": 319.90943603515626, + "kl_loss_17": 83.01764106750488, + "kl_loss_3": 1814.667059326172, + "kl_loss_6": 1041.4213287353516, + "learning_rate": 1.0598776892610684e-05, + "loss": 848.2934, + "step": 9350 + }, + { + "ce_loss_12": 3.009808611869812, + "ce_loss_17": 2.898372745513916, + "ce_loss_23": 2.854316532611847, + "ce_loss_3": 3.726544368267059, + "ce_loss_6": 3.32262647151947, + "epoch": 0.936, + "grad_norm": 916.0, + "kl_loss_12": 326.60680999755857, + "kl_loss_17": 83.0755630493164, + "kl_loss_3": 1924.3721801757813, + "kl_loss_6": 1085.0824829101562, + "learning_rate": 1.0276282417007399e-05, + "loss": 842.6063, + "step": 9360 + }, + { + "ce_loss_12": 3.1516371846199034, + "ce_loss_17": 3.0410276889801025, + "ce_loss_23": 2.996881353855133, + "ce_loss_3": 3.834710657596588, + "ce_loss_6": 3.450020658969879, + "epoch": 0.937, + "grad_norm": 1168.0, + "kl_loss_12": 323.99993896484375, + "kl_loss_17": 82.53963088989258, + "kl_loss_3": 1854.7433349609375, + "kl_loss_6": 1059.0154510498046, + "learning_rate": 9.958719453803277e-06, + "loss": 840.2551, + "step": 9370 + }, + { + "ce_loss_12": 3.1496444582939147, + "ce_loss_17": 3.0366398692131042, + "ce_loss_23": 2.989711654186249, + "ce_loss_3": 3.8697296500205995, + "ce_loss_6": 3.471941578388214, + "epoch": 0.938, + "grad_norm": 952.0, + "kl_loss_12": 331.8085479736328, + "kl_loss_17": 85.06403846740723, + "kl_loss_3": 1910.690576171875, + "kl_loss_6": 1096.4352661132812, + "learning_rate": 9.646091200853802e-06, + "loss": 847.9618, + "step": 9380 + }, + { + "ce_loss_12": 3.1037580490112306, + "ce_loss_17": 2.995933508872986, + "ce_loss_23": 2.9508240699768065, + "ce_loss_3": 3.8010664582252502, + "ce_loss_6": 3.4100754141807554, + "epoch": 0.939, + "grad_norm": 1088.0, + "kl_loss_12": 324.2424377441406, + "kl_loss_17": 83.03877677917481, + "kl_loss_3": 1846.7161865234375, + "kl_loss_6": 1062.0440032958984, + "learning_rate": 9.338400806321978e-06, + "loss": 819.034, + "step": 9390 + }, + { + "ce_loss_12": 3.1476006865501405, + "ce_loss_17": 3.033066248893738, + "ce_loss_23": 2.9848249673843386, + "ce_loss_3": 3.8408823251724242, + "ce_loss_6": 3.4523388862609865, + "epoch": 0.94, + "grad_norm": 908.0, + "kl_loss_12": 334.59171295166016, + "kl_loss_17": 86.80456924438477, + "kl_loss_3": 1869.5985900878907, + "kl_loss_6": 1072.3442169189452, + "learning_rate": 9.035651368646646e-06, + "loss": 836.1096, + "step": 9400 + }, + { + "ce_loss_12": 3.145501697063446, + "ce_loss_17": 3.03811719417572, + "ce_loss_23": 2.994671952724457, + "ce_loss_3": 3.834343147277832, + "ce_loss_6": 3.452866768836975, + "epoch": 0.941, + "grad_norm": 1072.0, + "kl_loss_12": 323.7934066772461, + "kl_loss_17": 83.24091529846191, + "kl_loss_3": 1857.227685546875, + "kl_loss_6": 1066.3185974121093, + "learning_rate": 8.737845936511335e-06, + "loss": 844.8186, + "step": 9410 + }, + { + "ce_loss_12": 3.098772418498993, + "ce_loss_17": 2.9861217737197876, + "ce_loss_23": 2.9370655179023744, + "ce_loss_3": 3.816584086418152, + "ce_loss_6": 3.4150121927261354, + "epoch": 0.942, + "grad_norm": 884.0, + "kl_loss_12": 331.8512329101562, + "kl_loss_17": 85.79718551635742, + "kl_loss_3": 1910.79208984375, + "kl_loss_6": 1083.8923706054688, + "learning_rate": 8.444987508813451e-06, + "loss": 842.9543, + "step": 9420 + }, + { + "ce_loss_12": 3.06053763628006, + "ce_loss_17": 2.9460514068603514, + "ce_loss_23": 2.9014501929283143, + "ce_loss_3": 3.802388143539429, + "ce_loss_6": 3.39177383184433, + "epoch": 0.943, + "grad_norm": 1256.0, + "kl_loss_12": 337.7682632446289, + "kl_loss_17": 85.87634010314942, + "kl_loss_3": 1981.4451171875, + "kl_loss_6": 1125.2985046386718, + "learning_rate": 8.157079034633974e-06, + "loss": 865.9455, + "step": 9430 + }, + { + "ce_loss_12": 3.049058198928833, + "ce_loss_17": 2.938900911808014, + "ce_loss_23": 2.8931046962738036, + "ce_loss_3": 3.7696901321411134, + "ce_loss_6": 3.3683017015457155, + "epoch": 0.944, + "grad_norm": 1248.0, + "kl_loss_12": 329.84632568359376, + "kl_loss_17": 83.97733726501465, + "kl_loss_3": 1931.3895690917968, + "kl_loss_6": 1095.4517517089844, + "learning_rate": 7.874123413208145e-06, + "loss": 849.1218, + "step": 9440 + }, + { + "ce_loss_12": 3.0315654158592222, + "ce_loss_17": 2.9158872246742247, + "ce_loss_23": 2.8700754761695864, + "ce_loss_3": 3.767153263092041, + "ce_loss_6": 3.3529996633529664, + "epoch": 0.945, + "grad_norm": 1056.0, + "kl_loss_12": 328.3330795288086, + "kl_loss_17": 83.33571510314941, + "kl_loss_3": 1934.7627197265624, + "kl_loss_6": 1086.8557250976562, + "learning_rate": 7.59612349389599e-06, + "loss": 859.2561, + "step": 9450 + }, + { + "ce_loss_12": 3.108149600028992, + "ce_loss_17": 3.000080704689026, + "ce_loss_23": 2.956249988079071, + "ce_loss_3": 3.7995969891548156, + "ce_loss_6": 3.4120611190795898, + "epoch": 0.946, + "grad_norm": 960.0, + "kl_loss_12": 320.4307144165039, + "kl_loss_17": 82.51799201965332, + "kl_loss_3": 1834.1530395507812, + "kl_loss_6": 1044.237774658203, + "learning_rate": 7.323082076153509e-06, + "loss": 834.5738, + "step": 9460 + }, + { + "ce_loss_12": 3.151083838939667, + "ce_loss_17": 3.0425574660301207, + "ce_loss_23": 2.9955819845199585, + "ce_loss_3": 3.8394291162490846, + "ce_loss_6": 3.4562087774276735, + "epoch": 0.947, + "grad_norm": 936.0, + "kl_loss_12": 333.69583892822266, + "kl_loss_17": 86.60095405578613, + "kl_loss_3": 1847.519189453125, + "kl_loss_6": 1059.3768951416016, + "learning_rate": 7.055001909504755e-06, + "loss": 850.0775, + "step": 9470 + }, + { + "ce_loss_12": 3.189653384685516, + "ce_loss_17": 3.0824284672737123, + "ce_loss_23": 3.0342490315437316, + "ce_loss_3": 3.8908013105392456, + "ce_loss_6": 3.494161105155945, + "epoch": 0.948, + "grad_norm": 1168.0, + "kl_loss_12": 329.65736083984376, + "kl_loss_17": 85.09084739685059, + "kl_loss_3": 1888.2302124023438, + "kl_loss_6": 1074.1681365966797, + "learning_rate": 6.791885693514133e-06, + "loss": 849.4602, + "step": 9480 + }, + { + "ce_loss_12": 3.09908903837204, + "ce_loss_17": 2.9901888966560364, + "ce_loss_23": 2.9434477210044863, + "ce_loss_3": 3.8163867354393006, + "ce_loss_6": 3.4153868079185488, + "epoch": 0.949, + "grad_norm": 1384.0, + "kl_loss_12": 330.6887634277344, + "kl_loss_17": 85.88697090148926, + "kl_loss_3": 1924.2292724609374, + "kl_loss_6": 1094.1362182617188, + "learning_rate": 6.533736077758867e-06, + "loss": 859.2564, + "step": 9490 + }, + { + "ce_loss_12": 3.0679742455482484, + "ce_loss_17": 2.953362250328064, + "ce_loss_23": 2.903950023651123, + "ce_loss_3": 3.8092184901237487, + "ce_loss_6": 3.3871954679489136, + "epoch": 0.95, + "grad_norm": 1224.0, + "kl_loss_12": 338.8248016357422, + "kl_loss_17": 86.66958503723144, + "kl_loss_3": 1991.7789733886718, + "kl_loss_6": 1120.8234252929688, + "learning_rate": 6.2805556618028556e-06, + "loss": 859.4271, + "step": 9500 + }, + { + "ce_loss_12": 3.1421627521514894, + "ce_loss_17": 3.037944483757019, + "ce_loss_23": 2.9918121695518494, + "ce_loss_3": 3.8194591641426086, + "ce_loss_6": 3.4336347937583924, + "epoch": 0.951, + "grad_norm": 1072.0, + "kl_loss_12": 315.54112854003904, + "kl_loss_17": 82.4297420501709, + "kl_loss_3": 1820.9282653808593, + "kl_loss_6": 1024.5725524902343, + "learning_rate": 6.032346995169968e-06, + "loss": 808.4711, + "step": 9510 + }, + { + "ce_loss_12": 3.1413091897964476, + "ce_loss_17": 3.034634304046631, + "ce_loss_23": 2.9904202222824097, + "ce_loss_3": 3.851827323436737, + "ce_loss_6": 3.446918821334839, + "epoch": 0.952, + "grad_norm": 1336.0, + "kl_loss_12": 327.0206527709961, + "kl_loss_17": 84.0272274017334, + "kl_loss_3": 1891.8768371582032, + "kl_loss_6": 1072.3389282226562, + "learning_rate": 5.789112577318789e-06, + "loss": 837.899, + "step": 9520 + }, + { + "ce_loss_12": 3.132501184940338, + "ce_loss_17": 3.020099472999573, + "ce_loss_23": 2.9733578681945803, + "ce_loss_3": 3.8476534247398377, + "ce_loss_6": 3.4466808438301086, + "epoch": 0.953, + "grad_norm": 808.0, + "kl_loss_12": 332.8140274047852, + "kl_loss_17": 86.41438598632813, + "kl_loss_3": 1920.221014404297, + "kl_loss_6": 1092.9197998046875, + "learning_rate": 5.550854857617194e-06, + "loss": 838.1862, + "step": 9530 + }, + { + "ce_loss_12": 3.1130447149276734, + "ce_loss_17": 3.000280570983887, + "ce_loss_23": 2.9520017981529234, + "ce_loss_3": 3.850728416442871, + "ce_loss_6": 3.4371416687965395, + "epoch": 0.954, + "grad_norm": 1072.0, + "kl_loss_12": 340.49346618652345, + "kl_loss_17": 87.64574966430663, + "kl_loss_3": 1954.3390808105469, + "kl_loss_6": 1107.7118560791016, + "learning_rate": 5.317576235317756e-06, + "loss": 863.4284, + "step": 9540 + }, + { + "ce_loss_12": 3.1355828166007997, + "ce_loss_17": 3.029031991958618, + "ce_loss_23": 2.9835096120834352, + "ce_loss_3": 3.8195814967155455, + "ce_loss_6": 3.436410915851593, + "epoch": 0.955, + "grad_norm": 1012.0, + "kl_loss_12": 320.99874114990234, + "kl_loss_17": 84.38487701416015, + "kl_loss_3": 1820.7837524414062, + "kl_loss_6": 1038.4941101074219, + "learning_rate": 5.089279059533658e-06, + "loss": 839.8436, + "step": 9550 + }, + { + "ce_loss_12": 3.1969587922096254, + "ce_loss_17": 3.0810975074768066, + "ce_loss_23": 3.030389428138733, + "ce_loss_3": 3.8857970118522642, + "ce_loss_6": 3.496463358402252, + "epoch": 0.956, + "grad_norm": 1160.0, + "kl_loss_12": 340.56368255615234, + "kl_loss_17": 88.42728080749512, + "kl_loss_3": 1877.9746032714843, + "kl_loss_6": 1079.9369049072266, + "learning_rate": 4.865965629214819e-06, + "loss": 840.1536, + "step": 9560 + }, + { + "ce_loss_12": 3.1409691333770753, + "ce_loss_17": 3.029647672176361, + "ce_loss_23": 2.9838614463806152, + "ce_loss_3": 3.8524129986763, + "ce_loss_6": 3.45097074508667, + "epoch": 0.957, + "grad_norm": 884.0, + "kl_loss_12": 334.6656768798828, + "kl_loss_17": 85.21070098876953, + "kl_loss_3": 1923.1624877929687, + "kl_loss_6": 1092.950177001953, + "learning_rate": 4.6476381931251366e-06, + "loss": 841.2354, + "step": 9570 + }, + { + "ce_loss_12": 3.126495099067688, + "ce_loss_17": 3.0162636756896974, + "ce_loss_23": 2.9702324748039244, + "ce_loss_3": 3.824518692493439, + "ce_loss_6": 3.4368972539901734, + "epoch": 0.958, + "grad_norm": 824.0, + "kl_loss_12": 323.58738861083987, + "kl_loss_17": 83.89055442810059, + "kl_loss_3": 1857.772998046875, + "kl_loss_6": 1060.3517059326173, + "learning_rate": 4.434298949819449e-06, + "loss": 835.967, + "step": 9580 + }, + { + "ce_loss_12": 3.0928184270858763, + "ce_loss_17": 2.9766878366470335, + "ce_loss_23": 2.929574990272522, + "ce_loss_3": 3.835130035877228, + "ce_loss_6": 3.425615167617798, + "epoch": 0.959, + "grad_norm": 872.0, + "kl_loss_12": 344.17464141845704, + "kl_loss_17": 90.14529151916504, + "kl_loss_3": 1993.1905578613282, + "kl_loss_6": 1141.2810089111329, + "learning_rate": 4.2259500476214406e-06, + "loss": 866.5919, + "step": 9590 + }, + { + "ce_loss_12": 3.07260400056839, + "ce_loss_17": 2.9639098167419435, + "ce_loss_23": 2.915282893180847, + "ce_loss_3": 3.7865395665168764, + "ce_loss_6": 3.3866448998451233, + "epoch": 0.96, + "grad_norm": 1208.0, + "kl_loss_12": 330.17433776855466, + "kl_loss_17": 85.21898574829102, + "kl_loss_3": 1919.545556640625, + "kl_loss_6": 1089.7303771972656, + "learning_rate": 4.02259358460233e-06, + "loss": 844.8156, + "step": 9600 + }, + { + "ce_loss_12": 3.134173059463501, + "ce_loss_17": 3.02146919965744, + "ce_loss_23": 2.9746540904045107, + "ce_loss_3": 3.835234320163727, + "ce_loss_6": 3.444248282909393, + "epoch": 0.961, + "grad_norm": 896.0, + "kl_loss_12": 329.57266693115236, + "kl_loss_17": 85.9197624206543, + "kl_loss_3": 1850.4071044921875, + "kl_loss_6": 1055.000308227539, + "learning_rate": 3.8242316085594916e-06, + "loss": 833.8258, + "step": 9610 + }, + { + "ce_loss_12": 3.028651475906372, + "ce_loss_17": 2.9132359266281127, + "ce_loss_23": 2.864736044406891, + "ce_loss_3": 3.7799143195152283, + "ce_loss_6": 3.3598650455474854, + "epoch": 0.962, + "grad_norm": 1112.0, + "kl_loss_12": 335.8763122558594, + "kl_loss_17": 87.01072463989257, + "kl_loss_3": 1995.7976135253907, + "kl_loss_6": 1126.1893524169923, + "learning_rate": 3.630866116995757e-06, + "loss": 873.6197, + "step": 9620 + }, + { + "ce_loss_12": 3.167985463142395, + "ce_loss_17": 3.0596530079841613, + "ce_loss_23": 3.0135764598846437, + "ce_loss_3": 3.8555119752883913, + "ce_loss_6": 3.4640034198760987, + "epoch": 0.963, + "grad_norm": 972.0, + "kl_loss_12": 323.5938186645508, + "kl_loss_17": 83.68088264465332, + "kl_loss_3": 1853.1992553710938, + "kl_loss_6": 1047.2032257080077, + "learning_rate": 3.4424990570994797e-06, + "loss": 848.4127, + "step": 9630 + }, + { + "ce_loss_12": 3.158530128002167, + "ce_loss_17": 3.0523101449012757, + "ce_loss_23": 3.005872070789337, + "ce_loss_3": 3.8581893920898436, + "ce_loss_6": 3.4666857361793517, + "epoch": 0.964, + "grad_norm": 1056.0, + "kl_loss_12": 325.6300765991211, + "kl_loss_17": 84.23550529479981, + "kl_loss_3": 1884.3258483886718, + "kl_loss_6": 1070.839437866211, + "learning_rate": 3.2591323257248896e-06, + "loss": 842.3455, + "step": 9640 + }, + { + "ce_loss_12": 3.0171950697898864, + "ce_loss_17": 2.907667005062103, + "ce_loss_23": 2.860476005077362, + "ce_loss_3": 3.7420357704162597, + "ce_loss_6": 3.3383049249649046, + "epoch": 0.965, + "grad_norm": 1168.0, + "kl_loss_12": 326.80652008056643, + "kl_loss_17": 84.05732383728028, + "kl_loss_3": 1914.0955505371094, + "kl_loss_6": 1086.868325805664, + "learning_rate": 3.0807677693729385e-06, + "loss": 858.0629, + "step": 9650 + }, + { + "ce_loss_12": 3.1908772110939028, + "ce_loss_17": 3.0809871912002564, + "ce_loss_23": 3.0360643744468687, + "ce_loss_3": 3.8822084307670592, + "ce_loss_6": 3.501388466358185, + "epoch": 0.966, + "grad_norm": 1008.0, + "kl_loss_12": 325.91085662841795, + "kl_loss_17": 84.52700004577636, + "kl_loss_3": 1853.4956420898438, + "kl_loss_6": 1064.3986785888671, + "learning_rate": 2.9074071841727055e-06, + "loss": 828.0664, + "step": 9660 + }, + { + "ce_loss_12": 3.12662855386734, + "ce_loss_17": 3.0138697266578673, + "ce_loss_23": 2.968185830116272, + "ce_loss_3": 3.82765930891037, + "ce_loss_6": 3.4405104398727415, + "epoch": 0.967, + "grad_norm": 1624.0, + "kl_loss_12": 328.47566375732424, + "kl_loss_17": 84.35643615722657, + "kl_loss_3": 1888.4803894042968, + "kl_loss_6": 1080.2502044677735, + "learning_rate": 2.739052315863355e-06, + "loss": 826.3218, + "step": 9670 + }, + { + "ce_loss_12": 3.0976916670799257, + "ce_loss_17": 2.990750730037689, + "ce_loss_23": 2.9455241322517396, + "ce_loss_3": 3.8131043910980225, + "ce_loss_6": 3.404400360584259, + "epoch": 0.968, + "grad_norm": 1272.0, + "kl_loss_12": 327.33734436035155, + "kl_loss_17": 84.63534660339356, + "kl_loss_3": 1917.4770935058593, + "kl_loss_6": 1077.3879791259765, + "learning_rate": 2.5757048597765396e-06, + "loss": 839.4121, + "step": 9680 + }, + { + "ce_loss_12": 3.1200202107429504, + "ce_loss_17": 3.008747708797455, + "ce_loss_23": 2.9609153032302857, + "ce_loss_3": 3.8308356046676635, + "ce_loss_6": 3.435415732860565, + "epoch": 0.969, + "grad_norm": 1096.0, + "kl_loss_12": 331.40294494628904, + "kl_loss_17": 84.86789970397949, + "kl_loss_3": 1902.2723022460937, + "kl_loss_6": 1090.882376098633, + "learning_rate": 2.417366460819359e-06, + "loss": 848.7949, + "step": 9690 + }, + { + "ce_loss_12": 3.134627604484558, + "ce_loss_17": 3.0217684507369995, + "ce_loss_23": 2.9727313041687013, + "ce_loss_3": 3.8631083250045775, + "ce_loss_6": 3.451991927623749, + "epoch": 0.97, + "grad_norm": 1104.0, + "kl_loss_12": 336.93389892578125, + "kl_loss_17": 87.41482429504394, + "kl_loss_3": 1944.5840698242187, + "kl_loss_6": 1095.5060607910157, + "learning_rate": 2.2640387134577057e-06, + "loss": 846.1895, + "step": 9700 + }, + { + "ce_loss_12": 3.0515653133392333, + "ce_loss_17": 2.9476856708526613, + "ce_loss_23": 2.9044357180595397, + "ce_loss_3": 3.7333238244056703, + "ce_loss_6": 3.3488406419754027, + "epoch": 0.971, + "grad_norm": 884.0, + "kl_loss_12": 313.5850357055664, + "kl_loss_17": 80.26754531860351, + "kl_loss_3": 1804.5651916503907, + "kl_loss_6": 1024.8025848388672, + "learning_rate": 2.115723161700278e-06, + "loss": 822.8891, + "step": 9710 + }, + { + "ce_loss_12": 3.0469194531440733, + "ce_loss_17": 2.931409239768982, + "ce_loss_23": 2.8847802877426147, + "ce_loss_3": 3.774531900882721, + "ce_loss_6": 3.364378833770752, + "epoch": 0.972, + "grad_norm": 1032.0, + "kl_loss_12": 338.0134704589844, + "kl_loss_17": 87.07059669494629, + "kl_loss_3": 1949.6646728515625, + "kl_loss_6": 1105.4224090576172, + "learning_rate": 1.9724212990830937e-06, + "loss": 862.8866, + "step": 9720 + }, + { + "ce_loss_12": 3.176584577560425, + "ce_loss_17": 3.0638570547103883, + "ce_loss_23": 3.0177934408187865, + "ce_loss_3": 3.896821069717407, + "ce_loss_6": 3.4962318897247315, + "epoch": 0.973, + "grad_norm": 1392.0, + "kl_loss_12": 336.9397857666016, + "kl_loss_17": 85.75758895874023, + "kl_loss_3": 1931.8845520019531, + "kl_loss_6": 1098.299368286133, + "learning_rate": 1.8341345686543331e-06, + "loss": 854.241, + "step": 9730 + }, + { + "ce_loss_12": 3.153524124622345, + "ce_loss_17": 3.0473845958709718, + "ce_loss_23": 3.0015310406684876, + "ce_loss_3": 3.8341501355171204, + "ce_loss_6": 3.457658517360687, + "epoch": 0.974, + "grad_norm": 1400.0, + "kl_loss_12": 324.8598434448242, + "kl_loss_17": 84.20447654724121, + "kl_loss_3": 1832.1616271972657, + "kl_loss_6": 1051.3619445800782, + "learning_rate": 1.7008643629596864e-06, + "loss": 851.5814, + "step": 9740 + }, + { + "ce_loss_12": 3.1431527733802795, + "ce_loss_17": 3.0320016860961916, + "ce_loss_23": 2.9844090819358824, + "ce_loss_3": 3.8373767614364622, + "ce_loss_6": 3.4459742307662964, + "epoch": 0.975, + "grad_norm": 860.0, + "kl_loss_12": 328.33545074462893, + "kl_loss_17": 84.7749095916748, + "kl_loss_3": 1882.492529296875, + "kl_loss_6": 1063.1302673339844, + "learning_rate": 1.5726120240288633e-06, + "loss": 852.7666, + "step": 9750 + }, + { + "ce_loss_12": 3.0448407530784607, + "ce_loss_17": 2.940678894519806, + "ce_loss_23": 2.8972470283508303, + "ce_loss_3": 3.7465156078338624, + "ce_loss_6": 3.3575788140296936, + "epoch": 0.976, + "grad_norm": 972.0, + "kl_loss_12": 327.9582092285156, + "kl_loss_17": 83.42790260314942, + "kl_loss_3": 1890.5948608398437, + "kl_loss_6": 1073.0351593017579, + "learning_rate": 1.4493788433612708e-06, + "loss": 840.3595, + "step": 9760 + }, + { + "ce_loss_12": 3.1668558955192565, + "ce_loss_17": 3.05363472700119, + "ce_loss_23": 3.0063628435134886, + "ce_loss_3": 3.8779948472976686, + "ce_loss_6": 3.4830434560775756, + "epoch": 0.977, + "grad_norm": 792.0, + "kl_loss_12": 333.98754425048827, + "kl_loss_17": 85.3881763458252, + "kl_loss_3": 1917.9533081054688, + "kl_loss_6": 1089.0577514648437, + "learning_rate": 1.3311660619138578e-06, + "loss": 855.6904, + "step": 9770 + }, + { + "ce_loss_12": 3.160209369659424, + "ce_loss_17": 3.051060974597931, + "ce_loss_23": 3.005156457424164, + "ce_loss_3": 3.82720787525177, + "ce_loss_6": 3.4543066024780273, + "epoch": 0.978, + "grad_norm": 776.0, + "kl_loss_12": 326.8641891479492, + "kl_loss_17": 84.59445114135742, + "kl_loss_3": 1810.7693969726563, + "kl_loss_6": 1046.252163696289, + "learning_rate": 1.2179748700879012e-06, + "loss": 837.7814, + "step": 9780 + }, + { + "ce_loss_12": 3.093959939479828, + "ce_loss_17": 2.9830276489257814, + "ce_loss_23": 2.9352121472358705, + "ce_loss_3": 3.794635844230652, + "ce_loss_6": 3.4019384741783143, + "epoch": 0.979, + "grad_norm": 1352.0, + "kl_loss_12": 324.03418731689453, + "kl_loss_17": 83.56771278381348, + "kl_loss_3": 1865.2319885253905, + "kl_loss_6": 1053.4732147216796, + "learning_rate": 1.1098064077174619e-06, + "loss": 840.0145, + "step": 9790 + }, + { + "ce_loss_12": 3.1171369314193726, + "ce_loss_17": 3.0069626808166503, + "ce_loss_23": 2.960393154621124, + "ce_loss_3": 3.846533477306366, + "ce_loss_6": 3.4357937932014466, + "epoch": 0.98, + "grad_norm": 992.0, + "kl_loss_12": 327.81168365478516, + "kl_loss_17": 84.16520195007324, + "kl_loss_3": 1931.3702514648437, + "kl_loss_6": 1085.5306610107423, + "learning_rate": 1.006661764057837e-06, + "loss": 848.5051, + "step": 9800 + }, + { + "ce_loss_12": 3.124183714389801, + "ce_loss_17": 3.016022801399231, + "ce_loss_23": 2.9706171631813048, + "ce_loss_3": 3.83089302778244, + "ce_loss_6": 3.4350167989730833, + "epoch": 0.981, + "grad_norm": 1024.0, + "kl_loss_12": 325.91871337890626, + "kl_loss_17": 84.2316764831543, + "kl_loss_3": 1892.4080688476563, + "kl_loss_6": 1070.907534790039, + "learning_rate": 9.085419777743465e-07, + "loss": 837.4656, + "step": 9810 + }, + { + "ce_loss_12": 3.0744644165039063, + "ce_loss_17": 2.969471788406372, + "ce_loss_23": 2.9252761244773864, + "ce_loss_3": 3.7865662932395936, + "ce_loss_6": 3.387745177745819, + "epoch": 0.982, + "grad_norm": 956.0, + "kl_loss_12": 323.53853302001954, + "kl_loss_17": 81.95206718444824, + "kl_loss_3": 1891.0988708496093, + "kl_loss_6": 1076.6217407226563, + "learning_rate": 8.15448036932176e-07, + "loss": 828.8999, + "step": 9820 + }, + { + "ce_loss_12": 3.1218610644340514, + "ce_loss_17": 3.0108360290527343, + "ce_loss_23": 2.9639970779418947, + "ce_loss_3": 3.821909260749817, + "ce_loss_6": 3.4302916169166564, + "epoch": 0.983, + "grad_norm": 968.0, + "kl_loss_12": 328.03336334228516, + "kl_loss_17": 83.96088600158691, + "kl_loss_3": 1896.133447265625, + "kl_loss_6": 1085.7091247558594, + "learning_rate": 7.273808789862724e-07, + "loss": 854.9318, + "step": 9830 + }, + { + "ce_loss_12": 3.188025426864624, + "ce_loss_17": 3.0801629781723023, + "ce_loss_23": 3.0319138884544374, + "ce_loss_3": 3.8802616000175476, + "ce_loss_6": 3.491058576107025, + "epoch": 0.984, + "grad_norm": 812.0, + "kl_loss_12": 330.5854736328125, + "kl_loss_17": 85.91607627868652, + "kl_loss_3": 1877.5823303222655, + "kl_loss_6": 1070.721142578125, + "learning_rate": 6.443413907720186e-07, + "loss": 838.1486, + "step": 9840 + }, + { + "ce_loss_12": 3.128363573551178, + "ce_loss_17": 3.0203511357307433, + "ce_loss_23": 2.975878059864044, + "ce_loss_3": 3.831292140483856, + "ce_loss_6": 3.431589329242706, + "epoch": 0.985, + "grad_norm": 1096.0, + "kl_loss_12": 326.2767532348633, + "kl_loss_17": 84.12903938293456, + "kl_loss_3": 1856.6162231445312, + "kl_loss_6": 1048.189892578125, + "learning_rate": 5.663304084960185e-07, + "loss": 833.4552, + "step": 9850 + }, + { + "ce_loss_12": 3.0610152006149294, + "ce_loss_17": 2.955245614051819, + "ce_loss_23": 2.9065255403518675, + "ce_loss_3": 3.7786110639572144, + "ce_loss_6": 3.3778342723846437, + "epoch": 0.986, + "grad_norm": 1424.0, + "kl_loss_12": 330.9642135620117, + "kl_loss_17": 87.89242706298828, + "kl_loss_3": 1924.0705871582031, + "kl_loss_6": 1085.759259033203, + "learning_rate": 4.933487177280482e-07, + "loss": 835.5607, + "step": 9860 + }, + { + "ce_loss_12": 3.1539594650268556, + "ce_loss_17": 3.048413944244385, + "ce_loss_23": 3.004293072223663, + "ce_loss_3": 3.8455660343170166, + "ce_loss_6": 3.4557252764701842, + "epoch": 0.987, + "grad_norm": 1072.0, + "kl_loss_12": 320.58961791992186, + "kl_loss_17": 82.36095275878907, + "kl_loss_3": 1859.45830078125, + "kl_loss_6": 1056.7829376220702, + "learning_rate": 4.2539705339295075e-07, + "loss": 828.8172, + "step": 9870 + }, + { + "ce_loss_12": 3.0203701496124267, + "ce_loss_17": 2.909262204170227, + "ce_loss_23": 2.8623374581336973, + "ce_loss_3": 3.741202163696289, + "ce_loss_6": 3.3373022198677065, + "epoch": 0.988, + "grad_norm": 904.0, + "kl_loss_12": 334.73585510253906, + "kl_loss_17": 83.74605751037598, + "kl_loss_3": 1923.2291320800782, + "kl_loss_6": 1087.5317352294921, + "learning_rate": 3.6247609976319816e-07, + "loss": 841.1823, + "step": 9880 + }, + { + "ce_loss_12": 3.1118029594421386, + "ce_loss_17": 2.9975714921951293, + "ce_loss_23": 2.9505231618881225, + "ce_loss_3": 3.838503098487854, + "ce_loss_6": 3.4238746047019957, + "epoch": 0.989, + "grad_norm": 1248.0, + "kl_loss_12": 330.36513671875, + "kl_loss_17": 85.91090202331543, + "kl_loss_3": 1926.5442749023437, + "kl_loss_6": 1087.4236846923827, + "learning_rate": 3.0458649045211895e-07, + "loss": 863.6676, + "step": 9890 + }, + { + "ce_loss_12": 3.079368329048157, + "ce_loss_17": 2.9630495429039003, + "ce_loss_23": 2.914914536476135, + "ce_loss_3": 3.8031314492225645, + "ce_loss_6": 3.4066123366355896, + "epoch": 0.99, + "grad_norm": 1020.0, + "kl_loss_12": 335.53031463623046, + "kl_loss_17": 86.8194580078125, + "kl_loss_3": 1906.4886291503906, + "kl_loss_6": 1089.162533569336, + "learning_rate": 2.517288084074587e-07, + "loss": 859.5178, + "step": 9900 + }, + { + "ce_loss_12": 3.1262808203697205, + "ce_loss_17": 3.0067719340324404, + "ce_loss_23": 2.9572256088256834, + "ce_loss_3": 3.8658326506614684, + "ce_loss_6": 3.4577017426490784, + "epoch": 0.991, + "grad_norm": 1020.0, + "kl_loss_12": 345.738835144043, + "kl_loss_17": 87.76178359985352, + "kl_loss_3": 1967.6563049316405, + "kl_loss_6": 1125.1915313720704, + "learning_rate": 2.0390358590538505e-07, + "loss": 862.3332, + "step": 9910 + }, + { + "ce_loss_12": 3.121526575088501, + "ce_loss_17": 3.0088306665420532, + "ce_loss_23": 2.961964523792267, + "ce_loss_3": 3.8342851638793944, + "ce_loss_6": 3.434750163555145, + "epoch": 0.992, + "grad_norm": 1128.0, + "kl_loss_12": 335.33690185546874, + "kl_loss_17": 85.0649803161621, + "kl_loss_3": 1908.5850769042968, + "kl_loss_6": 1088.622216796875, + "learning_rate": 1.61111304545436e-07, + "loss": 841.5836, + "step": 9920 + }, + { + "ce_loss_12": 3.093986284732819, + "ce_loss_17": 2.98314254283905, + "ce_loss_23": 2.9363115191459657, + "ce_loss_3": 3.7975124955177306, + "ce_loss_6": 3.4003641247749328, + "epoch": 0.993, + "grad_norm": 972.0, + "kl_loss_12": 327.34204864501953, + "kl_loss_17": 84.11604232788086, + "kl_loss_3": 1882.315252685547, + "kl_loss_6": 1068.8870910644532, + "learning_rate": 1.2335239524541298e-07, + "loss": 831.5901, + "step": 9930 + }, + { + "ce_loss_12": 3.062349224090576, + "ce_loss_17": 2.949763739109039, + "ce_loss_23": 2.9034113764762877, + "ce_loss_3": 3.7702368974685667, + "ce_loss_6": 3.3707385420799256, + "epoch": 0.994, + "grad_norm": 944.0, + "kl_loss_12": 328.5026824951172, + "kl_loss_17": 84.36543121337891, + "kl_loss_3": 1878.0137756347656, + "kl_loss_6": 1060.1397888183594, + "learning_rate": 9.06272382371065e-08, + "loss": 841.7564, + "step": 9940 + }, + { + "ce_loss_12": 3.1279003024101257, + "ce_loss_17": 3.017787003517151, + "ce_loss_23": 2.973386991024017, + "ce_loss_3": 3.852622413635254, + "ce_loss_6": 3.445349323749542, + "epoch": 0.995, + "grad_norm": 996.0, + "kl_loss_12": 335.48767242431643, + "kl_loss_17": 85.91932830810546, + "kl_loss_3": 1929.320849609375, + "kl_loss_6": 1100.2056854248046, + "learning_rate": 6.293616306246586e-08, + "loss": 850.2181, + "step": 9950 + }, + { + "ce_loss_12": 3.115555453300476, + "ce_loss_17": 3.0102423310279844, + "ce_loss_23": 2.9651247382164003, + "ce_loss_3": 3.7982796311378477, + "ce_loss_6": 3.4162571668624877, + "epoch": 0.996, + "grad_norm": 1224.0, + "kl_loss_12": 320.39947814941405, + "kl_loss_17": 81.87819061279296, + "kl_loss_3": 1844.3757263183593, + "kl_loss_6": 1054.789862060547, + "learning_rate": 4.027944857032395e-08, + "loss": 816.7188, + "step": 9960 + }, + { + "ce_loss_12": 3.1120410203933715, + "ce_loss_17": 3.010934889316559, + "ce_loss_23": 2.9690916180610656, + "ce_loss_3": 3.7765786409378053, + "ce_loss_6": 3.4030073404312136, + "epoch": 0.997, + "grad_norm": 972.0, + "kl_loss_12": 308.8306060791016, + "kl_loss_17": 80.754195022583, + "kl_loss_3": 1784.6604797363282, + "kl_loss_6": 1010.75185546875, + "learning_rate": 2.265732291356626e-08, + "loss": 806.2186, + "step": 9970 + }, + { + "ce_loss_12": 3.1557921767234802, + "ce_loss_17": 3.050547778606415, + "ce_loss_23": 3.00582869052887, + "ce_loss_3": 3.840213453769684, + "ce_loss_6": 3.453501784801483, + "epoch": 0.998, + "grad_norm": 776.0, + "kl_loss_12": 326.1690475463867, + "kl_loss_17": 83.65138893127441, + "kl_loss_3": 1832.3079162597655, + "kl_loss_6": 1048.4389709472657, + "learning_rate": 1.0069963546743833e-08, + "loss": 843.8186, + "step": 9980 + }, + { + "ce_loss_12": 3.1398675203323365, + "ce_loss_17": 3.0271052956581115, + "ce_loss_23": 2.9797831535339356, + "ce_loss_3": 3.8401438117027284, + "ce_loss_6": 3.447658562660217, + "epoch": 0.999, + "grad_norm": 1192.0, + "kl_loss_12": 331.73143615722654, + "kl_loss_17": 85.19338836669922, + "kl_loss_3": 1888.2928466796875, + "kl_loss_6": 1073.5358764648438, + "learning_rate": 2.517497224463483e-09, + "loss": 838.4053, + "step": 9990 + }, + { + "ce_loss_12": 3.0968006253242493, + "ce_loss_17": 2.980567467212677, + "ce_loss_23": 2.933296859264374, + "ce_loss_3": 3.8451858282089235, + "ce_loss_6": 3.42595511674881, + "epoch": 1.0, + "grad_norm": 1136.0, + "kl_loss_12": 338.3313674926758, + "kl_loss_17": 86.15568351745605, + "kl_loss_3": 1987.994921875, + "kl_loss_6": 1112.111392211914, + "learning_rate": 0.0, + "loss": 866.3291, + "step": 10000 + } + ], + "logging_steps": 10, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.502582338838856e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}