{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "ce_loss_10": 5.4414660930633545, "ce_loss_13": 3.55124568939209, "ce_loss_2": 10.093525886535645, "ce_loss_3": 10.20274829864502, "ce_loss_7": 6.342690944671631, "epoch": 0.0001, "grad_norm": 82944.0, "kl_loss_10": 4861.189697265625, "kl_loss_2": 13442.09228515625, "kl_loss_3": 13762.11669921875, "kl_loss_7": 6417.250732421875, "learning_rate": 1e-05, "loss": 9493.6807, "step": 1 }, { "ce_loss_10": 4.848646534813775, "ce_loss_13": 3.607477717929416, "ce_loss_2": 7.882981989118788, "ce_loss_3": 7.632776339848836, "ce_loss_7": 5.543010658688015, "epoch": 0.001, "grad_norm": 16320.0, "kl_loss_10": 3194.2076212565103, "kl_loss_2": 8431.235649956598, "kl_loss_3": 7977.491048177083, "kl_loss_7": 4098.4466552734375, "learning_rate": 0.0001, "loss": 5931.684, "step": 10 }, { "ce_loss_10": 4.192340469360351, "ce_loss_13": 3.6121856927871705, "ce_loss_2": 6.407724976539612, "ce_loss_3": 6.110695123672485, "ce_loss_7": 4.697946071624756, "epoch": 0.002, "grad_norm": 3728.0, "kl_loss_10": 1117.3084686279296, "kl_loss_2": 5130.931079101562, "kl_loss_3": 4576.346203613281, "kl_loss_7": 1968.5842712402343, "learning_rate": 0.0002, "loss": 3243.0105, "step": 20 }, { "ce_loss_10": 3.739894223213196, "ce_loss_13": 3.402119219303131, "ce_loss_2": 5.830984497070313, "ce_loss_3": 5.5242253065109255, "ce_loss_7": 4.216808533668518, "epoch": 0.003, "grad_norm": 3248.0, "kl_loss_10": 587.9037643432617, "kl_loss_2": 4467.688024902343, "kl_loss_3": 3901.3805297851563, "kl_loss_7": 1468.5001342773437, "learning_rate": 0.0003, "loss": 2558.6922, "step": 30 }, { "ce_loss_10": 3.81227308511734, "ce_loss_13": 3.5722511053085326, "ce_loss_2": 5.609953355789185, "ce_loss_3": 5.32381751537323, "ce_loss_7": 4.264926481246948, "epoch": 0.004, "grad_norm": 3728.0, "kl_loss_10": 435.9330429077148, "kl_loss_2": 3794.2421020507813, "kl_loss_3": 3267.354052734375, "kl_loss_7": 1256.7973205566407, "learning_rate": 0.0004, "loss": 2202.6764, "step": 40 }, { "ce_loss_10": 3.764557671546936, "ce_loss_13": 3.5534241557121278, "ce_loss_2": 5.45177059173584, "ce_loss_3": 5.161038231849671, "ce_loss_7": 4.168226671218872, "epoch": 0.005, "grad_norm": 3792.0, "kl_loss_10": 366.2504425048828, "kl_loss_2": 3568.579138183594, "kl_loss_3": 3041.7162841796876, "kl_loss_7": 1149.994189453125, "learning_rate": 0.0005, "loss": 2027.2316, "step": 50 }, { "ce_loss_10": 3.730236518383026, "ce_loss_13": 3.558177888393402, "ce_loss_2": 5.382890605926514, "ce_loss_3": 5.067366886138916, "ce_loss_7": 4.121474468708039, "epoch": 0.006, "grad_norm": 3568.0, "kl_loss_10": 310.650944519043, "kl_loss_2": 3436.02744140625, "kl_loss_3": 2842.05537109375, "kl_loss_7": 1025.8912414550782, "learning_rate": 0.0006, "loss": 1907.7687, "step": 60 }, { "ce_loss_10": 3.624938952922821, "ce_loss_13": 3.4783427357673644, "ce_loss_2": 5.254939460754395, "ce_loss_3": 4.91642997264862, "ce_loss_7": 4.001599073410034, "epoch": 0.007, "grad_norm": 2608.0, "kl_loss_10": 279.52313232421875, "kl_loss_2": 3379.3945068359376, "kl_loss_3": 2768.695068359375, "kl_loss_7": 976.9540100097656, "learning_rate": 0.0007, "loss": 1837.0203, "step": 70 }, { "ce_loss_10": 3.66162428855896, "ce_loss_13": 3.4794311046600344, "ce_loss_2": 5.186796355247497, "ce_loss_3": 4.870996403694153, "ce_loss_7": 3.9829494953155518, "epoch": 0.008, "grad_norm": 3440.0, "kl_loss_10": 374.0259613037109, "kl_loss_2": 3257.638916015625, "kl_loss_3": 2671.052526855469, "kl_loss_7": 932.288217163086, "learning_rate": 0.0008, "loss": 1809.8037, "step": 80 }, { "ce_loss_10": 3.615346169471741, "ce_loss_13": 3.436742055416107, "ce_loss_2": 5.1184669733047485, "ce_loss_3": 4.824533867835998, "ce_loss_7": 3.919069004058838, "epoch": 0.009, "grad_norm": 3264.0, "kl_loss_10": 328.0901245117187, "kl_loss_2": 3214.366650390625, "kl_loss_3": 2668.0147705078125, "kl_loss_7": 928.7298736572266, "learning_rate": 0.0009000000000000001, "loss": 1767.7828, "step": 90 }, { "ce_loss_10": 3.719766116142273, "ce_loss_13": 3.5628265142440796, "ce_loss_2": 5.176266622543335, "ce_loss_3": 4.8936989307403564, "ce_loss_7": 4.069474995136261, "epoch": 0.01, "grad_norm": 2064.0, "kl_loss_10": 282.8213035583496, "kl_loss_2": 3074.1569946289064, "kl_loss_3": 2548.117712402344, "kl_loss_7": 953.9666351318359, "learning_rate": 0.001, "loss": 1719.6691, "step": 100 }, { "ce_loss_10": 3.6567954063415526, "ce_loss_13": 3.5124174475669863, "ce_loss_2": 5.108190441131592, "ce_loss_3": 4.818107295036316, "ce_loss_7": 4.019633519649505, "epoch": 0.011, "grad_norm": 2400.0, "kl_loss_10": 270.0943603515625, "kl_loss_2": 3049.2991455078127, "kl_loss_3": 2500.848181152344, "kl_loss_7": 932.3636932373047, "learning_rate": 0.0009999974825027757, "loss": 1688.8926, "step": 110 }, { "ce_loss_10": 3.7039052963256838, "ce_loss_13": 3.571633052825928, "ce_loss_2": 5.094748187065124, "ce_loss_3": 4.852095460891723, "ce_loss_7": 4.037456893920899, "epoch": 0.012, "grad_norm": 1664.0, "kl_loss_10": 250.84463272094726, "kl_loss_2": 2932.8909423828127, "kl_loss_3": 2455.3158569335938, "kl_loss_7": 890.0397857666015, "learning_rate": 0.0009999899300364532, "loss": 1612.4051, "step": 120 }, { "ce_loss_10": 3.6791754364967346, "ce_loss_13": 3.545230841636658, "ce_loss_2": 5.091634058952332, "ce_loss_3": 4.8162085056304935, "ce_loss_7": 4.020157527923584, "epoch": 0.013, "grad_norm": 1560.0, "kl_loss_10": 255.16813125610352, "kl_loss_2": 2942.025427246094, "kl_loss_3": 2451.762646484375, "kl_loss_7": 898.0020751953125, "learning_rate": 0.0009999773426770863, "loss": 1657.6727, "step": 130 }, { "ce_loss_10": 3.73674556016922, "ce_loss_13": 3.5832789182662963, "ce_loss_2": 5.085049343109131, "ce_loss_3": 4.773765635490418, "ce_loss_7": 4.00857390165329, "epoch": 0.014, "grad_norm": 1680.0, "kl_loss_10": 287.71709671020506, "kl_loss_2": 2920.2906494140625, "kl_loss_3": 2305.8346618652345, "kl_loss_7": 817.9166778564453, "learning_rate": 0.0009999597205514296, "loss": 1597.5817, "step": 140 }, { "ce_loss_10": 3.731847679615021, "ce_loss_13": 3.536967146396637, "ce_loss_2": 5.1948922872543335, "ce_loss_3": 4.7174841403961185, "ce_loss_7": 4.000156426429749, "epoch": 0.015, "grad_norm": 2400.0, "kl_loss_10": 355.67767333984375, "kl_loss_2": 3135.73056640625, "kl_loss_3": 2270.5993408203126, "kl_loss_7": 853.3033996582031, "learning_rate": 0.0009999370638369377, "loss": 1665.8658, "step": 150 }, { "ce_loss_10": 3.742002522945404, "ce_loss_13": 3.577849972248077, "ce_loss_2": 5.10463318824768, "ce_loss_3": 4.742930483818054, "ce_loss_7": 4.048530840873719, "epoch": 0.016, "grad_norm": 1872.0, "kl_loss_10": 307.6055633544922, "kl_loss_2": 2953.6447998046874, "kl_loss_3": 2254.6800170898437, "kl_loss_7": 920.736947631836, "learning_rate": 0.000999909372761763, "loss": 1612.4882, "step": 160 }, { "ce_loss_10": 3.6539915680885313, "ce_loss_13": 3.5105618476867675, "ce_loss_2": 5.0401170492172245, "ce_loss_3": 4.6636940240859985, "ce_loss_7": 4.006681740283966, "epoch": 0.017, "grad_norm": 1136.0, "kl_loss_10": 274.9055458068848, "kl_loss_2": 2962.0895874023436, "kl_loss_3": 2260.215466308594, "kl_loss_7": 926.4199157714844, "learning_rate": 0.0009998766476047546, "loss": 1613.7586, "step": 170 }, { "ce_loss_10": 3.699781334400177, "ce_loss_13": 3.5564413189888002, "ce_loss_2": 5.0237473249435425, "ce_loss_3": 4.679536700248718, "ce_loss_7": 4.071722888946534, "epoch": 0.018, "grad_norm": 1112.0, "kl_loss_10": 265.14882888793943, "kl_loss_2": 2857.7050659179686, "kl_loss_3": 2181.458984375, "kl_loss_7": 999.3553771972656, "learning_rate": 0.0009998388886954545, "loss": 1607.9029, "step": 180 }, { "ce_loss_10": 3.64805783033371, "ce_loss_13": 3.520363616943359, "ce_loss_2": 4.989007997512817, "ce_loss_3": 4.6573096990585325, "ce_loss_7": 4.002157616615295, "epoch": 0.019, "grad_norm": 1048.0, "kl_loss_10": 244.38265075683594, "kl_loss_2": 2854.116064453125, "kl_loss_3": 2210.276501464844, "kl_loss_7": 909.954135131836, "learning_rate": 0.0009997960964140947, "loss": 1548.975, "step": 190 }, { "ce_loss_10": 3.640180766582489, "ce_loss_13": 3.5162421464920044, "ce_loss_2": 4.98552713394165, "ce_loss_3": 4.659880065917969, "ce_loss_7": 3.94964519739151, "epoch": 0.02, "grad_norm": 1096.0, "kl_loss_10": 245.4235740661621, "kl_loss_2": 2847.5531372070313, "kl_loss_3": 2231.041613769531, "kl_loss_7": 837.8150207519532, "learning_rate": 0.0009997482711915926, "loss": 1530.3634, "step": 200 }, { "ce_loss_10": 3.6212483406066895, "ce_loss_13": 3.4788748979568482, "ce_loss_2": 4.9227711200714115, "ce_loss_3": 4.63204300403595, "ce_loss_7": 3.8739295840263366, "epoch": 0.021, "grad_norm": 1496.0, "kl_loss_10": 294.7460823059082, "kl_loss_2": 2818.0055908203126, "kl_loss_3": 2240.425701904297, "kl_loss_7": 758.1431488037109, "learning_rate": 0.0009996954135095479, "loss": 1514.134, "step": 210 }, { "ce_loss_10": 3.732617425918579, "ce_loss_13": 3.565991473197937, "ce_loss_2": 4.940235328674317, "ce_loss_3": 4.660631942749023, "ce_loss_7": 3.945897901058197, "epoch": 0.022, "grad_norm": 1032.0, "kl_loss_10": 289.4634216308594, "kl_loss_2": 2661.9598022460937, "kl_loss_3": 2125.8117736816407, "kl_loss_7": 715.5729705810547, "learning_rate": 0.0009996375239002368, "loss": 1447.7406, "step": 220 }, { "ce_loss_10": 3.7656495571136475, "ce_loss_13": 3.6354315161705015, "ce_loss_2": 4.960019969940186, "ce_loss_3": 4.662168288230896, "ce_loss_7": 4.007078444957733, "epoch": 0.023, "grad_norm": 852.0, "kl_loss_10": 244.71729431152343, "kl_loss_2": 2586.0654052734376, "kl_loss_3": 2009.3392761230468, "kl_loss_7": 705.7847473144532, "learning_rate": 0.0009995746029466072, "loss": 1397.8061, "step": 230 }, { "ce_loss_10": 3.5471033453941345, "ce_loss_13": 3.4218883395195006, "ce_loss_2": 4.889282131195069, "ce_loss_3": 4.538329005241394, "ce_loss_7": 3.8075289726257324, "epoch": 0.024, "grad_norm": 1488.0, "kl_loss_10": 241.3412643432617, "kl_loss_2": 2865.20146484375, "kl_loss_3": 2182.661700439453, "kl_loss_7": 751.8032867431641, "learning_rate": 0.0009995066512822719, "loss": 1453.07, "step": 240 }, { "ce_loss_10": 3.665674090385437, "ce_loss_13": 3.5247814297676086, "ce_loss_2": 5.024472332000732, "ce_loss_3": 4.666782689094544, "ce_loss_7": 3.900399935245514, "epoch": 0.025, "grad_norm": 1560.0, "kl_loss_10": 258.1688774108887, "kl_loss_2": 2909.1632690429688, "kl_loss_3": 2216.5844116210938, "kl_loss_7": 720.5122467041016, "learning_rate": 0.000999433669591504, "loss": 1459.9385, "step": 250 }, { "ce_loss_10": 3.5710131525993347, "ce_loss_13": 3.424545645713806, "ce_loss_2": 4.864272856712342, "ce_loss_3": 4.511341714859009, "ce_loss_7": 3.8119132995605467, "epoch": 0.026, "grad_norm": 1176.0, "kl_loss_10": 282.1901268005371, "kl_loss_2": 2822.176232910156, "kl_loss_3": 2133.6006774902344, "kl_loss_7": 739.3891510009765, "learning_rate": 0.000999355658609228, "loss": 1456.5281, "step": 260 }, { "ce_loss_10": 3.6089833855628966, "ce_loss_13": 3.4602246284484863, "ce_loss_2": 4.948814797401428, "ce_loss_3": 4.569972229003906, "ce_loss_7": 3.8447819709777833, "epoch": 0.027, "grad_norm": 1528.0, "kl_loss_10": 276.7303955078125, "kl_loss_2": 2890.743786621094, "kl_loss_3": 2157.1086608886717, "kl_loss_7": 716.4465515136719, "learning_rate": 0.0009992726191210138, "loss": 1487.0811, "step": 270 }, { "ce_loss_10": 3.6369749307632446, "ce_loss_13": 3.495538282394409, "ce_loss_2": 4.901288914680481, "ce_loss_3": 4.5635118842124935, "ce_loss_7": 3.8698185324668883, "epoch": 0.028, "grad_norm": 892.0, "kl_loss_10": 265.56416473388674, "kl_loss_2": 2734.989501953125, "kl_loss_3": 2087.4435485839845, "kl_loss_7": 716.3130859375, "learning_rate": 0.0009991845519630679, "loss": 1437.1145, "step": 280 }, { "ce_loss_10": 3.505390703678131, "ce_loss_13": 3.3787208795547485, "ce_loss_2": 4.7870055675506595, "ce_loss_3": 4.4493123412132265, "ce_loss_7": 3.760839748382568, "epoch": 0.029, "grad_norm": 908.0, "kl_loss_10": 237.0340545654297, "kl_loss_2": 2736.9489135742188, "kl_loss_3": 2083.6819580078127, "kl_loss_7": 726.7024353027343, "learning_rate": 0.0009990914580222257, "loss": 1446.4514, "step": 290 }, { "ce_loss_10": 3.642133617401123, "ce_loss_13": 3.5245127081871033, "ce_loss_2": 4.845181202888488, "ce_loss_3": 4.518141508102417, "ce_loss_7": 3.9286104440689087, "epoch": 0.03, "grad_norm": 1040.0, "kl_loss_10": 216.63860015869142, "kl_loss_2": 2593.070068359375, "kl_loss_3": 1959.4425048828125, "kl_loss_7": 789.2019195556641, "learning_rate": 0.0009989933382359422, "loss": 1430.3693, "step": 300 }, { "ce_loss_10": 3.633878195285797, "ce_loss_13": 3.5297494769096374, "ce_loss_2": 4.831432461738586, "ce_loss_3": 4.504483461380005, "ce_loss_7": 3.9342789173126222, "epoch": 0.031, "grad_norm": 884.0, "kl_loss_10": 202.35764541625977, "kl_loss_2": 2563.0292846679686, "kl_loss_3": 1929.7875366210938, "kl_loss_7": 779.2723785400391, "learning_rate": 0.0009988901935922825, "loss": 1377.478, "step": 310 }, { "ce_loss_10": 3.4879576325416566, "ce_loss_13": 3.3748545050621033, "ce_loss_2": 4.758215618133545, "ce_loss_3": 4.416362595558167, "ce_loss_7": 3.793366551399231, "epoch": 0.032, "grad_norm": 1032.0, "kl_loss_10": 210.15899047851562, "kl_loss_2": 2712.4366821289063, "kl_loss_3": 2037.11875, "kl_loss_7": 816.4009399414062, "learning_rate": 0.0009987820251299122, "loss": 1400.1691, "step": 320 }, { "ce_loss_10": 3.618916356563568, "ce_loss_13": 3.5132275581359864, "ce_loss_2": 4.786975932121277, "ce_loss_3": 4.463800621032715, "ce_loss_7": 3.8751370787620543, "epoch": 0.033, "grad_norm": 844.0, "kl_loss_10": 199.11445999145508, "kl_loss_2": 2514.356481933594, "kl_loss_3": 1885.5515014648438, "kl_loss_7": 697.0057525634766, "learning_rate": 0.0009986688339380862, "loss": 1316.5152, "step": 330 }, { "ce_loss_10": 3.5652908205986025, "ce_loss_13": 3.462840723991394, "ce_loss_2": 4.73096718788147, "ce_loss_3": 4.41595048904419, "ce_loss_7": 3.7959569096565247, "epoch": 0.034, "grad_norm": 1088.0, "kl_loss_10": 196.1912078857422, "kl_loss_2": 2482.1171264648438, "kl_loss_3": 1867.904150390625, "kl_loss_7": 641.6138366699219, "learning_rate": 0.0009985506211566387, "loss": 1310.742, "step": 340 }, { "ce_loss_10": 3.599169671535492, "ce_loss_13": 3.499468731880188, "ce_loss_2": 4.743464422225952, "ce_loss_3": 4.4433406591415405, "ce_loss_7": 3.8246023654937744, "epoch": 0.035, "grad_norm": 908.0, "kl_loss_10": 193.88640899658202, "kl_loss_2": 2439.992578125, "kl_loss_3": 1844.8904907226563, "kl_loss_7": 628.5584655761719, "learning_rate": 0.0009984273879759713, "loss": 1282.633, "step": 350 }, { "ce_loss_10": 3.6349938988685606, "ce_loss_13": 3.524084043502808, "ce_loss_2": 4.792412829399109, "ce_loss_3": 4.484378623962402, "ce_loss_7": 3.867151415348053, "epoch": 0.036, "grad_norm": 900.0, "kl_loss_10": 211.68240356445312, "kl_loss_2": 2492.4594604492186, "kl_loss_3": 1889.6275817871094, "kl_loss_7": 650.7470794677735, "learning_rate": 0.0009982991356370402, "loss": 1330.9973, "step": 360 }, { "ce_loss_10": 3.6139026045799256, "ce_loss_13": 3.499786341190338, "ce_loss_2": 4.770193362236023, "ce_loss_3": 4.499257779121399, "ce_loss_7": 3.836435353755951, "epoch": 0.037, "grad_norm": 924.0, "kl_loss_10": 218.0047348022461, "kl_loss_2": 2494.1434814453123, "kl_loss_3": 1945.14365234375, "kl_loss_7": 644.8766632080078, "learning_rate": 0.0009981658654313456, "loss": 1328.8159, "step": 370 }, { "ce_loss_10": 3.6886358737945555, "ce_loss_13": 3.5812055826187135, "ce_loss_2": 4.797796511650086, "ce_loss_3": 4.494029021263122, "ce_loss_7": 3.930657184123993, "epoch": 0.038, "grad_norm": 1064.0, "kl_loss_10": 198.16498031616212, "kl_loss_2": 2407.9371826171873, "kl_loss_3": 1811.71591796875, "kl_loss_7": 701.7446105957031, "learning_rate": 0.000998027578700917, "loss": 1309.0261, "step": 380 }, { "ce_loss_10": 3.618265390396118, "ce_loss_13": 3.516481935977936, "ce_loss_2": 4.751624965667725, "ce_loss_3": 4.437908434867859, "ce_loss_7": 3.8756099224090574, "epoch": 0.039, "grad_norm": 748.0, "kl_loss_10": 192.7767189025879, "kl_loss_2": 2445.187145996094, "kl_loss_3": 1832.8304443359375, "kl_loss_7": 682.4473358154297, "learning_rate": 0.0009978842768382998, "loss": 1292.3232, "step": 390 }, { "ce_loss_10": 3.6378421783447266, "ce_loss_13": 3.5388819336891175, "ce_loss_2": 4.7288140773773195, "ce_loss_3": 4.416940224170685, "ce_loss_7": 3.8709887266159058, "epoch": 0.04, "grad_norm": 864.0, "kl_loss_10": 185.43303756713868, "kl_loss_2": 2356.771868896484, "kl_loss_3": 1759.8860900878906, "kl_loss_7": 662.7635040283203, "learning_rate": 0.0009977359612865424, "loss": 1253.7059, "step": 400 }, { "ce_loss_10": 3.6343122839927675, "ce_loss_13": 3.5350385427474977, "ce_loss_2": 4.746745371818543, "ce_loss_3": 4.44157133102417, "ce_loss_7": 3.8861587047576904, "epoch": 0.041, "grad_norm": 976.0, "kl_loss_10": 191.48266525268554, "kl_loss_2": 2374.4056091308594, "kl_loss_3": 1779.481707763672, "kl_loss_7": 673.08525390625, "learning_rate": 0.0009975826335391806, "loss": 1247.2398, "step": 410 }, { "ce_loss_10": 3.6603206634521483, "ce_loss_13": 3.5601704835891725, "ce_loss_2": 4.751420497894287, "ce_loss_3": 4.431653356552124, "ce_loss_7": 3.8760146379470823, "epoch": 0.042, "grad_norm": 880.0, "kl_loss_10": 191.66864318847655, "kl_loss_2": 2351.4589599609376, "kl_loss_3": 1731.821112060547, "kl_loss_7": 611.5101196289063, "learning_rate": 0.0009974242951402235, "loss": 1231.4973, "step": 420 }, { "ce_loss_10": 3.671127963066101, "ce_loss_13": 3.5646562814712524, "ce_loss_2": 4.768468499183655, "ce_loss_3": 4.45646653175354, "ce_loss_7": 3.8791951179504394, "epoch": 0.043, "grad_norm": 1016.0, "kl_loss_10": 199.78956832885743, "kl_loss_2": 2377.113977050781, "kl_loss_3": 1762.0419921875, "kl_loss_7": 611.9753326416015, "learning_rate": 0.0009972609476841367, "loss": 1225.8758, "step": 430 }, { "ce_loss_10": 3.576024031639099, "ce_loss_13": 3.4760246872901917, "ce_loss_2": 4.713018941879272, "ce_loss_3": 4.393813145160675, "ce_loss_7": 3.79048775434494, "epoch": 0.044, "grad_norm": 776.0, "kl_loss_10": 187.14953994750977, "kl_loss_2": 2427.2942016601564, "kl_loss_3": 1802.8625610351562, "kl_loss_7": 602.8439697265625, "learning_rate": 0.0009970925928158272, "loss": 1255.1398, "step": 440 }, { "ce_loss_10": 3.523408055305481, "ce_loss_13": 3.4175541162490846, "ce_loss_2": 4.677767181396485, "ce_loss_3": 4.351981210708618, "ce_loss_7": 3.7345662236213686, "epoch": 0.045, "grad_norm": 824.0, "kl_loss_10": 198.06284866333007, "kl_loss_2": 2509.906640625, "kl_loss_3": 1854.9789611816407, "kl_loss_7": 623.1649566650391, "learning_rate": 0.000996919232230627, "loss": 1273.0857, "step": 450 }, { "ce_loss_10": 3.6010383486747743, "ce_loss_13": 3.5072561621665956, "ce_loss_2": 4.707926797866821, "ce_loss_3": 4.392971789836883, "ce_loss_7": 3.8050674915313722, "epoch": 0.046, "grad_norm": 772.0, "kl_loss_10": 179.6293571472168, "kl_loss_2": 2379.8652587890624, "kl_loss_3": 1758.0087097167968, "kl_loss_7": 588.2210372924804, "learning_rate": 0.0009967408676742752, "loss": 1199.8335, "step": 460 }, { "ce_loss_10": 3.742927384376526, "ce_loss_13": 3.6482790350914, "ce_loss_2": 4.8101897716522215, "ce_loss_3": 4.505382323265076, "ce_loss_7": 3.947196352481842, "epoch": 0.047, "grad_norm": 752.0, "kl_loss_10": 189.56137313842774, "kl_loss_2": 2298.8898681640626, "kl_loss_3": 1710.8347595214843, "kl_loss_7": 593.067138671875, "learning_rate": 0.0009965575009429006, "loss": 1232.7049, "step": 470 }, { "ce_loss_10": 3.535725438594818, "ce_loss_13": 3.4301671028137206, "ce_loss_2": 4.649361157417298, "ce_loss_3": 4.334059405326843, "ce_loss_7": 3.7419899344444274, "epoch": 0.048, "grad_norm": 724.0, "kl_loss_10": 199.50787734985352, "kl_loss_2": 2420.501257324219, "kl_loss_3": 1798.5549255371093, "kl_loss_7": 605.8824523925781, "learning_rate": 0.0009963691338830043, "loss": 1235.2358, "step": 480 }, { "ce_loss_10": 3.632321667671204, "ce_loss_13": 3.531293308734894, "ce_loss_2": 4.712436723709106, "ce_loss_3": 4.396814930438995, "ce_loss_7": 3.8190667390823365, "epoch": 0.049, "grad_norm": 1088.0, "kl_loss_10": 197.5344093322754, "kl_loss_2": 2367.637561035156, "kl_loss_3": 1741.7679260253906, "kl_loss_7": 580.3160980224609, "learning_rate": 0.0009961757683914405, "loss": 1205.5581, "step": 490 }, { "ce_loss_10": 3.6362748384475707, "ce_loss_13": 3.5185879707336425, "ce_loss_2": 4.675903582572937, "ce_loss_3": 4.36779271364212, "ce_loss_7": 3.833943545818329, "epoch": 0.05, "grad_norm": 796.0, "kl_loss_10": 228.72670364379883, "kl_loss_2": 2316.8818908691405, "kl_loss_3": 1726.0417602539062, "kl_loss_7": 630.297004699707, "learning_rate": 0.0009959774064153978, "loss": 1231.3605, "step": 500 }, { "ce_loss_10": 3.619948923587799, "ce_loss_13": 3.5244550347328185, "ce_loss_2": 4.644695091247558, "ce_loss_3": 4.329704558849334, "ce_loss_7": 3.800706934928894, "epoch": 0.051, "grad_norm": 1012.0, "kl_loss_10": 192.10162582397462, "kl_loss_2": 2233.16171875, "kl_loss_3": 1634.2128601074219, "kl_loss_7": 559.5575347900391, "learning_rate": 0.0009957740499523787, "loss": 1175.9502, "step": 510 }, { "ce_loss_10": 3.6536168694496154, "ce_loss_13": 3.550007688999176, "ce_loss_2": 4.699387860298157, "ce_loss_3": 4.385229539871216, "ce_loss_7": 3.8452332377433778, "epoch": 0.052, "grad_norm": 836.0, "kl_loss_10": 186.8121696472168, "kl_loss_2": 2266.308477783203, "kl_loss_3": 1657.038348388672, "kl_loss_7": 579.6389221191406, "learning_rate": 0.0009955657010501807, "loss": 1182.318, "step": 520 }, { "ce_loss_10": 3.6187700390815736, "ce_loss_13": 3.5073922753334044, "ce_loss_2": 4.677786374092102, "ce_loss_3": 4.3664548873901365, "ce_loss_7": 3.8046794056892397, "epoch": 0.053, "grad_norm": 740.0, "kl_loss_10": 199.3314422607422, "kl_loss_2": 2326.2955810546873, "kl_loss_3": 1718.3814697265625, "kl_loss_7": 584.9697692871093, "learning_rate": 0.000995352361806875, "loss": 1193.8865, "step": 530 }, { "ce_loss_10": 3.6552900075912476, "ce_loss_13": 3.5457561016082764, "ce_loss_2": 4.691612362861633, "ce_loss_3": 4.384424495697021, "ce_loss_7": 3.836933982372284, "epoch": 0.054, "grad_norm": 752.0, "kl_loss_10": 216.19254837036132, "kl_loss_2": 2297.6909790039062, "kl_loss_3": 1690.300762939453, "kl_loss_7": 577.9310073852539, "learning_rate": 0.0009951340343707852, "loss": 1205.293, "step": 540 }, { "ce_loss_10": 3.7162037968635557, "ce_loss_13": 3.60575407743454, "ce_loss_2": 4.755306339263916, "ce_loss_3": 4.44061427116394, "ce_loss_7": 3.891192317008972, "epoch": 0.055, "grad_norm": 644.0, "kl_loss_10": 198.4109214782715, "kl_loss_2": 2257.826208496094, "kl_loss_3": 1657.457830810547, "kl_loss_7": 549.2522979736328, "learning_rate": 0.0009949107209404665, "loss": 1177.7273, "step": 550 }, { "ce_loss_10": 3.6101877808570864, "ce_loss_13": 3.5138476133346557, "ce_loss_2": 4.642359352111816, "ce_loss_3": 4.346826291084289, "ce_loss_7": 3.794588005542755, "epoch": 0.056, "grad_norm": 848.0, "kl_loss_10": 186.831143951416, "kl_loss_2": 2270.4338500976564, "kl_loss_3": 1676.7208557128906, "kl_loss_7": 559.3569519042969, "learning_rate": 0.0009946824237646824, "loss": 1175.7035, "step": 560 }, { "ce_loss_10": 3.5591055393218993, "ce_loss_13": 3.4608181595802305, "ce_loss_2": 4.6226129055023195, "ce_loss_3": 4.305383276939392, "ce_loss_7": 3.7558097004890443, "epoch": 0.057, "grad_norm": 772.0, "kl_loss_10": 183.9202102661133, "kl_loss_2": 2333.6235412597657, "kl_loss_3": 1695.6138793945313, "kl_loss_7": 590.3666534423828, "learning_rate": 0.0009944491451423828, "loss": 1214.8344, "step": 570 }, { "ce_loss_10": 3.5461070418357847, "ce_loss_13": 3.453331208229065, "ce_loss_2": 4.6320007085800174, "ce_loss_3": 4.303631937503814, "ce_loss_7": 3.7626206278800964, "epoch": 0.058, "grad_norm": 756.0, "kl_loss_10": 178.51001663208007, "kl_loss_2": 2340.150256347656, "kl_loss_3": 1703.9690673828125, "kl_loss_7": 605.2322082519531, "learning_rate": 0.0009942108874226813, "loss": 1183.7139, "step": 580 }, { "ce_loss_10": 3.671435010433197, "ce_loss_13": 3.5826574683189394, "ce_loss_2": 4.683747816085815, "ce_loss_3": 4.387489748001099, "ce_loss_7": 3.8705739974975586, "epoch": 0.059, "grad_norm": 656.0, "kl_loss_10": 170.50717468261718, "kl_loss_2": 2184.8461975097657, "kl_loss_3": 1606.6676391601563, "kl_loss_7": 563.9176605224609, "learning_rate": 0.00099396765300483, "loss": 1126.441, "step": 590 }, { "ce_loss_10": 3.645866310596466, "ce_loss_13": 3.55738787651062, "ce_loss_2": 4.660355973243713, "ce_loss_3": 4.369902229309082, "ce_loss_7": 3.8477280139923096, "epoch": 0.06, "grad_norm": 848.0, "kl_loss_10": 168.53888397216798, "kl_loss_2": 2202.7648498535154, "kl_loss_3": 1625.7495483398438, "kl_loss_7": 572.8751831054688, "learning_rate": 0.0009937194443381972, "loss": 1142.7555, "step": 600 }, { "ce_loss_10": 3.6595038771629333, "ce_loss_13": 3.5759320497512816, "ce_loss_2": 4.65912344455719, "ce_loss_3": 4.365536069869995, "ce_loss_7": 3.8528382897377016, "epoch": 0.061, "grad_norm": 672.0, "kl_loss_10": 163.77515716552733, "kl_loss_2": 2170.874645996094, "kl_loss_3": 1594.9725708007813, "kl_loss_7": 549.9977111816406, "learning_rate": 0.0009934662639222412, "loss": 1149.0033, "step": 610 }, { "ce_loss_10": 3.6197726726531982, "ce_loss_13": 3.5326411604881285, "ce_loss_2": 4.671251368522644, "ce_loss_3": 4.348840308189392, "ce_loss_7": 3.8088993430137634, "epoch": 0.062, "grad_norm": 648.0, "kl_loss_10": 164.4695831298828, "kl_loss_2": 2287.6524169921877, "kl_loss_3": 1658.6748779296875, "kl_loss_7": 552.7878021240234, "learning_rate": 0.000993208114306486, "loss": 1156.5602, "step": 620 }, { "ce_loss_10": 3.5423024177551268, "ce_loss_13": 3.4506514072418213, "ce_loss_2": 4.613234496116638, "ce_loss_3": 4.282683503627777, "ce_loss_7": 3.7373295068740844, "epoch": 0.063, "grad_norm": 1096.0, "kl_loss_10": 171.06299514770507, "kl_loss_2": 2306.1301086425783, "kl_loss_3": 1657.723583984375, "kl_loss_7": 552.2892486572266, "learning_rate": 0.0009929449980904952, "loss": 1137.1229, "step": 630 }, { "ce_loss_10": 3.6015995144844055, "ce_loss_13": 3.5103712916374206, "ce_loss_2": 4.635677099227905, "ce_loss_3": 4.3279380917549135, "ce_loss_7": 3.7797141551971434, "epoch": 0.064, "grad_norm": 888.0, "kl_loss_10": 181.99832305908203, "kl_loss_2": 2257.243127441406, "kl_loss_3": 1653.8410400390626, "kl_loss_7": 542.8657012939453, "learning_rate": 0.0009926769179238466, "loss": 1149.0314, "step": 640 }, { "ce_loss_10": 3.654631459712982, "ce_loss_13": 3.557699191570282, "ce_loss_2": 4.668602418899536, "ce_loss_3": 4.412432253360748, "ce_loss_7": 3.840227794647217, "epoch": 0.065, "grad_norm": 880.0, "kl_loss_10": 178.8748649597168, "kl_loss_2": 2221.984893798828, "kl_loss_3": 1714.45927734375, "kl_loss_7": 559.7461196899415, "learning_rate": 0.000992403876506104, "loss": 1164.191, "step": 650 }, { "ce_loss_10": 3.58308607339859, "ce_loss_13": 3.494268476963043, "ce_loss_2": 4.6036207437515255, "ce_loss_3": 4.3260154008865355, "ce_loss_7": 3.7680331826210023, "epoch": 0.066, "grad_norm": 664.0, "kl_loss_10": 166.09232177734376, "kl_loss_2": 2211.9074279785154, "kl_loss_3": 1670.099188232422, "kl_loss_7": 535.2302642822266, "learning_rate": 0.0009921258765867918, "loss": 1143.2223, "step": 660 }, { "ce_loss_10": 3.5434961915016174, "ce_loss_13": 3.4588265657424926, "ce_loss_2": 4.589056158065796, "ce_loss_3": 4.291700434684754, "ce_loss_7": 3.724953532218933, "epoch": 0.067, "grad_norm": 784.0, "kl_loss_10": 162.44573287963868, "kl_loss_2": 2286.3095825195314, "kl_loss_3": 1690.0082946777343, "kl_loss_7": 537.8186584472656, "learning_rate": 0.0009918429209653662, "loss": 1145.2387, "step": 670 }, { "ce_loss_10": 3.604193425178528, "ce_loss_13": 3.5196762561798094, "ce_loss_2": 4.627810287475586, "ce_loss_3": 4.324268043041229, "ce_loss_7": 3.787227988243103, "epoch": 0.068, "grad_norm": 640.0, "kl_loss_10": 164.71398696899413, "kl_loss_2": 2225.251837158203, "kl_loss_3": 1632.128973388672, "kl_loss_7": 534.7776992797851, "learning_rate": 0.0009915550124911866, "loss": 1115.5873, "step": 680 }, { "ce_loss_10": 3.619931185245514, "ce_loss_13": 3.530241775512695, "ce_loss_2": 4.617183327674866, "ce_loss_3": 4.310395693778991, "ce_loss_7": 3.8025647521018984, "epoch": 0.069, "grad_norm": 784.0, "kl_loss_10": 167.73641510009764, "kl_loss_2": 2184.9783447265627, "kl_loss_3": 1576.4586059570313, "kl_loss_7": 539.4014129638672, "learning_rate": 0.0009912621540634887, "loss": 1117.2131, "step": 690 }, { "ce_loss_10": 3.6502980947494508, "ce_loss_13": 3.5655866384506227, "ce_loss_2": 4.61748468875885, "ce_loss_3": 4.316927981376648, "ce_loss_7": 3.8264237999916078, "epoch": 0.07, "grad_norm": 660.0, "kl_loss_10": 165.26128768920898, "kl_loss_2": 2131.576544189453, "kl_loss_3": 1534.9104614257812, "kl_loss_7": 536.9320556640625, "learning_rate": 0.0009909643486313534, "loss": 1101.8908, "step": 700 }, { "ce_loss_10": 3.521154451370239, "ce_loss_13": 3.435533571243286, "ce_loss_2": 4.553616023063659, "ce_loss_3": 4.223876130580902, "ce_loss_7": 3.705720341205597, "epoch": 0.071, "grad_norm": 640.0, "kl_loss_10": 165.1494041442871, "kl_loss_2": 2252.724328613281, "kl_loss_3": 1606.9476257324218, "kl_loss_7": 542.5165710449219, "learning_rate": 0.000990661599193678, "loss": 1157.1567, "step": 710 }, { "ce_loss_10": 3.6575976967811585, "ce_loss_13": 3.573076629638672, "ce_loss_2": 4.6403602123260494, "ce_loss_3": 4.340050399303436, "ce_loss_7": 3.829990637302399, "epoch": 0.072, "grad_norm": 608.0, "kl_loss_10": 162.9642532348633, "kl_loss_2": 2156.529504394531, "kl_loss_3": 1567.0165588378907, "kl_loss_7": 518.587028503418, "learning_rate": 0.0009903539087991462, "loss": 1098.1953, "step": 720 }, { "ce_loss_10": 3.6248491764068604, "ce_loss_13": 3.5419184803962707, "ce_loss_2": 4.619556307792664, "ce_loss_3": 4.313732409477234, "ce_loss_7": 3.799022710323334, "epoch": 0.073, "grad_norm": 1004.0, "kl_loss_10": 159.26097106933594, "kl_loss_2": 2162.796240234375, "kl_loss_3": 1561.3003784179687, "kl_loss_7": 522.1007858276367, "learning_rate": 0.0009900412805461966, "loss": 1111.7787, "step": 730 }, { "ce_loss_10": 3.7069281697273255, "ce_loss_13": 3.6213572144508364, "ce_loss_2": 4.660569405555725, "ce_loss_3": 4.377930784225464, "ce_loss_7": 3.8874071717262266, "epoch": 0.074, "grad_norm": 932.0, "kl_loss_10": 165.44114456176757, "kl_loss_2": 2100.644494628906, "kl_loss_3": 1533.6077758789063, "kl_loss_7": 527.4311614990235, "learning_rate": 0.0009897237175829927, "loss": 1100.6893, "step": 740 }, { "ce_loss_10": 3.596748685836792, "ce_loss_13": 3.5033590316772463, "ce_loss_2": 4.599991416931152, "ce_loss_3": 4.287704348564148, "ce_loss_7": 3.774449360370636, "epoch": 0.075, "grad_norm": 656.0, "kl_loss_10": 171.81486968994142, "kl_loss_2": 2205.746691894531, "kl_loss_3": 1594.8901733398438, "kl_loss_7": 540.3972091674805, "learning_rate": 0.0009894012231073895, "loss": 1113.4967, "step": 750 }, { "ce_loss_10": 3.6394460201263428, "ce_loss_13": 3.546016478538513, "ce_loss_2": 4.615111136436463, "ce_loss_3": 4.303437685966491, "ce_loss_7": 3.8015685200691225, "epoch": 0.076, "grad_norm": 804.0, "kl_loss_10": 175.62161712646486, "kl_loss_2": 2138.9857666015623, "kl_loss_3": 1534.8791564941407, "kl_loss_7": 514.0439895629883, "learning_rate": 0.0009890738003669028, "loss": 1106.6446, "step": 760 }, { "ce_loss_10": 3.620648503303528, "ce_loss_13": 3.525268292427063, "ce_loss_2": 4.618654131889343, "ce_loss_3": 4.299736750125885, "ce_loss_7": 3.8008544087409972, "epoch": 0.077, "grad_norm": 712.0, "kl_loss_10": 187.86515350341796, "kl_loss_2": 2225.2861145019533, "kl_loss_3": 1579.9274169921875, "kl_loss_7": 551.6296844482422, "learning_rate": 0.0009887414526586764, "loss": 1101.2971, "step": 770 }, { "ce_loss_10": 3.6774813294410706, "ce_loss_13": 3.585373139381409, "ce_loss_2": 4.6435751676559445, "ce_loss_3": 4.326629590988159, "ce_loss_7": 3.847112762928009, "epoch": 0.078, "grad_norm": 864.0, "kl_loss_10": 174.33660125732422, "kl_loss_2": 2136.854138183594, "kl_loss_3": 1515.704736328125, "kl_loss_7": 519.9487030029297, "learning_rate": 0.0009884041833294476, "loss": 1070.8322, "step": 780 }, { "ce_loss_10": 3.662649929523468, "ce_loss_13": 3.578170645236969, "ce_loss_2": 4.618038749694824, "ce_loss_3": 4.341187214851379, "ce_loss_7": 3.8345428824424745, "epoch": 0.079, "grad_norm": 880.0, "kl_loss_10": 164.20001144409179, "kl_loss_2": 2102.465753173828, "kl_loss_3": 1541.64658203125, "kl_loss_7": 513.4600540161133, "learning_rate": 0.000988061995775515, "loss": 1110.6254, "step": 790 }, { "ce_loss_10": 3.5975906372070314, "ce_loss_13": 3.5121807932853697, "ce_loss_2": 4.546776533126831, "ce_loss_3": 4.272667920589447, "ce_loss_7": 3.7629655480384825, "epoch": 0.08, "grad_norm": 776.0, "kl_loss_10": 161.49296493530272, "kl_loss_2": 2091.497247314453, "kl_loss_3": 1577.6322265625, "kl_loss_7": 507.28360748291016, "learning_rate": 0.0009877148934427035, "loss": 1084.9137, "step": 800 }, { "ce_loss_10": 3.6317833065986633, "ce_loss_13": 3.5532532691955567, "ce_loss_2": 4.599791812896728, "ce_loss_3": 4.328591656684876, "ce_loss_7": 3.7980172753334047, "epoch": 0.081, "grad_norm": 792.0, "kl_loss_10": 153.97521362304687, "kl_loss_2": 2114.881622314453, "kl_loss_3": 1571.4160400390624, "kl_loss_7": 498.02918701171876, "learning_rate": 0.0009873628798263297, "loss": 1072.1146, "step": 810 }, { "ce_loss_10": 3.5831527948379516, "ce_loss_13": 3.501357066631317, "ce_loss_2": 4.5278380632400514, "ce_loss_3": 4.243371033668518, "ce_loss_7": 3.745946741104126, "epoch": 0.082, "grad_norm": 628.0, "kl_loss_10": 152.38495407104492, "kl_loss_2": 2064.5574279785155, "kl_loss_3": 1515.6498229980468, "kl_loss_7": 485.99849395751954, "learning_rate": 0.0009870059584711668, "loss": 1084.9197, "step": 820 }, { "ce_loss_10": 3.6029329776763914, "ce_loss_13": 3.5267816185951233, "ce_loss_2": 4.547496175765991, "ce_loss_3": 4.271672892570495, "ce_loss_7": 3.7769490838050843, "epoch": 0.083, "grad_norm": 624.0, "kl_loss_10": 150.01721115112304, "kl_loss_2": 2084.924005126953, "kl_loss_3": 1520.498712158203, "kl_loss_7": 504.1028030395508, "learning_rate": 0.000986644132971409, "loss": 1067.0667, "step": 830 }, { "ce_loss_10": 3.5902104139328004, "ce_loss_13": 3.510513925552368, "ce_loss_2": 4.575335621833801, "ce_loss_3": 4.284187662601471, "ce_loss_7": 3.771816539764404, "epoch": 0.084, "grad_norm": 772.0, "kl_loss_10": 153.73068771362304, "kl_loss_2": 2139.4975341796876, "kl_loss_3": 1554.0361694335938, "kl_loss_7": 522.2033233642578, "learning_rate": 0.0009862774069706345, "loss": 1078.778, "step": 840 }, { "ce_loss_10": 3.71586754322052, "ce_loss_13": 3.6384023308753966, "ce_loss_2": 4.638924956321716, "ce_loss_3": 4.361155247688293, "ce_loss_7": 3.8873739719390867, "epoch": 0.085, "grad_norm": 804.0, "kl_loss_10": 148.7649658203125, "kl_loss_2": 2067.6587097167967, "kl_loss_3": 1512.3407836914062, "kl_loss_7": 513.5188568115234, "learning_rate": 0.000985905784161771, "loss": 1074.969, "step": 850 }, { "ce_loss_10": 3.645820212364197, "ce_loss_13": 3.570139443874359, "ce_loss_2": 4.591745162010193, "ce_loss_3": 4.305811822414398, "ce_loss_7": 3.8123847246170044, "epoch": 0.086, "grad_norm": 764.0, "kl_loss_10": 148.65879287719727, "kl_loss_2": 2080.238977050781, "kl_loss_3": 1514.342840576172, "kl_loss_7": 492.7503723144531, "learning_rate": 0.000985529268287055, "loss": 1056.9346, "step": 860 }, { "ce_loss_10": 3.568171739578247, "ce_loss_13": 3.488191318511963, "ce_loss_2": 4.544654989242554, "ce_loss_3": 4.24829398393631, "ce_loss_7": 3.740330684185028, "epoch": 0.087, "grad_norm": 1056.0, "kl_loss_10": 152.84957046508788, "kl_loss_2": 2129.793176269531, "kl_loss_3": 1569.1706298828126, "kl_loss_7": 502.5919555664062, "learning_rate": 0.0009851478631379982, "loss": 1081.4564, "step": 870 }, { "ce_loss_10": 3.639467215538025, "ce_loss_13": 3.5535321831703186, "ce_loss_2": 4.579109835624695, "ce_loss_3": 4.309219968318939, "ce_loss_7": 3.8039302229881287, "epoch": 0.088, "grad_norm": 1080.0, "kl_loss_10": 156.31018295288087, "kl_loss_2": 2080.687878417969, "kl_loss_3": 1531.4270324707031, "kl_loss_7": 494.3655487060547, "learning_rate": 0.0009847615725553456, "loss": 1057.7123, "step": 880 }, { "ce_loss_10": 3.687655174732208, "ce_loss_13": 3.6150201678276064, "ce_loss_2": 4.578017616271973, "ce_loss_3": 4.329054999351501, "ce_loss_7": 3.8436725974082946, "epoch": 0.089, "grad_norm": 748.0, "kl_loss_10": 146.62072792053223, "kl_loss_2": 1964.2715270996093, "kl_loss_3": 1474.4080871582032, "kl_loss_7": 471.1993637084961, "learning_rate": 0.0009843704004290394, "loss": 1054.4049, "step": 890 }, { "ce_loss_10": 3.5966106057167053, "ce_loss_13": 3.517079842090607, "ce_loss_2": 4.5300004243850704, "ce_loss_3": 4.259670841693878, "ce_loss_7": 3.763486957550049, "epoch": 0.09, "grad_norm": 800.0, "kl_loss_10": 151.4746063232422, "kl_loss_2": 2069.883508300781, "kl_loss_3": 1523.9132507324218, "kl_loss_7": 495.7139236450195, "learning_rate": 0.0009839743506981783, "loss": 1057.0924, "step": 900 }, { "ce_loss_10": 3.5157420516014097, "ce_loss_13": 3.4331260085105897, "ce_loss_2": 4.515481805801391, "ce_loss_3": 4.204641652107239, "ce_loss_7": 3.6795000195503236, "epoch": 0.091, "grad_norm": 772.0, "kl_loss_10": 158.5204734802246, "kl_loss_2": 2202.96552734375, "kl_loss_3": 1587.075457763672, "kl_loss_7": 501.0933639526367, "learning_rate": 0.0009835734273509786, "loss": 1084.0856, "step": 910 }, { "ce_loss_10": 3.6128107786178587, "ce_loss_13": 3.5335276126861572, "ce_loss_2": 4.572772192955017, "ce_loss_3": 4.276380968093872, "ce_loss_7": 3.779345142841339, "epoch": 0.092, "grad_norm": 792.0, "kl_loss_10": 150.20287322998047, "kl_loss_2": 2079.9132446289063, "kl_loss_3": 1489.1456481933594, "kl_loss_7": 485.9955291748047, "learning_rate": 0.0009831676344247342, "loss": 1058.1836, "step": 920 }, { "ce_loss_10": 3.6252113699913027, "ce_loss_13": 3.554415154457092, "ce_loss_2": 4.55711989402771, "ce_loss_3": 4.277616715431213, "ce_loss_7": 3.7868492245674132, "epoch": 0.093, "grad_norm": 792.0, "kl_loss_10": 141.0625129699707, "kl_loss_2": 2046.1615234375, "kl_loss_3": 1482.7538146972656, "kl_loss_7": 469.94707641601565, "learning_rate": 0.0009827569760057755, "loss": 1047.4356, "step": 930 }, { "ce_loss_10": 3.549435830116272, "ce_loss_13": 3.4681930899620057, "ce_loss_2": 4.557485795021057, "ce_loss_3": 4.23450893163681, "ce_loss_7": 3.718186581134796, "epoch": 0.094, "grad_norm": 700.0, "kl_loss_10": 153.28956451416016, "kl_loss_2": 2226.7088317871094, "kl_loss_3": 1587.8077514648437, "kl_loss_7": 504.2065032958984, "learning_rate": 0.000982341456229428, "loss": 1082.7429, "step": 940 }, { "ce_loss_10": 3.6411906361579893, "ce_loss_13": 3.5639458298683167, "ce_loss_2": 4.606229948997497, "ce_loss_3": 4.315305745601654, "ce_loss_7": 3.818677604198456, "epoch": 0.095, "grad_norm": 672.0, "kl_loss_10": 153.15759887695313, "kl_loss_2": 2121.5237243652346, "kl_loss_3": 1532.8892150878905, "kl_loss_7": 515.9636611938477, "learning_rate": 0.000981921079279971, "loss": 1051.7303, "step": 950 }, { "ce_loss_10": 3.6610822081565857, "ce_loss_13": 3.5863242864608766, "ce_loss_2": 4.549208259582519, "ce_loss_3": 4.26780799627304, "ce_loss_7": 3.813734710216522, "epoch": 0.096, "grad_norm": 644.0, "kl_loss_10": 145.7539077758789, "kl_loss_2": 1980.1197570800782, "kl_loss_3": 1435.0607482910157, "kl_loss_7": 477.7298553466797, "learning_rate": 0.0009814958493905962, "loss": 1022.0867, "step": 960 }, { "ce_loss_10": 3.6117519617080687, "ce_loss_13": 3.533314788341522, "ce_loss_2": 4.569203948974609, "ce_loss_3": 4.268878102302551, "ce_loss_7": 3.776082968711853, "epoch": 0.097, "grad_norm": 812.0, "kl_loss_10": 150.21649246215821, "kl_loss_2": 2110.584747314453, "kl_loss_3": 1508.700390625, "kl_loss_7": 495.108415222168, "learning_rate": 0.0009810657708433637, "loss": 1072.993, "step": 970 }, { "ce_loss_10": 3.6833523392677305, "ce_loss_13": 3.6088930130004884, "ce_loss_2": 4.585800218582153, "ce_loss_3": 4.298417234420777, "ce_loss_7": 3.8500677704811097, "epoch": 0.098, "grad_norm": 1160.0, "kl_loss_10": 148.93168411254882, "kl_loss_2": 1995.902032470703, "kl_loss_3": 1424.8928771972655, "kl_loss_7": 507.4762268066406, "learning_rate": 0.0009806308479691594, "loss": 1025.8244, "step": 980 }, { "ce_loss_10": 3.707567238807678, "ce_loss_13": 3.6246368527412414, "ce_loss_2": 4.631712865829468, "ce_loss_3": 4.336020660400391, "ce_loss_7": 3.90069819688797, "epoch": 0.099, "grad_norm": 1112.0, "kl_loss_10": 159.5045524597168, "kl_loss_2": 2072.0030151367187, "kl_loss_3": 1482.8904541015625, "kl_loss_7": 549.7570953369141, "learning_rate": 0.0009801910851476522, "loss": 1060.2188, "step": 990 }, { "ce_loss_10": 3.610922968387604, "ce_loss_13": 3.5338536381721495, "ce_loss_2": 4.579190135002136, "ce_loss_3": 4.2641215324401855, "ce_loss_7": 3.80141019821167, "epoch": 0.1, "grad_norm": 800.0, "kl_loss_10": 156.32900161743163, "kl_loss_2": 2141.2065551757814, "kl_loss_3": 1510.3861450195313, "kl_loss_7": 553.7208724975586, "learning_rate": 0.0009797464868072487, "loss": 1062.0947, "step": 1000 }, { "ce_loss_10": 3.5967588424682617, "ce_loss_13": 3.5186093926429747, "ce_loss_2": 4.533871865272522, "ce_loss_3": 4.23558361530304, "ce_loss_7": 3.7824254274368285, "epoch": 0.101, "grad_norm": 732.0, "kl_loss_10": 152.12875900268554, "kl_loss_2": 2063.537969970703, "kl_loss_3": 1478.1659851074219, "kl_loss_7": 518.8494201660156, "learning_rate": 0.0009792970574250492, "loss": 1047.0445, "step": 1010 }, { "ce_loss_10": 3.634649133682251, "ce_loss_13": 3.5520723938941954, "ce_loss_2": 4.555771279335022, "ce_loss_3": 4.269433748722077, "ce_loss_7": 3.8018234848976133, "epoch": 0.102, "grad_norm": 1056.0, "kl_loss_10": 152.19155044555663, "kl_loss_2": 2054.3825805664064, "kl_loss_3": 1481.8115966796875, "kl_loss_7": 502.45936889648436, "learning_rate": 0.0009788428015268028, "loss": 1029.5352, "step": 1020 }, { "ce_loss_10": 3.6245800733566282, "ce_loss_13": 3.544338381290436, "ce_loss_2": 4.5295133829116825, "ce_loss_3": 4.25595852136612, "ce_loss_7": 3.786297357082367, "epoch": 0.103, "grad_norm": 660.0, "kl_loss_10": 160.0803108215332, "kl_loss_2": 2022.062939453125, "kl_loss_3": 1467.65185546875, "kl_loss_7": 497.1546142578125, "learning_rate": 0.0009783837236868609, "loss": 1029.6176, "step": 1030 }, { "ce_loss_10": 3.5978386282920836, "ce_loss_13": 3.514087271690369, "ce_loss_2": 4.513098549842835, "ce_loss_3": 4.231943845748901, "ce_loss_7": 3.7571991205215456, "epoch": 0.104, "grad_norm": 800.0, "kl_loss_10": 161.33699111938478, "kl_loss_2": 2013.978125, "kl_loss_3": 1465.3624328613282, "kl_loss_7": 493.7020004272461, "learning_rate": 0.0009779198285281327, "loss": 1025.9279, "step": 1040 }, { "ce_loss_10": 3.5938573837280274, "ce_loss_13": 3.5105223774909975, "ce_loss_2": 4.523820948600769, "ce_loss_3": 4.238416838645935, "ce_loss_7": 3.7604029059410093, "epoch": 0.105, "grad_norm": 688.0, "kl_loss_10": 154.55627822875977, "kl_loss_2": 2049.859246826172, "kl_loss_3": 1468.662060546875, "kl_loss_7": 499.8658721923828, "learning_rate": 0.0009774511207220368, "loss": 1040.6164, "step": 1050 }, { "ce_loss_10": 3.6313247799873354, "ce_loss_13": 3.545831286907196, "ce_loss_2": 4.554855704307556, "ce_loss_3": 4.261361932754516, "ce_loss_7": 3.787016749382019, "epoch": 0.106, "grad_norm": 732.0, "kl_loss_10": 171.73610458374023, "kl_loss_2": 2048.06240234375, "kl_loss_3": 1470.4993041992188, "kl_loss_7": 497.4718765258789, "learning_rate": 0.0009769776049884564, "loss": 1040.5774, "step": 1060 }, { "ce_loss_10": 3.5417616248130797, "ce_loss_13": 3.458304190635681, "ce_loss_2": 4.490171897411346, "ce_loss_3": 4.193785727024078, "ce_loss_7": 3.704947257041931, "epoch": 0.107, "grad_norm": 740.0, "kl_loss_10": 162.49530944824218, "kl_loss_2": 2102.502551269531, "kl_loss_3": 1506.5601684570313, "kl_loss_7": 491.08145904541016, "learning_rate": 0.0009764992860956889, "loss": 1072.0402, "step": 1070 }, { "ce_loss_10": 3.698649287223816, "ce_loss_13": 3.619806432723999, "ce_loss_2": 4.555187082290649, "ce_loss_3": 4.3013293743133545, "ce_loss_7": 3.8462509632110597, "epoch": 0.108, "grad_norm": 640.0, "kl_loss_10": 151.0406120300293, "kl_loss_2": 1927.8519348144532, "kl_loss_3": 1406.4910583496094, "kl_loss_7": 465.21489868164065, "learning_rate": 0.0009760161688604008, "loss": 1002.8713, "step": 1080 }, { "ce_loss_10": 3.6983713507652283, "ce_loss_13": 3.615480875968933, "ce_loss_2": 4.593178796768188, "ce_loss_3": 4.316923928260803, "ce_loss_7": 3.8593989253044128, "epoch": 0.109, "grad_norm": 760.0, "kl_loss_10": 153.6709747314453, "kl_loss_2": 1982.3002502441407, "kl_loss_3": 1432.9902465820312, "kl_loss_7": 483.6423080444336, "learning_rate": 0.0009755282581475768, "loss": 1028.2272, "step": 1090 }, { "ce_loss_10": 3.744760012626648, "ce_loss_13": 3.6672345638275146, "ce_loss_2": 4.625878024101257, "ce_loss_3": 4.342048907279969, "ce_loss_7": 3.922909903526306, "epoch": 0.11, "grad_norm": 880.0, "kl_loss_10": 152.10422286987304, "kl_loss_2": 1961.5256591796874, "kl_loss_3": 1401.3743103027343, "kl_loss_7": 537.2792694091797, "learning_rate": 0.0009750355588704727, "loss": 1009.3172, "step": 1100 }, { "ce_loss_10": 3.5791931748390198, "ce_loss_13": 3.501610016822815, "ce_loss_2": 4.491281342506409, "ce_loss_3": 4.195801568031311, "ce_loss_7": 3.7639052987098696, "epoch": 0.111, "grad_norm": 768.0, "kl_loss_10": 145.47024803161622, "kl_loss_2": 2007.8315551757812, "kl_loss_3": 1422.1205200195313, "kl_loss_7": 512.9552871704102, "learning_rate": 0.0009745380759905647, "loss": 1043.0764, "step": 1110 }, { "ce_loss_10": 3.532094621658325, "ce_loss_13": 3.458336853981018, "ce_loss_2": 4.466642260551453, "ce_loss_3": 4.162093091011047, "ce_loss_7": 3.694514799118042, "epoch": 0.112, "grad_norm": 728.0, "kl_loss_10": 143.39996185302735, "kl_loss_2": 2057.472821044922, "kl_loss_3": 1447.1441162109375, "kl_loss_7": 483.391047668457, "learning_rate": 0.0009740358145174998, "loss": 1049.5488, "step": 1120 }, { "ce_loss_10": 3.687417185306549, "ce_loss_13": 3.613670265674591, "ce_loss_2": 4.559359908103943, "ce_loss_3": 4.2791221857070925, "ce_loss_7": 3.841489839553833, "epoch": 0.113, "grad_norm": 628.0, "kl_loss_10": 140.37460861206054, "kl_loss_2": 1940.2849426269531, "kl_loss_3": 1379.7558654785157, "kl_loss_7": 467.3983688354492, "learning_rate": 0.0009735287795090455, "loss": 998.5758, "step": 1130 }, { "ce_loss_10": 3.574834203720093, "ce_loss_13": 3.4973213911056518, "ce_loss_2": 4.496623802185058, "ce_loss_3": 4.196505165100097, "ce_loss_7": 3.736388313770294, "epoch": 0.114, "grad_norm": 732.0, "kl_loss_10": 144.5387985229492, "kl_loss_2": 2023.1371704101562, "kl_loss_3": 1428.7174560546875, "kl_loss_7": 473.70257263183595, "learning_rate": 0.0009730169760710386, "loss": 1016.0635, "step": 1140 }, { "ce_loss_10": 3.6638391733169557, "ce_loss_13": 3.5843318939208983, "ce_loss_2": 4.574735760688782, "ce_loss_3": 4.271646451950073, "ce_loss_7": 3.8169267535209657, "epoch": 0.115, "grad_norm": 804.0, "kl_loss_10": 157.17790870666505, "kl_loss_2": 1999.6382446289062, "kl_loss_3": 1418.1464050292968, "kl_loss_7": 476.21234436035155, "learning_rate": 0.0009725004093573342, "loss": 1019.7543, "step": 1150 }, { "ce_loss_10": 3.5969048619270323, "ce_loss_13": 3.5150346517562867, "ce_loss_2": 4.500743961334228, "ce_loss_3": 4.20902818441391, "ce_loss_7": 3.7567399382591247, "epoch": 0.116, "grad_norm": 780.0, "kl_loss_10": 165.2628433227539, "kl_loss_2": 1981.3116882324218, "kl_loss_3": 1415.8141845703126, "kl_loss_7": 479.9772262573242, "learning_rate": 0.0009719790845697534, "loss": 1006.5268, "step": 1160 }, { "ce_loss_10": 3.5572616815567017, "ce_loss_13": 3.474119758605957, "ce_loss_2": 4.428411734104157, "ce_loss_3": 4.153100395202637, "ce_loss_7": 3.6976303577423097, "epoch": 0.117, "grad_norm": 808.0, "kl_loss_10": 155.7167541503906, "kl_loss_2": 1955.4356628417968, "kl_loss_3": 1399.585400390625, "kl_loss_7": 458.83162994384764, "learning_rate": 0.0009714530069580309, "loss": 993.1516, "step": 1170 }, { "ce_loss_10": 3.6529783248901366, "ce_loss_13": 3.5693596243858337, "ce_loss_2": 4.557260251045227, "ce_loss_3": 4.275024116039276, "ce_loss_7": 3.806614136695862, "epoch": 0.118, "grad_norm": 756.0, "kl_loss_10": 167.28345642089843, "kl_loss_2": 1992.4473449707032, "kl_loss_3": 1433.3669677734374, "kl_loss_7": 476.5534362792969, "learning_rate": 0.0009709221818197624, "loss": 1011.2301, "step": 1180 }, { "ce_loss_10": 3.6975300788879393, "ce_loss_13": 3.6016547799110414, "ce_loss_2": 4.591509246826172, "ce_loss_3": 4.297507727146149, "ce_loss_7": 3.834565532207489, "epoch": 0.119, "grad_norm": 660.0, "kl_loss_10": 185.80238494873046, "kl_loss_2": 2007.4934509277343, "kl_loss_3": 1425.4560913085938, "kl_loss_7": 469.11800384521484, "learning_rate": 0.0009703866145003512, "loss": 1024.6623, "step": 1190 }, { "ce_loss_10": 3.6678906202316286, "ce_loss_13": 3.5744742870330812, "ce_loss_2": 4.538065481185913, "ce_loss_3": 4.258814692497253, "ce_loss_7": 3.7999132752418516, "epoch": 0.12, "grad_norm": 772.0, "kl_loss_10": 183.3194091796875, "kl_loss_2": 1986.1185546875, "kl_loss_3": 1420.947119140625, "kl_loss_7": 459.66529846191406, "learning_rate": 0.0009698463103929542, "loss": 1026.581, "step": 1200 }, { "ce_loss_10": 3.632613515853882, "ce_loss_13": 3.543941628932953, "ce_loss_2": 4.525945568084717, "ce_loss_3": 4.238123095035553, "ce_loss_7": 3.7756314039230348, "epoch": 0.121, "grad_norm": 632.0, "kl_loss_10": 166.73718338012696, "kl_loss_2": 1978.2434631347655, "kl_loss_3": 1420.1109741210937, "kl_loss_7": 459.94732360839845, "learning_rate": 0.0009693012749384279, "loss": 1018.8634, "step": 1210 }, { "ce_loss_10": 3.6396193385124205, "ce_loss_13": 3.556045150756836, "ce_loss_2": 4.533756256103516, "ce_loss_3": 4.242860603332519, "ce_loss_7": 3.780854082107544, "epoch": 0.122, "grad_norm": 600.0, "kl_loss_10": 156.3915023803711, "kl_loss_2": 1990.9532287597656, "kl_loss_3": 1424.2327880859375, "kl_loss_7": 464.35849914550784, "learning_rate": 0.0009687515136252732, "loss": 1002.0254, "step": 1220 }, { "ce_loss_10": 3.5823015332221986, "ce_loss_13": 3.5072194218635557, "ce_loss_2": 4.528130960464478, "ce_loss_3": 4.2137349367141725, "ce_loss_7": 3.7315992593765257, "epoch": 0.123, "grad_norm": 756.0, "kl_loss_10": 150.18084487915038, "kl_loss_2": 2092.4699157714845, "kl_loss_3": 1461.4225219726563, "kl_loss_7": 468.54954833984374, "learning_rate": 0.0009681970319895803, "loss": 1052.6977, "step": 1230 }, { "ce_loss_10": 3.664915943145752, "ce_loss_13": 3.591793954372406, "ce_loss_2": 4.561380934715271, "ce_loss_3": 4.269609940052033, "ce_loss_7": 3.8157546401023863, "epoch": 0.124, "grad_norm": 636.0, "kl_loss_10": 144.05731277465821, "kl_loss_2": 1976.7593994140625, "kl_loss_3": 1403.097637939453, "kl_loss_7": 451.5134246826172, "learning_rate": 0.0009676378356149733, "loss": 996.3963, "step": 1240 }, { "ce_loss_10": 3.6368504881858827, "ce_loss_13": 3.563446891307831, "ce_loss_2": 4.521274757385254, "ce_loss_3": 4.231760728359222, "ce_loss_7": 3.7818166613578796, "epoch": 0.125, "grad_norm": 680.0, "kl_loss_10": 139.92978591918944, "kl_loss_2": 1945.718603515625, "kl_loss_3": 1376.9338073730469, "kl_loss_7": 442.8954742431641, "learning_rate": 0.0009670739301325534, "loss": 985.6887, "step": 1250 }, { "ce_loss_10": 3.6006561040878298, "ce_loss_13": 3.5266774535179137, "ce_loss_2": 4.487187838554382, "ce_loss_3": 4.210299682617188, "ce_loss_7": 3.7513450741767884, "epoch": 0.126, "grad_norm": 1088.0, "kl_loss_10": 141.97663192749025, "kl_loss_2": 1945.2674438476563, "kl_loss_3": 1397.4619201660157, "kl_loss_7": 451.89989471435547, "learning_rate": 0.0009665053212208426, "loss": 997.018, "step": 1260 }, { "ce_loss_10": 3.6409796833992005, "ce_loss_13": 3.566537618637085, "ce_loss_2": 4.534491777420044, "ce_loss_3": 4.254711973667145, "ce_loss_7": 3.7902607202529905, "epoch": 0.127, "grad_norm": 740.0, "kl_loss_10": 142.8513946533203, "kl_loss_2": 1985.1681701660157, "kl_loss_3": 1422.8020446777343, "kl_loss_7": 465.36827697753904, "learning_rate": 0.0009659320146057262, "loss": 1007.2841, "step": 1270 }, { "ce_loss_10": 3.643886315822601, "ce_loss_13": 3.5711652278900146, "ce_loss_2": 4.524505710601806, "ce_loss_3": 4.246666646003723, "ce_loss_7": 3.7946216225624085, "epoch": 0.128, "grad_norm": 780.0, "kl_loss_10": 139.41454315185547, "kl_loss_2": 1946.1368347167968, "kl_loss_3": 1391.9705444335937, "kl_loss_7": 453.75545196533204, "learning_rate": 0.0009653540160603955, "loss": 983.5219, "step": 1280 }, { "ce_loss_10": 3.6445462584495543, "ce_loss_13": 3.5743473052978514, "ce_loss_2": 4.525367498397827, "ce_loss_3": 4.243791842460633, "ce_loss_7": 3.798260760307312, "epoch": 0.129, "grad_norm": 896.0, "kl_loss_10": 137.9964168548584, "kl_loss_2": 1973.8229370117188, "kl_loss_3": 1402.948223876953, "kl_loss_7": 464.4623092651367, "learning_rate": 0.0009647713314052896, "loss": 986.2184, "step": 1290 }, { "ce_loss_10": 3.598604607582092, "ce_loss_13": 3.5246904134750365, "ce_loss_2": 4.537400603294373, "ce_loss_3": 4.24116679430008, "ce_loss_7": 3.7630946040153503, "epoch": 0.13, "grad_norm": 912.0, "kl_loss_10": 140.9877899169922, "kl_loss_2": 2062.618060302734, "kl_loss_3": 1461.2658752441407, "kl_loss_7": 488.41448974609375, "learning_rate": 0.0009641839665080363, "loss": 1032.2588, "step": 1300 }, { "ce_loss_10": 3.5593097448349, "ce_loss_13": 3.489050841331482, "ce_loss_2": 4.458271086215973, "ce_loss_3": 4.172789168357849, "ce_loss_7": 3.7170780062675477, "epoch": 0.131, "grad_norm": 716.0, "kl_loss_10": 133.77973709106445, "kl_loss_2": 1963.6124877929688, "kl_loss_3": 1399.7542846679687, "kl_loss_7": 456.0334274291992, "learning_rate": 0.0009635919272833937, "loss": 982.5201, "step": 1310 }, { "ce_loss_10": 3.5990184545516968, "ce_loss_13": 3.526463270187378, "ce_loss_2": 4.490329217910767, "ce_loss_3": 4.223057639598847, "ce_loss_7": 3.7545538902282716, "epoch": 0.132, "grad_norm": 676.0, "kl_loss_10": 135.81807289123535, "kl_loss_2": 1958.54912109375, "kl_loss_3": 1432.0921813964844, "kl_loss_7": 455.8761688232422, "learning_rate": 0.0009629952196931902, "loss": 978.7096, "step": 1320 }, { "ce_loss_10": 3.5818584084510805, "ce_loss_13": 3.511084866523743, "ce_loss_2": 4.47534259557724, "ce_loss_3": 4.207857239246368, "ce_loss_7": 3.7266325116157533, "epoch": 0.133, "grad_norm": 828.0, "kl_loss_10": 134.35010833740233, "kl_loss_2": 1972.4923217773437, "kl_loss_3": 1433.8158264160156, "kl_loss_7": 440.38950805664064, "learning_rate": 0.0009623938497462645, "loss": 990.1455, "step": 1330 }, { "ce_loss_10": 3.5756045818328857, "ce_loss_13": 3.5072654485702515, "ce_loss_2": 4.466656875610352, "ce_loss_3": 4.199881613254547, "ce_loss_7": 3.7256452798843385, "epoch": 0.134, "grad_norm": 888.0, "kl_loss_10": 134.34771575927735, "kl_loss_2": 1961.1023193359374, "kl_loss_3": 1444.9174011230468, "kl_loss_7": 447.36301574707034, "learning_rate": 0.0009617878234984055, "loss": 1002.1799, "step": 1340 }, { "ce_loss_10": 3.663484239578247, "ce_loss_13": 3.5971077919006347, "ce_loss_2": 4.516588711738587, "ce_loss_3": 4.253651916980743, "ce_loss_7": 3.806522560119629, "epoch": 0.135, "grad_norm": 608.0, "kl_loss_10": 130.0582447052002, "kl_loss_2": 1881.661474609375, "kl_loss_3": 1357.9678344726562, "kl_loss_7": 429.4259552001953, "learning_rate": 0.0009611771470522907, "loss": 966.1971, "step": 1350 }, { "ce_loss_10": 3.5947040557861327, "ce_loss_13": 3.525246286392212, "ce_loss_2": 4.476583409309387, "ce_loss_3": 4.205146253108978, "ce_loss_7": 3.752723276615143, "epoch": 0.136, "grad_norm": 676.0, "kl_loss_10": 133.20248107910157, "kl_loss_2": 1918.1431030273438, "kl_loss_3": 1379.0045837402345, "kl_loss_7": 446.282942199707, "learning_rate": 0.0009605618265574251, "loss": 961.7817, "step": 1360 }, { "ce_loss_10": 3.554506206512451, "ce_loss_13": 3.480433630943298, "ce_loss_2": 4.470173907279968, "ce_loss_3": 4.1743337392807005, "ce_loss_7": 3.719905662536621, "epoch": 0.137, "grad_norm": 652.0, "kl_loss_10": 136.85387153625487, "kl_loss_2": 2017.7276672363282, "kl_loss_3": 1423.5509887695312, "kl_loss_7": 487.44747467041014, "learning_rate": 0.0009599418682100792, "loss": 998.1765, "step": 1370 }, { "ce_loss_10": 3.5901790022850038, "ce_loss_13": 3.5194321513175963, "ce_loss_2": 4.49449212551117, "ce_loss_3": 4.199472737312317, "ce_loss_7": 3.7504992127418517, "epoch": 0.138, "grad_norm": 688.0, "kl_loss_10": 135.84379615783692, "kl_loss_2": 1976.9435302734375, "kl_loss_3": 1400.4364135742187, "kl_loss_7": 462.9383514404297, "learning_rate": 0.0009593172782532268, "loss": 993.7691, "step": 1380 }, { "ce_loss_10": 3.63968049287796, "ce_loss_13": 3.569602346420288, "ce_loss_2": 4.510952973365784, "ce_loss_3": 4.237147486209869, "ce_loss_7": 3.8172584652900694, "epoch": 0.139, "grad_norm": 888.0, "kl_loss_10": 135.67766990661622, "kl_loss_2": 1924.3626342773437, "kl_loss_3": 1374.3017150878907, "kl_loss_7": 515.5329833984375, "learning_rate": 0.0009586880629764817, "loss": 987.7382, "step": 1390 }, { "ce_loss_10": 3.5654266238212586, "ce_loss_13": 3.4953743696212767, "ce_loss_2": 4.459842729568481, "ce_loss_3": 4.170699322223664, "ce_loss_7": 3.74036705493927, "epoch": 0.14, "grad_norm": 652.0, "kl_loss_10": 133.87066612243652, "kl_loss_2": 1948.5296875, "kl_loss_3": 1380.6841857910156, "kl_loss_7": 479.6327728271484, "learning_rate": 0.0009580542287160348, "loss": 970.7623, "step": 1400 }, { "ce_loss_10": 3.532876360416412, "ce_loss_13": 3.4605329275131225, "ce_loss_2": 4.411247038841248, "ce_loss_3": 4.1278337121009825, "ce_loss_7": 3.687132728099823, "epoch": 0.141, "grad_norm": 760.0, "kl_loss_10": 133.00614318847656, "kl_loss_2": 1941.4534973144532, "kl_loss_3": 1370.9834411621093, "kl_loss_7": 461.88374786376954, "learning_rate": 0.0009574157818545901, "loss": 963.357, "step": 1410 }, { "ce_loss_10": 3.6023364901542663, "ce_loss_13": 3.5363354206085207, "ce_loss_2": 4.4577757835388185, "ce_loss_3": 4.177181375026703, "ce_loss_7": 3.7514011740684508, "epoch": 0.142, "grad_norm": 596.0, "kl_loss_10": 129.52679748535155, "kl_loss_2": 1881.3617065429687, "kl_loss_3": 1332.5099182128906, "kl_loss_7": 437.22986602783203, "learning_rate": 0.0009567727288213005, "loss": 968.5264, "step": 1420 }, { "ce_loss_10": 3.5778553009033205, "ce_loss_13": 3.508594274520874, "ce_loss_2": 4.465421223640442, "ce_loss_3": 4.179102063179016, "ce_loss_7": 3.743157207965851, "epoch": 0.143, "grad_norm": 744.0, "kl_loss_10": 134.4540657043457, "kl_loss_2": 1960.2565612792969, "kl_loss_3": 1389.5939453125, "kl_loss_7": 477.4431686401367, "learning_rate": 0.0009561250760917027, "loss": 973.4651, "step": 1430 }, { "ce_loss_10": 3.602523684501648, "ce_loss_13": 3.5343605875968933, "ce_loss_2": 4.481138801574707, "ce_loss_3": 4.192634081840515, "ce_loss_7": 3.774356245994568, "epoch": 0.144, "grad_norm": 1024.0, "kl_loss_10": 135.97045669555663, "kl_loss_2": 1952.9030639648438, "kl_loss_3": 1383.874755859375, "kl_loss_7": 484.99026947021486, "learning_rate": 0.0009554728301876525, "loss": 966.9766, "step": 1440 }, { "ce_loss_10": 3.6580937623977663, "ce_loss_13": 3.5851798057556152, "ce_loss_2": 4.5087562799453735, "ce_loss_3": 4.233534407615662, "ce_loss_7": 3.825493574142456, "epoch": 0.145, "grad_norm": 928.0, "kl_loss_10": 137.07459564208983, "kl_loss_2": 1881.6396850585938, "kl_loss_3": 1345.5737915039062, "kl_loss_7": 501.10081481933594, "learning_rate": 0.0009548159976772592, "loss": 998.4485, "step": 1450 }, { "ce_loss_10": 3.596034073829651, "ce_loss_13": 3.522447967529297, "ce_loss_2": 4.478816056251526, "ce_loss_3": 4.189649546146393, "ce_loss_7": 3.762848901748657, "epoch": 0.146, "grad_norm": 1080.0, "kl_loss_10": 143.58355560302735, "kl_loss_2": 1936.1035217285157, "kl_loss_3": 1365.2523193359375, "kl_loss_7": 484.733952331543, "learning_rate": 0.0009541545851748186, "loss": 976.6273, "step": 1460 }, { "ce_loss_10": 3.474217391014099, "ce_loss_13": 3.391642665863037, "ce_loss_2": 4.375903367996216, "ce_loss_3": 4.0772433400154116, "ce_loss_7": 3.618804621696472, "epoch": 0.147, "grad_norm": 740.0, "kl_loss_10": 157.2915252685547, "kl_loss_2": 1992.6693908691407, "kl_loss_3": 1395.2765319824218, "kl_loss_7": 459.77076568603513, "learning_rate": 0.0009534885993407473, "loss": 988.6643, "step": 1470 }, { "ce_loss_10": 3.6407272934913637, "ce_loss_13": 3.5654699087142943, "ce_loss_2": 4.522874236106873, "ce_loss_3": 4.237059187889099, "ce_loss_7": 3.7881784439086914, "epoch": 0.148, "grad_norm": 768.0, "kl_loss_10": 152.31081161499023, "kl_loss_2": 1960.43330078125, "kl_loss_3": 1391.7567260742187, "kl_loss_7": 447.1866821289062, "learning_rate": 0.0009528180468815154, "loss": 985.9476, "step": 1480 }, { "ce_loss_10": 3.6812485575675966, "ce_loss_13": 3.6057910084724427, "ce_loss_2": 4.517141556739807, "ce_loss_3": 4.24812548160553, "ce_loss_7": 3.818003940582275, "epoch": 0.149, "grad_norm": 668.0, "kl_loss_10": 148.28892364501954, "kl_loss_2": 1884.2628112792968, "kl_loss_3": 1354.7002502441405, "kl_loss_7": 450.14676971435546, "learning_rate": 0.0009521429345495787, "loss": 963.1709, "step": 1490 }, { "ce_loss_10": 3.6604441523551943, "ce_loss_13": 3.5888909220695497, "ce_loss_2": 4.499482488632202, "ce_loss_3": 4.225247251987457, "ce_loss_7": 3.7930260181427, "epoch": 0.15, "grad_norm": 584.0, "kl_loss_10": 146.37998809814454, "kl_loss_2": 1869.6201477050781, "kl_loss_3": 1330.6132080078125, "kl_loss_7": 426.6243362426758, "learning_rate": 0.0009514632691433108, "loss": 954.942, "step": 1500 }, { "ce_loss_10": 3.6246044397354127, "ce_loss_13": 3.549522113800049, "ce_loss_2": 4.489072966575622, "ce_loss_3": 4.212704205513001, "ce_loss_7": 3.7693188190460205, "epoch": 0.151, "grad_norm": 800.0, "kl_loss_10": 146.22886962890624, "kl_loss_2": 1926.7843017578125, "kl_loss_3": 1374.8228454589844, "kl_loss_7": 445.3820526123047, "learning_rate": 0.0009507790575069346, "loss": 975.7757, "step": 1510 }, { "ce_loss_10": 3.60465270280838, "ce_loss_13": 3.5260859608650206, "ce_loss_2": 4.497347474098206, "ce_loss_3": 4.208156502246856, "ce_loss_7": 3.754675567150116, "epoch": 0.152, "grad_norm": 640.0, "kl_loss_10": 146.7579174041748, "kl_loss_2": 1978.4870239257812, "kl_loss_3": 1398.3346801757812, "kl_loss_7": 464.8377914428711, "learning_rate": 0.0009500903065304539, "loss": 1002.5975, "step": 1520 }, { "ce_loss_10": 3.6378764629364015, "ce_loss_13": 3.568044149875641, "ce_loss_2": 4.477255725860596, "ce_loss_3": 4.20441210269928, "ce_loss_7": 3.781054198741913, "epoch": 0.153, "grad_norm": 632.0, "kl_loss_10": 134.77642707824708, "kl_loss_2": 1860.8244018554688, "kl_loss_3": 1325.1245361328124, "kl_loss_7": 435.2869445800781, "learning_rate": 0.0009493970231495835, "loss": 956.8914, "step": 1530 }, { "ce_loss_10": 3.573818826675415, "ce_loss_13": 3.5099527835845947, "ce_loss_2": 4.410961580276489, "ce_loss_3": 4.150579679012298, "ce_loss_7": 3.7092724084854125, "epoch": 0.154, "grad_norm": 780.0, "kl_loss_10": 131.3643898010254, "kl_loss_2": 1878.5225891113282, "kl_loss_3": 1350.3878967285157, "kl_loss_7": 430.40466461181643, "learning_rate": 0.0009486992143456792, "loss": 945.2268, "step": 1540 }, { "ce_loss_10": 3.5992016792297363, "ce_loss_13": 3.523569476604462, "ce_loss_2": 4.515044832229615, "ce_loss_3": 4.228962254524231, "ce_loss_7": 3.7475435733795166, "epoch": 0.155, "grad_norm": 672.0, "kl_loss_10": 141.14554443359376, "kl_loss_2": 2031.2699279785156, "kl_loss_3": 1444.2494689941407, "kl_loss_7": 463.138671875, "learning_rate": 0.0009479968871456679, "loss": 988.6553, "step": 1550 }, { "ce_loss_10": 3.5646559238433837, "ce_loss_13": 3.4966208219528196, "ce_loss_2": 4.450777673721314, "ce_loss_3": 4.168145835399628, "ce_loss_7": 3.710667645931244, "epoch": 0.156, "grad_norm": 688.0, "kl_loss_10": 136.88069381713868, "kl_loss_2": 1965.8140869140625, "kl_loss_3": 1402.7049072265625, "kl_loss_7": 445.93438415527345, "learning_rate": 0.0009472900486219768, "loss": 963.4377, "step": 1560 }, { "ce_loss_10": 3.549368178844452, "ce_loss_13": 3.4842219233512877, "ce_loss_2": 4.410623633861542, "ce_loss_3": 4.142621529102326, "ce_loss_7": 3.6982838034629824, "epoch": 0.157, "grad_norm": 640.0, "kl_loss_10": 131.32818450927735, "kl_loss_2": 1904.2484497070313, "kl_loss_3": 1367.554217529297, "kl_loss_7": 436.6511459350586, "learning_rate": 0.000946578705892462, "loss": 958.9611, "step": 1570 }, { "ce_loss_10": 3.591766357421875, "ce_loss_13": 3.524074637889862, "ce_loss_2": 4.432039093971253, "ce_loss_3": 4.15734601020813, "ce_loss_7": 3.7368598580360413, "epoch": 0.158, "grad_norm": 616.0, "kl_loss_10": 129.00978393554686, "kl_loss_2": 1863.6193725585938, "kl_loss_3": 1319.6479736328124, "kl_loss_7": 425.303483581543, "learning_rate": 0.0009458628661203367, "loss": 948.0822, "step": 1580 }, { "ce_loss_10": 3.596361207962036, "ce_loss_13": 3.5302602648735046, "ce_loss_2": 4.493044710159301, "ce_loss_3": 4.214160966873169, "ce_loss_7": 3.748061168193817, "epoch": 0.159, "grad_norm": 708.0, "kl_loss_10": 129.65957221984863, "kl_loss_2": 1953.6658996582032, "kl_loss_3": 1407.1987854003905, "kl_loss_7": 446.0535415649414, "learning_rate": 0.0009451425365140996, "loss": 951.6695, "step": 1590 }, { "ce_loss_10": 3.6739163994789124, "ce_loss_13": 3.605791914463043, "ce_loss_2": 4.50185797214508, "ce_loss_3": 4.243522822856903, "ce_loss_7": 3.8140183806419374, "epoch": 0.16, "grad_norm": 744.0, "kl_loss_10": 128.69642486572266, "kl_loss_2": 1846.3515991210938, "kl_loss_3": 1331.8128662109375, "kl_loss_7": 434.98155517578124, "learning_rate": 0.0009444177243274617, "loss": 933.1452, "step": 1600 }, { "ce_loss_10": 3.5251585721969603, "ce_loss_13": 3.455797529220581, "ce_loss_2": 4.405846536159515, "ce_loss_3": 4.126918637752533, "ce_loss_7": 3.677467477321625, "epoch": 0.161, "grad_norm": 596.0, "kl_loss_10": 136.4243942260742, "kl_loss_2": 1933.7171264648437, "kl_loss_3": 1392.4410583496094, "kl_loss_7": 451.72364349365233, "learning_rate": 0.0009436884368592739, "loss": 967.6984, "step": 1610 }, { "ce_loss_10": 3.5786744475364687, "ce_loss_13": 3.5104586362838743, "ce_loss_2": 4.414544034004211, "ce_loss_3": 4.156228089332581, "ce_loss_7": 3.7193200826644897, "epoch": 0.162, "grad_norm": 636.0, "kl_loss_10": 131.01683616638184, "kl_loss_2": 1853.443292236328, "kl_loss_3": 1337.9854797363282, "kl_loss_7": 428.82838134765626, "learning_rate": 0.0009429546814534529, "loss": 958.7369, "step": 1620 }, { "ce_loss_10": 3.5906108379364015, "ce_loss_13": 3.526137673854828, "ce_loss_2": 4.435400915145874, "ce_loss_3": 4.173231565952301, "ce_loss_7": 3.7300363063812254, "epoch": 0.163, "grad_norm": 624.0, "kl_loss_10": 132.57526741027831, "kl_loss_2": 1873.2140075683594, "kl_loss_3": 1342.874365234375, "kl_loss_7": 429.5042190551758, "learning_rate": 0.0009422164654989072, "loss": 931.9927, "step": 1630 }, { "ce_loss_10": 3.710638439655304, "ce_loss_13": 3.6424652576446532, "ce_loss_2": 4.532403230667114, "ce_loss_3": 4.2666764736175535, "ce_loss_7": 3.849680042266846, "epoch": 0.164, "grad_norm": 860.0, "kl_loss_10": 132.91800270080566, "kl_loss_2": 1853.9109191894531, "kl_loss_3": 1322.1990844726563, "kl_loss_7": 435.6725021362305, "learning_rate": 0.0009414737964294635, "loss": 941.8417, "step": 1640 }, { "ce_loss_10": 3.6342726349830627, "ce_loss_13": 3.5669748902320864, "ce_loss_2": 4.447252655029297, "ce_loss_3": 4.185538256168366, "ce_loss_7": 3.7665403723716735, "epoch": 0.165, "grad_norm": 692.0, "kl_loss_10": 134.89337196350098, "kl_loss_2": 1796.6323547363281, "kl_loss_3": 1292.7177673339843, "kl_loss_7": 412.7283767700195, "learning_rate": 0.000940726681723791, "loss": 938.1797, "step": 1650 }, { "ce_loss_10": 3.4676742553710938, "ce_loss_13": 3.3982078671455382, "ce_loss_2": 4.3552307963371275, "ce_loss_3": 4.062869071960449, "ce_loss_7": 3.60580974817276, "epoch": 0.166, "grad_norm": 816.0, "kl_loss_10": 141.08745079040528, "kl_loss_2": 1958.7149963378906, "kl_loss_3": 1377.0126342773438, "kl_loss_7": 436.9836364746094, "learning_rate": 0.0009399751289053266, "loss": 940.9443, "step": 1660 }, { "ce_loss_10": 3.6904470801353453, "ce_loss_13": 3.6224868535995483, "ce_loss_2": 4.523065829277039, "ce_loss_3": 4.254563307762146, "ce_loss_7": 3.834598112106323, "epoch": 0.167, "grad_norm": 668.0, "kl_loss_10": 135.15199966430663, "kl_loss_2": 1857.1015625, "kl_loss_3": 1324.1543884277344, "kl_loss_7": 437.40198822021483, "learning_rate": 0.0009392191455421988, "loss": 947.3873, "step": 1670 }, { "ce_loss_10": 3.6671738743782045, "ce_loss_13": 3.587620568275452, "ce_loss_2": 4.498320388793945, "ce_loss_3": 4.22104926109314, "ce_loss_7": 3.8122028708457947, "epoch": 0.168, "grad_norm": 676.0, "kl_loss_10": 166.51247024536133, "kl_loss_2": 1892.7054138183594, "kl_loss_3": 1342.7127990722656, "kl_loss_7": 486.4829803466797, "learning_rate": 0.0009384587392471515, "loss": 948.1786, "step": 1680 }, { "ce_loss_10": 3.6600228428840635, "ce_loss_13": 3.5864087343215942, "ce_loss_2": 4.459276187419891, "ce_loss_3": 4.208518004417419, "ce_loss_7": 3.800411415100098, "epoch": 0.169, "grad_norm": 776.0, "kl_loss_10": 138.55523910522462, "kl_loss_2": 1794.223944091797, "kl_loss_3": 1295.422784423828, "kl_loss_7": 436.5801635742188, "learning_rate": 0.0009376939176774678, "loss": 924.7266, "step": 1690 }, { "ce_loss_10": 3.630977785587311, "ce_loss_13": 3.5591222643852234, "ce_loss_2": 4.463634443283081, "ce_loss_3": 4.196054553985595, "ce_loss_7": 3.776019263267517, "epoch": 0.17, "grad_norm": 680.0, "kl_loss_10": 137.77413024902344, "kl_loss_2": 1851.2871948242187, "kl_loss_3": 1321.7806213378906, "kl_loss_7": 440.95410614013673, "learning_rate": 0.0009369246885348925, "loss": 954.3758, "step": 1700 }, { "ce_loss_10": 3.620860755443573, "ce_loss_13": 3.548725974559784, "ce_loss_2": 4.493576192855835, "ce_loss_3": 4.219510662555694, "ce_loss_7": 3.7679349422454833, "epoch": 0.171, "grad_norm": 644.0, "kl_loss_10": 138.9925937652588, "kl_loss_2": 1923.6863098144531, "kl_loss_3": 1377.5073120117188, "kl_loss_7": 443.99891357421876, "learning_rate": 0.0009361510595655545, "loss": 956.0152, "step": 1710 }, { "ce_loss_10": 3.577521193027496, "ce_loss_13": 3.5036789059638975, "ce_loss_2": 4.424620795249939, "ce_loss_3": 4.1642672419548035, "ce_loss_7": 3.7225108623504637, "epoch": 0.172, "grad_norm": 828.0, "kl_loss_10": 141.93520431518556, "kl_loss_2": 1889.5624694824219, "kl_loss_3": 1368.9590454101562, "kl_loss_7": 447.4163055419922, "learning_rate": 0.0009353730385598887, "loss": 950.4418, "step": 1720 }, { "ce_loss_10": 3.50588299036026, "ce_loss_13": 3.4323722243309023, "ce_loss_2": 4.386991477012634, "ce_loss_3": 4.100323116779327, "ce_loss_7": 3.6548351407051087, "epoch": 0.173, "grad_norm": 664.0, "kl_loss_10": 135.93833503723144, "kl_loss_2": 1934.9754455566406, "kl_loss_3": 1367.2397583007812, "kl_loss_7": 442.4083419799805, "learning_rate": 0.0009345906333525581, "loss": 968.5305, "step": 1730 }, { "ce_loss_10": 3.5357080459594727, "ce_loss_13": 3.468447768688202, "ce_loss_2": 4.3971401929855345, "ce_loss_3": 4.122917258739472, "ce_loss_7": 3.682889449596405, "epoch": 0.174, "grad_norm": 732.0, "kl_loss_10": 140.2384853363037, "kl_loss_2": 1929.9898620605468, "kl_loss_3": 1376.7037963867188, "kl_loss_7": 446.17051849365237, "learning_rate": 0.0009338038518223745, "loss": 949.4842, "step": 1740 }, { "ce_loss_10": 3.6080891370773314, "ce_loss_13": 3.5360337495803833, "ce_loss_2": 4.465080261230469, "ce_loss_3": 4.204415166378022, "ce_loss_7": 3.758884835243225, "epoch": 0.175, "grad_norm": 700.0, "kl_loss_10": 141.07245063781738, "kl_loss_2": 1912.0250732421875, "kl_loss_3": 1384.9084716796874, "kl_loss_7": 451.2083984375, "learning_rate": 0.0009330127018922195, "loss": 982.5578, "step": 1750 }, { "ce_loss_10": 3.5581135034561155, "ce_loss_13": 3.487321400642395, "ce_loss_2": 4.408911228179932, "ce_loss_3": 4.13130089044571, "ce_loss_7": 3.6947452783584596, "epoch": 0.176, "grad_norm": 712.0, "kl_loss_10": 132.9031551361084, "kl_loss_2": 1901.9076782226562, "kl_loss_3": 1359.709698486328, "kl_loss_7": 436.5532958984375, "learning_rate": 0.0009322171915289634, "loss": 956.0875, "step": 1760 }, { "ce_loss_10": 3.5944949984550476, "ce_loss_13": 3.5298572301864626, "ce_loss_2": 4.4130290985107425, "ce_loss_3": 4.157473838329315, "ce_loss_7": 3.731903576850891, "epoch": 0.177, "grad_norm": 660.0, "kl_loss_10": 129.41774711608886, "kl_loss_2": 1842.0722045898438, "kl_loss_3": 1323.5928039550781, "kl_loss_7": 430.4285171508789, "learning_rate": 0.0009314173287433873, "loss": 926.6568, "step": 1770 }, { "ce_loss_10": 3.5893386363983155, "ce_loss_13": 3.522033989429474, "ce_loss_2": 4.41558871269226, "ce_loss_3": 4.1596610188484195, "ce_loss_7": 3.7286543011665345, "epoch": 0.178, "grad_norm": 700.0, "kl_loss_10": 132.24522972106934, "kl_loss_2": 1857.3857727050781, "kl_loss_3": 1338.52314453125, "kl_loss_7": 427.33155517578126, "learning_rate": 0.0009306131215901003, "loss": 926.2527, "step": 1780 }, { "ce_loss_10": 3.6202665686607363, "ce_loss_13": 3.550876200199127, "ce_loss_2": 4.450339543819427, "ce_loss_3": 4.1915468096733095, "ce_loss_7": 3.754792034626007, "epoch": 0.179, "grad_norm": 848.0, "kl_loss_10": 133.3262908935547, "kl_loss_2": 1851.2231750488281, "kl_loss_3": 1329.9062072753907, "kl_loss_7": 421.88954010009763, "learning_rate": 0.0009298045781674596, "loss": 922.665, "step": 1790 }, { "ce_loss_10": 3.598757290840149, "ce_loss_13": 3.530095064640045, "ce_loss_2": 4.4083106279373165, "ce_loss_3": 4.1537808060646055, "ce_loss_7": 3.7406049013137816, "epoch": 0.18, "grad_norm": 700.0, "kl_loss_10": 132.90703239440919, "kl_loss_2": 1814.4720642089844, "kl_loss_3": 1303.6390197753906, "kl_loss_7": 421.62610778808596, "learning_rate": 0.0009289917066174886, "loss": 935.8461, "step": 1800 }, { "ce_loss_10": 3.5973706603050233, "ce_loss_13": 3.5324791789054872, "ce_loss_2": 4.382003843784332, "ce_loss_3": 4.137510073184967, "ce_loss_7": 3.727050006389618, "epoch": 0.181, "grad_norm": 612.0, "kl_loss_10": 129.67704124450682, "kl_loss_2": 1757.3234924316407, "kl_loss_3": 1272.9893371582032, "kl_loss_7": 401.9850479125977, "learning_rate": 0.0009281745151257945, "loss": 901.8143, "step": 1810 }, { "ce_loss_10": 3.6120322823524473, "ce_loss_13": 3.5457245230674745, "ce_loss_2": 4.430144441127777, "ce_loss_3": 4.176582098007202, "ce_loss_7": 3.7436187863349915, "epoch": 0.182, "grad_norm": 712.0, "kl_loss_10": 129.36869697570802, "kl_loss_2": 1824.022509765625, "kl_loss_3": 1315.5220275878905, "kl_loss_7": 413.90467834472656, "learning_rate": 0.0009273530119214868, "loss": 927.1159, "step": 1820 }, { "ce_loss_10": 3.7161304116249085, "ce_loss_13": 3.650569665431976, "ce_loss_2": 4.521009063720703, "ce_loss_3": 4.261719655990601, "ce_loss_7": 3.84481942653656, "epoch": 0.183, "grad_norm": 720.0, "kl_loss_10": 127.82066688537597, "kl_loss_2": 1804.4899597167969, "kl_loss_3": 1294.4129760742187, "kl_loss_7": 407.70554504394534, "learning_rate": 0.0009265272052770935, "loss": 898.3252, "step": 1830 }, { "ce_loss_10": 3.523871731758118, "ce_loss_13": 3.459288680553436, "ce_loss_2": 4.376335573196411, "ce_loss_3": 4.11072758436203, "ce_loss_7": 3.6698122382164002, "epoch": 0.184, "grad_norm": 1184.0, "kl_loss_10": 125.29639282226563, "kl_loss_2": 1871.7587097167968, "kl_loss_3": 1337.487725830078, "kl_loss_7": 423.0448913574219, "learning_rate": 0.0009256971035084784, "loss": 934.5854, "step": 1840 }, { "ce_loss_10": 3.4653433561325073, "ce_loss_13": 3.3982510447502134, "ce_loss_2": 4.335454785823822, "ce_loss_3": 4.066809415817261, "ce_loss_7": 3.6129683375358583, "epoch": 0.185, "grad_norm": 792.0, "kl_loss_10": 130.14883193969726, "kl_loss_2": 1906.7114624023438, "kl_loss_3": 1382.1593811035157, "kl_loss_7": 440.7191101074219, "learning_rate": 0.0009248627149747573, "loss": 943.6289, "step": 1850 }, { "ce_loss_10": 3.674869966506958, "ce_loss_13": 3.610811698436737, "ce_loss_2": 4.48777449131012, "ce_loss_3": 4.228848516941071, "ce_loss_7": 3.8107720971107484, "epoch": 0.186, "grad_norm": 780.0, "kl_loss_10": 127.98030014038086, "kl_loss_2": 1807.074853515625, "kl_loss_3": 1310.2983764648438, "kl_loss_7": 425.2335174560547, "learning_rate": 0.0009240240480782129, "loss": 921.9982, "step": 1860 }, { "ce_loss_10": 3.575409007072449, "ce_loss_13": 3.5072484970092774, "ce_loss_2": 4.422019696235656, "ce_loss_3": 4.169667434692383, "ce_loss_7": 3.7197341561317443, "epoch": 0.187, "grad_norm": 696.0, "kl_loss_10": 128.9853702545166, "kl_loss_2": 1862.2987854003907, "kl_loss_3": 1346.4248901367187, "kl_loss_7": 428.77113037109376, "learning_rate": 0.0009231811112642122, "loss": 926.9932, "step": 1870 }, { "ce_loss_10": 3.6224915862083433, "ce_loss_13": 3.5601382493972777, "ce_loss_2": 4.417256689071655, "ce_loss_3": 4.173554491996765, "ce_loss_7": 3.762256407737732, "epoch": 0.188, "grad_norm": 772.0, "kl_loss_10": 127.82582473754883, "kl_loss_2": 1790.9530212402344, "kl_loss_3": 1313.5617370605469, "kl_loss_7": 424.0302230834961, "learning_rate": 0.0009223339130211192, "loss": 914.026, "step": 1880 }, { "ce_loss_10": 3.4702867031097413, "ce_loss_13": 3.4068259596824646, "ce_loss_2": 4.3269793629646305, "ce_loss_3": 4.06168839931488, "ce_loss_7": 3.609572696685791, "epoch": 0.189, "grad_norm": 848.0, "kl_loss_10": 124.49359817504883, "kl_loss_2": 1892.4920349121094, "kl_loss_3": 1358.4828674316407, "kl_loss_7": 421.35957489013674, "learning_rate": 0.0009214824618802108, "loss": 938.8332, "step": 1890 }, { "ce_loss_10": 3.654657757282257, "ce_loss_13": 3.590511703491211, "ce_loss_2": 4.480857229232788, "ce_loss_3": 4.237442040443421, "ce_loss_7": 3.7957029223442076, "epoch": 0.19, "grad_norm": 676.0, "kl_loss_10": 128.34262886047364, "kl_loss_2": 1797.9830383300782, "kl_loss_3": 1329.4662231445313, "kl_loss_7": 423.3448181152344, "learning_rate": 0.0009206267664155906, "loss": 940.5471, "step": 1900 }, { "ce_loss_10": 3.580376386642456, "ce_loss_13": 3.513024628162384, "ce_loss_2": 4.4014887571334835, "ce_loss_3": 4.141080892086029, "ce_loss_7": 3.7158504128456116, "epoch": 0.191, "grad_norm": 712.0, "kl_loss_10": 128.73700561523438, "kl_loss_2": 1840.3277099609375, "kl_loss_3": 1317.614239501953, "kl_loss_7": 415.00048828125, "learning_rate": 0.0009197668352441024, "loss": 925.9144, "step": 1910 }, { "ce_loss_10": 3.6335063934326173, "ce_loss_13": 3.568242275714874, "ce_loss_2": 4.440365195274353, "ce_loss_3": 4.19104700088501, "ce_loss_7": 3.765529763698578, "epoch": 0.192, "grad_norm": 724.0, "kl_loss_10": 127.70339164733886, "kl_loss_2": 1779.9927368164062, "kl_loss_3": 1291.5384399414063, "kl_loss_7": 408.45856018066405, "learning_rate": 0.0009189026770252437, "loss": 909.3059, "step": 1920 }, { "ce_loss_10": 3.6576470136642456, "ce_loss_13": 3.591932010650635, "ce_loss_2": 4.458939456939698, "ce_loss_3": 4.208939397335053, "ce_loss_7": 3.7886819362640383, "epoch": 0.193, "grad_norm": 740.0, "kl_loss_10": 130.97687530517578, "kl_loss_2": 1793.56005859375, "kl_loss_3": 1296.6363952636718, "kl_loss_7": 417.5914505004883, "learning_rate": 0.000918034300461078, "loss": 942.7983, "step": 1930 }, { "ce_loss_10": 3.69169819355011, "ce_loss_13": 3.621174895763397, "ce_loss_2": 4.485340929031372, "ce_loss_3": 4.234645307064056, "ce_loss_7": 3.8210690140724184, "epoch": 0.194, "grad_norm": 604.0, "kl_loss_10": 132.72209777832032, "kl_loss_2": 1773.5186584472656, "kl_loss_3": 1279.0208435058594, "kl_loss_7": 411.0375274658203, "learning_rate": 0.0009171617142961477, "loss": 904.5678, "step": 1940 }, { "ce_loss_10": 3.651001274585724, "ce_loss_13": 3.5832403540611266, "ce_loss_2": 4.449045872688293, "ce_loss_3": 4.200624763965607, "ce_loss_7": 3.782055652141571, "epoch": 0.195, "grad_norm": 640.0, "kl_loss_10": 137.30070991516112, "kl_loss_2": 1787.3869018554688, "kl_loss_3": 1288.391796875, "kl_loss_7": 409.46341705322266, "learning_rate": 0.0009162849273173857, "loss": 910.0326, "step": 1950 }, { "ce_loss_10": 3.590295147895813, "ce_loss_13": 3.5198971033096313, "ce_loss_2": 4.397026085853577, "ce_loss_3": 4.140172338485717, "ce_loss_7": 3.71817033290863, "epoch": 0.196, "grad_norm": 600.0, "kl_loss_10": 140.8808967590332, "kl_loss_2": 1787.7542419433594, "kl_loss_3": 1274.266912841797, "kl_loss_7": 408.3756469726562, "learning_rate": 0.0009154039483540273, "loss": 913.2887, "step": 1960 }, { "ce_loss_10": 3.578533613681793, "ce_loss_13": 3.5021342277526855, "ce_loss_2": 4.390222692489624, "ce_loss_3": 4.122443926334381, "ce_loss_7": 3.6973868012428284, "epoch": 0.197, "grad_norm": 664.0, "kl_loss_10": 149.98492126464845, "kl_loss_2": 1833.7576965332032, "kl_loss_3": 1302.9780700683593, "kl_loss_7": 410.2252365112305, "learning_rate": 0.0009145187862775209, "loss": 922.408, "step": 1970 }, { "ce_loss_10": 3.606479120254517, "ce_loss_13": 3.5367385029792784, "ce_loss_2": 4.4233134388923645, "ce_loss_3": 4.164473819732666, "ce_loss_7": 3.732180595397949, "epoch": 0.198, "grad_norm": 632.0, "kl_loss_10": 145.58407974243164, "kl_loss_2": 1824.3634704589845, "kl_loss_3": 1309.70810546875, "kl_loss_7": 409.2798355102539, "learning_rate": 0.0009136294500014386, "loss": 912.488, "step": 1980 }, { "ce_loss_10": 3.5539753794670106, "ce_loss_13": 3.4837575554847717, "ce_loss_2": 4.3912107944488525, "ce_loss_3": 4.130738985538483, "ce_loss_7": 3.6802467107772827, "epoch": 0.199, "grad_norm": 1136.0, "kl_loss_10": 136.45192756652833, "kl_loss_2": 1860.856219482422, "kl_loss_3": 1332.878594970703, "kl_loss_7": 410.7211395263672, "learning_rate": 0.000912735948481387, "loss": 932.3057, "step": 1990 }, { "ce_loss_10": 3.5889375805854797, "ce_loss_13": 3.515679347515106, "ce_loss_2": 4.391900050640106, "ce_loss_3": 4.148991334438324, "ce_loss_7": 3.7147971630096435, "epoch": 0.2, "grad_norm": 800.0, "kl_loss_10": 139.69712715148927, "kl_loss_2": 1814.8407287597656, "kl_loss_3": 1331.9040588378907, "kl_loss_7": 416.8130645751953, "learning_rate": 0.0009118382907149164, "loss": 911.6631, "step": 2000 }, { "ce_loss_10": 3.6133728981018067, "ce_loss_13": 3.5464622616767882, "ce_loss_2": 4.418559169769287, "ce_loss_3": 4.164545750617981, "ce_loss_7": 3.7400378227233886, "epoch": 0.201, "grad_norm": 724.0, "kl_loss_10": 134.42137565612794, "kl_loss_2": 1802.273388671875, "kl_loss_3": 1304.1902709960937, "kl_loss_7": 413.644108581543, "learning_rate": 0.0009109364857414306, "loss": 904.017, "step": 2010 }, { "ce_loss_10": 3.5799274682998656, "ce_loss_13": 3.5125221014022827, "ce_loss_2": 4.36887800693512, "ce_loss_3": 4.120139968395233, "ce_loss_7": 3.708935272693634, "epoch": 0.202, "grad_norm": 964.0, "kl_loss_10": 129.09841537475586, "kl_loss_2": 1798.1168884277345, "kl_loss_3": 1280.6332702636719, "kl_loss_7": 412.83667907714846, "learning_rate": 0.0009100305426420956, "loss": 926.6541, "step": 2020 }, { "ce_loss_10": 3.5351605296134947, "ce_loss_13": 3.470785641670227, "ce_loss_2": 4.400927019119263, "ce_loss_3": 4.112563371658325, "ce_loss_7": 3.667282259464264, "epoch": 0.203, "grad_norm": 716.0, "kl_loss_10": 129.4260051727295, "kl_loss_2": 1925.4462341308595, "kl_loss_3": 1348.8035705566406, "kl_loss_7": 415.00121154785154, "learning_rate": 0.0009091204705397484, "loss": 923.3018, "step": 2030 }, { "ce_loss_10": 3.5308706402778625, "ce_loss_13": 3.459333658218384, "ce_loss_2": 4.386084961891174, "ce_loss_3": 4.109188389778137, "ce_loss_7": 3.65969340801239, "epoch": 0.204, "grad_norm": 656.0, "kl_loss_10": 142.97890548706056, "kl_loss_2": 1905.0079406738282, "kl_loss_3": 1356.3838134765624, "kl_loss_7": 418.6112533569336, "learning_rate": 0.0009082062785988049, "loss": 936.988, "step": 2040 }, { "ce_loss_10": 3.6675715923309324, "ce_loss_13": 3.602421057224274, "ce_loss_2": 4.449817395210266, "ce_loss_3": 4.201005351543427, "ce_loss_7": 3.7939741253852843, "epoch": 0.205, "grad_norm": 896.0, "kl_loss_10": 128.0077262878418, "kl_loss_2": 1770.9144714355468, "kl_loss_3": 1268.815936279297, "kl_loss_7": 408.55276641845705, "learning_rate": 0.0009072879760251679, "loss": 905.6441, "step": 2050 }, { "ce_loss_10": 3.6087319254875183, "ce_loss_13": 3.542242980003357, "ce_loss_2": 4.4326321363449095, "ce_loss_3": 4.171148526668548, "ce_loss_7": 3.756589877605438, "epoch": 0.206, "grad_norm": 1088.0, "kl_loss_10": 127.51346244812012, "kl_loss_2": 1838.9010009765625, "kl_loss_3": 1308.4912536621093, "kl_loss_7": 436.77366790771487, "learning_rate": 0.0009063655720661341, "loss": 915.8813, "step": 2060 }, { "ce_loss_10": 3.659642589092255, "ce_loss_13": 3.5936012625694276, "ce_loss_2": 4.449182534217835, "ce_loss_3": 4.205568349361419, "ce_loss_7": 3.7972538590431215, "epoch": 0.207, "grad_norm": 800.0, "kl_loss_10": 129.16879119873047, "kl_loss_2": 1771.5154296875, "kl_loss_3": 1277.5388916015625, "kl_loss_7": 426.86106262207034, "learning_rate": 0.000905439076010301, "loss": 904.9301, "step": 2070 }, { "ce_loss_10": 3.6077974796295167, "ce_loss_13": 3.542259955406189, "ce_loss_2": 4.422458052635193, "ce_loss_3": 4.169653272628784, "ce_loss_7": 3.7490785837173464, "epoch": 0.208, "grad_norm": 844.0, "kl_loss_10": 126.11813430786133, "kl_loss_2": 1798.7173217773438, "kl_loss_3": 1302.0502624511719, "kl_loss_7": 423.26373596191405, "learning_rate": 0.0009045084971874737, "loss": 895.15, "step": 2080 }, { "ce_loss_10": 3.589551329612732, "ce_loss_13": 3.52473863363266, "ce_loss_2": 4.3952157378196715, "ce_loss_3": 4.138834273815155, "ce_loss_7": 3.732974636554718, "epoch": 0.209, "grad_norm": 868.0, "kl_loss_10": 128.90884971618652, "kl_loss_2": 1803.3426025390625, "kl_loss_3": 1289.222833251953, "kl_loss_7": 426.99852142333987, "learning_rate": 0.0009035738449685707, "loss": 922.7558, "step": 2090 }, { "ce_loss_10": 3.527937316894531, "ce_loss_13": 3.4618417263031005, "ce_loss_2": 4.382559823989868, "ce_loss_3": 4.104182541370392, "ce_loss_7": 3.6752463936805726, "epoch": 0.21, "grad_norm": 1056.0, "kl_loss_10": 128.64362869262695, "kl_loss_2": 1863.6190979003907, "kl_loss_3": 1317.049594116211, "kl_loss_7": 438.52687072753906, "learning_rate": 0.0009026351287655293, "loss": 912.7813, "step": 2100 }, { "ce_loss_10": 3.7270989418029785, "ce_loss_13": 3.6646096467971803, "ce_loss_2": 4.471357464790344, "ce_loss_3": 4.233025455474854, "ce_loss_7": 3.8727977514266967, "epoch": 0.211, "grad_norm": 880.0, "kl_loss_10": 123.86026992797852, "kl_loss_2": 1695.6019714355468, "kl_loss_3": 1220.0527893066405, "kl_loss_7": 441.4079650878906, "learning_rate": 0.0009016923580312113, "loss": 876.9629, "step": 2110 }, { "ce_loss_10": 3.5797381520271303, "ce_loss_13": 3.51902893781662, "ce_loss_2": 4.368394494056702, "ce_loss_3": 4.1188765406608585, "ce_loss_7": 3.7279640674591064, "epoch": 0.212, "grad_norm": 672.0, "kl_loss_10": 123.35817451477051, "kl_loss_2": 1765.1290466308594, "kl_loss_3": 1263.2463500976562, "kl_loss_7": 441.48437957763673, "learning_rate": 0.0009007455422593077, "loss": 915.6727, "step": 2120 }, { "ce_loss_10": 3.5914886832237243, "ce_loss_13": 3.5270834445953367, "ce_loss_2": 4.4055636644363405, "ce_loss_3": 4.146296298503875, "ce_loss_7": 3.7344082236289977, "epoch": 0.213, "grad_norm": 760.0, "kl_loss_10": 125.04854812622071, "kl_loss_2": 1831.5112854003905, "kl_loss_3": 1317.9097106933593, "kl_loss_7": 445.1996826171875, "learning_rate": 0.0008997946909842425, "loss": 920.9859, "step": 2130 }, { "ce_loss_10": 3.6091657638549806, "ce_loss_13": 3.5466102242469786, "ce_loss_2": 4.466888070106506, "ce_loss_3": 4.190494358539581, "ce_loss_7": 3.7730682611465456, "epoch": 0.214, "grad_norm": 724.0, "kl_loss_10": 127.54317474365234, "kl_loss_2": 1898.3191040039062, "kl_loss_3": 1347.2041625976562, "kl_loss_7": 454.85473480224607, "learning_rate": 0.0008988398137810777, "loss": 921.4855, "step": 2140 }, { "ce_loss_10": 3.6425514817237854, "ce_loss_13": 3.5820160508155823, "ce_loss_2": 4.429642176628112, "ce_loss_3": 4.189036953449249, "ce_loss_7": 3.7897796273231505, "epoch": 0.215, "grad_norm": 932.0, "kl_loss_10": 121.40589485168456, "kl_loss_2": 1755.1794311523438, "kl_loss_3": 1274.5461669921874, "kl_loss_7": 430.68811492919923, "learning_rate": 0.0008978809202654162, "loss": 894.6924, "step": 2150 }, { "ce_loss_10": 3.6215335607528685, "ce_loss_13": 3.558076047897339, "ce_loss_2": 4.416151642799377, "ce_loss_3": 4.169169640541076, "ce_loss_7": 3.7691269755363463, "epoch": 0.216, "grad_norm": 932.0, "kl_loss_10": 123.7242588043213, "kl_loss_2": 1753.6674377441407, "kl_loss_3": 1266.8179077148438, "kl_loss_7": 428.3472427368164, "learning_rate": 0.0008969180200933046, "loss": 904.1987, "step": 2160 }, { "ce_loss_10": 3.5824714422225954, "ce_loss_13": 3.5178847312927246, "ce_loss_2": 4.409107494354248, "ce_loss_3": 4.154549217224121, "ce_loss_7": 3.7252731919288635, "epoch": 0.217, "grad_norm": 748.0, "kl_loss_10": 127.39406929016113, "kl_loss_2": 1820.242236328125, "kl_loss_3": 1317.6539672851563, "kl_loss_7": 430.6659332275391, "learning_rate": 0.0008959511229611376, "loss": 921.1195, "step": 2170 }, { "ce_loss_10": 3.6616580963134764, "ce_loss_13": 3.6004168510437013, "ce_loss_2": 4.448979210853577, "ce_loss_3": 4.202481353282929, "ce_loss_7": 3.7962108612060548, "epoch": 0.218, "grad_norm": 832.0, "kl_loss_10": 124.2728874206543, "kl_loss_2": 1757.4144775390625, "kl_loss_3": 1277.402899169922, "kl_loss_7": 411.25287170410155, "learning_rate": 0.0008949802386055581, "loss": 896.6183, "step": 2180 }, { "ce_loss_10": 3.523484992980957, "ce_loss_13": 3.4596511244773867, "ce_loss_2": 4.323343431949615, "ce_loss_3": 4.078759324550629, "ce_loss_7": 3.656231939792633, "epoch": 0.219, "grad_norm": 652.0, "kl_loss_10": 124.65362243652343, "kl_loss_2": 1768.03203125, "kl_loss_3": 1284.9952880859375, "kl_loss_7": 412.17943267822267, "learning_rate": 0.0008940053768033609, "loss": 913.8371, "step": 2190 }, { "ce_loss_10": 3.6082441568374635, "ce_loss_13": 3.5431989073753356, "ce_loss_2": 4.384416913986206, "ce_loss_3": 4.140364813804626, "ce_loss_7": 3.7515419840812685, "epoch": 0.22, "grad_norm": 720.0, "kl_loss_10": 127.33021545410156, "kl_loss_2": 1742.5865051269532, "kl_loss_3": 1254.7478149414062, "kl_loss_7": 417.65948181152345, "learning_rate": 0.0008930265473713938, "loss": 890.3902, "step": 2200 }, { "ce_loss_10": 3.575028455257416, "ce_loss_13": 3.50554678440094, "ce_loss_2": 4.3625051259994505, "ce_loss_3": 4.108423435688019, "ce_loss_7": 3.704033839702606, "epoch": 0.221, "grad_norm": 688.0, "kl_loss_10": 138.50967597961426, "kl_loss_2": 1779.0279418945313, "kl_loss_3": 1264.2146911621094, "kl_loss_7": 412.3875, "learning_rate": 0.0008920437601664579, "loss": 883.3312, "step": 2210 }, { "ce_loss_10": 3.5648491382598877, "ce_loss_13": 3.4946847558021545, "ce_loss_2": 4.360548412799835, "ce_loss_3": 4.100372767448425, "ce_loss_7": 3.6889105439186096, "epoch": 0.222, "grad_norm": 680.0, "kl_loss_10": 139.61580123901368, "kl_loss_2": 1789.831298828125, "kl_loss_3": 1274.1759338378906, "kl_loss_7": 412.41027374267577, "learning_rate": 0.0008910570250852097, "loss": 889.849, "step": 2220 }, { "ce_loss_10": 3.6774597764015198, "ce_loss_13": 3.609463691711426, "ce_loss_2": 4.430353260040283, "ce_loss_3": 4.181297183036804, "ce_loss_7": 3.7937057614326477, "epoch": 0.223, "grad_norm": 644.0, "kl_loss_10": 136.47908210754395, "kl_loss_2": 1709.4117919921875, "kl_loss_3": 1215.9844177246093, "kl_loss_7": 392.6665496826172, "learning_rate": 0.0008900663520640604, "loss": 868.2748, "step": 2230 }, { "ce_loss_10": 3.6227930784225464, "ce_loss_13": 3.554355251789093, "ce_loss_2": 4.40500385761261, "ce_loss_3": 4.147241795063019, "ce_loss_7": 3.749156880378723, "epoch": 0.224, "grad_norm": 624.0, "kl_loss_10": 133.57184333801268, "kl_loss_2": 1755.0290893554688, "kl_loss_3": 1248.1256103515625, "kl_loss_7": 399.51112365722656, "learning_rate": 0.0008890717510790764, "loss": 892.1449, "step": 2240 }, { "ce_loss_10": 3.5782495737075806, "ce_loss_13": 3.5105922341346742, "ce_loss_2": 4.390767920017242, "ce_loss_3": 4.117860805988312, "ce_loss_7": 3.7023658394813537, "epoch": 0.225, "grad_norm": 592.0, "kl_loss_10": 131.22142372131347, "kl_loss_2": 1810.7025451660156, "kl_loss_3": 1283.424383544922, "kl_loss_7": 400.15897369384766, "learning_rate": 0.0008880732321458784, "loss": 911.5875, "step": 2250 }, { "ce_loss_10": 3.608826470375061, "ce_loss_13": 3.5425026178359986, "ce_loss_2": 4.398601484298706, "ce_loss_3": 4.147091746330261, "ce_loss_7": 3.738967227935791, "epoch": 0.226, "grad_norm": 684.0, "kl_loss_10": 128.29030151367186, "kl_loss_2": 1756.5390869140624, "kl_loss_3": 1258.1002624511718, "kl_loss_7": 400.47411651611327, "learning_rate": 0.0008870708053195413, "loss": 901.6658, "step": 2260 }, { "ce_loss_10": 3.636641502380371, "ce_loss_13": 3.5736650347709658, "ce_loss_2": 4.407045221328735, "ce_loss_3": 4.153824484348297, "ce_loss_7": 3.75787615776062, "epoch": 0.227, "grad_norm": 640.0, "kl_loss_10": 125.45826568603516, "kl_loss_2": 1731.1550048828126, "kl_loss_3": 1238.2566467285155, "kl_loss_7": 392.61446075439454, "learning_rate": 0.0008860644806944918, "loss": 876.1695, "step": 2270 }, { "ce_loss_10": 3.5665910243988037, "ce_loss_13": 3.503004324436188, "ce_loss_2": 4.382701706886292, "ce_loss_3": 4.120155215263367, "ce_loss_7": 3.7016607880592347, "epoch": 0.228, "grad_norm": 728.0, "kl_loss_10": 127.46506385803222, "kl_loss_2": 1802.0574890136718, "kl_loss_3": 1290.8335571289062, "kl_loss_7": 410.45982666015624, "learning_rate": 0.0008850542684044079, "loss": 881.4728, "step": 2280 }, { "ce_loss_10": 3.543700098991394, "ce_loss_13": 3.47739417552948, "ce_loss_2": 4.388913011550903, "ce_loss_3": 4.117139446735382, "ce_loss_7": 3.682552933692932, "epoch": 0.229, "grad_norm": 728.0, "kl_loss_10": 131.36685447692872, "kl_loss_2": 1882.3553527832032, "kl_loss_3": 1329.7817443847657, "kl_loss_7": 425.6471466064453, "learning_rate": 0.0008840401786221159, "loss": 910.9973, "step": 2290 }, { "ce_loss_10": 3.6810392260551454, "ce_loss_13": 3.6215655326843263, "ce_loss_2": 4.462311911582947, "ce_loss_3": 4.211032736301422, "ce_loss_7": 3.8120059728622437, "epoch": 0.23, "grad_norm": 804.0, "kl_loss_10": 121.09583320617676, "kl_loss_2": 1728.6732299804687, "kl_loss_3": 1240.8697448730468, "kl_loss_7": 396.0142120361328, "learning_rate": 0.000883022221559489, "loss": 871.0646, "step": 2300 }, { "ce_loss_10": 3.6313990473747255, "ce_loss_13": 3.5685426592826843, "ce_loss_2": 4.423063921928406, "ce_loss_3": 4.169731891155243, "ce_loss_7": 3.7611438274383544, "epoch": 0.231, "grad_norm": 1012.0, "kl_loss_10": 122.84410018920899, "kl_loss_2": 1778.2179321289063, "kl_loss_3": 1282.86826171875, "kl_loss_7": 401.9935546875, "learning_rate": 0.0008820004074673434, "loss": 921.2191, "step": 2310 }, { "ce_loss_10": 3.539715349674225, "ce_loss_13": 3.4805975198745727, "ce_loss_2": 4.33271632194519, "ce_loss_3": 4.094627571105957, "ce_loss_7": 3.6719760417938234, "epoch": 0.232, "grad_norm": 772.0, "kl_loss_10": 121.08334236145019, "kl_loss_2": 1780.4273986816406, "kl_loss_3": 1301.80224609375, "kl_loss_7": 405.6146301269531, "learning_rate": 0.0008809747466353355, "loss": 881.2713, "step": 2320 }, { "ce_loss_10": 3.5549276947975157, "ce_loss_13": 3.492107570171356, "ce_loss_2": 4.3491785526275635, "ce_loss_3": 4.091101741790771, "ce_loss_7": 3.6815092086791994, "epoch": 0.233, "grad_norm": 944.0, "kl_loss_10": 123.1663890838623, "kl_loss_2": 1787.4856750488282, "kl_loss_3": 1267.0135620117187, "kl_loss_7": 398.70018005371094, "learning_rate": 0.0008799452493918585, "loss": 893.7687, "step": 2330 }, { "ce_loss_10": 3.6380561709403993, "ce_loss_13": 3.576252269744873, "ce_loss_2": 4.420890057086945, "ce_loss_3": 4.1712160348892215, "ce_loss_7": 3.765425479412079, "epoch": 0.234, "grad_norm": 672.0, "kl_loss_10": 121.48801879882812, "kl_loss_2": 1744.5474609375, "kl_loss_3": 1255.0732666015624, "kl_loss_7": 395.8260665893555, "learning_rate": 0.0008789119261039385, "loss": 904.2805, "step": 2340 }, { "ce_loss_10": 3.538336658477783, "ce_loss_13": 3.479731857776642, "ce_loss_2": 4.332676410675049, "ce_loss_3": 4.080135154724121, "ce_loss_7": 3.6660250306129454, "epoch": 0.235, "grad_norm": 1080.0, "kl_loss_10": 121.64487190246582, "kl_loss_2": 1748.9600341796875, "kl_loss_3": 1249.2535583496094, "kl_loss_7": 397.9961563110352, "learning_rate": 0.0008778747871771292, "loss": 872.0779, "step": 2350 }, { "ce_loss_10": 3.5936312079429626, "ce_loss_13": 3.5326396346092226, "ce_loss_2": 4.350062465667724, "ce_loss_3": 4.1070164203643795, "ce_loss_7": 3.7190688967704775, "epoch": 0.236, "grad_norm": 892.0, "kl_loss_10": 118.80932960510253, "kl_loss_2": 1697.8776550292969, "kl_loss_3": 1216.870068359375, "kl_loss_7": 386.30751953125, "learning_rate": 0.0008768338430554083, "loss": 855.8582, "step": 2360 }, { "ce_loss_10": 3.602631461620331, "ce_loss_13": 3.5398433089256285, "ce_loss_2": 4.386758756637573, "ce_loss_3": 4.139672267436981, "ce_loss_7": 3.7339996218681337, "epoch": 0.237, "grad_norm": 652.0, "kl_loss_10": 122.76006050109864, "kl_loss_2": 1740.8530029296876, "kl_loss_3": 1251.0193359375, "kl_loss_7": 402.31095733642576, "learning_rate": 0.0008757891042210713, "loss": 884.6639, "step": 2370 }, { "ce_loss_10": 3.6247237086296082, "ce_loss_13": 3.5633601546287537, "ce_loss_2": 4.408042776584625, "ce_loss_3": 4.159637022018432, "ce_loss_7": 3.755664598941803, "epoch": 0.238, "grad_norm": 820.0, "kl_loss_10": 122.77941551208497, "kl_loss_2": 1740.5317749023438, "kl_loss_3": 1251.8955200195312, "kl_loss_7": 402.40113067626953, "learning_rate": 0.0008747405811946271, "loss": 882.5381, "step": 2380 }, { "ce_loss_10": 3.516115999221802, "ce_loss_13": 3.4537455320358275, "ce_loss_2": 4.341125226020813, "ce_loss_3": 4.073987782001495, "ce_loss_7": 3.650569033622742, "epoch": 0.239, "grad_norm": 864.0, "kl_loss_10": 126.38643531799316, "kl_loss_2": 1825.3509948730468, "kl_loss_3": 1301.4881164550782, "kl_loss_7": 412.8675018310547, "learning_rate": 0.0008736882845346905, "loss": 879.5434, "step": 2390 }, { "ce_loss_10": 3.6170632362365724, "ce_loss_13": 3.5505687832832336, "ce_loss_2": 4.411578941345215, "ce_loss_3": 4.151790750026703, "ce_loss_7": 3.7476152896881105, "epoch": 0.24, "grad_norm": 808.0, "kl_loss_10": 128.26573524475097, "kl_loss_2": 1751.8210021972657, "kl_loss_3": 1247.988262939453, "kl_loss_7": 403.38811645507815, "learning_rate": 0.0008726322248378774, "loss": 875.394, "step": 2400 }, { "ce_loss_10": 3.611607217788696, "ce_loss_13": 3.551726186275482, "ce_loss_2": 4.430306696891785, "ce_loss_3": 4.15489045381546, "ce_loss_7": 3.743276059627533, "epoch": 0.241, "grad_norm": 540.0, "kl_loss_10": 124.3121467590332, "kl_loss_2": 1794.8143432617187, "kl_loss_3": 1257.1396118164062, "kl_loss_7": 393.2715179443359, "learning_rate": 0.0008715724127386971, "loss": 900.7046, "step": 2410 }, { "ce_loss_10": 3.686048352718353, "ce_loss_13": 3.624235725402832, "ce_loss_2": 4.4445260763168335, "ce_loss_3": 4.193621242046357, "ce_loss_7": 3.8048259377479554, "epoch": 0.242, "grad_norm": 848.0, "kl_loss_10": 123.6441822052002, "kl_loss_2": 1725.1532287597656, "kl_loss_3": 1225.6928466796876, "kl_loss_7": 388.18777923583986, "learning_rate": 0.0008705088589094458, "loss": 875.9166, "step": 2420 }, { "ce_loss_10": 3.6983994841575623, "ce_loss_13": 3.6343111276626585, "ce_loss_2": 4.466147994995117, "ce_loss_3": 4.217593848705292, "ce_loss_7": 3.8223814606666564, "epoch": 0.243, "grad_norm": 608.0, "kl_loss_10": 126.81392974853516, "kl_loss_2": 1740.5603149414062, "kl_loss_3": 1234.4478515625, "kl_loss_7": 392.4835739135742, "learning_rate": 0.0008694415740600988, "loss": 880.674, "step": 2430 }, { "ce_loss_10": 3.5496601462364197, "ce_loss_13": 3.4869441866874693, "ce_loss_2": 4.3661869168281555, "ce_loss_3": 4.098685562610626, "ce_loss_7": 3.6733205676078797, "epoch": 0.244, "grad_norm": 768.0, "kl_loss_10": 130.24971961975098, "kl_loss_2": 1814.3707336425782, "kl_loss_3": 1282.4598083496094, "kl_loss_7": 392.8763092041016, "learning_rate": 0.0008683705689382025, "loss": 889.2466, "step": 2440 }, { "ce_loss_10": 3.632191073894501, "ce_loss_13": 3.571500778198242, "ce_loss_2": 4.403897631168365, "ce_loss_3": 4.150809907913208, "ce_loss_7": 3.7560539484024047, "epoch": 0.245, "grad_norm": 652.0, "kl_loss_10": 121.29967422485352, "kl_loss_2": 1724.3360534667968, "kl_loss_3": 1225.94296875, "kl_loss_7": 384.2587158203125, "learning_rate": 0.0008672958543287666, "loss": 887.1217, "step": 2450 }, { "ce_loss_10": 3.6426288723945617, "ce_loss_13": 3.579582178592682, "ce_loss_2": 4.396446967124939, "ce_loss_3": 4.160457873344422, "ce_loss_7": 3.7650449872016907, "epoch": 0.246, "grad_norm": 832.0, "kl_loss_10": 122.32759323120118, "kl_loss_2": 1706.86083984375, "kl_loss_3": 1225.3012390136719, "kl_loss_7": 388.13391876220703, "learning_rate": 0.0008662174410541554, "loss": 865.7108, "step": 2460 }, { "ce_loss_10": 3.6073416233062745, "ce_loss_13": 3.5467630982398988, "ce_loss_2": 4.364645791053772, "ce_loss_3": 4.125742852687836, "ce_loss_7": 3.73012033700943, "epoch": 0.247, "grad_norm": 676.0, "kl_loss_10": 119.47982826232911, "kl_loss_2": 1703.542169189453, "kl_loss_3": 1221.757342529297, "kl_loss_7": 387.1647262573242, "learning_rate": 0.0008651353399739787, "loss": 878.2365, "step": 2470 }, { "ce_loss_10": 3.635068428516388, "ce_loss_13": 3.575910484790802, "ce_loss_2": 4.404824256896973, "ce_loss_3": 4.159365427494049, "ce_loss_7": 3.7614513635635376, "epoch": 0.248, "grad_norm": 840.0, "kl_loss_10": 120.67507553100586, "kl_loss_2": 1721.6146362304687, "kl_loss_3": 1229.2794067382813, "kl_loss_7": 393.36814270019534, "learning_rate": 0.0008640495619849821, "loss": 869.5021, "step": 2480 }, { "ce_loss_10": 3.594242787361145, "ce_loss_13": 3.5357272744178774, "ce_loss_2": 4.36545352935791, "ce_loss_3": 4.118818569183349, "ce_loss_7": 3.719261586666107, "epoch": 0.249, "grad_norm": 892.0, "kl_loss_10": 118.52586669921875, "kl_loss_2": 1712.1851440429687, "kl_loss_3": 1222.3048828125, "kl_loss_7": 391.0434173583984, "learning_rate": 0.0008629601180209381, "loss": 863.2522, "step": 2490 }, { "ce_loss_10": 3.5893252372741697, "ce_loss_13": 3.525994336605072, "ce_loss_2": 4.360780739784241, "ce_loss_3": 4.114725041389465, "ce_loss_7": 3.7128764629364013, "epoch": 0.25, "grad_norm": 684.0, "kl_loss_10": 120.1303482055664, "kl_loss_2": 1711.085009765625, "kl_loss_3": 1227.3393310546876, "kl_loss_7": 389.1397216796875, "learning_rate": 0.000861867019052535, "loss": 873.0406, "step": 2500 }, { "ce_loss_10": 3.5032670855522157, "ce_loss_13": 3.4380154728889467, "ce_loss_2": 4.322482085227966, "ce_loss_3": 4.051889681816101, "ce_loss_7": 3.633055055141449, "epoch": 0.251, "grad_norm": 728.0, "kl_loss_10": 122.94742317199707, "kl_loss_2": 1814.5214538574219, "kl_loss_3": 1273.9253723144532, "kl_loss_7": 403.30311126708983, "learning_rate": 0.0008607702760872678, "loss": 900.1684, "step": 2510 }, { "ce_loss_10": 3.6212844133377073, "ce_loss_13": 3.562401294708252, "ce_loss_2": 4.381760013103485, "ce_loss_3": 4.1439371585845945, "ce_loss_7": 3.750008535385132, "epoch": 0.252, "grad_norm": 932.0, "kl_loss_10": 120.19480628967285, "kl_loss_2": 1702.0514221191406, "kl_loss_3": 1224.6224182128906, "kl_loss_7": 390.96875762939453, "learning_rate": 0.0008596699001693256, "loss": 886.3662, "step": 2520 }, { "ce_loss_10": 3.6345497965812683, "ce_loss_13": 3.5723119258880613, "ce_loss_2": 4.3780588746070865, "ce_loss_3": 4.1333225011825565, "ce_loss_7": 3.7509743213653564, "epoch": 0.253, "grad_norm": 852.0, "kl_loss_10": 125.0958065032959, "kl_loss_2": 1702.8536560058594, "kl_loss_3": 1212.0218933105468, "kl_loss_7": 393.04117889404296, "learning_rate": 0.0008585659023794818, "loss": 883.0333, "step": 2530 }, { "ce_loss_10": 3.585874927043915, "ce_loss_13": 3.5245532989501953, "ce_loss_2": 4.397958612442016, "ce_loss_3": 4.136601293087006, "ce_loss_7": 3.715954637527466, "epoch": 0.254, "grad_norm": 700.0, "kl_loss_10": 123.6213150024414, "kl_loss_2": 1794.7963500976562, "kl_loss_3": 1277.6108825683593, "kl_loss_7": 406.0328598022461, "learning_rate": 0.0008574582938349817, "loss": 888.2199, "step": 2540 }, { "ce_loss_10": 3.588877189159393, "ce_loss_13": 3.523186147212982, "ce_loss_2": 4.388445925712586, "ce_loss_3": 4.141435182094574, "ce_loss_7": 3.719454038143158, "epoch": 0.255, "grad_norm": 656.0, "kl_loss_10": 126.89146575927734, "kl_loss_2": 1793.1378295898437, "kl_loss_3": 1285.1409545898437, "kl_loss_7": 409.85553283691405, "learning_rate": 0.0008563470856894315, "loss": 871.2184, "step": 2550 }, { "ce_loss_10": 3.572152853012085, "ce_loss_13": 3.5135809659957884, "ce_loss_2": 4.3703243136405945, "ce_loss_3": 4.109549701213837, "ce_loss_7": 3.698936939239502, "epoch": 0.256, "grad_norm": 768.0, "kl_loss_10": 118.68732070922852, "kl_loss_2": 1762.0347900390625, "kl_loss_3": 1246.6959045410156, "kl_loss_7": 389.87054748535155, "learning_rate": 0.0008552322891326845, "loss": 876.7961, "step": 2560 }, { "ce_loss_10": 3.550217092037201, "ce_loss_13": 3.486633098125458, "ce_loss_2": 4.336965727806091, "ce_loss_3": 4.082358205318451, "ce_loss_7": 3.6735817790031433, "epoch": 0.257, "grad_norm": 1152.0, "kl_loss_10": 123.27512969970704, "kl_loss_2": 1770.0594482421875, "kl_loss_3": 1253.9241149902343, "kl_loss_7": 390.6649993896484, "learning_rate": 0.0008541139153907296, "loss": 871.7845, "step": 2570 }, { "ce_loss_10": 3.5077988743782043, "ce_loss_13": 3.4418245196342467, "ce_loss_2": 4.287769389152527, "ce_loss_3": 4.030180740356445, "ce_loss_7": 3.6243348598480223, "epoch": 0.258, "grad_norm": 720.0, "kl_loss_10": 130.69061660766602, "kl_loss_2": 1752.0322875976562, "kl_loss_3": 1243.0596984863282, "kl_loss_7": 387.85493621826174, "learning_rate": 0.0008529919757255782, "loss": 881.207, "step": 2580 }, { "ce_loss_10": 3.5372533082962034, "ce_loss_13": 3.4788613438606264, "ce_loss_2": 4.2806028008461, "ce_loss_3": 4.038917350769043, "ce_loss_7": 3.6509461998939514, "epoch": 0.259, "grad_norm": 772.0, "kl_loss_10": 125.94407691955567, "kl_loss_2": 1680.382861328125, "kl_loss_3": 1199.427801513672, "kl_loss_7": 374.88755950927737, "learning_rate": 0.0008518664814351503, "loss": 853.0765, "step": 2590 }, { "ce_loss_10": 3.5122454166412354, "ce_loss_13": 3.445158326625824, "ce_loss_2": 4.303707993030548, "ce_loss_3": 4.055305373668671, "ce_loss_7": 3.632937502861023, "epoch": 0.26, "grad_norm": 1088.0, "kl_loss_10": 128.98979454040528, "kl_loss_2": 1786.3385864257812, "kl_loss_3": 1283.6488037109375, "kl_loss_7": 396.84752502441404, "learning_rate": 0.0008507374438531607, "loss": 916.0336, "step": 2600 }, { "ce_loss_10": 3.480205035209656, "ce_loss_13": 3.4188055038452148, "ce_loss_2": 4.268492805957794, "ce_loss_3": 4.014702022075653, "ce_loss_7": 3.601118636131287, "epoch": 0.261, "grad_norm": 688.0, "kl_loss_10": 122.25764427185058, "kl_loss_2": 1751.123272705078, "kl_loss_3": 1243.3333557128906, "kl_loss_7": 384.45611572265625, "learning_rate": 0.0008496048743490053, "loss": 870.8131, "step": 2610 }, { "ce_loss_10": 3.6342422485351564, "ce_loss_13": 3.5757507920265197, "ce_loss_2": 4.387712907791138, "ce_loss_3": 4.15058788061142, "ce_loss_7": 3.7528537273406983, "epoch": 0.262, "grad_norm": 688.0, "kl_loss_10": 120.9524139404297, "kl_loss_2": 1691.694891357422, "kl_loss_3": 1212.9383422851563, "kl_loss_7": 382.4178726196289, "learning_rate": 0.0008484687843276469, "loss": 858.3654, "step": 2620 }, { "ce_loss_10": 3.56218444108963, "ce_loss_13": 3.5020002007484434, "ce_loss_2": 4.343717336654663, "ce_loss_3": 4.087351965904236, "ce_loss_7": 3.6885337471961974, "epoch": 0.263, "grad_norm": 972.0, "kl_loss_10": 122.10862846374512, "kl_loss_2": 1743.074481201172, "kl_loss_3": 1241.259295654297, "kl_loss_7": 390.93798675537107, "learning_rate": 0.0008473291852294987, "loss": 889.2213, "step": 2630 }, { "ce_loss_10": 3.569066059589386, "ce_loss_13": 3.5079373717308044, "ce_loss_2": 4.341557312011719, "ce_loss_3": 4.102798533439636, "ce_loss_7": 3.702693009376526, "epoch": 0.264, "grad_norm": 992.0, "kl_loss_10": 121.03168258666992, "kl_loss_2": 1756.2496704101563, "kl_loss_3": 1268.1442626953126, "kl_loss_7": 409.74066772460935, "learning_rate": 0.0008461860885303114, "loss": 873.7736, "step": 2640 }, { "ce_loss_10": 3.600629377365112, "ce_loss_13": 3.5413268923759462, "ce_loss_2": 4.351797342300415, "ce_loss_3": 4.123497414588928, "ce_loss_7": 3.7193342566490175, "epoch": 0.265, "grad_norm": 784.0, "kl_loss_10": 117.7381477355957, "kl_loss_2": 1685.6346862792968, "kl_loss_3": 1223.5721496582032, "kl_loss_7": 385.61451568603513, "learning_rate": 0.000845039505741056, "loss": 865.0888, "step": 2650 }, { "ce_loss_10": 3.583999478816986, "ce_loss_13": 3.5274040579795836, "ce_loss_2": 4.3639294624328615, "ce_loss_3": 4.123298466205597, "ce_loss_7": 3.722229468822479, "epoch": 0.266, "grad_norm": 760.0, "kl_loss_10": 122.37080764770508, "kl_loss_2": 1745.8930297851562, "kl_loss_3": 1263.3002380371095, "kl_loss_7": 410.3360626220703, "learning_rate": 0.0008438894484078086, "loss": 903.9543, "step": 2660 }, { "ce_loss_10": 3.5930288791656495, "ce_loss_13": 3.5317795753479, "ce_loss_2": 4.351351189613342, "ce_loss_3": 4.11381551027298, "ce_loss_7": 3.7177330613136292, "epoch": 0.267, "grad_norm": 600.0, "kl_loss_10": 125.26796760559083, "kl_loss_2": 1700.093682861328, "kl_loss_3": 1229.3107971191407, "kl_loss_7": 394.9988708496094, "learning_rate": 0.0008427359281116334, "loss": 867.5541, "step": 2670 }, { "ce_loss_10": 3.504789853096008, "ce_loss_13": 3.4365304589271544, "ce_loss_2": 4.2888070940971375, "ce_loss_3": 4.033144581317901, "ce_loss_7": 3.62447829246521, "epoch": 0.268, "grad_norm": 788.0, "kl_loss_10": 127.1825080871582, "kl_loss_2": 1769.5092468261719, "kl_loss_3": 1255.384375, "kl_loss_7": 396.88611602783203, "learning_rate": 0.0008415789564684673, "loss": 882.0259, "step": 2680 }, { "ce_loss_10": 3.750445532798767, "ce_loss_13": 3.6757726311683654, "ce_loss_2": 4.478060841560364, "ce_loss_3": 4.249177932739258, "ce_loss_7": 3.8692382693290712, "epoch": 0.269, "grad_norm": 636.0, "kl_loss_10": 149.78771476745607, "kl_loss_2": 1648.9072570800781, "kl_loss_3": 1195.9601501464845, "kl_loss_7": 403.16771697998047, "learning_rate": 0.0008404185451290017, "loss": 850.8326, "step": 2690 }, { "ce_loss_10": 3.609413778781891, "ce_loss_13": 3.5459699988365174, "ce_loss_2": 4.3701406955719, "ce_loss_3": 4.121756374835968, "ce_loss_7": 3.7322357654571534, "epoch": 0.27, "grad_norm": 740.0, "kl_loss_10": 127.56755828857422, "kl_loss_2": 1713.2321350097657, "kl_loss_3": 1220.8419921875, "kl_loss_7": 393.25787048339845, "learning_rate": 0.0008392547057785661, "loss": 862.1611, "step": 2700 }, { "ce_loss_10": 3.5277312278747557, "ce_loss_13": 3.462877333164215, "ce_loss_2": 4.333063495159149, "ce_loss_3": 4.068599557876587, "ce_loss_7": 3.6612127304077147, "epoch": 0.271, "grad_norm": 644.0, "kl_loss_10": 126.14829139709472, "kl_loss_2": 1803.081787109375, "kl_loss_3": 1281.3850402832031, "kl_loss_7": 405.90563812255857, "learning_rate": 0.0008380874501370098, "loss": 866.4251, "step": 2710 }, { "ce_loss_10": 3.5219783544540406, "ce_loss_13": 3.4639760494232177, "ce_loss_2": 4.320484936237335, "ce_loss_3": 4.067164242267609, "ce_loss_7": 3.6574723958969115, "epoch": 0.272, "grad_norm": 1256.0, "kl_loss_10": 125.24216957092285, "kl_loss_2": 1782.6489868164062, "kl_loss_3": 1266.783380126953, "kl_loss_7": 428.45300750732423, "learning_rate": 0.0008369167899585841, "loss": 889.6836, "step": 2720 }, { "ce_loss_10": 3.6541715979576113, "ce_loss_13": 3.595216929912567, "ce_loss_2": 4.385185480117798, "ce_loss_3": 4.151276361942291, "ce_loss_7": 3.7882070541381836, "epoch": 0.273, "grad_norm": 728.0, "kl_loss_10": 120.80016899108887, "kl_loss_2": 1665.0289611816406, "kl_loss_3": 1196.1640930175781, "kl_loss_7": 412.7695770263672, "learning_rate": 0.0008357427370318238, "loss": 878.6904, "step": 2730 }, { "ce_loss_10": 3.6059207916259766, "ce_loss_13": 3.5445015668869018, "ce_loss_2": 4.376115798950195, "ce_loss_3": 4.119233012199402, "ce_loss_7": 3.7391616225242617, "epoch": 0.274, "grad_norm": 1576.0, "kl_loss_10": 121.43072166442872, "kl_loss_2": 1732.6891723632812, "kl_loss_3": 1225.3057495117187, "kl_loss_7": 408.2993225097656, "learning_rate": 0.0008345653031794292, "loss": 878.5334, "step": 2740 }, { "ce_loss_10": 3.6025451540946962, "ce_loss_13": 3.5433849692344666, "ce_loss_2": 4.365815544128418, "ce_loss_3": 4.125142395496368, "ce_loss_7": 3.735653018951416, "epoch": 0.275, "grad_norm": 696.0, "kl_loss_10": 121.71817207336426, "kl_loss_2": 1698.9590087890624, "kl_loss_3": 1221.5281066894531, "kl_loss_7": 396.41104278564455, "learning_rate": 0.0008333845002581458, "loss": 863.8256, "step": 2750 }, { "ce_loss_10": 3.524514317512512, "ce_loss_13": 3.464148759841919, "ce_loss_2": 4.319546580314636, "ce_loss_3": 4.068345057964325, "ce_loss_7": 3.654212474822998, "epoch": 0.276, "grad_norm": 752.0, "kl_loss_10": 121.97859344482421, "kl_loss_2": 1793.125555419922, "kl_loss_3": 1280.574969482422, "kl_loss_7": 402.11714324951174, "learning_rate": 0.0008322003401586462, "loss": 888.9676, "step": 2760 }, { "ce_loss_10": 3.5655999302864076, "ce_loss_13": 3.5086979150772093, "ce_loss_2": 4.306806945800782, "ce_loss_3": 4.063013255596161, "ce_loss_7": 3.6851531267166138, "epoch": 0.277, "grad_norm": 660.0, "kl_loss_10": 117.17034149169922, "kl_loss_2": 1677.4315246582032, "kl_loss_3": 1190.5869140625, "kl_loss_7": 377.3115737915039, "learning_rate": 0.0008310128348054094, "loss": 835.8481, "step": 2770 }, { "ce_loss_10": 3.5316494584083555, "ce_loss_13": 3.47216192483902, "ce_loss_2": 4.3050333023071286, "ce_loss_3": 4.051662290096283, "ce_loss_7": 3.652060294151306, "epoch": 0.278, "grad_norm": 808.0, "kl_loss_10": 117.91603393554688, "kl_loss_2": 1721.5279479980468, "kl_loss_3": 1228.1507141113282, "kl_loss_7": 385.47499694824216, "learning_rate": 0.0008298219961566008, "loss": 860.7359, "step": 2780 }, { "ce_loss_10": 3.5051544904708862, "ce_loss_13": 3.444338619709015, "ce_loss_2": 4.300984418392181, "ce_loss_3": 4.052151179313659, "ce_loss_7": 3.635912001132965, "epoch": 0.279, "grad_norm": 684.0, "kl_loss_10": 119.0569522857666, "kl_loss_2": 1791.0266662597655, "kl_loss_3": 1283.5458435058595, "kl_loss_7": 394.7980667114258, "learning_rate": 0.0008286278362039527, "loss": 870.367, "step": 2790 }, { "ce_loss_10": 3.5257250547409056, "ce_loss_13": 3.4669500470161436, "ce_loss_2": 4.3314121007919315, "ce_loss_3": 4.069406723976135, "ce_loss_7": 3.6529884815216063, "epoch": 0.28, "grad_norm": 592.0, "kl_loss_10": 118.33040466308594, "kl_loss_2": 1772.7789123535156, "kl_loss_3": 1258.172607421875, "kl_loss_7": 390.06167755126955, "learning_rate": 0.0008274303669726426, "loss": 859.6066, "step": 2800 }, { "ce_loss_10": 3.4326475739479063, "ce_loss_13": 3.368249499797821, "ce_loss_2": 4.2475821614265445, "ce_loss_3": 3.9768091082572936, "ce_loss_7": 3.56466429233551, "epoch": 0.281, "grad_norm": 820.0, "kl_loss_10": 119.32261543273925, "kl_loss_2": 1801.6330139160157, "kl_loss_3": 1268.0037780761718, "kl_loss_7": 399.8984771728516, "learning_rate": 0.0008262296005211721, "loss": 865.725, "step": 2810 }, { "ce_loss_10": 3.5568071007728577, "ce_loss_13": 3.4984994530677795, "ce_loss_2": 4.339726853370666, "ce_loss_3": 4.093842375278473, "ce_loss_7": 3.6863738179206846, "epoch": 0.282, "grad_norm": 668.0, "kl_loss_10": 117.20367279052735, "kl_loss_2": 1741.4729064941407, "kl_loss_3": 1243.0221740722657, "kl_loss_7": 391.26034393310545, "learning_rate": 0.0008250255489412463, "loss": 861.8801, "step": 2820 }, { "ce_loss_10": 3.6558093190193177, "ce_loss_13": 3.5970911622047423, "ce_loss_2": 4.4279505729675295, "ce_loss_3": 4.17941837310791, "ce_loss_7": 3.7779974818229674, "epoch": 0.283, "grad_norm": 644.0, "kl_loss_10": 117.54653549194336, "kl_loss_2": 1717.2896484375, "kl_loss_3": 1222.8905883789062, "kl_loss_7": 383.6278533935547, "learning_rate": 0.0008238182243576511, "loss": 864.3904, "step": 2830 }, { "ce_loss_10": 3.627612364292145, "ce_loss_13": 3.571374249458313, "ce_loss_2": 4.348476409912109, "ce_loss_3": 4.111672687530517, "ce_loss_7": 3.7428536653518676, "epoch": 0.284, "grad_norm": 848.0, "kl_loss_10": 115.3503978729248, "kl_loss_2": 1634.8415893554688, "kl_loss_3": 1172.3661010742187, "kl_loss_7": 373.25710144042966, "learning_rate": 0.0008226076389281315, "loss": 835.7893, "step": 2840 }, { "ce_loss_10": 3.6687015771865843, "ce_loss_13": 3.611837661266327, "ce_loss_2": 4.408014273643493, "ce_loss_3": 4.167214012145996, "ce_loss_7": 3.78675491809845, "epoch": 0.285, "grad_norm": 792.0, "kl_loss_10": 116.4748779296875, "kl_loss_2": 1687.3265686035156, "kl_loss_3": 1203.5112426757812, "kl_loss_7": 379.0892868041992, "learning_rate": 0.0008213938048432696, "loss": 837.4566, "step": 2850 }, { "ce_loss_10": 3.5925659775733947, "ce_loss_13": 3.5332329750061033, "ce_loss_2": 4.345850491523743, "ce_loss_3": 4.107743239402771, "ce_loss_7": 3.7164170384407043, "epoch": 0.286, "grad_norm": 724.0, "kl_loss_10": 119.2851993560791, "kl_loss_2": 1683.378924560547, "kl_loss_3": 1200.0212280273438, "kl_loss_7": 385.51824645996095, "learning_rate": 0.0008201767343263612, "loss": 853.9481, "step": 2860 }, { "ce_loss_10": 3.5278223156929016, "ce_loss_13": 3.4697880268096926, "ce_loss_2": 4.305528485774994, "ce_loss_3": 4.061980509757996, "ce_loss_7": 3.6568115830421446, "epoch": 0.287, "grad_norm": 688.0, "kl_loss_10": 117.13396682739258, "kl_loss_2": 1730.580908203125, "kl_loss_3": 1244.273779296875, "kl_loss_7": 389.10458984375, "learning_rate": 0.0008189564396332927, "loss": 840.4185, "step": 2870 }, { "ce_loss_10": 3.508720850944519, "ce_loss_13": 3.453092110157013, "ce_loss_2": 4.299274802207947, "ce_loss_3": 4.047192811965942, "ce_loss_7": 3.640024721622467, "epoch": 0.288, "grad_norm": 848.0, "kl_loss_10": 116.31773529052734, "kl_loss_2": 1743.982666015625, "kl_loss_3": 1237.4966247558593, "kl_loss_7": 386.93666687011716, "learning_rate": 0.0008177329330524181, "loss": 868.4197, "step": 2880 }, { "ce_loss_10": 3.5816394329071044, "ce_loss_13": 3.516602861881256, "ce_loss_2": 4.328305494785309, "ce_loss_3": 4.09252301454544, "ce_loss_7": 3.6952725887298583, "epoch": 0.289, "grad_norm": 880.0, "kl_loss_10": 120.13455467224121, "kl_loss_2": 1672.285400390625, "kl_loss_3": 1204.73544921875, "kl_loss_7": 379.05470581054686, "learning_rate": 0.0008165062269044352, "loss": 854.3851, "step": 2890 }, { "ce_loss_10": 3.533835935592651, "ce_loss_13": 3.470975637435913, "ce_loss_2": 4.294858407974243, "ce_loss_3": 4.049375188350678, "ce_loss_7": 3.669650936126709, "epoch": 0.29, "grad_norm": 956.0, "kl_loss_10": 131.92447509765626, "kl_loss_2": 1723.325860595703, "kl_loss_3": 1225.3687438964844, "kl_loss_7": 413.51206665039064, "learning_rate": 0.0008152763335422613, "loss": 875.9442, "step": 2900 }, { "ce_loss_10": 3.5229781746864317, "ce_loss_13": 3.4558417439460754, "ce_loss_2": 4.291088664531708, "ce_loss_3": 4.040839302539825, "ce_loss_7": 3.6485135436058043, "epoch": 0.291, "grad_norm": 916.0, "kl_loss_10": 130.519429397583, "kl_loss_2": 1736.2880615234376, "kl_loss_3": 1233.2502563476562, "kl_loss_7": 414.03160247802737, "learning_rate": 0.0008140432653509088, "loss": 866.857, "step": 2910 }, { "ce_loss_10": 3.567656230926514, "ce_loss_13": 3.503602409362793, "ce_loss_2": 4.3293495893478395, "ce_loss_3": 4.079035115242005, "ce_loss_7": 3.6952230453491213, "epoch": 0.292, "grad_norm": 640.0, "kl_loss_10": 127.2950496673584, "kl_loss_2": 1719.4770751953124, "kl_loss_3": 1215.8184814453125, "kl_loss_7": 405.2675979614258, "learning_rate": 0.0008128070347473608, "loss": 852.675, "step": 2920 }, { "ce_loss_10": 3.575137174129486, "ce_loss_13": 3.5157665014266968, "ce_loss_2": 4.372000908851623, "ce_loss_3": 4.101023232936859, "ce_loss_7": 3.708648312091827, "epoch": 0.293, "grad_norm": 1120.0, "kl_loss_10": 123.37828140258789, "kl_loss_2": 1776.6594665527343, "kl_loss_3": 1242.4217651367187, "kl_loss_7": 397.11926422119143, "learning_rate": 0.0008115676541804455, "loss": 866.9424, "step": 2930 }, { "ce_loss_10": 3.5826502084732055, "ce_loss_13": 3.524796175956726, "ce_loss_2": 4.326114082336426, "ce_loss_3": 4.089037120342255, "ce_loss_7": 3.7056201100349426, "epoch": 0.294, "grad_norm": 688.0, "kl_loss_10": 118.44030418395997, "kl_loss_2": 1688.6132507324219, "kl_loss_3": 1201.9659301757813, "kl_loss_7": 384.77832946777346, "learning_rate": 0.0008103251361307119, "loss": 854.5449, "step": 2940 }, { "ce_loss_10": 3.611738312244415, "ce_loss_13": 3.5526163220405578, "ce_loss_2": 4.366993880271911, "ce_loss_3": 4.125052762031555, "ce_loss_7": 3.7343359708786013, "epoch": 0.295, "grad_norm": 1072.0, "kl_loss_10": 120.27555274963379, "kl_loss_2": 1700.007586669922, "kl_loss_3": 1215.4175231933593, "kl_loss_7": 386.48180389404297, "learning_rate": 0.0008090794931103026, "loss": 847.0105, "step": 2950 }, { "ce_loss_10": 3.6010645151138307, "ce_loss_13": 3.5417975544929505, "ce_loss_2": 4.3437940835952755, "ce_loss_3": 4.1034717679023744, "ce_loss_7": 3.7186156630516054, "epoch": 0.296, "grad_norm": 732.0, "kl_loss_10": 115.5659568786621, "kl_loss_2": 1664.8860473632812, "kl_loss_3": 1192.3460205078125, "kl_loss_7": 378.62300872802734, "learning_rate": 0.0008078307376628291, "loss": 847.8291, "step": 2960 }, { "ce_loss_10": 3.6572685122489927, "ce_loss_13": 3.601787805557251, "ce_loss_2": 4.378321361541748, "ce_loss_3": 4.149720752239228, "ce_loss_7": 3.774696469306946, "epoch": 0.297, "grad_norm": 1024.0, "kl_loss_10": 114.00293769836426, "kl_loss_2": 1616.3122253417969, "kl_loss_3": 1163.856396484375, "kl_loss_7": 368.1110870361328, "learning_rate": 0.000806578882363245, "loss": 818.0032, "step": 2970 }, { "ce_loss_10": 3.5727164506912232, "ce_loss_13": 3.516432511806488, "ce_loss_2": 4.317604410648346, "ce_loss_3": 4.079760944843292, "ce_loss_7": 3.6930678486824036, "epoch": 0.298, "grad_norm": 768.0, "kl_loss_10": 113.55682945251465, "kl_loss_2": 1668.698175048828, "kl_loss_3": 1192.6838439941407, "kl_loss_7": 375.6543395996094, "learning_rate": 0.0008053239398177191, "loss": 859.0381, "step": 2980 }, { "ce_loss_10": 3.553641748428345, "ce_loss_13": 3.4987002849578857, "ce_loss_2": 4.315845227241516, "ce_loss_3": 4.065586876869202, "ce_loss_7": 3.6750356554985046, "epoch": 0.299, "grad_norm": 1280.0, "kl_loss_10": 115.50895500183105, "kl_loss_2": 1702.8024108886718, "kl_loss_3": 1206.832830810547, "kl_loss_7": 375.11317138671876, "learning_rate": 0.0008040659226635089, "loss": 871.3348, "step": 2990 }, { "ce_loss_10": 3.686588931083679, "ce_loss_13": 3.6270423412322996, "ce_loss_2": 4.426630926132202, "ce_loss_3": 4.1877022862434385, "ce_loss_7": 3.8101216912269593, "epoch": 0.3, "grad_norm": 920.0, "kl_loss_10": 120.24936103820801, "kl_loss_2": 1676.8763000488282, "kl_loss_3": 1199.5878540039062, "kl_loss_7": 388.31083221435546, "learning_rate": 0.0008028048435688333, "loss": 846.8068, "step": 3000 }, { "ce_loss_10": 3.556571066379547, "ce_loss_13": 3.4967838883399964, "ce_loss_2": 4.338928711414337, "ce_loss_3": 4.076198601722718, "ce_loss_7": 3.6810215592384337, "epoch": 0.301, "grad_norm": 948.0, "kl_loss_10": 117.14677352905274, "kl_loss_2": 1747.8990844726563, "kl_loss_3": 1232.4341674804687, "kl_loss_7": 384.1294311523437, "learning_rate": 0.0008015407152327448, "loss": 861.6333, "step": 3010 }, { "ce_loss_10": 3.602232789993286, "ce_loss_13": 3.5464617371559144, "ce_loss_2": 4.357162296772003, "ce_loss_3": 4.109545958042145, "ce_loss_7": 3.723551881313324, "epoch": 0.302, "grad_norm": 700.0, "kl_loss_10": 119.32380256652831, "kl_loss_2": 1718.0250427246094, "kl_loss_3": 1214.2114685058593, "kl_loss_7": 380.82495880126953, "learning_rate": 0.0008002735503850016, "loss": 860.6063, "step": 3020 }, { "ce_loss_10": 3.500118100643158, "ce_loss_13": 3.4301446318626403, "ce_loss_2": 4.2865880012512205, "ce_loss_3": 4.0292200446128845, "ce_loss_7": 3.6224149107933044, "epoch": 0.303, "grad_norm": 680.0, "kl_loss_10": 133.3865581512451, "kl_loss_2": 1762.3200256347657, "kl_loss_3": 1248.951885986328, "kl_loss_7": 402.3367004394531, "learning_rate": 0.0007990033617859396, "loss": 885.2019, "step": 3030 }, { "ce_loss_10": 3.5487611174583433, "ce_loss_13": 3.48541659116745, "ce_loss_2": 4.2959469079971315, "ce_loss_3": 4.057184457778931, "ce_loss_7": 3.6729429960250854, "epoch": 0.304, "grad_norm": 868.0, "kl_loss_10": 127.23117599487304, "kl_loss_2": 1680.872869873047, "kl_loss_3": 1196.3892578125, "kl_loss_7": 388.0195877075195, "learning_rate": 0.000797730162226344, "loss": 834.9006, "step": 3040 }, { "ce_loss_10": 3.5749799251556396, "ce_loss_13": 3.5117679953575136, "ce_loss_2": 4.324963521957398, "ce_loss_3": 4.080844748020172, "ce_loss_7": 3.7018226742744447, "epoch": 0.305, "grad_norm": 708.0, "kl_loss_10": 126.22021293640137, "kl_loss_2": 1696.1488952636719, "kl_loss_3": 1208.489306640625, "kl_loss_7": 395.04272613525393, "learning_rate": 0.0007964539645273203, "loss": 845.1297, "step": 3050 }, { "ce_loss_10": 3.5864476919174195, "ce_loss_13": 3.52917160987854, "ce_loss_2": 4.320749962329865, "ce_loss_3": 4.085593056678772, "ce_loss_7": 3.701304924488068, "epoch": 0.306, "grad_norm": 716.0, "kl_loss_10": 118.05113334655762, "kl_loss_2": 1656.3977111816407, "kl_loss_3": 1184.199285888672, "kl_loss_7": 373.9879623413086, "learning_rate": 0.000795174781540165, "loss": 846.1535, "step": 3060 }, { "ce_loss_10": 3.6699469685554504, "ce_loss_13": 3.602624785900116, "ce_loss_2": 4.3783220767974855, "ce_loss_3": 4.144092714786529, "ce_loss_7": 3.77892005443573, "epoch": 0.307, "grad_norm": 624.0, "kl_loss_10": 126.74162673950195, "kl_loss_2": 1606.51923828125, "kl_loss_3": 1149.1448059082031, "kl_loss_7": 372.5706756591797, "learning_rate": 0.0007938926261462366, "loss": 837.1207, "step": 3070 }, { "ce_loss_10": 3.609223115444183, "ce_loss_13": 3.545342946052551, "ce_loss_2": 4.331037449836731, "ce_loss_3": 4.0943366408348085, "ce_loss_7": 3.7234758496284486, "epoch": 0.308, "grad_norm": 784.0, "kl_loss_10": 120.93710441589356, "kl_loss_2": 1668.43271484375, "kl_loss_3": 1183.2797088623047, "kl_loss_7": 379.16572723388674, "learning_rate": 0.0007926075112568258, "loss": 855.3146, "step": 3080 }, { "ce_loss_10": 3.6012861251831056, "ce_loss_13": 3.5415249466896057, "ce_loss_2": 4.339315104484558, "ce_loss_3": 4.100031721591949, "ce_loss_7": 3.7238448023796082, "epoch": 0.309, "grad_norm": 864.0, "kl_loss_10": 117.66056022644042, "kl_loss_2": 1671.2183349609375, "kl_loss_3": 1194.3562866210937, "kl_loss_7": 386.64764709472655, "learning_rate": 0.0007913194498130252, "loss": 831.6145, "step": 3090 }, { "ce_loss_10": 3.5270251989364625, "ce_loss_13": 3.466107988357544, "ce_loss_2": 4.295390522480011, "ce_loss_3": 4.052128338813782, "ce_loss_7": 3.6544239044189455, "epoch": 0.31, "grad_norm": 712.0, "kl_loss_10": 118.14057502746581, "kl_loss_2": 1717.96220703125, "kl_loss_3": 1221.504083251953, "kl_loss_7": 395.8646697998047, "learning_rate": 0.0007900284547855992, "loss": 860.8879, "step": 3100 }, { "ce_loss_10": 3.5359286427497865, "ce_loss_13": 3.4782017827033997, "ce_loss_2": 4.27366749048233, "ce_loss_3": 4.035928988456726, "ce_loss_7": 3.6604955792427063, "epoch": 0.311, "grad_norm": 804.0, "kl_loss_10": 114.00839118957519, "kl_loss_2": 1655.3436767578125, "kl_loss_3": 1184.622735595703, "kl_loss_7": 381.71679840087893, "learning_rate": 0.0007887345391748532, "loss": 851.5764, "step": 3110 }, { "ce_loss_10": 3.670715296268463, "ce_loss_13": 3.614938962459564, "ce_loss_2": 4.374625968933105, "ce_loss_3": 4.153026962280274, "ce_loss_7": 3.791151750087738, "epoch": 0.312, "grad_norm": 856.0, "kl_loss_10": 115.99779586791992, "kl_loss_2": 1608.5520751953125, "kl_loss_3": 1168.0057678222656, "kl_loss_7": 380.57466583251954, "learning_rate": 0.0007874377160105036, "loss": 818.879, "step": 3120 }, { "ce_loss_10": 3.560124433040619, "ce_loss_13": 3.5026067972183226, "ce_loss_2": 4.319526970386505, "ce_loss_3": 4.075046932697296, "ce_loss_7": 3.6801488518714907, "epoch": 0.313, "grad_norm": 668.0, "kl_loss_10": 115.1606674194336, "kl_loss_2": 1702.915545654297, "kl_loss_3": 1218.063250732422, "kl_loss_7": 378.69984130859376, "learning_rate": 0.0007861379983515449, "loss": 867.7164, "step": 3130 }, { "ce_loss_10": 3.64900780916214, "ce_loss_13": 3.5890329003334047, "ce_loss_2": 4.382000231742859, "ce_loss_3": 4.151926016807556, "ce_loss_7": 3.770545780658722, "epoch": 0.314, "grad_norm": 620.0, "kl_loss_10": 115.61932106018067, "kl_loss_2": 1667.7091735839845, "kl_loss_3": 1203.9859252929687, "kl_loss_7": 378.3471343994141, "learning_rate": 0.0007848353992861195, "loss": 831.5619, "step": 3140 }, { "ce_loss_10": 3.726586377620697, "ce_loss_13": 3.665834975242615, "ce_loss_2": 4.466833281517029, "ce_loss_3": 4.239802432060242, "ce_loss_7": 3.855803596973419, "epoch": 0.315, "grad_norm": 540.0, "kl_loss_10": 121.5996280670166, "kl_loss_2": 1666.041278076172, "kl_loss_3": 1208.4485992431642, "kl_loss_7": 394.7228042602539, "learning_rate": 0.0007835299319313853, "loss": 851.6315, "step": 3150 }, { "ce_loss_10": 3.6075380086898803, "ce_loss_13": 3.5498488545417786, "ce_loss_2": 4.33118622303009, "ce_loss_3": 4.103638553619385, "ce_loss_7": 3.721197712421417, "epoch": 0.316, "grad_norm": 812.0, "kl_loss_10": 114.99713478088378, "kl_loss_2": 1637.2281372070313, "kl_loss_3": 1185.058447265625, "kl_loss_7": 373.321435546875, "learning_rate": 0.0007822216094333848, "loss": 856.9743, "step": 3160 }, { "ce_loss_10": 3.609579026699066, "ce_loss_13": 3.55041366815567, "ce_loss_2": 4.360835981369019, "ce_loss_3": 4.124555444717407, "ce_loss_7": 3.7366039872169496, "epoch": 0.317, "grad_norm": 724.0, "kl_loss_10": 114.99435119628906, "kl_loss_2": 1676.748974609375, "kl_loss_3": 1201.9430358886718, "kl_loss_7": 383.48704223632814, "learning_rate": 0.0007809104449669101, "loss": 836.4251, "step": 3170 }, { "ce_loss_10": 3.558854579925537, "ce_loss_13": 3.50283659696579, "ce_loss_2": 4.288606309890747, "ce_loss_3": 4.060452401638031, "ce_loss_7": 3.686760699748993, "epoch": 0.318, "grad_norm": 672.0, "kl_loss_10": 112.90032691955567, "kl_loss_2": 1653.4009094238281, "kl_loss_3": 1182.4323974609374, "kl_loss_7": 382.2897430419922, "learning_rate": 0.0007795964517353734, "loss": 830.4848, "step": 3180 }, { "ce_loss_10": 3.5531341314315794, "ce_loss_13": 3.494058334827423, "ce_loss_2": 4.312021148204804, "ce_loss_3": 4.06132435798645, "ce_loss_7": 3.6786675214767457, "epoch": 0.319, "grad_norm": 632.0, "kl_loss_10": 121.5546661376953, "kl_loss_2": 1714.2662963867188, "kl_loss_3": 1209.382650756836, "kl_loss_7": 390.5830047607422, "learning_rate": 0.000778279642970672, "loss": 830.7126, "step": 3190 }, { "ce_loss_10": 3.554937243461609, "ce_loss_13": 3.4984064936637878, "ce_loss_2": 4.289804995059967, "ce_loss_3": 4.049097466468811, "ce_loss_7": 3.673481619358063, "epoch": 0.32, "grad_norm": 760.0, "kl_loss_10": 116.15705032348633, "kl_loss_2": 1661.6501708984374, "kl_loss_3": 1179.4572509765626, "kl_loss_7": 378.86720275878906, "learning_rate": 0.0007769600319330552, "loss": 822.0354, "step": 3200 }, { "ce_loss_10": 3.5909475445747376, "ce_loss_13": 3.5323880195617674, "ce_loss_2": 4.367212843894959, "ce_loss_3": 4.107944619655609, "ce_loss_7": 3.717561733722687, "epoch": 0.321, "grad_norm": 1568.0, "kl_loss_10": 115.30198402404785, "kl_loss_2": 1722.4256225585937, "kl_loss_3": 1212.1700317382813, "kl_loss_7": 389.22625122070315, "learning_rate": 0.0007756376319109917, "loss": 845.6691, "step": 3210 }, { "ce_loss_10": 3.6382947683334352, "ce_loss_13": 3.5816096305847167, "ce_loss_2": 4.370487928390503, "ce_loss_3": 4.13275511264801, "ce_loss_7": 3.7714091897010804, "epoch": 0.322, "grad_norm": 748.0, "kl_loss_10": 117.98968887329102, "kl_loss_2": 1646.2207153320312, "kl_loss_3": 1171.6144165039063, "kl_loss_7": 388.19681549072266, "learning_rate": 0.0007743124562210351, "loss": 818.264, "step": 3220 }, { "ce_loss_10": 3.653497350215912, "ce_loss_13": 3.5951754808425904, "ce_loss_2": 4.374958968162536, "ce_loss_3": 4.134474778175354, "ce_loss_7": 3.7706104278564454, "epoch": 0.323, "grad_norm": 928.0, "kl_loss_10": 119.37146644592285, "kl_loss_2": 1638.5594421386718, "kl_loss_3": 1167.7997100830078, "kl_loss_7": 390.55777435302736, "learning_rate": 0.0007729845182076895, "loss": 836.4436, "step": 3230 }, { "ce_loss_10": 3.5757078647613527, "ce_loss_13": 3.5204537510871887, "ce_loss_2": 4.29359986782074, "ce_loss_3": 4.064731419086456, "ce_loss_7": 3.704847884178162, "epoch": 0.324, "grad_norm": 892.0, "kl_loss_10": 112.78250846862792, "kl_loss_2": 1609.9090698242187, "kl_loss_3": 1159.7744262695312, "kl_loss_7": 384.8263137817383, "learning_rate": 0.0007716538312432765, "loss": 840.0002, "step": 3240 }, { "ce_loss_10": 3.534730553627014, "ce_loss_13": 3.4789490580558775, "ce_loss_2": 4.304475176334381, "ce_loss_3": 4.052125036716461, "ce_loss_7": 3.6611103057861327, "epoch": 0.325, "grad_norm": 684.0, "kl_loss_10": 118.80456047058105, "kl_loss_2": 1705.30078125, "kl_loss_3": 1207.9038452148438, "kl_loss_7": 400.3294143676758, "learning_rate": 0.0007703204087277988, "loss": 849.7979, "step": 3250 }, { "ce_loss_10": 3.640562951564789, "ce_loss_13": 3.5829039812088013, "ce_loss_2": 4.344409227371216, "ce_loss_3": 4.114521729946136, "ce_loss_7": 3.7689186573028564, "epoch": 0.326, "grad_norm": 1088.0, "kl_loss_10": 117.3976432800293, "kl_loss_2": 1579.8317626953126, "kl_loss_3": 1130.771304321289, "kl_loss_7": 393.01446228027345, "learning_rate": 0.0007689842640888063, "loss": 817.2718, "step": 3260 }, { "ce_loss_10": 3.633817756175995, "ce_loss_13": 3.5766273021697996, "ce_loss_2": 4.350421750545502, "ce_loss_3": 4.125564169883728, "ce_loss_7": 3.761690652370453, "epoch": 0.327, "grad_norm": 720.0, "kl_loss_10": 118.46526412963867, "kl_loss_2": 1605.4541320800781, "kl_loss_3": 1154.1414672851563, "kl_loss_7": 388.8281951904297, "learning_rate": 0.0007676454107812607, "loss": 825.4082, "step": 3270 }, { "ce_loss_10": 3.570014715194702, "ce_loss_13": 3.5132551789283752, "ce_loss_2": 4.310917520523072, "ce_loss_3": 4.072949314117432, "ce_loss_7": 3.697970747947693, "epoch": 0.328, "grad_norm": 908.0, "kl_loss_10": 116.57307319641113, "kl_loss_2": 1668.5494201660156, "kl_loss_3": 1185.7364288330077, "kl_loss_7": 388.6903610229492, "learning_rate": 0.0007663038622873999, "loss": 826.5832, "step": 3280 }, { "ce_loss_10": 3.608345115184784, "ce_loss_13": 3.552368700504303, "ce_loss_2": 4.342653036117554, "ce_loss_3": 4.107228255271911, "ce_loss_7": 3.7292393803596497, "epoch": 0.329, "grad_norm": 800.0, "kl_loss_10": 119.16379737854004, "kl_loss_2": 1657.9130859375, "kl_loss_3": 1178.795343017578, "kl_loss_7": 385.3046020507812, "learning_rate": 0.0007649596321166025, "loss": 820.6685, "step": 3290 }, { "ce_loss_10": 3.5168209075927734, "ce_loss_13": 3.4585548520088194, "ce_loss_2": 4.2374080419540405, "ce_loss_3": 4.016101682186127, "ce_loss_7": 3.642660355567932, "epoch": 0.33, "grad_norm": 624.0, "kl_loss_10": 112.55266952514648, "kl_loss_2": 1615.6689147949219, "kl_loss_3": 1172.109423828125, "kl_loss_7": 382.10296936035155, "learning_rate": 0.0007636127338052513, "loss": 834.566, "step": 3300 }, { "ce_loss_10": 3.616956079006195, "ce_loss_13": 3.5588277220726012, "ce_loss_2": 4.365944027900696, "ce_loss_3": 4.128736138343811, "ce_loss_7": 3.7405884146690367, "epoch": 0.331, "grad_norm": 632.0, "kl_loss_10": 118.14865264892578, "kl_loss_2": 1688.8310668945312, "kl_loss_3": 1205.416485595703, "kl_loss_7": 387.86192779541017, "learning_rate": 0.0007622631809165971, "loss": 830.905, "step": 3310 }, { "ce_loss_10": 3.6154383778572083, "ce_loss_13": 3.559745740890503, "ce_loss_2": 4.3175086855888365, "ce_loss_3": 4.093609952926636, "ce_loss_7": 3.7267310857772826, "epoch": 0.332, "grad_norm": 628.0, "kl_loss_10": 110.52432746887207, "kl_loss_2": 1570.4284362792969, "kl_loss_3": 1125.6173156738282, "kl_loss_7": 357.82008972167966, "learning_rate": 0.000760910987040623, "loss": 808.4639, "step": 3320 }, { "ce_loss_10": 3.5965430974960326, "ce_loss_13": 3.539671075344086, "ce_loss_2": 4.348510015010834, "ce_loss_3": 4.109544110298157, "ce_loss_7": 3.719984066486359, "epoch": 0.333, "grad_norm": 764.0, "kl_loss_10": 115.54491081237794, "kl_loss_2": 1688.9568359375, "kl_loss_3": 1208.1375122070312, "kl_loss_7": 378.84394989013674, "learning_rate": 0.000759556165793906, "loss": 825.1844, "step": 3330 }, { "ce_loss_10": 3.6220271229743957, "ce_loss_13": 3.5657182216644285, "ce_loss_2": 4.348813438415528, "ce_loss_3": 4.117461419105529, "ce_loss_7": 3.740175020694733, "epoch": 0.334, "grad_norm": 724.0, "kl_loss_10": 115.18860549926758, "kl_loss_2": 1633.3359802246093, "kl_loss_3": 1169.733172607422, "kl_loss_7": 373.47499084472656, "learning_rate": 0.000758198730819481, "loss": 831.5812, "step": 3340 }, { "ce_loss_10": 3.566503274440765, "ce_loss_13": 3.5095133543014527, "ce_loss_2": 4.300767230987549, "ce_loss_3": 4.068003153800964, "ce_loss_7": 3.6790093183517456, "epoch": 0.335, "grad_norm": 684.0, "kl_loss_10": 112.77861061096192, "kl_loss_2": 1666.2818481445313, "kl_loss_3": 1193.1672149658202, "kl_loss_7": 364.73136444091796, "learning_rate": 0.0007568386957867032, "loss": 831.8002, "step": 3350 }, { "ce_loss_10": 3.6464338302612305, "ce_loss_13": 3.5835552334785463, "ce_loss_2": 4.362986898422241, "ce_loss_3": 4.131534039974213, "ce_loss_7": 3.7615885734558105, "epoch": 0.336, "grad_norm": 736.0, "kl_loss_10": 119.8167335510254, "kl_loss_2": 1618.847589111328, "kl_loss_3": 1159.0107849121093, "kl_loss_7": 370.0627777099609, "learning_rate": 0.0007554760743911103, "loss": 829.4766, "step": 3360 }, { "ce_loss_10": 3.5369420170784, "ce_loss_13": 3.482306623458862, "ce_loss_2": 4.261832523345947, "ce_loss_3": 4.029614782333374, "ce_loss_7": 3.653941977024078, "epoch": 0.337, "grad_norm": 812.0, "kl_loss_10": 112.68153266906738, "kl_loss_2": 1658.736798095703, "kl_loss_3": 1180.5161499023438, "kl_loss_7": 365.83653869628904, "learning_rate": 0.0007541108803542846, "loss": 846.3863, "step": 3370 }, { "ce_loss_10": 3.5927870750427244, "ce_loss_13": 3.5340675950050353, "ce_loss_2": 4.320529985427856, "ce_loss_3": 4.082762014865875, "ce_loss_7": 3.705589485168457, "epoch": 0.338, "grad_norm": 1088.0, "kl_loss_10": 114.63297500610352, "kl_loss_2": 1651.7355407714845, "kl_loss_3": 1173.1455291748048, "kl_loss_7": 364.3025634765625, "learning_rate": 0.0007527431274237149, "loss": 865.8752, "step": 3380 }, { "ce_loss_10": 3.5577341437339784, "ce_loss_13": 3.5026549458503724, "ce_loss_2": 4.273448967933655, "ce_loss_3": 4.03739470243454, "ce_loss_7": 3.670669972896576, "epoch": 0.339, "grad_norm": 752.0, "kl_loss_10": 112.07779579162597, "kl_loss_2": 1629.0080871582031, "kl_loss_3": 1156.0211303710937, "kl_loss_7": 361.95240631103513, "learning_rate": 0.0007513728293726579, "loss": 819.0673, "step": 3390 }, { "ce_loss_10": 3.6811296582221984, "ce_loss_13": 3.6241077899932863, "ce_loss_2": 4.3843337297439575, "ce_loss_3": 4.157323217391967, "ce_loss_7": 3.7977556228637694, "epoch": 0.34, "grad_norm": 832.0, "kl_loss_10": 115.31730308532715, "kl_loss_2": 1618.7043884277343, "kl_loss_3": 1157.5196899414063, "kl_loss_7": 366.34784240722655, "learning_rate": 0.00075, "loss": 812.9603, "step": 3400 }, { "ce_loss_10": 3.666926121711731, "ce_loss_13": 3.6089104533195497, "ce_loss_2": 4.4058397769927975, "ce_loss_3": 4.168282294273377, "ce_loss_7": 3.7875444531440734, "epoch": 0.341, "grad_norm": 656.0, "kl_loss_10": 117.60438995361328, "kl_loss_2": 1639.777978515625, "kl_loss_3": 1167.6026336669922, "kl_loss_7": 372.6427734375, "learning_rate": 0.0007486246531301177, "loss": 814.5788, "step": 3410 }, { "ce_loss_10": 3.471621036529541, "ce_loss_13": 3.4169135451316834, "ce_loss_2": 4.210591995716095, "ce_loss_3": 3.9694793820381165, "ce_loss_7": 3.5899360179901123, "epoch": 0.342, "grad_norm": 816.0, "kl_loss_10": 114.05370597839355, "kl_loss_2": 1650.5152465820313, "kl_loss_3": 1165.4592010498047, "kl_loss_7": 365.2296401977539, "learning_rate": 0.0007472468026127384, "loss": 811.9648, "step": 3420 }, { "ce_loss_10": 3.609058141708374, "ce_loss_13": 3.5469409108161924, "ce_loss_2": 4.368825697898865, "ce_loss_3": 4.119254291057587, "ce_loss_7": 3.730163371562958, "epoch": 0.343, "grad_norm": 768.0, "kl_loss_10": 118.5148250579834, "kl_loss_2": 1713.5852661132812, "kl_loss_3": 1219.4200317382813, "kl_loss_7": 383.739990234375, "learning_rate": 0.000745866462322802, "loss": 848.2663, "step": 3430 }, { "ce_loss_10": 3.5947309017181395, "ce_loss_13": 3.5389798879623413, "ce_loss_2": 4.318871974945068, "ce_loss_3": 4.078428280353546, "ce_loss_7": 3.7111936211586, "epoch": 0.344, "grad_norm": 600.0, "kl_loss_10": 112.68238525390625, "kl_loss_2": 1617.6124755859375, "kl_loss_3": 1142.982891845703, "kl_loss_7": 359.40439453125, "learning_rate": 0.0007444836461603195, "loss": 819.0549, "step": 3440 }, { "ce_loss_10": 3.6551440238952635, "ce_loss_13": 3.5972506046295165, "ce_loss_2": 4.396359491348266, "ce_loss_3": 4.173777627944946, "ce_loss_7": 3.782487857341766, "epoch": 0.345, "grad_norm": 900.0, "kl_loss_10": 119.71044311523437, "kl_loss_2": 1675.6620239257813, "kl_loss_3": 1219.2009826660155, "kl_loss_7": 393.1466552734375, "learning_rate": 0.0007430983680502344, "loss": 847.6783, "step": 3450 }, { "ce_loss_10": 3.5005151271820067, "ce_loss_13": 3.4460280299186707, "ce_loss_2": 4.258556139469147, "ce_loss_3": 4.0134279251098635, "ce_loss_7": 3.6184648156166075, "epoch": 0.346, "grad_norm": 628.0, "kl_loss_10": 114.63112754821778, "kl_loss_2": 1691.1443481445312, "kl_loss_3": 1203.0183471679688, "kl_loss_7": 371.58728179931643, "learning_rate": 0.0007417106419422819, "loss": 837.4654, "step": 3460 }, { "ce_loss_10": 3.6071864128112794, "ce_loss_13": 3.5498743891716003, "ce_loss_2": 4.338369846343994, "ce_loss_3": 4.1038648843765255, "ce_loss_7": 3.724769628047943, "epoch": 0.347, "grad_norm": 704.0, "kl_loss_10": 114.65529251098633, "kl_loss_2": 1627.19208984375, "kl_loss_3": 1163.6196228027343, "kl_loss_7": 364.3851486206055, "learning_rate": 0.0007403204818108486, "loss": 825.5721, "step": 3470 }, { "ce_loss_10": 3.5747918844223023, "ce_loss_13": 3.5167446613311766, "ce_loss_2": 4.30417982339859, "ce_loss_3": 4.064648604393005, "ce_loss_7": 3.69026460647583, "epoch": 0.348, "grad_norm": 640.0, "kl_loss_10": 116.47897415161133, "kl_loss_2": 1662.3652465820312, "kl_loss_3": 1180.3042358398438, "kl_loss_7": 372.2030776977539, "learning_rate": 0.0007389279016548316, "loss": 808.2109, "step": 3480 }, { "ce_loss_10": 3.582957696914673, "ce_loss_13": 3.524260699748993, "ce_loss_2": 4.354557299613953, "ce_loss_3": 4.107005596160889, "ce_loss_7": 3.7079381704330445, "epoch": 0.349, "grad_norm": 572.0, "kl_loss_10": 115.8749885559082, "kl_loss_2": 1712.395037841797, "kl_loss_3": 1212.98125, "kl_loss_7": 377.2995910644531, "learning_rate": 0.0007375329154974975, "loss": 843.8529, "step": 3490 }, { "ce_loss_10": 3.5415509939193726, "ce_loss_13": 3.4872037291526796, "ce_loss_2": 4.266521430015564, "ce_loss_3": 4.03315395116806, "ce_loss_7": 3.657358193397522, "epoch": 0.35, "grad_norm": 672.0, "kl_loss_10": 112.8521339416504, "kl_loss_2": 1626.7841674804688, "kl_loss_3": 1167.0242736816406, "kl_loss_7": 366.882551574707, "learning_rate": 0.0007361355373863414, "loss": 834.417, "step": 3500 }, { "ce_loss_10": 3.598529267311096, "ce_loss_13": 3.542326867580414, "ce_loss_2": 4.31279833316803, "ce_loss_3": 4.08603081703186, "ce_loss_7": 3.7130195021629335, "epoch": 0.351, "grad_norm": 824.0, "kl_loss_10": 112.47966613769532, "kl_loss_2": 1605.3129028320313, "kl_loss_3": 1148.4092254638672, "kl_loss_7": 360.4650512695313, "learning_rate": 0.0007347357813929454, "loss": 833.2721, "step": 3510 }, { "ce_loss_10": 3.544852817058563, "ce_loss_13": 3.4877468585968017, "ce_loss_2": 4.261535465717316, "ce_loss_3": 4.028400087356568, "ce_loss_7": 3.6583495497703553, "epoch": 0.352, "grad_norm": 928.0, "kl_loss_10": 113.20467529296874, "kl_loss_2": 1603.7619995117188, "kl_loss_3": 1139.722622680664, "kl_loss_7": 355.9799346923828, "learning_rate": 0.0007333336616128369, "loss": 822.5288, "step": 3520 }, { "ce_loss_10": 3.519996774196625, "ce_loss_13": 3.461334776878357, "ce_loss_2": 4.2705777287483215, "ce_loss_3": 4.02740398645401, "ce_loss_7": 3.6356507897377015, "epoch": 0.353, "grad_norm": 856.0, "kl_loss_10": 114.7894500732422, "kl_loss_2": 1674.3318664550782, "kl_loss_3": 1181.553628540039, "kl_loss_7": 371.0823638916016, "learning_rate": 0.0007319291921653463, "loss": 832.3778, "step": 3530 }, { "ce_loss_10": 3.607111060619354, "ce_loss_13": 3.547634994983673, "ce_loss_2": 4.344854402542114, "ce_loss_3": 4.1113566875457765, "ce_loss_7": 3.7266387581825255, "epoch": 0.354, "grad_norm": 848.0, "kl_loss_10": 116.1092742919922, "kl_loss_2": 1660.3392211914063, "kl_loss_3": 1188.1138244628905, "kl_loss_7": 378.91358947753906, "learning_rate": 0.0007305223871934656, "loss": 819.3607, "step": 3540 }, { "ce_loss_10": 3.56844744682312, "ce_loss_13": 3.5131123185157778, "ce_loss_2": 4.294091653823853, "ce_loss_3": 4.060184383392334, "ce_loss_7": 3.6857504844665527, "epoch": 0.355, "grad_norm": 812.0, "kl_loss_10": 112.87055015563965, "kl_loss_2": 1624.2784912109375, "kl_loss_3": 1147.5262908935547, "kl_loss_7": 363.9884552001953, "learning_rate": 0.0007291132608637052, "loss": 815.0952, "step": 3550 }, { "ce_loss_10": 3.5319056868553163, "ce_loss_13": 3.47707599401474, "ce_loss_2": 4.2967642664909365, "ce_loss_3": 4.034195554256439, "ce_loss_7": 3.646118187904358, "epoch": 0.356, "grad_norm": 1456.0, "kl_loss_10": 113.29713935852051, "kl_loss_2": 1697.2320617675782, "kl_loss_3": 1185.2187103271485, "kl_loss_7": 355.46015014648435, "learning_rate": 0.0007277018273659516, "loss": 841.3997, "step": 3560 }, { "ce_loss_10": 3.6619539856910706, "ce_loss_13": 3.60086305141449, "ce_loss_2": 4.390695488452911, "ce_loss_3": 4.159114730358124, "ce_loss_7": 3.776704525947571, "epoch": 0.357, "grad_norm": 648.0, "kl_loss_10": 124.89349174499512, "kl_loss_2": 1661.32607421875, "kl_loss_3": 1190.2617248535157, "kl_loss_7": 375.0624282836914, "learning_rate": 0.0007262881009133242, "loss": 834.5688, "step": 3570 }, { "ce_loss_10": 3.5744593501091004, "ce_loss_13": 3.5190460443496705, "ce_loss_2": 4.291014635562897, "ce_loss_3": 4.065291798114776, "ce_loss_7": 3.69045627117157, "epoch": 0.358, "grad_norm": 788.0, "kl_loss_10": 116.01307220458985, "kl_loss_2": 1618.3801513671874, "kl_loss_3": 1155.3495208740235, "kl_loss_7": 361.6014602661133, "learning_rate": 0.0007248720957420329, "loss": 809.4335, "step": 3580 }, { "ce_loss_10": 3.5892657995224, "ce_loss_13": 3.5325476050376894, "ce_loss_2": 4.303843867778778, "ce_loss_3": 4.068600571155548, "ce_loss_7": 3.6991782069206236, "epoch": 0.359, "grad_norm": 1072.0, "kl_loss_10": 116.2580722808838, "kl_loss_2": 1608.4495788574218, "kl_loss_3": 1135.5694519042968, "kl_loss_7": 360.97457122802734, "learning_rate": 0.0007234538261112341, "loss": 824.5166, "step": 3590 }, { "ce_loss_10": 3.622062110900879, "ce_loss_13": 3.5674719333648683, "ce_loss_2": 4.355395114421844, "ce_loss_3": 4.114840877056122, "ce_loss_7": 3.7364174842834474, "epoch": 0.36, "grad_norm": 548.0, "kl_loss_10": 114.94896354675294, "kl_loss_2": 1648.0451293945312, "kl_loss_3": 1174.3060668945313, "kl_loss_7": 369.6777969360352, "learning_rate": 0.0007220333063028871, "loss": 816.8971, "step": 3600 }, { "ce_loss_10": 3.6533716797828673, "ce_loss_13": 3.5973863124847414, "ce_loss_2": 4.432326292991638, "ce_loss_3": 4.171863329410553, "ce_loss_7": 3.7727251172065737, "epoch": 0.361, "grad_norm": 952.0, "kl_loss_10": 114.99727020263671, "kl_loss_2": 1779.2748962402343, "kl_loss_3": 1241.500958251953, "kl_loss_7": 378.4084533691406, "learning_rate": 0.0007206105506216106, "loss": 863.2791, "step": 3610 }, { "ce_loss_10": 3.5320773124694824, "ce_loss_13": 3.47771532535553, "ce_loss_2": 4.253122186660766, "ce_loss_3": 4.022608149051666, "ce_loss_7": 3.646604669094086, "epoch": 0.362, "grad_norm": 748.0, "kl_loss_10": 111.02953720092773, "kl_loss_2": 1609.5626220703125, "kl_loss_3": 1153.3769470214843, "kl_loss_7": 360.0997344970703, "learning_rate": 0.0007191855733945387, "loss": 807.024, "step": 3620 }, { "ce_loss_10": 3.622232747077942, "ce_loss_13": 3.568149435520172, "ce_loss_2": 4.341107964515686, "ce_loss_3": 4.109168660640717, "ce_loss_7": 3.7372394680976866, "epoch": 0.363, "grad_norm": 652.0, "kl_loss_10": 112.0240264892578, "kl_loss_2": 1613.2364685058594, "kl_loss_3": 1154.0867492675782, "kl_loss_7": 362.3990966796875, "learning_rate": 0.0007177583889711762, "loss": 810.762, "step": 3630 }, { "ce_loss_10": 3.5399230480194093, "ce_loss_13": 3.4826905012130736, "ce_loss_2": 4.282315456867218, "ce_loss_3": 4.040750968456268, "ce_loss_7": 3.657997989654541, "epoch": 0.364, "grad_norm": 756.0, "kl_loss_10": 112.95874557495117, "kl_loss_2": 1671.2237731933594, "kl_loss_3": 1183.0107391357421, "kl_loss_7": 369.6154296875, "learning_rate": 0.0007163290117232541, "loss": 830.5109, "step": 3640 }, { "ce_loss_10": 3.6612719535827636, "ce_loss_13": 3.606365132331848, "ce_loss_2": 4.347417032718658, "ce_loss_3": 4.12227988243103, "ce_loss_7": 3.773686683177948, "epoch": 0.365, "grad_norm": 928.0, "kl_loss_10": 111.72150573730468, "kl_loss_2": 1574.4906005859375, "kl_loss_3": 1125.264892578125, "kl_loss_7": 359.3614990234375, "learning_rate": 0.0007148974560445859, "loss": 806.21, "step": 3650 }, { "ce_loss_10": 3.5773789167404173, "ce_loss_13": 3.522028124332428, "ce_loss_2": 4.281854033470154, "ce_loss_3": 4.059629571437836, "ce_loss_7": 3.6927342653274535, "epoch": 0.366, "grad_norm": 736.0, "kl_loss_10": 110.19277267456054, "kl_loss_2": 1581.7052856445312, "kl_loss_3": 1140.0502868652343, "kl_loss_7": 362.01515045166013, "learning_rate": 0.0007134637363509209, "loss": 802.6989, "step": 3660 }, { "ce_loss_10": 3.6860452771186827, "ce_loss_13": 3.6308083653450014, "ce_loss_2": 4.390925621986389, "ce_loss_3": 4.164236938953399, "ce_loss_7": 3.803776812553406, "epoch": 0.367, "grad_norm": 728.0, "kl_loss_10": 109.99288368225098, "kl_loss_2": 1571.0088989257813, "kl_loss_3": 1124.6994598388671, "kl_loss_7": 359.7028305053711, "learning_rate": 0.0007120278670798009, "loss": 813.3729, "step": 3670 }, { "ce_loss_10": 3.4854964137077333, "ce_loss_13": 3.428144407272339, "ce_loss_2": 4.267225384712219, "ce_loss_3": 4.010604655742645, "ce_loss_7": 3.6120729088783263, "epoch": 0.368, "grad_norm": 1016.0, "kl_loss_10": 113.8963191986084, "kl_loss_2": 1739.7731201171875, "kl_loss_3": 1224.8854675292969, "kl_loss_7": 384.0669769287109, "learning_rate": 0.0007105898626904133, "loss": 862.9203, "step": 3680 }, { "ce_loss_10": 3.589139664173126, "ce_loss_13": 3.5323904752731323, "ce_loss_2": 4.318976259231567, "ce_loss_3": 4.081890630722046, "ce_loss_7": 3.7046213507652284, "epoch": 0.369, "grad_norm": 564.0, "kl_loss_10": 112.2456226348877, "kl_loss_2": 1621.3577758789063, "kl_loss_3": 1151.3302612304688, "kl_loss_7": 364.4396682739258, "learning_rate": 0.0007091497376634463, "loss": 805.5474, "step": 3690 }, { "ce_loss_10": 3.5295987725257874, "ce_loss_13": 3.4756538391113283, "ce_loss_2": 4.249388527870178, "ce_loss_3": 4.017361283302307, "ce_loss_7": 3.647699475288391, "epoch": 0.37, "grad_norm": 616.0, "kl_loss_10": 111.63405570983886, "kl_loss_2": 1615.4303771972657, "kl_loss_3": 1145.3682800292968, "kl_loss_7": 362.72245635986326, "learning_rate": 0.0007077075065009433, "loss": 825.8085, "step": 3700 }, { "ce_loss_10": 3.640535795688629, "ce_loss_13": 3.583171856403351, "ce_loss_2": 4.378232324123383, "ce_loss_3": 4.138037061691284, "ce_loss_7": 3.757775664329529, "epoch": 0.371, "grad_norm": 840.0, "kl_loss_10": 115.77109489440917, "kl_loss_2": 1657.125262451172, "kl_loss_3": 1176.4483612060546, "kl_loss_7": 376.4958068847656, "learning_rate": 0.0007062631837261557, "loss": 825.9424, "step": 3710 }, { "ce_loss_10": 3.507258176803589, "ce_loss_13": 3.4526267290115356, "ce_loss_2": 4.242707252502441, "ce_loss_3": 4.004286658763886, "ce_loss_7": 3.6265420079231263, "epoch": 0.372, "grad_norm": 740.0, "kl_loss_10": 111.28448181152343, "kl_loss_2": 1641.0077697753907, "kl_loss_3": 1160.2905731201172, "kl_loss_7": 363.54747161865237, "learning_rate": 0.0007048167838833977, "loss": 831.9325, "step": 3720 }, { "ce_loss_10": 3.6075364470481874, "ce_loss_13": 3.5515516638755797, "ce_loss_2": 4.312083971500397, "ce_loss_3": 4.0842722177505495, "ce_loss_7": 3.7187731742858885, "epoch": 0.373, "grad_norm": 644.0, "kl_loss_10": 112.55620231628419, "kl_loss_2": 1599.3533508300782, "kl_loss_3": 1142.2032135009765, "kl_loss_7": 362.6938003540039, "learning_rate": 0.0007033683215379002, "loss": 805.0671, "step": 3730 }, { "ce_loss_10": 3.595511317253113, "ce_loss_13": 3.539903426170349, "ce_loss_2": 4.318402671813965, "ce_loss_3": 4.082508647441864, "ce_loss_7": 3.7135739088058473, "epoch": 0.374, "grad_norm": 832.0, "kl_loss_10": 111.01340446472167, "kl_loss_2": 1610.1729858398437, "kl_loss_3": 1144.7270874023438, "kl_loss_7": 358.1510513305664, "learning_rate": 0.0007019178112756625, "loss": 820.7587, "step": 3740 }, { "ce_loss_10": 3.553373420238495, "ce_loss_13": 3.4991654753684998, "ce_loss_2": 4.282506775856018, "ce_loss_3": 4.044153892993927, "ce_loss_7": 3.6662455558776856, "epoch": 0.375, "grad_norm": 728.0, "kl_loss_10": 112.27315635681153, "kl_loss_2": 1620.312060546875, "kl_loss_3": 1150.902978515625, "kl_loss_7": 358.3823532104492, "learning_rate": 0.0007004652677033068, "loss": 818.4001, "step": 3750 }, { "ce_loss_10": 3.6296069502830504, "ce_loss_13": 3.5780036568641664, "ce_loss_2": 4.324629938602447, "ce_loss_3": 4.093917334079743, "ce_loss_7": 3.735015392303467, "epoch": 0.376, "grad_norm": 1320.0, "kl_loss_10": 108.83694229125976, "kl_loss_2": 1574.510302734375, "kl_loss_3": 1120.5037750244142, "kl_loss_7": 347.6950988769531, "learning_rate": 0.0006990107054479312, "loss": 800.2917, "step": 3760 }, { "ce_loss_10": 3.6188935160636904, "ce_loss_13": 3.563589060306549, "ce_loss_2": 4.328138792514801, "ce_loss_3": 4.102461910247802, "ce_loss_7": 3.7316895365715026, "epoch": 0.377, "grad_norm": 792.0, "kl_loss_10": 111.43818740844726, "kl_loss_2": 1603.086358642578, "kl_loss_3": 1139.6190460205078, "kl_loss_7": 354.68104095458983, "learning_rate": 0.000697554139156961, "loss": 805.6019, "step": 3770 }, { "ce_loss_10": 3.5970142006874086, "ce_loss_13": 3.5441549181938172, "ce_loss_2": 4.3167448282241825, "ce_loss_3": 4.087893843650818, "ce_loss_7": 3.7121401906013487, "epoch": 0.378, "grad_norm": 932.0, "kl_loss_10": 112.52928352355957, "kl_loss_2": 1632.8701477050781, "kl_loss_3": 1164.68017578125, "kl_loss_7": 366.97422790527344, "learning_rate": 0.0006960955834980027, "loss": 804.3657, "step": 3780 }, { "ce_loss_10": 3.572918438911438, "ce_loss_13": 3.5169412732124328, "ce_loss_2": 4.287224912643433, "ce_loss_3": 4.064115214347839, "ce_loss_7": 3.6872175335884094, "epoch": 0.379, "grad_norm": 628.0, "kl_loss_10": 111.71525993347169, "kl_loss_2": 1596.9349792480468, "kl_loss_3": 1145.359423828125, "kl_loss_7": 358.553889465332, "learning_rate": 0.0006946350531586958, "loss": 807.9014, "step": 3790 }, { "ce_loss_10": 3.594931626319885, "ce_loss_13": 3.540974700450897, "ce_loss_2": 4.307750010490418, "ce_loss_3": 4.074563002586364, "ce_loss_7": 3.7054863572120667, "epoch": 0.38, "grad_norm": 592.0, "kl_loss_10": 112.29558181762695, "kl_loss_2": 1597.129541015625, "kl_loss_3": 1141.9570861816405, "kl_loss_7": 358.04528961181643, "learning_rate": 0.0006931725628465643, "loss": 822.4022, "step": 3800 }, { "ce_loss_10": 3.6254909515380858, "ce_loss_13": 3.566372275352478, "ce_loss_2": 4.342851352691651, "ce_loss_3": 4.108896970748901, "ce_loss_7": 3.736094582080841, "epoch": 0.381, "grad_norm": 596.0, "kl_loss_10": 119.4555606842041, "kl_loss_2": 1615.1177734375, "kl_loss_3": 1153.0230712890625, "kl_loss_7": 361.6635452270508, "learning_rate": 0.0006917081272888696, "loss": 819.1617, "step": 3810 }, { "ce_loss_10": 3.531435215473175, "ce_loss_13": 3.4657562255859373, "ce_loss_2": 4.2563963532447815, "ce_loss_3": 4.0093022108078005, "ce_loss_7": 3.6335185408592223, "epoch": 0.382, "grad_norm": 624.0, "kl_loss_10": 125.53880195617675, "kl_loss_2": 1657.9251586914063, "kl_loss_3": 1152.6442901611329, "kl_loss_7": 358.821354675293, "learning_rate": 0.0006902417612324615, "loss": 812.629, "step": 3820 }, { "ce_loss_10": 3.6589030742645265, "ce_loss_13": 3.5913188815116883, "ce_loss_2": 4.390701973438263, "ce_loss_3": 4.154953193664551, "ce_loss_7": 3.768098795413971, "epoch": 0.383, "grad_norm": 816.0, "kl_loss_10": 129.4063404083252, "kl_loss_2": 1660.2730651855468, "kl_loss_3": 1190.3410583496093, "kl_loss_7": 373.82919921875, "learning_rate": 0.00068877347944363, "loss": 830.2361, "step": 3830 }, { "ce_loss_10": 3.651380455493927, "ce_loss_13": 3.590957319736481, "ce_loss_2": 4.348117756843567, "ce_loss_3": 4.117576336860656, "ce_loss_7": 3.7543843030929565, "epoch": 0.384, "grad_norm": 628.0, "kl_loss_10": 122.7444221496582, "kl_loss_2": 1583.8421508789063, "kl_loss_3": 1131.0393615722655, "kl_loss_7": 356.3165542602539, "learning_rate": 0.0006873032967079561, "loss": 817.4205, "step": 3840 }, { "ce_loss_10": 3.6335170745849608, "ce_loss_13": 3.577832078933716, "ce_loss_2": 4.3216951251029965, "ce_loss_3": 4.097213232517243, "ce_loss_7": 3.7368293046951293, "epoch": 0.385, "grad_norm": 708.0, "kl_loss_10": 117.6690486907959, "kl_loss_2": 1573.2097229003907, "kl_loss_3": 1122.621371459961, "kl_loss_7": 350.94859619140624, "learning_rate": 0.0006858312278301637, "loss": 796.704, "step": 3850 }, { "ce_loss_10": 3.6798113226890563, "ce_loss_13": 3.6175168991088866, "ce_loss_2": 4.350703716278076, "ce_loss_3": 4.131449520587921, "ce_loss_7": 3.779970121383667, "epoch": 0.386, "grad_norm": 748.0, "kl_loss_10": 120.30896453857422, "kl_loss_2": 1568.8364013671876, "kl_loss_3": 1122.1907867431642, "kl_loss_7": 353.39493560791016, "learning_rate": 0.0006843572876339704, "loss": 796.1761, "step": 3860 }, { "ce_loss_10": 3.5860485553741457, "ce_loss_13": 3.534332013130188, "ce_loss_2": 4.264412248134613, "ce_loss_3": 4.042767536640167, "ce_loss_7": 3.693724584579468, "epoch": 0.387, "grad_norm": 652.0, "kl_loss_10": 112.18650245666504, "kl_loss_2": 1543.6417297363282, "kl_loss_3": 1099.1077575683594, "kl_loss_7": 344.66336517333986, "learning_rate": 0.0006828814909619373, "loss": 810.5582, "step": 3870 }, { "ce_loss_10": 3.713555467128754, "ce_loss_13": 3.6556958675384523, "ce_loss_2": 4.405901694297791, "ce_loss_3": 4.180235993862152, "ce_loss_7": 3.821026015281677, "epoch": 0.388, "grad_norm": 588.0, "kl_loss_10": 117.28574714660644, "kl_loss_2": 1571.2838562011718, "kl_loss_3": 1117.8440704345703, "kl_loss_7": 356.7807907104492, "learning_rate": 0.0006814038526753205, "loss": 792.8091, "step": 3880 }, { "ce_loss_10": 3.6067586660385134, "ce_loss_13": 3.547635757923126, "ce_loss_2": 4.307654559612274, "ce_loss_3": 4.081937205791474, "ce_loss_7": 3.7171419978141786, "epoch": 0.389, "grad_norm": 568.0, "kl_loss_10": 113.71804313659668, "kl_loss_2": 1581.8426574707032, "kl_loss_3": 1130.9747375488282, "kl_loss_7": 355.7605316162109, "learning_rate": 0.0006799243876539213, "loss": 800.2555, "step": 3890 }, { "ce_loss_10": 3.5266871213912965, "ce_loss_13": 3.4727567434310913, "ce_loss_2": 4.271532738208771, "ce_loss_3": 4.021096110343933, "ce_loss_7": 3.643188107013702, "epoch": 0.39, "grad_norm": 832.0, "kl_loss_10": 113.58738403320312, "kl_loss_2": 1653.007305908203, "kl_loss_3": 1151.2217895507813, "kl_loss_7": 362.8320083618164, "learning_rate": 0.0006784431107959359, "loss": 817.077, "step": 3900 }, { "ce_loss_10": 3.589532899856567, "ce_loss_13": 3.5327725172042848, "ce_loss_2": 4.329496431350708, "ce_loss_3": 4.088249838352203, "ce_loss_7": 3.70775443315506, "epoch": 0.391, "grad_norm": 788.0, "kl_loss_10": 114.00553436279297, "kl_loss_2": 1664.1122314453125, "kl_loss_3": 1171.7627258300781, "kl_loss_7": 367.3645324707031, "learning_rate": 0.0006769600370178059, "loss": 819.1777, "step": 3910 }, { "ce_loss_10": 3.556470251083374, "ce_loss_13": 3.500228500366211, "ce_loss_2": 4.282485377788544, "ce_loss_3": 4.049212145805359, "ce_loss_7": 3.6682217478752137, "epoch": 0.392, "grad_norm": 692.0, "kl_loss_10": 110.49532318115234, "kl_loss_2": 1619.375909423828, "kl_loss_3": 1152.3483276367188, "kl_loss_7": 358.85225677490234, "learning_rate": 0.0006754751812540679, "loss": 797.0069, "step": 3920 }, { "ce_loss_10": 3.6087580561637878, "ce_loss_13": 3.5503377079963685, "ce_loss_2": 4.330141878128051, "ce_loss_3": 4.090830850601196, "ce_loss_7": 3.716358256340027, "epoch": 0.393, "grad_norm": 780.0, "kl_loss_10": 119.51278839111328, "kl_loss_2": 1637.2988830566405, "kl_loss_3": 1159.553662109375, "kl_loss_7": 362.3539810180664, "learning_rate": 0.0006739885584572025, "loss": 819.8983, "step": 3930 }, { "ce_loss_10": 3.6341138005256655, "ce_loss_13": 3.5722761869430544, "ce_loss_2": 4.366119122505188, "ce_loss_3": 4.114328777790069, "ce_loss_7": 3.7444755911827086, "epoch": 0.394, "grad_norm": 860.0, "kl_loss_10": 122.08591918945312, "kl_loss_2": 1677.715838623047, "kl_loss_3": 1173.277606201172, "kl_loss_7": 363.27244567871094, "learning_rate": 0.0006725001835974853, "loss": 816.8476, "step": 3940 }, { "ce_loss_10": 3.622213864326477, "ce_loss_13": 3.5617170572280883, "ce_loss_2": 4.345288610458374, "ce_loss_3": 4.107230472564697, "ce_loss_7": 3.7375001311302185, "epoch": 0.395, "grad_norm": 684.0, "kl_loss_10": 120.22575340270996, "kl_loss_2": 1625.9869079589844, "kl_loss_3": 1153.9129821777344, "kl_loss_7": 362.3075927734375, "learning_rate": 0.0006710100716628344, "loss": 801.8663, "step": 3950 }, { "ce_loss_10": 3.601362907886505, "ce_loss_13": 3.5433133959770204, "ce_loss_2": 4.320641613006591, "ce_loss_3": 4.090987122058868, "ce_loss_7": 3.7134194374084473, "epoch": 0.396, "grad_norm": 676.0, "kl_loss_10": 114.60573959350586, "kl_loss_2": 1617.4572631835938, "kl_loss_3": 1151.9900695800782, "kl_loss_7": 358.98384552001954, "learning_rate": 0.0006695182376586602, "loss": 816.8568, "step": 3960 }, { "ce_loss_10": 3.6466455340385435, "ce_loss_13": 3.5853740096092226, "ce_loss_2": 4.320103776454926, "ce_loss_3": 4.10079540014267, "ce_loss_7": 3.748496043682098, "epoch": 0.397, "grad_norm": 740.0, "kl_loss_10": 118.65713195800781, "kl_loss_2": 1531.3922546386718, "kl_loss_3": 1089.6826293945312, "kl_loss_7": 345.24675903320315, "learning_rate": 0.000668024696607715, "loss": 800.3069, "step": 3970 }, { "ce_loss_10": 3.587778663635254, "ce_loss_13": 3.5327473163604735, "ce_loss_2": 4.291483080387115, "ce_loss_3": 4.059780657291412, "ce_loss_7": 3.6969476103782655, "epoch": 0.398, "grad_norm": 772.0, "kl_loss_10": 115.91534194946288, "kl_loss_2": 1605.4522338867187, "kl_loss_3": 1145.034896850586, "kl_loss_7": 358.45362701416013, "learning_rate": 0.0006665294635499404, "loss": 806.5447, "step": 3980 }, { "ce_loss_10": 3.5975987553596496, "ce_loss_13": 3.5398255825042724, "ce_loss_2": 4.340124273300171, "ce_loss_3": 4.097122013568878, "ce_loss_7": 3.7147999167442323, "epoch": 0.399, "grad_norm": 736.0, "kl_loss_10": 119.60567893981934, "kl_loss_2": 1681.5505920410155, "kl_loss_3": 1194.736019897461, "kl_loss_7": 377.9365966796875, "learning_rate": 0.0006650325535423167, "loss": 825.7657, "step": 3990 }, { "ce_loss_10": 3.623650085926056, "ce_loss_13": 3.5689555644989013, "ce_loss_2": 4.305339181423188, "ce_loss_3": 4.083868932723999, "ce_loss_7": 3.7299221634864805, "epoch": 0.4, "grad_norm": 1240.0, "kl_loss_10": 108.51692848205566, "kl_loss_2": 1535.4692443847657, "kl_loss_3": 1098.896890258789, "kl_loss_7": 345.56116943359376, "learning_rate": 0.0006635339816587109, "loss": 793.1923, "step": 4000 }, { "ce_loss_10": 3.559077489376068, "ce_loss_13": 3.5038843393325805, "ce_loss_2": 4.285916292667389, "ce_loss_3": 4.043698585033416, "ce_loss_7": 3.668740463256836, "epoch": 0.401, "grad_norm": 572.0, "kl_loss_10": 113.25110626220703, "kl_loss_2": 1639.3580810546875, "kl_loss_3": 1154.5542938232422, "kl_loss_7": 359.79540100097654, "learning_rate": 0.0006620337629897252, "loss": 804.0013, "step": 4010 }, { "ce_loss_10": 3.5661354064941406, "ce_loss_13": 3.5108693480491637, "ce_loss_2": 4.284282314777374, "ce_loss_3": 4.0547087550163265, "ce_loss_7": 3.679810118675232, "epoch": 0.402, "grad_norm": 768.0, "kl_loss_10": 111.87732963562011, "kl_loss_2": 1613.9498107910156, "kl_loss_3": 1148.6082763671875, "kl_loss_7": 359.6726791381836, "learning_rate": 0.0006605319126425454, "loss": 823.1623, "step": 4020 }, { "ce_loss_10": 3.4701685905456543, "ce_loss_13": 3.4153579235076905, "ce_loss_2": 4.212935626506805, "ce_loss_3": 3.9681638360023497, "ce_loss_7": 3.5901816964149473, "epoch": 0.403, "grad_norm": 716.0, "kl_loss_10": 110.84479560852051, "kl_loss_2": 1668.632763671875, "kl_loss_3": 1176.426739501953, "kl_loss_7": 362.00088653564455, "learning_rate": 0.0006590284457407876, "loss": 818.29, "step": 4030 }, { "ce_loss_10": 3.5741465210914614, "ce_loss_13": 3.5170868396759034, "ce_loss_2": 4.283811104297638, "ce_loss_3": 4.052058017253875, "ce_loss_7": 3.6852954387664796, "epoch": 0.404, "grad_norm": 828.0, "kl_loss_10": 110.29585990905761, "kl_loss_2": 1591.2055053710938, "kl_loss_3": 1126.3041290283204, "kl_loss_7": 356.3999816894531, "learning_rate": 0.0006575233774243465, "loss": 800.8386, "step": 4040 }, { "ce_loss_10": 3.5590779066085814, "ce_loss_13": 3.5032316207885743, "ce_loss_2": 4.283195281028748, "ce_loss_3": 4.045397615432739, "ce_loss_7": 3.6708916664123534, "epoch": 0.405, "grad_norm": 860.0, "kl_loss_10": 110.3638069152832, "kl_loss_2": 1642.2369384765625, "kl_loss_3": 1161.6165649414063, "kl_loss_7": 361.9104797363281, "learning_rate": 0.0006560167228492435, "loss": 814.2618, "step": 4050 }, { "ce_loss_10": 3.606846511363983, "ce_loss_13": 3.558204233646393, "ce_loss_2": 4.296821963787079, "ce_loss_3": 4.071336483955383, "ce_loss_7": 3.717876648902893, "epoch": 0.406, "grad_norm": 860.0, "kl_loss_10": 105.17063331604004, "kl_loss_2": 1555.840966796875, "kl_loss_3": 1103.9833374023438, "kl_loss_7": 345.41405181884767, "learning_rate": 0.0006545084971874737, "loss": 799.0605, "step": 4060 }, { "ce_loss_10": 3.574194645881653, "ce_loss_13": 3.5181384086608887, "ce_loss_2": 4.310971105098725, "ce_loss_3": 4.075375628471375, "ce_loss_7": 3.693107807636261, "epoch": 0.407, "grad_norm": 1200.0, "kl_loss_10": 113.23215675354004, "kl_loss_2": 1657.857861328125, "kl_loss_3": 1173.7760009765625, "kl_loss_7": 368.5741241455078, "learning_rate": 0.0006529987156268526, "loss": 806.9854, "step": 4070 }, { "ce_loss_10": 3.4922988176345826, "ce_loss_13": 3.4339062452316282, "ce_loss_2": 4.230000305175781, "ce_loss_3": 3.9903356075286864, "ce_loss_7": 3.6105907559394836, "epoch": 0.408, "grad_norm": 744.0, "kl_loss_10": 113.21600570678712, "kl_loss_2": 1647.085302734375, "kl_loss_3": 1165.4387115478517, "kl_loss_7": 364.42572021484375, "learning_rate": 0.0006514873933708637, "loss": 828.9529, "step": 4080 }, { "ce_loss_10": 3.5987717390060423, "ce_loss_13": 3.546416938304901, "ce_loss_2": 4.315543818473816, "ce_loss_3": 4.074562764167785, "ce_loss_7": 3.714030992984772, "epoch": 0.409, "grad_norm": 1056.0, "kl_loss_10": 108.66846237182617, "kl_loss_2": 1603.7696472167968, "kl_loss_3": 1132.9613677978516, "kl_loss_7": 354.7437286376953, "learning_rate": 0.0006499745456385053, "loss": 800.9375, "step": 4090 }, { "ce_loss_10": 3.5672728180885316, "ce_loss_13": 3.513661253452301, "ce_loss_2": 4.2895782589912415, "ce_loss_3": 4.0553812980651855, "ce_loss_7": 3.6795910000801086, "epoch": 0.41, "grad_norm": 824.0, "kl_loss_10": 109.73035316467285, "kl_loss_2": 1597.405938720703, "kl_loss_3": 1141.4947448730468, "kl_loss_7": 356.6665985107422, "learning_rate": 0.0006484601876641375, "loss": 812.7092, "step": 4100 }, { "ce_loss_10": 3.5573137998580933, "ce_loss_13": 3.5041900873184204, "ce_loss_2": 4.246158039569854, "ce_loss_3": 4.028305006027222, "ce_loss_7": 3.666341245174408, "epoch": 0.411, "grad_norm": 592.0, "kl_loss_10": 107.3414405822754, "kl_loss_2": 1554.253887939453, "kl_loss_3": 1112.2054473876954, "kl_loss_7": 347.7774963378906, "learning_rate": 0.000646944334697328, "loss": 788.0204, "step": 4110 }, { "ce_loss_10": 3.671020877361298, "ce_loss_13": 3.6174062371253966, "ce_loss_2": 4.356060886383057, "ce_loss_3": 4.140322005748748, "ce_loss_7": 3.7856799960136414, "epoch": 0.412, "grad_norm": 556.0, "kl_loss_10": 108.6750274658203, "kl_loss_2": 1534.2772705078125, "kl_loss_3": 1096.760092163086, "kl_loss_7": 350.0878479003906, "learning_rate": 0.0006454270020026995, "loss": 776.7923, "step": 4120 }, { "ce_loss_10": 3.639327073097229, "ce_loss_13": 3.5877670288085937, "ce_loss_2": 4.316995143890381, "ce_loss_3": 4.100991833209991, "ce_loss_7": 3.7454528331756594, "epoch": 0.413, "grad_norm": 748.0, "kl_loss_10": 105.7381103515625, "kl_loss_2": 1526.7204467773438, "kl_loss_3": 1095.415216064453, "kl_loss_7": 343.8223907470703, "learning_rate": 0.0006439082048597755, "loss": 775.9975, "step": 4130 }, { "ce_loss_10": 3.628104865550995, "ce_loss_13": 3.574373698234558, "ce_loss_2": 4.329335260391235, "ce_loss_3": 4.099046552181244, "ce_loss_7": 3.742833375930786, "epoch": 0.414, "grad_norm": 776.0, "kl_loss_10": 109.39915008544922, "kl_loss_2": 1594.7671813964844, "kl_loss_3": 1132.73916015625, "kl_loss_7": 357.6129547119141, "learning_rate": 0.0006423879585628261, "loss": 800.1681, "step": 4140 }, { "ce_loss_10": 3.59013671875, "ce_loss_13": 3.5310643911361694, "ce_loss_2": 4.332114636898041, "ce_loss_3": 4.090101170539856, "ce_loss_7": 3.708698034286499, "epoch": 0.415, "grad_norm": 720.0, "kl_loss_10": 114.16459083557129, "kl_loss_2": 1655.5896728515625, "kl_loss_3": 1175.8370361328125, "kl_loss_7": 365.1665435791016, "learning_rate": 0.0006408662784207149, "loss": 822.0938, "step": 4150 }, { "ce_loss_10": 3.547511374950409, "ce_loss_13": 3.4912413239479063, "ce_loss_2": 4.252956867218018, "ce_loss_3": 4.024624216556549, "ce_loss_7": 3.6563313722610475, "epoch": 0.416, "grad_norm": 796.0, "kl_loss_10": 108.11836242675781, "kl_loss_2": 1603.76171875, "kl_loss_3": 1143.7199920654298, "kl_loss_7": 351.2268661499023, "learning_rate": 0.0006393431797567439, "loss": 803.3352, "step": 4160 }, { "ce_loss_10": 3.6319918870925902, "ce_loss_13": 3.578919732570648, "ce_loss_2": 4.302798545360565, "ce_loss_3": 4.081296157836914, "ce_loss_7": 3.7385294437408447, "epoch": 0.417, "grad_norm": 656.0, "kl_loss_10": 109.54261932373046, "kl_loss_2": 1541.943701171875, "kl_loss_3": 1099.8628997802734, "kl_loss_7": 348.5423294067383, "learning_rate": 0.0006378186779084996, "loss": 765.586, "step": 4170 }, { "ce_loss_10": 3.46295428276062, "ce_loss_13": 3.4088621854782106, "ce_loss_2": 4.204769730567932, "ce_loss_3": 3.9614800333976747, "ce_loss_7": 3.5774038791656495, "epoch": 0.418, "grad_norm": 760.0, "kl_loss_10": 110.07180404663086, "kl_loss_2": 1635.357568359375, "kl_loss_3": 1155.2987487792968, "kl_loss_7": 356.7828842163086, "learning_rate": 0.0006362927882276989, "loss": 813.903, "step": 4180 }, { "ce_loss_10": 3.6643528580665587, "ce_loss_13": 3.608100974559784, "ce_loss_2": 4.35189688205719, "ce_loss_3": 4.127672624588013, "ce_loss_7": 3.772542440891266, "epoch": 0.419, "grad_norm": 652.0, "kl_loss_10": 108.98200492858886, "kl_loss_2": 1566.8099609375, "kl_loss_3": 1108.0154205322265, "kl_loss_7": 343.8260559082031, "learning_rate": 0.000634765526080034, "loss": 773.992, "step": 4190 }, { "ce_loss_10": 3.666310214996338, "ce_loss_13": 3.6103180050849915, "ce_loss_2": 4.360920691490174, "ce_loss_3": 4.13553694486618, "ce_loss_7": 3.7773757338523866, "epoch": 0.42, "grad_norm": 784.0, "kl_loss_10": 111.99367599487304, "kl_loss_2": 1578.667254638672, "kl_loss_3": 1126.5918273925781, "kl_loss_7": 353.9135177612305, "learning_rate": 0.0006332369068450174, "loss": 785.516, "step": 4200 }, { "ce_loss_10": 3.5980607748031614, "ce_loss_13": 3.542741060256958, "ce_loss_2": 4.305352103710175, "ce_loss_3": 4.06997002363205, "ce_loss_7": 3.706935429573059, "epoch": 0.421, "grad_norm": 540.0, "kl_loss_10": 109.52786445617676, "kl_loss_2": 1581.6275329589844, "kl_loss_3": 1121.8053253173828, "kl_loss_7": 351.7825164794922, "learning_rate": 0.0006317069459158283, "loss": 789.4091, "step": 4210 }, { "ce_loss_10": 3.7129782915115355, "ce_loss_13": 3.66150221824646, "ce_loss_2": 4.386883282661438, "ce_loss_3": 4.162325203418732, "ce_loss_7": 3.8174985527992247, "epoch": 0.422, "grad_norm": 728.0, "kl_loss_10": 109.29851493835449, "kl_loss_2": 1534.1687622070312, "kl_loss_3": 1092.115789794922, "kl_loss_7": 345.35370178222655, "learning_rate": 0.0006301756586991561, "loss": 785.4158, "step": 4220 }, { "ce_loss_10": 3.4808504223823546, "ce_loss_13": 3.4289090514183043, "ce_loss_2": 4.210888338088989, "ce_loss_3": 3.9705875873565675, "ce_loss_7": 3.5945833683013917, "epoch": 0.423, "grad_norm": 760.0, "kl_loss_10": 109.6344123840332, "kl_loss_2": 1649.18388671875, "kl_loss_3": 1168.557196044922, "kl_loss_7": 360.38920440673826, "learning_rate": 0.0006286430606150459, "loss": 810.1971, "step": 4230 }, { "ce_loss_10": 3.686413383483887, "ce_loss_13": 3.633048677444458, "ce_loss_2": 4.373401165008545, "ce_loss_3": 4.152478146553039, "ce_loss_7": 3.79513897895813, "epoch": 0.424, "grad_norm": 576.0, "kl_loss_10": 109.33524894714355, "kl_loss_2": 1562.4575439453124, "kl_loss_3": 1115.3017944335938, "kl_loss_7": 354.32454071044924, "learning_rate": 0.0006271091670967436, "loss": 783.647, "step": 4240 }, { "ce_loss_10": 3.599622702598572, "ce_loss_13": 3.5406736612319945, "ce_loss_2": 4.324578738212585, "ce_loss_3": 4.091307723522187, "ce_loss_7": 3.7173473715782164, "epoch": 0.425, "grad_norm": 932.0, "kl_loss_10": 113.91219444274903, "kl_loss_2": 1654.1155517578125, "kl_loss_3": 1179.1394104003907, "kl_loss_7": 373.93585510253905, "learning_rate": 0.0006255739935905395, "loss": 810.7966, "step": 4250 }, { "ce_loss_10": 3.6332942247390747, "ce_loss_13": 3.580468213558197, "ce_loss_2": 4.318172717094422, "ce_loss_3": 4.094944822788238, "ce_loss_7": 3.744012701511383, "epoch": 0.426, "grad_norm": 564.0, "kl_loss_10": 108.21953315734864, "kl_loss_2": 1559.2735534667968, "kl_loss_3": 1105.1550354003907, "kl_loss_7": 348.98621063232423, "learning_rate": 0.0006240375555556145, "loss": 808.7702, "step": 4260 }, { "ce_loss_10": 3.644718074798584, "ce_loss_13": 3.5896992921829223, "ce_loss_2": 4.373972868919372, "ce_loss_3": 4.134547388553619, "ce_loss_7": 3.7593003869056703, "epoch": 0.427, "grad_norm": 912.0, "kl_loss_10": 109.82870063781738, "kl_loss_2": 1615.0028869628907, "kl_loss_3": 1141.0478881835938, "kl_loss_7": 352.82830810546875, "learning_rate": 0.000622499868463882, "loss": 798.742, "step": 4270 }, { "ce_loss_10": 3.611183559894562, "ce_loss_13": 3.5582830786705015, "ce_loss_2": 4.28910653591156, "ce_loss_3": 4.066485011577607, "ce_loss_7": 3.714597499370575, "epoch": 0.428, "grad_norm": 724.0, "kl_loss_10": 107.94103050231934, "kl_loss_2": 1559.0828002929688, "kl_loss_3": 1100.9592224121093, "kl_loss_7": 342.31569213867186, "learning_rate": 0.0006209609477998338, "loss": 788.2028, "step": 4280 }, { "ce_loss_10": 3.664873945713043, "ce_loss_13": 3.6078646540641786, "ce_loss_2": 4.361586105823517, "ce_loss_3": 4.1321197867393495, "ce_loss_7": 3.7738134384155275, "epoch": 0.429, "grad_norm": 712.0, "kl_loss_10": 111.10050354003906, "kl_loss_2": 1577.475, "kl_loss_3": 1128.3111358642577, "kl_loss_7": 351.4517105102539, "learning_rate": 0.0006194208090603844, "loss": 800.3831, "step": 4290 }, { "ce_loss_10": 3.5812186121940615, "ce_loss_13": 3.529315197467804, "ce_loss_2": 4.274899244308472, "ce_loss_3": 4.04980229139328, "ce_loss_7": 3.6913169741630556, "epoch": 0.43, "grad_norm": 896.0, "kl_loss_10": 106.97743072509766, "kl_loss_2": 1551.778759765625, "kl_loss_3": 1103.7679443359375, "kl_loss_7": 344.62910919189454, "learning_rate": 0.0006178794677547138, "loss": 777.4179, "step": 4300 }, { "ce_loss_10": 3.611683452129364, "ce_loss_13": 3.55863596200943, "ce_loss_2": 4.320464360713959, "ce_loss_3": 4.09289687871933, "ce_loss_7": 3.726914095878601, "epoch": 0.431, "grad_norm": 764.0, "kl_loss_10": 109.82320098876953, "kl_loss_2": 1605.521875, "kl_loss_3": 1144.8658203125, "kl_loss_7": 359.4647903442383, "learning_rate": 0.0006163369394041111, "loss": 797.5429, "step": 4310 }, { "ce_loss_10": 3.5496033310890196, "ce_loss_13": 3.4974258303642274, "ce_loss_2": 4.269338321685791, "ce_loss_3": 4.0404635787010195, "ce_loss_7": 3.6666770935058595, "epoch": 0.432, "grad_norm": 804.0, "kl_loss_10": 108.05587043762208, "kl_loss_2": 1616.2899108886718, "kl_loss_3": 1156.0351806640624, "kl_loss_7": 355.83813629150393, "learning_rate": 0.0006147932395418205, "loss": 819.3604, "step": 4320 }, { "ce_loss_10": 3.583838105201721, "ce_loss_13": 3.529697823524475, "ce_loss_2": 4.2737444043159485, "ce_loss_3": 4.0504556655883786, "ce_loss_7": 3.6884029269218446, "epoch": 0.433, "grad_norm": 1160.0, "kl_loss_10": 107.5973388671875, "kl_loss_2": 1561.297918701172, "kl_loss_3": 1112.086898803711, "kl_loss_7": 348.4447265625, "learning_rate": 0.0006132483837128823, "loss": 781.3867, "step": 4330 }, { "ce_loss_10": 3.5666239976882936, "ce_loss_13": 3.5152251958847045, "ce_loss_2": 4.277129757404327, "ce_loss_3": 4.034627997875214, "ce_loss_7": 3.675694727897644, "epoch": 0.434, "grad_norm": 748.0, "kl_loss_10": 108.01759567260743, "kl_loss_2": 1601.0975158691406, "kl_loss_3": 1121.4753845214843, "kl_loss_7": 352.5591354370117, "learning_rate": 0.0006117023874739772, "loss": 795.2471, "step": 4340 }, { "ce_loss_10": 3.5544289708137513, "ce_loss_13": 3.501638042926788, "ce_loss_2": 4.268613231182099, "ce_loss_3": 4.03516457080841, "ce_loss_7": 3.66785523891449, "epoch": 0.435, "grad_norm": 660.0, "kl_loss_10": 108.2745346069336, "kl_loss_2": 1611.378271484375, "kl_loss_3": 1141.5203735351563, "kl_loss_7": 357.8962738037109, "learning_rate": 0.0006101552663932703, "loss": 803.5875, "step": 4350 }, { "ce_loss_10": 3.58964604139328, "ce_loss_13": 3.53500018119812, "ce_loss_2": 4.277464830875397, "ce_loss_3": 4.055664229393005, "ce_loss_7": 3.6980403900146483, "epoch": 0.436, "grad_norm": 688.0, "kl_loss_10": 110.70993270874024, "kl_loss_2": 1580.899591064453, "kl_loss_3": 1120.4501556396485, "kl_loss_7": 354.43458862304686, "learning_rate": 0.0006086070360502539, "loss": 796.6737, "step": 4360 }, { "ce_loss_10": 3.58989417552948, "ce_loss_13": 3.535743165016174, "ce_loss_2": 4.294692206382751, "ce_loss_3": 4.058806276321411, "ce_loss_7": 3.6955052495002745, "epoch": 0.437, "grad_norm": 676.0, "kl_loss_10": 109.01717109680176, "kl_loss_2": 1593.5218872070313, "kl_loss_3": 1125.020816040039, "kl_loss_7": 348.8786880493164, "learning_rate": 0.0006070577120355903, "loss": 792.9192, "step": 4370 }, { "ce_loss_10": 3.5945995688438415, "ce_loss_13": 3.539144825935364, "ce_loss_2": 4.2865272760391235, "ce_loss_3": 4.071486496925354, "ce_loss_7": 3.7060585618019104, "epoch": 0.438, "grad_norm": 732.0, "kl_loss_10": 106.93712120056152, "kl_loss_2": 1553.8668884277345, "kl_loss_3": 1116.010565185547, "kl_loss_7": 347.89042510986326, "learning_rate": 0.0006055073099509549, "loss": 783.8716, "step": 4380 }, { "ce_loss_10": 3.6512889504432677, "ce_loss_13": 3.5975317478179933, "ce_loss_2": 4.341889214515686, "ce_loss_3": 4.113449001312256, "ce_loss_7": 3.7572733759880066, "epoch": 0.439, "grad_norm": 636.0, "kl_loss_10": 107.61761360168457, "kl_loss_2": 1557.2190246582031, "kl_loss_3": 1107.4708557128906, "kl_loss_7": 345.42579040527346, "learning_rate": 0.0006039558454088796, "loss": 796.3017, "step": 4390 }, { "ce_loss_10": 3.632346880435944, "ce_loss_13": 3.5744163393974304, "ce_loss_2": 4.342388272285461, "ce_loss_3": 4.114965856075287, "ce_loss_7": 3.7484363436698915, "epoch": 0.44, "grad_norm": 668.0, "kl_loss_10": 110.9566276550293, "kl_loss_2": 1602.0462219238282, "kl_loss_3": 1143.73037109375, "kl_loss_7": 358.4840744018555, "learning_rate": 0.0006024033340325954, "loss": 788.2627, "step": 4400 }, { "ce_loss_10": 3.6963846564292906, "ce_loss_13": 3.6434014201164246, "ce_loss_2": 4.367515945434571, "ce_loss_3": 4.143146562576294, "ce_loss_7": 3.799778068065643, "epoch": 0.441, "grad_norm": 912.0, "kl_loss_10": 106.078271484375, "kl_loss_2": 1503.47685546875, "kl_loss_3": 1065.7772735595704, "kl_loss_7": 336.0490234375, "learning_rate": 0.0006008497914558743, "loss": 767.4968, "step": 4410 }, { "ce_loss_10": 3.6384681940078734, "ce_loss_13": 3.5803956389427185, "ce_loss_2": 4.343875598907471, "ce_loss_3": 4.112421405315399, "ce_loss_7": 3.749760186672211, "epoch": 0.442, "grad_norm": 836.0, "kl_loss_10": 113.6450122833252, "kl_loss_2": 1593.7016723632812, "kl_loss_3": 1131.4469818115235, "kl_loss_7": 359.4178527832031, "learning_rate": 0.0005992952333228728, "loss": 795.3125, "step": 4420 }, { "ce_loss_10": 3.57104572057724, "ce_loss_13": 3.5198657870292664, "ce_loss_2": 4.280158734321594, "ce_loss_3": 4.046003353595734, "ce_loss_7": 3.681185817718506, "epoch": 0.443, "grad_norm": 636.0, "kl_loss_10": 106.92403793334961, "kl_loss_2": 1602.4413696289062, "kl_loss_3": 1132.8224975585938, "kl_loss_7": 346.2705642700195, "learning_rate": 0.0005977396752879741, "loss": 791.5731, "step": 4430 }, { "ce_loss_10": 3.5007863640785217, "ce_loss_13": 3.4477171778678892, "ce_loss_2": 4.210827338695526, "ce_loss_3": 3.977983021736145, "ce_loss_7": 3.614708137512207, "epoch": 0.444, "grad_norm": 800.0, "kl_loss_10": 104.44592208862305, "kl_loss_2": 1614.9410827636718, "kl_loss_3": 1140.3809295654296, "kl_loss_7": 349.715380859375, "learning_rate": 0.0005961831330156305, "loss": 790.0957, "step": 4440 }, { "ce_loss_10": 3.642556297779083, "ce_loss_13": 3.5895811676979066, "ce_loss_2": 4.353832125663757, "ce_loss_3": 4.119977056980133, "ce_loss_7": 3.751791608333588, "epoch": 0.445, "grad_norm": 848.0, "kl_loss_10": 108.8304931640625, "kl_loss_2": 1612.9378601074218, "kl_loss_3": 1143.15947265625, "kl_loss_7": 353.1093444824219, "learning_rate": 0.0005946256221802051, "loss": 812.0301, "step": 4450 }, { "ce_loss_10": 3.6208613395690916, "ce_loss_13": 3.5698217391967773, "ce_loss_2": 4.2941223859786986, "ce_loss_3": 4.067051422595978, "ce_loss_7": 3.7218756914138793, "epoch": 0.446, "grad_norm": 692.0, "kl_loss_10": 106.97785911560058, "kl_loss_2": 1524.440771484375, "kl_loss_3": 1079.041940307617, "kl_loss_7": 338.7503356933594, "learning_rate": 0.0005930671584658151, "loss": 805.509, "step": 4460 }, { "ce_loss_10": 3.623039758205414, "ce_loss_13": 3.5692772388458254, "ce_loss_2": 4.318716645240784, "ce_loss_3": 4.081597459316254, "ce_loss_7": 3.7314345717430113, "epoch": 0.447, "grad_norm": 976.0, "kl_loss_10": 107.55665588378906, "kl_loss_2": 1580.4924011230469, "kl_loss_3": 1110.966748046875, "kl_loss_7": 346.3030258178711, "learning_rate": 0.0005915077575661722, "loss": 799.5762, "step": 4470 }, { "ce_loss_10": 3.6382622718811035, "ce_loss_13": 3.583562767505646, "ce_loss_2": 4.341680574417114, "ce_loss_3": 4.111017751693725, "ce_loss_7": 3.750675618648529, "epoch": 0.448, "grad_norm": 740.0, "kl_loss_10": 111.75588569641113, "kl_loss_2": 1598.7828002929687, "kl_loss_3": 1131.2714416503907, "kl_loss_7": 356.4145278930664, "learning_rate": 0.000589947435184427, "loss": 786.9881, "step": 4480 }, { "ce_loss_10": 3.7111261248588563, "ce_loss_13": 3.657747006416321, "ce_loss_2": 4.366076803207397, "ce_loss_3": 4.152265286445617, "ce_loss_7": 3.8132617354393004, "epoch": 0.449, "grad_norm": 620.0, "kl_loss_10": 109.36752700805664, "kl_loss_2": 1517.9298706054688, "kl_loss_3": 1085.603988647461, "kl_loss_7": 343.01112670898436, "learning_rate": 0.0005883862070330078, "loss": 778.6908, "step": 4490 }, { "ce_loss_10": 3.6374473094940187, "ce_loss_13": 3.5844584345817565, "ce_loss_2": 4.329462325572967, "ce_loss_3": 4.105158042907715, "ce_loss_7": 3.7480441689491273, "epoch": 0.45, "grad_norm": 936.0, "kl_loss_10": 108.28100662231445, "kl_loss_2": 1575.8340698242187, "kl_loss_3": 1122.2505157470703, "kl_loss_7": 351.20479888916014, "learning_rate": 0.0005868240888334653, "loss": 782.2573, "step": 4500 }, { "ce_loss_10": 3.5195240020751952, "ce_loss_13": 3.466025245189667, "ce_loss_2": 4.243563389778137, "ce_loss_3": 4.0139793992042545, "ce_loss_7": 3.6346124410629272, "epoch": 0.451, "grad_norm": 736.0, "kl_loss_10": 109.5379539489746, "kl_loss_2": 1607.1504211425781, "kl_loss_3": 1155.0284393310546, "kl_loss_7": 355.5206588745117, "learning_rate": 0.0005852610963163119, "loss": 800.4146, "step": 4510 }, { "ce_loss_10": 3.544723391532898, "ce_loss_13": 3.490994024276733, "ce_loss_2": 4.230922675132751, "ce_loss_3": 4.012325489521027, "ce_loss_7": 3.652547013759613, "epoch": 0.452, "grad_norm": 636.0, "kl_loss_10": 105.81944465637207, "kl_loss_2": 1557.9451171875, "kl_loss_3": 1107.8014221191406, "kl_loss_7": 342.89503021240233, "learning_rate": 0.0005836972452208654, "loss": 773.7493, "step": 4520 }, { "ce_loss_10": 3.545295166969299, "ce_loss_13": 3.4926422953605654, "ce_loss_2": 4.259631025791168, "ce_loss_3": 4.027780282497406, "ce_loss_7": 3.6585117101669313, "epoch": 0.453, "grad_norm": 1088.0, "kl_loss_10": 107.09506111145019, "kl_loss_2": 1580.9333740234374, "kl_loss_3": 1127.8527770996093, "kl_loss_7": 350.4449951171875, "learning_rate": 0.0005821325512950885, "loss": 792.7084, "step": 4530 }, { "ce_loss_10": 3.57132648229599, "ce_loss_13": 3.5218061804771423, "ce_loss_2": 4.262444412708282, "ce_loss_3": 4.037965607643128, "ce_loss_7": 3.678075098991394, "epoch": 0.454, "grad_norm": 656.0, "kl_loss_10": 104.18403358459473, "kl_loss_2": 1537.4950805664062, "kl_loss_3": 1093.6788299560546, "kl_loss_7": 339.663801574707, "learning_rate": 0.0005805670302954321, "loss": 785.5168, "step": 4540 }, { "ce_loss_10": 3.5839459538459777, "ce_loss_13": 3.534604048728943, "ce_loss_2": 4.267784786224365, "ce_loss_3": 4.045807409286499, "ce_loss_7": 3.689132308959961, "epoch": 0.455, "grad_norm": 672.0, "kl_loss_10": 103.64351615905761, "kl_loss_2": 1546.8034790039062, "kl_loss_3": 1103.8724212646484, "kl_loss_7": 340.18624114990234, "learning_rate": 0.000579000697986675, "loss": 771.8404, "step": 4550 }, { "ce_loss_10": 3.538821852207184, "ce_loss_13": 3.4819812893867494, "ce_loss_2": 4.265886902809143, "ce_loss_3": 4.0334603667259215, "ce_loss_7": 3.6552736282348635, "epoch": 0.456, "grad_norm": 680.0, "kl_loss_10": 110.63412628173828, "kl_loss_2": 1620.8699768066406, "kl_loss_3": 1158.6603637695312, "kl_loss_7": 359.01024169921874, "learning_rate": 0.0005774335701417662, "loss": 797.246, "step": 4560 }, { "ce_loss_10": 3.527772307395935, "ce_loss_13": 3.4755420207977297, "ce_loss_2": 4.238798153400421, "ce_loss_3": 4.004311883449555, "ce_loss_7": 3.640327513217926, "epoch": 0.457, "grad_norm": 784.0, "kl_loss_10": 106.28552436828613, "kl_loss_2": 1610.9963317871093, "kl_loss_3": 1140.0501953125, "kl_loss_7": 350.3925048828125, "learning_rate": 0.0005758656625416658, "loss": 795.3104, "step": 4570 }, { "ce_loss_10": 3.582171177864075, "ce_loss_13": 3.5287099123001098, "ce_loss_2": 4.286090135574341, "ce_loss_3": 4.051575064659119, "ce_loss_7": 3.6925273537635803, "epoch": 0.458, "grad_norm": 752.0, "kl_loss_10": 107.13385391235352, "kl_loss_2": 1575.1007080078125, "kl_loss_3": 1115.4173217773437, "kl_loss_7": 348.64647979736327, "learning_rate": 0.0005742969909751859, "loss": 775.693, "step": 4580 }, { "ce_loss_10": 3.59539133310318, "ce_loss_13": 3.541775679588318, "ce_loss_2": 4.293217575550079, "ce_loss_3": 4.0632738947868345, "ce_loss_7": 3.7048603534698485, "epoch": 0.459, "grad_norm": 800.0, "kl_loss_10": 107.83103408813477, "kl_loss_2": 1582.502587890625, "kl_loss_3": 1116.2388397216796, "kl_loss_7": 346.83958740234374, "learning_rate": 0.0005727275712388318, "loss": 792.1415, "step": 4590 }, { "ce_loss_10": 3.6260154604911805, "ce_loss_13": 3.5757366180419923, "ce_loss_2": 4.2999175667762755, "ce_loss_3": 4.07515869140625, "ce_loss_7": 3.7306299567222596, "epoch": 0.46, "grad_norm": 812.0, "kl_loss_10": 104.83017654418946, "kl_loss_2": 1543.1842407226563, "kl_loss_3": 1093.9305236816406, "kl_loss_7": 339.95037841796875, "learning_rate": 0.0005711574191366427, "loss": 780.5626, "step": 4600 }, { "ce_loss_10": 3.5696982383728026, "ce_loss_13": 3.5184038162231444, "ce_loss_2": 4.261876273155212, "ce_loss_3": 4.036314535140991, "ce_loss_7": 3.6753049969673155, "epoch": 0.461, "grad_norm": 712.0, "kl_loss_10": 105.47657279968261, "kl_loss_2": 1572.1477355957031, "kl_loss_3": 1112.2259094238282, "kl_loss_7": 344.2445343017578, "learning_rate": 0.0005695865504800327, "loss": 779.8552, "step": 4610 }, { "ce_loss_10": 3.512272357940674, "ce_loss_13": 3.4572680711746218, "ce_loss_2": 4.281127500534057, "ce_loss_3": 4.027366352081299, "ce_loss_7": 3.632248318195343, "epoch": 0.462, "grad_norm": 1088.0, "kl_loss_10": 110.0875846862793, "kl_loss_2": 1694.7703979492187, "kl_loss_3": 1194.6057922363282, "kl_loss_7": 363.4616394042969, "learning_rate": 0.0005680149810876322, "loss": 809.6735, "step": 4620 }, { "ce_loss_10": 3.5679752707481383, "ce_loss_13": 3.5146932005882263, "ce_loss_2": 4.263607358932495, "ce_loss_3": 4.039915704727173, "ce_loss_7": 3.6722684502601624, "epoch": 0.463, "grad_norm": 704.0, "kl_loss_10": 107.62806701660156, "kl_loss_2": 1566.1937133789063, "kl_loss_3": 1118.8762084960938, "kl_loss_7": 343.0075744628906, "learning_rate": 0.0005664427267851271, "loss": 778.1993, "step": 4630 }, { "ce_loss_10": 3.4835991621017457, "ce_loss_13": 3.432711887359619, "ce_loss_2": 4.180566942691803, "ce_loss_3": 3.9567903876304626, "ce_loss_7": 3.5922146916389464, "epoch": 0.464, "grad_norm": 748.0, "kl_loss_10": 105.2827262878418, "kl_loss_2": 1568.0562255859375, "kl_loss_3": 1113.7365661621093, "kl_loss_7": 342.60572204589846, "learning_rate": 0.0005648698034051009, "loss": 775.151, "step": 4640 }, { "ce_loss_10": 3.600264918804169, "ce_loss_13": 3.545127975940704, "ce_loss_2": 4.316138565540314, "ce_loss_3": 4.07486492395401, "ce_loss_7": 3.705556297302246, "epoch": 0.465, "grad_norm": 636.0, "kl_loss_10": 107.335888671875, "kl_loss_2": 1605.7216552734376, "kl_loss_3": 1126.881021118164, "kl_loss_7": 341.64183197021487, "learning_rate": 0.0005632962267868747, "loss": 778.2783, "step": 4650 }, { "ce_loss_10": 3.537656307220459, "ce_loss_13": 3.4871830463409426, "ce_loss_2": 4.227344763278961, "ce_loss_3": 4.00237489938736, "ce_loss_7": 3.648514187335968, "epoch": 0.466, "grad_norm": 768.0, "kl_loss_10": 102.39470024108887, "kl_loss_2": 1548.0284057617187, "kl_loss_3": 1097.4708526611328, "kl_loss_7": 336.05613098144534, "learning_rate": 0.0005617220127763474, "loss": 781.8706, "step": 4660 }, { "ce_loss_10": 3.615744781494141, "ce_loss_13": 3.5643057227134705, "ce_loss_2": 4.297688031196595, "ce_loss_3": 4.077828156948089, "ce_loss_7": 3.717690980434418, "epoch": 0.467, "grad_norm": 692.0, "kl_loss_10": 105.74517669677735, "kl_loss_2": 1546.93232421875, "kl_loss_3": 1098.3449401855469, "kl_loss_7": 340.9761535644531, "learning_rate": 0.0005601471772258368, "loss": 783.3083, "step": 4670 }, { "ce_loss_10": 3.59833767414093, "ce_loss_13": 3.5471682786941527, "ce_loss_2": 4.274568283557892, "ce_loss_3": 4.052994883060455, "ce_loss_7": 3.7028800010681153, "epoch": 0.468, "grad_norm": 604.0, "kl_loss_10": 105.27560005187988, "kl_loss_2": 1524.3396118164062, "kl_loss_3": 1075.7982116699218, "kl_loss_7": 337.89133911132814, "learning_rate": 0.0005585717359939192, "loss": 779.969, "step": 4680 }, { "ce_loss_10": 3.51119749546051, "ce_loss_13": 3.4574978351593018, "ce_loss_2": 4.206775689125061, "ce_loss_3": 3.9793694376945496, "ce_loss_7": 3.616895890235901, "epoch": 0.469, "grad_norm": 1004.0, "kl_loss_10": 104.80028839111328, "kl_loss_2": 1555.6327880859376, "kl_loss_3": 1109.0904022216796, "kl_loss_7": 342.80506439208983, "learning_rate": 0.0005569957049452703, "loss": 791.5963, "step": 4690 }, { "ce_loss_10": 3.5696662783622743, "ce_loss_13": 3.514850342273712, "ce_loss_2": 4.275382494926452, "ce_loss_3": 4.04845141172409, "ce_loss_7": 3.680628263950348, "epoch": 0.47, "grad_norm": 748.0, "kl_loss_10": 107.4641788482666, "kl_loss_2": 1596.0020629882813, "kl_loss_3": 1136.009326171875, "kl_loss_7": 351.4793395996094, "learning_rate": 0.0005554190999505056, "loss": 793.8654, "step": 4700 }, { "ce_loss_10": 3.6935503125190734, "ce_loss_13": 3.6402706861495973, "ce_loss_2": 4.390166485309601, "ce_loss_3": 4.160398244857788, "ce_loss_7": 3.8034549593925475, "epoch": 0.471, "grad_norm": 684.0, "kl_loss_10": 108.2986988067627, "kl_loss_2": 1582.3346374511718, "kl_loss_3": 1116.2829315185547, "kl_loss_7": 351.3518981933594, "learning_rate": 0.0005538419368860196, "loss": 760.8446, "step": 4710 }, { "ce_loss_10": 3.6193637251853943, "ce_loss_13": 3.5664429545402525, "ce_loss_2": 4.308554613590241, "ce_loss_3": 4.07954374551773, "ce_loss_7": 3.726813542842865, "epoch": 0.472, "grad_norm": 620.0, "kl_loss_10": 106.51740417480468, "kl_loss_2": 1558.5083801269532, "kl_loss_3": 1099.7098052978515, "kl_loss_7": 346.3127685546875, "learning_rate": 0.0005522642316338268, "loss": 794.3575, "step": 4720 }, { "ce_loss_10": 3.622410202026367, "ce_loss_13": 3.5721071004867553, "ce_loss_2": 4.296853995323181, "ce_loss_3": 4.074155044555664, "ce_loss_7": 3.728399670124054, "epoch": 0.473, "grad_norm": 1184.0, "kl_loss_10": 106.80021095275879, "kl_loss_2": 1536.873956298828, "kl_loss_3": 1090.9999084472656, "kl_loss_7": 345.0013717651367, "learning_rate": 0.0005506860000814017, "loss": 793.2863, "step": 4730 }, { "ce_loss_10": 3.650379252433777, "ce_loss_13": 3.597243010997772, "ce_loss_2": 4.319996333122253, "ce_loss_3": 4.099277722835541, "ce_loss_7": 3.7514834880828856, "epoch": 0.474, "grad_norm": 536.0, "kl_loss_10": 105.83440933227538, "kl_loss_2": 1524.633251953125, "kl_loss_3": 1082.5695404052735, "kl_loss_7": 339.2084197998047, "learning_rate": 0.0005491072581215186, "loss": 778.0616, "step": 4740 }, { "ce_loss_10": 3.6513981580734254, "ce_loss_13": 3.594015192985535, "ce_loss_2": 4.328640329837799, "ce_loss_3": 4.103936815261841, "ce_loss_7": 3.7592919945716856, "epoch": 0.475, "grad_norm": 996.0, "kl_loss_10": 110.53365478515624, "kl_loss_2": 1569.9827392578125, "kl_loss_3": 1117.1862548828126, "kl_loss_7": 352.0670761108398, "learning_rate": 0.0005475280216520913, "loss": 767.7382, "step": 4750 }, { "ce_loss_10": 3.561974513530731, "ce_loss_13": 3.513732361793518, "ce_loss_2": 4.244001770019532, "ce_loss_3": 4.019676458835602, "ce_loss_7": 3.6699718594551087, "epoch": 0.476, "grad_norm": 848.0, "kl_loss_10": 105.23144874572753, "kl_loss_2": 1528.915057373047, "kl_loss_3": 1081.2933441162108, "kl_loss_7": 338.44948120117186, "learning_rate": 0.0005459483065760138, "loss": 786.6552, "step": 4760 }, { "ce_loss_10": 3.5006993055343627, "ce_loss_13": 3.4474108099937437, "ce_loss_2": 4.2361647367477415, "ce_loss_3": 3.9947264313697817, "ce_loss_7": 3.611408996582031, "epoch": 0.477, "grad_norm": 1008.0, "kl_loss_10": 105.65767707824708, "kl_loss_2": 1641.7184326171875, "kl_loss_3": 1150.0165161132813, "kl_loss_7": 345.60813903808594, "learning_rate": 0.0005443681288009991, "loss": 789.1808, "step": 4770 }, { "ce_loss_10": 3.5626617074012756, "ce_loss_13": 3.5092739343643187, "ce_loss_2": 4.2540900111198425, "ce_loss_3": 4.02539541721344, "ce_loss_7": 3.6686468362808227, "epoch": 0.478, "grad_norm": 640.0, "kl_loss_10": 106.88839836120606, "kl_loss_2": 1583.2138427734376, "kl_loss_3": 1119.1056030273437, "kl_loss_7": 345.27428283691404, "learning_rate": 0.0005427875042394199, "loss": 791.0654, "step": 4780 }, { "ce_loss_10": 3.58731290102005, "ce_loss_13": 3.5331693410873415, "ce_loss_2": 4.274273359775544, "ce_loss_3": 4.050940811634064, "ce_loss_7": 3.692301535606384, "epoch": 0.479, "grad_norm": 612.0, "kl_loss_10": 108.49159164428711, "kl_loss_2": 1568.2550659179688, "kl_loss_3": 1115.3421508789063, "kl_loss_7": 347.33675689697264, "learning_rate": 0.0005412064488081482, "loss": 797.266, "step": 4790 }, { "ce_loss_10": 3.5943424463272096, "ce_loss_13": 3.543016028404236, "ce_loss_2": 4.278271102905274, "ce_loss_3": 4.050240468978882, "ce_loss_7": 3.699341666698456, "epoch": 0.48, "grad_norm": 1004.0, "kl_loss_10": 104.96976432800292, "kl_loss_2": 1551.6132690429688, "kl_loss_3": 1088.9093048095704, "kl_loss_7": 340.02966156005857, "learning_rate": 0.0005396249784283942, "loss": 770.7547, "step": 4800 }, { "ce_loss_10": 3.6162082314491273, "ce_loss_13": 3.5600994348526003, "ce_loss_2": 4.32856605052948, "ce_loss_3": 4.092339861392975, "ce_loss_7": 3.72650328874588, "epoch": 0.481, "grad_norm": 748.0, "kl_loss_10": 111.72134437561036, "kl_loss_2": 1614.5575317382813, "kl_loss_3": 1138.7758422851562, "kl_loss_7": 353.9819793701172, "learning_rate": 0.0005380431090255476, "loss": 795.777, "step": 4810 }, { "ce_loss_10": 3.6053959608078, "ce_loss_13": 3.5558215737342835, "ce_loss_2": 4.272196555137635, "ce_loss_3": 4.055683016777039, "ce_loss_7": 3.7070755124092103, "epoch": 0.482, "grad_norm": 516.0, "kl_loss_10": 103.00741539001464, "kl_loss_2": 1511.6953857421875, "kl_loss_3": 1073.3668182373046, "kl_loss_7": 330.7967956542969, "learning_rate": 0.0005364608565290155, "loss": 762.3736, "step": 4820 }, { "ce_loss_10": 3.616597867012024, "ce_loss_13": 3.5615100383758547, "ce_loss_2": 4.30396535396576, "ce_loss_3": 4.079499983787537, "ce_loss_7": 3.722726809978485, "epoch": 0.483, "grad_norm": 564.0, "kl_loss_10": 108.23855934143066, "kl_loss_2": 1553.3298950195312, "kl_loss_3": 1098.51328125, "kl_loss_7": 339.9071044921875, "learning_rate": 0.0005348782368720626, "loss": 775.4738, "step": 4830 }, { "ce_loss_10": 3.540391433238983, "ce_loss_13": 3.4896615028381346, "ce_loss_2": 4.223983693122864, "ce_loss_3": 4.000096595287323, "ce_loss_7": 3.6485345482826235, "epoch": 0.484, "grad_norm": 944.0, "kl_loss_10": 103.76399879455566, "kl_loss_2": 1534.5832458496093, "kl_loss_3": 1091.8973114013672, "kl_loss_7": 337.3481048583984, "learning_rate": 0.000533295265991652, "loss": 779.6613, "step": 4840 }, { "ce_loss_10": 3.6258066058158875, "ce_loss_13": 3.574051332473755, "ce_loss_2": 4.292212641239166, "ce_loss_3": 4.077902638912201, "ce_loss_7": 3.730948543548584, "epoch": 0.485, "grad_norm": 724.0, "kl_loss_10": 105.05139617919922, "kl_loss_2": 1513.2453857421874, "kl_loss_3": 1078.8460510253906, "kl_loss_7": 340.6751480102539, "learning_rate": 0.0005317119598282822, "loss": 766.4916, "step": 4850 }, { "ce_loss_10": 3.623393952846527, "ce_loss_13": 3.5711713314056395, "ce_loss_2": 4.305029904842376, "ce_loss_3": 4.0827592253685, "ce_loss_7": 3.733406388759613, "epoch": 0.486, "grad_norm": 580.0, "kl_loss_10": 105.81864318847656, "kl_loss_2": 1536.424542236328, "kl_loss_3": 1093.581723022461, "kl_loss_7": 342.66624145507814, "learning_rate": 0.0005301283343258293, "loss": 775.7233, "step": 4860 }, { "ce_loss_10": 3.6843939900398253, "ce_loss_13": 3.6326879978179933, "ce_loss_2": 4.350576448440552, "ce_loss_3": 4.137509047985077, "ce_loss_7": 3.7914267420768737, "epoch": 0.487, "grad_norm": 964.0, "kl_loss_10": 106.32772293090821, "kl_loss_2": 1508.5073364257812, "kl_loss_3": 1082.712173461914, "kl_loss_7": 340.41368408203124, "learning_rate": 0.000528544405431384, "loss": 760.867, "step": 4870 }, { "ce_loss_10": 3.5636670708656313, "ce_loss_13": 3.510996401309967, "ce_loss_2": 4.272000575065613, "ce_loss_3": 4.048352217674255, "ce_loss_7": 3.676626420021057, "epoch": 0.488, "grad_norm": 688.0, "kl_loss_10": 107.13710670471191, "kl_loss_2": 1591.5465026855468, "kl_loss_3": 1134.915997314453, "kl_loss_7": 351.8549118041992, "learning_rate": 0.000526960189095093, "loss": 791.7934, "step": 4880 }, { "ce_loss_10": 3.5406416058540344, "ce_loss_13": 3.4884895205497743, "ce_loss_2": 4.230566167831421, "ce_loss_3": 4.003983414173126, "ce_loss_7": 3.648192596435547, "epoch": 0.489, "grad_norm": 604.0, "kl_loss_10": 103.35883827209473, "kl_loss_2": 1541.6214477539063, "kl_loss_3": 1092.1354553222657, "kl_loss_7": 339.1172836303711, "learning_rate": 0.0005253757012699972, "loss": 771.4528, "step": 4890 }, { "ce_loss_10": 3.6308435201644897, "ce_loss_13": 3.579329228401184, "ce_loss_2": 4.305352687835693, "ce_loss_3": 4.076308810710907, "ce_loss_7": 3.7364446759223937, "epoch": 0.49, "grad_norm": 840.0, "kl_loss_10": 106.25533714294434, "kl_loss_2": 1535.658624267578, "kl_loss_3": 1084.617254638672, "kl_loss_7": 340.3037139892578, "learning_rate": 0.0005237909579118712, "loss": 784.7123, "step": 4900 }, { "ce_loss_10": 3.591050755977631, "ce_loss_13": 3.537055218219757, "ce_loss_2": 4.292327558994293, "ce_loss_3": 4.060908043384552, "ce_loss_7": 3.6998765587806703, "epoch": 0.491, "grad_norm": 716.0, "kl_loss_10": 108.27464981079102, "kl_loss_2": 1587.2009521484374, "kl_loss_3": 1120.1489471435548, "kl_loss_7": 352.0192276000977, "learning_rate": 0.0005222059749790631, "loss": 788.1985, "step": 4910 }, { "ce_loss_10": 3.6566781401634216, "ce_loss_13": 3.604442524909973, "ce_loss_2": 4.306674826145172, "ce_loss_3": 4.09120432138443, "ce_loss_7": 3.7567849278450014, "epoch": 0.492, "grad_norm": 684.0, "kl_loss_10": 105.86485214233399, "kl_loss_2": 1491.6484008789062, "kl_loss_3": 1064.066244506836, "kl_loss_7": 335.6333465576172, "learning_rate": 0.0005206207684323337, "loss": 747.0295, "step": 4920 }, { "ce_loss_10": 3.6366550445556642, "ce_loss_13": 3.585684764385223, "ce_loss_2": 4.3044400930404665, "ce_loss_3": 4.086514747142791, "ce_loss_7": 3.7404402017593386, "epoch": 0.493, "grad_norm": 636.0, "kl_loss_10": 107.03595809936523, "kl_loss_2": 1534.8705261230468, "kl_loss_3": 1092.2729858398438, "kl_loss_7": 340.86955108642576, "learning_rate": 0.000519035354234695, "loss": 784.1796, "step": 4930 }, { "ce_loss_10": 3.6158014059066774, "ce_loss_13": 3.5621575117111206, "ce_loss_2": 4.300448024272919, "ce_loss_3": 4.079223501682281, "ce_loss_7": 3.726815938949585, "epoch": 0.494, "grad_norm": 768.0, "kl_loss_10": 107.37990112304688, "kl_loss_2": 1539.34794921875, "kl_loss_3": 1099.2253143310547, "kl_loss_7": 345.51239013671875, "learning_rate": 0.0005174497483512506, "loss": 763.4105, "step": 4940 }, { "ce_loss_10": 3.6578310132026672, "ce_loss_13": 3.606469249725342, "ce_loss_2": 4.325371444225311, "ce_loss_3": 4.105874109268188, "ce_loss_7": 3.762361395359039, "epoch": 0.495, "grad_norm": 568.0, "kl_loss_10": 104.33839149475098, "kl_loss_2": 1535.6225524902343, "kl_loss_3": 1090.7650207519532, "kl_loss_7": 337.1204116821289, "learning_rate": 0.0005158639667490339, "loss": 779.978, "step": 4950 }, { "ce_loss_10": 3.54868825674057, "ce_loss_13": 3.4968921065330507, "ce_loss_2": 4.242542886734009, "ce_loss_3": 4.017865908145905, "ce_loss_7": 3.6616206765174866, "epoch": 0.496, "grad_norm": 580.0, "kl_loss_10": 105.31343040466308, "kl_loss_2": 1552.54345703125, "kl_loss_3": 1100.017984008789, "kl_loss_7": 342.6055084228516, "learning_rate": 0.0005142780253968481, "loss": 773.7154, "step": 4960 }, { "ce_loss_10": 3.5054641246795653, "ce_loss_13": 3.455543577671051, "ce_loss_2": 4.183038651943207, "ce_loss_3": 3.9585731863975524, "ce_loss_7": 3.6109924793243406, "epoch": 0.497, "grad_norm": 964.0, "kl_loss_10": 101.63932495117187, "kl_loss_2": 1524.8353271484375, "kl_loss_3": 1072.442886352539, "kl_loss_7": 331.6019989013672, "learning_rate": 0.0005126919402651053, "loss": 748.1686, "step": 4970 }, { "ce_loss_10": 3.576478934288025, "ce_loss_13": 3.5233771920204164, "ce_loss_2": 4.272300028800965, "ce_loss_3": 4.0522057056427006, "ce_loss_7": 3.687915360927582, "epoch": 0.498, "grad_norm": 752.0, "kl_loss_10": 106.66744346618653, "kl_loss_2": 1553.7132690429687, "kl_loss_3": 1113.0173828125, "kl_loss_7": 343.4169952392578, "learning_rate": 0.0005111057273256647, "loss": 781.1326, "step": 4980 }, { "ce_loss_10": 3.6780145883560182, "ce_loss_13": 3.629815137386322, "ce_loss_2": 4.310672724246979, "ce_loss_3": 4.104003727436066, "ce_loss_7": 3.777252805233002, "epoch": 0.499, "grad_norm": 812.0, "kl_loss_10": 103.18383827209473, "kl_loss_2": 1450.2989929199218, "kl_loss_3": 1034.7297332763671, "kl_loss_7": 323.98138732910155, "learning_rate": 0.0005095194025516733, "loss": 741.9576, "step": 4990 }, { "ce_loss_10": 3.6002578854560854, "ce_loss_13": 3.551834559440613, "ce_loss_2": 4.26209282875061, "ce_loss_3": 4.0455299615859985, "ce_loss_7": 3.700413763523102, "epoch": 0.5, "grad_norm": 720.0, "kl_loss_10": 102.64009437561035, "kl_loss_2": 1503.5544006347657, "kl_loss_3": 1067.44873046875, "kl_loss_7": 332.5431625366211, "learning_rate": 0.000507932981917404, "loss": 780.3157, "step": 5000 }, { "ce_loss_10": 3.552625060081482, "ce_loss_13": 3.5022681593894958, "ce_loss_2": 4.272991299629211, "ce_loss_3": 4.033677089214325, "ce_loss_7": 3.664923703670502, "epoch": 0.501, "grad_norm": 956.0, "kl_loss_10": 108.40164146423339, "kl_loss_2": 1624.3978942871095, "kl_loss_3": 1136.9808227539063, "kl_loss_7": 351.3223403930664, "learning_rate": 0.0005063464813980949, "loss": 798.7978, "step": 5010 }, { "ce_loss_10": 3.5441391825675965, "ce_loss_13": 3.493256175518036, "ce_loss_2": 4.232671022415161, "ce_loss_3": 4.005011808872223, "ce_loss_7": 3.649567222595215, "epoch": 0.502, "grad_norm": 904.0, "kl_loss_10": 105.25513000488282, "kl_loss_2": 1559.7631591796876, "kl_loss_3": 1106.2904724121095, "kl_loss_7": 338.8459075927734, "learning_rate": 0.0005047599169697884, "loss": 770.6113, "step": 5020 }, { "ce_loss_10": 3.4779917359352113, "ce_loss_13": 3.426401746273041, "ce_loss_2": 4.173187339305878, "ce_loss_3": 3.939064681529999, "ce_loss_7": 3.5845581173896788, "epoch": 0.503, "grad_norm": 916.0, "kl_loss_10": 102.86986770629883, "kl_loss_2": 1549.1849548339844, "kl_loss_3": 1094.1441131591796, "kl_loss_7": 338.38169250488284, "learning_rate": 0.000503173304609171, "loss": 756.1774, "step": 5030 }, { "ce_loss_10": 3.5992946147918703, "ce_loss_13": 3.5473763704299928, "ce_loss_2": 4.2791898369789125, "ce_loss_3": 4.067893850803375, "ce_loss_7": 3.708905208110809, "epoch": 0.504, "grad_norm": 648.0, "kl_loss_10": 103.77697067260742, "kl_loss_2": 1511.68330078125, "kl_loss_3": 1086.9513244628906, "kl_loss_7": 339.56539764404295, "learning_rate": 0.0005015866602934111, "loss": 757.23, "step": 5040 }, { "ce_loss_10": 3.5694451451301576, "ce_loss_13": 3.5161996126174926, "ce_loss_2": 4.282076847553253, "ce_loss_3": 4.048467779159546, "ce_loss_7": 3.681831932067871, "epoch": 0.505, "grad_norm": 692.0, "kl_loss_10": 107.97551002502442, "kl_loss_2": 1594.2293273925782, "kl_loss_3": 1131.3639190673828, "kl_loss_7": 355.373291015625, "learning_rate": 0.0005, "loss": 783.5191, "step": 5050 }, { "ce_loss_10": 3.553804886341095, "ce_loss_13": 3.504037308692932, "ce_loss_2": 4.241030144691467, "ce_loss_3": 4.0159555077552795, "ce_loss_7": 3.660956013202667, "epoch": 0.506, "grad_norm": 852.0, "kl_loss_10": 105.30016746520997, "kl_loss_2": 1555.2715148925781, "kl_loss_3": 1101.2654724121094, "kl_loss_7": 344.75842437744143, "learning_rate": 0.0004984133397065889, "loss": 763.1373, "step": 5060 }, { "ce_loss_10": 3.5664543390274046, "ce_loss_13": 3.5147876262664797, "ce_loss_2": 4.2698539853096005, "ce_loss_3": 4.046985030174255, "ce_loss_7": 3.6759848594665527, "epoch": 0.507, "grad_norm": 756.0, "kl_loss_10": 105.64219741821289, "kl_loss_2": 1564.9867065429687, "kl_loss_3": 1114.7719696044921, "kl_loss_7": 346.48322143554685, "learning_rate": 0.0004968266953908291, "loss": 768.5187, "step": 5070 }, { "ce_loss_10": 3.606212091445923, "ce_loss_13": 3.555961561203003, "ce_loss_2": 4.2961250901222225, "ce_loss_3": 4.062407577037812, "ce_loss_7": 3.713254189491272, "epoch": 0.508, "grad_norm": 940.0, "kl_loss_10": 103.76396903991699, "kl_loss_2": 1553.2374389648437, "kl_loss_3": 1091.4473846435546, "kl_loss_7": 335.7966766357422, "learning_rate": 0.0004952400830302117, "loss": 769.8045, "step": 5080 }, { "ce_loss_10": 3.5356621026992796, "ce_loss_13": 3.4850072622299195, "ce_loss_2": 4.245317327976227, "ce_loss_3": 4.0114261388778685, "ce_loss_7": 3.6454938292503356, "epoch": 0.509, "grad_norm": 616.0, "kl_loss_10": 106.50161399841309, "kl_loss_2": 1590.15634765625, "kl_loss_3": 1122.505337524414, "kl_loss_7": 346.90687866210936, "learning_rate": 0.0004936535186019053, "loss": 774.5224, "step": 5090 }, { "ce_loss_10": 3.6339807868003846, "ce_loss_13": 3.5860685467720033, "ce_loss_2": 4.289305317401886, "ce_loss_3": 4.074518239498138, "ce_loss_7": 3.7355559706687926, "epoch": 0.51, "grad_norm": 556.0, "kl_loss_10": 102.87107429504394, "kl_loss_2": 1487.5680358886718, "kl_loss_3": 1057.1669616699219, "kl_loss_7": 328.8177856445312, "learning_rate": 0.000492067018082596, "loss": 756.9469, "step": 5100 }, { "ce_loss_10": 3.572108805179596, "ce_loss_13": 3.519754505157471, "ce_loss_2": 4.2937098979949955, "ce_loss_3": 4.057534110546112, "ce_loss_7": 3.6845625162124636, "epoch": 0.511, "grad_norm": 564.0, "kl_loss_10": 107.05563545227051, "kl_loss_2": 1601.017333984375, "kl_loss_3": 1127.8345275878905, "kl_loss_7": 346.7941864013672, "learning_rate": 0.0004904805974483267, "loss": 798.3451, "step": 5110 }, { "ce_loss_10": 3.688843047618866, "ce_loss_13": 3.632902240753174, "ce_loss_2": 4.387015056610108, "ce_loss_3": 4.16108318567276, "ce_loss_7": 3.800713813304901, "epoch": 0.512, "grad_norm": 728.0, "kl_loss_10": 111.84035453796386, "kl_loss_2": 1583.4616271972657, "kl_loss_3": 1129.3079772949218, "kl_loss_7": 359.58056182861327, "learning_rate": 0.0004888942726743353, "loss": 804.7963, "step": 5120 }, { "ce_loss_10": 3.5544611692428587, "ce_loss_13": 3.50446811914444, "ce_loss_2": 4.241513431072235, "ce_loss_3": 4.019679427146912, "ce_loss_7": 3.6642414569854735, "epoch": 0.513, "grad_norm": 748.0, "kl_loss_10": 105.32323875427247, "kl_loss_2": 1559.0365966796876, "kl_loss_3": 1099.6247528076171, "kl_loss_7": 343.92891693115234, "learning_rate": 0.0004873080597348947, "loss": 776.9712, "step": 5130 }, { "ce_loss_10": 3.4444934010505674, "ce_loss_13": 3.3938143610954286, "ce_loss_2": 4.172502040863037, "ce_loss_3": 3.926547610759735, "ce_loss_7": 3.552323853969574, "epoch": 0.514, "grad_norm": 592.0, "kl_loss_10": 104.23584251403808, "kl_loss_2": 1627.0246704101562, "kl_loss_3": 1141.180697631836, "kl_loss_7": 344.92951049804685, "learning_rate": 0.0004857219746031519, "loss": 788.8108, "step": 5140 }, { "ce_loss_10": 3.615693438053131, "ce_loss_13": 3.5642744302749634, "ce_loss_2": 4.289623641967774, "ce_loss_3": 4.0702834844589235, "ce_loss_7": 3.7196484565734864, "epoch": 0.515, "grad_norm": 648.0, "kl_loss_10": 106.10341720581054, "kl_loss_2": 1524.5794921875, "kl_loss_3": 1084.168667602539, "kl_loss_7": 340.3775100708008, "learning_rate": 0.0004841360332509663, "loss": 767.6986, "step": 5150 }, { "ce_loss_10": 3.570136833190918, "ce_loss_13": 3.5186691164970396, "ce_loss_2": 4.244175159931183, "ce_loss_3": 4.021892881393432, "ce_loss_7": 3.675465631484985, "epoch": 0.516, "grad_norm": 728.0, "kl_loss_10": 102.95111656188965, "kl_loss_2": 1518.259051513672, "kl_loss_3": 1078.3638061523438, "kl_loss_7": 335.5635925292969, "learning_rate": 0.0004825502516487497, "loss": 743.8507, "step": 5160 }, { "ce_loss_10": 3.5275537967681885, "ce_loss_13": 3.4778661847114565, "ce_loss_2": 4.231430792808533, "ce_loss_3": 4.003102886676788, "ce_loss_7": 3.6373011231422425, "epoch": 0.517, "grad_norm": 1168.0, "kl_loss_10": 106.2911434173584, "kl_loss_2": 1580.9174682617188, "kl_loss_3": 1123.9657592773438, "kl_loss_7": 344.83895721435545, "learning_rate": 0.00048096464576530507, "loss": 781.726, "step": 5170 }, { "ce_loss_10": 3.637148928642273, "ce_loss_13": 3.586077404022217, "ce_loss_2": 4.287311220169068, "ce_loss_3": 4.078454482555389, "ce_loss_7": 3.7398216009140013, "epoch": 0.518, "grad_norm": 568.0, "kl_loss_10": 105.0203483581543, "kl_loss_2": 1484.6437194824218, "kl_loss_3": 1060.8911895751953, "kl_loss_7": 333.27953643798827, "learning_rate": 0.00047937923156766646, "loss": 749.8512, "step": 5180 }, { "ce_loss_10": 3.6861174583435057, "ce_loss_13": 3.6335399627685545, "ce_loss_2": 4.327944195270538, "ce_loss_3": 4.121000564098358, "ce_loss_7": 3.788613975048065, "epoch": 0.519, "grad_norm": 664.0, "kl_loss_10": 110.57874069213867, "kl_loss_2": 1481.331884765625, "kl_loss_3": 1063.7629180908202, "kl_loss_7": 339.9734313964844, "learning_rate": 0.00047779402502093696, "loss": 755.2444, "step": 5190 }, { "ce_loss_10": 3.6503878116607664, "ce_loss_13": 3.5951317071914675, "ce_loss_2": 4.313281559944153, "ce_loss_3": 4.0941772103309635, "ce_loss_7": 3.7504095077514648, "epoch": 0.52, "grad_norm": 528.0, "kl_loss_10": 110.5813705444336, "kl_loss_2": 1506.1938903808593, "kl_loss_3": 1073.6798614501954, "kl_loss_7": 335.5557922363281, "learning_rate": 0.0004762090420881289, "loss": 766.2417, "step": 5200 }, { "ce_loss_10": 3.563931930065155, "ce_loss_13": 3.5133076310157776, "ce_loss_2": 4.235055208206177, "ce_loss_3": 4.011075031757355, "ce_loss_7": 3.6647618770599366, "epoch": 0.521, "grad_norm": 912.0, "kl_loss_10": 107.92477302551269, "kl_loss_2": 1520.8591064453126, "kl_loss_3": 1072.4250915527343, "kl_loss_7": 334.4733062744141, "learning_rate": 0.00047462429873000296, "loss": 751.0428, "step": 5210 }, { "ce_loss_10": 3.648752224445343, "ce_loss_13": 3.59717835187912, "ce_loss_2": 4.307024538516998, "ce_loss_3": 4.0925533175468445, "ce_loss_7": 3.7506918787956236, "epoch": 0.522, "grad_norm": 616.0, "kl_loss_10": 107.85691528320312, "kl_loss_2": 1505.1311645507812, "kl_loss_3": 1068.8792602539063, "kl_loss_7": 334.6901351928711, "learning_rate": 0.0004730398109049071, "loss": 756.0511, "step": 5220 }, { "ce_loss_10": 3.5776852130889893, "ce_loss_13": 3.5244672894477844, "ce_loss_2": 4.273973751068115, "ce_loss_3": 4.043230903148651, "ce_loss_7": 3.6845834374427797, "epoch": 0.523, "grad_norm": 768.0, "kl_loss_10": 108.47629928588867, "kl_loss_2": 1577.0228271484375, "kl_loss_3": 1121.7806488037108, "kl_loss_7": 345.6023803710938, "learning_rate": 0.000471455594568616, "loss": 771.5051, "step": 5230 }, { "ce_loss_10": 3.652016305923462, "ce_loss_13": 3.6008514642715452, "ce_loss_2": 4.308791708946228, "ce_loss_3": 4.091267836093903, "ce_loss_7": 3.7525286436080934, "epoch": 0.524, "grad_norm": 804.0, "kl_loss_10": 106.29882698059082, "kl_loss_2": 1493.961279296875, "kl_loss_3": 1063.3957733154298, "kl_loss_7": 332.6543563842773, "learning_rate": 0.00046987166567417086, "loss": 761.2231, "step": 5240 }, { "ce_loss_10": 3.568842327594757, "ce_loss_13": 3.5181811928749083, "ce_loss_2": 4.241319179534912, "ce_loss_3": 4.021905446052552, "ce_loss_7": 3.67096084356308, "epoch": 0.525, "grad_norm": 688.0, "kl_loss_10": 103.95141677856445, "kl_loss_2": 1515.3368713378907, "kl_loss_3": 1076.799124145508, "kl_loss_7": 332.47727966308594, "learning_rate": 0.00046828804017171776, "loss": 741.6352, "step": 5250 }, { "ce_loss_10": 3.6140376925468445, "ce_loss_13": 3.5600037693977358, "ce_loss_2": 4.304037141799927, "ce_loss_3": 4.078682374954224, "ce_loss_7": 3.720091235637665, "epoch": 0.526, "grad_norm": 684.0, "kl_loss_10": 105.49669380187989, "kl_loss_2": 1535.6353393554687, "kl_loss_3": 1088.1175109863282, "kl_loss_7": 339.3191452026367, "learning_rate": 0.00046670473400834805, "loss": 775.4123, "step": 5260 }, { "ce_loss_10": 3.5437933564186097, "ce_loss_13": 3.494081747531891, "ce_loss_2": 4.207845985889435, "ce_loss_3": 3.9850253224372865, "ce_loss_7": 3.6457874178886414, "epoch": 0.527, "grad_norm": 548.0, "kl_loss_10": 101.99541931152343, "kl_loss_2": 1503.5750671386718, "kl_loss_3": 1062.66142578125, "kl_loss_7": 327.184553527832, "learning_rate": 0.00046512176312793734, "loss": 776.9915, "step": 5270 }, { "ce_loss_10": 3.544222927093506, "ce_loss_13": 3.4924687266349794, "ce_loss_2": 4.223357951641082, "ce_loss_3": 3.9977125763893127, "ce_loss_7": 3.649840462207794, "epoch": 0.528, "grad_norm": 716.0, "kl_loss_10": 104.9525104522705, "kl_loss_2": 1547.6778991699218, "kl_loss_3": 1089.306478881836, "kl_loss_7": 336.4553619384766, "learning_rate": 0.00046353914347098467, "loss": 774.7193, "step": 5280 }, { "ce_loss_10": 3.634207582473755, "ce_loss_13": 3.5827547192573546, "ce_loss_2": 4.315836381912232, "ce_loss_3": 4.087788462638855, "ce_loss_7": 3.738226056098938, "epoch": 0.529, "grad_norm": 828.0, "kl_loss_10": 104.55474815368652, "kl_loss_2": 1531.230047607422, "kl_loss_3": 1078.3197662353516, "kl_loss_7": 332.82044830322263, "learning_rate": 0.0004619568909744524, "loss": 764.6749, "step": 5290 }, { "ce_loss_10": 3.639143466949463, "ce_loss_13": 3.5892367243766783, "ce_loss_2": 4.309708309173584, "ce_loss_3": 4.088440322875977, "ce_loss_7": 3.742543303966522, "epoch": 0.53, "grad_norm": 688.0, "kl_loss_10": 105.91220245361328, "kl_loss_2": 1505.5661682128907, "kl_loss_3": 1072.2569061279296, "kl_loss_7": 333.90283050537107, "learning_rate": 0.00046037502157160573, "loss": 767.6039, "step": 5300 }, { "ce_loss_10": 3.512180840969086, "ce_loss_13": 3.4605939388275146, "ce_loss_2": 4.199014806747437, "ce_loss_3": 3.9735213994979857, "ce_loss_7": 3.6199467301368715, "epoch": 0.531, "grad_norm": 596.0, "kl_loss_10": 104.17751197814941, "kl_loss_2": 1552.9967224121094, "kl_loss_3": 1096.7238159179688, "kl_loss_7": 341.9464736938477, "learning_rate": 0.00045879355119185207, "loss": 776.0659, "step": 5310 }, { "ce_loss_10": 3.5917591571807863, "ce_loss_13": 3.539541244506836, "ce_loss_2": 4.27906973361969, "ce_loss_3": 4.054012656211853, "ce_loss_7": 3.697227191925049, "epoch": 0.532, "grad_norm": 676.0, "kl_loss_10": 105.86441574096679, "kl_loss_2": 1568.1892883300782, "kl_loss_3": 1110.4130065917968, "kl_loss_7": 344.8864273071289, "learning_rate": 0.0004572124957605803, "loss": 782.3594, "step": 5320 }, { "ce_loss_10": 3.6141596913337706, "ce_loss_13": 3.5631175518035887, "ce_loss_2": 4.282216358184814, "ce_loss_3": 4.064068484306335, "ce_loss_7": 3.7216551423072817, "epoch": 0.533, "grad_norm": 592.0, "kl_loss_10": 104.27678909301758, "kl_loss_2": 1528.1688293457032, "kl_loss_3": 1086.5652130126953, "kl_loss_7": 337.6312454223633, "learning_rate": 0.00045563187119900103, "loss": 756.5656, "step": 5330 }, { "ce_loss_10": 3.455550253391266, "ce_loss_13": 3.408282685279846, "ce_loss_2": 4.161994159221649, "ce_loss_3": 3.9285236239433288, "ce_loss_7": 3.564248967170715, "epoch": 0.534, "grad_norm": 960.0, "kl_loss_10": 104.12380828857422, "kl_loss_2": 1579.8607116699218, "kl_loss_3": 1107.8948364257812, "kl_loss_7": 341.63795318603513, "learning_rate": 0.00045405169342398633, "loss": 777.5294, "step": 5340 }, { "ce_loss_10": 3.5446057438850405, "ce_loss_13": 3.4926030158996584, "ce_loss_2": 4.239805471897125, "ce_loss_3": 4.0091390132904055, "ce_loss_7": 3.6536974906921387, "epoch": 0.535, "grad_norm": 908.0, "kl_loss_10": 106.18468551635742, "kl_loss_2": 1558.015606689453, "kl_loss_3": 1098.1478118896484, "kl_loss_7": 339.72764282226564, "learning_rate": 0.0004524719783479088, "loss": 761.7418, "step": 5350 }, { "ce_loss_10": 3.495758616924286, "ce_loss_13": 3.444382071495056, "ce_loss_2": 4.207507169246673, "ce_loss_3": 3.9777019143104555, "ce_loss_7": 3.606581676006317, "epoch": 0.536, "grad_norm": 548.0, "kl_loss_10": 105.52358474731446, "kl_loss_2": 1597.09365234375, "kl_loss_3": 1127.7711273193358, "kl_loss_7": 345.26265716552734, "learning_rate": 0.00045089274187848144, "loss": 769.0439, "step": 5360 }, { "ce_loss_10": 3.621846556663513, "ce_loss_13": 3.5721195578575133, "ce_loss_2": 4.279402017593384, "ce_loss_3": 4.060469186305999, "ce_loss_7": 3.7197882533073425, "epoch": 0.537, "grad_norm": 1040.0, "kl_loss_10": 103.39545669555665, "kl_loss_2": 1501.6532043457032, "kl_loss_3": 1061.9325378417968, "kl_loss_7": 330.7702331542969, "learning_rate": 0.00044931399991859835, "loss": 752.2175, "step": 5370 }, { "ce_loss_10": 3.4820066690444946, "ce_loss_13": 3.4285654544830324, "ce_loss_2": 4.17064733505249, "ce_loss_3": 3.9410685777664183, "ce_loss_7": 3.5903464794158935, "epoch": 0.538, "grad_norm": 580.0, "kl_loss_10": 103.59290580749511, "kl_loss_2": 1558.2202575683593, "kl_loss_3": 1094.9960021972656, "kl_loss_7": 338.6309051513672, "learning_rate": 0.00044773576836617336, "loss": 757.4745, "step": 5380 }, { "ce_loss_10": 3.574995219707489, "ce_loss_13": 3.5227272629737856, "ce_loss_2": 4.266055810451507, "ce_loss_3": 4.046211910247803, "ce_loss_7": 3.6836758255958557, "epoch": 0.539, "grad_norm": 736.0, "kl_loss_10": 107.58810195922851, "kl_loss_2": 1572.7946472167969, "kl_loss_3": 1124.5888092041016, "kl_loss_7": 346.74962158203124, "learning_rate": 0.00044615806311398056, "loss": 789.3427, "step": 5390 }, { "ce_loss_10": 3.652173328399658, "ce_loss_13": 3.6022719621658323, "ce_loss_2": 4.2849855661392215, "ce_loss_3": 4.071276617050171, "ce_loss_7": 3.7474453926086424, "epoch": 0.54, "grad_norm": 576.0, "kl_loss_10": 103.14287910461425, "kl_loss_2": 1454.1438842773437, "kl_loss_3": 1040.240167236328, "kl_loss_7": 326.39082489013674, "learning_rate": 0.00044458090004949454, "loss": 759.9868, "step": 5400 }, { "ce_loss_10": 3.505845844745636, "ce_loss_13": 3.4532413244247437, "ce_loss_2": 4.231140959262848, "ce_loss_3": 3.9924408197402954, "ce_loss_7": 3.619277369976044, "epoch": 0.541, "grad_norm": 532.0, "kl_loss_10": 107.98148956298829, "kl_loss_2": 1636.3289367675782, "kl_loss_3": 1150.8103424072265, "kl_loss_7": 351.5084167480469, "learning_rate": 0.0004430042950547297, "loss": 777.7207, "step": 5410 }, { "ce_loss_10": 3.6032155632972716, "ce_loss_13": 3.5490991711616515, "ce_loss_2": 4.293168675899506, "ce_loss_3": 4.072460389137268, "ce_loss_7": 3.714365911483765, "epoch": 0.542, "grad_norm": 724.0, "kl_loss_10": 107.85527381896972, "kl_loss_2": 1562.5899780273437, "kl_loss_3": 1110.6490478515625, "kl_loss_7": 347.6031066894531, "learning_rate": 0.0004414282640060809, "loss": 772.3885, "step": 5420 }, { "ce_loss_10": 3.698866081237793, "ce_loss_13": 3.6463446855545043, "ce_loss_2": 4.358999896049499, "ce_loss_3": 4.147069537639618, "ce_loss_7": 3.801178812980652, "epoch": 0.543, "grad_norm": 796.0, "kl_loss_10": 105.54359436035156, "kl_loss_2": 1494.2757690429687, "kl_loss_3": 1060.0533599853516, "kl_loss_7": 333.4321685791016, "learning_rate": 0.0004398528227741633, "loss": 748.4289, "step": 5430 }, { "ce_loss_10": 3.5601810455322265, "ce_loss_13": 3.508813178539276, "ce_loss_2": 4.247533452510834, "ce_loss_3": 4.026942503452301, "ce_loss_7": 3.664951038360596, "epoch": 0.544, "grad_norm": 1024.0, "kl_loss_10": 104.4090732574463, "kl_loss_2": 1527.733642578125, "kl_loss_3": 1085.5337341308593, "kl_loss_7": 335.1481048583984, "learning_rate": 0.00043827798722365264, "loss": 772.5596, "step": 5440 }, { "ce_loss_10": 3.689334487915039, "ce_loss_13": 3.637822449207306, "ce_loss_2": 4.337388634681702, "ce_loss_3": 4.121065163612366, "ce_loss_7": 3.786122667789459, "epoch": 0.545, "grad_norm": 1000.0, "kl_loss_10": 105.49521064758301, "kl_loss_2": 1483.632830810547, "kl_loss_3": 1048.82021484375, "kl_loss_7": 330.44934844970703, "learning_rate": 0.00043670377321312535, "loss": 747.1109, "step": 5450 }, { "ce_loss_10": 3.6913806438446044, "ce_loss_13": 3.639738380908966, "ce_loss_2": 4.341848480701446, "ce_loss_3": 4.123585867881775, "ce_loss_7": 3.7898626804351805, "epoch": 0.546, "grad_norm": 852.0, "kl_loss_10": 104.70193481445312, "kl_loss_2": 1479.7732055664062, "kl_loss_3": 1051.444403076172, "kl_loss_7": 329.40140838623046, "learning_rate": 0.0004351301965948991, "loss": 758.7202, "step": 5460 }, { "ce_loss_10": 3.598298704624176, "ce_loss_13": 3.5469488382339476, "ce_loss_2": 4.248875212669373, "ce_loss_3": 4.038452315330505, "ce_loss_7": 3.6984407186508177, "epoch": 0.547, "grad_norm": 768.0, "kl_loss_10": 102.9339599609375, "kl_loss_2": 1480.8460021972655, "kl_loss_3": 1052.092123413086, "kl_loss_7": 326.97843780517576, "learning_rate": 0.000433557273214873, "loss": 755.4356, "step": 5470 }, { "ce_loss_10": 3.5880109786987306, "ce_loss_13": 3.5371466279029846, "ce_loss_2": 4.252883791923523, "ce_loss_3": 4.032021224498749, "ce_loss_7": 3.692497897148132, "epoch": 0.548, "grad_norm": 636.0, "kl_loss_10": 103.87395858764648, "kl_loss_2": 1501.017138671875, "kl_loss_3": 1058.5933807373046, "kl_loss_7": 336.77830963134767, "learning_rate": 0.000431985018912368, "loss": 747.0134, "step": 5480 }, { "ce_loss_10": 3.5538950204849242, "ce_loss_13": 3.501691222190857, "ce_loss_2": 4.248037838935852, "ce_loss_3": 4.022601127624512, "ce_loss_7": 3.6580682754516602, "epoch": 0.549, "grad_norm": 888.0, "kl_loss_10": 105.69743995666504, "kl_loss_2": 1574.1127380371095, "kl_loss_3": 1110.9820037841796, "kl_loss_7": 341.6565216064453, "learning_rate": 0.0004304134495199674, "loss": 758.753, "step": 5490 }, { "ce_loss_10": 3.581299889087677, "ce_loss_13": 3.529753065109253, "ce_loss_2": 4.272796952724457, "ce_loss_3": 4.0470798254013065, "ce_loss_7": 3.688917899131775, "epoch": 0.55, "grad_norm": 944.0, "kl_loss_10": 106.15092010498047, "kl_loss_2": 1576.0352294921875, "kl_loss_3": 1113.462698364258, "kl_loss_7": 344.3568405151367, "learning_rate": 0.0004288425808633575, "loss": 769.614, "step": 5500 }, { "ce_loss_10": 3.5521881103515627, "ce_loss_13": 3.502358639240265, "ce_loss_2": 4.233278894424439, "ce_loss_3": 4.007499969005584, "ce_loss_7": 3.6560718536376955, "epoch": 0.551, "grad_norm": 1048.0, "kl_loss_10": 103.1799186706543, "kl_loss_2": 1545.3322387695312, "kl_loss_3": 1091.1925506591797, "kl_loss_7": 336.0160339355469, "learning_rate": 0.0004272724287611684, "loss": 765.4611, "step": 5510 }, { "ce_loss_10": 3.53389288187027, "ce_loss_13": 3.48224390745163, "ce_loss_2": 4.221820151805877, "ce_loss_3": 3.9870800733566285, "ce_loss_7": 3.6366492390632628, "epoch": 0.552, "grad_norm": 676.0, "kl_loss_10": 106.44814186096191, "kl_loss_2": 1559.4843444824219, "kl_loss_3": 1095.065740966797, "kl_loss_7": 338.5904022216797, "learning_rate": 0.00042570300902481425, "loss": 767.9568, "step": 5520 }, { "ce_loss_10": 3.5621741652488708, "ce_loss_13": 3.5120548844337462, "ce_loss_2": 4.226723647117614, "ce_loss_3": 4.005667972564697, "ce_loss_7": 3.6591508269309996, "epoch": 0.553, "grad_norm": 632.0, "kl_loss_10": 103.24676780700683, "kl_loss_2": 1525.8041198730468, "kl_loss_3": 1074.1563568115234, "kl_loss_7": 333.4058364868164, "learning_rate": 0.00042413433745833423, "loss": 757.0599, "step": 5530 }, { "ce_loss_10": 3.566916048526764, "ce_loss_13": 3.514276123046875, "ce_loss_2": 4.245769345760346, "ce_loss_3": 4.025698387622834, "ce_loss_7": 3.669881451129913, "epoch": 0.554, "grad_norm": 592.0, "kl_loss_10": 109.22485809326172, "kl_loss_2": 1526.9733459472657, "kl_loss_3": 1078.5566314697267, "kl_loss_7": 336.7415344238281, "learning_rate": 0.0004225664298582339, "loss": 743.0363, "step": 5540 }, { "ce_loss_10": 3.6458821892738342, "ce_loss_13": 3.595542049407959, "ce_loss_2": 4.304706931114197, "ce_loss_3": 4.083009362220764, "ce_loss_7": 3.7487244963645936, "epoch": 0.555, "grad_norm": 700.0, "kl_loss_10": 103.35477066040039, "kl_loss_2": 1492.541973876953, "kl_loss_3": 1054.6003387451171, "kl_loss_7": 328.97535095214846, "learning_rate": 0.000420999302013325, "loss": 746.7069, "step": 5550 }, { "ce_loss_10": 3.543286621570587, "ce_loss_13": 3.4908923387527464, "ce_loss_2": 4.248111283779144, "ce_loss_3": 4.010579240322113, "ce_loss_7": 3.6510781407356263, "epoch": 0.556, "grad_norm": 588.0, "kl_loss_10": 107.93652229309082, "kl_loss_2": 1572.0495971679688, "kl_loss_3": 1100.8470733642578, "kl_loss_7": 344.99388732910154, "learning_rate": 0.000419432969704568, "loss": 761.0296, "step": 5560 }, { "ce_loss_10": 3.591177201271057, "ce_loss_13": 3.5405736446380613, "ce_loss_2": 4.254916775226593, "ce_loss_3": 4.039114046096802, "ce_loss_7": 3.695609426498413, "epoch": 0.557, "grad_norm": 596.0, "kl_loss_10": 103.91976203918458, "kl_loss_2": 1494.520928955078, "kl_loss_3": 1059.1153381347656, "kl_loss_7": 332.13660888671876, "learning_rate": 0.00041786744870491154, "loss": 771.7157, "step": 5570 }, { "ce_loss_10": 3.5246644496917723, "ce_loss_13": 3.474863862991333, "ce_loss_2": 4.209751331806183, "ce_loss_3": 3.9866259932518004, "ce_loss_7": 3.6306143283843992, "epoch": 0.558, "grad_norm": 768.0, "kl_loss_10": 107.37320899963379, "kl_loss_2": 1553.0863952636719, "kl_loss_3": 1101.115771484375, "kl_loss_7": 343.06164703369143, "learning_rate": 0.0004163027547791347, "loss": 766.0039, "step": 5580 }, { "ce_loss_10": 3.5013378858566284, "ce_loss_13": 3.4499176740646362, "ce_loss_2": 4.211779713630676, "ce_loss_3": 3.972139894962311, "ce_loss_7": 3.6102558732032777, "epoch": 0.559, "grad_norm": 676.0, "kl_loss_10": 105.24801139831543, "kl_loss_2": 1595.248468017578, "kl_loss_3": 1116.559765625, "kl_loss_7": 345.6261413574219, "learning_rate": 0.0004147389036836881, "loss": 770.9746, "step": 5590 }, { "ce_loss_10": 3.5530710458755492, "ce_loss_13": 3.502123260498047, "ce_loss_2": 4.247922241687775, "ce_loss_3": 4.015506017208099, "ce_loss_7": 3.6567649960517885, "epoch": 0.56, "grad_norm": 892.0, "kl_loss_10": 105.40846672058106, "kl_loss_2": 1570.2839782714843, "kl_loss_3": 1108.4176177978516, "kl_loss_7": 339.8567367553711, "learning_rate": 0.00041317591116653486, "loss": 781.8748, "step": 5600 }, { "ce_loss_10": 3.5930798768997194, "ce_loss_13": 3.5404407382011414, "ce_loss_2": 4.279128456115723, "ce_loss_3": 4.049571526050568, "ce_loss_7": 3.698474419116974, "epoch": 0.561, "grad_norm": 532.0, "kl_loss_10": 107.07124710083008, "kl_loss_2": 1551.74775390625, "kl_loss_3": 1085.609555053711, "kl_loss_7": 342.054150390625, "learning_rate": 0.0004116137929669921, "loss": 757.7674, "step": 5610 }, { "ce_loss_10": 3.5819862723350524, "ce_loss_13": 3.531055748462677, "ce_loss_2": 4.255003273487091, "ce_loss_3": 4.030177474021912, "ce_loss_7": 3.6875664472579954, "epoch": 0.562, "grad_norm": 740.0, "kl_loss_10": 103.05764236450196, "kl_loss_2": 1534.006573486328, "kl_loss_3": 1087.1775329589843, "kl_loss_7": 337.22252655029297, "learning_rate": 0.00041005256481557305, "loss": 755.6203, "step": 5620 }, { "ce_loss_10": 3.683738684654236, "ce_loss_13": 3.6351139307022096, "ce_loss_2": 4.3178343892097475, "ce_loss_3": 4.1045763969421385, "ce_loss_7": 3.7809927582740785, "epoch": 0.563, "grad_norm": 816.0, "kl_loss_10": 101.93090286254883, "kl_loss_2": 1447.3084106445312, "kl_loss_3": 1025.6407775878906, "kl_loss_7": 323.4442886352539, "learning_rate": 0.00040849224243382767, "loss": 738.3967, "step": 5630 }, { "ce_loss_10": 3.535522389411926, "ce_loss_13": 3.4846082448959352, "ce_loss_2": 4.214535582065582, "ce_loss_3": 3.9950142383575438, "ce_loss_7": 3.636801707744598, "epoch": 0.564, "grad_norm": 1224.0, "kl_loss_10": 103.41972465515137, "kl_loss_2": 1533.3713317871093, "kl_loss_3": 1088.9838928222657, "kl_loss_7": 338.5591110229492, "learning_rate": 0.000406932841534185, "loss": 750.5981, "step": 5640 }, { "ce_loss_10": 3.491779601573944, "ce_loss_13": 3.4413583874702454, "ce_loss_2": 4.179727625846863, "ce_loss_3": 3.9571254730224608, "ce_loss_7": 3.597375750541687, "epoch": 0.565, "grad_norm": 768.0, "kl_loss_10": 103.9033332824707, "kl_loss_2": 1554.881982421875, "kl_loss_3": 1098.8897766113282, "kl_loss_7": 337.7521911621094, "learning_rate": 0.0004053743778197951, "loss": 776.919, "step": 5650 }, { "ce_loss_10": 3.6005424857139587, "ce_loss_13": 3.5478009581565857, "ce_loss_2": 4.272884631156922, "ce_loss_3": 4.055270135402679, "ce_loss_7": 3.707489860057831, "epoch": 0.566, "grad_norm": 876.0, "kl_loss_10": 106.7627166748047, "kl_loss_2": 1512.9670288085938, "kl_loss_3": 1075.9048553466796, "kl_loss_7": 337.25047912597654, "learning_rate": 0.0004038168669843697, "loss": 768.7045, "step": 5660 }, { "ce_loss_10": 3.571461033821106, "ce_loss_13": 3.519770634174347, "ce_loss_2": 4.217774832248688, "ce_loss_3": 4.005596113204956, "ce_loss_7": 3.6682698011398314, "epoch": 0.567, "grad_norm": 716.0, "kl_loss_10": 102.6090244293213, "kl_loss_2": 1487.19619140625, "kl_loss_3": 1053.3968170166015, "kl_loss_7": 327.57488403320315, "learning_rate": 0.000402260324712026, "loss": 758.7686, "step": 5670 }, { "ce_loss_10": 3.614235782623291, "ce_loss_13": 3.562249553203583, "ce_loss_2": 4.303182423114777, "ce_loss_3": 4.071573722362518, "ce_loss_7": 3.717853009700775, "epoch": 0.568, "grad_norm": 756.0, "kl_loss_10": 103.95795211791992, "kl_loss_2": 1550.2413452148437, "kl_loss_3": 1084.43701171875, "kl_loss_7": 332.0640930175781, "learning_rate": 0.00040070476667712743, "loss": 754.9023, "step": 5680 }, { "ce_loss_10": 3.638916862010956, "ce_loss_13": 3.5876455068588258, "ce_loss_2": 4.311550414562225, "ce_loss_3": 4.082707452774048, "ce_loss_7": 3.7436187624931336, "epoch": 0.569, "grad_norm": 640.0, "kl_loss_10": 105.23985023498535, "kl_loss_2": 1516.5386047363281, "kl_loss_3": 1064.3995880126954, "kl_loss_7": 332.18102264404297, "learning_rate": 0.0003991502085441259, "loss": 762.5479, "step": 5690 }, { "ce_loss_10": 3.673640179634094, "ce_loss_13": 3.6238157987594604, "ce_loss_2": 4.310906338691711, "ce_loss_3": 4.097232127189637, "ce_loss_7": 3.769440805912018, "epoch": 0.57, "grad_norm": 640.0, "kl_loss_10": 102.09554634094238, "kl_loss_2": 1449.8062255859375, "kl_loss_3": 1024.8091369628905, "kl_loss_7": 321.06761322021487, "learning_rate": 0.0003975966659674047, "loss": 745.8081, "step": 5700 }, { "ce_loss_10": 3.637189197540283, "ce_loss_13": 3.5872881770133973, "ce_loss_2": 4.306287097930908, "ce_loss_3": 4.079818177223205, "ce_loss_7": 3.742920684814453, "epoch": 0.571, "grad_norm": 892.0, "kl_loss_10": 104.71104316711425, "kl_loss_2": 1506.985986328125, "kl_loss_3": 1061.8996337890626, "kl_loss_7": 332.19418182373045, "learning_rate": 0.0003960441545911204, "loss": 748.0883, "step": 5710 }, { "ce_loss_10": 3.6364985704421997, "ce_loss_13": 3.5860419154167174, "ce_loss_2": 4.293080842494964, "ce_loss_3": 4.082504796981811, "ce_loss_7": 3.7380078792572022, "epoch": 0.572, "grad_norm": 896.0, "kl_loss_10": 103.22016830444336, "kl_loss_2": 1503.7408264160156, "kl_loss_3": 1075.556103515625, "kl_loss_7": 332.86627502441405, "learning_rate": 0.0003944926900490452, "loss": 748.6983, "step": 5720 }, { "ce_loss_10": 3.550028681755066, "ce_loss_13": 3.4982852935791016, "ce_loss_2": 4.234983682632446, "ce_loss_3": 4.007760643959045, "ce_loss_7": 3.6566668510437013, "epoch": 0.573, "grad_norm": 640.0, "kl_loss_10": 104.12722892761231, "kl_loss_2": 1549.0814880371095, "kl_loss_3": 1094.347848510742, "kl_loss_7": 338.23940124511716, "learning_rate": 0.0003929422879644099, "loss": 753.4583, "step": 5730 }, { "ce_loss_10": 3.5504472494125365, "ce_loss_13": 3.505388391017914, "ce_loss_2": 4.2074245572090145, "ce_loss_3": 3.988253116607666, "ce_loss_7": 3.6503371596336365, "epoch": 0.574, "grad_norm": 676.0, "kl_loss_10": 101.45218963623047, "kl_loss_2": 1493.6693786621095, "kl_loss_3": 1057.2907318115235, "kl_loss_7": 325.82408752441404, "learning_rate": 0.0003913929639497462, "loss": 736.0854, "step": 5740 }, { "ce_loss_10": 3.5011925101280212, "ce_loss_13": 3.448139989376068, "ce_loss_2": 4.199755299091339, "ce_loss_3": 3.958504331111908, "ce_loss_7": 3.607280421257019, "epoch": 0.575, "grad_norm": 736.0, "kl_loss_10": 103.01125793457031, "kl_loss_2": 1556.0855895996094, "kl_loss_3": 1085.7976440429688, "kl_loss_7": 331.14860382080076, "learning_rate": 0.00038984473360672965, "loss": 747.6187, "step": 5750 }, { "ce_loss_10": 3.5171858191490175, "ce_loss_13": 3.463565707206726, "ce_loss_2": 4.207050454616547, "ce_loss_3": 3.97497478723526, "ce_loss_7": 3.619600760936737, "epoch": 0.576, "grad_norm": 620.0, "kl_loss_10": 102.24255867004395, "kl_loss_2": 1551.9175415039062, "kl_loss_3": 1087.918264770508, "kl_loss_7": 330.58592376708987, "learning_rate": 0.0003882976125260229, "loss": 747.4189, "step": 5760 }, { "ce_loss_10": 3.585590887069702, "ce_loss_13": 3.5345024347305296, "ce_loss_2": 4.256135821342468, "ce_loss_3": 4.030499756336212, "ce_loss_7": 3.6903082013130186, "epoch": 0.577, "grad_norm": 712.0, "kl_loss_10": 104.04505729675293, "kl_loss_2": 1504.3523864746094, "kl_loss_3": 1062.4952545166016, "kl_loss_7": 328.9863876342773, "learning_rate": 0.00038675161628711776, "loss": 754.4238, "step": 5770 }, { "ce_loss_10": 3.621767997741699, "ce_loss_13": 3.5723859071731567, "ce_loss_2": 4.273144292831421, "ce_loss_3": 4.058295905590057, "ce_loss_7": 3.7225655794143675, "epoch": 0.578, "grad_norm": 608.0, "kl_loss_10": 103.78015403747558, "kl_loss_2": 1474.5402099609375, "kl_loss_3": 1053.4625305175782, "kl_loss_7": 329.27289123535155, "learning_rate": 0.0003852067604581794, "loss": 764.7126, "step": 5780 }, { "ce_loss_10": 3.566964566707611, "ce_loss_13": 3.516967737674713, "ce_loss_2": 4.246767210960388, "ce_loss_3": 4.022446417808533, "ce_loss_7": 3.663704001903534, "epoch": 0.579, "grad_norm": 1088.0, "kl_loss_10": 103.47967948913575, "kl_loss_2": 1544.652392578125, "kl_loss_3": 1086.7516876220702, "kl_loss_7": 332.0967056274414, "learning_rate": 0.0003836630605958888, "loss": 755.4902, "step": 5790 }, { "ce_loss_10": 3.6213804602622988, "ce_loss_13": 3.5709073185920714, "ce_loss_2": 4.2802981495857235, "ce_loss_3": 4.062768268585205, "ce_loss_7": 3.725276529788971, "epoch": 0.58, "grad_norm": 880.0, "kl_loss_10": 103.92762908935546, "kl_loss_2": 1520.2714965820312, "kl_loss_3": 1077.9637115478515, "kl_loss_7": 334.8226593017578, "learning_rate": 0.0003821205322452863, "loss": 779.13, "step": 5800 }, { "ce_loss_10": 3.604140031337738, "ce_loss_13": 3.555801820755005, "ce_loss_2": 4.259420943260193, "ce_loss_3": 4.03878002166748, "ce_loss_7": 3.7059669733047484, "epoch": 0.581, "grad_norm": 728.0, "kl_loss_10": 102.23239822387696, "kl_loss_2": 1497.5399658203125, "kl_loss_3": 1058.6563537597656, "kl_loss_7": 328.7637573242188, "learning_rate": 0.0003805791909396155, "loss": 751.9693, "step": 5810 }, { "ce_loss_10": 3.5547424077987673, "ce_loss_13": 3.5049473524093626, "ce_loss_2": 4.218287968635559, "ce_loss_3": 4.000685131549835, "ce_loss_7": 3.6565263986587526, "epoch": 0.582, "grad_norm": 916.0, "kl_loss_10": 102.4558609008789, "kl_loss_2": 1510.7140991210938, "kl_loss_3": 1068.0064880371094, "kl_loss_7": 327.7070068359375, "learning_rate": 0.0003790390522001662, "loss": 759.3443, "step": 5820 }, { "ce_loss_10": 3.4867406725883483, "ce_loss_13": 3.438612926006317, "ce_loss_2": 4.163849830627441, "ce_loss_3": 3.934425783157349, "ce_loss_7": 3.586738383769989, "epoch": 0.583, "grad_norm": 616.0, "kl_loss_10": 100.84263534545899, "kl_loss_2": 1548.273504638672, "kl_loss_3": 1084.9116088867188, "kl_loss_7": 331.867448425293, "learning_rate": 0.0003775001315361183, "loss": 749.9862, "step": 5830 }, { "ce_loss_10": 3.603880000114441, "ce_loss_13": 3.55130854845047, "ce_loss_2": 4.276393926143646, "ce_loss_3": 4.053136312961579, "ce_loss_7": 3.7067405223846435, "epoch": 0.584, "grad_norm": 944.0, "kl_loss_10": 104.46978950500488, "kl_loss_2": 1524.7834350585938, "kl_loss_3": 1075.7860015869142, "kl_loss_7": 332.461848449707, "learning_rate": 0.0003759624444443858, "loss": 754.5517, "step": 5840 }, { "ce_loss_10": 3.6359092354774476, "ce_loss_13": 3.588266432285309, "ce_loss_2": 4.290874457359314, "ce_loss_3": 4.073269629478455, "ce_loss_7": 3.7330583333969116, "epoch": 0.585, "grad_norm": 668.0, "kl_loss_10": 103.05393180847167, "kl_loss_2": 1494.6533752441405, "kl_loss_3": 1055.1204010009765, "kl_loss_7": 324.49484100341795, "learning_rate": 0.00037442600640946044, "loss": 741.5803, "step": 5850 }, { "ce_loss_10": 3.5943995356559753, "ce_loss_13": 3.5469851016998293, "ce_loss_2": 4.244282925128937, "ce_loss_3": 4.032297527790069, "ce_loss_7": 3.6919864296913145, "epoch": 0.586, "grad_norm": 576.0, "kl_loss_10": 101.35606575012207, "kl_loss_2": 1490.3679138183593, "kl_loss_3": 1055.8772857666015, "kl_loss_7": 328.1918350219727, "learning_rate": 0.00037289083290325663, "loss": 735.9778, "step": 5860 }, { "ce_loss_10": 3.580592691898346, "ce_loss_13": 3.5297833561897276, "ce_loss_2": 4.237173223495484, "ce_loss_3": 4.018594920635223, "ce_loss_7": 3.6821051597595216, "epoch": 0.587, "grad_norm": 1080.0, "kl_loss_10": 103.70457572937012, "kl_loss_2": 1485.4027770996095, "kl_loss_3": 1049.0727416992188, "kl_loss_7": 328.60044250488284, "learning_rate": 0.0003713569393849543, "loss": 739.9794, "step": 5870 }, { "ce_loss_10": 3.62850706577301, "ce_loss_13": 3.5780181527137755, "ce_loss_2": 4.288279867172241, "ce_loss_3": 4.06753134727478, "ce_loss_7": 3.729906475543976, "epoch": 0.588, "grad_norm": 1408.0, "kl_loss_10": 103.48674354553222, "kl_loss_2": 1498.5252746582032, "kl_loss_3": 1056.2960296630858, "kl_loss_7": 328.75601654052736, "learning_rate": 0.00036982434130084397, "loss": 750.2411, "step": 5880 }, { "ce_loss_10": 3.535642158985138, "ce_loss_13": 3.4822312593460083, "ce_loss_2": 4.202389264106751, "ce_loss_3": 3.981008434295654, "ce_loss_7": 3.6408384323120115, "epoch": 0.589, "grad_norm": 680.0, "kl_loss_10": 104.4944004058838, "kl_loss_2": 1510.5988342285157, "kl_loss_3": 1067.8350799560546, "kl_loss_7": 334.65821228027346, "learning_rate": 0.00036829305408417166, "loss": 756.7639, "step": 5890 }, { "ce_loss_10": 3.5267788410186767, "ce_loss_13": 3.4754740238189696, "ce_loss_2": 4.220576870441437, "ce_loss_3": 3.9906909346580504, "ce_loss_7": 3.634098696708679, "epoch": 0.59, "grad_norm": 752.0, "kl_loss_10": 104.63858146667481, "kl_loss_2": 1559.4647521972656, "kl_loss_3": 1092.9864868164063, "kl_loss_7": 338.1507034301758, "learning_rate": 0.0003667630931549826, "loss": 759.6546, "step": 5900 }, { "ce_loss_10": 3.4906337380409242, "ce_loss_13": 3.44010591506958, "ce_loss_2": 4.205403459072113, "ce_loss_3": 3.96314742565155, "ce_loss_7": 3.602112913131714, "epoch": 0.591, "grad_norm": 1008.0, "kl_loss_10": 103.78443870544433, "kl_loss_2": 1593.700421142578, "kl_loss_3": 1107.236505126953, "kl_loss_7": 337.62986907958987, "learning_rate": 0.00036523447391996613, "loss": 770.2784, "step": 5910 }, { "ce_loss_10": 3.590332794189453, "ce_loss_13": 3.540692663192749, "ce_loss_2": 4.254353618621826, "ce_loss_3": 4.03473801612854, "ce_loss_7": 3.690076971054077, "epoch": 0.592, "grad_norm": 728.0, "kl_loss_10": 101.6016056060791, "kl_loss_2": 1490.7753051757813, "kl_loss_3": 1049.7984252929687, "kl_loss_7": 327.50398406982424, "learning_rate": 0.00036370721177230114, "loss": 740.0721, "step": 5920 }, { "ce_loss_10": 3.5800902605056764, "ce_loss_13": 3.529787373542786, "ce_loss_2": 4.259616982936859, "ce_loss_3": 4.031433463096619, "ce_loss_7": 3.687071990966797, "epoch": 0.593, "grad_norm": 916.0, "kl_loss_10": 103.60996360778809, "kl_loss_2": 1530.73017578125, "kl_loss_3": 1081.6857879638671, "kl_loss_7": 335.4696014404297, "learning_rate": 0.00036218132209150044, "loss": 757.1418, "step": 5930 }, { "ce_loss_10": 3.5374584794044495, "ce_loss_13": 3.48268039226532, "ce_loss_2": 4.253190016746521, "ce_loss_3": 4.017039823532104, "ce_loss_7": 3.6513925194740295, "epoch": 0.594, "grad_norm": 808.0, "kl_loss_10": 107.17097396850586, "kl_loss_2": 1602.049609375, "kl_loss_3": 1123.8143005371094, "kl_loss_7": 346.45857391357424, "learning_rate": 0.0003606568202432562, "loss": 772.6543, "step": 5940 }, { "ce_loss_10": 3.6087950468063354, "ce_loss_13": 3.557066786289215, "ce_loss_2": 4.300846183300019, "ce_loss_3": 4.069302177429199, "ce_loss_7": 3.711429476737976, "epoch": 0.595, "grad_norm": 944.0, "kl_loss_10": 104.97148094177246, "kl_loss_2": 1563.99765625, "kl_loss_3": 1098.8528839111327, "kl_loss_7": 337.7842132568359, "learning_rate": 0.0003591337215792851, "loss": 757.7949, "step": 5950 }, { "ce_loss_10": 3.6505517840385435, "ce_loss_13": 3.6006023406982424, "ce_loss_2": 4.288039410114289, "ce_loss_3": 4.073378145694733, "ce_loss_7": 3.749862039089203, "epoch": 0.596, "grad_norm": 836.0, "kl_loss_10": 102.86605949401856, "kl_loss_2": 1478.939794921875, "kl_loss_3": 1045.490380859375, "kl_loss_7": 327.3226348876953, "learning_rate": 0.00035761204143717383, "loss": 752.927, "step": 5960 }, { "ce_loss_10": 3.6058494210243226, "ce_loss_13": 3.553228962421417, "ce_loss_2": 4.270123815536499, "ce_loss_3": 4.054256749153137, "ce_loss_7": 3.7049681901931764, "epoch": 0.597, "grad_norm": 680.0, "kl_loss_10": 103.65262145996094, "kl_loss_2": 1521.8378662109376, "kl_loss_3": 1072.9578399658203, "kl_loss_7": 329.09585723876955, "learning_rate": 0.0003560917951402245, "loss": 774.4003, "step": 5970 }, { "ce_loss_10": 3.5767219185829164, "ce_loss_13": 3.5293787360191344, "ce_loss_2": 4.2343804955482485, "ce_loss_3": 4.020289719104767, "ce_loss_7": 3.677654707431793, "epoch": 0.598, "grad_norm": 596.0, "kl_loss_10": 101.6404468536377, "kl_loss_2": 1511.8633422851562, "kl_loss_3": 1067.9173767089844, "kl_loss_7": 328.18029022216797, "learning_rate": 0.00035457299799730046, "loss": 750.287, "step": 5980 }, { "ce_loss_10": 3.644188916683197, "ce_loss_13": 3.5941228747367857, "ce_loss_2": 4.301962375640869, "ce_loss_3": 4.082816529273987, "ce_loss_7": 3.747548019886017, "epoch": 0.599, "grad_norm": 740.0, "kl_loss_10": 102.16660385131836, "kl_loss_2": 1489.9934936523437, "kl_loss_3": 1049.2652069091796, "kl_loss_7": 329.72974548339846, "learning_rate": 0.0003530556653026721, "loss": 754.5862, "step": 5990 }, { "ce_loss_10": 3.558954632282257, "ce_loss_13": 3.5082098126411436, "ce_loss_2": 4.23734233379364, "ce_loss_3": 3.9969574928283693, "ce_loss_7": 3.657862997055054, "epoch": 0.6, "grad_norm": 1232.0, "kl_loss_10": 101.46609268188476, "kl_loss_2": 1543.7959716796875, "kl_loss_3": 1060.671484375, "kl_loss_7": 325.714404296875, "learning_rate": 0.00035153981233586274, "loss": 761.5355, "step": 6000 }, { "ce_loss_10": 3.5352521777153014, "ce_loss_13": 3.4850243449211122, "ce_loss_2": 4.208903896808624, "ce_loss_3": 3.9893046498298643, "ce_loss_7": 3.638095939159393, "epoch": 0.601, "grad_norm": 788.0, "kl_loss_10": 102.83674011230468, "kl_loss_2": 1520.490460205078, "kl_loss_3": 1075.1412994384766, "kl_loss_7": 330.4532958984375, "learning_rate": 0.00035002545436149473, "loss": 775.2623, "step": 6010 }, { "ce_loss_10": 3.5446203351020813, "ce_loss_13": 3.4925349354743958, "ce_loss_2": 4.229444015026092, "ce_loss_3": 4.005681085586548, "ce_loss_7": 3.6492652177810667, "epoch": 0.602, "grad_norm": 720.0, "kl_loss_10": 105.86748886108398, "kl_loss_2": 1544.3044860839843, "kl_loss_3": 1094.7579620361328, "kl_loss_7": 335.57518157958987, "learning_rate": 0.0003485126066291364, "loss": 750.8229, "step": 6020 }, { "ce_loss_10": 3.5943541407585142, "ce_loss_13": 3.545217454433441, "ce_loss_2": 4.270927572250367, "ce_loss_3": 4.049900913238526, "ce_loss_7": 3.696830666065216, "epoch": 0.603, "grad_norm": 556.0, "kl_loss_10": 102.35104598999024, "kl_loss_2": 1526.0286010742188, "kl_loss_3": 1079.2010833740235, "kl_loss_7": 329.24956665039065, "learning_rate": 0.0003470012843731476, "loss": 759.8646, "step": 6030 }, { "ce_loss_10": 3.5351796746253967, "ce_loss_13": 3.4840824246406554, "ce_loss_2": 4.2191991925239565, "ce_loss_3": 3.988502836227417, "ce_loss_7": 3.6364561796188353, "epoch": 0.604, "grad_norm": 716.0, "kl_loss_10": 102.13497428894043, "kl_loss_2": 1543.3246276855468, "kl_loss_3": 1082.2652648925782, "kl_loss_7": 328.9899307250977, "learning_rate": 0.00034549150281252633, "loss": 772.0707, "step": 6040 }, { "ce_loss_10": 3.5091880321502686, "ce_loss_13": 3.4613110542297365, "ce_loss_2": 4.184108030796051, "ce_loss_3": 3.9643590211868287, "ce_loss_7": 3.6145803332328796, "epoch": 0.605, "grad_norm": 732.0, "kl_loss_10": 102.22412796020508, "kl_loss_2": 1504.8231323242187, "kl_loss_3": 1063.2692016601563, "kl_loss_7": 328.2981216430664, "learning_rate": 0.0003439832771507565, "loss": 745.4139, "step": 6050 }, { "ce_loss_10": 3.5193559169769286, "ce_loss_13": 3.468503212928772, "ce_loss_2": 4.194398546218872, "ce_loss_3": 3.9736214876174927, "ce_loss_7": 3.6221352577209474, "epoch": 0.606, "grad_norm": 712.0, "kl_loss_10": 102.24978408813476, "kl_loss_2": 1532.1601928710938, "kl_loss_3": 1086.0745056152343, "kl_loss_7": 332.0151031494141, "learning_rate": 0.0003424766225756537, "loss": 750.1569, "step": 6060 }, { "ce_loss_10": 3.5759162306785583, "ce_loss_13": 3.5256246089935304, "ce_loss_2": 4.245574676990509, "ce_loss_3": 4.0260990262031555, "ce_loss_7": 3.6784772634506226, "epoch": 0.607, "grad_norm": 752.0, "kl_loss_10": 103.50673561096191, "kl_loss_2": 1508.47099609375, "kl_loss_3": 1060.6993072509765, "kl_loss_7": 330.906965637207, "learning_rate": 0.00034097155425921255, "loss": 742.3438, "step": 6070 }, { "ce_loss_10": 3.46923348903656, "ce_loss_13": 3.415601706504822, "ce_loss_2": 4.156315732002258, "ce_loss_3": 3.925130546092987, "ce_loss_7": 3.57115318775177, "epoch": 0.608, "grad_norm": 560.0, "kl_loss_10": 103.56741981506347, "kl_loss_2": 1545.499249267578, "kl_loss_3": 1075.9679229736328, "kl_loss_7": 332.6068481445312, "learning_rate": 0.0003394680873574546, "loss": 751.0478, "step": 6080 }, { "ce_loss_10": 3.5853955030441282, "ce_loss_13": 3.5316200256347656, "ce_loss_2": 4.260335302352905, "ce_loss_3": 4.0361177802085875, "ce_loss_7": 3.6886603474617004, "epoch": 0.609, "grad_norm": 864.0, "kl_loss_10": 104.27139320373536, "kl_loss_2": 1534.9214965820313, "kl_loss_3": 1079.5093170166015, "kl_loss_7": 331.8132049560547, "learning_rate": 0.0003379662370102747, "loss": 748.6225, "step": 6090 }, { "ce_loss_10": 3.58959846496582, "ce_loss_13": 3.541385018825531, "ce_loss_2": 4.240854179859161, "ce_loss_3": 4.028041577339172, "ce_loss_7": 3.6898964762687685, "epoch": 0.61, "grad_norm": 600.0, "kl_loss_10": 102.2036735534668, "kl_loss_2": 1498.26982421875, "kl_loss_3": 1065.2218231201173, "kl_loss_7": 329.87906646728516, "learning_rate": 0.0003364660183412892, "loss": 752.2733, "step": 6100 }, { "ce_loss_10": 3.571301484107971, "ce_loss_13": 3.521631419658661, "ce_loss_2": 4.230020773410797, "ce_loss_3": 4.008902907371521, "ce_loss_7": 3.670923388004303, "epoch": 0.611, "grad_norm": 628.0, "kl_loss_10": 102.84948997497558, "kl_loss_2": 1506.0473693847657, "kl_loss_3": 1060.877734375, "kl_loss_7": 331.68453521728514, "learning_rate": 0.0003349674464576834, "loss": 758.8906, "step": 6110 }, { "ce_loss_10": 3.5140294313430784, "ce_loss_13": 3.4644438147544863, "ce_loss_2": 4.19958119392395, "ce_loss_3": 3.965699815750122, "ce_loss_7": 3.616783881187439, "epoch": 0.612, "grad_norm": 844.0, "kl_loss_10": 103.29110069274903, "kl_loss_2": 1543.1746948242187, "kl_loss_3": 1072.8209136962892, "kl_loss_7": 330.17193908691405, "learning_rate": 0.00033347053645005966, "loss": 742.2323, "step": 6120 }, { "ce_loss_10": 3.6327236175537108, "ce_loss_13": 3.582784104347229, "ce_loss_2": 4.282118928432465, "ce_loss_3": 4.0711610317230225, "ce_loss_7": 3.7321339011192323, "epoch": 0.613, "grad_norm": 740.0, "kl_loss_10": 102.02607955932618, "kl_loss_2": 1465.8333862304687, "kl_loss_3": 1045.488787841797, "kl_loss_7": 324.58868103027345, "learning_rate": 0.00033197530339228485, "loss": 751.7893, "step": 6130 }, { "ce_loss_10": 3.586853337287903, "ce_loss_13": 3.5358680963516234, "ce_loss_2": 4.25110250711441, "ce_loss_3": 4.034108889102936, "ce_loss_7": 3.6896887183189393, "epoch": 0.614, "grad_norm": 564.0, "kl_loss_10": 104.10573081970215, "kl_loss_2": 1511.268865966797, "kl_loss_3": 1067.6672302246093, "kl_loss_7": 332.1710968017578, "learning_rate": 0.00033048176234133967, "loss": 749.2161, "step": 6140 }, { "ce_loss_10": 3.5742902636528013, "ce_loss_13": 3.5228272914886474, "ce_loss_2": 4.230352890491486, "ce_loss_3": 4.015736556053161, "ce_loss_7": 3.6750436544418337, "epoch": 0.615, "grad_norm": 860.0, "kl_loss_10": 103.39159049987794, "kl_loss_2": 1498.0490417480469, "kl_loss_3": 1063.913623046875, "kl_loss_7": 331.5521179199219, "learning_rate": 0.0003289899283371657, "loss": 754.5307, "step": 6150 }, { "ce_loss_10": 3.5957746744155883, "ce_loss_13": 3.544519376754761, "ce_loss_2": 4.269026684761047, "ce_loss_3": 4.042446756362915, "ce_loss_7": 3.699190843105316, "epoch": 0.616, "grad_norm": 828.0, "kl_loss_10": 102.69770431518555, "kl_loss_2": 1511.92548828125, "kl_loss_3": 1061.2057678222657, "kl_loss_7": 326.47308044433595, "learning_rate": 0.0003274998164025148, "loss": 761.5789, "step": 6160 }, { "ce_loss_10": 3.6271970868110657, "ce_loss_13": 3.5771484851837156, "ce_loss_2": 4.28127703666687, "ce_loss_3": 4.068797588348389, "ce_loss_7": 3.725817549228668, "epoch": 0.617, "grad_norm": 816.0, "kl_loss_10": 104.18108444213867, "kl_loss_2": 1495.4690368652343, "kl_loss_3": 1067.8161834716798, "kl_loss_7": 330.9167999267578, "learning_rate": 0.0003260114415427975, "loss": 764.954, "step": 6170 }, { "ce_loss_10": 3.542770063877106, "ce_loss_13": 3.4941726207733153, "ce_loss_2": 4.222906470298767, "ce_loss_3": 3.9978802680969237, "ce_loss_7": 3.6469863176345827, "epoch": 0.618, "grad_norm": 608.0, "kl_loss_10": 102.77635269165039, "kl_loss_2": 1539.7477722167969, "kl_loss_3": 1073.7549041748048, "kl_loss_7": 329.3814010620117, "learning_rate": 0.0003245248187459323, "loss": 770.941, "step": 6180 }, { "ce_loss_10": 3.5339240074157714, "ce_loss_13": 3.4864890456199644, "ce_loss_2": 4.179947829246521, "ce_loss_3": 3.9647281408309936, "ce_loss_7": 3.6298912167549133, "epoch": 0.619, "grad_norm": 700.0, "kl_loss_10": 99.105961227417, "kl_loss_2": 1474.9824157714843, "kl_loss_3": 1042.0771240234376, "kl_loss_7": 320.1064682006836, "learning_rate": 0.00032303996298219416, "loss": 738.6812, "step": 6190 }, { "ce_loss_10": 3.622859275341034, "ce_loss_13": 3.569678175449371, "ce_loss_2": 4.2641567349433895, "ce_loss_3": 4.05250461101532, "ce_loss_7": 3.7192259788513184, "epoch": 0.62, "grad_norm": 600.0, "kl_loss_10": 101.88480224609376, "kl_loss_2": 1460.370428466797, "kl_loss_3": 1037.2598937988282, "kl_loss_7": 323.563525390625, "learning_rate": 0.00032155688920406414, "loss": 734.9562, "step": 6200 }, { "ce_loss_10": 3.5330086827278135, "ce_loss_13": 3.479928433895111, "ce_loss_2": 4.232482516765595, "ce_loss_3": 3.9923107981681825, "ce_loss_7": 3.6350372195243836, "epoch": 0.621, "grad_norm": 712.0, "kl_loss_10": 105.51962699890137, "kl_loss_2": 1560.9213806152343, "kl_loss_3": 1088.901821899414, "kl_loss_7": 332.6898712158203, "learning_rate": 0.0003200756123460788, "loss": 778.1945, "step": 6210 }, { "ce_loss_10": 3.5616023778915404, "ce_loss_13": 3.5096312880516054, "ce_loss_2": 4.24862129688263, "ce_loss_3": 4.016729807853698, "ce_loss_7": 3.667115581035614, "epoch": 0.622, "grad_norm": 856.0, "kl_loss_10": 104.9896369934082, "kl_loss_2": 1552.094677734375, "kl_loss_3": 1088.4412170410155, "kl_loss_7": 337.5470367431641, "learning_rate": 0.00031859614732467957, "loss": 770.0535, "step": 6220 }, { "ce_loss_10": 3.612150752544403, "ce_loss_13": 3.561730885505676, "ce_loss_2": 4.268787050247193, "ce_loss_3": 4.049095714092255, "ce_loss_7": 3.7150032997131346, "epoch": 0.623, "grad_norm": 680.0, "kl_loss_10": 102.05799598693848, "kl_loss_2": 1480.0767211914062, "kl_loss_3": 1039.762841796875, "kl_loss_7": 322.03365631103514, "learning_rate": 0.00031711850903806275, "loss": 740.9797, "step": 6230 }, { "ce_loss_10": 3.5173429012298585, "ce_loss_13": 3.4667624950408937, "ce_loss_2": 4.197360372543335, "ce_loss_3": 3.973633337020874, "ce_loss_7": 3.6220384359359743, "epoch": 0.624, "grad_norm": 752.0, "kl_loss_10": 104.92872314453125, "kl_loss_2": 1550.9642578125, "kl_loss_3": 1093.791519165039, "kl_loss_7": 337.9700302124023, "learning_rate": 0.0003156427123660297, "loss": 753.9351, "step": 6240 }, { "ce_loss_10": 3.608515167236328, "ce_loss_13": 3.558167338371277, "ce_loss_2": 4.25784957408905, "ce_loss_3": 4.047843027114868, "ce_loss_7": 3.711010181903839, "epoch": 0.625, "grad_norm": 656.0, "kl_loss_10": 102.28406639099121, "kl_loss_2": 1481.6047119140626, "kl_loss_3": 1050.207046508789, "kl_loss_7": 326.46583557128906, "learning_rate": 0.0003141687721698363, "loss": 755.7682, "step": 6250 }, { "ce_loss_10": 3.576554238796234, "ce_loss_13": 3.5290865659713746, "ce_loss_2": 4.214070570468903, "ce_loss_3": 4.000980746746063, "ce_loss_7": 3.6730608105659486, "epoch": 0.626, "grad_norm": 712.0, "kl_loss_10": 99.44272041320801, "kl_loss_2": 1449.5838439941406, "kl_loss_3": 1020.5220001220703, "kl_loss_7": 315.5764907836914, "learning_rate": 0.00031269670329204396, "loss": 737.7984, "step": 6260 }, { "ce_loss_10": 3.6150862336158753, "ce_loss_13": 3.5658503651618956, "ce_loss_2": 4.2533419847488405, "ce_loss_3": 4.047639870643616, "ce_loss_7": 3.712856340408325, "epoch": 0.627, "grad_norm": 752.0, "kl_loss_10": 103.68559112548829, "kl_loss_2": 1467.6802612304687, "kl_loss_3": 1043.7154693603516, "kl_loss_7": 325.4422317504883, "learning_rate": 0.00031122652055637015, "loss": 745.9129, "step": 6270 }, { "ce_loss_10": 3.576836359500885, "ce_loss_13": 3.528142845630646, "ce_loss_2": 4.248085951805114, "ce_loss_3": 4.019692587852478, "ce_loss_7": 3.68005256652832, "epoch": 0.628, "grad_norm": 596.0, "kl_loss_10": 102.96378440856934, "kl_loss_2": 1535.2899108886718, "kl_loss_3": 1079.1783569335937, "kl_loss_7": 331.52378845214844, "learning_rate": 0.0003097582387675385, "loss": 744.3274, "step": 6280 }, { "ce_loss_10": 3.6145308136940004, "ce_loss_13": 3.5663092136383057, "ce_loss_2": 4.272367560863495, "ce_loss_3": 4.058292520046234, "ce_loss_7": 3.717595374584198, "epoch": 0.629, "grad_norm": 912.0, "kl_loss_10": 102.35868263244629, "kl_loss_2": 1503.147149658203, "kl_loss_3": 1062.844760131836, "kl_loss_7": 329.4593734741211, "learning_rate": 0.00030829187271113034, "loss": 742.6113, "step": 6290 }, { "ce_loss_10": 3.6142046570777895, "ce_loss_13": 3.5650119185447693, "ce_loss_2": 4.267242801189423, "ce_loss_3": 4.045838582515716, "ce_loss_7": 3.710309457778931, "epoch": 0.63, "grad_norm": 1144.0, "kl_loss_10": 100.94548225402832, "kl_loss_2": 1466.1372375488281, "kl_loss_3": 1033.335122680664, "kl_loss_7": 319.0108947753906, "learning_rate": 0.00030682743715343565, "loss": 747.3359, "step": 6300 }, { "ce_loss_10": 3.5611066222190857, "ce_loss_13": 3.508115231990814, "ce_loss_2": 4.23502197265625, "ce_loss_3": 4.01511150598526, "ce_loss_7": 3.6634032130241394, "epoch": 0.631, "grad_norm": 596.0, "kl_loss_10": 104.99801979064941, "kl_loss_2": 1507.1676513671875, "kl_loss_3": 1065.9459564208985, "kl_loss_7": 333.0229888916016, "learning_rate": 0.0003053649468413043, "loss": 757.1045, "step": 6310 }, { "ce_loss_10": 3.673084056377411, "ce_loss_13": 3.6206674337387086, "ce_loss_2": 4.328990340232849, "ce_loss_3": 4.112777829170227, "ce_loss_7": 3.7753253817558288, "epoch": 0.632, "grad_norm": 960.0, "kl_loss_10": 103.74627227783203, "kl_loss_2": 1491.32626953125, "kl_loss_3": 1059.1556274414063, "kl_loss_7": 329.95042419433594, "learning_rate": 0.00030390441650199725, "loss": 740.6209, "step": 6320 }, { "ce_loss_10": 3.567064619064331, "ce_loss_13": 3.517108714580536, "ce_loss_2": 4.224661326408386, "ce_loss_3": 4.008377110958099, "ce_loss_7": 3.668283534049988, "epoch": 0.633, "grad_norm": 672.0, "kl_loss_10": 101.21994132995606, "kl_loss_2": 1481.4445922851562, "kl_loss_3": 1047.4256591796875, "kl_loss_7": 326.64634399414064, "learning_rate": 0.00030244586084303903, "loss": 736.5036, "step": 6330 }, { "ce_loss_10": 3.537699282169342, "ce_loss_13": 3.4859857678413393, "ce_loss_2": 4.215117824077606, "ce_loss_3": 3.9961795091629027, "ce_loss_7": 3.6404967188835142, "epoch": 0.634, "grad_norm": 668.0, "kl_loss_10": 104.18267974853515, "kl_loss_2": 1545.0354919433594, "kl_loss_3": 1103.3633178710938, "kl_loss_7": 336.44725646972654, "learning_rate": 0.00030098929455206903, "loss": 750.3958, "step": 6340 }, { "ce_loss_10": 3.545027530193329, "ce_loss_13": 3.494958758354187, "ce_loss_2": 4.212705898284912, "ce_loss_3": 3.9828644752502442, "ce_loss_7": 3.647375166416168, "epoch": 0.635, "grad_norm": 876.0, "kl_loss_10": 100.94892578125, "kl_loss_2": 1526.2840881347656, "kl_loss_3": 1067.8353332519532, "kl_loss_7": 329.84326629638673, "learning_rate": 0.00029953473229669324, "loss": 768.8955, "step": 6350 }, { "ce_loss_10": 3.568509006500244, "ce_loss_13": 3.520020866394043, "ce_loss_2": 4.239533865451813, "ce_loss_3": 4.020994126796722, "ce_loss_7": 3.6722696304321287, "epoch": 0.636, "grad_norm": 576.0, "kl_loss_10": 100.92936706542969, "kl_loss_2": 1501.6828979492188, "kl_loss_3": 1067.7843322753906, "kl_loss_7": 331.0703842163086, "learning_rate": 0.00029808218872433767, "loss": 743.1559, "step": 6360 }, { "ce_loss_10": 3.6348137736320494, "ce_loss_13": 3.5856575846672056, "ce_loss_2": 4.28603435754776, "ce_loss_3": 4.065227198600769, "ce_loss_7": 3.7360589027404787, "epoch": 0.637, "grad_norm": 584.0, "kl_loss_10": 101.93559303283692, "kl_loss_2": 1483.5072509765625, "kl_loss_3": 1040.7521209716797, "kl_loss_7": 326.0771286010742, "learning_rate": 0.0002966316784621, "loss": 735.2354, "step": 6370 }, { "ce_loss_10": 3.5472820162773133, "ce_loss_13": 3.497290515899658, "ce_loss_2": 4.236545002460479, "ce_loss_3": 4.0091285586357115, "ce_loss_7": 3.6542338490486146, "epoch": 0.638, "grad_norm": 792.0, "kl_loss_10": 103.12975769042968, "kl_loss_2": 1543.3537231445312, "kl_loss_3": 1089.6312896728516, "kl_loss_7": 337.1376647949219, "learning_rate": 0.0002951832161166024, "loss": 746.8385, "step": 6380 }, { "ce_loss_10": 3.6292296767234804, "ce_loss_13": 3.576380527019501, "ce_loss_2": 4.283578586578369, "ce_loss_3": 4.0701348900794985, "ce_loss_7": 3.727100098133087, "epoch": 0.639, "grad_norm": 736.0, "kl_loss_10": 104.15792579650879, "kl_loss_2": 1477.9153625488282, "kl_loss_3": 1053.1480743408204, "kl_loss_7": 329.2735260009766, "learning_rate": 0.0002937368162738445, "loss": 732.7099, "step": 6390 }, { "ce_loss_10": 3.5623428225517273, "ce_loss_13": 3.5153833866119384, "ce_loss_2": 4.209477055072784, "ce_loss_3": 3.9933662891387938, "ce_loss_7": 3.6614410996437075, "epoch": 0.64, "grad_norm": 804.0, "kl_loss_10": 98.37516212463379, "kl_loss_2": 1481.9513061523437, "kl_loss_3": 1049.031494140625, "kl_loss_7": 322.0158157348633, "learning_rate": 0.0002922924934990568, "loss": 745.6221, "step": 6400 }, { "ce_loss_10": 3.5046188950538637, "ce_loss_13": 3.454220485687256, "ce_loss_2": 4.192747735977173, "ce_loss_3": 3.965851306915283, "ce_loss_7": 3.610697627067566, "epoch": 0.641, "grad_norm": 592.0, "kl_loss_10": 101.77973136901855, "kl_loss_2": 1557.1741027832031, "kl_loss_3": 1096.1894592285157, "kl_loss_7": 332.9794982910156, "learning_rate": 0.0002908502623365536, "loss": 755.7854, "step": 6410 }, { "ce_loss_10": 3.4388458132743835, "ce_loss_13": 3.388948106765747, "ce_loss_2": 4.131116008758545, "ce_loss_3": 3.899718451499939, "ce_loss_7": 3.5454453110694883, "epoch": 0.642, "grad_norm": 640.0, "kl_loss_10": 101.07619361877441, "kl_loss_2": 1556.9548217773438, "kl_loss_3": 1088.1022003173828, "kl_loss_7": 331.88790740966795, "learning_rate": 0.0002894101373095867, "loss": 754.8264, "step": 6420 }, { "ce_loss_10": 3.6482159018516542, "ce_loss_13": 3.5989360451698302, "ce_loss_2": 4.3026307106018065, "ce_loss_3": 4.088621973991394, "ce_loss_7": 3.7486623406410216, "epoch": 0.643, "grad_norm": 808.0, "kl_loss_10": 103.97106246948242, "kl_loss_2": 1480.7304565429688, "kl_loss_3": 1058.295590209961, "kl_loss_7": 330.6110534667969, "learning_rate": 0.00028797213292019926, "loss": 746.0376, "step": 6430 }, { "ce_loss_10": 3.627279353141785, "ce_loss_13": 3.576305639743805, "ce_loss_2": 4.28672604560852, "ce_loss_3": 4.0649741888046265, "ce_loss_7": 3.72651948928833, "epoch": 0.644, "grad_norm": 804.0, "kl_loss_10": 102.7659896850586, "kl_loss_2": 1495.4775634765624, "kl_loss_3": 1054.7227661132813, "kl_loss_7": 328.53303985595704, "learning_rate": 0.0002865362636490791, "loss": 757.1003, "step": 6440 }, { "ce_loss_10": 3.639008581638336, "ce_loss_13": 3.593253993988037, "ce_loss_2": 4.295189774036407, "ce_loss_3": 4.074914503097534, "ce_loss_7": 3.739544117450714, "epoch": 0.645, "grad_norm": 680.0, "kl_loss_10": 101.32575569152831, "kl_loss_2": 1499.6353271484375, "kl_loss_3": 1054.7485656738281, "kl_loss_7": 325.29419860839846, "learning_rate": 0.0002851025439554142, "loss": 739.9297, "step": 6450 }, { "ce_loss_10": 3.6304724335670473, "ce_loss_13": 3.5810503482818605, "ce_loss_2": 4.282529997825622, "ce_loss_3": 4.079386639595032, "ce_loss_7": 3.7343773007392884, "epoch": 0.646, "grad_norm": 620.0, "kl_loss_10": 102.98503303527832, "kl_loss_2": 1475.7454650878906, "kl_loss_3": 1062.6621795654296, "kl_loss_7": 329.3671646118164, "learning_rate": 0.00028367098827674573, "loss": 739.7644, "step": 6460 }, { "ce_loss_10": 3.5564152240753173, "ce_loss_13": 3.505023491382599, "ce_loss_2": 4.215067052841187, "ce_loss_3": 3.9929473996162415, "ce_loss_7": 3.6544344305992125, "epoch": 0.647, "grad_norm": 632.0, "kl_loss_10": 101.028609085083, "kl_loss_2": 1488.8440002441407, "kl_loss_3": 1046.5376190185548, "kl_loss_7": 319.807470703125, "learning_rate": 0.00028224161102882397, "loss": 742.6878, "step": 6470 }, { "ce_loss_10": 3.532797598838806, "ce_loss_13": 3.485309863090515, "ce_loss_2": 4.181655299663544, "ce_loss_3": 3.973689341545105, "ce_loss_7": 3.633597362041473, "epoch": 0.648, "grad_norm": 1096.0, "kl_loss_10": 100.66843681335449, "kl_loss_2": 1466.3800354003906, "kl_loss_3": 1043.9051361083984, "kl_loss_7": 321.4422241210938, "learning_rate": 0.00028081442660546124, "loss": 741.9064, "step": 6480 }, { "ce_loss_10": 3.5990182161331177, "ce_loss_13": 3.5488809823989866, "ce_loss_2": 4.2509907484054565, "ce_loss_3": 4.034446978569031, "ce_loss_7": 3.6986751317977906, "epoch": 0.649, "grad_norm": 740.0, "kl_loss_10": 103.26073913574218, "kl_loss_2": 1486.8421569824218, "kl_loss_3": 1046.1349822998047, "kl_loss_7": 325.41282043457034, "learning_rate": 0.0002793894493783892, "loss": 745.4766, "step": 6490 }, { "ce_loss_10": 3.6137735843658447, "ce_loss_13": 3.5677531242370604, "ce_loss_2": 4.263865661621094, "ce_loss_3": 4.04360601902008, "ce_loss_7": 3.7102641820907594, "epoch": 0.65, "grad_norm": 544.0, "kl_loss_10": 100.34854888916016, "kl_loss_2": 1467.2634765625, "kl_loss_3": 1032.0357879638673, "kl_loss_7": 318.1000717163086, "learning_rate": 0.0002779666936971129, "loss": 731.0987, "step": 6500 }, { "ce_loss_10": 3.622784066200256, "ce_loss_13": 3.5738240361213682, "ce_loss_2": 4.293535590171814, "ce_loss_3": 4.0728056073188785, "ce_loss_7": 3.7274719357490538, "epoch": 0.651, "grad_norm": 736.0, "kl_loss_10": 101.5173168182373, "kl_loss_2": 1510.3239868164062, "kl_loss_3": 1074.2033203125, "kl_loss_7": 328.23638610839845, "learning_rate": 0.00027654617388876614, "loss": 752.9499, "step": 6510 }, { "ce_loss_10": 3.650138533115387, "ce_loss_13": 3.601429855823517, "ce_loss_2": 4.299806785583496, "ce_loss_3": 4.084343910217285, "ce_loss_7": 3.7478551268577576, "epoch": 0.652, "grad_norm": 684.0, "kl_loss_10": 103.03865585327148, "kl_loss_2": 1483.2630310058594, "kl_loss_3": 1046.6598236083985, "kl_loss_7": 323.6476364135742, "learning_rate": 0.0002751279042579672, "loss": 740.9, "step": 6520 }, { "ce_loss_10": 3.5948129177093504, "ce_loss_13": 3.5448715686798096, "ce_loss_2": 4.239987373352051, "ce_loss_3": 4.0261630654335026, "ce_loss_7": 3.696196985244751, "epoch": 0.653, "grad_norm": 1016.0, "kl_loss_10": 100.91519508361816, "kl_loss_2": 1463.7680236816407, "kl_loss_3": 1030.3847961425781, "kl_loss_7": 321.7137786865234, "learning_rate": 0.00027371189908667604, "loss": 746.639, "step": 6530 }, { "ce_loss_10": 3.6402456879615785, "ce_loss_13": 3.5885451793670655, "ce_loss_2": 4.3247617244720455, "ce_loss_3": 4.0944114327430725, "ce_loss_7": 3.7461476802825926, "epoch": 0.654, "grad_norm": 632.0, "kl_loss_10": 104.77521095275878, "kl_loss_2": 1528.7153076171876, "kl_loss_3": 1069.2963775634767, "kl_loss_7": 334.3237762451172, "learning_rate": 0.00027229817263404863, "loss": 766.7919, "step": 6540 }, { "ce_loss_10": 3.6225221157073975, "ce_loss_13": 3.5735984683036803, "ce_loss_2": 4.252956342697144, "ce_loss_3": 4.048660516738892, "ce_loss_7": 3.717261564731598, "epoch": 0.655, "grad_norm": 824.0, "kl_loss_10": 102.12414779663087, "kl_loss_2": 1443.03857421875, "kl_loss_3": 1028.7486511230468, "kl_loss_7": 321.27867431640624, "learning_rate": 0.0002708867391362948, "loss": 735.0582, "step": 6550 }, { "ce_loss_10": 3.6016898274421694, "ce_loss_13": 3.5533241271972655, "ce_loss_2": 4.234023022651672, "ce_loss_3": 4.025428521633148, "ce_loss_7": 3.695333516597748, "epoch": 0.656, "grad_norm": 984.0, "kl_loss_10": 99.8819133758545, "kl_loss_2": 1431.1221557617187, "kl_loss_3": 1010.254037475586, "kl_loss_7": 313.99269409179686, "learning_rate": 0.0002694776128065345, "loss": 732.7115, "step": 6560 }, { "ce_loss_10": 3.537383186817169, "ce_loss_13": 3.489410877227783, "ce_loss_2": 4.19642813205719, "ce_loss_3": 3.9806575417518615, "ce_loss_7": 3.6391283512115478, "epoch": 0.657, "grad_norm": 576.0, "kl_loss_10": 101.79305152893066, "kl_loss_2": 1501.8880432128906, "kl_loss_3": 1068.321340942383, "kl_loss_7": 329.8885498046875, "learning_rate": 0.00026807080783465374, "loss": 737.5012, "step": 6570 }, { "ce_loss_10": 3.653720664978027, "ce_loss_13": 3.6032727479934694, "ce_loss_2": 4.32029173374176, "ce_loss_3": 4.098062968254089, "ce_loss_7": 3.7533512115478516, "epoch": 0.658, "grad_norm": 616.0, "kl_loss_10": 102.99898376464844, "kl_loss_2": 1489.673895263672, "kl_loss_3": 1056.718051147461, "kl_loss_7": 325.9285400390625, "learning_rate": 0.00026666633838716316, "loss": 751.2588, "step": 6580 }, { "ce_loss_10": 3.5405921816825865, "ce_loss_13": 3.489483916759491, "ce_loss_2": 4.220113301277161, "ce_loss_3": 3.9972169280052183, "ce_loss_7": 3.6466519236564636, "epoch": 0.659, "grad_norm": 696.0, "kl_loss_10": 104.70051918029785, "kl_loss_2": 1529.2580810546874, "kl_loss_3": 1082.1379486083983, "kl_loss_7": 333.6918212890625, "learning_rate": 0.00026526421860705474, "loss": 761.1169, "step": 6590 }, { "ce_loss_10": 3.570644438266754, "ce_loss_13": 3.520185112953186, "ce_loss_2": 4.228864288330078, "ce_loss_3": 4.009816741943359, "ce_loss_7": 3.676501214504242, "epoch": 0.66, "grad_norm": 700.0, "kl_loss_10": 103.64044952392578, "kl_loss_2": 1497.2633117675782, "kl_loss_3": 1051.0708099365233, "kl_loss_7": 331.30606842041016, "learning_rate": 0.0002638644626136587, "loss": 741.429, "step": 6600 }, { "ce_loss_10": 3.580263316631317, "ce_loss_13": 3.530834698677063, "ce_loss_2": 4.247053337097168, "ce_loss_3": 4.017553317546844, "ce_loss_7": 3.6781116127967834, "epoch": 0.661, "grad_norm": 504.0, "kl_loss_10": 100.85865097045898, "kl_loss_2": 1507.0734069824218, "kl_loss_3": 1057.3230102539062, "kl_loss_7": 323.9317108154297, "learning_rate": 0.00026246708450250255, "loss": 746.3756, "step": 6610 }, { "ce_loss_10": 3.5796775579452516, "ce_loss_13": 3.5275776982307434, "ce_loss_2": 4.229259622097016, "ce_loss_3": 4.009648394584656, "ce_loss_7": 3.679073464870453, "epoch": 0.662, "grad_norm": 788.0, "kl_loss_10": 101.13056831359863, "kl_loss_2": 1476.3184204101562, "kl_loss_3": 1035.5322509765624, "kl_loss_7": 320.9491439819336, "learning_rate": 0.00026107209834516854, "loss": 738.2496, "step": 6620 }, { "ce_loss_10": 3.524012196063995, "ce_loss_13": 3.4747344732284544, "ce_loss_2": 4.215798091888428, "ce_loss_3": 3.9891905426979064, "ce_loss_7": 3.6282769083976745, "epoch": 0.663, "grad_norm": 556.0, "kl_loss_10": 102.19658203125, "kl_loss_2": 1556.5956665039062, "kl_loss_3": 1092.559063720703, "kl_loss_7": 330.03883666992186, "learning_rate": 0.0002596795181891514, "loss": 764.1166, "step": 6630 }, { "ce_loss_10": 3.533200263977051, "ce_loss_13": 3.480591058731079, "ce_loss_2": 4.208922982215881, "ce_loss_3": 3.9843982458114624, "ce_loss_7": 3.637127935886383, "epoch": 0.664, "grad_norm": 744.0, "kl_loss_10": 104.05998039245605, "kl_loss_2": 1520.5204772949219, "kl_loss_3": 1069.3911712646484, "kl_loss_7": 332.95276641845703, "learning_rate": 0.000258289358057718, "loss": 777.6476, "step": 6640 }, { "ce_loss_10": 3.605274772644043, "ce_loss_13": 3.552940809726715, "ce_loss_2": 4.27773334980011, "ce_loss_3": 4.0479767441749575, "ce_loss_7": 3.710472786426544, "epoch": 0.665, "grad_norm": 668.0, "kl_loss_10": 103.32317848205567, "kl_loss_2": 1518.1732055664063, "kl_loss_3": 1066.8532684326171, "kl_loss_7": 331.6925903320313, "learning_rate": 0.0002569016319497657, "loss": 755.805, "step": 6650 }, { "ce_loss_10": 3.594022798538208, "ce_loss_13": 3.5410403847694396, "ce_loss_2": 4.260823273658753, "ce_loss_3": 4.036172866821289, "ce_loss_7": 3.6959716796875, "epoch": 0.666, "grad_norm": 676.0, "kl_loss_10": 104.14873695373535, "kl_loss_2": 1531.2763549804688, "kl_loss_3": 1074.8283905029298, "kl_loss_7": 334.44822082519534, "learning_rate": 0.00025551635383968066, "loss": 767.0006, "step": 6660 }, { "ce_loss_10": 3.5057089567184447, "ce_loss_13": 3.4541707158088686, "ce_loss_2": 4.176563847064972, "ce_loss_3": 3.9496588110923767, "ce_loss_7": 3.6054946780204773, "epoch": 0.667, "grad_norm": 1016.0, "kl_loss_10": 103.33973236083985, "kl_loss_2": 1528.662451171875, "kl_loss_3": 1071.2199096679688, "kl_loss_7": 331.499040222168, "learning_rate": 0.00025413353767719804, "loss": 753.8289, "step": 6670 }, { "ce_loss_10": 3.5577869057655334, "ce_loss_13": 3.5111236929893495, "ce_loss_2": 4.231831538677215, "ce_loss_3": 4.005722403526306, "ce_loss_7": 3.659741163253784, "epoch": 0.668, "grad_norm": 860.0, "kl_loss_10": 100.04262313842773, "kl_loss_2": 1522.005810546875, "kl_loss_3": 1065.8314422607423, "kl_loss_7": 322.74341583251953, "learning_rate": 0.0002527531973872617, "loss": 754.4833, "step": 6680 }, { "ce_loss_10": 3.5766180515289308, "ce_loss_13": 3.5274311661720277, "ce_loss_2": 4.230318570137024, "ce_loss_3": 4.01560093164444, "ce_loss_7": 3.678002429008484, "epoch": 0.669, "grad_norm": 1384.0, "kl_loss_10": 100.60889892578125, "kl_loss_2": 1491.849298095703, "kl_loss_3": 1057.1784240722657, "kl_loss_7": 324.19933166503904, "learning_rate": 0.0002513753468698826, "loss": 741.4988, "step": 6690 }, { "ce_loss_10": 3.547669863700867, "ce_loss_13": 3.4957098007202148, "ce_loss_2": 4.225924658775329, "ce_loss_3": 3.998526620864868, "ce_loss_7": 3.651533854007721, "epoch": 0.67, "grad_norm": 1048.0, "kl_loss_10": 103.2139949798584, "kl_loss_2": 1538.2576843261718, "kl_loss_3": 1083.5722961425781, "kl_loss_7": 332.5747299194336, "learning_rate": 0.0002500000000000001, "loss": 758.0608, "step": 6700 }, { "ce_loss_10": 3.662180411815643, "ce_loss_13": 3.613389527797699, "ce_loss_2": 4.284214067459106, "ce_loss_3": 4.079432034492493, "ce_loss_7": 3.7579395055770872, "epoch": 0.671, "grad_norm": 576.0, "kl_loss_10": 99.55428085327148, "kl_loss_2": 1425.9182189941407, "kl_loss_3": 1014.9958679199219, "kl_loss_7": 316.9266525268555, "learning_rate": 0.0002486271706273421, "loss": 747.6499, "step": 6710 }, { "ce_loss_10": 3.6001158714294434, "ce_loss_13": 3.5526996731758116, "ce_loss_2": 4.222909963130951, "ce_loss_3": 4.011171352863312, "ce_loss_7": 3.6927876710891723, "epoch": 0.672, "grad_norm": 976.0, "kl_loss_10": 100.10917282104492, "kl_loss_2": 1431.2197570800781, "kl_loss_3": 1014.3698150634766, "kl_loss_7": 315.2253082275391, "learning_rate": 0.0002472568725762853, "loss": 739.0668, "step": 6720 }, { "ce_loss_10": 3.5926173090934754, "ce_loss_13": 3.544549751281738, "ce_loss_2": 4.214060723781586, "ce_loss_3": 4.011428570747375, "ce_loss_7": 3.6910054445266725, "epoch": 0.673, "grad_norm": 516.0, "kl_loss_10": 99.02452163696289, "kl_loss_2": 1443.2336669921874, "kl_loss_3": 1022.6737579345703, "kl_loss_7": 313.8247299194336, "learning_rate": 0.00024588911964571554, "loss": 731.988, "step": 6730 }, { "ce_loss_10": 3.603414499759674, "ce_loss_13": 3.5511184096336366, "ce_loss_2": 4.2851941108703615, "ce_loss_3": 4.05850830078125, "ce_loss_7": 3.707227420806885, "epoch": 0.674, "grad_norm": 580.0, "kl_loss_10": 107.2045684814453, "kl_loss_2": 1529.6573669433594, "kl_loss_3": 1079.2643737792969, "kl_loss_7": 337.3622589111328, "learning_rate": 0.00024452392560888974, "loss": 750.3002, "step": 6740 }, { "ce_loss_10": 3.4954357981681823, "ce_loss_13": 3.447824704647064, "ce_loss_2": 4.156595861911773, "ce_loss_3": 3.9343768119812013, "ce_loss_7": 3.5925448536872864, "epoch": 0.675, "grad_norm": 604.0, "kl_loss_10": 100.99344253540039, "kl_loss_2": 1510.9070068359374, "kl_loss_3": 1064.7915496826172, "kl_loss_7": 324.59803009033203, "learning_rate": 0.00024316130421329695, "loss": 744.0665, "step": 6750 }, { "ce_loss_10": 3.5750558853149412, "ce_loss_13": 3.5268502712249754, "ce_loss_2": 4.229422283172608, "ce_loss_3": 4.013463175296783, "ce_loss_7": 3.6730728268623354, "epoch": 0.676, "grad_norm": 544.0, "kl_loss_10": 100.6230339050293, "kl_loss_2": 1456.6670532226562, "kl_loss_3": 1025.9927215576172, "kl_loss_7": 317.3859344482422, "learning_rate": 0.00024180126918051909, "loss": 734.8776, "step": 6760 }, { "ce_loss_10": 3.6212777972221373, "ce_loss_13": 3.571806180477142, "ce_loss_2": 4.262225651741028, "ce_loss_3": 4.049245178699493, "ce_loss_7": 3.72325119972229, "epoch": 0.677, "grad_norm": 672.0, "kl_loss_10": 101.04333686828613, "kl_loss_2": 1456.7758178710938, "kl_loss_3": 1029.471417236328, "kl_loss_7": 320.73938903808596, "learning_rate": 0.00024044383420609406, "loss": 729.4033, "step": 6770 }, { "ce_loss_10": 3.6311294794082642, "ce_loss_13": 3.583685803413391, "ce_loss_2": 4.25825183391571, "ce_loss_3": 4.052718317508697, "ce_loss_7": 3.7265649795532227, "epoch": 0.678, "grad_norm": 576.0, "kl_loss_10": 99.96271705627441, "kl_loss_2": 1446.6059020996095, "kl_loss_3": 1028.0545379638672, "kl_loss_7": 317.5989364624023, "learning_rate": 0.00023908901295937712, "loss": 742.0234, "step": 6780 }, { "ce_loss_10": 3.628547990322113, "ce_loss_13": 3.5761566281318666, "ce_loss_2": 4.273875093460083, "ce_loss_3": 4.0607711434364315, "ce_loss_7": 3.7250800013542174, "epoch": 0.679, "grad_norm": 712.0, "kl_loss_10": 101.27830696105957, "kl_loss_2": 1455.5505249023438, "kl_loss_3": 1029.1639190673827, "kl_loss_7": 320.25231781005857, "learning_rate": 0.00023773681908340283, "loss": 750.7537, "step": 6790 }, { "ce_loss_10": 3.599391317367554, "ce_loss_13": 3.547702062129974, "ce_loss_2": 4.270357215404511, "ce_loss_3": 4.047076523303986, "ce_loss_7": 3.702862000465393, "epoch": 0.68, "grad_norm": 812.0, "kl_loss_10": 105.92819633483887, "kl_loss_2": 1535.9554870605468, "kl_loss_3": 1077.9286743164062, "kl_loss_7": 337.33699493408204, "learning_rate": 0.00023638726619474876, "loss": 769.5994, "step": 6800 }, { "ce_loss_10": 3.588322615623474, "ce_loss_13": 3.538641881942749, "ce_loss_2": 4.273955631256103, "ce_loss_3": 4.0468222260475155, "ce_loss_7": 3.691556119918823, "epoch": 0.681, "grad_norm": 664.0, "kl_loss_10": 103.05172843933106, "kl_loss_2": 1531.3743835449218, "kl_loss_3": 1082.476754760742, "kl_loss_7": 331.4963119506836, "learning_rate": 0.0002350403678833976, "loss": 754.1373, "step": 6810 }, { "ce_loss_10": 3.513903558254242, "ce_loss_13": 3.4633278369903566, "ce_loss_2": 4.185439038276672, "ce_loss_3": 3.9612069487571717, "ce_loss_7": 3.6158273220062256, "epoch": 0.682, "grad_norm": 672.0, "kl_loss_10": 100.21570281982422, "kl_loss_2": 1517.696221923828, "kl_loss_3": 1071.2864685058594, "kl_loss_7": 326.1484893798828, "learning_rate": 0.00023369613771260007, "loss": 743.7283, "step": 6820 }, { "ce_loss_10": 3.632569658756256, "ce_loss_13": 3.583143413066864, "ce_loss_2": 4.298235011100769, "ce_loss_3": 4.075159895420074, "ce_loss_7": 3.735695469379425, "epoch": 0.683, "grad_norm": 1072.0, "kl_loss_10": 103.31424903869629, "kl_loss_2": 1523.1610717773438, "kl_loss_3": 1067.3874206542969, "kl_loss_7": 330.7246551513672, "learning_rate": 0.00023235458921873925, "loss": 759.2215, "step": 6830 }, { "ce_loss_10": 3.5781831979751586, "ce_loss_13": 3.5281830191612245, "ce_loss_2": 4.273094618320465, "ce_loss_3": 4.044710409641266, "ce_loss_7": 3.687508189678192, "epoch": 0.684, "grad_norm": 1592.0, "kl_loss_10": 104.86770210266113, "kl_loss_2": 1573.0007080078126, "kl_loss_3": 1104.036917114258, "kl_loss_7": 340.68297882080077, "learning_rate": 0.0002310157359111938, "loss": 775.1095, "step": 6840 }, { "ce_loss_10": 3.4720136761665343, "ce_loss_13": 3.420221483707428, "ce_loss_2": 4.19843327999115, "ce_loss_3": 3.950139057636261, "ce_loss_7": 3.581448459625244, "epoch": 0.685, "grad_norm": 968.0, "kl_loss_10": 103.10019950866699, "kl_loss_2": 1624.6990661621094, "kl_loss_3": 1124.6025360107421, "kl_loss_7": 340.21141052246094, "learning_rate": 0.0002296795912722014, "loss": 774.3507, "step": 6850 }, { "ce_loss_10": 3.619318461418152, "ce_loss_13": 3.570458245277405, "ce_loss_2": 4.2634848833084105, "ce_loss_3": 4.050227165222168, "ce_loss_7": 3.720330607891083, "epoch": 0.686, "grad_norm": 704.0, "kl_loss_10": 101.98275108337403, "kl_loss_2": 1470.4170776367187, "kl_loss_3": 1038.748483276367, "kl_loss_7": 323.4554748535156, "learning_rate": 0.0002283461687567236, "loss": 729.2701, "step": 6860 }, { "ce_loss_10": 3.6723763942718506, "ce_loss_13": 3.621968948841095, "ce_loss_2": 4.30412210226059, "ce_loss_3": 4.09724851846695, "ce_loss_7": 3.7722750067710877, "epoch": 0.687, "grad_norm": 616.0, "kl_loss_10": 101.73320808410645, "kl_loss_2": 1434.9742248535156, "kl_loss_3": 1020.1068145751954, "kl_loss_7": 318.0028076171875, "learning_rate": 0.00022701548179231045, "loss": 742.0532, "step": 6870 }, { "ce_loss_10": 3.6228394985198973, "ce_loss_13": 3.571567165851593, "ce_loss_2": 4.281274724006653, "ce_loss_3": 4.060529291629791, "ce_loss_7": 3.7240469694137572, "epoch": 0.688, "grad_norm": 820.0, "kl_loss_10": 102.79843978881836, "kl_loss_2": 1500.4690185546874, "kl_loss_3": 1055.0995330810547, "kl_loss_7": 324.5954223632813, "learning_rate": 0.00022568754377896516, "loss": 738.3846, "step": 6880 }, { "ce_loss_10": 3.6168400645256042, "ce_loss_13": 3.568450939655304, "ce_loss_2": 4.26511583328247, "ce_loss_3": 4.0520494937896725, "ce_loss_7": 3.715933322906494, "epoch": 0.689, "grad_norm": 576.0, "kl_loss_10": 101.39541778564453, "kl_loss_2": 1481.3662841796875, "kl_loss_3": 1044.4031860351563, "kl_loss_7": 327.64307708740233, "learning_rate": 0.00022436236808900844, "loss": 735.7188, "step": 6890 }, { "ce_loss_10": 3.5091506004333497, "ce_loss_13": 3.459543597698212, "ce_loss_2": 4.179090237617492, "ce_loss_3": 3.9523648619651794, "ce_loss_7": 3.611553704738617, "epoch": 0.69, "grad_norm": 900.0, "kl_loss_10": 102.09333686828613, "kl_loss_2": 1531.6673522949218, "kl_loss_3": 1071.113348388672, "kl_loss_7": 328.5457275390625, "learning_rate": 0.00022303996806694487, "loss": 746.5365, "step": 6900 }, { "ce_loss_10": 3.5924980640411377, "ce_loss_13": 3.5434168696403505, "ce_loss_2": 4.249067664146423, "ce_loss_3": 4.037683534622192, "ce_loss_7": 3.6954938411712646, "epoch": 0.691, "grad_norm": 688.0, "kl_loss_10": 100.86896057128907, "kl_loss_2": 1498.27890625, "kl_loss_3": 1057.79873046875, "kl_loss_7": 324.28038177490237, "learning_rate": 0.00022172035702932823, "loss": 743.9507, "step": 6910 }, { "ce_loss_10": 3.631723868846893, "ce_loss_13": 3.584617757797241, "ce_loss_2": 4.269367408752442, "ce_loss_3": 4.060899245738983, "ce_loss_7": 3.7275217175483704, "epoch": 0.692, "grad_norm": 708.0, "kl_loss_10": 101.48851852416992, "kl_loss_2": 1448.5584289550782, "kl_loss_3": 1032.1916259765626, "kl_loss_7": 320.71083679199216, "learning_rate": 0.00022040354826462666, "loss": 735.1562, "step": 6920 }, { "ce_loss_10": 3.570647966861725, "ce_loss_13": 3.521779215335846, "ce_loss_2": 4.2185657024383545, "ce_loss_3": 4.009237241744995, "ce_loss_7": 3.6679769277572634, "epoch": 0.693, "grad_norm": 676.0, "kl_loss_10": 100.70296020507813, "kl_loss_2": 1474.9325317382813, "kl_loss_3": 1042.8214721679688, "kl_loss_7": 318.8450271606445, "learning_rate": 0.0002190895550330899, "loss": 743.9021, "step": 6930 }, { "ce_loss_10": 3.4950982213020323, "ce_loss_13": 3.4431872367858887, "ce_loss_2": 4.173799359798432, "ce_loss_3": 3.9546412110328673, "ce_loss_7": 3.5996562957763674, "epoch": 0.694, "grad_norm": 764.0, "kl_loss_10": 103.4541389465332, "kl_loss_2": 1533.1158630371094, "kl_loss_3": 1082.206524658203, "kl_loss_7": 336.3981216430664, "learning_rate": 0.00021777839056661552, "loss": 744.3959, "step": 6940 }, { "ce_loss_10": 3.5826305866241457, "ce_loss_13": 3.535012757778168, "ce_loss_2": 4.233699595928192, "ce_loss_3": 4.0169126987457275, "ce_loss_7": 3.6802796363830566, "epoch": 0.695, "grad_norm": 772.0, "kl_loss_10": 100.54262924194336, "kl_loss_2": 1474.622247314453, "kl_loss_3": 1041.125521850586, "kl_loss_7": 321.0318359375, "learning_rate": 0.0002164700680686147, "loss": 729.3359, "step": 6950 }, { "ce_loss_10": 3.630559766292572, "ce_loss_13": 3.5819427013397216, "ce_loss_2": 4.2709434747695925, "ce_loss_3": 4.058713316917419, "ce_loss_7": 3.73145877122879, "epoch": 0.696, "grad_norm": 648.0, "kl_loss_10": 101.15100364685058, "kl_loss_2": 1453.4813293457032, "kl_loss_3": 1022.8825988769531, "kl_loss_7": 321.2187896728516, "learning_rate": 0.0002151646007138806, "loss": 729.0, "step": 6960 }, { "ce_loss_10": 3.5013758659362795, "ce_loss_13": 3.452328824996948, "ce_loss_2": 4.178699684143067, "ce_loss_3": 3.947460615634918, "ce_loss_7": 3.6004079937934876, "epoch": 0.697, "grad_norm": 660.0, "kl_loss_10": 103.3748233795166, "kl_loss_2": 1541.3166259765626, "kl_loss_3": 1076.5017669677734, "kl_loss_7": 331.0443710327148, "learning_rate": 0.00021386200164845526, "loss": 750.0945, "step": 6970 }, { "ce_loss_10": 3.6936116099357603, "ce_loss_13": 3.6449682235717775, "ce_loss_2": 4.310778784751892, "ce_loss_3": 4.108264696598053, "ce_loss_7": 3.787326729297638, "epoch": 0.698, "grad_norm": 604.0, "kl_loss_10": 101.98397216796874, "kl_loss_2": 1433.8074523925782, "kl_loss_3": 1022.558969116211, "kl_loss_7": 320.03160247802737, "learning_rate": 0.0002125622839894964, "loss": 729.9183, "step": 6980 }, { "ce_loss_10": 3.6322519898414614, "ce_loss_13": 3.585060155391693, "ce_loss_2": 4.272629261016846, "ce_loss_3": 4.05768392086029, "ce_loss_7": 3.7269264459609985, "epoch": 0.699, "grad_norm": 572.0, "kl_loss_10": 101.5181053161621, "kl_loss_2": 1459.6825866699219, "kl_loss_3": 1032.1376068115235, "kl_loss_7": 318.1553451538086, "learning_rate": 0.00021126546082514663, "loss": 731.3151, "step": 6990 }, { "ce_loss_10": 3.6491450905799865, "ce_loss_13": 3.5987593650817873, "ce_loss_2": 4.282061743736267, "ce_loss_3": 4.071951985359192, "ce_loss_7": 3.7427945017814634, "epoch": 0.7, "grad_norm": 760.0, "kl_loss_10": 101.92886085510254, "kl_loss_2": 1458.24462890625, "kl_loss_3": 1029.3592163085937, "kl_loss_7": 320.2199172973633, "learning_rate": 0.00020997154521440098, "loss": 729.4302, "step": 7000 }, { "ce_loss_10": 3.5915807366371153, "ce_loss_13": 3.5439091682434083, "ce_loss_2": 4.233248913288117, "ce_loss_3": 4.024724793434143, "ce_loss_7": 3.685661196708679, "epoch": 0.701, "grad_norm": 564.0, "kl_loss_10": 99.64333763122559, "kl_loss_2": 1456.7446655273438, "kl_loss_3": 1038.2345611572266, "kl_loss_7": 318.3260437011719, "learning_rate": 0.0002086805501869749, "loss": 725.6171, "step": 7010 }, { "ce_loss_10": 3.5580953001976012, "ce_loss_13": 3.508427548408508, "ce_loss_2": 4.235318386554718, "ce_loss_3": 4.011897599697113, "ce_loss_7": 3.662204325199127, "epoch": 0.702, "grad_norm": 644.0, "kl_loss_10": 102.74548721313477, "kl_loss_2": 1536.4400817871094, "kl_loss_3": 1087.122247314453, "kl_loss_7": 335.6620391845703, "learning_rate": 0.0002073924887431744, "loss": 754.1321, "step": 7020 }, { "ce_loss_10": 3.567932200431824, "ce_loss_13": 3.519511365890503, "ce_loss_2": 4.220903420448304, "ce_loss_3": 4.0007314562797545, "ce_loss_7": 3.6697914361953736, "epoch": 0.703, "grad_norm": 660.0, "kl_loss_10": 101.81342010498047, "kl_loss_2": 1490.4758544921874, "kl_loss_3": 1051.454364013672, "kl_loss_7": 327.6197845458984, "learning_rate": 0.00020610737385376348, "loss": 756.8908, "step": 7030 }, { "ce_loss_10": 3.635763943195343, "ce_loss_13": 3.5879767417907713, "ce_loss_2": 4.2563050270080565, "ce_loss_3": 4.054796481132508, "ce_loss_7": 3.7330915212631224, "epoch": 0.704, "grad_norm": 520.0, "kl_loss_10": 100.99309921264648, "kl_loss_2": 1429.4926452636719, "kl_loss_3": 1022.7890441894531, "kl_loss_7": 316.4844741821289, "learning_rate": 0.00020482521845983521, "loss": 736.2275, "step": 7040 }, { "ce_loss_10": 3.626901054382324, "ce_loss_13": 3.5782265067100525, "ce_loss_2": 4.277740502357483, "ce_loss_3": 4.060053181648255, "ce_loss_7": 3.7243635654449463, "epoch": 0.705, "grad_norm": 1104.0, "kl_loss_10": 104.85830421447754, "kl_loss_2": 1487.9429260253905, "kl_loss_3": 1048.5312286376952, "kl_loss_7": 329.04687957763673, "learning_rate": 0.00020354603547267987, "loss": 753.9324, "step": 7050 }, { "ce_loss_10": 3.6085660099983214, "ce_loss_13": 3.557363510131836, "ce_loss_2": 4.282197725772858, "ce_loss_3": 4.062791967391968, "ce_loss_7": 3.717707800865173, "epoch": 0.706, "grad_norm": 676.0, "kl_loss_10": 103.20671119689942, "kl_loss_2": 1504.701708984375, "kl_loss_3": 1060.428286743164, "kl_loss_7": 328.32612762451174, "learning_rate": 0.00020226983777365604, "loss": 764.819, "step": 7060 }, { "ce_loss_10": 3.5074614882469177, "ce_loss_13": 3.4597942352294924, "ce_loss_2": 4.202941036224365, "ce_loss_3": 3.9672993779182435, "ce_loss_7": 3.6049316763877868, "epoch": 0.707, "grad_norm": 592.0, "kl_loss_10": 97.65254592895508, "kl_loss_2": 1557.982244873047, "kl_loss_3": 1085.2211975097657, "kl_loss_7": 315.8718978881836, "learning_rate": 0.00020099663821406056, "loss": 749.1276, "step": 7070 }, { "ce_loss_10": 3.615716254711151, "ce_loss_13": 3.567082965373993, "ce_loss_2": 4.2539966106414795, "ce_loss_3": 4.044722843170166, "ce_loss_7": 3.7132434606552125, "epoch": 0.708, "grad_norm": 984.0, "kl_loss_10": 99.90684127807617, "kl_loss_2": 1457.3600708007812, "kl_loss_3": 1027.7681457519532, "kl_loss_7": 315.9672348022461, "learning_rate": 0.00019972644961499853, "loss": 742.4009, "step": 7080 }, { "ce_loss_10": 3.578559231758118, "ce_loss_13": 3.5300379633903503, "ce_loss_2": 4.255948328971863, "ce_loss_3": 4.032967436313629, "ce_loss_7": 3.6806026816368105, "epoch": 0.709, "grad_norm": 632.0, "kl_loss_10": 103.09658241271973, "kl_loss_2": 1523.0350708007813, "kl_loss_3": 1077.6670959472656, "kl_loss_7": 331.51563720703126, "learning_rate": 0.00019845928476725522, "loss": 749.1405, "step": 7090 }, { "ce_loss_10": 3.660240077972412, "ce_loss_13": 3.609438145160675, "ce_loss_2": 4.30460307598114, "ce_loss_3": 4.094051587581634, "ce_loss_7": 3.7604071855545045, "epoch": 0.71, "grad_norm": 564.0, "kl_loss_10": 102.16684951782227, "kl_loss_2": 1464.917236328125, "kl_loss_3": 1041.8206512451172, "kl_loss_7": 324.6231399536133, "learning_rate": 0.00019719515643116677, "loss": 757.1804, "step": 7100 }, { "ce_loss_10": 3.6064583539962767, "ce_loss_13": 3.5555363297462463, "ce_loss_2": 4.249185109138489, "ce_loss_3": 4.03629766702652, "ce_loss_7": 3.7034249305725098, "epoch": 0.711, "grad_norm": 992.0, "kl_loss_10": 102.0643482208252, "kl_loss_2": 1461.975604248047, "kl_loss_3": 1033.3216278076172, "kl_loss_7": 321.64864654541014, "learning_rate": 0.0001959340773364911, "loss": 742.5738, "step": 7110 }, { "ce_loss_10": 3.61837899684906, "ce_loss_13": 3.567757952213287, "ce_loss_2": 4.280507481098175, "ce_loss_3": 4.062068665027619, "ce_loss_7": 3.7197158455848696, "epoch": 0.712, "grad_norm": 692.0, "kl_loss_10": 102.36725082397462, "kl_loss_2": 1492.5347412109375, "kl_loss_3": 1050.5329223632812, "kl_loss_7": 324.6414047241211, "learning_rate": 0.0001946760601822809, "loss": 729.7203, "step": 7120 }, { "ce_loss_10": 3.6702843070030213, "ce_loss_13": 3.620930242538452, "ce_loss_2": 4.300143253803253, "ce_loss_3": 4.090729308128357, "ce_loss_7": 3.770727002620697, "epoch": 0.713, "grad_norm": 496.0, "kl_loss_10": 100.84015464782715, "kl_loss_2": 1447.8009521484375, "kl_loss_3": 1029.8072570800782, "kl_loss_7": 322.2212600708008, "learning_rate": 0.00019342111763675512, "loss": 721.3763, "step": 7130 }, { "ce_loss_10": 3.6747087836265564, "ce_loss_13": 3.623539352416992, "ce_loss_2": 4.299582004547119, "ce_loss_3": 4.092449605464935, "ce_loss_7": 3.7682504177093508, "epoch": 0.714, "grad_norm": 600.0, "kl_loss_10": 103.48250427246094, "kl_loss_2": 1452.7804931640626, "kl_loss_3": 1031.7409545898438, "kl_loss_7": 323.0628997802734, "learning_rate": 0.00019216926233717085, "loss": 727.6729, "step": 7140 }, { "ce_loss_10": 3.553460383415222, "ce_loss_13": 3.50497385263443, "ce_loss_2": 4.237006831169128, "ce_loss_3": 4.004274690151215, "ce_loss_7": 3.6498111367225645, "epoch": 0.715, "grad_norm": 1024.0, "kl_loss_10": 100.35433006286621, "kl_loss_2": 1541.9970458984376, "kl_loss_3": 1076.5046600341798, "kl_loss_7": 319.6664093017578, "learning_rate": 0.00019092050688969737, "loss": 752.495, "step": 7150 }, { "ce_loss_10": 3.6285326361656187, "ce_loss_13": 3.581188178062439, "ce_loss_2": 4.264269089698791, "ce_loss_3": 4.057489204406738, "ce_loss_7": 3.726917600631714, "epoch": 0.716, "grad_norm": 680.0, "kl_loss_10": 100.73601875305175, "kl_loss_2": 1468.3200744628907, "kl_loss_3": 1039.4500671386718, "kl_loss_7": 320.52637786865233, "learning_rate": 0.00018967486386928817, "loss": 730.6697, "step": 7160 }, { "ce_loss_10": 3.4922155380249023, "ce_loss_13": 3.4432080507278444, "ce_loss_2": 4.180318629741668, "ce_loss_3": 3.951646876335144, "ce_loss_7": 3.5971828937530517, "epoch": 0.717, "grad_norm": 748.0, "kl_loss_10": 100.55046463012695, "kl_loss_2": 1546.0572082519532, "kl_loss_3": 1083.219937133789, "kl_loss_7": 330.65559539794924, "learning_rate": 0.00018843234581955443, "loss": 772.5576, "step": 7170 }, { "ce_loss_10": 3.515317904949188, "ce_loss_13": 3.463692331314087, "ce_loss_2": 4.185732483863831, "ce_loss_3": 3.9641585826873778, "ce_loss_7": 3.620204746723175, "epoch": 0.718, "grad_norm": 572.0, "kl_loss_10": 102.95229301452636, "kl_loss_2": 1519.0477600097656, "kl_loss_3": 1071.558935546875, "kl_loss_7": 329.1860656738281, "learning_rate": 0.00018719296525263924, "loss": 750.2962, "step": 7180 }, { "ce_loss_10": 3.615639638900757, "ce_loss_13": 3.566976988315582, "ce_loss_2": 4.243787431716919, "ce_loss_3": 4.0338469505310055, "ce_loss_7": 3.7108601212501524, "epoch": 0.719, "grad_norm": 488.0, "kl_loss_10": 101.69222755432129, "kl_loss_2": 1435.488018798828, "kl_loss_3": 1012.9308074951172, "kl_loss_7": 319.6354354858398, "learning_rate": 0.0001859567346490913, "loss": 723.9555, "step": 7190 }, { "ce_loss_10": 3.589732789993286, "ce_loss_13": 3.5365479588508606, "ce_loss_2": 4.253436517715454, "ce_loss_3": 4.034386122226715, "ce_loss_7": 3.688367509841919, "epoch": 0.72, "grad_norm": 812.0, "kl_loss_10": 103.2382583618164, "kl_loss_2": 1503.9259033203125, "kl_loss_3": 1060.4109893798827, "kl_loss_7": 330.47876892089846, "learning_rate": 0.0001847236664577389, "loss": 732.0061, "step": 7200 }, { "ce_loss_10": 3.6180379152297975, "ce_loss_13": 3.566099727153778, "ce_loss_2": 4.246710586547851, "ce_loss_3": 4.036282730102539, "ce_loss_7": 3.71278338432312, "epoch": 0.721, "grad_norm": 536.0, "kl_loss_10": 102.03774604797363, "kl_loss_2": 1442.6143859863282, "kl_loss_3": 1019.9147583007813, "kl_loss_7": 318.68707885742185, "learning_rate": 0.00018349377309556487, "loss": 719.3636, "step": 7210 }, { "ce_loss_10": 3.5575922250747682, "ce_loss_13": 3.5070847868919373, "ce_loss_2": 4.234365069866181, "ce_loss_3": 4.0066630125045775, "ce_loss_7": 3.656701982021332, "epoch": 0.722, "grad_norm": 636.0, "kl_loss_10": 101.9844139099121, "kl_loss_2": 1550.3471374511719, "kl_loss_3": 1084.754083251953, "kl_loss_7": 330.33725891113284, "learning_rate": 0.00018226706694758193, "loss": 755.8813, "step": 7220 }, { "ce_loss_10": 3.6318568110466005, "ce_loss_13": 3.583934152126312, "ce_loss_2": 4.2729597091674805, "ce_loss_3": 4.053939759731293, "ce_loss_7": 3.728121757507324, "epoch": 0.723, "grad_norm": 888.0, "kl_loss_10": 100.9413215637207, "kl_loss_2": 1476.1191040039062, "kl_loss_3": 1039.776678466797, "kl_loss_7": 320.9526397705078, "learning_rate": 0.0001810435603667075, "loss": 753.4178, "step": 7230 }, { "ce_loss_10": 3.479217517375946, "ce_loss_13": 3.4290864944458006, "ce_loss_2": 4.143425631523132, "ce_loss_3": 3.921012759208679, "ce_loss_7": 3.5752957582473757, "epoch": 0.724, "grad_norm": 840.0, "kl_loss_10": 97.81247100830078, "kl_loss_2": 1510.6328369140624, "kl_loss_3": 1062.0849060058595, "kl_loss_7": 319.5965209960938, "learning_rate": 0.0001798232656736389, "loss": 753.1852, "step": 7240 }, { "ce_loss_10": 3.6586895108222963, "ce_loss_13": 3.6110859394073485, "ce_loss_2": 4.280707168579101, "ce_loss_3": 4.0747087597846985, "ce_loss_7": 3.755585479736328, "epoch": 0.725, "grad_norm": 748.0, "kl_loss_10": 100.88589782714844, "kl_loss_2": 1432.1588256835937, "kl_loss_3": 1014.6901824951171, "kl_loss_7": 316.9482650756836, "learning_rate": 0.0001786061951567303, "loss": 730.6222, "step": 7250 }, { "ce_loss_10": 3.5725823640823364, "ce_loss_13": 3.5224485397338867, "ce_loss_2": 4.225451338291168, "ce_loss_3": 4.010411250591278, "ce_loss_7": 3.6760420203208923, "epoch": 0.726, "grad_norm": 510.0, "kl_loss_10": 101.72770271301269, "kl_loss_2": 1468.72880859375, "kl_loss_3": 1037.118734741211, "kl_loss_7": 322.33358306884764, "learning_rate": 0.00017739236107186857, "loss": 740.7124, "step": 7260 }, { "ce_loss_10": 3.661336326599121, "ce_loss_13": 3.6133769869804384, "ce_loss_2": 4.28061957359314, "ce_loss_3": 4.072839677333832, "ce_loss_7": 3.7523145198822023, "epoch": 0.727, "grad_norm": 512.0, "kl_loss_10": 99.33006744384765, "kl_loss_2": 1427.7947143554688, "kl_loss_3": 1010.3567138671875, "kl_loss_7": 314.64671478271487, "learning_rate": 0.00017618177564234904, "loss": 724.1102, "step": 7270 }, { "ce_loss_10": 3.6391143441200255, "ce_loss_13": 3.5903729796409607, "ce_loss_2": 4.252013909816742, "ce_loss_3": 4.052266252040863, "ce_loss_7": 3.733324384689331, "epoch": 0.728, "grad_norm": 684.0, "kl_loss_10": 100.08157882690429, "kl_loss_2": 1401.1171142578125, "kl_loss_3": 998.6275787353516, "kl_loss_7": 313.19330444335935, "learning_rate": 0.00017497445105875377, "loss": 719.6587, "step": 7280 }, { "ce_loss_10": 3.546626257896423, "ce_loss_13": 3.4963098049163817, "ce_loss_2": 4.224094724655151, "ce_loss_3": 3.9970731019973753, "ce_loss_7": 3.652878963947296, "epoch": 0.729, "grad_norm": 648.0, "kl_loss_10": 101.46650428771973, "kl_loss_2": 1530.8541870117188, "kl_loss_3": 1075.4965515136719, "kl_loss_7": 332.26658630371094, "learning_rate": 0.000173770399478828, "loss": 749.5231, "step": 7290 }, { "ce_loss_10": 3.466491138935089, "ce_loss_13": 3.4188210129737855, "ce_loss_2": 4.122490584850311, "ce_loss_3": 3.8977851986885073, "ce_loss_7": 3.563360559940338, "epoch": 0.73, "grad_norm": 848.0, "kl_loss_10": 100.28041152954101, "kl_loss_2": 1486.7876831054687, "kl_loss_3": 1047.5693664550781, "kl_loss_7": 321.95996551513673, "learning_rate": 0.0001725696330273575, "loss": 756.988, "step": 7300 }, { "ce_loss_10": 3.655240035057068, "ce_loss_13": 3.6066733479499815, "ce_loss_2": 4.28205144405365, "ce_loss_3": 4.081033682823181, "ce_loss_7": 3.7530914306640626, "epoch": 0.731, "grad_norm": 880.0, "kl_loss_10": 98.73011474609375, "kl_loss_2": 1434.0589965820313, "kl_loss_3": 1014.6115051269531, "kl_loss_7": 314.84511413574216, "learning_rate": 0.00017137216379604724, "loss": 718.0417, "step": 7310 }, { "ce_loss_10": 3.5331166267395018, "ce_loss_13": 3.482956790924072, "ce_loss_2": 4.1915734052658085, "ce_loss_3": 3.973959708213806, "ce_loss_7": 3.629355323314667, "epoch": 0.732, "grad_norm": 652.0, "kl_loss_10": 100.25732460021973, "kl_loss_2": 1478.2487548828126, "kl_loss_3": 1040.2373596191405, "kl_loss_7": 318.95433807373047, "learning_rate": 0.00017017800384339925, "loss": 734.5391, "step": 7320 }, { "ce_loss_10": 3.4845200538635255, "ce_loss_13": 3.435916531085968, "ce_loss_2": 4.180770039558411, "ce_loss_3": 3.942674934864044, "ce_loss_7": 3.588698422908783, "epoch": 0.733, "grad_norm": 620.0, "kl_loss_10": 101.78398971557617, "kl_loss_2": 1554.9213562011719, "kl_loss_3": 1079.1240173339843, "kl_loss_7": 327.88011932373047, "learning_rate": 0.00016898716519459073, "loss": 735.1391, "step": 7330 }, { "ce_loss_10": 3.6073665857315063, "ce_loss_13": 3.557834780216217, "ce_loss_2": 4.289764428138733, "ce_loss_3": 4.060822868347168, "ce_loss_7": 3.711581456661224, "epoch": 0.734, "grad_norm": 532.0, "kl_loss_10": 103.13581085205078, "kl_loss_2": 1524.8836853027344, "kl_loss_3": 1068.3976043701173, "kl_loss_7": 333.7369720458984, "learning_rate": 0.00016779965984135375, "loss": 745.2718, "step": 7340 }, { "ce_loss_10": 3.517951250076294, "ce_loss_13": 3.4706948518753054, "ce_loss_2": 4.169898021221161, "ce_loss_3": 3.9554733633995056, "ce_loss_7": 3.6164847016334534, "epoch": 0.735, "grad_norm": 812.0, "kl_loss_10": 97.91431465148926, "kl_loss_2": 1477.9057678222657, "kl_loss_3": 1035.2748046875, "kl_loss_7": 315.540544128418, "learning_rate": 0.00016661549974185424, "loss": 733.8604, "step": 7350 }, { "ce_loss_10": 3.5558545112609865, "ce_loss_13": 3.507548999786377, "ce_loss_2": 4.21607004404068, "ce_loss_3": 3.994111716747284, "ce_loss_7": 3.6564208030700684, "epoch": 0.736, "grad_norm": 576.0, "kl_loss_10": 101.86353950500488, "kl_loss_2": 1494.8444702148438, "kl_loss_3": 1045.7262268066406, "kl_loss_7": 324.7269943237305, "learning_rate": 0.00016543469682057105, "loss": 731.5467, "step": 7360 }, { "ce_loss_10": 3.589854109287262, "ce_loss_13": 3.538823735713959, "ce_loss_2": 4.236572289466858, "ce_loss_3": 4.021358871459961, "ce_loss_7": 3.6871808409690856, "epoch": 0.737, "grad_norm": 616.0, "kl_loss_10": 102.97372932434082, "kl_loss_2": 1477.7834594726562, "kl_loss_3": 1045.0648406982423, "kl_loss_7": 326.77844696044923, "learning_rate": 0.00016425726296817632, "loss": 738.9849, "step": 7370 }, { "ce_loss_10": 3.6046580195426943, "ce_loss_13": 3.5564786791801453, "ce_loss_2": 4.236163938045502, "ce_loss_3": 4.025436687469482, "ce_loss_7": 3.700652003288269, "epoch": 0.738, "grad_norm": 912.0, "kl_loss_10": 99.1933479309082, "kl_loss_2": 1443.9814819335938, "kl_loss_3": 1021.2991546630859, "kl_loss_7": 314.4842254638672, "learning_rate": 0.00016308321004141607, "loss": 725.121, "step": 7380 }, { "ce_loss_10": 3.5545114517211913, "ce_loss_13": 3.5051153659820558, "ce_loss_2": 4.2193849086761475, "ce_loss_3": 4.001522815227508, "ce_loss_7": 3.6586191773414614, "epoch": 0.739, "grad_norm": 860.0, "kl_loss_10": 102.69992713928222, "kl_loss_2": 1486.2926818847657, "kl_loss_3": 1045.9685974121094, "kl_loss_7": 325.7093994140625, "learning_rate": 0.00016191254986299043, "loss": 733.4309, "step": 7390 }, { "ce_loss_10": 3.605199325084686, "ce_loss_13": 3.559028685092926, "ce_loss_2": 4.247898590564728, "ce_loss_3": 4.026130676269531, "ce_loss_7": 3.697719967365265, "epoch": 0.74, "grad_norm": 864.0, "kl_loss_10": 100.1459529876709, "kl_loss_2": 1479.9716430664062, "kl_loss_3": 1032.2414581298829, "kl_loss_7": 315.8760025024414, "learning_rate": 0.00016074529422143398, "loss": 742.5588, "step": 7400 }, { "ce_loss_10": 3.5475880980491636, "ce_loss_13": 3.498714768886566, "ce_loss_2": 4.222816336154938, "ce_loss_3": 3.987065052986145, "ce_loss_7": 3.647062873840332, "epoch": 0.741, "grad_norm": 1584.0, "kl_loss_10": 100.77743530273438, "kl_loss_2": 1525.576885986328, "kl_loss_3": 1057.4777526855469, "kl_loss_7": 320.56249847412107, "learning_rate": 0.0001595814548709983, "loss": 750.858, "step": 7410 }, { "ce_loss_10": 3.6178871512413027, "ce_loss_13": 3.568697464466095, "ce_loss_2": 4.270915389060974, "ce_loss_3": 4.059020531177521, "ce_loss_7": 3.722296047210693, "epoch": 0.742, "grad_norm": 652.0, "kl_loss_10": 103.05680885314942, "kl_loss_2": 1492.0172302246094, "kl_loss_3": 1055.1884674072267, "kl_loss_7": 330.7481857299805, "learning_rate": 0.00015842104353153285, "loss": 745.3371, "step": 7420 }, { "ce_loss_10": 3.633833074569702, "ce_loss_13": 3.5853574872016907, "ce_loss_2": 4.2891986727714535, "ce_loss_3": 4.0714383006095884, "ce_loss_7": 3.7352861285209658, "epoch": 0.743, "grad_norm": 688.0, "kl_loss_10": 102.43589935302734, "kl_loss_2": 1495.2289978027343, "kl_loss_3": 1052.4135284423828, "kl_loss_7": 327.82014923095704, "learning_rate": 0.0001572640718883667, "loss": 756.6117, "step": 7430 }, { "ce_loss_10": 3.5646160006523133, "ce_loss_13": 3.517827796936035, "ce_loss_2": 4.207964992523193, "ce_loss_3": 3.9948436856269836, "ce_loss_7": 3.663499450683594, "epoch": 0.744, "grad_norm": 520.0, "kl_loss_10": 98.91822967529296, "kl_loss_2": 1448.877459716797, "kl_loss_3": 1023.0617065429688, "kl_loss_7": 316.0859832763672, "learning_rate": 0.0001561105515921915, "loss": 742.48, "step": 7440 }, { "ce_loss_10": 3.413878917694092, "ce_loss_13": 3.3665127754211426, "ce_loss_2": 4.107733094692231, "ce_loss_3": 3.8735260128974915, "ce_loss_7": 3.516214442253113, "epoch": 0.745, "grad_norm": 592.0, "kl_loss_10": 98.08993644714356, "kl_loss_2": 1551.601708984375, "kl_loss_3": 1074.8999908447265, "kl_loss_7": 322.88723907470705, "learning_rate": 0.0001549604942589441, "loss": 740.9817, "step": 7450 }, { "ce_loss_10": 3.609237015247345, "ce_loss_13": 3.5622578978538515, "ce_loss_2": 4.218359482288361, "ce_loss_3": 4.020682299137116, "ce_loss_7": 3.701595675945282, "epoch": 0.746, "grad_norm": 864.0, "kl_loss_10": 97.97488784790039, "kl_loss_2": 1396.5510192871093, "kl_loss_3": 989.4693145751953, "kl_loss_7": 307.9789108276367, "learning_rate": 0.00015381391146968864, "loss": 715.9987, "step": 7460 }, { "ce_loss_10": 3.583309030532837, "ce_loss_13": 3.534773552417755, "ce_loss_2": 4.24002616405487, "ce_loss_3": 4.021809184551239, "ce_loss_7": 3.679876244068146, "epoch": 0.747, "grad_norm": 1424.0, "kl_loss_10": 98.04772987365723, "kl_loss_2": 1468.8201782226563, "kl_loss_3": 1032.034912109375, "kl_loss_7": 314.97127838134764, "learning_rate": 0.00015267081477050133, "loss": 736.9425, "step": 7470 }, { "ce_loss_10": 3.680017924308777, "ce_loss_13": 3.6314922094345095, "ce_loss_2": 4.308831429481506, "ce_loss_3": 4.103770196437836, "ce_loss_7": 3.77610330581665, "epoch": 0.748, "grad_norm": 588.0, "kl_loss_10": 102.77583084106445, "kl_loss_2": 1440.1351745605468, "kl_loss_3": 1022.9948883056641, "kl_loss_7": 321.7729690551758, "learning_rate": 0.00015153121567235335, "loss": 718.3909, "step": 7480 }, { "ce_loss_10": 3.568933379650116, "ce_loss_13": 3.520808684825897, "ce_loss_2": 4.233863592147827, "ce_loss_3": 4.006792187690735, "ce_loss_7": 3.666488540172577, "epoch": 0.749, "grad_norm": 1240.0, "kl_loss_10": 101.02551536560058, "kl_loss_2": 1515.6455688476562, "kl_loss_3": 1057.2483459472655, "kl_loss_7": 323.41761016845703, "learning_rate": 0.00015039512565099468, "loss": 724.0142, "step": 7490 }, { "ce_loss_10": 3.636400604248047, "ce_loss_13": 3.586910128593445, "ce_loss_2": 4.27596024274826, "ce_loss_3": 4.060910153388977, "ce_loss_7": 3.7364757657051086, "epoch": 0.75, "grad_norm": 592.0, "kl_loss_10": 101.5184169769287, "kl_loss_2": 1460.6539428710937, "kl_loss_3": 1032.1658905029296, "kl_loss_7": 323.4354415893555, "learning_rate": 0.00014926255614683932, "loss": 753.1442, "step": 7500 }, { "ce_loss_10": 3.5847999691963195, "ce_loss_13": 3.5361611366271974, "ce_loss_2": 4.221375560760498, "ce_loss_3": 4.00215779542923, "ce_loss_7": 3.6820521473884584, "epoch": 0.751, "grad_norm": 648.0, "kl_loss_10": 100.41216621398925, "kl_loss_2": 1460.0320617675782, "kl_loss_3": 1023.7983551025391, "kl_loss_7": 318.2928298950195, "learning_rate": 0.0001481335185648498, "loss": 738.1113, "step": 7510 }, { "ce_loss_10": 3.586812174320221, "ce_loss_13": 3.539919674396515, "ce_loss_2": 4.226488614082337, "ce_loss_3": 4.01739672422409, "ce_loss_7": 3.6843767523765565, "epoch": 0.752, "grad_norm": 664.0, "kl_loss_10": 99.0669017791748, "kl_loss_2": 1466.2590270996093, "kl_loss_3": 1038.5502990722657, "kl_loss_7": 319.99786376953125, "learning_rate": 0.0001470080242744218, "loss": 726.2307, "step": 7520 }, { "ce_loss_10": 3.585355854034424, "ce_loss_13": 3.5357612133026124, "ce_loss_2": 4.235686790943146, "ce_loss_3": 4.018222296237946, "ce_loss_7": 3.677559757232666, "epoch": 0.753, "grad_norm": 668.0, "kl_loss_10": 98.61138343811035, "kl_loss_2": 1471.2027954101563, "kl_loss_3": 1037.068017578125, "kl_loss_7": 315.84401550292966, "learning_rate": 0.0001458860846092705, "loss": 736.6536, "step": 7530 }, { "ce_loss_10": 3.619723081588745, "ce_loss_13": 3.571056568622589, "ce_loss_2": 4.24805052280426, "ce_loss_3": 4.035051429271698, "ce_loss_7": 3.7132526636123657, "epoch": 0.754, "grad_norm": 564.0, "kl_loss_10": 99.40720977783204, "kl_loss_2": 1444.1072082519531, "kl_loss_3": 1016.2197021484375, "kl_loss_7": 315.57659912109375, "learning_rate": 0.00014476771086731566, "loss": 718.0346, "step": 7540 }, { "ce_loss_10": 3.7294838428497314, "ce_loss_13": 3.6771111130714416, "ce_loss_2": 4.3626165866851805, "ce_loss_3": 4.154290568828583, "ce_loss_7": 3.827013409137726, "epoch": 0.755, "grad_norm": 652.0, "kl_loss_10": 103.24525947570801, "kl_loss_2": 1451.2049865722656, "kl_loss_3": 1026.5278259277343, "kl_loss_7": 322.3322021484375, "learning_rate": 0.00014365291431056872, "loss": 748.3513, "step": 7550 }, { "ce_loss_10": 3.554888129234314, "ce_loss_13": 3.5024208307266234, "ce_loss_2": 4.216960692405701, "ce_loss_3": 3.996437156200409, "ce_loss_7": 3.659726858139038, "epoch": 0.756, "grad_norm": 652.0, "kl_loss_10": 102.81799125671387, "kl_loss_2": 1517.4044860839845, "kl_loss_3": 1065.2943908691407, "kl_loss_7": 330.98865203857423, "learning_rate": 0.00014254170616501827, "loss": 743.0686, "step": 7560 }, { "ce_loss_10": 3.486711573600769, "ce_loss_13": 3.43492249250412, "ce_loss_2": 4.17220014333725, "ce_loss_3": 3.950027620792389, "ce_loss_7": 3.59212349653244, "epoch": 0.757, "grad_norm": 920.0, "kl_loss_10": 101.8103126525879, "kl_loss_2": 1548.1325256347657, "kl_loss_3": 1098.6501068115235, "kl_loss_7": 331.817610168457, "learning_rate": 0.0001414340976205183, "loss": 766.8574, "step": 7570 }, { "ce_loss_10": 3.500650465488434, "ce_loss_13": 3.4508027911186216, "ce_loss_2": 4.17868949174881, "ce_loss_3": 3.945530080795288, "ce_loss_7": 3.6048083305358887, "epoch": 0.758, "grad_norm": 868.0, "kl_loss_10": 99.97529563903808, "kl_loss_2": 1518.4486083984375, "kl_loss_3": 1058.0074951171875, "kl_loss_7": 325.02918243408203, "learning_rate": 0.00014033009983067452, "loss": 743.5407, "step": 7580 }, { "ce_loss_10": 3.668207585811615, "ce_loss_13": 3.620664989948273, "ce_loss_2": 4.288976764678955, "ce_loss_3": 4.081995344161987, "ce_loss_7": 3.7634514570236206, "epoch": 0.759, "grad_norm": 616.0, "kl_loss_10": 99.13009910583496, "kl_loss_2": 1423.26025390625, "kl_loss_3": 1000.9866973876954, "kl_loss_7": 310.7435531616211, "learning_rate": 0.00013922972391273224, "loss": 723.2811, "step": 7590 }, { "ce_loss_10": 3.6714157700538634, "ce_loss_13": 3.622804617881775, "ce_loss_2": 4.317670297622681, "ce_loss_3": 4.092569124698639, "ce_loss_7": 3.769862473011017, "epoch": 0.76, "grad_norm": 628.0, "kl_loss_10": 100.62276115417481, "kl_loss_2": 1457.4623901367188, "kl_loss_3": 1011.4762512207031, "kl_loss_7": 317.48037109375, "learning_rate": 0.0001381329809474649, "loss": 730.9679, "step": 7600 }, { "ce_loss_10": 3.5676814198493956, "ce_loss_13": 3.5183636069297792, "ce_loss_2": 4.249255347251892, "ce_loss_3": 4.022576558589935, "ce_loss_7": 3.6742392659187315, "epoch": 0.761, "grad_norm": 788.0, "kl_loss_10": 102.64234657287598, "kl_loss_2": 1532.18251953125, "kl_loss_3": 1073.4588409423827, "kl_loss_7": 328.63363494873045, "learning_rate": 0.0001370398819790621, "loss": 753.1651, "step": 7610 }, { "ce_loss_10": 3.709104669094086, "ce_loss_13": 3.6601706624031065, "ce_loss_2": 4.330952978134155, "ce_loss_3": 4.1265774130821224, "ce_loss_7": 3.805286169052124, "epoch": 0.762, "grad_norm": 668.0, "kl_loss_10": 101.34319152832032, "kl_loss_2": 1416.3973266601563, "kl_loss_3": 1008.2008361816406, "kl_loss_7": 314.8024185180664, "learning_rate": 0.00013595043801501794, "loss": 710.8723, "step": 7620 }, { "ce_loss_10": 3.5045378684997557, "ce_loss_13": 3.4562175273895264, "ce_loss_2": 4.197242736816406, "ce_loss_3": 3.9669162034988403, "ce_loss_7": 3.6095515727996825, "epoch": 0.763, "grad_norm": 684.0, "kl_loss_10": 100.82436599731446, "kl_loss_2": 1558.5006103515625, "kl_loss_3": 1095.090069580078, "kl_loss_7": 328.63230590820314, "learning_rate": 0.00013486466002602133, "loss": 752.6841, "step": 7630 }, { "ce_loss_10": 3.6230127215385437, "ce_loss_13": 3.5728342533111572, "ce_loss_2": 4.249013924598694, "ce_loss_3": 4.039559626579285, "ce_loss_7": 3.717764842510223, "epoch": 0.764, "grad_norm": 584.0, "kl_loss_10": 101.1651824951172, "kl_loss_2": 1438.4548400878907, "kl_loss_3": 1010.2212646484375, "kl_loss_7": 316.61218414306643, "learning_rate": 0.00013378255894584462, "loss": 743.1861, "step": 7640 }, { "ce_loss_10": 3.553361701965332, "ce_loss_13": 3.501368546485901, "ce_loss_2": 4.211768960952758, "ce_loss_3": 3.9971181631088255, "ce_loss_7": 3.6566309213638304, "epoch": 0.765, "grad_norm": 560.0, "kl_loss_10": 101.52089576721191, "kl_loss_2": 1493.2127807617187, "kl_loss_3": 1057.4331665039062, "kl_loss_7": 327.0298675537109, "learning_rate": 0.0001327041456712334, "loss": 744.0121, "step": 7650 }, { "ce_loss_10": 3.5922834157943724, "ce_loss_13": 3.5425678372383116, "ce_loss_2": 4.237431991100311, "ce_loss_3": 4.028069686889649, "ce_loss_7": 3.690912353992462, "epoch": 0.766, "grad_norm": 892.0, "kl_loss_10": 101.17123641967774, "kl_loss_2": 1478.9973693847655, "kl_loss_3": 1055.1416595458984, "kl_loss_7": 325.1578964233398, "learning_rate": 0.00013162943106179747, "loss": 745.5192, "step": 7660 }, { "ce_loss_10": 3.5697691679000854, "ce_loss_13": 3.5214869737625123, "ce_loss_2": 4.2154614448547365, "ce_loss_3": 3.9985948324203493, "ce_loss_7": 3.6656860232353212, "epoch": 0.767, "grad_norm": 648.0, "kl_loss_10": 102.13901824951172, "kl_loss_2": 1477.124578857422, "kl_loss_3": 1039.877670288086, "kl_loss_7": 320.97084350585936, "learning_rate": 0.00013055842593990132, "loss": 734.1866, "step": 7670 }, { "ce_loss_10": 3.5158339023590086, "ce_loss_13": 3.4678486704826357, "ce_loss_2": 4.164434158802033, "ce_loss_3": 3.9516779661178587, "ce_loss_7": 3.618457889556885, "epoch": 0.768, "grad_norm": 604.0, "kl_loss_10": 98.76402587890625, "kl_loss_2": 1461.6289916992187, "kl_loss_3": 1032.9562957763671, "kl_loss_7": 320.3838912963867, "learning_rate": 0.00012949114109055414, "loss": 745.3294, "step": 7680 }, { "ce_loss_10": 3.5645843982696532, "ce_loss_13": 3.5161487102508544, "ce_loss_2": 4.22154221534729, "ce_loss_3": 4.005855643749237, "ce_loss_7": 3.667157244682312, "epoch": 0.769, "grad_norm": 510.0, "kl_loss_10": 100.61445198059081, "kl_loss_2": 1485.9128967285155, "kl_loss_3": 1048.273974609375, "kl_loss_7": 325.4884323120117, "learning_rate": 0.00012842758726130281, "loss": 747.9092, "step": 7690 }, { "ce_loss_10": 3.602025330066681, "ce_loss_13": 3.5516483426094054, "ce_loss_2": 4.268712973594665, "ce_loss_3": 4.046865451335907, "ce_loss_7": 3.7030147314071655, "epoch": 0.77, "grad_norm": 856.0, "kl_loss_10": 102.05240936279297, "kl_loss_2": 1503.2529357910157, "kl_loss_3": 1051.5912994384767, "kl_loss_7": 326.3058227539062, "learning_rate": 0.00012736777516212267, "loss": 736.4097, "step": 7700 }, { "ce_loss_10": 3.5961975932121275, "ce_loss_13": 3.545725905895233, "ce_loss_2": 4.251299357414245, "ce_loss_3": 4.03817503452301, "ce_loss_7": 3.6982323050498964, "epoch": 0.771, "grad_norm": 680.0, "kl_loss_10": 101.89763069152832, "kl_loss_2": 1489.81259765625, "kl_loss_3": 1057.1007019042968, "kl_loss_7": 329.05216827392576, "learning_rate": 0.00012631171546530968, "loss": 733.0863, "step": 7710 }, { "ce_loss_10": 3.6131365060806275, "ce_loss_13": 3.5616217732429503, "ce_loss_2": 4.2632009267807005, "ce_loss_3": 4.053297114372254, "ce_loss_7": 3.712467277050018, "epoch": 0.772, "grad_norm": 676.0, "kl_loss_10": 104.0031364440918, "kl_loss_2": 1482.6835021972656, "kl_loss_3": 1054.0444396972657, "kl_loss_7": 325.3907302856445, "learning_rate": 0.00012525941880537307, "loss": 750.0256, "step": 7720 }, { "ce_loss_10": 3.6494514107704163, "ce_loss_13": 3.600589466094971, "ce_loss_2": 4.285376441478729, "ce_loss_3": 4.080974972248077, "ce_loss_7": 3.744878327846527, "epoch": 0.773, "grad_norm": 520.0, "kl_loss_10": 100.10319595336914, "kl_loss_2": 1437.9194885253905, "kl_loss_3": 1021.8783477783203, "kl_loss_7": 316.0697509765625, "learning_rate": 0.00012421089577892869, "loss": 728.4293, "step": 7730 }, { "ce_loss_10": 3.5963020086288453, "ce_loss_13": 3.543910026550293, "ce_loss_2": 4.2532641887664795, "ce_loss_3": 4.036566972732544, "ce_loss_7": 3.698932147026062, "epoch": 0.774, "grad_norm": 872.0, "kl_loss_10": 102.04463233947754, "kl_loss_2": 1496.1355529785155, "kl_loss_3": 1055.0438262939454, "kl_loss_7": 327.16393585205077, "learning_rate": 0.0001231661569445919, "loss": 744.5356, "step": 7740 }, { "ce_loss_10": 3.4462903261184694, "ce_loss_13": 3.399669516086578, "ce_loss_2": 4.117041897773743, "ce_loss_3": 3.892062509059906, "ce_loss_7": 3.5477941393852235, "epoch": 0.775, "grad_norm": 1128.0, "kl_loss_10": 98.97135620117187, "kl_loss_2": 1497.6878723144532, "kl_loss_3": 1047.4519134521483, "kl_loss_7": 321.57388458251955, "learning_rate": 0.00012212521282287093, "loss": 750.6002, "step": 7750 }, { "ce_loss_10": 3.607205033302307, "ce_loss_13": 3.555282437801361, "ce_loss_2": 4.251476073265076, "ce_loss_3": 4.040511429309845, "ce_loss_7": 3.710643637180328, "epoch": 0.776, "grad_norm": 968.0, "kl_loss_10": 103.96742973327636, "kl_loss_2": 1468.1371154785156, "kl_loss_3": 1033.768118286133, "kl_loss_7": 326.66978759765624, "learning_rate": 0.00012108807389606158, "loss": 747.9398, "step": 7760 }, { "ce_loss_10": 3.6053456544876097, "ce_loss_13": 3.5565745711326597, "ce_loss_2": 4.248643982410431, "ce_loss_3": 4.0330203652381895, "ce_loss_7": 3.7027784824371337, "epoch": 0.777, "grad_norm": 488.0, "kl_loss_10": 98.77987785339356, "kl_loss_2": 1463.323992919922, "kl_loss_3": 1023.36650390625, "kl_loss_7": 316.96722717285155, "learning_rate": 0.00012005475060814159, "loss": 730.9623, "step": 7770 }, { "ce_loss_10": 3.54359176158905, "ce_loss_13": 3.4953760504722595, "ce_loss_2": 4.208021295070648, "ce_loss_3": 3.9865978002548217, "ce_loss_7": 3.6427852511405945, "epoch": 0.778, "grad_norm": 732.0, "kl_loss_10": 102.18351593017579, "kl_loss_2": 1514.1899291992188, "kl_loss_3": 1066.5852752685546, "kl_loss_7": 326.0293334960937, "learning_rate": 0.00011902525336466464, "loss": 745.7067, "step": 7780 }, { "ce_loss_10": 3.526556408405304, "ce_loss_13": 3.4746569395065308, "ce_loss_2": 4.211039841175079, "ce_loss_3": 3.979912042617798, "ce_loss_7": 3.6299877882003786, "epoch": 0.779, "grad_norm": 704.0, "kl_loss_10": 103.69430274963379, "kl_loss_2": 1539.017315673828, "kl_loss_3": 1078.670361328125, "kl_loss_7": 332.1550918579102, "learning_rate": 0.00011799959253265668, "loss": 746.7819, "step": 7790 }, { "ce_loss_10": 3.590300941467285, "ce_loss_13": 3.5382444262504578, "ce_loss_2": 4.245766711235047, "ce_loss_3": 4.022064113616944, "ce_loss_7": 3.687968409061432, "epoch": 0.78, "grad_norm": 1152.0, "kl_loss_10": 103.51565818786621, "kl_loss_2": 1503.5505065917969, "kl_loss_3": 1051.2292053222657, "kl_loss_7": 325.9313629150391, "learning_rate": 0.00011697777844051105, "loss": 746.1879, "step": 7800 }, { "ce_loss_10": 3.566819489002228, "ce_loss_13": 3.516168200969696, "ce_loss_2": 4.244085478782654, "ce_loss_3": 4.018101978302002, "ce_loss_7": 3.6710123777389527, "epoch": 0.781, "grad_norm": 692.0, "kl_loss_10": 102.14329109191894, "kl_loss_2": 1534.894287109375, "kl_loss_3": 1075.6725128173828, "kl_loss_7": 326.75748443603516, "learning_rate": 0.00011595982137788402, "loss": 751.4992, "step": 7810 }, { "ce_loss_10": 3.5463154554367065, "ce_loss_13": 3.499880874156952, "ce_loss_2": 4.185127007961273, "ce_loss_3": 3.975770080089569, "ce_loss_7": 3.643055832386017, "epoch": 0.782, "grad_norm": 732.0, "kl_loss_10": 98.86368026733399, "kl_loss_2": 1443.6425048828125, "kl_loss_3": 1022.2640747070312, "kl_loss_7": 315.4009216308594, "learning_rate": 0.00011494573159559212, "loss": 732.5146, "step": 7820 }, { "ce_loss_10": 3.5339179754257204, "ce_loss_13": 3.483425164222717, "ce_loss_2": 4.19458544254303, "ce_loss_3": 3.976577842235565, "ce_loss_7": 3.6322914958000183, "epoch": 0.783, "grad_norm": 708.0, "kl_loss_10": 100.64520874023438, "kl_loss_2": 1485.1636657714844, "kl_loss_3": 1051.7114685058593, "kl_loss_7": 320.48876953125, "learning_rate": 0.00011393551930550828, "loss": 753.042, "step": 7830 }, { "ce_loss_10": 3.676271617412567, "ce_loss_13": 3.6245307326316833, "ce_loss_2": 4.311579501628875, "ce_loss_3": 4.105104970932007, "ce_loss_7": 3.7734909415245057, "epoch": 0.784, "grad_norm": 756.0, "kl_loss_10": 102.75973587036133, "kl_loss_2": 1447.6376220703125, "kl_loss_3": 1026.9560760498048, "kl_loss_7": 321.5903793334961, "learning_rate": 0.00011292919468045875, "loss": 732.2318, "step": 7840 }, { "ce_loss_10": 3.6260016798973083, "ce_loss_13": 3.577576553821564, "ce_loss_2": 4.270954990386963, "ce_loss_3": 4.053041040897369, "ce_loss_7": 3.7236294627189634, "epoch": 0.785, "grad_norm": 532.0, "kl_loss_10": 100.46421546936035, "kl_loss_2": 1466.1397766113282, "kl_loss_3": 1033.1141723632813, "kl_loss_7": 320.160466003418, "learning_rate": 0.00011192676785412154, "loss": 727.2934, "step": 7850 }, { "ce_loss_10": 3.567592740058899, "ce_loss_13": 3.5165759563446044, "ce_loss_2": 4.237607002258301, "ce_loss_3": 4.017949235439301, "ce_loss_7": 3.669193887710571, "epoch": 0.786, "grad_norm": 1088.0, "kl_loss_10": 101.7808937072754, "kl_loss_2": 1506.9895568847655, "kl_loss_3": 1056.252407836914, "kl_loss_7": 323.0879531860352, "learning_rate": 0.00011092824892092374, "loss": 741.3501, "step": 7860 }, { "ce_loss_10": 3.4958463072776795, "ce_loss_13": 3.4471227169036864, "ce_loss_2": 4.171995770931244, "ce_loss_3": 3.9428863406181334, "ce_loss_7": 3.5922133922576904, "epoch": 0.787, "grad_norm": 486.0, "kl_loss_10": 99.75861511230468, "kl_loss_2": 1525.08134765625, "kl_loss_3": 1066.3266906738281, "kl_loss_7": 324.98873443603514, "learning_rate": 0.0001099336479359398, "loss": 736.9773, "step": 7870 }, { "ce_loss_10": 3.621865713596344, "ce_loss_13": 3.5758667945861817, "ce_loss_2": 4.252462542057037, "ce_loss_3": 4.045475161075592, "ce_loss_7": 3.7161823749542235, "epoch": 0.788, "grad_norm": 788.0, "kl_loss_10": 99.02849006652832, "kl_loss_2": 1456.160467529297, "kl_loss_3": 1028.9691040039063, "kl_loss_7": 318.2076126098633, "learning_rate": 0.00010894297491479043, "loss": 733.5009, "step": 7880 }, { "ce_loss_10": 3.6141549587249755, "ce_loss_13": 3.5646785616874697, "ce_loss_2": 4.2630019068717955, "ce_loss_3": 4.046338450908661, "ce_loss_7": 3.71095016002655, "epoch": 0.789, "grad_norm": 504.0, "kl_loss_10": 100.84864921569825, "kl_loss_2": 1462.1729553222656, "kl_loss_3": 1029.0127716064453, "kl_loss_7": 319.5621643066406, "learning_rate": 0.00010795623983354214, "loss": 727.5393, "step": 7890 }, { "ce_loss_10": 3.5006146311759947, "ce_loss_13": 3.4506691813468935, "ce_loss_2": 4.166540515422821, "ce_loss_3": 3.94506813287735, "ce_loss_7": 3.6063103675842285, "epoch": 0.79, "grad_norm": 736.0, "kl_loss_10": 102.10223350524902, "kl_loss_2": 1518.7560241699218, "kl_loss_3": 1068.0064361572265, "kl_loss_7": 334.0091354370117, "learning_rate": 0.00010697345262860636, "loss": 741.6417, "step": 7900 }, { "ce_loss_10": 3.6430941581726075, "ce_loss_13": 3.5947277545928955, "ce_loss_2": 4.278507566452026, "ce_loss_3": 4.064696681499481, "ce_loss_7": 3.739503014087677, "epoch": 0.791, "grad_norm": 486.0, "kl_loss_10": 101.41275329589844, "kl_loss_2": 1455.6594116210938, "kl_loss_3": 1022.2336944580078, "kl_loss_7": 318.54153900146486, "learning_rate": 0.00010599462319663906, "loss": 725.699, "step": 7910 }, { "ce_loss_10": 3.6181755542755125, "ce_loss_13": 3.5690415501594543, "ce_loss_2": 4.232064175605774, "ce_loss_3": 4.028841197490692, "ce_loss_7": 3.714269697666168, "epoch": 0.792, "grad_norm": 478.0, "kl_loss_10": 99.3538745880127, "kl_loss_2": 1412.8293518066407, "kl_loss_3": 1003.182406616211, "kl_loss_7": 312.8563522338867, "learning_rate": 0.00010501976139444191, "loss": 721.0695, "step": 7920 }, { "ce_loss_10": 3.6500046491622924, "ce_loss_13": 3.601353681087494, "ce_loss_2": 4.279228281974793, "ce_loss_3": 4.072514867782592, "ce_loss_7": 3.7437435388565063, "epoch": 0.793, "grad_norm": 1536.0, "kl_loss_10": 101.22099380493164, "kl_loss_2": 1443.7253784179688, "kl_loss_3": 1025.7533111572266, "kl_loss_7": 315.55796508789064, "learning_rate": 0.0001040488770388625, "loss": 737.0972, "step": 7930 }, { "ce_loss_10": 3.5919533133506776, "ce_loss_13": 3.5442355036735536, "ce_loss_2": 4.2396332144737245, "ce_loss_3": 4.025132322311402, "ce_loss_7": 3.689213979244232, "epoch": 0.794, "grad_norm": 856.0, "kl_loss_10": 101.47716064453125, "kl_loss_2": 1475.1850891113281, "kl_loss_3": 1045.1036376953125, "kl_loss_7": 324.41676330566406, "learning_rate": 0.00010308197990669538, "loss": 735.8634, "step": 7940 }, { "ce_loss_10": 3.705140697956085, "ce_loss_13": 3.654864525794983, "ce_loss_2": 4.346729636192322, "ce_loss_3": 4.1319398283958435, "ce_loss_7": 3.8019388556480407, "epoch": 0.795, "grad_norm": 504.0, "kl_loss_10": 103.90359115600586, "kl_loss_2": 1461.9827697753906, "kl_loss_3": 1031.338119506836, "kl_loss_7": 324.3537887573242, "learning_rate": 0.0001021190797345839, "loss": 729.0127, "step": 7950 }, { "ce_loss_10": 3.4266234755516054, "ce_loss_13": 3.37706036567688, "ce_loss_2": 4.114708411693573, "ce_loss_3": 3.888358306884766, "ce_loss_7": 3.533508372306824, "epoch": 0.796, "grad_norm": 756.0, "kl_loss_10": 103.416752243042, "kl_loss_2": 1564.6560913085937, "kl_loss_3": 1095.2656188964843, "kl_loss_7": 337.80855560302734, "learning_rate": 0.00010116018621892236, "loss": 750.1419, "step": 7960 }, { "ce_loss_10": 3.639100730419159, "ce_loss_13": 3.5894752740859985, "ce_loss_2": 4.298265874385834, "ce_loss_3": 4.078330492973327, "ce_loss_7": 3.7383811712265014, "epoch": 0.797, "grad_norm": 684.0, "kl_loss_10": 104.32260398864746, "kl_loss_2": 1499.6624267578125, "kl_loss_3": 1060.7293792724608, "kl_loss_7": 333.2513092041016, "learning_rate": 0.00010020530901575753, "loss": 728.5693, "step": 7970 }, { "ce_loss_10": 3.667653262615204, "ce_loss_13": 3.61742045879364, "ce_loss_2": 4.3062340259552006, "ce_loss_3": 4.096399331092835, "ce_loss_7": 3.7629624128341677, "epoch": 0.798, "grad_norm": 1024.0, "kl_loss_10": 103.5965633392334, "kl_loss_2": 1467.5749816894531, "kl_loss_3": 1040.191195678711, "kl_loss_7": 324.1479202270508, "learning_rate": 9.925445774069231e-05, "loss": 722.177, "step": 7980 }, { "ce_loss_10": 3.6216500282287596, "ce_loss_13": 3.5718342900276183, "ce_loss_2": 4.266017317771912, "ce_loss_3": 4.050320148468018, "ce_loss_7": 3.7188344478607176, "epoch": 0.799, "grad_norm": 516.0, "kl_loss_10": 101.32882881164551, "kl_loss_2": 1456.918896484375, "kl_loss_3": 1025.4983001708983, "kl_loss_7": 319.06079864501953, "learning_rate": 9.830764196878872e-05, "loss": 717.1135, "step": 7990 }, { "ce_loss_10": 3.5583004117012025, "ce_loss_13": 3.513035309314728, "ce_loss_2": 4.212134575843811, "ce_loss_3": 3.9891600012779236, "ce_loss_7": 3.660321295261383, "epoch": 0.8, "grad_norm": 636.0, "kl_loss_10": 100.12886657714844, "kl_loss_2": 1509.6505798339845, "kl_loss_3": 1056.922555541992, "kl_loss_7": 323.2692901611328, "learning_rate": 9.736487123447069e-05, "loss": 740.9455, "step": 8000 }, { "ce_loss_10": 3.5056336641311647, "ce_loss_13": 3.455529284477234, "ce_loss_2": 4.210934901237488, "ce_loss_3": 3.9641587257385256, "ce_loss_7": 3.6030144929885863, "epoch": 0.801, "grad_norm": 792.0, "kl_loss_10": 101.06791076660156, "kl_loss_2": 1595.9190856933594, "kl_loss_3": 1097.0911193847655, "kl_loss_7": 323.61043243408204, "learning_rate": 9.642615503142926e-05, "loss": 759.3992, "step": 8010 }, { "ce_loss_10": 3.577189016342163, "ce_loss_13": 3.526599645614624, "ce_loss_2": 4.230797731876374, "ce_loss_3": 4.0129972219467165, "ce_loss_7": 3.674345350265503, "epoch": 0.802, "grad_norm": 548.0, "kl_loss_10": 99.88861961364746, "kl_loss_2": 1474.8159057617188, "kl_loss_3": 1038.0601318359375, "kl_loss_7": 317.3719146728516, "learning_rate": 9.549150281252633e-05, "loss": 728.5845, "step": 8020 }, { "ce_loss_10": 3.6038238286972044, "ce_loss_13": 3.5542522788047792, "ce_loss_2": 4.252991592884063, "ce_loss_3": 4.034434342384339, "ce_loss_7": 3.70179363489151, "epoch": 0.803, "grad_norm": 636.0, "kl_loss_10": 101.49574432373046, "kl_loss_2": 1472.1244201660156, "kl_loss_3": 1029.3231994628907, "kl_loss_7": 319.5150619506836, "learning_rate": 9.4560923989699e-05, "loss": 740.761, "step": 8030 }, { "ce_loss_10": 3.593788433074951, "ce_loss_13": 3.5424663543701174, "ce_loss_2": 4.245579862594605, "ce_loss_3": 4.028131818771362, "ce_loss_7": 3.6965779423713685, "epoch": 0.804, "grad_norm": 748.0, "kl_loss_10": 101.84553260803223, "kl_loss_2": 1476.931494140625, "kl_loss_3": 1040.8555084228515, "kl_loss_7": 322.7687133789062, "learning_rate": 9.363442793386607e-05, "loss": 747.8585, "step": 8040 }, { "ce_loss_10": 3.5677515029907227, "ce_loss_13": 3.517994499206543, "ce_loss_2": 4.244594371318817, "ce_loss_3": 4.021477830410004, "ce_loss_7": 3.6719501495361326, "epoch": 0.805, "grad_norm": 812.0, "kl_loss_10": 102.14230270385742, "kl_loss_2": 1510.5491027832031, "kl_loss_3": 1068.9244110107422, "kl_loss_7": 332.5317855834961, "learning_rate": 9.271202397483213e-05, "loss": 732.0646, "step": 8050 }, { "ce_loss_10": 3.5933067917823793, "ce_loss_13": 3.545845794677734, "ce_loss_2": 4.2285720109939575, "ce_loss_3": 4.009271323680878, "ce_loss_7": 3.6873164892196657, "epoch": 0.806, "grad_norm": 728.0, "kl_loss_10": 100.23396263122558, "kl_loss_2": 1462.359307861328, "kl_loss_3": 1021.0957550048828, "kl_loss_7": 317.46580200195314, "learning_rate": 9.179372140119524e-05, "loss": 742.199, "step": 8060 }, { "ce_loss_10": 3.5343562006950378, "ce_loss_13": 3.4855529069900513, "ce_loss_2": 4.174920916557312, "ce_loss_3": 3.961995244026184, "ce_loss_7": 3.631755459308624, "epoch": 0.807, "grad_norm": 616.0, "kl_loss_10": 99.79405632019044, "kl_loss_2": 1463.4166015625, "kl_loss_3": 1030.4902770996093, "kl_loss_7": 319.55114898681643, "learning_rate": 9.087952946025175e-05, "loss": 739.1791, "step": 8070 }, { "ce_loss_10": 3.649857831001282, "ce_loss_13": 3.6004006624221803, "ce_loss_2": 4.268086218833924, "ce_loss_3": 4.062866771221161, "ce_loss_7": 3.7416433334350585, "epoch": 0.808, "grad_norm": 816.0, "kl_loss_10": 100.08632011413575, "kl_loss_2": 1419.8275268554687, "kl_loss_3": 1008.4796905517578, "kl_loss_7": 313.4950241088867, "learning_rate": 8.996945735790446e-05, "loss": 729.3666, "step": 8080 }, { "ce_loss_10": 3.5439666390419005, "ce_loss_13": 3.4964158296585084, "ce_loss_2": 4.197232007980347, "ce_loss_3": 3.9758567929267885, "ce_loss_7": 3.6408447265625, "epoch": 0.809, "grad_norm": 932.0, "kl_loss_10": 99.96629028320312, "kl_loss_2": 1496.5753662109375, "kl_loss_3": 1043.68212890625, "kl_loss_7": 319.75955352783205, "learning_rate": 8.906351425856951e-05, "loss": 744.0419, "step": 8090 }, { "ce_loss_10": 3.5307163119316103, "ce_loss_13": 3.4813020825386047, "ce_loss_2": 4.202869343757629, "ce_loss_3": 3.9749444127082825, "ce_loss_7": 3.6287263154983522, "epoch": 0.81, "grad_norm": 540.0, "kl_loss_10": 101.69815940856934, "kl_loss_2": 1528.2784484863282, "kl_loss_3": 1066.2303527832032, "kl_loss_7": 325.8038619995117, "learning_rate": 8.816170928508365e-05, "loss": 748.6136, "step": 8100 }, { "ce_loss_10": 3.490275239944458, "ce_loss_13": 3.4412143349647524, "ce_loss_2": 4.17544139623642, "ce_loss_3": 3.9485397815704344, "ce_loss_7": 3.593135714530945, "epoch": 0.811, "grad_norm": 528.0, "kl_loss_10": 101.71697235107422, "kl_loss_2": 1541.75498046875, "kl_loss_3": 1083.5701934814454, "kl_loss_7": 329.104345703125, "learning_rate": 8.7264051518613e-05, "loss": 750.0558, "step": 8110 }, { "ce_loss_10": 3.587212336063385, "ce_loss_13": 3.5403501272201536, "ce_loss_2": 4.215567100048065, "ce_loss_3": 4.006534945964813, "ce_loss_7": 3.683346486091614, "epoch": 0.812, "grad_norm": 740.0, "kl_loss_10": 99.61520233154297, "kl_loss_2": 1440.5771240234376, "kl_loss_3": 1019.1734283447265, "kl_loss_7": 316.40090026855466, "learning_rate": 8.637054999856148e-05, "loss": 730.2901, "step": 8120 }, { "ce_loss_10": 3.569942498207092, "ce_loss_13": 3.5195496559143065, "ce_loss_2": 4.227549159526825, "ce_loss_3": 4.005132162570954, "ce_loss_7": 3.6685083627700807, "epoch": 0.813, "grad_norm": 724.0, "kl_loss_10": 101.21401519775391, "kl_loss_2": 1486.9437194824218, "kl_loss_3": 1042.5372619628906, "kl_loss_7": 323.7269744873047, "learning_rate": 8.548121372247918e-05, "loss": 749.7126, "step": 8130 }, { "ce_loss_10": 3.6449042439460753, "ce_loss_13": 3.5991093277931214, "ce_loss_2": 4.2855897665023805, "ce_loss_3": 4.066536927223206, "ce_loss_7": 3.7413372159004212, "epoch": 0.814, "grad_norm": 748.0, "kl_loss_10": 100.7046890258789, "kl_loss_2": 1471.275341796875, "kl_loss_3": 1027.591714477539, "kl_loss_7": 315.89408721923826, "learning_rate": 8.459605164597267e-05, "loss": 730.5779, "step": 8140 }, { "ce_loss_10": 3.5314650416374205, "ce_loss_13": 3.482147479057312, "ce_loss_2": 4.184685850143433, "ce_loss_3": 3.9665878653526305, "ce_loss_7": 3.6254254460334776, "epoch": 0.815, "grad_norm": 828.0, "kl_loss_10": 99.40346908569336, "kl_loss_2": 1489.9046752929687, "kl_loss_3": 1046.0586029052733, "kl_loss_7": 320.45188293457034, "learning_rate": 8.371507268261436e-05, "loss": 741.6928, "step": 8150 }, { "ce_loss_10": 3.609076774120331, "ce_loss_13": 3.559982919692993, "ce_loss_2": 4.250039672851562, "ce_loss_3": 4.036680591106415, "ce_loss_7": 3.7110161542892457, "epoch": 0.816, "grad_norm": 494.0, "kl_loss_10": 101.12937507629394, "kl_loss_2": 1477.7734252929688, "kl_loss_3": 1043.9434509277344, "kl_loss_7": 323.91139831542966, "learning_rate": 8.283828570385238e-05, "loss": 719.9689, "step": 8160 }, { "ce_loss_10": 3.608122503757477, "ce_loss_13": 3.5602147459983824, "ce_loss_2": 4.259062159061432, "ce_loss_3": 4.043926072120667, "ce_loss_7": 3.7063713908195495, "epoch": 0.817, "grad_norm": 592.0, "kl_loss_10": 100.46234130859375, "kl_loss_2": 1451.338885498047, "kl_loss_3": 1024.6061248779297, "kl_loss_7": 317.46505279541014, "learning_rate": 8.196569953892202e-05, "loss": 730.805, "step": 8170 }, { "ce_loss_10": 3.520654034614563, "ce_loss_13": 3.4709535002708436, "ce_loss_2": 4.180013179779053, "ce_loss_3": 3.957700550556183, "ce_loss_7": 3.6226252555847167, "epoch": 0.818, "grad_norm": 572.0, "kl_loss_10": 100.23197441101074, "kl_loss_2": 1472.8869079589845, "kl_loss_3": 1034.3491180419921, "kl_loss_7": 320.2228073120117, "learning_rate": 8.109732297475635e-05, "loss": 731.251, "step": 8180 }, { "ce_loss_10": 3.4925883531570436, "ce_loss_13": 3.440473508834839, "ce_loss_2": 4.192221534252167, "ce_loss_3": 3.962784230709076, "ce_loss_7": 3.599685513973236, "epoch": 0.819, "grad_norm": 668.0, "kl_loss_10": 103.05813369750976, "kl_loss_2": 1551.283984375, "kl_loss_3": 1093.7224700927734, "kl_loss_7": 335.3472213745117, "learning_rate": 8.023316475589754e-05, "loss": 754.8944, "step": 8190 }, { "ce_loss_10": 3.4566813707351685, "ce_loss_13": 3.4041911959648132, "ce_loss_2": 4.175233256816864, "ce_loss_3": 3.931389880180359, "ce_loss_7": 3.56941739320755, "epoch": 0.82, "grad_norm": 992.0, "kl_loss_10": 105.4918254852295, "kl_loss_2": 1607.8986328125, "kl_loss_3": 1112.0597839355469, "kl_loss_7": 340.25874481201174, "learning_rate": 7.937323358440934e-05, "loss": 769.7056, "step": 8200 }, { "ce_loss_10": 3.5821478962898254, "ce_loss_13": 3.5370441198349, "ce_loss_2": 4.216321432590485, "ce_loss_3": 4.003733205795288, "ce_loss_7": 3.6785664796829223, "epoch": 0.821, "grad_norm": 668.0, "kl_loss_10": 99.36309585571288, "kl_loss_2": 1455.1914367675781, "kl_loss_3": 1026.6596313476562, "kl_loss_7": 318.3153244018555, "learning_rate": 7.851753811978923e-05, "loss": 733.8741, "step": 8210 }, { "ce_loss_10": 3.605075752735138, "ce_loss_13": 3.553903329372406, "ce_loss_2": 4.264594554901123, "ce_loss_3": 4.0411129832267765, "ce_loss_7": 3.7039045810699465, "epoch": 0.822, "grad_norm": 508.0, "kl_loss_10": 101.19773025512696, "kl_loss_2": 1491.5029541015624, "kl_loss_3": 1034.2482360839845, "kl_loss_7": 319.2388977050781, "learning_rate": 7.766608697888095e-05, "loss": 732.1681, "step": 8220 }, { "ce_loss_10": 3.6139151573181154, "ce_loss_13": 3.56223611831665, "ce_loss_2": 4.260961723327637, "ce_loss_3": 4.040641283988952, "ce_loss_7": 3.7134764909744264, "epoch": 0.823, "grad_norm": 612.0, "kl_loss_10": 104.46126365661621, "kl_loss_2": 1478.3617126464844, "kl_loss_3": 1041.9627655029296, "kl_loss_7": 325.27500915527344, "learning_rate": 7.681888873578785e-05, "loss": 741.8987, "step": 8230 }, { "ce_loss_10": 3.5476807713508607, "ce_loss_13": 3.496246707439423, "ce_loss_2": 4.2170847177505495, "ce_loss_3": 3.9970930576324464, "ce_loss_7": 3.6521557807922362, "epoch": 0.824, "grad_norm": 932.0, "kl_loss_10": 103.55918807983399, "kl_loss_2": 1511.0997924804688, "kl_loss_3": 1064.8140228271484, "kl_loss_7": 330.45691680908203, "learning_rate": 7.597595192178702e-05, "loss": 736.8945, "step": 8240 }, { "ce_loss_10": 3.5364687919616697, "ce_loss_13": 3.4881993770599364, "ce_loss_2": 4.215133023262024, "ce_loss_3": 3.986978304386139, "ce_loss_7": 3.6410319447517394, "epoch": 0.825, "grad_norm": 724.0, "kl_loss_10": 103.43578033447265, "kl_loss_2": 1540.4459716796875, "kl_loss_3": 1086.9694610595702, "kl_loss_7": 334.2783966064453, "learning_rate": 7.513728502524286e-05, "loss": 755.3905, "step": 8250 }, { "ce_loss_10": 3.543540918827057, "ce_loss_13": 3.4973260879516603, "ce_loss_2": 4.188510763645172, "ce_loss_3": 3.970736765861511, "ce_loss_7": 3.6425477981567385, "epoch": 0.826, "grad_norm": 824.0, "kl_loss_10": 97.67796058654785, "kl_loss_2": 1454.1864440917968, "kl_loss_3": 1024.783999633789, "kl_loss_7": 313.1675552368164, "learning_rate": 7.430289649152156e-05, "loss": 738.1198, "step": 8260 }, { "ce_loss_10": 3.442179000377655, "ce_loss_13": 3.3937164187431335, "ce_loss_2": 4.133165848255158, "ce_loss_3": 3.901263117790222, "ce_loss_7": 3.54562885761261, "epoch": 0.827, "grad_norm": 1400.0, "kl_loss_10": 100.55124206542969, "kl_loss_2": 1556.8484802246094, "kl_loss_3": 1085.967514038086, "kl_loss_7": 329.81087188720704, "learning_rate": 7.347279472290646e-05, "loss": 746.1129, "step": 8270 }, { "ce_loss_10": 3.587779426574707, "ce_loss_13": 3.5384023427963256, "ce_loss_2": 4.244775927066803, "ce_loss_3": 4.022708773612976, "ce_loss_7": 3.6883206129074098, "epoch": 0.828, "grad_norm": 1160.0, "kl_loss_10": 101.53954811096192, "kl_loss_2": 1490.7300354003905, "kl_loss_3": 1047.6129608154297, "kl_loss_7": 323.168473815918, "learning_rate": 7.264698807851328e-05, "loss": 743.4324, "step": 8280 }, { "ce_loss_10": 3.554813039302826, "ce_loss_13": 3.509436821937561, "ce_loss_2": 4.1841387271881105, "ce_loss_3": 3.9711859583854676, "ce_loss_7": 3.6489402770996096, "epoch": 0.829, "grad_norm": 652.0, "kl_loss_10": 97.92380104064941, "kl_loss_2": 1442.4901733398438, "kl_loss_3": 1020.077114868164, "kl_loss_7": 312.84254150390626, "learning_rate": 7.182548487420554e-05, "loss": 729.7623, "step": 8290 }, { "ce_loss_10": 3.6034604787826536, "ce_loss_13": 3.5539374232292174, "ce_loss_2": 4.244960236549377, "ce_loss_3": 4.032441568374634, "ce_loss_7": 3.70367169380188, "epoch": 0.83, "grad_norm": 608.0, "kl_loss_10": 101.68586807250976, "kl_loss_2": 1470.0505676269531, "kl_loss_3": 1040.4366333007813, "kl_loss_7": 322.59322052001954, "learning_rate": 7.100829338251146e-05, "loss": 731.4772, "step": 8300 }, { "ce_loss_10": 3.5403619050979613, "ce_loss_13": 3.48887220621109, "ce_loss_2": 4.214012861251831, "ce_loss_3": 3.989605951309204, "ce_loss_7": 3.6424837946891784, "epoch": 0.831, "grad_norm": 1088.0, "kl_loss_10": 102.52384452819824, "kl_loss_2": 1515.7459533691406, "kl_loss_3": 1066.7195373535155, "kl_loss_7": 328.8053894042969, "learning_rate": 7.019542183254046e-05, "loss": 737.7286, "step": 8310 }, { "ce_loss_10": 3.579847252368927, "ce_loss_13": 3.528160297870636, "ce_loss_2": 4.225255382061005, "ce_loss_3": 4.006514894962311, "ce_loss_7": 3.6832633018493652, "epoch": 0.832, "grad_norm": 720.0, "kl_loss_10": 105.29637870788574, "kl_loss_2": 1482.731707763672, "kl_loss_3": 1043.4006561279298, "kl_loss_7": 329.25179290771484, "learning_rate": 6.938687840989971e-05, "loss": 737.84, "step": 8320 }, { "ce_loss_10": 3.515520715713501, "ce_loss_13": 3.465034031867981, "ce_loss_2": 4.166922867298126, "ce_loss_3": 3.951807737350464, "ce_loss_7": 3.6140180706977842, "epoch": 0.833, "grad_norm": 796.0, "kl_loss_10": 101.97601776123047, "kl_loss_2": 1471.0227416992188, "kl_loss_3": 1041.632534790039, "kl_loss_7": 324.6424224853516, "learning_rate": 6.858267125661271e-05, "loss": 740.3374, "step": 8330 }, { "ce_loss_10": 3.5782296538352965, "ce_loss_13": 3.5290733695030214, "ce_loss_2": 4.234528493881226, "ce_loss_3": 4.0184645652771, "ce_loss_7": 3.6785075187683107, "epoch": 0.834, "grad_norm": 1192.0, "kl_loss_10": 99.59610710144042, "kl_loss_2": 1471.9729919433594, "kl_loss_3": 1036.3682800292968, "kl_loss_7": 319.3002197265625, "learning_rate": 6.778280847103668e-05, "loss": 750.9748, "step": 8340 }, { "ce_loss_10": 3.5899698972702025, "ce_loss_13": 3.5369165778160094, "ce_loss_2": 4.243354654312133, "ce_loss_3": 4.024777829647064, "ce_loss_7": 3.6916509747505186, "epoch": 0.835, "grad_norm": 496.0, "kl_loss_10": 103.29425773620605, "kl_loss_2": 1503.0087951660157, "kl_loss_3": 1054.3375610351563, "kl_loss_7": 327.5699966430664, "learning_rate": 6.698729810778065e-05, "loss": 739.9958, "step": 8350 }, { "ce_loss_10": 3.496941828727722, "ce_loss_13": 3.4473833560943605, "ce_loss_2": 4.16062775850296, "ce_loss_3": 3.938363790512085, "ce_loss_7": 3.5968827605247498, "epoch": 0.836, "grad_norm": 880.0, "kl_loss_10": 99.08586883544922, "kl_loss_2": 1500.0357360839844, "kl_loss_3": 1055.4989624023438, "kl_loss_7": 320.9412338256836, "learning_rate": 6.619614817762538e-05, "loss": 742.115, "step": 8360 }, { "ce_loss_10": 3.455747163295746, "ce_loss_13": 3.408223581314087, "ce_loss_2": 4.163050651550293, "ce_loss_3": 3.9264211773872377, "ce_loss_7": 3.5644610047340395, "epoch": 0.837, "grad_norm": 860.0, "kl_loss_10": 99.9547508239746, "kl_loss_2": 1569.7484130859375, "kl_loss_3": 1102.0569274902343, "kl_loss_7": 331.30985565185546, "learning_rate": 6.540936664744196e-05, "loss": 756.3589, "step": 8370 }, { "ce_loss_10": 3.611568200588226, "ce_loss_13": 3.562069320678711, "ce_loss_2": 4.274012577533722, "ce_loss_3": 4.052524602413177, "ce_loss_7": 3.7125937700271607, "epoch": 0.838, "grad_norm": 484.0, "kl_loss_10": 102.25545883178711, "kl_loss_2": 1493.2916687011718, "kl_loss_3": 1054.4226837158203, "kl_loss_7": 322.77488708496094, "learning_rate": 6.462696144011149e-05, "loss": 732.0412, "step": 8380 }, { "ce_loss_10": 3.5595585227012636, "ce_loss_13": 3.5099167704582213, "ce_loss_2": 4.214264965057373, "ce_loss_3": 3.996537780761719, "ce_loss_7": 3.661302852630615, "epoch": 0.839, "grad_norm": 1004.0, "kl_loss_10": 103.47585754394531, "kl_loss_2": 1488.425274658203, "kl_loss_3": 1049.5533325195313, "kl_loss_7": 327.769953918457, "learning_rate": 6.384894043444567e-05, "loss": 731.9472, "step": 8390 }, { "ce_loss_10": 3.589981698989868, "ce_loss_13": 3.5403080940246583, "ce_loss_2": 4.256827044486999, "ce_loss_3": 4.035109913349151, "ce_loss_7": 3.6935128450393675, "epoch": 0.84, "grad_norm": 900.0, "kl_loss_10": 101.94223175048828, "kl_loss_2": 1498.7192504882812, "kl_loss_3": 1049.6434509277344, "kl_loss_7": 322.99024658203126, "learning_rate": 6.307531146510753e-05, "loss": 733.4202, "step": 8400 }, { "ce_loss_10": 3.568012523651123, "ce_loss_13": 3.5185906171798704, "ce_loss_2": 4.206760311126709, "ce_loss_3": 3.9952922821044923, "ce_loss_7": 3.6660799384117126, "epoch": 0.841, "grad_norm": 1584.0, "kl_loss_10": 101.2415885925293, "kl_loss_2": 1459.6966186523437, "kl_loss_3": 1029.4983947753906, "kl_loss_7": 321.6950378417969, "learning_rate": 6.230608232253226e-05, "loss": 725.581, "step": 8410 }, { "ce_loss_10": 3.523574101924896, "ce_loss_13": 3.474301612377167, "ce_loss_2": 4.205008840560913, "ce_loss_3": 3.979651963710785, "ce_loss_7": 3.6246840715408326, "epoch": 0.842, "grad_norm": 728.0, "kl_loss_10": 100.59882545471191, "kl_loss_2": 1532.942822265625, "kl_loss_3": 1073.0771911621093, "kl_loss_7": 326.88155670166014, "learning_rate": 6.154126075284855e-05, "loss": 737.5785, "step": 8420 }, { "ce_loss_10": 3.6176608920097353, "ce_loss_13": 3.5695696473121643, "ce_loss_2": 4.2550837874412535, "ce_loss_3": 4.040318775177002, "ce_loss_7": 3.71435227394104, "epoch": 0.843, "grad_norm": 748.0, "kl_loss_10": 98.53165054321289, "kl_loss_2": 1432.1840270996095, "kl_loss_3": 1011.589013671875, "kl_loss_7": 314.44939880371095, "learning_rate": 6.078085445780129e-05, "loss": 714.9252, "step": 8430 }, { "ce_loss_10": 3.6224847435951233, "ce_loss_13": 3.5747481107711794, "ce_loss_2": 4.285443568229676, "ce_loss_3": 4.062715935707092, "ce_loss_7": 3.7213473081588746, "epoch": 0.844, "grad_norm": 752.0, "kl_loss_10": 101.92573623657226, "kl_loss_2": 1492.1762939453124, "kl_loss_3": 1050.4893585205077, "kl_loss_7": 323.04198303222654, "learning_rate": 6.002487109467347e-05, "loss": 730.756, "step": 8440 }, { "ce_loss_10": 3.6310657382011415, "ce_loss_13": 3.5825918436050417, "ce_loss_2": 4.264392173290252, "ce_loss_3": 4.055971515178681, "ce_loss_7": 3.731156384944916, "epoch": 0.845, "grad_norm": 604.0, "kl_loss_10": 101.95924758911133, "kl_loss_2": 1468.8117980957031, "kl_loss_3": 1035.8216339111327, "kl_loss_7": 324.88834381103516, "learning_rate": 5.927331827620902e-05, "loss": 730.5096, "step": 8450 }, { "ce_loss_10": 3.617450773715973, "ce_loss_13": 3.5687082529067995, "ce_loss_2": 4.235312438011169, "ce_loss_3": 4.038230431079865, "ce_loss_7": 3.7163636445999146, "epoch": 0.846, "grad_norm": 510.0, "kl_loss_10": 98.62048377990723, "kl_loss_2": 1418.9692993164062, "kl_loss_3": 1014.9820068359375, "kl_loss_7": 317.5827911376953, "learning_rate": 5.852620357053651e-05, "loss": 728.3835, "step": 8460 }, { "ce_loss_10": 3.6519840598106383, "ce_loss_13": 3.6037726163864137, "ce_loss_2": 4.284592986106873, "ce_loss_3": 4.07490748167038, "ce_loss_7": 3.7498626708984375, "epoch": 0.847, "grad_norm": 676.0, "kl_loss_10": 98.99671249389648, "kl_loss_2": 1437.5142944335937, "kl_loss_3": 1015.3446014404296, "kl_loss_7": 314.9153533935547, "learning_rate": 5.778353450109286e-05, "loss": 724.4845, "step": 8470 }, { "ce_loss_10": 3.6935429334640504, "ce_loss_13": 3.642084562778473, "ce_loss_2": 4.339848303794861, "ce_loss_3": 4.125913679599762, "ce_loss_7": 3.792868947982788, "epoch": 0.848, "grad_norm": 928.0, "kl_loss_10": 103.5403491973877, "kl_loss_2": 1480.7773193359376, "kl_loss_3": 1043.4694580078126, "kl_loss_7": 324.63964080810547, "learning_rate": 5.7045318546547206e-05, "loss": 735.9525, "step": 8480 }, { "ce_loss_10": 3.5869197249412537, "ce_loss_13": 3.536722409725189, "ce_loss_2": 4.23733344078064, "ce_loss_3": 4.0254469275474545, "ce_loss_7": 3.682870125770569, "epoch": 0.849, "grad_norm": 884.0, "kl_loss_10": 101.59747047424317, "kl_loss_2": 1488.824969482422, "kl_loss_3": 1052.1211700439453, "kl_loss_7": 320.6091278076172, "learning_rate": 5.631156314072605e-05, "loss": 729.1527, "step": 8490 }, { "ce_loss_10": 3.59784597158432, "ce_loss_13": 3.5493574023246763, "ce_loss_2": 4.225604259967804, "ce_loss_3": 4.016707766056061, "ce_loss_7": 3.6973516941070557, "epoch": 0.85, "grad_norm": 724.0, "kl_loss_10": 99.15880241394044, "kl_loss_2": 1446.5472595214844, "kl_loss_3": 1020.590933227539, "kl_loss_7": 315.5006790161133, "learning_rate": 5.5582275672538315e-05, "loss": 721.3334, "step": 8500 }, { "ce_loss_10": 3.5188814759254456, "ce_loss_13": 3.466568958759308, "ce_loss_2": 4.220649135112763, "ce_loss_3": 3.988197946548462, "ce_loss_7": 3.6279117465019226, "epoch": 0.851, "grad_norm": 644.0, "kl_loss_10": 104.65736503601075, "kl_loss_2": 1563.5226318359375, "kl_loss_3": 1094.2188354492187, "kl_loss_7": 332.27506866455076, "learning_rate": 5.4857463485900484e-05, "loss": 756.6825, "step": 8510 }, { "ce_loss_10": 3.5718650341033937, "ce_loss_13": 3.5233447551727295, "ce_loss_2": 4.217866456508636, "ce_loss_3": 4.002499830722809, "ce_loss_7": 3.674088108539581, "epoch": 0.852, "grad_norm": 760.0, "kl_loss_10": 99.97307891845703, "kl_loss_2": 1466.6584777832031, "kl_loss_3": 1031.4493774414063, "kl_loss_7": 321.10572357177733, "learning_rate": 5.413713387966329e-05, "loss": 730.4219, "step": 8520 }, { "ce_loss_10": 3.5936383962631226, "ce_loss_13": 3.5434906244277955, "ce_loss_2": 4.252785682678223, "ce_loss_3": 4.030806457996368, "ce_loss_7": 3.6913840889930727, "epoch": 0.853, "grad_norm": 912.0, "kl_loss_10": 102.96573944091797, "kl_loss_2": 1493.1239440917968, "kl_loss_3": 1051.283901977539, "kl_loss_7": 323.63524017333987, "learning_rate": 5.34212941075381e-05, "loss": 740.0517, "step": 8530 }, { "ce_loss_10": 3.6050042867660523, "ce_loss_13": 3.5580546259880066, "ce_loss_2": 4.2372880458831785, "ce_loss_3": 4.0220554828643795, "ce_loss_7": 3.6978816747665406, "epoch": 0.854, "grad_norm": 1320.0, "kl_loss_10": 98.43786392211913, "kl_loss_2": 1457.6458251953125, "kl_loss_3": 1021.0993408203125, "kl_loss_7": 312.4445114135742, "learning_rate": 5.270995137802315e-05, "loss": 724.5521, "step": 8540 }, { "ce_loss_10": 3.5337819814682008, "ce_loss_13": 3.4898949384689333, "ce_loss_2": 4.187564945220947, "ce_loss_3": 3.965048086643219, "ce_loss_7": 3.6320536017417906, "epoch": 0.855, "grad_norm": 540.0, "kl_loss_10": 98.11261672973633, "kl_loss_2": 1487.168194580078, "kl_loss_3": 1036.4231323242188, "kl_loss_7": 319.65953521728517, "learning_rate": 5.2003112854332125e-05, "loss": 737.7947, "step": 8550 }, { "ce_loss_10": 3.5365186452865602, "ce_loss_13": 3.486772668361664, "ce_loss_2": 4.17830080986023, "ce_loss_3": 3.9645521521568297, "ce_loss_7": 3.634303534030914, "epoch": 0.856, "grad_norm": 664.0, "kl_loss_10": 99.88435516357421, "kl_loss_2": 1469.8387268066406, "kl_loss_3": 1036.522396850586, "kl_loss_7": 318.1914260864258, "learning_rate": 5.130078565432089e-05, "loss": 721.3629, "step": 8560 }, { "ce_loss_10": 3.612158238887787, "ce_loss_13": 3.5638707756996153, "ce_loss_2": 4.241634714603424, "ce_loss_3": 4.029456174373626, "ce_loss_7": 3.7070309281349183, "epoch": 0.857, "grad_norm": 536.0, "kl_loss_10": 99.21782760620117, "kl_loss_2": 1455.7318054199218, "kl_loss_3": 1023.9639678955078, "kl_loss_7": 312.6913131713867, "learning_rate": 5.060297685041659e-05, "loss": 717.3177, "step": 8570 }, { "ce_loss_10": 3.5389069318771362, "ce_loss_13": 3.489778387546539, "ce_loss_2": 4.202560603618622, "ce_loss_3": 3.9817099809646606, "ce_loss_7": 3.6424978494644167, "epoch": 0.858, "grad_norm": 940.0, "kl_loss_10": 102.62759246826172, "kl_loss_2": 1504.5575073242187, "kl_loss_3": 1059.7657592773437, "kl_loss_7": 328.3709991455078, "learning_rate": 4.99096934695461e-05, "loss": 749.7916, "step": 8580 }, { "ce_loss_10": 3.600894570350647, "ce_loss_13": 3.5491868257522583, "ce_loss_2": 4.255689239501953, "ce_loss_3": 4.038812637329102, "ce_loss_7": 3.700803780555725, "epoch": 0.859, "grad_norm": 460.0, "kl_loss_10": 101.148779296875, "kl_loss_2": 1474.426251220703, "kl_loss_3": 1037.391323852539, "kl_loss_7": 320.32366638183595, "learning_rate": 4.922094249306558e-05, "loss": 723.4629, "step": 8590 }, { "ce_loss_10": 3.623567187786102, "ce_loss_13": 3.575220191478729, "ce_loss_2": 4.270040154457092, "ce_loss_3": 4.056743347644806, "ce_loss_7": 3.725175881385803, "epoch": 0.86, "grad_norm": 732.0, "kl_loss_10": 102.33082809448243, "kl_loss_2": 1460.0979064941407, "kl_loss_3": 1026.9369049072266, "kl_loss_7": 321.9026504516602, "learning_rate": 4.853673085668947e-05, "loss": 718.3957, "step": 8600 }, { "ce_loss_10": 3.652418076992035, "ce_loss_13": 3.6025752305984495, "ce_loss_2": 4.3013145446777346, "ce_loss_3": 4.085046648979187, "ce_loss_7": 3.75024756193161, "epoch": 0.861, "grad_norm": 644.0, "kl_loss_10": 101.83001403808593, "kl_loss_2": 1473.394305419922, "kl_loss_3": 1034.6163208007813, "kl_loss_7": 318.5129333496094, "learning_rate": 4.78570654504214e-05, "loss": 736.1691, "step": 8610 }, { "ce_loss_10": 3.592516303062439, "ce_loss_13": 3.5425866723060606, "ce_loss_2": 4.245881724357605, "ce_loss_3": 4.029755437374115, "ce_loss_7": 3.6929439306259155, "epoch": 0.862, "grad_norm": 720.0, "kl_loss_10": 100.39467163085938, "kl_loss_2": 1487.7456420898438, "kl_loss_3": 1053.1868133544922, "kl_loss_7": 322.69295806884764, "learning_rate": 4.7181953118484556e-05, "loss": 741.1623, "step": 8620 }, { "ce_loss_10": 3.620820641517639, "ce_loss_13": 3.5733582019805907, "ce_loss_2": 4.259818744659424, "ce_loss_3": 4.044246184825897, "ce_loss_7": 3.722467541694641, "epoch": 0.863, "grad_norm": 552.0, "kl_loss_10": 99.31918983459472, "kl_loss_2": 1435.6528381347657, "kl_loss_3": 1018.1011322021484, "kl_loss_7": 316.54356384277344, "learning_rate": 4.651140065925269e-05, "loss": 738.1011, "step": 8630 }, { "ce_loss_10": 3.551366376876831, "ce_loss_13": 3.5010396957397463, "ce_loss_2": 4.206502521038056, "ce_loss_3": 3.9842041730880737, "ce_loss_7": 3.6545602917671203, "epoch": 0.864, "grad_norm": 768.0, "kl_loss_10": 101.73706588745117, "kl_loss_2": 1484.758477783203, "kl_loss_3": 1041.09755859375, "kl_loss_7": 322.01622161865237, "learning_rate": 4.58454148251814e-05, "loss": 747.5355, "step": 8640 }, { "ce_loss_10": 3.5697497367858886, "ce_loss_13": 3.5183204889297484, "ce_loss_2": 4.238825786113739, "ce_loss_3": 4.02286514043808, "ce_loss_7": 3.6690367460250854, "epoch": 0.865, "grad_norm": 440.0, "kl_loss_10": 101.12740707397461, "kl_loss_2": 1518.5367431640625, "kl_loss_3": 1071.8171813964843, "kl_loss_7": 324.78069458007815, "learning_rate": 4.518400232274078e-05, "loss": 743.0739, "step": 8650 }, { "ce_loss_10": 3.5850109577178957, "ce_loss_13": 3.5342284798622132, "ce_loss_2": 4.2288681149482725, "ce_loss_3": 4.020788443088532, "ce_loss_7": 3.688563084602356, "epoch": 0.866, "grad_norm": 688.0, "kl_loss_10": 102.68562927246094, "kl_loss_2": 1457.446112060547, "kl_loss_3": 1036.6482788085937, "kl_loss_7": 326.4265655517578, "learning_rate": 4.452716981234745e-05, "loss": 717.7304, "step": 8660 }, { "ce_loss_10": 3.5618560314178467, "ce_loss_13": 3.5160150289535523, "ce_loss_2": 4.20790159702301, "ce_loss_3": 3.9910016536712645, "ce_loss_7": 3.6588242411613465, "epoch": 0.867, "grad_norm": 440.0, "kl_loss_10": 98.75237731933593, "kl_loss_2": 1466.609307861328, "kl_loss_3": 1035.6744049072265, "kl_loss_7": 317.27842102050784, "learning_rate": 4.3874923908297335e-05, "loss": 723.6885, "step": 8670 }, { "ce_loss_10": 3.6163036584854127, "ce_loss_13": 3.565244710445404, "ce_loss_2": 4.271518409252167, "ce_loss_3": 4.051812696456909, "ce_loss_7": 3.7126657605171203, "epoch": 0.868, "grad_norm": 584.0, "kl_loss_10": 102.75019416809081, "kl_loss_2": 1486.28642578125, "kl_loss_3": 1047.1869110107423, "kl_loss_7": 322.0906280517578, "learning_rate": 4.322727117869951e-05, "loss": 734.895, "step": 8680 }, { "ce_loss_10": 3.620135450363159, "ce_loss_13": 3.569386053085327, "ce_loss_2": 4.275386321544647, "ce_loss_3": 4.05297178030014, "ce_loss_7": 3.7197634100914003, "epoch": 0.869, "grad_norm": 884.0, "kl_loss_10": 102.7432373046875, "kl_loss_2": 1490.3015441894531, "kl_loss_3": 1046.310073852539, "kl_loss_7": 324.1436370849609, "learning_rate": 4.2584218145409916e-05, "loss": 731.3696, "step": 8690 }, { "ce_loss_10": 3.666629433631897, "ce_loss_13": 3.619381856918335, "ce_loss_2": 4.286886191368103, "ce_loss_3": 4.083159327507019, "ce_loss_7": 3.759081172943115, "epoch": 0.87, "grad_norm": 648.0, "kl_loss_10": 100.43111610412598, "kl_loss_2": 1430.4754821777344, "kl_loss_3": 1015.6041595458985, "kl_loss_7": 316.2693817138672, "learning_rate": 4.194577128396521e-05, "loss": 713.9332, "step": 8700 }, { "ce_loss_10": 3.539565312862396, "ce_loss_13": 3.493181300163269, "ce_loss_2": 4.193097543716431, "ce_loss_3": 3.9723041534423826, "ce_loss_7": 3.6394848108291624, "epoch": 0.871, "grad_norm": 636.0, "kl_loss_10": 99.1823616027832, "kl_loss_2": 1490.4127380371094, "kl_loss_3": 1037.2340850830078, "kl_loss_7": 316.6495758056641, "learning_rate": 4.1311937023518264e-05, "loss": 740.3966, "step": 8710 }, { "ce_loss_10": 3.555456852912903, "ce_loss_13": 3.509356987476349, "ce_loss_2": 4.220394253730774, "ce_loss_3": 3.9831700801849363, "ce_loss_7": 3.650043416023254, "epoch": 0.872, "grad_norm": 624.0, "kl_loss_10": 98.57582550048828, "kl_loss_2": 1504.0806579589844, "kl_loss_3": 1025.9110107421875, "kl_loss_7": 308.2161361694336, "learning_rate": 4.0682721746773344e-05, "loss": 728.1965, "step": 8720 }, { "ce_loss_10": 3.4296391725540163, "ce_loss_13": 3.3807521224021913, "ce_loss_2": 4.1147982478141785, "ce_loss_3": 3.8807220697402953, "ce_loss_7": 3.52937308549881, "epoch": 0.873, "grad_norm": 716.0, "kl_loss_10": 98.89792900085449, "kl_loss_2": 1518.07587890625, "kl_loss_3": 1062.6288024902344, "kl_loss_7": 322.4821182250977, "learning_rate": 4.0058131789920904e-05, "loss": 727.0471, "step": 8730 }, { "ce_loss_10": 3.580957305431366, "ce_loss_13": 3.5321433424949644, "ce_loss_2": 4.2271950244903564, "ce_loss_3": 4.012033867835998, "ce_loss_7": 3.680548095703125, "epoch": 0.874, "grad_norm": 904.0, "kl_loss_10": 99.5214054107666, "kl_loss_2": 1477.0742919921875, "kl_loss_3": 1046.1049407958985, "kl_loss_7": 318.0300659179687, "learning_rate": 3.9438173442575e-05, "loss": 753.8283, "step": 8740 }, { "ce_loss_10": 3.6143658638000487, "ce_loss_13": 3.564062404632568, "ce_loss_2": 4.251032495498658, "ce_loss_3": 4.033862113952637, "ce_loss_7": 3.7102433323860167, "epoch": 0.875, "grad_norm": 788.0, "kl_loss_10": 100.42368049621582, "kl_loss_2": 1456.5761413574219, "kl_loss_3": 1026.1640197753907, "kl_loss_7": 319.6662353515625, "learning_rate": 3.882285294770937e-05, "loss": 728.4841, "step": 8750 }, { "ce_loss_10": 3.5788313865661623, "ce_loss_13": 3.5294269919395447, "ce_loss_2": 4.210182988643647, "ce_loss_3": 3.9981965899467466, "ce_loss_7": 3.674803590774536, "epoch": 0.876, "grad_norm": 640.0, "kl_loss_10": 101.32976112365722, "kl_loss_2": 1458.606463623047, "kl_loss_3": 1028.2858795166017, "kl_loss_7": 320.42979583740237, "learning_rate": 3.821217650159453e-05, "loss": 735.2739, "step": 8760 }, { "ce_loss_10": 3.445225954055786, "ce_loss_13": 3.396264111995697, "ce_loss_2": 4.143158411979675, "ce_loss_3": 3.9164228796958924, "ce_loss_7": 3.553824269771576, "epoch": 0.877, "grad_norm": 996.0, "kl_loss_10": 100.45628967285157, "kl_loss_2": 1550.563232421875, "kl_loss_3": 1090.0193725585937, "kl_loss_7": 330.59093322753904, "learning_rate": 3.760615025373543e-05, "loss": 748.665, "step": 8770 }, { "ce_loss_10": 3.63193439245224, "ce_loss_13": 3.5814425945281982, "ce_loss_2": 4.298359119892121, "ce_loss_3": 4.074527013301849, "ce_loss_7": 3.730543315410614, "epoch": 0.878, "grad_norm": 736.0, "kl_loss_10": 104.41954154968262, "kl_loss_2": 1504.7132080078125, "kl_loss_3": 1056.808139038086, "kl_loss_7": 328.22808227539065, "learning_rate": 3.700478030680987e-05, "loss": 751.1011, "step": 8780 }, { "ce_loss_10": 3.6200098514556887, "ce_loss_13": 3.5731790900230407, "ce_loss_2": 4.260116326808929, "ce_loss_3": 4.046404182910919, "ce_loss_7": 3.717615473270416, "epoch": 0.879, "grad_norm": 468.0, "kl_loss_10": 100.81511688232422, "kl_loss_2": 1453.1581481933595, "kl_loss_3": 1021.8418762207032, "kl_loss_7": 316.85109252929686, "learning_rate": 3.6408072716606344e-05, "loss": 723.9254, "step": 8790 }, { "ce_loss_10": 3.5401777982711793, "ce_loss_13": 3.490380597114563, "ce_loss_2": 4.216920244693756, "ce_loss_3": 3.9922378420829774, "ce_loss_7": 3.6427207231521606, "epoch": 0.88, "grad_norm": 668.0, "kl_loss_10": 102.35791473388672, "kl_loss_2": 1528.781671142578, "kl_loss_3": 1074.3986419677735, "kl_loss_7": 327.72459106445314, "learning_rate": 3.5816033491963716e-05, "loss": 763.6572, "step": 8800 }, { "ce_loss_10": 3.397855484485626, "ce_loss_13": 3.3487441539764404, "ce_loss_2": 4.09263653755188, "ce_loss_3": 3.8519899725914, "ce_loss_7": 3.5014525294303893, "epoch": 0.881, "grad_norm": 724.0, "kl_loss_10": 99.97165641784667, "kl_loss_2": 1538.5061340332031, "kl_loss_3": 1061.687191772461, "kl_loss_7": 318.96228179931643, "learning_rate": 3.522866859471047e-05, "loss": 743.1218, "step": 8810 }, { "ce_loss_10": 3.6399516701698302, "ce_loss_13": 3.5957464814186095, "ce_loss_2": 4.256836426258087, "ce_loss_3": 4.05051851272583, "ce_loss_7": 3.7309115886688233, "epoch": 0.882, "grad_norm": 708.0, "kl_loss_10": 96.69907646179199, "kl_loss_2": 1410.1169921875, "kl_loss_3": 991.3723724365234, "kl_loss_7": 306.3313705444336, "learning_rate": 3.46459839396045e-05, "loss": 718.9224, "step": 8820 }, { "ce_loss_10": 3.5656251907348633, "ce_loss_13": 3.513873982429504, "ce_loss_2": 4.2293421626091, "ce_loss_3": 4.01195923089981, "ce_loss_7": 3.6672701597213746, "epoch": 0.883, "grad_norm": 588.0, "kl_loss_10": 101.94745559692383, "kl_loss_2": 1484.8607177734375, "kl_loss_3": 1047.0061737060546, "kl_loss_7": 322.7177047729492, "learning_rate": 3.406798539427386e-05, "loss": 752.7816, "step": 8830 }, { "ce_loss_10": 3.624855399131775, "ce_loss_13": 3.575716805458069, "ce_loss_2": 4.276913905143738, "ce_loss_3": 4.059955632686615, "ce_loss_7": 3.7224327683448792, "epoch": 0.884, "grad_norm": 1024.0, "kl_loss_10": 100.38882217407226, "kl_loss_2": 1493.7386352539063, "kl_loss_3": 1049.57802734375, "kl_loss_7": 320.48105773925784, "learning_rate": 3.349467877915746e-05, "loss": 739.4407, "step": 8840 }, { "ce_loss_10": 3.578192043304443, "ce_loss_13": 3.530308175086975, "ce_loss_2": 4.249985110759735, "ce_loss_3": 4.025544083118438, "ce_loss_7": 3.67979074716568, "epoch": 0.885, "grad_norm": 620.0, "kl_loss_10": 100.53253707885742, "kl_loss_2": 1528.336981201172, "kl_loss_3": 1070.8029663085938, "kl_loss_7": 325.8035583496094, "learning_rate": 3.292606986744667e-05, "loss": 766.4358, "step": 8850 }, { "ce_loss_10": 3.536137354373932, "ce_loss_13": 3.490540635585785, "ce_loss_2": 4.200462830066681, "ce_loss_3": 3.977449345588684, "ce_loss_7": 3.6323532342910765, "epoch": 0.886, "grad_norm": 612.0, "kl_loss_10": 99.125341796875, "kl_loss_2": 1492.0791320800781, "kl_loss_3": 1047.467221069336, "kl_loss_7": 317.71561431884766, "learning_rate": 3.23621643850267e-05, "loss": 737.4637, "step": 8860 }, { "ce_loss_10": 3.615437150001526, "ce_loss_13": 3.567244231700897, "ce_loss_2": 4.253424096107483, "ce_loss_3": 4.037160444259643, "ce_loss_7": 3.7142882466316225, "epoch": 0.887, "grad_norm": 576.0, "kl_loss_10": 101.8384937286377, "kl_loss_2": 1474.9909973144531, "kl_loss_3": 1041.4495056152343, "kl_loss_7": 324.29286193847656, "learning_rate": 3.180296801041971e-05, "loss": 724.9371, "step": 8870 }, { "ce_loss_10": 3.637635326385498, "ce_loss_13": 3.5926483273506165, "ce_loss_2": 4.285278296470642, "ce_loss_3": 4.062232720851898, "ce_loss_7": 3.7338334441185, "epoch": 0.888, "grad_norm": 580.0, "kl_loss_10": 100.01979370117188, "kl_loss_2": 1470.817236328125, "kl_loss_3": 1021.6034362792968, "kl_loss_7": 316.28428039550784, "learning_rate": 3.124848637472688e-05, "loss": 720.8995, "step": 8880 }, { "ce_loss_10": 3.4564511656761168, "ce_loss_13": 3.40824179649353, "ce_loss_2": 4.124191880226135, "ce_loss_3": 3.8997549533843996, "ce_loss_7": 3.5557323694229126, "epoch": 0.889, "grad_norm": 1432.0, "kl_loss_10": 98.77566146850586, "kl_loss_2": 1508.8233520507813, "kl_loss_3": 1057.6690338134765, "kl_loss_7": 318.5674057006836, "learning_rate": 3.069872506157212e-05, "loss": 736.8049, "step": 8890 }, { "ce_loss_10": 3.5567004561424254, "ce_loss_13": 3.511071038246155, "ce_loss_2": 4.205580937862396, "ce_loss_3": 3.9943366408348084, "ce_loss_7": 3.655029034614563, "epoch": 0.89, "grad_norm": 632.0, "kl_loss_10": 100.13505249023437, "kl_loss_2": 1480.5708862304687, "kl_loss_3": 1044.6940124511718, "kl_loss_7": 321.9190338134766, "learning_rate": 3.0153689607045842e-05, "loss": 729.6326, "step": 8900 }, { "ce_loss_10": 3.4543484330177305, "ce_loss_13": 3.405192255973816, "ce_loss_2": 4.160457563400269, "ce_loss_3": 3.9234394431114197, "ce_loss_7": 3.5612606167793275, "epoch": 0.891, "grad_norm": 604.0, "kl_loss_10": 102.85602722167968, "kl_loss_2": 1579.6288330078125, "kl_loss_3": 1101.6359619140626, "kl_loss_7": 333.17054595947263, "learning_rate": 2.9613385499648926e-05, "loss": 744.3033, "step": 8910 }, { "ce_loss_10": 3.5080823063850404, "ce_loss_13": 3.4615165829658507, "ce_loss_2": 4.163012349605561, "ce_loss_3": 3.947146010398865, "ce_loss_7": 3.609831357002258, "epoch": 0.892, "grad_norm": 804.0, "kl_loss_10": 98.99362754821777, "kl_loss_2": 1482.4679138183594, "kl_loss_3": 1053.731625366211, "kl_loss_7": 320.96242370605466, "learning_rate": 2.9077818180237692e-05, "loss": 736.6158, "step": 8920 }, { "ce_loss_10": 3.5552244663238524, "ce_loss_13": 3.5073567986488343, "ce_loss_2": 4.228390431404113, "ce_loss_3": 4.004204547405243, "ce_loss_7": 3.659198832511902, "epoch": 0.893, "grad_norm": 1016.0, "kl_loss_10": 100.31335105895997, "kl_loss_2": 1497.0627990722655, "kl_loss_3": 1051.808157348633, "kl_loss_7": 321.3925979614258, "learning_rate": 2.8546993041969172e-05, "loss": 735.1848, "step": 8930 }, { "ce_loss_10": 3.5945846796035767, "ce_loss_13": 3.546629238128662, "ce_loss_2": 4.226353633403778, "ce_loss_3": 4.017337286472321, "ce_loss_7": 3.690079319477081, "epoch": 0.894, "grad_norm": 640.0, "kl_loss_10": 98.40950088500976, "kl_loss_2": 1453.492364501953, "kl_loss_3": 1026.328042602539, "kl_loss_7": 317.301823425293, "learning_rate": 2.802091543024671e-05, "loss": 734.1522, "step": 8940 }, { "ce_loss_10": 3.586183214187622, "ce_loss_13": 3.5388261437416078, "ce_loss_2": 4.2546870589256285, "ce_loss_3": 4.034034776687622, "ce_loss_7": 3.689036464691162, "epoch": 0.895, "grad_norm": 628.0, "kl_loss_10": 100.29861907958984, "kl_loss_2": 1505.7158142089843, "kl_loss_3": 1060.311819458008, "kl_loss_7": 322.7163345336914, "learning_rate": 2.7499590642665774e-05, "loss": 756.9452, "step": 8950 }, { "ce_loss_10": 3.600201141834259, "ce_loss_13": 3.5520679473876955, "ce_loss_2": 4.261254894733429, "ce_loss_3": 4.027135276794434, "ce_loss_7": 3.6956157207489015, "epoch": 0.896, "grad_norm": 556.0, "kl_loss_10": 102.55679969787597, "kl_loss_2": 1494.8634155273437, "kl_loss_3": 1026.7823822021485, "kl_loss_7": 319.2309280395508, "learning_rate": 2.6983023928961405e-05, "loss": 728.1347, "step": 8960 }, { "ce_loss_10": 3.573468232154846, "ce_loss_13": 3.5233027219772337, "ce_loss_2": 4.227510786056518, "ce_loss_3": 4.011422181129456, "ce_loss_7": 3.6736750483512877, "epoch": 0.897, "grad_norm": 716.0, "kl_loss_10": 101.28071403503418, "kl_loss_2": 1474.439678955078, "kl_loss_3": 1040.7958374023438, "kl_loss_7": 322.3550231933594, "learning_rate": 2.6471220490954628e-05, "loss": 740.6856, "step": 8970 }, { "ce_loss_10": 3.557556450366974, "ce_loss_13": 3.5107120990753176, "ce_loss_2": 4.197606909275055, "ce_loss_3": 3.9785725831985475, "ce_loss_7": 3.649537718296051, "epoch": 0.898, "grad_norm": 632.0, "kl_loss_10": 99.04625396728515, "kl_loss_2": 1465.134100341797, "kl_loss_3": 1033.050845336914, "kl_loss_7": 315.48382568359375, "learning_rate": 2.596418548250029e-05, "loss": 735.2943, "step": 8980 }, { "ce_loss_10": 3.603998827934265, "ce_loss_13": 3.552130103111267, "ce_loss_2": 4.245459401607514, "ce_loss_3": 4.0296440601348875, "ce_loss_7": 3.7030985593795775, "epoch": 0.899, "grad_norm": 824.0, "kl_loss_10": 102.60270156860352, "kl_loss_2": 1474.0803649902343, "kl_loss_3": 1032.2244232177734, "kl_loss_7": 322.16136169433594, "learning_rate": 2.5461924009435368e-05, "loss": 729.7287, "step": 8990 }, { "ce_loss_10": 3.5992946863174438, "ce_loss_13": 3.5510019779205324, "ce_loss_2": 4.242183566093445, "ce_loss_3": 4.028680396080017, "ce_loss_7": 3.700281012058258, "epoch": 0.9, "grad_norm": 736.0, "kl_loss_10": 100.81934242248535, "kl_loss_2": 1454.1983764648437, "kl_loss_3": 1030.8050842285156, "kl_loss_7": 319.0749176025391, "learning_rate": 2.4964441129527336e-05, "loss": 741.3405, "step": 9000 }, { "ce_loss_10": 3.5929892301559447, "ce_loss_13": 3.5443052887916564, "ce_loss_2": 4.224874365329742, "ce_loss_3": 4.016772317886352, "ce_loss_7": 3.690162885189056, "epoch": 0.901, "grad_norm": 584.0, "kl_loss_10": 99.14652366638184, "kl_loss_2": 1444.2301025390625, "kl_loss_3": 1017.8550384521484, "kl_loss_7": 314.55660552978514, "learning_rate": 2.4471741852423235e-05, "loss": 718.9399, "step": 9010 }, { "ce_loss_10": 3.641668236255646, "ce_loss_13": 3.5931880354881285, "ce_loss_2": 4.292751264572144, "ce_loss_3": 4.082400214672089, "ce_loss_7": 3.7427754759788514, "epoch": 0.902, "grad_norm": 572.0, "kl_loss_10": 101.11631660461425, "kl_loss_2": 1461.3420959472655, "kl_loss_3": 1031.8964721679688, "kl_loss_7": 319.8264358520508, "learning_rate": 2.3983831139599287e-05, "loss": 733.7472, "step": 9020 }, { "ce_loss_10": 3.5593065500259398, "ce_loss_13": 3.5099385142326356, "ce_loss_2": 4.206565976142883, "ce_loss_3": 3.984605371952057, "ce_loss_7": 3.6537352323532106, "epoch": 0.903, "grad_norm": 456.0, "kl_loss_10": 98.91191177368164, "kl_loss_2": 1466.722674560547, "kl_loss_3": 1017.5840484619141, "kl_loss_7": 312.34007720947267, "learning_rate": 2.3500713904311022e-05, "loss": 713.5482, "step": 9030 }, { "ce_loss_10": 3.6017813682556152, "ce_loss_13": 3.554370975494385, "ce_loss_2": 4.226889824867248, "ce_loss_3": 4.011416518688202, "ce_loss_7": 3.6921934366226195, "epoch": 0.904, "grad_norm": 584.0, "kl_loss_10": 98.59734649658203, "kl_loss_2": 1420.5975036621094, "kl_loss_3": 997.9641632080078, "kl_loss_7": 308.7359924316406, "learning_rate": 2.3022395011543685e-05, "loss": 714.9296, "step": 9040 }, { "ce_loss_10": 3.633997368812561, "ce_loss_13": 3.5846336841583253, "ce_loss_2": 4.286326169967651, "ce_loss_3": 4.068668282032013, "ce_loss_7": 3.735692727565765, "epoch": 0.905, "grad_norm": 640.0, "kl_loss_10": 101.26521530151368, "kl_loss_2": 1486.862841796875, "kl_loss_3": 1049.7235443115235, "kl_loss_7": 327.13793792724607, "learning_rate": 2.2548879277963063e-05, "loss": 746.4134, "step": 9050 }, { "ce_loss_10": 3.5504543781280518, "ce_loss_13": 3.501303553581238, "ce_loss_2": 4.194028830528259, "ce_loss_3": 3.978367805480957, "ce_loss_7": 3.6456032276153563, "epoch": 0.906, "grad_norm": 680.0, "kl_loss_10": 100.52586059570312, "kl_loss_2": 1471.07861328125, "kl_loss_3": 1036.158026123047, "kl_loss_7": 318.09866485595705, "learning_rate": 2.208017147186736e-05, "loss": 715.4064, "step": 9060 }, { "ce_loss_10": 3.5485743045806886, "ce_loss_13": 3.4995696544647217, "ce_loss_2": 4.198383843898773, "ce_loss_3": 3.9804369688034056, "ce_loss_7": 3.6459625601768493, "epoch": 0.907, "grad_norm": 564.0, "kl_loss_10": 100.06849670410156, "kl_loss_2": 1477.7602844238281, "kl_loss_3": 1043.5721008300782, "kl_loss_7": 319.51026916503906, "learning_rate": 2.1616276313139227e-05, "loss": 723.6405, "step": 9070 }, { "ce_loss_10": 3.5836859941482544, "ce_loss_13": 3.534115183353424, "ce_loss_2": 4.235964345932007, "ce_loss_3": 4.014615166187286, "ce_loss_7": 3.679818069934845, "epoch": 0.908, "grad_norm": 496.0, "kl_loss_10": 100.40332870483398, "kl_loss_2": 1470.1177612304687, "kl_loss_3": 1034.0055450439454, "kl_loss_7": 317.81043395996096, "learning_rate": 2.1157198473197415e-05, "loss": 736.7519, "step": 9080 }, { "ce_loss_10": 3.653381907939911, "ce_loss_13": 3.6043422698974608, "ce_loss_2": 4.291116559505463, "ce_loss_3": 4.077895724773407, "ce_loss_7": 3.750797915458679, "epoch": 0.909, "grad_norm": 1012.0, "kl_loss_10": 101.42403373718261, "kl_loss_2": 1464.9044189453125, "kl_loss_3": 1033.8306793212892, "kl_loss_7": 324.32294921875, "learning_rate": 2.0702942574950812e-05, "loss": 731.8589, "step": 9090 }, { "ce_loss_10": 3.569808614253998, "ce_loss_13": 3.520155668258667, "ce_loss_2": 4.226888036727905, "ce_loss_3": 4.008000898361206, "ce_loss_7": 3.6718919396400453, "epoch": 0.91, "grad_norm": 576.0, "kl_loss_10": 101.54311294555664, "kl_loss_2": 1474.3709777832032, "kl_loss_3": 1044.0665649414063, "kl_loss_7": 322.58630523681643, "learning_rate": 2.025351319275137e-05, "loss": 732.5172, "step": 9100 }, { "ce_loss_10": 3.702002429962158, "ce_loss_13": 3.650774323940277, "ce_loss_2": 4.3437717914581295, "ce_loss_3": 4.13386709690094, "ce_loss_7": 3.798423147201538, "epoch": 0.911, "grad_norm": 604.0, "kl_loss_10": 104.95756034851074, "kl_loss_2": 1492.8397094726563, "kl_loss_3": 1062.238055419922, "kl_loss_7": 330.4491897583008, "learning_rate": 1.9808914852347816e-05, "loss": 756.9385, "step": 9110 }, { "ce_loss_10": 3.5480262279510497, "ce_loss_13": 3.49644513130188, "ce_loss_2": 4.2057594656944275, "ce_loss_3": 3.993450975418091, "ce_loss_7": 3.646942472457886, "epoch": 0.912, "grad_norm": 684.0, "kl_loss_10": 101.37423629760742, "kl_loss_2": 1486.220196533203, "kl_loss_3": 1055.6602325439453, "kl_loss_7": 323.5559967041016, "learning_rate": 1.9369152030840554e-05, "loss": 733.8441, "step": 9120 }, { "ce_loss_10": 3.6263280153274535, "ce_loss_13": 3.5767349004745483, "ce_loss_2": 4.270286953449249, "ce_loss_3": 4.054820692539215, "ce_loss_7": 3.721015989780426, "epoch": 0.913, "grad_norm": 656.0, "kl_loss_10": 101.21072273254394, "kl_loss_2": 1486.7355834960938, "kl_loss_3": 1042.7204528808593, "kl_loss_7": 319.9798324584961, "learning_rate": 1.893422915663645e-05, "loss": 735.1966, "step": 9130 }, { "ce_loss_10": 3.489763581752777, "ce_loss_13": 3.4398744463920594, "ce_loss_2": 4.181857180595398, "ce_loss_3": 3.9571975231170655, "ce_loss_7": 3.5976037979125977, "epoch": 0.914, "grad_norm": 776.0, "kl_loss_10": 101.65524978637696, "kl_loss_2": 1542.9002624511718, "kl_loss_3": 1088.0900024414063, "kl_loss_7": 328.0973876953125, "learning_rate": 1.850415060940386e-05, "loss": 752.1859, "step": 9140 }, { "ce_loss_10": 3.6176821112632753, "ce_loss_13": 3.569239854812622, "ce_loss_2": 4.248135197162628, "ce_loss_3": 4.045203781127929, "ce_loss_7": 3.7156535506248476, "epoch": 0.915, "grad_norm": 600.0, "kl_loss_10": 101.11350364685059, "kl_loss_2": 1449.418719482422, "kl_loss_3": 1031.2927825927734, "kl_loss_7": 319.98859252929685, "learning_rate": 1.8078920720028978e-05, "loss": 730.3095, "step": 9150 }, { "ce_loss_10": 3.545880711078644, "ce_loss_13": 3.4982577562332153, "ce_loss_2": 4.182165789604187, "ce_loss_3": 3.967874300479889, "ce_loss_7": 3.644175922870636, "epoch": 0.916, "grad_norm": 676.0, "kl_loss_10": 98.6468994140625, "kl_loss_2": 1451.41650390625, "kl_loss_3": 1024.3850982666015, "kl_loss_7": 314.3308532714844, "learning_rate": 1.765854377057219e-05, "loss": 739.5928, "step": 9160 }, { "ce_loss_10": 3.5270461201667787, "ce_loss_13": 3.4784483313560486, "ce_loss_2": 4.173061633110047, "ce_loss_3": 3.9516496419906617, "ce_loss_7": 3.6225127220153808, "epoch": 0.917, "grad_norm": 692.0, "kl_loss_10": 98.03750038146973, "kl_loss_2": 1473.7793823242187, "kl_loss_3": 1032.1868377685546, "kl_loss_7": 314.66668395996095, "learning_rate": 1.724302399422456e-05, "loss": 732.1436, "step": 9170 }, { "ce_loss_10": 3.475886869430542, "ce_loss_13": 3.425909125804901, "ce_loss_2": 4.134977567195892, "ce_loss_3": 3.918413257598877, "ce_loss_7": 3.575960898399353, "epoch": 0.918, "grad_norm": 920.0, "kl_loss_10": 102.07516288757324, "kl_loss_2": 1490.3460083007812, "kl_loss_3": 1060.189486694336, "kl_loss_7": 327.9425277709961, "learning_rate": 1.683236557526574e-05, "loss": 744.2836, "step": 9180 }, { "ce_loss_10": 3.595420205593109, "ce_loss_13": 3.5480047941207884, "ce_loss_2": 4.21327691078186, "ce_loss_3": 4.010032510757446, "ce_loss_7": 3.6892980217933653, "epoch": 0.919, "grad_norm": 588.0, "kl_loss_10": 98.43444595336913, "kl_loss_2": 1417.1862426757812, "kl_loss_3": 997.9367370605469, "kl_loss_7": 309.44056701660156, "learning_rate": 1.6426572649021475e-05, "loss": 722.4824, "step": 9190 }, { "ce_loss_10": 3.636527156829834, "ce_loss_13": 3.5877527236938476, "ce_loss_2": 4.2483520865440365, "ce_loss_3": 4.040287816524506, "ce_loss_7": 3.7288791298866273, "epoch": 0.92, "grad_norm": 752.0, "kl_loss_10": 101.33950271606446, "kl_loss_2": 1425.2800659179688, "kl_loss_3": 1008.1013336181641, "kl_loss_7": 315.51397094726565, "learning_rate": 1.6025649301821876e-05, "loss": 722.2042, "step": 9200 }, { "ce_loss_10": 3.618816578388214, "ce_loss_13": 3.5715938925743105, "ce_loss_2": 4.244227147102356, "ce_loss_3": 4.039032888412476, "ce_loss_7": 3.7134204864501954, "epoch": 0.921, "grad_norm": 580.0, "kl_loss_10": 100.78575897216797, "kl_loss_2": 1453.9480285644531, "kl_loss_3": 1031.4327911376954, "kl_loss_7": 322.1048858642578, "learning_rate": 1.5629599570960716e-05, "loss": 724.5923, "step": 9210 }, { "ce_loss_10": 3.5260069608688354, "ce_loss_13": 3.4774582147598267, "ce_loss_2": 4.172971761226654, "ce_loss_3": 3.9540740489959716, "ce_loss_7": 3.622638463973999, "epoch": 0.922, "grad_norm": 876.0, "kl_loss_10": 99.58263626098633, "kl_loss_2": 1482.261541748047, "kl_loss_3": 1035.059912109375, "kl_loss_7": 317.87442474365236, "learning_rate": 1.5238427444654367e-05, "loss": 733.5522, "step": 9220 }, { "ce_loss_10": 3.58858962059021, "ce_loss_13": 3.5394353032112122, "ce_loss_2": 4.221817135810852, "ce_loss_3": 4.005943262577057, "ce_loss_7": 3.6839746832847595, "epoch": 0.923, "grad_norm": 1056.0, "kl_loss_10": 99.78695945739746, "kl_loss_2": 1442.9137817382812, "kl_loss_3": 1013.8354431152344, "kl_loss_7": 314.0834762573242, "learning_rate": 1.4852136862001764e-05, "loss": 722.5939, "step": 9230 }, { "ce_loss_10": 3.5489871501922607, "ce_loss_13": 3.5015014171600343, "ce_loss_2": 4.189854669570923, "ce_loss_3": 3.982525897026062, "ce_loss_7": 3.6456330180168153, "epoch": 0.924, "grad_norm": 1072.0, "kl_loss_10": 96.98024253845215, "kl_loss_2": 1455.6874206542968, "kl_loss_3": 1029.0364135742188, "kl_loss_7": 313.7121780395508, "learning_rate": 1.4470731712944884e-05, "loss": 732.4188, "step": 9240 }, { "ce_loss_10": 3.5776649355888366, "ce_loss_13": 3.528981626033783, "ce_loss_2": 4.226421916484833, "ce_loss_3": 4.008637535572052, "ce_loss_7": 3.678273010253906, "epoch": 0.925, "grad_norm": 600.0, "kl_loss_10": 100.2242919921875, "kl_loss_2": 1464.7343994140624, "kl_loss_3": 1024.5027099609374, "kl_loss_7": 320.24311981201174, "learning_rate": 1.4094215838229174e-05, "loss": 744.1477, "step": 9250 }, { "ce_loss_10": 3.5355584979057313, "ce_loss_13": 3.488026261329651, "ce_loss_2": 4.199022781848908, "ce_loss_3": 3.976306843757629, "ce_loss_7": 3.637814199924469, "epoch": 0.926, "grad_norm": 788.0, "kl_loss_10": 100.92579612731933, "kl_loss_2": 1505.5577453613282, "kl_loss_3": 1056.5226654052735, "kl_loss_7": 325.0016784667969, "learning_rate": 1.372259302936546e-05, "loss": 762.6035, "step": 9260 }, { "ce_loss_10": 3.653318774700165, "ce_loss_13": 3.600943887233734, "ce_loss_2": 4.294710075855255, "ce_loss_3": 4.081323349475861, "ce_loss_7": 3.75021378993988, "epoch": 0.927, "grad_norm": 612.0, "kl_loss_10": 103.40512619018554, "kl_loss_2": 1462.4480529785155, "kl_loss_3": 1032.8355773925782, "kl_loss_7": 324.21121368408205, "learning_rate": 1.3355867028591206e-05, "loss": 722.0662, "step": 9270 }, { "ce_loss_10": 3.5565295815467834, "ce_loss_13": 3.5062556862831116, "ce_loss_2": 4.187858819961548, "ce_loss_3": 3.978440821170807, "ce_loss_7": 3.648498904705048, "epoch": 0.928, "grad_norm": 976.0, "kl_loss_10": 100.46078147888184, "kl_loss_2": 1456.1305236816406, "kl_loss_3": 1030.140936279297, "kl_loss_7": 315.7961700439453, "learning_rate": 1.2994041528833267e-05, "loss": 722.6728, "step": 9280 }, { "ce_loss_10": 3.5600144386291506, "ce_loss_13": 3.5094253420829773, "ce_loss_2": 4.202559447288513, "ce_loss_3": 3.9889981031417845, "ce_loss_7": 3.65659636259079, "epoch": 0.929, "grad_norm": 684.0, "kl_loss_10": 99.00979995727539, "kl_loss_2": 1469.9648193359376, "kl_loss_3": 1030.1941162109374, "kl_loss_7": 315.91405792236327, "learning_rate": 1.2637120173670358e-05, "loss": 729.2418, "step": 9290 }, { "ce_loss_10": 3.580701267719269, "ce_loss_13": 3.530916380882263, "ce_loss_2": 4.243158495426178, "ce_loss_3": 4.01680428981781, "ce_loss_7": 3.6834437012672425, "epoch": 0.93, "grad_norm": 956.0, "kl_loss_10": 100.70985794067383, "kl_loss_2": 1487.264910888672, "kl_loss_3": 1038.2951354980469, "kl_loss_7": 322.8812744140625, "learning_rate": 1.2285106557296478e-05, "loss": 734.9421, "step": 9300 }, { "ce_loss_10": 3.451865482330322, "ce_loss_13": 3.403413951396942, "ce_loss_2": 4.158205282688141, "ce_loss_3": 3.915228509902954, "ce_loss_7": 3.5537479758262633, "epoch": 0.931, "grad_norm": 668.0, "kl_loss_10": 99.77069282531738, "kl_loss_2": 1558.4464477539063, "kl_loss_3": 1076.4288665771485, "kl_loss_7": 322.8979919433594, "learning_rate": 1.1938004224484989e-05, "loss": 743.7089, "step": 9310 }, { "ce_loss_10": 3.6987316370010377, "ce_loss_13": 3.645847535133362, "ce_loss_2": 4.33648407459259, "ce_loss_3": 4.1272706389427185, "ce_loss_7": 3.793771517276764, "epoch": 0.932, "grad_norm": 864.0, "kl_loss_10": 103.44231643676758, "kl_loss_2": 1474.3127685546874, "kl_loss_3": 1042.6139434814454, "kl_loss_7": 322.40623474121094, "learning_rate": 1.1595816670552429e-05, "loss": 745.5192, "step": 9320 }, { "ce_loss_10": 3.6196014642715455, "ce_loss_13": 3.5704059958457948, "ce_loss_2": 4.255560755729675, "ce_loss_3": 4.043376386165619, "ce_loss_7": 3.7176753163337706, "epoch": 0.933, "grad_norm": 624.0, "kl_loss_10": 101.67831153869629, "kl_loss_2": 1448.494287109375, "kl_loss_3": 1016.9408416748047, "kl_loss_7": 316.3284286499023, "learning_rate": 1.1258547341323699e-05, "loss": 719.7978, "step": 9330 }, { "ce_loss_10": 3.65011385679245, "ce_loss_13": 3.6008502721786497, "ce_loss_2": 4.28247971534729, "ce_loss_3": 4.07297123670578, "ce_loss_7": 3.748872971534729, "epoch": 0.934, "grad_norm": 724.0, "kl_loss_10": 101.5847942352295, "kl_loss_2": 1479.5779174804688, "kl_loss_3": 1045.208514404297, "kl_loss_7": 322.5457504272461, "learning_rate": 1.0926199633097156e-05, "loss": 732.4528, "step": 9340 }, { "ce_loss_10": 3.651190483570099, "ce_loss_13": 3.6049916625022886, "ce_loss_2": 4.261642718315125, "ce_loss_3": 4.058089017868042, "ce_loss_7": 3.743030273914337, "epoch": 0.935, "grad_norm": 672.0, "kl_loss_10": 98.8299602508545, "kl_loss_2": 1416.1376220703125, "kl_loss_3": 1003.6277282714843, "kl_loss_7": 312.6012481689453, "learning_rate": 1.0598776892610684e-05, "loss": 730.9984, "step": 9350 }, { "ce_loss_10": 3.463999366760254, "ce_loss_13": 3.4159683704376222, "ce_loss_2": 4.132733225822449, "ce_loss_3": 3.905828559398651, "ce_loss_7": 3.5638004422187803, "epoch": 0.936, "grad_norm": 756.0, "kl_loss_10": 98.57135581970215, "kl_loss_2": 1500.514715576172, "kl_loss_3": 1050.2365997314453, "kl_loss_7": 318.48359375, "learning_rate": 1.0276282417007399e-05, "loss": 730.1331, "step": 9360 }, { "ce_loss_10": 3.629877495765686, "ce_loss_13": 3.5818740725517273, "ce_loss_2": 4.244524800777436, "ce_loss_3": 4.03765150308609, "ce_loss_7": 3.7260459661483765, "epoch": 0.937, "grad_norm": 808.0, "kl_loss_10": 98.70971488952637, "kl_loss_2": 1423.6197875976563, "kl_loss_3": 1003.1044799804688, "kl_loss_7": 312.7714874267578, "learning_rate": 9.958719453803277e-06, "loss": 719.7182, "step": 9370 }, { "ce_loss_10": 3.620361053943634, "ce_loss_13": 3.569625699520111, "ce_loss_2": 4.272698831558228, "ce_loss_3": 4.054683363437652, "ce_loss_7": 3.7232205748558043, "epoch": 0.938, "grad_norm": 560.0, "kl_loss_10": 101.04146385192871, "kl_loss_2": 1479.6770751953125, "kl_loss_3": 1033.0038482666016, "kl_loss_7": 321.316389465332, "learning_rate": 9.646091200853802e-06, "loss": 730.4701, "step": 9380 }, { "ce_loss_10": 3.5802704453468324, "ce_loss_13": 3.532654654979706, "ce_loss_2": 4.216030716896057, "ce_loss_3": 4.008428776264191, "ce_loss_7": 3.6718807578086854, "epoch": 0.939, "grad_norm": 640.0, "kl_loss_10": 97.80096817016602, "kl_loss_2": 1440.6820373535156, "kl_loss_3": 1019.3043212890625, "kl_loss_7": 313.47987060546876, "learning_rate": 9.338400806321978e-06, "loss": 706.0288, "step": 9390 }, { "ce_loss_10": 3.6130733489990234, "ce_loss_13": 3.562330484390259, "ce_loss_2": 4.2406017780303955, "ce_loss_3": 4.032883334159851, "ce_loss_7": 3.711196410655975, "epoch": 0.94, "grad_norm": 524.0, "kl_loss_10": 101.68023414611817, "kl_loss_2": 1441.4777526855469, "kl_loss_3": 1020.6096130371094, "kl_loss_7": 321.85620269775393, "learning_rate": 9.035651368646646e-06, "loss": 720.5777, "step": 9400 }, { "ce_loss_10": 3.6171160101890565, "ce_loss_13": 3.5678391575813295, "ce_loss_2": 4.246705079078675, "ce_loss_3": 4.031179976463318, "ce_loss_7": 3.712442195415497, "epoch": 0.941, "grad_norm": 928.0, "kl_loss_10": 99.55560111999512, "kl_loss_2": 1436.2016235351562, "kl_loss_3": 1009.4204345703125, "kl_loss_7": 312.73047637939453, "learning_rate": 8.737845936511335e-06, "loss": 724.4431, "step": 9410 }, { "ce_loss_10": 3.565962862968445, "ce_loss_13": 3.515843319892883, "ce_loss_2": 4.218027985095977, "ce_loss_3": 3.9984277963638304, "ce_loss_7": 3.6626911044120787, "epoch": 0.942, "grad_norm": 664.0, "kl_loss_10": 101.70542182922364, "kl_loss_2": 1475.068914794922, "kl_loss_3": 1034.3641723632813, "kl_loss_7": 320.2153259277344, "learning_rate": 8.444987508813451e-06, "loss": 727.4669, "step": 9420 }, { "ce_loss_10": 3.518308973312378, "ce_loss_13": 3.4704669952392577, "ce_loss_2": 4.1819773554801944, "ce_loss_3": 3.9543440341949463, "ce_loss_7": 3.618017780780792, "epoch": 0.943, "grad_norm": 772.0, "kl_loss_10": 101.26314086914063, "kl_loss_2": 1525.5377014160156, "kl_loss_3": 1064.7629974365234, "kl_loss_7": 325.3822952270508, "learning_rate": 8.157079034633974e-06, "loss": 744.0637, "step": 9430 }, { "ce_loss_10": 3.5147162199020388, "ce_loss_13": 3.4665843844413757, "ce_loss_2": 4.170525097846985, "ce_loss_3": 3.9518492579460145, "ce_loss_7": 3.614675509929657, "epoch": 0.944, "grad_norm": 656.0, "kl_loss_10": 100.22494850158691, "kl_loss_2": 1502.26962890625, "kl_loss_3": 1057.2914001464844, "kl_loss_7": 319.3195556640625, "learning_rate": 7.874123413208145e-06, "loss": 732.8164, "step": 9440 }, { "ce_loss_10": 3.4823323011398317, "ce_loss_13": 3.434736800193787, "ce_loss_2": 4.147767508029938, "ce_loss_3": 3.931454598903656, "ce_loss_7": 3.5841008186340333, "epoch": 0.945, "grad_norm": 724.0, "kl_loss_10": 99.14258117675782, "kl_loss_2": 1486.425, "kl_loss_3": 1043.9653778076172, "kl_loss_7": 319.19978790283204, "learning_rate": 7.59612349389599e-06, "loss": 734.4664, "step": 9450 }, { "ce_loss_10": 3.5783546805381774, "ce_loss_13": 3.5316025614738464, "ce_loss_2": 4.204697036743164, "ce_loss_3": 3.9950135707855225, "ce_loss_7": 3.6768389105796815, "epoch": 0.946, "grad_norm": 1632.0, "kl_loss_10": 96.9753978729248, "kl_loss_2": 1417.4609985351562, "kl_loss_3": 1000.7292694091797, "kl_loss_7": 311.39798736572266, "learning_rate": 7.323082076153509e-06, "loss": 718.4145, "step": 9460 }, { "ce_loss_10": 3.621594047546387, "ce_loss_13": 3.572771978378296, "ce_loss_2": 4.250254535675049, "ce_loss_3": 4.038458049297333, "ce_loss_7": 3.7181079506874086, "epoch": 0.947, "grad_norm": 836.0, "kl_loss_10": 102.37061347961426, "kl_loss_2": 1443.3331604003906, "kl_loss_3": 1018.7461090087891, "kl_loss_7": 321.2649383544922, "learning_rate": 7.055001909504755e-06, "loss": 735.3864, "step": 9470 }, { "ce_loss_10": 3.6511409640312196, "ce_loss_13": 3.6022180914878845, "ce_loss_2": 4.283723545074463, "ce_loss_3": 4.0745571732521055, "ce_loss_7": 3.747443664073944, "epoch": 0.948, "grad_norm": 552.0, "kl_loss_10": 100.98973159790039, "kl_loss_2": 1458.7612609863281, "kl_loss_3": 1030.3292388916016, "kl_loss_7": 319.4419235229492, "learning_rate": 6.791885693514133e-06, "loss": 730.6492, "step": 9480 }, { "ce_loss_10": 3.5563197493553163, "ce_loss_13": 3.5079686641693115, "ce_loss_2": 4.219356620311737, "ce_loss_3": 3.9983399629592897, "ce_loss_7": 3.6557886600494385, "epoch": 0.949, "grad_norm": 628.0, "kl_loss_10": 101.03056373596192, "kl_loss_2": 1502.6600341796875, "kl_loss_3": 1046.867025756836, "kl_loss_7": 322.37164611816405, "learning_rate": 6.533736077758867e-06, "loss": 741.9673, "step": 9490 }, { "ce_loss_10": 3.524586260318756, "ce_loss_13": 3.4753112316131594, "ce_loss_2": 4.197818899154663, "ce_loss_3": 3.9671564221382143, "ce_loss_7": 3.6279956340789794, "epoch": 0.95, "grad_norm": 988.0, "kl_loss_10": 100.79750289916993, "kl_loss_2": 1526.3000915527343, "kl_loss_3": 1064.0913696289062, "kl_loss_7": 326.236262512207, "learning_rate": 6.2805556618028556e-06, "loss": 743.4429, "step": 9500 }, { "ce_loss_10": 3.616682207584381, "ce_loss_13": 3.5698466658592225, "ce_loss_2": 4.238051950931549, "ce_loss_3": 4.031062352657318, "ce_loss_7": 3.7088807821273804, "epoch": 0.951, "grad_norm": 552.0, "kl_loss_10": 98.77718887329101, "kl_loss_2": 1404.2392150878907, "kl_loss_3": 997.3295196533203, "kl_loss_7": 308.6707962036133, "learning_rate": 6.032346995169968e-06, "loss": 698.0699, "step": 9510 }, { "ce_loss_10": 3.619255244731903, "ce_loss_13": 3.569891571998596, "ce_loss_2": 4.251179981231689, "ce_loss_3": 4.03684755563736, "ce_loss_7": 3.7131584763526915, "epoch": 0.952, "grad_norm": 1184.0, "kl_loss_10": 99.68890724182128, "kl_loss_2": 1442.2247253417968, "kl_loss_3": 1019.6851318359375, "kl_loss_7": 315.02208099365237, "learning_rate": 5.789112577318789e-06, "loss": 720.3744, "step": 9520 }, { "ce_loss_10": 3.5947921514511108, "ce_loss_13": 3.544568932056427, "ce_loss_2": 4.244994127750397, "ce_loss_3": 4.02504061460495, "ce_loss_7": 3.6920055389404296, "epoch": 0.953, "grad_norm": 584.0, "kl_loss_10": 101.4294563293457, "kl_loss_2": 1499.6636108398438, "kl_loss_3": 1052.6900146484375, "kl_loss_7": 322.4961364746094, "learning_rate": 5.550854857617194e-06, "loss": 726.9194, "step": 9530 }, { "ce_loss_10": 3.582830882072449, "ce_loss_13": 3.5319761395454408, "ce_loss_2": 4.242195701599121, "ce_loss_3": 4.021008789539337, "ce_loss_7": 3.6820746064186096, "epoch": 0.954, "grad_norm": 572.0, "kl_loss_10": 102.64753494262695, "kl_loss_2": 1498.8092956542969, "kl_loss_3": 1050.7385650634765, "kl_loss_7": 326.7690063476563, "learning_rate": 5.317576235317756e-06, "loss": 737.1573, "step": 9540 }, { "ce_loss_10": 3.6114678263664244, "ce_loss_13": 3.5621456623077394, "ce_loss_2": 4.227889513969421, "ce_loss_3": 4.022716641426086, "ce_loss_7": 3.703843331336975, "epoch": 0.955, "grad_norm": 788.0, "kl_loss_10": 99.9121597290039, "kl_loss_2": 1403.9868591308593, "kl_loss_3": 992.9365753173828, "kl_loss_7": 308.4971160888672, "learning_rate": 5.089279059533658e-06, "loss": 727.0448, "step": 9550 }, { "ce_loss_10": 3.6699771642684937, "ce_loss_13": 3.618920314311981, "ce_loss_2": 4.306116104125977, "ce_loss_3": 4.091136622428894, "ce_loss_7": 3.7673362493515015, "epoch": 0.956, "grad_norm": 776.0, "kl_loss_10": 104.20219497680664, "kl_loss_2": 1463.6626708984375, "kl_loss_3": 1033.8897552490234, "kl_loss_7": 326.441096496582, "learning_rate": 4.865965629214819e-06, "loss": 725.4249, "step": 9560 }, { "ce_loss_10": 3.614033079147339, "ce_loss_13": 3.5638347268104553, "ce_loss_2": 4.255724763870239, "ce_loss_3": 4.041497755050659, "ce_loss_7": 3.7127063274383545, "epoch": 0.957, "grad_norm": 520.0, "kl_loss_10": 101.81166343688965, "kl_loss_2": 1481.6173767089845, "kl_loss_3": 1041.6663055419922, "kl_loss_7": 323.22521057128904, "learning_rate": 4.6476381931251366e-06, "loss": 721.5253, "step": 9570 }, { "ce_loss_10": 3.59082635641098, "ce_loss_13": 3.541495108604431, "ce_loss_2": 4.226521170139312, "ce_loss_3": 4.021185553073883, "ce_loss_7": 3.6890914678573608, "epoch": 0.958, "grad_norm": 584.0, "kl_loss_10": 99.81327781677246, "kl_loss_2": 1453.2628601074218, "kl_loss_3": 1029.4942352294922, "kl_loss_7": 318.6952651977539, "learning_rate": 4.434298949819449e-06, "loss": 728.2395, "step": 9580 }, { "ce_loss_10": 3.545853388309479, "ce_loss_13": 3.4954501748085023, "ce_loss_2": 4.220154106616974, "ce_loss_3": 3.9992950558662415, "ce_loss_7": 3.643615353107452, "epoch": 0.959, "grad_norm": 540.0, "kl_loss_10": 102.36123390197754, "kl_loss_2": 1555.2266845703125, "kl_loss_3": 1093.3547973632812, "kl_loss_7": 330.1093811035156, "learning_rate": 4.2259500476214406e-06, "loss": 752.4537, "step": 9590 }, { "ce_loss_10": 3.5348328948020935, "ce_loss_13": 3.483853542804718, "ce_loss_2": 4.178453171253205, "ce_loss_3": 3.9601629257202147, "ce_loss_7": 3.6308905482292175, "epoch": 0.96, "grad_norm": 684.0, "kl_loss_10": 100.2852855682373, "kl_loss_2": 1487.2806335449218, "kl_loss_3": 1047.0238006591796, "kl_loss_7": 320.01941375732423, "learning_rate": 4.02259358460233e-06, "loss": 729.1799, "step": 9600 }, { "ce_loss_10": 3.6012317419052122, "ce_loss_13": 3.5504815340042115, "ce_loss_2": 4.237862277030945, "ce_loss_3": 4.0222912430763245, "ce_loss_7": 3.6958125591278077, "epoch": 0.961, "grad_norm": 740.0, "kl_loss_10": 100.86824188232421, "kl_loss_2": 1447.4791198730468, "kl_loss_3": 1013.4048553466797, "kl_loss_7": 318.97264862060547, "learning_rate": 3.8242316085594916e-06, "loss": 717.5141, "step": 9610 }, { "ce_loss_10": 3.482761192321777, "ce_loss_13": 3.433958411216736, "ce_loss_2": 4.1683017134666445, "ce_loss_3": 3.939417612552643, "ce_loss_7": 3.587407350540161, "epoch": 0.962, "grad_norm": 636.0, "kl_loss_10": 101.28954124450684, "kl_loss_2": 1549.2028381347657, "kl_loss_3": 1078.0746215820313, "kl_loss_7": 326.9995407104492, "learning_rate": 3.630866116995757e-06, "loss": 758.2951, "step": 9620 }, { "ce_loss_10": 3.637610685825348, "ce_loss_13": 3.5894572496414185, "ce_loss_2": 4.263991355895996, "ce_loss_3": 4.0577524423599245, "ce_loss_7": 3.733923375606537, "epoch": 0.963, "grad_norm": 660.0, "kl_loss_10": 100.02307357788087, "kl_loss_2": 1420.0200805664062, "kl_loss_3": 1003.1484680175781, "kl_loss_7": 313.7723220825195, "learning_rate": 3.4424990570994797e-06, "loss": 729.9384, "step": 9630 }, { "ce_loss_10": 3.633535933494568, "ce_loss_13": 3.583401381969452, "ce_loss_2": 4.254140913486481, "ce_loss_3": 4.051010286808014, "ce_loss_7": 3.7287922620773317, "epoch": 0.964, "grad_norm": 560.0, "kl_loss_10": 100.02474784851074, "kl_loss_2": 1442.352032470703, "kl_loss_3": 1015.8196166992187, "kl_loss_7": 315.29369201660154, "learning_rate": 3.2591323257248896e-06, "loss": 724.2088, "step": 9640 }, { "ce_loss_10": 3.4801689267158507, "ce_loss_13": 3.431644892692566, "ce_loss_2": 4.140816879272461, "ce_loss_3": 3.9168063402175903, "ce_loss_7": 3.5761757493019104, "epoch": 0.965, "grad_norm": 588.0, "kl_loss_10": 98.70831642150878, "kl_loss_2": 1492.278985595703, "kl_loss_3": 1045.1632019042968, "kl_loss_7": 317.46045074462893, "learning_rate": 3.0807677693729385e-06, "loss": 738.5743, "step": 9650 }, { "ce_loss_10": 3.664973223209381, "ce_loss_13": 3.6175607323646544, "ce_loss_2": 4.2879242420196535, "ce_loss_3": 4.081740701198578, "ce_loss_7": 3.758702278137207, "epoch": 0.966, "grad_norm": 1024.0, "kl_loss_10": 99.16736755371093, "kl_loss_2": 1425.2832153320312, "kl_loss_3": 1003.6775115966797, "kl_loss_7": 311.84822845458984, "learning_rate": 2.9074071841727055e-06, "loss": 711.7873, "step": 9660 }, { "ce_loss_10": 3.5945656180381773, "ce_loss_13": 3.546189475059509, "ce_loss_2": 4.235725092887878, "ce_loss_3": 4.026057779788971, "ce_loss_7": 3.689992678165436, "epoch": 0.967, "grad_norm": 1080.0, "kl_loss_10": 100.27811698913574, "kl_loss_2": 1456.6593078613282, "kl_loss_3": 1037.367868041992, "kl_loss_7": 321.3731719970703, "learning_rate": 2.739052315863355e-06, "loss": 714.5004, "step": 9670 }, { "ce_loss_10": 3.5768367648124695, "ce_loss_13": 3.5270548462867737, "ce_loss_2": 4.221412074565888, "ce_loss_3": 4.004365146160126, "ce_loss_7": 3.6732075214385986, "epoch": 0.968, "grad_norm": 776.0, "kl_loss_10": 101.80694351196288, "kl_loss_2": 1471.1910034179687, "kl_loss_3": 1032.568795776367, "kl_loss_7": 317.22945098876954, "learning_rate": 2.5757048597765396e-06, "loss": 725.398, "step": 9680 }, { "ce_loss_10": 3.59031343460083, "ce_loss_13": 3.5396840929985047, "ce_loss_2": 4.236286723613739, "ce_loss_3": 4.018032371997833, "ce_loss_7": 3.685284149646759, "epoch": 0.969, "grad_norm": 672.0, "kl_loss_10": 100.63111534118653, "kl_loss_2": 1478.1512756347656, "kl_loss_3": 1039.5015014648438, "kl_loss_7": 322.1880874633789, "learning_rate": 2.417366460819359e-06, "loss": 729.9285, "step": 9690 }, { "ce_loss_10": 3.5973408102989195, "ce_loss_13": 3.547382986545563, "ce_loss_2": 4.26169183254242, "ce_loss_3": 4.040053284168243, "ce_loss_7": 3.699202907085419, "epoch": 0.97, "grad_norm": 992.0, "kl_loss_10": 102.65268669128417, "kl_loss_2": 1520.297100830078, "kl_loss_3": 1069.3534912109376, "kl_loss_7": 326.8909194946289, "learning_rate": 2.2640387134577057e-06, "loss": 732.2714, "step": 9700 }, { "ce_loss_10": 3.5202884912490844, "ce_loss_13": 3.474460709095001, "ce_loss_2": 4.142319214344025, "ce_loss_3": 3.936626875400543, "ce_loss_7": 3.614048516750336, "epoch": 0.971, "grad_norm": 668.0, "kl_loss_10": 95.6164436340332, "kl_loss_2": 1412.2223693847657, "kl_loss_3": 996.8598114013672, "kl_loss_7": 307.68802337646486, "learning_rate": 2.115723161700278e-06, "loss": 714.1914, "step": 9710 }, { "ce_loss_10": 3.5025058150291444, "ce_loss_13": 3.4522303462028505, "ce_loss_2": 4.171424794197082, "ce_loss_3": 3.9464163184165955, "ce_loss_7": 3.606028401851654, "epoch": 0.972, "grad_norm": 728.0, "kl_loss_10": 102.53854255676269, "kl_loss_2": 1505.1114929199218, "kl_loss_3": 1056.5371704101562, "kl_loss_7": 325.85479736328125, "learning_rate": 1.9724212990830937e-06, "loss": 744.4349, "step": 9720 }, { "ce_loss_10": 3.6495727419853212, "ce_loss_13": 3.6018050789833067, "ce_loss_2": 4.306086361408234, "ce_loss_3": 4.086185026168823, "ce_loss_7": 3.749111497402191, "epoch": 0.973, "grad_norm": 512.0, "kl_loss_10": 101.35614318847657, "kl_loss_2": 1485.3276916503905, "kl_loss_3": 1045.383840942383, "kl_loss_7": 322.4482192993164, "learning_rate": 1.8341345686543331e-06, "loss": 733.7501, "step": 9730 }, { "ce_loss_10": 3.632604348659515, "ce_loss_13": 3.584032082557678, "ce_loss_2": 4.245633041858673, "ce_loss_3": 4.044562423229218, "ce_loss_7": 3.729058027267456, "epoch": 0.974, "grad_norm": 1056.0, "kl_loss_10": 100.17584037780762, "kl_loss_2": 1419.3184204101562, "kl_loss_3": 1010.3348663330078, "kl_loss_7": 315.9739364624023, "learning_rate": 1.7008643629596864e-06, "loss": 731.5823, "step": 9740 }, { "ce_loss_10": 3.617405414581299, "ce_loss_13": 3.5665115833282472, "ce_loss_2": 4.266829407215118, "ce_loss_3": 4.0473064422607425, "ce_loss_7": 3.7161169409751893, "epoch": 0.975, "grad_norm": 508.0, "kl_loss_10": 100.89203796386718, "kl_loss_2": 1488.1953369140624, "kl_loss_3": 1044.3024871826171, "kl_loss_7": 321.0751068115234, "learning_rate": 1.5726120240288633e-06, "loss": 742.7414, "step": 9750 }, { "ce_loss_10": 3.5173370480537414, "ce_loss_13": 3.467965769767761, "ce_loss_2": 4.154630899429321, "ce_loss_3": 3.938868534564972, "ce_loss_7": 3.6149882435798646, "epoch": 0.976, "grad_norm": 916.0, "kl_loss_10": 98.95890884399414, "kl_loss_2": 1466.7470458984376, "kl_loss_3": 1029.949411010742, "kl_loss_7": 316.8812088012695, "learning_rate": 1.4493788433612708e-06, "loss": 722.6529, "step": 9760 }, { "ce_loss_10": 3.6336648225784303, "ce_loss_13": 3.584157955646515, "ce_loss_2": 4.280201971530914, "ce_loss_3": 4.065834891796112, "ce_loss_7": 3.7311365246772765, "epoch": 0.977, "grad_norm": 684.0, "kl_loss_10": 100.03769454956054, "kl_loss_2": 1478.9156005859375, "kl_loss_3": 1040.3515747070312, "kl_loss_7": 319.4053009033203, "learning_rate": 1.3311660619138578e-06, "loss": 736.5975, "step": 9770 }, { "ce_loss_10": 3.632482481002808, "ce_loss_13": 3.5835426926612852, "ce_loss_2": 4.238091909885407, "ce_loss_3": 4.034222900867462, "ce_loss_7": 3.7265204310417177, "epoch": 0.978, "grad_norm": 568.0, "kl_loss_10": 100.34218406677246, "kl_loss_2": 1402.0389282226563, "kl_loss_3": 996.9839691162109, "kl_loss_7": 315.2754852294922, "learning_rate": 1.2179748700879012e-06, "loss": 721.9756, "step": 9780 }, { "ce_loss_10": 3.5573767185211183, "ce_loss_13": 3.510503625869751, "ce_loss_2": 4.196924710273743, "ce_loss_3": 3.9808929204940795, "ce_loss_7": 3.655661571025848, "epoch": 0.979, "grad_norm": 728.0, "kl_loss_10": 98.98676071166992, "kl_loss_2": 1450.4956359863281, "kl_loss_3": 1017.1022735595703, "kl_loss_7": 315.50543823242185, "learning_rate": 1.1098064077174619e-06, "loss": 725.8752, "step": 9790 }, { "ce_loss_10": 3.5931172370910645, "ce_loss_13": 3.542897272109985, "ce_loss_2": 4.25806030035019, "ce_loss_3": 4.0340229868888855, "ce_loss_7": 3.693891000747681, "epoch": 0.98, "grad_norm": 1136.0, "kl_loss_10": 99.92363891601562, "kl_loss_2": 1489.9005859375, "kl_loss_3": 1042.5260528564454, "kl_loss_7": 318.3444885253906, "learning_rate": 1.006661764057837e-06, "loss": 728.7937, "step": 9800 }, { "ce_loss_10": 3.596359574794769, "ce_loss_13": 3.548104441165924, "ce_loss_2": 4.2417675971984865, "ce_loss_3": 4.026107430458069, "ce_loss_7": 3.6916649460792543, "epoch": 0.981, "grad_norm": 568.0, "kl_loss_10": 99.93220672607421, "kl_loss_2": 1467.9831604003907, "kl_loss_3": 1029.5319030761718, "kl_loss_7": 315.5573989868164, "learning_rate": 9.085419777743465e-07, "loss": 725.1054, "step": 9810 }, { "ce_loss_10": 3.5348106741905214, "ce_loss_13": 3.4887638092041016, "ce_loss_2": 4.179925692081452, "ce_loss_3": 3.969004821777344, "ce_loss_7": 3.629472756385803, "epoch": 0.982, "grad_norm": 632.0, "kl_loss_10": 97.8527442932129, "kl_loss_2": 1456.6363159179687, "kl_loss_3": 1031.8575866699218, "kl_loss_7": 310.8157470703125, "learning_rate": 8.15448036932176e-07, "loss": 716.7362, "step": 9820 }, { "ce_loss_10": 3.588992726802826, "ce_loss_13": 3.5412689447402954, "ce_loss_2": 4.2244803547859195, "ce_loss_3": 4.017318344116211, "ce_loss_7": 3.6873218059539794, "epoch": 0.983, "grad_norm": 696.0, "kl_loss_10": 99.93506622314453, "kl_loss_2": 1452.4131469726562, "kl_loss_3": 1034.2230529785156, "kl_loss_7": 317.5879592895508, "learning_rate": 7.273808789862724e-07, "loss": 734.1375, "step": 9830 }, { "ce_loss_10": 3.672054672241211, "ce_loss_13": 3.6226749420166016, "ce_loss_2": 4.299103081226349, "ce_loss_3": 4.089282441139221, "ce_loss_7": 3.7691115736961365, "epoch": 0.984, "grad_norm": 840.0, "kl_loss_10": 101.45122680664062, "kl_loss_2": 1449.5640014648438, "kl_loss_3": 1022.9663452148437, "kl_loss_7": 319.88048858642577, "learning_rate": 6.443413907720186e-07, "loss": 722.3884, "step": 9840 }, { "ce_loss_10": 3.5984023571014405, "ce_loss_13": 3.5494791865348816, "ce_loss_2": 4.233557558059692, "ce_loss_3": 4.018463003635406, "ce_loss_7": 3.6955064058303835, "epoch": 0.985, "grad_norm": 680.0, "kl_loss_10": 99.94643592834473, "kl_loss_2": 1453.2210998535156, "kl_loss_3": 1023.1268951416016, "kl_loss_7": 317.93501434326174, "learning_rate": 5.663304084960185e-07, "loss": 722.5181, "step": 9850 }, { "ce_loss_10": 3.522158908843994, "ce_loss_13": 3.4731910824775696, "ce_loss_2": 4.184080266952515, "ce_loss_3": 3.964234435558319, "ce_loss_7": 3.620095467567444, "epoch": 0.986, "grad_norm": 720.0, "kl_loss_10": 100.35142593383789, "kl_loss_2": 1485.5776977539062, "kl_loss_3": 1042.6309692382813, "kl_loss_7": 320.0404983520508, "learning_rate": 4.933487177280482e-07, "loss": 719.0503, "step": 9860 }, { "ce_loss_10": 3.6208077549934385, "ce_loss_13": 3.5728573322296144, "ce_loss_2": 4.251078021526337, "ce_loss_3": 4.0389905095100405, "ce_loss_7": 3.714921402931213, "epoch": 0.987, "grad_norm": 772.0, "kl_loss_10": 98.80282516479492, "kl_loss_2": 1449.0300231933593, "kl_loss_3": 1026.049136352539, "kl_loss_7": 314.0385025024414, "learning_rate": 4.2539705339295075e-07, "loss": 719.6409, "step": 9870 }, { "ce_loss_10": 3.476616156101227, "ce_loss_13": 3.427881383895874, "ce_loss_2": 4.134393751621246, "ce_loss_3": 3.9140883088111877, "ce_loss_7": 3.5780155062675476, "epoch": 0.988, "grad_norm": 1004.0, "kl_loss_10": 98.60747108459472, "kl_loss_2": 1495.9751708984375, "kl_loss_3": 1049.8730224609376, "kl_loss_7": 323.53383178710936, "learning_rate": 3.6247609976319816e-07, "loss": 726.4771, "step": 9880 }, { "ce_loss_10": 3.575879955291748, "ce_loss_13": 3.5266340017318725, "ce_loss_2": 4.234549343585968, "ce_loss_3": 4.012395465373993, "ce_loss_7": 3.6740915536880494, "epoch": 0.989, "grad_norm": 900.0, "kl_loss_10": 101.82386322021485, "kl_loss_2": 1488.7779113769532, "kl_loss_3": 1040.8666076660156, "kl_loss_7": 321.87248077392576, "learning_rate": 3.0458649045211895e-07, "loss": 749.0734, "step": 9890 }, { "ce_loss_10": 3.5423336386680604, "ce_loss_13": 3.4921443819999696, "ce_loss_2": 4.195040118694306, "ce_loss_3": 3.980450451374054, "ce_loss_7": 3.6402501106262206, "epoch": 0.99, "grad_norm": 732.0, "kl_loss_10": 101.91628227233886, "kl_loss_2": 1481.928271484375, "kl_loss_3": 1047.8629669189454, "kl_loss_7": 325.102685546875, "learning_rate": 2.517288084074587e-07, "loss": 743.1618, "step": 9900 }, { "ce_loss_10": 3.5815725326538086, "ce_loss_13": 3.5311940193176268, "ce_loss_2": 4.254600787162781, "ce_loss_3": 4.035233128070831, "ce_loss_7": 3.6864403247833253, "epoch": 0.991, "grad_norm": 580.0, "kl_loss_10": 102.49858741760254, "kl_loss_2": 1520.3685241699218, "kl_loss_3": 1070.8265075683594, "kl_loss_7": 328.1074920654297, "learning_rate": 2.0390358590538505e-07, "loss": 740.4264, "step": 9910 }, { "ce_loss_10": 3.5888866186141968, "ce_loss_13": 3.5394426822662353, "ce_loss_2": 4.234891438484192, "ce_loss_3": 4.02474262714386, "ce_loss_7": 3.6895466804504395, "epoch": 0.992, "grad_norm": 1168.0, "kl_loss_10": 101.30520286560059, "kl_loss_2": 1472.2737854003906, "kl_loss_3": 1045.0297576904297, "kl_loss_7": 325.7491256713867, "learning_rate": 1.61111304545436e-07, "loss": 726.4056, "step": 9920 }, { "ce_loss_10": 3.5562559604644775, "ce_loss_13": 3.5081410884857176, "ce_loss_2": 4.196097910404205, "ce_loss_3": 3.9858675599098206, "ce_loss_7": 3.6498913168907166, "epoch": 0.993, "grad_norm": 492.0, "kl_loss_10": 98.92250175476075, "kl_loss_2": 1469.2185607910155, "kl_loss_3": 1034.9410614013673, "kl_loss_7": 316.94109802246095, "learning_rate": 1.2335239524541298e-07, "loss": 718.1404, "step": 9930 }, { "ce_loss_10": 3.5263262748718263, "ce_loss_13": 3.4782188057899477, "ce_loss_2": 4.171599221229553, "ce_loss_3": 3.9576822400093077, "ce_loss_7": 3.6256606340408326, "epoch": 0.994, "grad_norm": 776.0, "kl_loss_10": 98.82010307312012, "kl_loss_2": 1459.3435363769531, "kl_loss_3": 1024.1364196777345, "kl_loss_7": 315.70213470458987, "learning_rate": 9.06272382371065e-08, "loss": 729.0179, "step": 9940 }, { "ce_loss_10": 3.596427261829376, "ce_loss_13": 3.5463723301887513, "ce_loss_2": 4.251511228084564, "ce_loss_3": 4.029919493198395, "ce_loss_7": 3.694312071800232, "epoch": 0.995, "grad_norm": 776.0, "kl_loss_10": 101.4124641418457, "kl_loss_2": 1491.3899658203125, "kl_loss_3": 1047.7046600341796, "kl_loss_7": 322.27986907958984, "learning_rate": 6.293616306246586e-08, "loss": 731.8995, "step": 9950 }, { "ce_loss_10": 3.591547703742981, "ce_loss_13": 3.546114706993103, "ce_loss_2": 4.207206773757934, "ce_loss_3": 4.000539326667786, "ce_loss_7": 3.6898637294769285, "epoch": 0.996, "grad_norm": 512.0, "kl_loss_10": 97.26972007751465, "kl_loss_2": 1417.1295532226563, "kl_loss_3": 1000.4558502197266, "kl_loss_7": 311.8316848754883, "learning_rate": 4.027944857032395e-08, "loss": 706.6073, "step": 9960 }, { "ce_loss_10": 3.5830178380012514, "ce_loss_13": 3.5348027467727663, "ce_loss_2": 4.194839382171631, "ce_loss_3": 3.9924580574035646, "ce_loss_7": 3.673254609107971, "epoch": 0.997, "grad_norm": 592.0, "kl_loss_10": 97.76697616577148, "kl_loss_2": 1396.567919921875, "kl_loss_3": 986.9955322265625, "kl_loss_7": 303.8766387939453, "learning_rate": 2.265732291356626e-08, "loss": 705.8461, "step": 9970 }, { "ce_loss_10": 3.6323296189308167, "ce_loss_13": 3.583577513694763, "ce_loss_2": 4.261997354030609, "ce_loss_3": 4.047069096565247, "ce_loss_7": 3.7265624046325683, "epoch": 0.998, "grad_norm": 616.0, "kl_loss_10": 99.8657657623291, "kl_loss_2": 1437.0718994140625, "kl_loss_3": 1011.7502319335938, "kl_loss_7": 317.0954345703125, "learning_rate": 1.0069963546743833e-08, "loss": 733.7814, "step": 9980 }, { "ce_loss_10": 3.604882597923279, "ce_loss_13": 3.556373083591461, "ce_loss_2": 4.24988762140274, "ce_loss_3": 4.032586073875427, "ce_loss_7": 3.7061725497245788, "epoch": 0.999, "grad_norm": 960.0, "kl_loss_10": 100.96372756958007, "kl_loss_2": 1466.6524536132813, "kl_loss_3": 1031.8848724365234, "kl_loss_7": 320.97324523925784, "learning_rate": 2.517497224463483e-09, "loss": 727.0809, "step": 9990 }, { "ce_loss_10": 3.563299095630646, "ce_loss_13": 3.514452576637268, "ce_loss_2": 4.253318119049072, "ce_loss_3": 4.018176889419555, "ce_loss_7": 3.6688645243644715, "epoch": 1.0, "grad_norm": 860.0, "kl_loss_10": 102.08984718322753, "kl_loss_2": 1549.4201416015626, "kl_loss_3": 1071.8392395019532, "kl_loss_7": 329.1241226196289, "learning_rate": 0.0, "loss": 750.2723, "step": 10000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.177819035608023e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }