{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "ce_loss_10": 5.865575313568115, "ce_loss_13": 3.509598731994629, "ce_loss_2": 10.007837295532227, "ce_loss_3": 13.285265445709229, "ce_loss_7": 6.684143781661987, "epoch": 0.0001, "grad_norm": 110592.0, "kl_loss_10": 5785.801025390625, "kl_loss_2": 13150.353515625, "kl_loss_3": 19835.314453125, "kl_loss_7": 7173.60595703125, "learning_rate": 1e-05, "loss": 11523.9297, "step": 1 }, { "ce_loss_10": 5.085373534096612, "ce_loss_13": 3.570544626977709, "ce_loss_2": 7.954865243699816, "ce_loss_3": 9.156357712215847, "ce_loss_7": 5.791120873557197, "epoch": 0.001, "grad_norm": 12160.0, "kl_loss_10": 3681.9522908528647, "kl_loss_2": 8514.069715711805, "kl_loss_3": 11012.32679578993, "kl_loss_7": 4671.0064697265625, "learning_rate": 0.0001, "loss": 7057.7209, "step": 10 }, { "ce_loss_10": 4.197324633598328, "ce_loss_13": 3.5781216859817504, "ce_loss_2": 6.396872496604919, "ce_loss_3": 6.317626190185547, "ce_loss_7": 4.718231725692749, "epoch": 0.002, "grad_norm": 3088.0, "kl_loss_10": 1178.2664031982422, "kl_loss_2": 5194.5411376953125, "kl_loss_3": 5041.807763671875, "kl_loss_7": 2107.8896606445314, "learning_rate": 0.0002, "loss": 3433.318, "step": 20 }, { "ce_loss_10": 3.7206726551055906, "ce_loss_13": 3.368276393413544, "ce_loss_2": 5.781760859489441, "ce_loss_3": 5.545315575599671, "ce_loss_7": 4.186028468608856, "epoch": 0.003, "grad_norm": 4032.0, "kl_loss_10": 674.2754943847656, "kl_loss_2": 4511.11181640625, "kl_loss_3": 4054.1059814453124, "kl_loss_7": 1528.1455383300781, "learning_rate": 0.0003, "loss": 2640.5512, "step": 30 }, { "ce_loss_10": 3.821107840538025, "ce_loss_13": 3.536966252326965, "ce_loss_2": 5.572521734237671, "ce_loss_3": 5.341288280487061, "ce_loss_7": 4.234801423549652, "epoch": 0.004, "grad_norm": 2960.0, "kl_loss_10": 524.9487045288085, "kl_loss_2": 3838.2613525390625, "kl_loss_3": 3402.8488647460936, "kl_loss_7": 1309.348095703125, "learning_rate": 0.0004, "loss": 2283.1129, "step": 40 }, { "ce_loss_10": 3.7567438006401064, "ce_loss_13": 3.5144933104515075, "ce_loss_2": 5.419973969459534, "ce_loss_3": 5.193652868270874, "ce_loss_7": 4.1413051843643185, "epoch": 0.005, "grad_norm": 6016.0, "kl_loss_10": 445.8336715698242, "kl_loss_2": 3600.4189453125, "kl_loss_3": 3176.044384765625, "kl_loss_7": 1185.156512451172, "learning_rate": 0.0005, "loss": 2103.5049, "step": 50 }, { "ce_loss_10": 3.722773575782776, "ce_loss_13": 3.5224708676338197, "ce_loss_2": 5.278621530532837, "ce_loss_3": 5.080061268806458, "ce_loss_7": 4.097856783866883, "epoch": 0.006, "grad_norm": 3824.0, "kl_loss_10": 390.2220687866211, "kl_loss_2": 3350.347863769531, "kl_loss_3": 2998.066064453125, "kl_loss_7": 1115.832781982422, "learning_rate": 0.0006, "loss": 1964.402, "step": 60 }, { "ce_loss_10": 3.6256401419639586, "ce_loss_13": 3.4409272193908693, "ce_loss_2": 5.1505759954452515, "ce_loss_3": 4.97669529914856, "ce_loss_7": 3.983938765525818, "epoch": 0.007, "grad_norm": 3920.0, "kl_loss_10": 355.79547729492185, "kl_loss_2": 3279.6125610351564, "kl_loss_3": 2951.422497558594, "kl_loss_7": 1050.2557006835937, "learning_rate": 0.0007, "loss": 1892.3121, "step": 70 }, { "ce_loss_10": 3.611355257034302, "ce_loss_13": 3.4418101072311402, "ce_loss_2": 5.105751895904541, "ce_loss_3": 4.90991735458374, "ce_loss_7": 3.9785196661949156, "epoch": 0.008, "grad_norm": 3488.0, "kl_loss_10": 338.34486999511716, "kl_loss_2": 3190.1597534179687, "kl_loss_3": 2825.821044921875, "kl_loss_7": 1014.8152435302734, "learning_rate": 0.0008, "loss": 1848.4, "step": 80 }, { "ce_loss_10": 3.5723747968673707, "ce_loss_13": 3.4033101916313173, "ce_loss_2": 5.05491304397583, "ce_loss_3": 4.844810819625854, "ce_loss_7": 3.8869771361351013, "epoch": 0.009, "grad_norm": 3504.0, "kl_loss_10": 351.16760406494143, "kl_loss_2": 3202.9417724609375, "kl_loss_3": 2806.134655761719, "kl_loss_7": 954.3875213623047, "learning_rate": 0.0009000000000000001, "loss": 1819.1223, "step": 90 }, { "ce_loss_10": 3.714704382419586, "ce_loss_13": 3.524587631225586, "ce_loss_2": 5.140144395828247, "ce_loss_3": 4.962791919708252, "ce_loss_7": 4.010744750499725, "epoch": 0.01, "grad_norm": 4832.0, "kl_loss_10": 371.28753662109375, "kl_loss_2": 3117.2229125976564, "kl_loss_3": 2795.0933227539062, "kl_loss_7": 947.7742279052734, "learning_rate": 0.001, "loss": 1805.8203, "step": 100 }, { "ce_loss_10": 3.6942928433418274, "ce_loss_13": 3.4817589998245237, "ce_loss_2": 5.091573238372803, "ce_loss_3": 4.877256274223328, "ce_loss_7": 3.9481292366981506, "epoch": 0.011, "grad_norm": 2008.0, "kl_loss_10": 429.9922073364258, "kl_loss_2": 3126.605187988281, "kl_loss_3": 2695.8526123046877, "kl_loss_7": 906.8043029785156, "learning_rate": 0.0009999974825027757, "loss": 1785.3453, "step": 110 }, { "ce_loss_10": 3.7456037163734437, "ce_loss_13": 3.5361623644828795, "ce_loss_2": 5.1227661848068236, "ce_loss_3": 4.809029960632325, "ce_loss_7": 4.0057451844215395, "epoch": 0.012, "grad_norm": 2736.0, "kl_loss_10": 421.653271484375, "kl_loss_2": 3041.225866699219, "kl_loss_3": 2468.0310668945312, "kl_loss_7": 901.2433288574218, "learning_rate": 0.0009999899300364532, "loss": 1691.2215, "step": 120 }, { "ce_loss_10": 3.6967467427253724, "ce_loss_13": 3.5110474586486817, "ce_loss_2": 5.076847052574157, "ce_loss_3": 4.790580677986145, "ce_loss_7": 3.9812671184539794, "epoch": 0.013, "grad_norm": 1792.0, "kl_loss_10": 366.0322860717773, "kl_loss_2": 3049.608190917969, "kl_loss_3": 2474.2186645507813, "kl_loss_7": 909.4364349365235, "learning_rate": 0.0009999773426770863, "loss": 1717.7953, "step": 130 }, { "ce_loss_10": 3.7232070088386537, "ce_loss_13": 3.543111026287079, "ce_loss_2": 5.047617030143738, "ce_loss_3": 4.763497018814087, "ce_loss_7": 3.990681600570679, "epoch": 0.014, "grad_norm": 1920.0, "kl_loss_10": 359.95825347900393, "kl_loss_2": 2908.210729980469, "kl_loss_3": 2355.444958496094, "kl_loss_7": 888.9608703613281, "learning_rate": 0.0009999597205514296, "loss": 1639.8703, "step": 140 }, { "ce_loss_10": 3.676049864292145, "ce_loss_13": 3.505241572856903, "ce_loss_2": 4.947909688949585, "ce_loss_3": 4.669061374664307, "ce_loss_7": 3.9511622548103333, "epoch": 0.015, "grad_norm": 1376.0, "kl_loss_10": 345.57527313232424, "kl_loss_2": 2795.33154296875, "kl_loss_3": 2267.452508544922, "kl_loss_7": 859.9151733398437, "learning_rate": 0.0009999370638369377, "loss": 1577.1158, "step": 150 }, { "ce_loss_10": 3.7071213841438295, "ce_loss_13": 3.5414621829986572, "ce_loss_2": 4.982771682739258, "ce_loss_3": 4.697902798652649, "ce_loss_7": 3.964942920207977, "epoch": 0.016, "grad_norm": 1568.0, "kl_loss_10": 323.99631652832034, "kl_loss_2": 2816.620886230469, "kl_loss_3": 2258.00849609375, "kl_loss_7": 828.0151275634765, "learning_rate": 0.000999909372761763, "loss": 1565.1712, "step": 160 }, { "ce_loss_10": 3.629442536830902, "ce_loss_13": 3.475699579715729, "ce_loss_2": 4.9288132905960085, "ce_loss_3": 4.636562061309815, "ce_loss_7": 3.8839596271514893, "epoch": 0.017, "grad_norm": 1512.0, "kl_loss_10": 310.6042221069336, "kl_loss_2": 2849.280310058594, "kl_loss_3": 2291.288397216797, "kl_loss_7": 820.7699920654297, "learning_rate": 0.0009998766476047546, "loss": 1575.1025, "step": 170 }, { "ce_loss_10": 3.6930266618728638, "ce_loss_13": 3.519435966014862, "ce_loss_2": 4.993418955802918, "ce_loss_3": 4.673493552207947, "ce_loss_7": 3.9580948114395142, "epoch": 0.018, "grad_norm": 3152.0, "kl_loss_10": 337.58184814453125, "kl_loss_2": 2880.2222290039062, "kl_loss_3": 2268.8982666015627, "kl_loss_7": 865.2954742431641, "learning_rate": 0.0009998388886954545, "loss": 1604.3529, "step": 180 }, { "ce_loss_10": 3.6799558639526366, "ce_loss_13": 3.486136960983276, "ce_loss_2": 4.946302032470703, "ce_loss_3": 4.601687026023865, "ce_loss_7": 3.9099194288253782, "epoch": 0.019, "grad_norm": 1672.0, "kl_loss_10": 402.4949157714844, "kl_loss_2": 2886.513952636719, "kl_loss_3": 2211.9050720214846, "kl_loss_7": 841.9078552246094, "learning_rate": 0.0009997960964140947, "loss": 1577.3238, "step": 190 }, { "ce_loss_10": 3.650504744052887, "ce_loss_13": 3.4824187994003295, "ce_loss_2": 4.945061421394348, "ce_loss_3": 4.60343816280365, "ce_loss_7": 3.8988500118255613, "epoch": 0.02, "grad_norm": 1872.0, "kl_loss_10": 343.68775634765626, "kl_loss_2": 2847.6089111328124, "kl_loss_3": 2192.5098266601562, "kl_loss_7": 843.3558349609375, "learning_rate": 0.0009997482711915926, "loss": 1555.5152, "step": 200 }, { "ce_loss_10": 3.6082719564437866, "ce_loss_13": 3.44773451089859, "ce_loss_2": 4.884588956832886, "ce_loss_3": 4.508144104480744, "ce_loss_7": 3.8779943227767943, "epoch": 0.021, "grad_norm": 1736.0, "kl_loss_10": 324.77989807128904, "kl_loss_2": 2826.437780761719, "kl_loss_3": 2092.716345214844, "kl_loss_7": 846.5946807861328, "learning_rate": 0.0009996954135095479, "loss": 1513.0088, "step": 210 }, { "ce_loss_10": 3.6795857906341554, "ce_loss_13": 3.5324608206748964, "ce_loss_2": 4.862332582473755, "ce_loss_3": 4.554595494270325, "ce_loss_7": 3.907832646369934, "epoch": 0.022, "grad_norm": 1032.0, "kl_loss_10": 296.77941131591797, "kl_loss_2": 2593.2522094726564, "kl_loss_3": 2006.6829711914063, "kl_loss_7": 745.140835571289, "learning_rate": 0.0009996375239002368, "loss": 1414.3565, "step": 220 }, { "ce_loss_10": 3.740398120880127, "ce_loss_13": 3.5988372921943665, "ce_loss_2": 4.867113184928894, "ce_loss_3": 4.597586512565613, "ce_loss_7": 3.96889066696167, "epoch": 0.023, "grad_norm": 1048.0, "kl_loss_10": 276.71729202270507, "kl_loss_2": 2492.076037597656, "kl_loss_3": 1982.453546142578, "kl_loss_7": 727.5907012939454, "learning_rate": 0.0009995746029466072, "loss": 1380.723, "step": 230 }, { "ce_loss_10": 3.5279888272285462, "ce_loss_13": 3.3874544382095335, "ce_loss_2": 4.788827538490295, "ce_loss_3": 4.472659921646118, "ce_loss_7": 3.778586721420288, "epoch": 0.024, "grad_norm": 1296.0, "kl_loss_10": 279.43826217651366, "kl_loss_2": 2758.438586425781, "kl_loss_3": 2143.7233947753907, "kl_loss_7": 779.3903015136718, "learning_rate": 0.0009995066512822719, "loss": 1441.9243, "step": 240 }, { "ce_loss_10": 3.6201495051383974, "ce_loss_13": 3.4905712485313414, "ce_loss_2": 4.891209101676941, "ce_loss_3": 4.570740580558777, "ce_loss_7": 3.865015411376953, "epoch": 0.025, "grad_norm": 1432.0, "kl_loss_10": 261.5669853210449, "kl_loss_2": 2739.434948730469, "kl_loss_3": 2116.1525390625, "kl_loss_7": 737.9495910644531, "learning_rate": 0.000999433669591504, "loss": 1414.2406, "step": 250 }, { "ce_loss_10": 3.5210766077041624, "ce_loss_13": 3.3931028485298156, "ce_loss_2": 4.796879243850708, "ce_loss_3": 4.466079044342041, "ce_loss_7": 3.767536473274231, "epoch": 0.026, "grad_norm": 1368.0, "kl_loss_10": 264.8755844116211, "kl_loss_2": 2785.579797363281, "kl_loss_3": 2137.9736206054686, "kl_loss_7": 744.8723297119141, "learning_rate": 0.000999355658609228, "loss": 1444.816, "step": 260 }, { "ce_loss_10": 3.5607513666152952, "ce_loss_13": 3.4221261620521544, "ce_loss_2": 4.833321261405945, "ce_loss_3": 4.517770767211914, "ce_loss_7": 3.800622284412384, "epoch": 0.027, "grad_norm": 932.0, "kl_loss_10": 273.0821258544922, "kl_loss_2": 2761.8063842773436, "kl_loss_3": 2143.9581298828125, "kl_loss_7": 740.5351409912109, "learning_rate": 0.0009992726191210138, "loss": 1461.2188, "step": 270 }, { "ce_loss_10": 3.5995601177215577, "ce_loss_13": 3.465478837490082, "ce_loss_2": 4.800075507164001, "ce_loss_3": 4.5197283864021305, "ce_loss_7": 3.8348299264907837, "epoch": 0.028, "grad_norm": 1432.0, "kl_loss_10": 281.3691596984863, "kl_loss_2": 2633.5447509765627, "kl_loss_3": 2094.4590698242187, "kl_loss_7": 738.63564453125, "learning_rate": 0.0009991845519630679, "loss": 1420.4679, "step": 280 }, { "ce_loss_10": 3.4883801102638246, "ce_loss_13": 3.3483877182006836, "ce_loss_2": 4.6885244131088255, "ce_loss_3": 4.424971508979797, "ce_loss_7": 3.7516568541526794, "epoch": 0.029, "grad_norm": 1696.0, "kl_loss_10": 289.22730712890626, "kl_loss_2": 2620.5162719726563, "kl_loss_3": 2117.432470703125, "kl_loss_7": 775.0425170898437, "learning_rate": 0.0009990914580222257, "loss": 1449.7931, "step": 290 }, { "ce_loss_10": 3.6296466469764708, "ce_loss_13": 3.491527271270752, "ce_loss_2": 4.723770475387573, "ce_loss_3": 4.4687070608139035, "ce_loss_7": 3.8463655352592467, "epoch": 0.03, "grad_norm": 960.0, "kl_loss_10": 275.84105758666993, "kl_loss_2": 2456.5258056640623, "kl_loss_3": 1969.6472045898438, "kl_loss_7": 744.4771392822265, "learning_rate": 0.0009989933382359422, "loss": 1393.9777, "step": 300 }, { "ce_loss_10": 3.6187907338142393, "ce_loss_13": 3.494164001941681, "ce_loss_2": 4.727794432640076, "ce_loss_3": 4.440771436691284, "ce_loss_7": 3.837166500091553, "epoch": 0.031, "grad_norm": 1020.0, "kl_loss_10": 256.4068969726562, "kl_loss_2": 2450.5587768554688, "kl_loss_3": 1899.8815795898438, "kl_loss_7": 696.280160522461, "learning_rate": 0.0009988901935922825, "loss": 1338.8556, "step": 310 }, { "ce_loss_10": 3.475846517086029, "ce_loss_13": 3.3469168901443482, "ce_loss_2": 4.662670254707336, "ce_loss_3": 4.363593912124633, "ce_loss_7": 3.6978800535202025, "epoch": 0.032, "grad_norm": 1064.0, "kl_loss_10": 265.57069702148436, "kl_loss_2": 2603.8465576171875, "kl_loss_3": 2019.1532775878907, "kl_loss_7": 702.3248809814453, "learning_rate": 0.0009987820251299122, "loss": 1359.8287, "step": 320 }, { "ce_loss_10": 3.6019670248031614, "ce_loss_13": 3.4775232672691345, "ce_loss_2": 4.698803400993347, "ce_loss_3": 4.42151095867157, "ce_loss_7": 3.817472517490387, "epoch": 0.033, "grad_norm": 940.0, "kl_loss_10": 252.79367218017578, "kl_loss_2": 2444.330114746094, "kl_loss_3": 1906.024169921875, "kl_loss_7": 675.2368347167969, "learning_rate": 0.0009986688339380862, "loss": 1315.1269, "step": 330 }, { "ce_loss_10": 3.5493945360183714, "ce_loss_13": 3.430993151664734, "ce_loss_2": 4.632452750205994, "ce_loss_3": 4.36405119895935, "ce_loss_7": 3.7596432328224183, "epoch": 0.034, "grad_norm": 964.0, "kl_loss_10": 238.95554962158204, "kl_loss_2": 2376.1999267578126, "kl_loss_3": 1854.2742736816406, "kl_loss_7": 652.8402496337891, "learning_rate": 0.0009985506211566387, "loss": 1294.451, "step": 340 }, { "ce_loss_10": 3.5777671217918394, "ce_loss_13": 3.459764850139618, "ce_loss_2": 4.638717436790467, "ce_loss_3": 4.377784371376038, "ce_loss_7": 3.7803191184997558, "epoch": 0.035, "grad_norm": 1072.0, "kl_loss_10": 247.259854888916, "kl_loss_2": 2338.356652832031, "kl_loss_3": 1833.6623596191407, "kl_loss_7": 643.2027313232422, "learning_rate": 0.0009984273879759713, "loss": 1274.0854, "step": 350 }, { "ce_loss_10": 3.615658330917358, "ce_loss_13": 3.4873095512390138, "ce_loss_2": 4.692537188529968, "ce_loss_3": 4.444826865196228, "ce_loss_7": 3.812762463092804, "epoch": 0.036, "grad_norm": 944.0, "kl_loss_10": 267.5936080932617, "kl_loss_2": 2401.089453125, "kl_loss_3": 1906.0577697753906, "kl_loss_7": 656.432861328125, "learning_rate": 0.0009982991356370402, "loss": 1322.0952, "step": 360 }, { "ce_loss_10": 3.60038241147995, "ce_loss_13": 3.4653908014297485, "ce_loss_2": 4.659529018402099, "ce_loss_3": 4.398728227615356, "ce_loss_7": 3.788052773475647, "epoch": 0.037, "grad_norm": 1264.0, "kl_loss_10": 274.8468948364258, "kl_loss_2": 2378.8714233398437, "kl_loss_3": 1863.3742370605469, "kl_loss_7": 654.2035675048828, "learning_rate": 0.0009981658654313456, "loss": 1294.735, "step": 370 }, { "ce_loss_10": 3.6825244545936586, "ce_loss_13": 3.5485115528106688, "ce_loss_2": 4.6979457378387455, "ce_loss_3": 4.432817983627319, "ce_loss_7": 3.8632850527763365, "epoch": 0.038, "grad_norm": 776.0, "kl_loss_10": 272.49184265136716, "kl_loss_2": 2301.4927673339844, "kl_loss_3": 1785.1670227050781, "kl_loss_7": 634.1265563964844, "learning_rate": 0.000998027578700917, "loss": 1269.1893, "step": 380 }, { "ce_loss_10": 3.605938124656677, "ce_loss_13": 3.478776490688324, "ce_loss_2": 4.650116562843323, "ce_loss_3": 4.38636953830719, "ce_loss_7": 3.796186101436615, "epoch": 0.039, "grad_norm": 884.0, "kl_loss_10": 259.15301818847655, "kl_loss_2": 2337.7537109375, "kl_loss_3": 1833.4333435058593, "kl_loss_7": 650.2792358398438, "learning_rate": 0.0009978842768382998, "loss": 1277.8982, "step": 390 }, { "ce_loss_10": 3.619408428668976, "ce_loss_13": 3.5009591698646547, "ce_loss_2": 4.620783948898316, "ce_loss_3": 4.36586412191391, "ce_loss_7": 3.8007230043411253, "epoch": 0.04, "grad_norm": 924.0, "kl_loss_10": 239.69640197753907, "kl_loss_2": 2237.8325927734377, "kl_loss_3": 1755.0042358398437, "kl_loss_7": 615.1319396972656, "learning_rate": 0.0009977359612865424, "loss": 1226.1033, "step": 400 }, { "ce_loss_10": 3.6167723059654238, "ce_loss_13": 3.5048955202102663, "ce_loss_2": 4.640660691261291, "ce_loss_3": 4.3846115350723265, "ce_loss_7": 3.8166274309158323, "epoch": 0.041, "grad_norm": 724.0, "kl_loss_10": 232.37744522094727, "kl_loss_2": 2279.1153564453125, "kl_loss_3": 1781.2996765136718, "kl_loss_7": 644.5638702392578, "learning_rate": 0.0009975826335391806, "loss": 1223.2463, "step": 410 }, { "ce_loss_10": 3.6286866068840027, "ce_loss_13": 3.52371027469635, "ce_loss_2": 4.6422699928283695, "ce_loss_3": 4.3733536958694454, "ce_loss_7": 3.8216816902160646, "epoch": 0.042, "grad_norm": 920.0, "kl_loss_10": 219.2363723754883, "kl_loss_2": 2237.4999084472656, "kl_loss_3": 1720.3533020019531, "kl_loss_7": 611.6994201660157, "learning_rate": 0.0009974242951402235, "loss": 1211.1291, "step": 420 }, { "ce_loss_10": 3.638279104232788, "ce_loss_13": 3.528469812870026, "ce_loss_2": 4.666049456596374, "ce_loss_3": 4.395814538002014, "ce_loss_7": 3.8397316694259644, "epoch": 0.043, "grad_norm": 980.0, "kl_loss_10": 225.09888305664063, "kl_loss_2": 2284.1164428710936, "kl_loss_3": 1757.2524719238281, "kl_loss_7": 629.9028503417969, "learning_rate": 0.0009972609476841367, "loss": 1210.4814, "step": 430 }, { "ce_loss_10": 3.549589216709137, "ce_loss_13": 3.4418772578239443, "ce_loss_2": 4.606719565391541, "ce_loss_3": 4.324760735034943, "ce_loss_7": 3.7483399391174315, "epoch": 0.044, "grad_norm": 932.0, "kl_loss_10": 216.09697952270508, "kl_loss_2": 2312.0109741210936, "kl_loss_3": 1769.2410278320312, "kl_loss_7": 615.9786346435546, "learning_rate": 0.0009970925928158272, "loss": 1228.8015, "step": 440 }, { "ce_loss_10": 3.4980531454086305, "ce_loss_13": 3.385247766971588, "ce_loss_2": 4.560380291938782, "ce_loss_3": 4.278938281536102, "ce_loss_7": 3.7021584153175353, "epoch": 0.045, "grad_norm": 908.0, "kl_loss_10": 226.3798355102539, "kl_loss_2": 2369.1811340332033, "kl_loss_3": 1809.7959411621093, "kl_loss_7": 638.1731658935547, "learning_rate": 0.000996919232230627, "loss": 1239.8368, "step": 450 }, { "ce_loss_10": 3.576229965686798, "ce_loss_13": 3.473420023918152, "ce_loss_2": 4.5887164831161495, "ce_loss_3": 4.321652209758758, "ce_loss_7": 3.77212575674057, "epoch": 0.046, "grad_norm": 1024.0, "kl_loss_10": 210.70022430419922, "kl_loss_2": 2231.498858642578, "kl_loss_3": 1716.9997863769531, "kl_loss_7": 612.2136322021485, "learning_rate": 0.0009967408676742752, "loss": 1172.7754, "step": 460 }, { "ce_loss_10": 3.734071063995361, "ce_loss_13": 3.618434226512909, "ce_loss_2": 4.695290613174438, "ce_loss_3": 4.43647825717926, "ce_loss_7": 3.918177044391632, "epoch": 0.047, "grad_norm": 908.0, "kl_loss_10": 231.6582176208496, "kl_loss_2": 2180.8106018066405, "kl_loss_3": 1692.2353576660157, "kl_loss_7": 612.3778228759766, "learning_rate": 0.0009965575009429006, "loss": 1209.0801, "step": 470 }, { "ce_loss_10": 3.5073325395584107, "ce_loss_13": 3.397393560409546, "ce_loss_2": 4.541721534729004, "ce_loss_3": 4.268522250652313, "ce_loss_7": 3.696891689300537, "epoch": 0.048, "grad_norm": 888.0, "kl_loss_10": 227.73510971069337, "kl_loss_2": 2294.750048828125, "kl_loss_3": 1770.7283081054688, "kl_loss_7": 619.1150939941406, "learning_rate": 0.0009963691338830043, "loss": 1207.0088, "step": 480 }, { "ce_loss_10": 3.598436427116394, "ce_loss_13": 3.498102140426636, "ce_loss_2": 4.597862339019775, "ce_loss_3": 4.321764397621155, "ce_loss_7": 3.779419946670532, "epoch": 0.049, "grad_norm": 820.0, "kl_loss_10": 216.79126739501953, "kl_loss_2": 2232.1513854980467, "kl_loss_3": 1698.4708862304688, "kl_loss_7": 586.6665161132812, "learning_rate": 0.0009961757683914405, "loss": 1170.0627, "step": 490 }, { "ce_loss_10": 3.587793219089508, "ce_loss_13": 3.479810047149658, "ce_loss_2": 4.565227222442627, "ce_loss_3": 4.309310102462769, "ce_loss_7": 3.773853075504303, "epoch": 0.05, "grad_norm": 1760.0, "kl_loss_10": 222.1615104675293, "kl_loss_2": 2198.9348083496093, "kl_loss_3": 1694.7262390136718, "kl_loss_7": 612.1928680419921, "learning_rate": 0.0009959774064153978, "loss": 1195.7824, "step": 500 }, { "ce_loss_10": 3.590281581878662, "ce_loss_13": 3.492150938510895, "ce_loss_2": 4.57180380821228, "ce_loss_3": 4.302522945404053, "ce_loss_7": 3.772382390499115, "epoch": 0.051, "grad_norm": 972.0, "kl_loss_10": 210.51349639892578, "kl_loss_2": 2180.4048400878905, "kl_loss_3": 1653.8468933105469, "kl_loss_7": 588.0678894042969, "learning_rate": 0.0009957740499523787, "loss": 1178.0232, "step": 510 }, { "ce_loss_10": 3.6132477045059206, "ce_loss_13": 3.511445939540863, "ce_loss_2": 4.587271404266358, "ce_loss_3": 4.3159163236618046, "ce_loss_7": 3.796973693370819, "epoch": 0.052, "grad_norm": 720.0, "kl_loss_10": 204.7069091796875, "kl_loss_2": 2151.1836120605467, "kl_loss_3": 1634.1124877929688, "kl_loss_7": 572.2468353271485, "learning_rate": 0.0009955657010501807, "loss": 1149.754, "step": 520 }, { "ce_loss_10": 3.5784901857376097, "ce_loss_13": 3.4709136962890623, "ce_loss_2": 4.551413655281067, "ce_loss_3": 4.286619782447815, "ce_loss_7": 3.7552420496940613, "epoch": 0.053, "grad_norm": 1032.0, "kl_loss_10": 223.27496261596679, "kl_loss_2": 2177.6577087402343, "kl_loss_3": 1660.8929260253906, "kl_loss_7": 572.54072265625, "learning_rate": 0.000995352361806875, "loss": 1147.3114, "step": 530 }, { "ce_loss_10": 3.625127899646759, "ce_loss_13": 3.511006486415863, "ce_loss_2": 4.581828641891479, "ce_loss_3": 4.31304669380188, "ce_loss_7": 3.7988196134567263, "epoch": 0.054, "grad_norm": 732.0, "kl_loss_10": 231.79150848388673, "kl_loss_2": 2171.6323669433596, "kl_loss_3": 1659.2912536621093, "kl_loss_7": 599.0842193603515, "learning_rate": 0.0009951340343707852, "loss": 1174.3984, "step": 540 }, { "ce_loss_10": 3.676268827915192, "ce_loss_13": 3.5701403856277465, "ce_loss_2": 4.638545846939087, "ce_loss_3": 4.371236753463745, "ce_loss_7": 3.8460424661636354, "epoch": 0.055, "grad_norm": 700.0, "kl_loss_10": 216.45612487792968, "kl_loss_2": 2133.3610229492188, "kl_loss_3": 1642.1666015625, "kl_loss_7": 563.3070922851563, "learning_rate": 0.0009949107209404665, "loss": 1152.1754, "step": 550 }, { "ce_loss_10": 3.587022233009338, "ce_loss_13": 3.474756634235382, "ce_loss_2": 4.525298738479615, "ce_loss_3": 4.273183763027191, "ce_loss_7": 3.750283181667328, "epoch": 0.056, "grad_norm": 676.0, "kl_loss_10": 231.15026779174804, "kl_loss_2": 2132.3562866210937, "kl_loss_3": 1646.3475463867187, "kl_loss_7": 568.931118774414, "learning_rate": 0.0009946824237646824, "loss": 1145.0641, "step": 560 }, { "ce_loss_10": 3.5445329308509828, "ce_loss_13": 3.4262341499328612, "ce_loss_2": 4.4978625774383545, "ce_loss_3": 4.245725357532502, "ce_loss_7": 3.7067874550819395, "epoch": 0.057, "grad_norm": 824.0, "kl_loss_10": 238.33600463867188, "kl_loss_2": 2178.645184326172, "kl_loss_3": 1689.5344848632812, "kl_loss_7": 583.5655319213868, "learning_rate": 0.0009944491451423828, "loss": 1186.293, "step": 570 }, { "ce_loss_10": 3.53601359128952, "ce_loss_13": 3.420380687713623, "ce_loss_2": 4.508120799064637, "ce_loss_3": 4.24744416475296, "ce_loss_7": 3.7096508502960206, "epoch": 0.058, "grad_norm": 956.0, "kl_loss_10": 240.11019058227538, "kl_loss_2": 2193.1629943847656, "kl_loss_3": 1688.384765625, "kl_loss_7": 592.8175323486328, "learning_rate": 0.0009942108874226813, "loss": 1155.7771, "step": 580 }, { "ce_loss_10": 3.657444155216217, "ce_loss_13": 3.5424296021461488, "ce_loss_2": 4.5656781673431395, "ce_loss_3": 4.320976912975311, "ce_loss_7": 3.8200041532516478, "epoch": 0.059, "grad_norm": 820.0, "kl_loss_10": 237.33242263793946, "kl_loss_2": 2063.7497314453126, "kl_loss_3": 1590.3793151855468, "kl_loss_7": 574.3437225341797, "learning_rate": 0.00099396765300483, "loss": 1113.7337, "step": 590 }, { "ce_loss_10": 3.6340256094932557, "ce_loss_13": 3.519163191318512, "ce_loss_2": 4.549298119544983, "ce_loss_3": 4.307851886749267, "ce_loss_7": 3.8015130519866944, "epoch": 0.06, "grad_norm": 948.0, "kl_loss_10": 239.48185348510742, "kl_loss_2": 2090.356060791016, "kl_loss_3": 1616.7106872558593, "kl_loss_7": 579.2961456298829, "learning_rate": 0.0009937194443381972, "loss": 1128.8246, "step": 600 }, { "ce_loss_10": 3.646886777877808, "ce_loss_13": 3.5430930137634276, "ce_loss_2": 4.552362751960755, "ce_loss_3": 4.3045818567276, "ce_loss_7": 3.812400245666504, "epoch": 0.061, "grad_norm": 836.0, "kl_loss_10": 216.1532455444336, "kl_loss_2": 2053.682244873047, "kl_loss_3": 1576.81474609375, "kl_loss_7": 571.3107208251953, "learning_rate": 0.0009934662639222412, "loss": 1128.2942, "step": 610 }, { "ce_loss_10": 3.602829623222351, "ce_loss_13": 3.4988005995750426, "ce_loss_2": 4.552669429779053, "ce_loss_3": 4.288874197006225, "ce_loss_7": 3.771229422092438, "epoch": 0.062, "grad_norm": 752.0, "kl_loss_10": 211.5402572631836, "kl_loss_2": 2146.809149169922, "kl_loss_3": 1635.902978515625, "kl_loss_7": 572.7010498046875, "learning_rate": 0.000993208114306486, "loss": 1128.1789, "step": 620 }, { "ce_loss_10": 3.513095796108246, "ce_loss_13": 3.41504830121994, "ce_loss_2": 4.476359033584595, "ce_loss_3": 4.210507690906525, "ce_loss_7": 3.692080223560333, "epoch": 0.063, "grad_norm": 840.0, "kl_loss_10": 207.13141326904298, "kl_loss_2": 2149.41875, "kl_loss_3": 1630.2548095703125, "kl_loss_7": 566.9929428100586, "learning_rate": 0.0009929449980904952, "loss": 1107.0104, "step": 630 }, { "ce_loss_10": 3.5761404514312742, "ce_loss_13": 3.479358458518982, "ce_loss_2": 4.512639951705933, "ce_loss_3": 4.251035594940186, "ce_loss_7": 3.746527075767517, "epoch": 0.064, "grad_norm": 704.0, "kl_loss_10": 199.16628189086913, "kl_loss_2": 2115.853399658203, "kl_loss_3": 1620.3352111816407, "kl_loss_7": 563.6727096557618, "learning_rate": 0.0009926769179238466, "loss": 1116.6029, "step": 640 }, { "ce_loss_10": 3.6297390460968018, "ce_loss_13": 3.526135504245758, "ce_loss_2": 4.559930968284607, "ce_loss_3": 4.2981769919395445, "ce_loss_7": 3.8018287062644958, "epoch": 0.065, "grad_norm": 812.0, "kl_loss_10": 217.01969451904296, "kl_loss_2": 2101.874133300781, "kl_loss_3": 1596.4249572753906, "kl_loss_7": 579.9847625732422, "learning_rate": 0.000992403876506104, "loss": 1118.0791, "step": 650 }, { "ce_loss_10": 3.5638173818588257, "ce_loss_13": 3.4600673079490663, "ce_loss_2": 4.492022132873535, "ce_loss_3": 4.226557815074921, "ce_loss_7": 3.737081015110016, "epoch": 0.066, "grad_norm": 636.0, "kl_loss_10": 213.38376693725587, "kl_loss_2": 2105.9776245117187, "kl_loss_3": 1588.3972534179688, "kl_loss_7": 565.9338088989258, "learning_rate": 0.0009921258765867918, "loss": 1115.5061, "step": 660 }, { "ce_loss_10": 3.524133229255676, "ce_loss_13": 3.424080693721771, "ce_loss_2": 4.483368253707885, "ce_loss_3": 4.2175681591033936, "ce_loss_7": 3.6960334897041323, "epoch": 0.067, "grad_norm": 824.0, "kl_loss_10": 213.54936981201172, "kl_loss_2": 2168.286993408203, "kl_loss_3": 1640.3900268554687, "kl_loss_7": 567.8847198486328, "learning_rate": 0.0009918429209653662, "loss": 1124.8729, "step": 670 }, { "ce_loss_10": 3.5795446634292603, "ce_loss_13": 3.479706287384033, "ce_loss_2": 4.519145917892456, "ce_loss_3": 4.256680989265442, "ce_loss_7": 3.746138072013855, "epoch": 0.068, "grad_norm": 820.0, "kl_loss_10": 211.06802139282226, "kl_loss_2": 2113.3779052734376, "kl_loss_3": 1607.1892822265625, "kl_loss_7": 560.476708984375, "learning_rate": 0.0009915550124911866, "loss": 1099.4275, "step": 680 }, { "ce_loss_10": 3.5930914521217345, "ce_loss_13": 3.4938461661338804, "ce_loss_2": 4.51025116443634, "ce_loss_3": 4.262886941432953, "ce_loss_7": 3.7600345849990844, "epoch": 0.069, "grad_norm": 960.0, "kl_loss_10": 200.71142120361327, "kl_loss_2": 2064.7206604003904, "kl_loss_3": 1576.7652465820313, "kl_loss_7": 544.939192199707, "learning_rate": 0.0009912621540634887, "loss": 1096.3648, "step": 690 }, { "ce_loss_10": 3.6200631141662596, "ce_loss_13": 3.5290600538253782, "ce_loss_2": 4.496898746490478, "ce_loss_3": 4.253770506381988, "ce_loss_7": 3.775084400177002, "epoch": 0.07, "grad_norm": 952.0, "kl_loss_10": 188.68883590698243, "kl_loss_2": 2000.6690612792968, "kl_loss_3": 1528.578564453125, "kl_loss_7": 525.6970199584961, "learning_rate": 0.0009909643486313534, "loss": 1075.2783, "step": 700 }, { "ce_loss_10": 3.491303098201752, "ce_loss_13": 3.3972670793533326, "ce_loss_2": 4.432852864265442, "ce_loss_3": 4.169692039489746, "ce_loss_7": 3.659318673610687, "epoch": 0.071, "grad_norm": 772.0, "kl_loss_10": 195.57371444702147, "kl_loss_2": 2111.2869506835937, "kl_loss_3": 1599.3862426757812, "kl_loss_7": 541.3287918090821, "learning_rate": 0.000990661599193678, "loss": 1124.3994, "step": 710 }, { "ce_loss_10": 3.630624604225159, "ce_loss_13": 3.5385274052619935, "ce_loss_2": 4.531144857406616, "ce_loss_3": 4.287478506565094, "ce_loss_7": 3.7991448640823364, "epoch": 0.072, "grad_norm": 636.0, "kl_loss_10": 192.84549560546876, "kl_loss_2": 2026.7384338378906, "kl_loss_3": 1547.2736206054688, "kl_loss_7": 541.6422912597657, "learning_rate": 0.0009903539087991462, "loss": 1076.785, "step": 720 }, { "ce_loss_10": 3.594977593421936, "ce_loss_13": 3.50269376039505, "ce_loss_2": 4.492258667945862, "ce_loss_3": 4.242715549468994, "ce_loss_7": 3.7583904504776, "epoch": 0.073, "grad_norm": 732.0, "kl_loss_10": 189.20598907470702, "kl_loss_2": 2021.3861267089844, "kl_loss_3": 1532.5231994628907, "kl_loss_7": 542.5753295898437, "learning_rate": 0.0009900412805461966, "loss": 1080.9737, "step": 730 }, { "ce_loss_10": 3.6703630805015566, "ce_loss_13": 3.5826353311538695, "ce_loss_2": 4.54851381778717, "ce_loss_3": 4.306425213813782, "ce_loss_7": 3.843156623840332, "epoch": 0.074, "grad_norm": 920.0, "kl_loss_10": 187.67292861938478, "kl_loss_2": 1985.2287841796874, "kl_loss_3": 1507.3379516601562, "kl_loss_7": 538.3505889892579, "learning_rate": 0.0009897237175829927, "loss": 1072.161, "step": 740 }, { "ce_loss_10": 3.5649651885032654, "ce_loss_13": 3.4684099912643434, "ce_loss_2": 4.487882328033447, "ce_loss_3": 4.228008484840393, "ce_loss_7": 3.731177771091461, "epoch": 0.075, "grad_norm": 768.0, "kl_loss_10": 193.74180603027344, "kl_loss_2": 2084.8017517089843, "kl_loss_3": 1566.5948913574218, "kl_loss_7": 554.2125396728516, "learning_rate": 0.0009894012231073895, "loss": 1088.093, "step": 750 }, { "ce_loss_10": 3.6071489572525026, "ce_loss_13": 3.515170168876648, "ce_loss_2": 4.504528665542603, "ce_loss_3": 4.2520447134971615, "ce_loss_7": 3.7718453645706176, "epoch": 0.076, "grad_norm": 620.0, "kl_loss_10": 189.55669250488282, "kl_loss_2": 2013.1507141113282, "kl_loss_3": 1520.1027954101562, "kl_loss_7": 526.1563827514649, "learning_rate": 0.0009890738003669028, "loss": 1077.2552, "step": 760 }, { "ce_loss_10": 3.5843887448310854, "ce_loss_13": 3.489227271080017, "ce_loss_2": 4.5111163854599, "ce_loss_3": 4.255188155174255, "ce_loss_7": 3.7455286383628845, "epoch": 0.077, "grad_norm": 728.0, "kl_loss_10": 207.77494277954102, "kl_loss_2": 2098.0493408203124, "kl_loss_3": 1588.9514770507812, "kl_loss_7": 542.3229370117188, "learning_rate": 0.0009887414526586764, "loss": 1081.4613, "step": 770 }, { "ce_loss_10": 3.6421250104904175, "ce_loss_13": 3.5463628649711607, "ce_loss_2": 4.539319705963135, "ce_loss_3": 4.268165516853332, "ce_loss_7": 3.799858510494232, "epoch": 0.078, "grad_norm": 924.0, "kl_loss_10": 198.3345474243164, "kl_loss_2": 2029.2047729492188, "kl_loss_3": 1506.253399658203, "kl_loss_7": 526.1126602172851, "learning_rate": 0.0009884041833294476, "loss": 1050.2004, "step": 780 }, { "ce_loss_10": 3.6439041018486025, "ce_loss_13": 3.546130394935608, "ce_loss_2": 4.5299879789352415, "ce_loss_3": 4.267681610584259, "ce_loss_7": 3.798111093044281, "epoch": 0.079, "grad_norm": 1072.0, "kl_loss_10": 193.93799591064453, "kl_loss_2": 2009.9286010742187, "kl_loss_3": 1509.10400390625, "kl_loss_7": 542.0801528930664, "learning_rate": 0.000988061995775515, "loss": 1085.4477, "step": 790 }, { "ce_loss_10": 3.5596301794052123, "ce_loss_13": 3.470755398273468, "ce_loss_2": 4.442834830284118, "ce_loss_3": 4.188168525695801, "ce_loss_7": 3.741673398017883, "epoch": 0.08, "grad_norm": 640.0, "kl_loss_10": 202.93540420532227, "kl_loss_2": 2011.5775634765625, "kl_loss_3": 1495.8062683105468, "kl_loss_7": 546.9228302001953, "learning_rate": 0.0009877148934427035, "loss": 1066.8678, "step": 800 }, { "ce_loss_10": 3.6398961186408996, "ce_loss_13": 3.5154946088790893, "ce_loss_2": 4.5010058879852295, "ce_loss_3": 4.234906959533691, "ce_loss_7": 3.7647638440132143, "epoch": 0.081, "grad_norm": 784.0, "kl_loss_10": 239.83477783203125, "kl_loss_2": 2027.1837829589845, "kl_loss_3": 1514.4463195800781, "kl_loss_7": 519.1964233398437, "learning_rate": 0.0009873628798263297, "loss": 1059.5939, "step": 810 }, { "ce_loss_10": 3.5738606095314025, "ce_loss_13": 3.468097412586212, "ce_loss_2": 4.424917078018188, "ce_loss_3": 4.169192314147949, "ce_loss_7": 3.707214820384979, "epoch": 0.082, "grad_norm": 708.0, "kl_loss_10": 242.0955726623535, "kl_loss_2": 1952.9846618652343, "kl_loss_3": 1470.680194091797, "kl_loss_7": 500.8814727783203, "learning_rate": 0.0009870059584711668, "loss": 1069.6682, "step": 820 }, { "ce_loss_10": 3.58900762796402, "ce_loss_13": 3.4884706497192384, "ce_loss_2": 4.43571412563324, "ce_loss_3": 4.198841071128845, "ce_loss_7": 3.7228142976760865, "epoch": 0.083, "grad_norm": 788.0, "kl_loss_10": 211.60461807250977, "kl_loss_2": 1966.7832885742187, "kl_loss_3": 1492.8640014648438, "kl_loss_7": 499.06214294433596, "learning_rate": 0.000986644132971409, "loss": 1044.2496, "step": 830 }, { "ce_loss_10": 3.5730356097221376, "ce_loss_13": 3.475196421146393, "ce_loss_2": 4.4566532373428345, "ce_loss_3": 4.211660146713257, "ce_loss_7": 3.717496383190155, "epoch": 0.084, "grad_norm": 856.0, "kl_loss_10": 204.7541816711426, "kl_loss_2": 2003.5655700683594, "kl_loss_3": 1527.1750061035157, "kl_loss_7": 514.9196395874023, "learning_rate": 0.0009862774069706345, "loss": 1053.708, "step": 840 }, { "ce_loss_10": 3.6906569719314577, "ce_loss_13": 3.601027488708496, "ce_loss_2": 4.525540113449097, "ce_loss_3": 4.283427906036377, "ce_loss_7": 3.8413923501968386, "epoch": 0.085, "grad_norm": 844.0, "kl_loss_10": 194.00841827392577, "kl_loss_2": 1934.2958251953125, "kl_loss_3": 1471.3841796875, "kl_loss_7": 520.3793319702148, "learning_rate": 0.000985905784161771, "loss": 1041.0401, "step": 850 }, { "ce_loss_10": 3.6237664222717285, "ce_loss_13": 3.5309490084648134, "ce_loss_2": 4.46936445236206, "ce_loss_3": 4.220357573032379, "ce_loss_7": 3.779926073551178, "epoch": 0.086, "grad_norm": 836.0, "kl_loss_10": 189.39860305786132, "kl_loss_2": 1951.8812316894532, "kl_loss_3": 1472.02509765625, "kl_loss_7": 520.433024597168, "learning_rate": 0.000985529268287055, "loss": 1032.3361, "step": 860 }, { "ce_loss_10": 3.5477117776870726, "ce_loss_13": 3.4556665897369383, "ce_loss_2": 4.441434073448181, "ce_loss_3": 4.170016729831696, "ce_loss_7": 3.7197394251823424, "epoch": 0.087, "grad_norm": 956.0, "kl_loss_10": 188.84106826782227, "kl_loss_2": 2015.502880859375, "kl_loss_3": 1487.3405334472657, "kl_loss_7": 539.0837844848633, "learning_rate": 0.0009851478631379982, "loss": 1053.5276, "step": 870 }, { "ce_loss_10": 3.609230947494507, "ce_loss_13": 3.516581404209137, "ce_loss_2": 4.48051495552063, "ce_loss_3": 4.213979172706604, "ce_loss_7": 3.765811729431152, "epoch": 0.088, "grad_norm": 908.0, "kl_loss_10": 186.26299972534179, "kl_loss_2": 1983.9146301269532, "kl_loss_3": 1461.4365661621093, "kl_loss_7": 506.38514251708983, "learning_rate": 0.0009847615725553456, "loss": 1027.144, "step": 880 }, { "ce_loss_10": 3.6631253480911257, "ce_loss_13": 3.580364799499512, "ce_loss_2": 4.489781618118286, "ce_loss_3": 4.246252822875976, "ce_loss_7": 3.813025879859924, "epoch": 0.089, "grad_norm": 820.0, "kl_loss_10": 177.18833389282227, "kl_loss_2": 1872.1732604980468, "kl_loss_3": 1411.1392700195313, "kl_loss_7": 495.2686401367188, "learning_rate": 0.0009843704004290394, "loss": 1029.3286, "step": 890 }, { "ce_loss_10": 3.565847933292389, "ce_loss_13": 3.477725636959076, "ce_loss_2": 4.422844636440277, "ce_loss_3": 4.1976773858070375, "ce_loss_7": 3.7395910024642944, "epoch": 0.09, "grad_norm": 1012.0, "kl_loss_10": 186.12151031494142, "kl_loss_2": 1965.6116821289063, "kl_loss_3": 1515.8754577636719, "kl_loss_7": 555.1820693969727, "learning_rate": 0.0009839743506981783, "loss": 1049.7934, "step": 900 }, { "ce_loss_10": 3.490706205368042, "ce_loss_13": 3.4019437432289124, "ce_loss_2": 4.38123586177826, "ce_loss_3": 4.134083950519562, "ce_loss_7": 3.6646034359931945, "epoch": 0.091, "grad_norm": 700.0, "kl_loss_10": 189.97202682495117, "kl_loss_2": 2033.5506591796875, "kl_loss_3": 1553.0788024902345, "kl_loss_7": 550.8684387207031, "learning_rate": 0.0009835734273509786, "loss": 1058.8781, "step": 910 }, { "ce_loss_10": 3.5909106731414795, "ce_loss_13": 3.4989862561225893, "ce_loss_2": 4.468850481510162, "ce_loss_3": 4.242748379707336, "ce_loss_7": 3.7557740807533264, "epoch": 0.092, "grad_norm": 1200.0, "kl_loss_10": 187.22658081054686, "kl_loss_2": 1967.8191162109374, "kl_loss_3": 1530.7474304199218, "kl_loss_7": 525.3208801269532, "learning_rate": 0.0009831676344247342, "loss": 1050.8719, "step": 920 }, { "ce_loss_10": 3.601709270477295, "ce_loss_13": 3.517729616165161, "ce_loss_2": 4.460277819633484, "ce_loss_3": 4.208156263828277, "ce_loss_7": 3.750774598121643, "epoch": 0.093, "grad_norm": 976.0, "kl_loss_10": 178.7038887023926, "kl_loss_2": 1956.5622802734374, "kl_loss_3": 1466.5545593261718, "kl_loss_7": 497.9136016845703, "learning_rate": 0.0009827569760057755, "loss": 1037.5467, "step": 930 }, { "ce_loss_10": 3.524882364273071, "ce_loss_13": 3.433571922779083, "ce_loss_2": 4.444187712669373, "ce_loss_3": 4.170401692390442, "ce_loss_7": 3.6811744213104247, "epoch": 0.094, "grad_norm": 784.0, "kl_loss_10": 187.684228515625, "kl_loss_2": 2098.4013122558595, "kl_loss_3": 1541.9158447265625, "kl_loss_7": 517.6738510131836, "learning_rate": 0.000982341456229428, "loss": 1051.2325, "step": 940 }, { "ce_loss_10": 3.615868926048279, "ce_loss_13": 3.5291592121124267, "ce_loss_2": 4.486113166809082, "ce_loss_3": 4.232404887676239, "ce_loss_7": 3.76742799282074, "epoch": 0.095, "grad_norm": 676.0, "kl_loss_10": 183.69078369140624, "kl_loss_2": 1981.9986938476563, "kl_loss_3": 1486.5308166503905, "kl_loss_7": 510.1406478881836, "learning_rate": 0.000981921079279971, "loss": 1019.1191, "step": 950 }, { "ce_loss_10": 3.630085599422455, "ce_loss_13": 3.547876071929932, "ce_loss_2": 4.444245457649231, "ce_loss_3": 4.2063503623008724, "ce_loss_7": 3.7743829011917116, "epoch": 0.096, "grad_norm": 688.0, "kl_loss_10": 177.08920364379884, "kl_loss_2": 1876.4136657714844, "kl_loss_3": 1407.2470703125, "kl_loss_7": 490.83606872558596, "learning_rate": 0.0009814958493905962, "loss": 998.0566, "step": 960 }, { "ce_loss_10": 3.58119341135025, "ce_loss_13": 3.4952746510505674, "ce_loss_2": 4.4530457019805905, "ce_loss_3": 4.1936848878860475, "ce_loss_7": 3.745371329784393, "epoch": 0.097, "grad_norm": 856.0, "kl_loss_10": 180.93770446777344, "kl_loss_2": 1972.7642639160156, "kl_loss_3": 1475.1272094726562, "kl_loss_7": 518.6202331542969, "learning_rate": 0.0009810657708433637, "loss": 1046.3625, "step": 970 }, { "ce_loss_10": 3.6587608098983764, "ce_loss_13": 3.5761932730674744, "ce_loss_2": 4.4908324718475345, "ce_loss_3": 4.249144637584687, "ce_loss_7": 3.8146904349327087, "epoch": 0.098, "grad_norm": 1040.0, "kl_loss_10": 175.9536979675293, "kl_loss_2": 1896.2690856933593, "kl_loss_3": 1430.7713623046875, "kl_loss_7": 520.7419006347657, "learning_rate": 0.0009806308479691594, "loss": 1012.5152, "step": 980 }, { "ce_loss_10": 3.6701141357421876, "ce_loss_13": 3.5853224396705627, "ce_loss_2": 4.52058436870575, "ce_loss_3": 4.304493975639343, "ce_loss_7": 3.8337016344070434, "epoch": 0.099, "grad_norm": 1064.0, "kl_loss_10": 182.05279541015625, "kl_loss_2": 1942.8283203125, "kl_loss_3": 1504.5979553222655, "kl_loss_7": 525.28359375, "learning_rate": 0.0009801910851476522, "loss": 1029.841, "step": 990 }, { "ce_loss_10": 3.5780128598213197, "ce_loss_13": 3.4931989908218384, "ce_loss_2": 4.455501818656922, "ce_loss_3": 4.210770845413208, "ce_loss_7": 3.7347821235656737, "epoch": 0.1, "grad_norm": 668.0, "kl_loss_10": 179.7102798461914, "kl_loss_2": 1998.8098388671874, "kl_loss_3": 1528.9624084472657, "kl_loss_7": 516.746615600586, "learning_rate": 0.0009797464868072487, "loss": 1031.5818, "step": 1000 }, { "ce_loss_10": 3.568189251422882, "ce_loss_13": 3.4829827189445495, "ce_loss_2": 4.416838443279266, "ce_loss_3": 4.186929941177368, "ce_loss_7": 3.7205358743667603, "epoch": 0.101, "grad_norm": 732.0, "kl_loss_10": 179.53496170043945, "kl_loss_2": 1933.2125244140625, "kl_loss_3": 1477.44189453125, "kl_loss_7": 505.2583862304688, "learning_rate": 0.0009792970574250492, "loss": 1018.5471, "step": 1010 }, { "ce_loss_10": 3.6072539567947386, "ce_loss_13": 3.519195032119751, "ce_loss_2": 4.433947956562042, "ce_loss_3": 4.211443436145783, "ce_loss_7": 3.7644609808921814, "epoch": 0.102, "grad_norm": 824.0, "kl_loss_10": 179.83254623413086, "kl_loss_2": 1919.0231384277345, "kl_loss_3": 1470.111962890625, "kl_loss_7": 523.1709747314453, "learning_rate": 0.0009788428015268028, "loss": 1009.9572, "step": 1020 }, { "ce_loss_10": 3.595908534526825, "ce_loss_13": 3.5105677366256716, "ce_loss_2": 4.409725475311279, "ce_loss_3": 4.1861141443252565, "ce_loss_7": 3.7467220067977904, "epoch": 0.103, "grad_norm": 708.0, "kl_loss_10": 184.79271774291993, "kl_loss_2": 1885.4566284179687, "kl_loss_3": 1448.107196044922, "kl_loss_7": 506.4451049804687, "learning_rate": 0.0009783837236868609, "loss": 1001.9834, "step": 1030 }, { "ce_loss_10": 3.5803792238235475, "ce_loss_13": 3.48011519908905, "ce_loss_2": 4.398702096939087, "ce_loss_3": 4.170737850666046, "ce_loss_7": 3.7171101450920103, "epoch": 0.104, "grad_norm": 652.0, "kl_loss_10": 198.62368392944336, "kl_loss_2": 1895.2196655273438, "kl_loss_3": 1456.3298950195312, "kl_loss_7": 493.37102966308595, "learning_rate": 0.0009779198285281327, "loss": 1003.9096, "step": 1040 }, { "ce_loss_10": 3.5599252581596375, "ce_loss_13": 3.472754955291748, "ce_loss_2": 4.411727952957153, "ce_loss_3": 4.179691898822784, "ce_loss_7": 3.7037505507469177, "epoch": 0.105, "grad_norm": 612.0, "kl_loss_10": 185.47133712768556, "kl_loss_2": 1925.2284606933595, "kl_loss_3": 1470.763250732422, "kl_loss_7": 480.902815246582, "learning_rate": 0.0009774511207220368, "loss": 1008.8404, "step": 1050 }, { "ce_loss_10": 3.5997629284858705, "ce_loss_13": 3.5105605721473694, "ce_loss_2": 4.449126553535462, "ce_loss_3": 4.208902645111084, "ce_loss_7": 3.7385777592658997, "epoch": 0.106, "grad_norm": 664.0, "kl_loss_10": 188.41523284912108, "kl_loss_2": 1931.9002990722656, "kl_loss_3": 1463.5917114257813, "kl_loss_7": 488.90601348876953, "learning_rate": 0.0009769776049884564, "loss": 1011.1688, "step": 1060 }, { "ce_loss_10": 3.512972867488861, "ce_loss_13": 3.4215759873390197, "ce_loss_2": 4.383116137981415, "ce_loss_3": 4.136074674129486, "ce_loss_7": 3.658175325393677, "epoch": 0.107, "grad_norm": 852.0, "kl_loss_10": 190.86334686279298, "kl_loss_2": 1988.4344360351563, "kl_loss_3": 1500.9677978515624, "kl_loss_7": 497.2473831176758, "learning_rate": 0.0009764992860956889, "loss": 1046.9855, "step": 1070 }, { "ce_loss_10": 3.664934504032135, "ce_loss_13": 3.581758964061737, "ce_loss_2": 4.456975436210632, "ce_loss_3": 4.2267944574356076, "ce_loss_7": 3.7991317749023437, "epoch": 0.108, "grad_norm": 692.0, "kl_loss_10": 194.86636047363282, "kl_loss_2": 1828.549542236328, "kl_loss_3": 1386.0209167480468, "kl_loss_7": 470.81641540527346, "learning_rate": 0.0009760161688604008, "loss": 983.4062, "step": 1080 }, { "ce_loss_10": 3.699925255775452, "ce_loss_13": 3.581090009212494, "ce_loss_2": 4.5030299663543705, "ce_loss_3": 4.258689761161804, "ce_loss_7": 3.8088807463645935, "epoch": 0.109, "grad_norm": 932.0, "kl_loss_10": 231.67974853515625, "kl_loss_2": 1899.6866027832032, "kl_loss_3": 1422.8416687011718, "kl_loss_7": 478.274723815918, "learning_rate": 0.0009755282581475768, "loss": 1018.6061, "step": 1090 }, { "ce_loss_10": 3.7407832741737366, "ce_loss_13": 3.629633975028992, "ce_loss_2": 4.543957281112671, "ce_loss_3": 4.291879677772522, "ce_loss_7": 3.859333908557892, "epoch": 0.11, "grad_norm": 840.0, "kl_loss_10": 242.35417098999022, "kl_loss_2": 1901.089776611328, "kl_loss_3": 1411.2118530273438, "kl_loss_7": 486.46076049804685, "learning_rate": 0.0009750355588704727, "loss": 1002.7542, "step": 1100 }, { "ce_loss_10": 3.5715940475463865, "ce_loss_13": 3.4648202657699585, "ce_loss_2": 4.382298231124878, "ce_loss_3": 4.144306838512421, "ce_loss_7": 3.693096125125885, "epoch": 0.111, "grad_norm": 724.0, "kl_loss_10": 228.16529388427733, "kl_loss_2": 1889.4921142578125, "kl_loss_3": 1417.6652526855469, "kl_loss_7": 475.9224227905273, "learning_rate": 0.0009745380759905647, "loss": 1021.5175, "step": 1110 }, { "ce_loss_10": 3.518705630302429, "ce_loss_13": 3.418720316886902, "ce_loss_2": 4.3428931593894955, "ce_loss_3": 4.101069831848145, "ce_loss_7": 3.6415586709976195, "epoch": 0.112, "grad_norm": 704.0, "kl_loss_10": 208.09901504516603, "kl_loss_2": 1914.6993408203125, "kl_loss_3": 1443.0666809082031, "kl_loss_7": 475.36561584472656, "learning_rate": 0.0009740358145174998, "loss": 1021.0135, "step": 1120 }, { "ce_loss_10": 3.6743900656700133, "ce_loss_13": 3.5760027527809144, "ce_loss_2": 4.446346926689148, "ce_loss_3": 4.222055697441101, "ce_loss_7": 3.7923503518104553, "epoch": 0.113, "grad_norm": 568.0, "kl_loss_10": 194.76330795288087, "kl_loss_2": 1822.0701782226563, "kl_loss_3": 1381.2394104003906, "kl_loss_7": 467.96456146240234, "learning_rate": 0.0009735287795090455, "loss": 980.3441, "step": 1130 }, { "ce_loss_10": 3.5504071354866027, "ce_loss_13": 3.462277662754059, "ce_loss_2": 4.376504588127136, "ce_loss_3": 4.138739550113678, "ce_loss_7": 3.69127904176712, "epoch": 0.114, "grad_norm": 924.0, "kl_loss_10": 182.32659149169922, "kl_loss_2": 1886.9593383789063, "kl_loss_3": 1422.0450927734375, "kl_loss_7": 478.68931579589844, "learning_rate": 0.0009730169760710386, "loss": 988.1342, "step": 1140 }, { "ce_loss_10": 3.62770699262619, "ce_loss_13": 3.5463622212409973, "ce_loss_2": 4.45663468837738, "ce_loss_3": 4.212454223632813, "ce_loss_7": 3.7987569212913512, "epoch": 0.115, "grad_norm": 904.0, "kl_loss_10": 178.53827209472655, "kl_loss_2": 1875.3674560546874, "kl_loss_3": 1405.993963623047, "kl_loss_7": 538.1174468994141, "learning_rate": 0.0009725004093573342, "loss": 1006.2083, "step": 1150 }, { "ce_loss_10": 3.5690181612968446, "ce_loss_13": 3.4807565331459047, "ce_loss_2": 4.384287440776825, "ce_loss_3": 4.156945848464966, "ce_loss_7": 3.752134454250336, "epoch": 0.116, "grad_norm": 716.0, "kl_loss_10": 176.73938064575196, "kl_loss_2": 1855.7882263183594, "kl_loss_3": 1408.193621826172, "kl_loss_7": 553.3539993286133, "learning_rate": 0.0009719790845697534, "loss": 992.4009, "step": 1160 }, { "ce_loss_10": 3.5152013182640074, "ce_loss_13": 3.4388429045677187, "ce_loss_2": 4.306136679649353, "ce_loss_3": 4.084867370128632, "ce_loss_7": 3.685141396522522, "epoch": 0.117, "grad_norm": 696.0, "kl_loss_10": 168.1632064819336, "kl_loss_2": 1813.9858947753905, "kl_loss_3": 1386.7823852539063, "kl_loss_7": 525.0696212768555, "learning_rate": 0.0009714530069580309, "loss": 973.7273, "step": 1170 }, { "ce_loss_10": 3.62287323474884, "ce_loss_13": 3.5366265535354615, "ce_loss_2": 4.438341093063355, "ce_loss_3": 4.2517277002334595, "ce_loss_7": 3.7834308385849, "epoch": 0.118, "grad_norm": 668.0, "kl_loss_10": 180.73734664916992, "kl_loss_2": 1856.9345581054688, "kl_loss_3": 1479.8200439453126, "kl_loss_7": 530.3196792602539, "learning_rate": 0.0009709221818197624, "loss": 1003.6437, "step": 1180 }, { "ce_loss_10": 3.6476974844932557, "ce_loss_13": 3.566649878025055, "ce_loss_2": 4.460924696922302, "ce_loss_3": 4.266936922073365, "ce_loss_7": 3.8050862193107604, "epoch": 0.119, "grad_norm": 684.0, "kl_loss_10": 169.83945999145507, "kl_loss_2": 1853.9904663085938, "kl_loss_3": 1484.227764892578, "kl_loss_7": 493.99244689941406, "learning_rate": 0.0009703866145003512, "loss": 1003.36, "step": 1190 }, { "ce_loss_10": 3.6218933939933775, "ce_loss_13": 3.5399347305297852, "ce_loss_2": 4.422274470329285, "ce_loss_3": 4.231696951389313, "ce_loss_7": 3.76218581199646, "epoch": 0.12, "grad_norm": 860.0, "kl_loss_10": 167.10036239624023, "kl_loss_2": 1845.4068298339844, "kl_loss_3": 1456.3514770507813, "kl_loss_7": 478.3939971923828, "learning_rate": 0.0009698463103929542, "loss": 1001.8003, "step": 1200 }, { "ce_loss_10": 3.586829674243927, "ce_loss_13": 3.5053379774093627, "ce_loss_2": 4.3998222827911375, "ce_loss_3": 4.1850836157798765, "ce_loss_7": 3.728915822505951, "epoch": 0.121, "grad_norm": 780.0, "kl_loss_10": 169.9067398071289, "kl_loss_2": 1841.5256103515626, "kl_loss_3": 1426.9917419433593, "kl_loss_7": 468.1813171386719, "learning_rate": 0.0009693012749384279, "loss": 985.2267, "step": 1210 }, { "ce_loss_10": 3.601468288898468, "ce_loss_13": 3.5188393354415894, "ce_loss_2": 4.404597902297974, "ce_loss_3": 4.181776678562164, "ce_loss_7": 3.7396764159202576, "epoch": 0.122, "grad_norm": 644.0, "kl_loss_10": 173.15536575317384, "kl_loss_2": 1847.19716796875, "kl_loss_3": 1416.7066284179687, "kl_loss_7": 463.79361419677736, "learning_rate": 0.0009687515136252732, "loss": 967.898, "step": 1220 }, { "ce_loss_10": 3.5564996123313906, "ce_loss_13": 3.473362135887146, "ce_loss_2": 4.387398433685303, "ce_loss_3": 4.149928534030915, "ce_loss_7": 3.689453649520874, "epoch": 0.123, "grad_norm": 740.0, "kl_loss_10": 174.67308807373047, "kl_loss_2": 1905.6107360839844, "kl_loss_3": 1438.5151733398438, "kl_loss_7": 470.0431854248047, "learning_rate": 0.0009681970319895803, "loss": 999.513, "step": 1230 }, { "ce_loss_10": 3.6379595756530763, "ce_loss_13": 3.558580422401428, "ce_loss_2": 4.434055185317993, "ce_loss_3": 4.211921918392181, "ce_loss_7": 3.768270802497864, "epoch": 0.124, "grad_norm": 604.0, "kl_loss_10": 171.7199905395508, "kl_loss_2": 1822.535498046875, "kl_loss_3": 1383.5216369628906, "kl_loss_7": 455.1578430175781, "learning_rate": 0.0009676378356149733, "loss": 961.3066, "step": 1240 }, { "ce_loss_10": 3.605825924873352, "ce_loss_13": 3.526325762271881, "ce_loss_2": 4.392473816871643, "ce_loss_3": 4.1692427635192875, "ce_loss_7": 3.7352627873420716, "epoch": 0.125, "grad_norm": 720.0, "kl_loss_10": 167.6490280151367, "kl_loss_2": 1810.04775390625, "kl_loss_3": 1368.414892578125, "kl_loss_7": 448.32765197753906, "learning_rate": 0.0009670739301325534, "loss": 957.2052, "step": 1250 }, { "ce_loss_10": 3.571504831314087, "ce_loss_13": 3.4888750672340394, "ce_loss_2": 4.378832995891571, "ce_loss_3": 4.150575721263886, "ce_loss_7": 3.7100158095359803, "epoch": 0.126, "grad_norm": 584.0, "kl_loss_10": 173.1241600036621, "kl_loss_2": 1845.0594482421875, "kl_loss_3": 1403.9748901367188, "kl_loss_7": 459.8861801147461, "learning_rate": 0.0009665053212208426, "loss": 977.427, "step": 1260 }, { "ce_loss_10": 3.618936550617218, "ce_loss_13": 3.536702239513397, "ce_loss_2": 4.445125913619995, "ce_loss_3": 4.200946152210236, "ce_loss_7": 3.752047967910767, "epoch": 0.127, "grad_norm": 636.0, "kl_loss_10": 175.11898193359374, "kl_loss_2": 1894.717462158203, "kl_loss_3": 1411.8616638183594, "kl_loss_7": 466.6258575439453, "learning_rate": 0.0009659320146057262, "loss": 985.5969, "step": 1270 }, { "ce_loss_10": 3.615454339981079, "ce_loss_13": 3.536350691318512, "ce_loss_2": 4.4296933054924015, "ce_loss_3": 4.193294942378998, "ce_loss_7": 3.7456225991249084, "epoch": 0.128, "grad_norm": 684.0, "kl_loss_10": 172.33256301879882, "kl_loss_2": 1863.494482421875, "kl_loss_3": 1382.5830627441405, "kl_loss_7": 454.3094543457031, "learning_rate": 0.0009653540160603955, "loss": 967.9032, "step": 1280 }, { "ce_loss_10": 3.618008828163147, "ce_loss_13": 3.540120279788971, "ce_loss_2": 4.412742114067077, "ce_loss_3": 4.178766667842865, "ce_loss_7": 3.7450068473815916, "epoch": 0.129, "grad_norm": 728.0, "kl_loss_10": 168.77175369262696, "kl_loss_2": 1848.1599365234374, "kl_loss_3": 1392.0753234863282, "kl_loss_7": 450.8601531982422, "learning_rate": 0.0009647713314052896, "loss": 957.1789, "step": 1290 }, { "ce_loss_10": 3.570509469509125, "ce_loss_13": 3.489526689052582, "ce_loss_2": 4.4025511026382445, "ce_loss_3": 4.168629574775696, "ce_loss_7": 3.7077707052230835, "epoch": 0.13, "grad_norm": 676.0, "kl_loss_10": 173.87704010009764, "kl_loss_2": 1909.4092407226562, "kl_loss_3": 1439.0013549804687, "kl_loss_7": 467.00162200927736, "learning_rate": 0.0009641839665080363, "loss": 987.4625, "step": 1300 }, { "ce_loss_10": 3.5321548104286196, "ce_loss_13": 3.45308997631073, "ce_loss_2": 4.342683684825897, "ce_loss_3": 4.104823327064514, "ce_loss_7": 3.6708693146705627, "epoch": 0.131, "grad_norm": 760.0, "kl_loss_10": 166.72612762451172, "kl_loss_2": 1841.438250732422, "kl_loss_3": 1384.6276489257812, "kl_loss_7": 460.2369979858398, "learning_rate": 0.0009635919272833937, "loss": 954.8711, "step": 1310 }, { "ce_loss_10": 3.5722994804382324, "ce_loss_13": 3.4894865989685058, "ce_loss_2": 4.388064694404602, "ce_loss_3": 4.155776739120483, "ce_loss_7": 3.710488736629486, "epoch": 0.132, "grad_norm": 752.0, "kl_loss_10": 172.25881423950196, "kl_loss_2": 1844.984326171875, "kl_loss_3": 1399.1760986328125, "kl_loss_7": 472.22783203125, "learning_rate": 0.0009629952196931902, "loss": 955.9352, "step": 1320 }, { "ce_loss_10": 3.551358866691589, "ce_loss_13": 3.472510349750519, "ce_loss_2": 4.355191397666931, "ce_loss_3": 4.118129765987396, "ce_loss_7": 3.6837236404418947, "epoch": 0.133, "grad_norm": 588.0, "kl_loss_10": 171.36332321166992, "kl_loss_2": 1841.2326904296874, "kl_loss_3": 1376.0937561035157, "kl_loss_7": 456.13024139404297, "learning_rate": 0.0009623938497462645, "loss": 960.5266, "step": 1330 }, { "ce_loss_10": 3.5559371590614317, "ce_loss_13": 3.4709559440612794, "ce_loss_2": 4.358719682693481, "ce_loss_3": 4.123664891719818, "ce_loss_7": 3.688221287727356, "epoch": 0.134, "grad_norm": 652.0, "kl_loss_10": 176.0792022705078, "kl_loss_2": 1841.4331909179687, "kl_loss_3": 1396.2080993652344, "kl_loss_7": 467.3533462524414, "learning_rate": 0.0009617878234984055, "loss": 977.5735, "step": 1340 }, { "ce_loss_10": 3.6392318964004517, "ce_loss_13": 3.5607362031936645, "ce_loss_2": 4.411078405380249, "ce_loss_3": 4.175350475311279, "ce_loss_7": 3.7691015005111694, "epoch": 0.135, "grad_norm": 680.0, "kl_loss_10": 167.24838333129884, "kl_loss_2": 1775.0880249023437, "kl_loss_3": 1327.7960632324218, "kl_loss_7": 444.50580139160155, "learning_rate": 0.0009611771470522907, "loss": 944.3117, "step": 1350 }, { "ce_loss_10": 3.571557867527008, "ce_loss_13": 3.4895546078681945, "ce_loss_2": 4.367475354671479, "ce_loss_3": 4.138877761363983, "ce_loss_7": 3.70872939825058, "epoch": 0.136, "grad_norm": 620.0, "kl_loss_10": 168.35062713623046, "kl_loss_2": 1814.3583923339843, "kl_loss_3": 1366.009014892578, "kl_loss_7": 453.19874114990233, "learning_rate": 0.0009605618265574251, "loss": 946.0445, "step": 1360 }, { "ce_loss_10": 3.5265390634536744, "ce_loss_13": 3.4456461548805235, "ce_loss_2": 4.336602091789246, "ce_loss_3": 4.115667700767517, "ce_loss_7": 3.666548228263855, "epoch": 0.137, "grad_norm": 640.0, "kl_loss_10": 170.2979606628418, "kl_loss_2": 1872.8250915527344, "kl_loss_3": 1430.012762451172, "kl_loss_7": 472.1897598266602, "learning_rate": 0.0009599418682100792, "loss": 971.0635, "step": 1370 }, { "ce_loss_10": 3.5678892135620117, "ce_loss_13": 3.4896570086479186, "ce_loss_2": 4.374086010456085, "ce_loss_3": 4.133544516563416, "ce_loss_7": 3.7014646172523498, "epoch": 0.138, "grad_norm": 724.0, "kl_loss_10": 164.6707275390625, "kl_loss_2": 1831.634326171875, "kl_loss_3": 1376.7487731933593, "kl_loss_7": 454.62243194580077, "learning_rate": 0.0009593172782532268, "loss": 962.5875, "step": 1380 }, { "ce_loss_10": 3.6105551958084106, "ce_loss_13": 3.5338037848472594, "ce_loss_2": 4.406553602218628, "ce_loss_3": 4.176796317100525, "ce_loss_7": 3.7485893249511717, "epoch": 0.139, "grad_norm": 640.0, "kl_loss_10": 166.68616943359376, "kl_loss_2": 1814.0467590332032, "kl_loss_3": 1370.630841064453, "kl_loss_7": 459.292626953125, "learning_rate": 0.0009586880629764817, "loss": 950.4029, "step": 1390 }, { "ce_loss_10": 3.5382327914237974, "ce_loss_13": 3.4606364369392395, "ce_loss_2": 4.352329158782959, "ce_loss_3": 4.119093835353851, "ce_loss_7": 3.6758073806762694, "epoch": 0.14, "grad_norm": 604.0, "kl_loss_10": 168.50987167358397, "kl_loss_2": 1828.4553466796874, "kl_loss_3": 1383.6268249511718, "kl_loss_7": 462.06499633789065, "learning_rate": 0.0009580542287160348, "loss": 947.015, "step": 1400 }, { "ce_loss_10": 3.505202662944794, "ce_loss_13": 3.4246376156806946, "ce_loss_2": 4.3068211555480955, "ce_loss_3": 4.070422291755676, "ce_loss_7": 3.6390760779380797, "epoch": 0.141, "grad_norm": 736.0, "kl_loss_10": 165.31348571777343, "kl_loss_2": 1827.4319946289063, "kl_loss_3": 1369.9763732910155, "kl_loss_7": 452.67235260009767, "learning_rate": 0.0009574157818545901, "loss": 940.6614, "step": 1410 }, { "ce_loss_10": 3.573702847957611, "ce_loss_13": 3.498333144187927, "ce_loss_2": 4.353998446464539, "ce_loss_3": 4.126765632629395, "ce_loss_7": 3.70104296207428, "epoch": 0.142, "grad_norm": 672.0, "kl_loss_10": 160.25886917114258, "kl_loss_2": 1778.8529113769532, "kl_loss_3": 1335.792724609375, "kl_loss_7": 436.3431121826172, "learning_rate": 0.0009567727288213005, "loss": 948.7402, "step": 1420 }, { "ce_loss_10": 3.552593457698822, "ce_loss_13": 3.4785722613334658, "ce_loss_2": 4.346270668506622, "ce_loss_3": 4.128087055683136, "ce_loss_7": 3.691338813304901, "epoch": 0.143, "grad_norm": 720.0, "kl_loss_10": 169.80655975341796, "kl_loss_2": 1820.7370666503907, "kl_loss_3": 1388.6684265136719, "kl_loss_7": 470.6744079589844, "learning_rate": 0.0009561250760917027, "loss": 952.012, "step": 1430 }, { "ce_loss_10": 3.5761927485466005, "ce_loss_13": 3.4948782205581663, "ce_loss_2": 4.358104157447815, "ce_loss_3": 4.14318814277649, "ce_loss_7": 3.7130213379859924, "epoch": 0.144, "grad_norm": 656.0, "kl_loss_10": 169.0357551574707, "kl_loss_2": 1829.9590637207032, "kl_loss_3": 1398.402197265625, "kl_loss_7": 465.52441864013673, "learning_rate": 0.0009554728301876525, "loss": 941.6771, "step": 1440 }, { "ce_loss_10": 3.6277766704559324, "ce_loss_13": 3.5470379829406737, "ce_loss_2": 4.396523261070252, "ce_loss_3": 4.18031120300293, "ce_loss_7": 3.7636173605918883, "epoch": 0.145, "grad_norm": 572.0, "kl_loss_10": 168.25755767822267, "kl_loss_2": 1775.9608276367187, "kl_loss_3": 1358.2267517089845, "kl_loss_7": 455.221989440918, "learning_rate": 0.0009548159976772592, "loss": 965.8246, "step": 1450 }, { "ce_loss_10": 3.5671639442443848, "ce_loss_13": 3.488686537742615, "ce_loss_2": 4.366501688957214, "ce_loss_3": 4.148012197017669, "ce_loss_7": 3.7019433975219727, "epoch": 0.146, "grad_norm": 948.0, "kl_loss_10": 167.73931274414062, "kl_loss_2": 1823.703350830078, "kl_loss_3": 1385.1528442382812, "kl_loss_7": 466.3150634765625, "learning_rate": 0.0009541545851748186, "loss": 952.6289, "step": 1460 }, { "ce_loss_10": 3.438965713977814, "ce_loss_13": 3.356391704082489, "ce_loss_2": 4.271179831027984, "ce_loss_3": 4.023587942123413, "ce_loss_7": 3.5836508512496947, "epoch": 0.147, "grad_norm": 728.0, "kl_loss_10": 166.18173599243164, "kl_loss_2": 1871.1487426757812, "kl_loss_3": 1398.6961181640625, "kl_loss_7": 475.6040863037109, "learning_rate": 0.0009534885993407473, "loss": 965.3007, "step": 1470 }, { "ce_loss_10": 3.6119082808494567, "ce_loss_13": 3.5323118925094605, "ce_loss_2": 4.421308779716492, "ce_loss_3": 4.1795473337173465, "ce_loss_7": 3.7527047395706177, "epoch": 0.148, "grad_norm": 924.0, "kl_loss_10": 169.8495635986328, "kl_loss_2": 1843.5690612792969, "kl_loss_3": 1385.9791320800782, "kl_loss_7": 477.3311019897461, "learning_rate": 0.0009528180468815154, "loss": 969.2416, "step": 1480 }, { "ce_loss_10": 3.6473464369773865, "ce_loss_13": 3.5708807826042177, "ce_loss_2": 4.4276422500610355, "ce_loss_3": 4.19535973072052, "ce_loss_7": 3.780378723144531, "epoch": 0.149, "grad_norm": 716.0, "kl_loss_10": 168.1454620361328, "kl_loss_2": 1800.059014892578, "kl_loss_3": 1349.663555908203, "kl_loss_7": 474.4862854003906, "learning_rate": 0.0009521429345495787, "loss": 948.5005, "step": 1490 }, { "ce_loss_10": 3.6274529099464417, "ce_loss_13": 3.5506494641304016, "ce_loss_2": 4.379093813896179, "ce_loss_3": 4.163582825660706, "ce_loss_7": 3.767805981636047, "epoch": 0.15, "grad_norm": 544.0, "kl_loss_10": 161.9274887084961, "kl_loss_2": 1749.5603088378907, "kl_loss_3": 1322.4438842773438, "kl_loss_7": 461.2112503051758, "learning_rate": 0.0009514632691433108, "loss": 934.5367, "step": 1500 }, { "ce_loss_10": 3.589273178577423, "ce_loss_13": 3.5116019368171694, "ce_loss_2": 4.371138715744019, "ce_loss_3": 4.152551782131195, "ce_loss_7": 3.737976813316345, "epoch": 0.151, "grad_norm": 756.0, "kl_loss_10": 165.94934310913087, "kl_loss_2": 1800.9721557617188, "kl_loss_3": 1379.0521667480468, "kl_loss_7": 496.21505279541014, "learning_rate": 0.0009507790575069346, "loss": 965.8927, "step": 1510 }, { "ce_loss_10": 3.567319369316101, "ce_loss_13": 3.4871225714683534, "ce_loss_2": 4.369505047798157, "ce_loss_3": 4.1454179883003235, "ce_loss_7": 3.7245999336242677, "epoch": 0.152, "grad_norm": 800.0, "kl_loss_10": 168.54553909301757, "kl_loss_2": 1831.139581298828, "kl_loss_3": 1381.621405029297, "kl_loss_7": 492.096989440918, "learning_rate": 0.0009500903065304539, "loss": 972.4712, "step": 1520 }, { "ce_loss_10": 3.6029422521591186, "ce_loss_13": 3.529670035839081, "ce_loss_2": 4.370370721817016, "ce_loss_3": 4.140894186496735, "ce_loss_7": 3.73762069940567, "epoch": 0.153, "grad_norm": 828.0, "kl_loss_10": 160.99332580566406, "kl_loss_2": 1756.960675048828, "kl_loss_3": 1307.8462524414062, "kl_loss_7": 457.64794006347654, "learning_rate": 0.0009493970231495835, "loss": 935.4037, "step": 1530 }, { "ce_loss_10": 3.5418770790100096, "ce_loss_13": 3.470135521888733, "ce_loss_2": 4.307833838462829, "ce_loss_3": 4.0862520098686215, "ce_loss_7": 3.6741525650024416, "epoch": 0.154, "grad_norm": 788.0, "kl_loss_10": 161.51265563964844, "kl_loss_2": 1777.391455078125, "kl_loss_3": 1322.6101928710937, "kl_loss_7": 442.0227294921875, "learning_rate": 0.0009486992143456792, "loss": 925.9713, "step": 1540 }, { "ce_loss_10": 3.572007155418396, "ce_loss_13": 3.4902480483055114, "ce_loss_2": 4.407237410545349, "ce_loss_3": 4.155784142017365, "ce_loss_7": 3.711955487728119, "epoch": 0.155, "grad_norm": 804.0, "kl_loss_10": 173.13835983276368, "kl_loss_2": 1896.4215270996094, "kl_loss_3": 1404.887060546875, "kl_loss_7": 488.79372253417966, "learning_rate": 0.0009479968871456679, "loss": 969.3572, "step": 1550 }, { "ce_loss_10": 3.5374878644943237, "ce_loss_13": 3.4610541582107546, "ce_loss_2": 4.35056471824646, "ce_loss_3": 4.090835416316986, "ce_loss_7": 3.67384033203125, "epoch": 0.156, "grad_norm": 664.0, "kl_loss_10": 168.38916778564453, "kl_loss_2": 1863.9793334960937, "kl_loss_3": 1362.9484497070312, "kl_loss_7": 457.0409851074219, "learning_rate": 0.0009472900486219768, "loss": 942.6081, "step": 1560 }, { "ce_loss_10": 3.5255000948905946, "ce_loss_13": 3.4513633012771607, "ce_loss_2": 4.313361859321594, "ce_loss_3": 4.075027894973755, "ce_loss_7": 3.655648243427277, "epoch": 0.157, "grad_norm": 632.0, "kl_loss_10": 163.33478240966798, "kl_loss_2": 1801.9673278808593, "kl_loss_3": 1334.2445068359375, "kl_loss_7": 440.6134338378906, "learning_rate": 0.000946578705892462, "loss": 935.4695, "step": 1570 }, { "ce_loss_10": 3.566776442527771, "ce_loss_13": 3.4891526818275453, "ce_loss_2": 4.334450674057007, "ce_loss_3": 4.097967481613159, "ce_loss_7": 3.6950459837913514, "epoch": 0.158, "grad_norm": 596.0, "kl_loss_10": 161.22595672607423, "kl_loss_2": 1756.149267578125, "kl_loss_3": 1303.3608947753905, "kl_loss_7": 433.9689010620117, "learning_rate": 0.0009458628661203367, "loss": 927.5395, "step": 1580 }, { "ce_loss_10": 3.5715178489685058, "ce_loss_13": 3.493246281147003, "ce_loss_2": 4.381004786491394, "ce_loss_3": 4.15310035943985, "ce_loss_7": 3.7095054388046265, "epoch": 0.159, "grad_norm": 772.0, "kl_loss_10": 166.47358245849608, "kl_loss_2": 1829.0434631347657, "kl_loss_3": 1386.59541015625, "kl_loss_7": 455.2687271118164, "learning_rate": 0.0009451425365140996, "loss": 931.1994, "step": 1590 }, { "ce_loss_10": 3.644000780582428, "ce_loss_13": 3.5690673708915712, "ce_loss_2": 4.3989935398101805, "ce_loss_3": 4.183095216751099, "ce_loss_7": 3.770583391189575, "epoch": 0.16, "grad_norm": 684.0, "kl_loss_10": 164.29436340332032, "kl_loss_2": 1741.866064453125, "kl_loss_3": 1315.065985107422, "kl_loss_7": 435.75640869140625, "learning_rate": 0.0009444177243274617, "loss": 913.077, "step": 1600 }, { "ce_loss_10": 3.5015690445899965, "ce_loss_13": 3.420224165916443, "ce_loss_2": 4.298003304004669, "ce_loss_3": 4.071039080619812, "ce_loss_7": 3.6376882791519165, "epoch": 0.161, "grad_norm": 648.0, "kl_loss_10": 170.34925384521483, "kl_loss_2": 1819.6341918945313, "kl_loss_3": 1383.1305297851563, "kl_loss_7": 454.9949661254883, "learning_rate": 0.0009436884368592739, "loss": 949.2351, "step": 1610 }, { "ce_loss_10": 3.553143525123596, "ce_loss_13": 3.4774653792381285, "ce_loss_2": 4.3112345933914185, "ce_loss_3": 4.104372417926788, "ce_loss_7": 3.6796785831451415, "epoch": 0.162, "grad_norm": 728.0, "kl_loss_10": 164.4239158630371, "kl_loss_2": 1752.8823547363281, "kl_loss_3": 1345.055731201172, "kl_loss_7": 438.5689453125, "learning_rate": 0.0009429546814534529, "loss": 940.1898, "step": 1620 }, { "ce_loss_10": 3.5688347697257994, "ce_loss_13": 3.4930466771125794, "ce_loss_2": 4.32410843372345, "ce_loss_3": 4.122909438610077, "ce_loss_7": 3.6923286437988283, "epoch": 0.163, "grad_norm": 596.0, "kl_loss_10": 162.7445167541504, "kl_loss_2": 1746.289862060547, "kl_loss_3": 1339.1979248046875, "kl_loss_7": 437.2148178100586, "learning_rate": 0.0009422164654989072, "loss": 912.9286, "step": 1630 }, { "ce_loss_10": 3.678850030899048, "ce_loss_13": 3.6019165754318236, "ce_loss_2": 4.419181323051452, "ce_loss_3": 4.2116731882095335, "ce_loss_7": 3.801846480369568, "epoch": 0.164, "grad_norm": 688.0, "kl_loss_10": 162.59844360351562, "kl_loss_2": 1724.923516845703, "kl_loss_3": 1329.3218505859375, "kl_loss_7": 438.8222412109375, "learning_rate": 0.0009414737964294635, "loss": 921.2891, "step": 1640 }, { "ce_loss_10": 3.603172779083252, "ce_loss_13": 3.5337018847465513, "ce_loss_2": 4.340193152427673, "ce_loss_3": 4.136861097812653, "ce_loss_7": 3.7259446382522583, "epoch": 0.165, "grad_norm": 732.0, "kl_loss_10": 157.09877319335936, "kl_loss_2": 1687.0253173828125, "kl_loss_3": 1302.9964721679687, "kl_loss_7": 423.92798919677733, "learning_rate": 0.000940726681723791, "loss": 916.6791, "step": 1650 }, { "ce_loss_10": 3.440978169441223, "ce_loss_13": 3.3671964764595033, "ce_loss_2": 4.236228859424591, "ce_loss_3": 4.024500918388367, "ce_loss_7": 3.5734368205070495, "epoch": 0.166, "grad_norm": 692.0, "kl_loss_10": 159.3915771484375, "kl_loss_2": 1815.1759338378906, "kl_loss_3": 1393.582989501953, "kl_loss_7": 444.88841857910154, "learning_rate": 0.0009399751289053266, "loss": 922.4507, "step": 1660 }, { "ce_loss_10": 3.6614871978759767, "ce_loss_13": 3.588225471973419, "ce_loss_2": 4.408766531944275, "ce_loss_3": 4.200141429901123, "ce_loss_7": 3.7855040192604066, "epoch": 0.167, "grad_norm": 780.0, "kl_loss_10": 157.96069717407227, "kl_loss_2": 1723.715997314453, "kl_loss_3": 1322.4010009765625, "kl_loss_7": 429.9161651611328, "learning_rate": 0.0009392191455421988, "loss": 917.9642, "step": 1670 }, { "ce_loss_10": 3.63707195520401, "ce_loss_13": 3.5556543111801147, "ce_loss_2": 4.380373930931091, "ce_loss_3": 4.166925942897796, "ce_loss_7": 3.758769381046295, "epoch": 0.168, "grad_norm": 660.0, "kl_loss_10": 175.24741973876954, "kl_loss_2": 1761.8118225097655, "kl_loss_3": 1344.4787719726562, "kl_loss_7": 450.10546875, "learning_rate": 0.0009384587392471515, "loss": 916.3633, "step": 1680 }, { "ce_loss_10": 3.622085762023926, "ce_loss_13": 3.5469981908798216, "ce_loss_2": 4.356770062446595, "ce_loss_3": 4.151288604736328, "ce_loss_7": 3.746310067176819, "epoch": 0.169, "grad_norm": 608.0, "kl_loss_10": 163.2817695617676, "kl_loss_2": 1704.9851623535155, "kl_loss_3": 1297.997540283203, "kl_loss_7": 428.83862762451173, "learning_rate": 0.0009376939176774678, "loss": 905.5219, "step": 1690 }, { "ce_loss_10": 3.5977450728416445, "ce_loss_13": 3.519981324672699, "ce_loss_2": 4.352665519714355, "ce_loss_3": 4.143550324440002, "ce_loss_7": 3.7202346324920654, "epoch": 0.17, "grad_norm": 632.0, "kl_loss_10": 162.28709259033204, "kl_loss_2": 1725.159112548828, "kl_loss_3": 1313.8654235839845, "kl_loss_7": 435.5428039550781, "learning_rate": 0.0009369246885348925, "loss": 921.765, "step": 1700 }, { "ce_loss_10": 3.59084609746933, "ce_loss_13": 3.5151766538619995, "ce_loss_2": 4.375664782524109, "ce_loss_3": 4.156036603450775, "ce_loss_7": 3.7246397972106933, "epoch": 0.171, "grad_norm": 684.0, "kl_loss_10": 164.49726409912108, "kl_loss_2": 1801.033349609375, "kl_loss_3": 1370.6250915527344, "kl_loss_7": 453.40001373291017, "learning_rate": 0.0009361510595655545, "loss": 935.6853, "step": 1710 }, { "ce_loss_10": 3.549048590660095, "ce_loss_13": 3.4653629899024962, "ce_loss_2": 4.307630968093872, "ce_loss_3": 4.094715774059296, "ce_loss_7": 3.6762614488601684, "epoch": 0.172, "grad_norm": 680.0, "kl_loss_10": 170.3953384399414, "kl_loss_2": 1762.9258972167968, "kl_loss_3": 1346.3614562988282, "kl_loss_7": 449.2826202392578, "learning_rate": 0.0009353730385598887, "loss": 922.2734, "step": 1720 }, { "ce_loss_10": 3.472779655456543, "ce_loss_13": 3.3950839400291444, "ce_loss_2": 4.263340127468109, "ce_loss_3": 4.03793773651123, "ce_loss_7": 3.6020253181457518, "epoch": 0.173, "grad_norm": 656.0, "kl_loss_10": 163.52980422973633, "kl_loss_2": 1797.2368774414062, "kl_loss_3": 1351.431103515625, "kl_loss_7": 441.22152252197264, "learning_rate": 0.0009345906333525581, "loss": 935.6812, "step": 1730 }, { "ce_loss_10": 3.5214235901832582, "ce_loss_13": 3.4371795892715453, "ce_loss_2": 4.292250299453736, "ce_loss_3": 4.065243577957153, "ce_loss_7": 3.642275357246399, "epoch": 0.174, "grad_norm": 756.0, "kl_loss_10": 178.52879638671874, "kl_loss_2": 1812.4777038574218, "kl_loss_3": 1354.8654235839845, "kl_loss_7": 448.5812484741211, "learning_rate": 0.0009338038518223745, "loss": 926.7967, "step": 1740 }, { "ce_loss_10": 3.579227292537689, "ce_loss_13": 3.497506093978882, "ce_loss_2": 4.353535735607148, "ce_loss_3": 4.1314038872718815, "ce_loss_7": 3.712595534324646, "epoch": 0.175, "grad_norm": 684.0, "kl_loss_10": 170.88160781860353, "kl_loss_2": 1791.0358825683593, "kl_loss_3": 1358.0457336425782, "kl_loss_7": 456.49011383056643, "learning_rate": 0.0009330127018922195, "loss": 955.6061, "step": 1750 }, { "ce_loss_10": 3.534073221683502, "ce_loss_13": 3.4557726621627807, "ce_loss_2": 4.298114824295044, "ce_loss_3": 4.071936726570129, "ce_loss_7": 3.6692795515060426, "epoch": 0.176, "grad_norm": 940.0, "kl_loss_10": 162.63356018066406, "kl_loss_2": 1777.0078063964843, "kl_loss_3": 1322.5951354980468, "kl_loss_7": 452.01353607177737, "learning_rate": 0.0009322171915289634, "loss": 925.2979, "step": 1760 }, { "ce_loss_10": 3.562859356403351, "ce_loss_13": 3.4936991572380065, "ce_loss_2": 4.311712801456451, "ce_loss_3": 4.096149682998657, "ce_loss_7": 3.695814371109009, "epoch": 0.177, "grad_norm": 672.0, "kl_loss_10": 159.3963638305664, "kl_loss_2": 1739.3221557617187, "kl_loss_3": 1308.4884643554688, "kl_loss_7": 450.83116302490237, "learning_rate": 0.0009314173287433873, "loss": 905.2543, "step": 1770 }, { "ce_loss_10": 3.5641183972358705, "ce_loss_13": 3.4850690603256225, "ce_loss_2": 4.308147180080414, "ce_loss_3": 4.097426617145539, "ce_loss_7": 3.6933220624923706, "epoch": 0.178, "grad_norm": 848.0, "kl_loss_10": 167.73994827270508, "kl_loss_2": 1746.2864990234375, "kl_loss_3": 1316.532891845703, "kl_loss_7": 455.0190155029297, "learning_rate": 0.0009306131215901003, "loss": 906.8408, "step": 1780 }, { "ce_loss_10": 3.596917653083801, "ce_loss_13": 3.516416549682617, "ce_loss_2": 4.341592967510223, "ce_loss_3": 4.1163853168487545, "ce_loss_7": 3.717692863941193, "epoch": 0.179, "grad_norm": 696.0, "kl_loss_10": 170.9574432373047, "kl_loss_2": 1727.8181701660155, "kl_loss_3": 1299.9059143066406, "kl_loss_7": 441.5744827270508, "learning_rate": 0.0009298045781674596, "loss": 897.6227, "step": 1790 }, { "ce_loss_10": 3.57572420835495, "ce_loss_13": 3.4945407032966616, "ce_loss_2": 4.302236485481262, "ce_loss_3": 4.08895423412323, "ce_loss_7": 3.6926202178001404, "epoch": 0.18, "grad_norm": 664.0, "kl_loss_10": 179.89295349121093, "kl_loss_2": 1700.9044799804688, "kl_loss_3": 1284.20361328125, "kl_loss_7": 431.50037536621096, "learning_rate": 0.0009289917066174886, "loss": 914.1196, "step": 1800 }, { "ce_loss_10": 3.574003517627716, "ce_loss_13": 3.4940902471542357, "ce_loss_2": 4.277863013744354, "ce_loss_3": 4.078153216838837, "ce_loss_7": 3.683695209026337, "epoch": 0.181, "grad_norm": 664.0, "kl_loss_10": 173.87937088012694, "kl_loss_2": 1662.8412353515625, "kl_loss_3": 1261.6159118652345, "kl_loss_7": 416.16939239501954, "learning_rate": 0.0009281745151257945, "loss": 889.3567, "step": 1810 }, { "ce_loss_10": 3.587869346141815, "ce_loss_13": 3.50655517578125, "ce_loss_2": 4.322592568397522, "ce_loss_3": 4.116343176364898, "ce_loss_7": 3.7045819759368896, "epoch": 0.182, "grad_norm": 688.0, "kl_loss_10": 170.37163772583008, "kl_loss_2": 1707.6110900878907, "kl_loss_3": 1315.0409118652344, "kl_loss_7": 429.47176055908204, "learning_rate": 0.0009273530119214868, "loss": 911.8113, "step": 1820 }, { "ce_loss_10": 3.696093189716339, "ce_loss_13": 3.6190614104270935, "ce_loss_2": 4.3983154296875, "ce_loss_3": 4.195645475387574, "ce_loss_7": 3.807457995414734, "epoch": 0.183, "grad_norm": 800.0, "kl_loss_10": 165.97553634643555, "kl_loss_2": 1682.4634155273438, "kl_loss_3": 1278.163592529297, "kl_loss_7": 420.69031982421876, "learning_rate": 0.0009265272052770935, "loss": 880.8881, "step": 1830 }, { "ce_loss_10": 3.5015958070755007, "ce_loss_13": 3.424196720123291, "ce_loss_2": 4.267678773403167, "ce_loss_3": 4.0475095987319945, "ce_loss_7": 3.6268508434295654, "epoch": 0.184, "grad_norm": 704.0, "kl_loss_10": 163.908536529541, "kl_loss_2": 1749.7696533203125, "kl_loss_3": 1320.0984130859374, "kl_loss_7": 427.6091049194336, "learning_rate": 0.0009256971035084784, "loss": 912.5401, "step": 1840 }, { "ce_loss_10": 3.44786741733551, "ce_loss_13": 3.3659385561943056, "ce_loss_2": 4.2315644264221195, "ce_loss_3": 4.008389139175415, "ce_loss_7": 3.574787473678589, "epoch": 0.185, "grad_norm": 708.0, "kl_loss_10": 172.1600601196289, "kl_loss_2": 1787.1378540039063, "kl_loss_3": 1351.4346496582032, "kl_loss_7": 442.25659637451173, "learning_rate": 0.0009248627149747573, "loss": 920.3978, "step": 1850 }, { "ce_loss_10": 3.6512333154678345, "ce_loss_13": 3.5744757533073424, "ce_loss_2": 4.376823902130127, "ce_loss_3": 4.171487009525299, "ce_loss_7": 3.772753894329071, "epoch": 0.186, "grad_norm": 580.0, "kl_loss_10": 179.92476654052734, "kl_loss_2": 1699.939630126953, "kl_loss_3": 1295.8851440429687, "kl_loss_7": 434.7665496826172, "learning_rate": 0.0009240240480782129, "loss": 904.4956, "step": 1860 }, { "ce_loss_10": 3.5657613396644594, "ce_loss_13": 3.4775595903396606, "ce_loss_2": 4.310595238208771, "ce_loss_3": 4.096242725849152, "ce_loss_7": 3.6834920406341554, "epoch": 0.187, "grad_norm": 564.0, "kl_loss_10": 176.3750358581543, "kl_loss_2": 1732.9331481933593, "kl_loss_3": 1312.267645263672, "kl_loss_7": 431.05884704589846, "learning_rate": 0.0009231811112642122, "loss": 904.4842, "step": 1870 }, { "ce_loss_10": 3.5995773315429687, "ce_loss_13": 3.523776340484619, "ce_loss_2": 4.318611538410186, "ce_loss_3": 4.111791932582856, "ce_loss_7": 3.7252105116844176, "epoch": 0.188, "grad_norm": 800.0, "kl_loss_10": 166.8633888244629, "kl_loss_2": 1689.3792785644532, "kl_loss_3": 1285.7919128417968, "kl_loss_7": 438.99388275146487, "learning_rate": 0.0009223339130211192, "loss": 895.5117, "step": 1880 }, { "ce_loss_10": 3.447600543498993, "ce_loss_13": 3.3741417050361635, "ce_loss_2": 4.2147566199302675, "ce_loss_3": 3.9939645886421205, "ce_loss_7": 3.574213218688965, "epoch": 0.189, "grad_norm": 744.0, "kl_loss_10": 155.9600372314453, "kl_loss_2": 1748.6449157714844, "kl_loss_3": 1322.1059143066407, "kl_loss_7": 423.45860900878904, "learning_rate": 0.0009214824618802108, "loss": 908.6522, "step": 1890 }, { "ce_loss_10": 3.6377331376075746, "ce_loss_13": 3.5647199988365172, "ce_loss_2": 4.387881731986999, "ce_loss_3": 4.164871621131897, "ce_loss_7": 3.76009886264801, "epoch": 0.19, "grad_norm": 684.0, "kl_loss_10": 159.65029067993163, "kl_loss_2": 1726.2060913085938, "kl_loss_3": 1294.0604736328125, "kl_loss_7": 432.2201461791992, "learning_rate": 0.0009206267664155906, "loss": 913.599, "step": 1900 }, { "ce_loss_10": 3.547088313102722, "ce_loss_13": 3.4745959639549255, "ce_loss_2": 4.296490705013275, "ce_loss_3": 4.079526424407959, "ce_loss_7": 3.669068491458893, "epoch": 0.191, "grad_norm": 772.0, "kl_loss_10": 158.69638366699218, "kl_loss_2": 1713.6190551757813, "kl_loss_3": 1288.2169921875, "kl_loss_7": 425.7963638305664, "learning_rate": 0.0009197668352441024, "loss": 897.9576, "step": 1910 }, { "ce_loss_10": 3.604101300239563, "ce_loss_13": 3.530220317840576, "ce_loss_2": 4.342615389823914, "ce_loss_3": 4.1288164258003235, "ce_loss_7": 3.7257459282875063, "epoch": 0.192, "grad_norm": 808.0, "kl_loss_10": 155.30001373291014, "kl_loss_2": 1698.1914672851562, "kl_loss_3": 1284.7894897460938, "kl_loss_7": 421.06340942382815, "learning_rate": 0.0009189026770252437, "loss": 894.5027, "step": 1920 }, { "ce_loss_10": 3.6317692160606385, "ce_loss_13": 3.5578647017478944, "ce_loss_2": 4.357654023170471, "ce_loss_3": 4.154970502853393, "ce_loss_7": 3.754297876358032, "epoch": 0.193, "grad_norm": 672.0, "kl_loss_10": 156.60808029174805, "kl_loss_2": 1681.9849914550782, "kl_loss_3": 1293.0608825683594, "kl_loss_7": 425.2352691650391, "learning_rate": 0.000918034300461078, "loss": 914.7908, "step": 1930 }, { "ce_loss_10": 3.661780071258545, "ce_loss_13": 3.5855299711227415, "ce_loss_2": 4.3866435289382935, "ce_loss_3": 4.176734316349029, "ce_loss_7": 3.780391752719879, "epoch": 0.194, "grad_norm": 508.0, "kl_loss_10": 158.01766357421874, "kl_loss_2": 1686.9480285644531, "kl_loss_3": 1278.583349609375, "kl_loss_7": 423.1416976928711, "learning_rate": 0.0009171617142961477, "loss": 886.9112, "step": 1940 }, { "ce_loss_10": 3.619071674346924, "ce_loss_13": 3.544144856929779, "ce_loss_2": 4.3446578741073605, "ce_loss_3": 4.132830607891083, "ce_loss_7": 3.7368661999702453, "epoch": 0.195, "grad_norm": 736.0, "kl_loss_10": 156.3160285949707, "kl_loss_2": 1687.3851745605468, "kl_loss_3": 1275.4538269042969, "kl_loss_7": 421.2539489746094, "learning_rate": 0.0009162849273173857, "loss": 885.9303, "step": 1950 }, { "ce_loss_10": 3.553014504909515, "ce_loss_13": 3.4827070951461794, "ce_loss_2": 4.291935205459595, "ce_loss_3": 4.0778828144073485, "ce_loss_7": 3.6772719860076903, "epoch": 0.196, "grad_norm": 628.0, "kl_loss_10": 154.84951171875, "kl_loss_2": 1692.2017944335937, "kl_loss_3": 1271.0808410644531, "kl_loss_7": 426.93184356689454, "learning_rate": 0.0009154039483540273, "loss": 894.1963, "step": 1960 }, { "ce_loss_10": 3.5380710124969483, "ce_loss_13": 3.4637695431709288, "ce_loss_2": 4.2753846645355225, "ce_loss_3": 4.052942824363709, "ce_loss_7": 3.656904327869415, "epoch": 0.197, "grad_norm": 632.0, "kl_loss_10": 159.71602325439454, "kl_loss_2": 1709.4674560546875, "kl_loss_3": 1272.3471313476562, "kl_loss_7": 423.2468658447266, "learning_rate": 0.0009145187862775209, "loss": 892.0132, "step": 1970 }, { "ce_loss_10": 3.573553669452667, "ce_loss_13": 3.503090190887451, "ce_loss_2": 4.311598265171051, "ce_loss_3": 4.097398114204407, "ce_loss_7": 3.6970279932022097, "epoch": 0.198, "grad_norm": 640.0, "kl_loss_10": 162.31361923217773, "kl_loss_2": 1708.26845703125, "kl_loss_3": 1289.0806701660156, "kl_loss_7": 427.25081634521484, "learning_rate": 0.0009136294500014386, "loss": 892.4137, "step": 1980 }, { "ce_loss_10": 3.519877481460571, "ce_loss_13": 3.445008838176727, "ce_loss_2": 4.2790512323379515, "ce_loss_3": 4.054595351219177, "ce_loss_7": 3.6438548445701597, "epoch": 0.199, "grad_norm": 828.0, "kl_loss_10": 163.45086746215821, "kl_loss_2": 1735.9828308105468, "kl_loss_3": 1307.2176513671875, "kl_loss_7": 429.4031814575195, "learning_rate": 0.000912735948481387, "loss": 908.3786, "step": 1990 }, { "ce_loss_10": 3.553472650051117, "ce_loss_13": 3.478245162963867, "ce_loss_2": 4.28654375076294, "ce_loss_3": 4.0802433609962465, "ce_loss_7": 3.677069938182831, "epoch": 0.2, "grad_norm": 628.0, "kl_loss_10": 160.29014053344727, "kl_loss_2": 1707.9494140625, "kl_loss_3": 1296.2968994140624, "kl_loss_7": 428.4298263549805, "learning_rate": 0.0009118382907149164, "loss": 885.6779, "step": 2000 }, { "ce_loss_10": 3.5850682854652405, "ce_loss_13": 3.50952627658844, "ce_loss_2": 4.315432548522949, "ce_loss_3": 4.100254940986633, "ce_loss_7": 3.7033676624298097, "epoch": 0.201, "grad_norm": 708.0, "kl_loss_10": 157.8212142944336, "kl_loss_2": 1702.1678955078125, "kl_loss_3": 1284.286181640625, "kl_loss_7": 423.3880950927734, "learning_rate": 0.0009109364857414306, "loss": 884.5561, "step": 2010 }, { "ce_loss_10": 3.5489532589912414, "ce_loss_13": 3.4744242310523985, "ce_loss_2": 4.2670130610466, "ce_loss_3": 4.060766589641571, "ce_loss_7": 3.668097496032715, "epoch": 0.202, "grad_norm": 728.0, "kl_loss_10": 156.86791915893554, "kl_loss_2": 1700.6970947265625, "kl_loss_3": 1274.6605895996095, "kl_loss_7": 423.90224609375, "learning_rate": 0.0009100305426420956, "loss": 905.0367, "step": 2020 }, { "ce_loss_10": 3.5054296493530273, "ce_loss_13": 3.4360422253608705, "ce_loss_2": 4.28641676902771, "ce_loss_3": 4.060827207565308, "ce_loss_7": 3.6314488530158995, "epoch": 0.203, "grad_norm": 568.0, "kl_loss_10": 154.1449821472168, "kl_loss_2": 1787.4573059082031, "kl_loss_3": 1334.0317810058593, "kl_loss_7": 424.69750823974607, "learning_rate": 0.0009091204705397484, "loss": 899.0799, "step": 2030 }, { "ce_loss_10": 3.500208306312561, "ce_loss_13": 3.4300543308258056, "ce_loss_2": 4.267745566368103, "ce_loss_3": 4.045078694820404, "ce_loss_7": 3.619711148738861, "epoch": 0.204, "grad_norm": 668.0, "kl_loss_10": 156.37296295166016, "kl_loss_2": 1764.6390991210938, "kl_loss_3": 1327.2704162597656, "kl_loss_7": 422.20280303955076, "learning_rate": 0.0009082062785988049, "loss": 907.7636, "step": 2040 }, { "ce_loss_10": 3.639115536212921, "ce_loss_13": 3.570273053646088, "ce_loss_2": 4.345433187484741, "ce_loss_3": 4.141461956501007, "ce_loss_7": 3.7540995121002196, "epoch": 0.205, "grad_norm": 684.0, "kl_loss_10": 150.40593338012695, "kl_loss_2": 1673.0178833007812, "kl_loss_3": 1264.1800903320313, "kl_loss_7": 411.66319274902344, "learning_rate": 0.0009072879760251679, "loss": 886.5711, "step": 2050 }, { "ce_loss_10": 3.5780452609062197, "ce_loss_13": 3.506811273097992, "ce_loss_2": 4.32657071352005, "ce_loss_3": 4.111942863464355, "ce_loss_7": 3.7021100521087646, "epoch": 0.206, "grad_norm": 848.0, "kl_loss_10": 153.30594024658203, "kl_loss_2": 1718.6283569335938, "kl_loss_3": 1295.268243408203, "kl_loss_7": 424.54795989990237, "learning_rate": 0.0009063655720661341, "loss": 891.0267, "step": 2060 }, { "ce_loss_10": 3.6308677077293394, "ce_loss_13": 3.559994864463806, "ce_loss_2": 4.349410057067871, "ce_loss_3": 4.144072437286377, "ce_loss_7": 3.752864146232605, "epoch": 0.207, "grad_norm": 656.0, "kl_loss_10": 154.1874885559082, "kl_loss_2": 1672.9453002929688, "kl_loss_3": 1275.2094665527343, "kl_loss_7": 426.11539459228516, "learning_rate": 0.000905439076010301, "loss": 882.46, "step": 2070 }, { "ce_loss_10": 3.5808544397354125, "ce_loss_13": 3.509160017967224, "ce_loss_2": 4.326603198051453, "ce_loss_3": 4.109063744544983, "ce_loss_7": 3.705178952217102, "epoch": 0.208, "grad_norm": 692.0, "kl_loss_10": 153.3233413696289, "kl_loss_2": 1708.8538391113282, "kl_loss_3": 1284.7677429199218, "kl_loss_7": 424.9750183105469, "learning_rate": 0.0009045084971874737, "loss": 878.8373, "step": 2080 }, { "ce_loss_10": 3.5615262746810914, "ce_loss_13": 3.489412307739258, "ce_loss_2": 4.29884420633316, "ce_loss_3": 4.0808792352676395, "ce_loss_7": 3.6813034057617187, "epoch": 0.209, "grad_norm": 676.0, "kl_loss_10": 155.00849456787108, "kl_loss_2": 1709.5053344726562, "kl_loss_3": 1283.1550598144531, "kl_loss_7": 423.63720855712893, "learning_rate": 0.0009035738449685707, "loss": 901.9238, "step": 2090 }, { "ce_loss_10": 3.500594878196716, "ce_loss_13": 3.4269418120384216, "ce_loss_2": 4.261061310768127, "ce_loss_3": 4.0436607837677006, "ce_loss_7": 3.6208752751350404, "epoch": 0.21, "grad_norm": 664.0, "kl_loss_10": 154.60431098937988, "kl_loss_2": 1738.2481689453125, "kl_loss_3": 1307.3330932617187, "kl_loss_7": 419.5928253173828, "learning_rate": 0.0009026351287655293, "loss": 886.3637, "step": 2100 }, { "ce_loss_10": 3.6964518427848816, "ce_loss_13": 3.628730046749115, "ce_loss_2": 4.3766416072845455, "ce_loss_3": 4.168106377124786, "ce_loss_7": 3.810632276535034, "epoch": 0.211, "grad_norm": 892.0, "kl_loss_10": 147.90714263916016, "kl_loss_2": 1608.2051940917968, "kl_loss_3": 1206.7733154296875, "kl_loss_7": 407.6394790649414, "learning_rate": 0.0009016923580312113, "loss": 848.7596, "step": 2110 }, { "ce_loss_10": 3.551041543483734, "ce_loss_13": 3.4838817596435545, "ce_loss_2": 4.263743901252747, "ce_loss_3": 4.0499477744102474, "ce_loss_7": 3.675402212142944, "epoch": 0.212, "grad_norm": 560.0, "kl_loss_10": 151.607958984375, "kl_loss_2": 1656.1309509277344, "kl_loss_3": 1250.3619262695313, "kl_loss_7": 422.76923828125, "learning_rate": 0.0009007455422593077, "loss": 885.3857, "step": 2120 }, { "ce_loss_10": 3.562802243232727, "ce_loss_13": 3.4917763471603394, "ce_loss_2": 4.310350394248962, "ce_loss_3": 4.089577150344849, "ce_loss_7": 3.6881680369377134, "epoch": 0.213, "grad_norm": 596.0, "kl_loss_10": 154.58464736938475, "kl_loss_2": 1729.4122314453125, "kl_loss_3": 1302.9043884277344, "kl_loss_7": 434.224055480957, "learning_rate": 0.0008997946909842425, "loss": 899.9531, "step": 2130 }, { "ce_loss_10": 3.5820153951644897, "ce_loss_13": 3.5091649174690245, "ce_loss_2": 4.356534934043884, "ce_loss_3": 4.134391415119171, "ce_loss_7": 3.712181234359741, "epoch": 0.214, "grad_norm": 868.0, "kl_loss_10": 160.32343063354492, "kl_loss_2": 1772.4649536132813, "kl_loss_3": 1337.5961608886719, "kl_loss_7": 449.59600219726565, "learning_rate": 0.0008988398137810777, "loss": 897.9156, "step": 2140 }, { "ce_loss_10": 3.6153340220451353, "ce_loss_13": 3.5469638228416445, "ce_loss_2": 4.333649778366089, "ce_loss_3": 4.1305704593658445, "ce_loss_7": 3.746591365337372, "epoch": 0.215, "grad_norm": 744.0, "kl_loss_10": 152.21541442871094, "kl_loss_2": 1670.5354553222655, "kl_loss_3": 1258.0333435058594, "kl_loss_7": 436.6368347167969, "learning_rate": 0.0008978809202654162, "loss": 878.6867, "step": 2150 }, { "ce_loss_10": 3.5994900941848753, "ce_loss_13": 3.523406147956848, "ce_loss_2": 4.32131108045578, "ce_loss_3": 4.106508493423462, "ce_loss_7": 3.71969929933548, "epoch": 0.216, "grad_norm": 680.0, "kl_loss_10": 156.3335433959961, "kl_loss_2": 1665.5125793457032, "kl_loss_3": 1255.7060974121093, "kl_loss_7": 430.11304168701173, "learning_rate": 0.0008969180200933046, "loss": 882.8723, "step": 2160 }, { "ce_loss_10": 3.5530227780342103, "ce_loss_13": 3.4805894613265993, "ce_loss_2": 4.314852666854859, "ce_loss_3": 4.093249773979187, "ce_loss_7": 3.68630450963974, "epoch": 0.217, "grad_norm": 884.0, "kl_loss_10": 159.62687911987305, "kl_loss_2": 1730.9878967285156, "kl_loss_3": 1299.03779296875, "kl_loss_7": 449.18050384521484, "learning_rate": 0.0008959511229611376, "loss": 904.2477, "step": 2170 }, { "ce_loss_10": 3.6367149114608766, "ce_loss_13": 3.5638245701789857, "ce_loss_2": 4.346567583084107, "ce_loss_3": 4.133716177940369, "ce_loss_7": 3.7738547563552856, "epoch": 0.218, "grad_norm": 1040.0, "kl_loss_10": 154.5606544494629, "kl_loss_2": 1663.1614990234375, "kl_loss_3": 1249.802392578125, "kl_loss_7": 464.9475952148438, "learning_rate": 0.0008949802386055581, "loss": 886.1459, "step": 2180 }, { "ce_loss_10": 3.4948283553123476, "ce_loss_13": 3.422482490539551, "ce_loss_2": 4.231684553623199, "ce_loss_3": 4.012719178199768, "ce_loss_7": 3.651959717273712, "epoch": 0.219, "grad_norm": 848.0, "kl_loss_10": 152.99597930908203, "kl_loss_2": 1675.9751953125, "kl_loss_3": 1256.631854248047, "kl_loss_7": 486.07566986083987, "learning_rate": 0.0008940053768033609, "loss": 903.9787, "step": 2190 }, { "ce_loss_10": 3.5814868688583372, "ce_loss_13": 3.511065399646759, "ce_loss_2": 4.282150137424469, "ce_loss_3": 4.074921286106109, "ce_loss_7": 3.7124979138374328, "epoch": 0.22, "grad_norm": 612.0, "kl_loss_10": 151.5022117614746, "kl_loss_2": 1646.2027404785156, "kl_loss_3": 1238.9055969238282, "kl_loss_7": 446.9908981323242, "learning_rate": 0.0008930265473713938, "loss": 875.279, "step": 2200 }, { "ce_loss_10": 3.545922613143921, "ce_loss_13": 3.4695589780807494, "ce_loss_2": 4.269280636310578, "ce_loss_3": 4.051465404033661, "ce_loss_7": 3.678086221218109, "epoch": 0.221, "grad_norm": 720.0, "kl_loss_10": 159.6601760864258, "kl_loss_2": 1673.8038024902344, "kl_loss_3": 1248.3465393066406, "kl_loss_7": 435.1474182128906, "learning_rate": 0.0008920437601664579, "loss": 865.5202, "step": 2210 }, { "ce_loss_10": 3.540095329284668, "ce_loss_13": 3.461512506008148, "ce_loss_2": 4.25678983926773, "ce_loss_3": 4.043842458724976, "ce_loss_7": 3.6578521966934203, "epoch": 0.222, "grad_norm": 696.0, "kl_loss_10": 164.2334945678711, "kl_loss_2": 1687.953759765625, "kl_loss_3": 1269.6444458007813, "kl_loss_7": 430.08106689453126, "learning_rate": 0.0008910570250852097, "loss": 872.3879, "step": 2220 }, { "ce_loss_10": 3.645900297164917, "ce_loss_13": 3.5746286392211912, "ce_loss_2": 4.329179430007935, "ce_loss_3": 4.126194024085999, "ce_loss_7": 3.7611724615097044, "epoch": 0.223, "grad_norm": 580.0, "kl_loss_10": 161.01494750976562, "kl_loss_2": 1616.48173828125, "kl_loss_3": 1212.9592407226562, "kl_loss_7": 411.58105926513673, "learning_rate": 0.0008900663520640604, "loss": 857.1105, "step": 2230 }, { "ce_loss_10": 3.5964877367019654, "ce_loss_13": 3.51711448431015, "ce_loss_2": 4.304993438720703, "ce_loss_3": 4.098519861698151, "ce_loss_7": 3.7116694688796996, "epoch": 0.224, "grad_norm": 736.0, "kl_loss_10": 163.17066192626953, "kl_loss_2": 1651.7649719238282, "kl_loss_3": 1242.3382141113282, "kl_loss_7": 411.6787506103516, "learning_rate": 0.0008890717510790764, "loss": 876.1222, "step": 2240 }, { "ce_loss_10": 3.5471703886985777, "ce_loss_13": 3.4758081436157227, "ce_loss_2": 4.273530685901642, "ce_loss_3": 4.067912590503693, "ce_loss_7": 3.6660157322883604, "epoch": 0.225, "grad_norm": 708.0, "kl_loss_10": 157.27454681396483, "kl_loss_2": 1677.1010131835938, "kl_loss_3": 1277.8482238769532, "kl_loss_7": 413.5911529541016, "learning_rate": 0.0008880732321458784, "loss": 886.1244, "step": 2250 }, { "ce_loss_10": 3.584555244445801, "ce_loss_13": 3.5100926637649534, "ce_loss_2": 4.294793701171875, "ce_loss_3": 4.089890396595001, "ce_loss_7": 3.6978877186775208, "epoch": 0.226, "grad_norm": 668.0, "kl_loss_10": 156.83258056640625, "kl_loss_2": 1654.9069946289062, "kl_loss_3": 1258.4203796386719, "kl_loss_7": 411.3527374267578, "learning_rate": 0.0008870708053195413, "loss": 882.6709, "step": 2260 }, { "ce_loss_10": 3.6060186505317686, "ce_loss_13": 3.537421989440918, "ce_loss_2": 4.294502913951874, "ce_loss_3": 4.09798412322998, "ce_loss_7": 3.7177546858787536, "epoch": 0.227, "grad_norm": 752.0, "kl_loss_10": 151.7200439453125, "kl_loss_2": 1629.2985412597657, "kl_loss_3": 1232.4538330078126, "kl_loss_7": 405.0216552734375, "learning_rate": 0.0008860644806944918, "loss": 858.0595, "step": 2270 }, { "ce_loss_10": 3.5466419100761413, "ce_loss_13": 3.471850037574768, "ce_loss_2": 4.279071676731109, "ce_loss_3": 4.0602316617965695, "ce_loss_7": 3.6642366647720337, "epoch": 0.228, "grad_norm": 732.0, "kl_loss_10": 159.87943267822266, "kl_loss_2": 1702.7948913574219, "kl_loss_3": 1281.67705078125, "kl_loss_7": 420.01871795654296, "learning_rate": 0.0008850542684044079, "loss": 867.6785, "step": 2280 }, { "ce_loss_10": 3.5172786831855776, "ce_loss_13": 3.442570173740387, "ce_loss_2": 4.280114269256591, "ce_loss_3": 4.0603335976600645, "ce_loss_7": 3.642946183681488, "epoch": 0.229, "grad_norm": 868.0, "kl_loss_10": 165.0769485473633, "kl_loss_2": 1756.4314086914062, "kl_loss_3": 1316.273486328125, "kl_loss_7": 426.0765625, "learning_rate": 0.0008840401786221159, "loss": 888.6179, "step": 2290 }, { "ce_loss_10": 3.653991627693176, "ce_loss_13": 3.587805616855621, "ce_loss_2": 4.348642802238464, "ce_loss_3": 4.1388364911079405, "ce_loss_7": 3.767448306083679, "epoch": 0.23, "grad_norm": 660.0, "kl_loss_10": 150.6523193359375, "kl_loss_2": 1617.4279296875, "kl_loss_3": 1219.0333129882813, "kl_loss_7": 399.7216049194336, "learning_rate": 0.000883022221559489, "loss": 847.182, "step": 2300 }, { "ce_loss_10": 3.6083556771278382, "ce_loss_13": 3.5367133378982545, "ce_loss_2": 4.322761964797974, "ce_loss_3": 4.111302089691162, "ce_loss_7": 3.720748817920685, "epoch": 0.231, "grad_norm": 700.0, "kl_loss_10": 152.93154830932616, "kl_loss_2": 1665.5345397949218, "kl_loss_3": 1251.20830078125, "kl_loss_7": 409.15943145751953, "learning_rate": 0.0008820004074673434, "loss": 890.3789, "step": 2310 }, { "ce_loss_10": 3.51333087682724, "ce_loss_13": 3.4452512502670287, "ce_loss_2": 4.2334395289421085, "ce_loss_3": 4.0242817282676695, "ce_loss_7": 3.6292155742645265, "epoch": 0.232, "grad_norm": 852.0, "kl_loss_10": 146.69402542114258, "kl_loss_2": 1683.8106201171875, "kl_loss_3": 1269.85615234375, "kl_loss_7": 408.7549362182617, "learning_rate": 0.0008809747466353355, "loss": 861.5091, "step": 2320 }, { "ce_loss_10": 3.5223829627037047, "ce_loss_13": 3.4536111116409303, "ce_loss_2": 4.234097373485565, "ce_loss_3": 4.026098394393921, "ce_loss_7": 3.636929678916931, "epoch": 0.233, "grad_norm": 840.0, "kl_loss_10": 149.8192039489746, "kl_loss_2": 1664.0373168945312, "kl_loss_3": 1257.5840637207032, "kl_loss_7": 404.71526489257815, "learning_rate": 0.0008799452493918585, "loss": 871.5133, "step": 2330 }, { "ce_loss_10": 3.60318318605423, "ce_loss_13": 3.535007989406586, "ce_loss_2": 4.313662338256836, "ce_loss_3": 4.121019208431244, "ce_loss_7": 3.7191484212875365, "epoch": 0.234, "grad_norm": 620.0, "kl_loss_10": 147.37739295959472, "kl_loss_2": 1648.7771301269531, "kl_loss_3": 1272.6883178710937, "kl_loss_7": 401.2149230957031, "learning_rate": 0.0008789119261039385, "loss": 889.3826, "step": 2340 }, { "ce_loss_10": 3.512122702598572, "ce_loss_13": 3.444090461730957, "ce_loss_2": 4.230511236190796, "ce_loss_3": 4.033337998390198, "ce_loss_7": 3.623887372016907, "epoch": 0.235, "grad_norm": 584.0, "kl_loss_10": 146.48653411865234, "kl_loss_2": 1640.4592712402343, "kl_loss_3": 1250.2205444335937, "kl_loss_7": 398.50293121337893, "learning_rate": 0.0008778747871771292, "loss": 851.5988, "step": 2350 }, { "ce_loss_10": 3.560721528530121, "ce_loss_13": 3.495018112659454, "ce_loss_2": 4.254593050479889, "ce_loss_3": 4.062426352500916, "ce_loss_7": 3.675720953941345, "epoch": 0.236, "grad_norm": 704.0, "kl_loss_10": 143.95477371215821, "kl_loss_2": 1612.37734375, "kl_loss_3": 1231.095751953125, "kl_loss_7": 391.27414855957034, "learning_rate": 0.0008768338430554083, "loss": 843.9201, "step": 2360 }, { "ce_loss_10": 3.5749622344970704, "ce_loss_13": 3.5057023644447325, "ce_loss_2": 4.284319353103638, "ce_loss_3": 4.084056878089905, "ce_loss_7": 3.691650938987732, "epoch": 0.237, "grad_norm": 748.0, "kl_loss_10": 149.22578506469728, "kl_loss_2": 1642.1105163574218, "kl_loss_3": 1249.0818786621094, "kl_loss_7": 417.6405731201172, "learning_rate": 0.0008757891042210713, "loss": 868.1691, "step": 2370 }, { "ce_loss_10": 3.593665659427643, "ce_loss_13": 3.52440140247345, "ce_loss_2": 4.300392985343933, "ce_loss_3": 4.098441755771637, "ce_loss_7": 3.7135939955711366, "epoch": 0.238, "grad_norm": 668.0, "kl_loss_10": 148.62272415161132, "kl_loss_2": 1634.145733642578, "kl_loss_3": 1239.5517639160157, "kl_loss_7": 415.1738784790039, "learning_rate": 0.0008747405811946271, "loss": 862.5363, "step": 2380 }, { "ce_loss_10": 3.490070474147797, "ce_loss_13": 3.4224871158599854, "ce_loss_2": 4.224261665344239, "ce_loss_3": 4.004424059391022, "ce_loss_7": 3.6100102066993713, "epoch": 0.239, "grad_norm": 652.0, "kl_loss_10": 147.28631515502929, "kl_loss_2": 1686.8346069335937, "kl_loss_3": 1270.0725341796874, "kl_loss_7": 419.5466110229492, "learning_rate": 0.0008736882845346905, "loss": 852.6951, "step": 2390 }, { "ce_loss_10": 3.585322046279907, "ce_loss_13": 3.5153682231903076, "ce_loss_2": 4.304745924472809, "ce_loss_3": 4.094923424720764, "ce_loss_7": 3.7078854203224183, "epoch": 0.24, "grad_norm": 692.0, "kl_loss_10": 151.89797973632812, "kl_loss_2": 1647.5736877441407, "kl_loss_3": 1242.0455993652345, "kl_loss_7": 421.9617263793945, "learning_rate": 0.0008726322248378774, "loss": 857.3443, "step": 2400 }, { "ce_loss_10": 3.583414626121521, "ce_loss_13": 3.515112745761871, "ce_loss_2": 4.311495113372803, "ce_loss_3": 4.099529230594635, "ce_loss_7": 3.7044310569763184, "epoch": 0.241, "grad_norm": 652.0, "kl_loss_10": 147.43828582763672, "kl_loss_2": 1680.8733215332031, "kl_loss_3": 1264.9752563476563, "kl_loss_7": 411.56260681152344, "learning_rate": 0.0008715724127386971, "loss": 878.9705, "step": 2410 }, { "ce_loss_10": 3.6531842708587647, "ce_loss_13": 3.587493336200714, "ce_loss_2": 4.3451045751571655, "ce_loss_3": 4.142902874946595, "ce_loss_7": 3.767136585712433, "epoch": 0.242, "grad_norm": 620.0, "kl_loss_10": 146.30834274291993, "kl_loss_2": 1624.2251770019532, "kl_loss_3": 1225.6600646972656, "kl_loss_7": 408.7913787841797, "learning_rate": 0.0008705088589094458, "loss": 860.6354, "step": 2420 }, { "ce_loss_10": 3.665231490135193, "ce_loss_13": 3.599679338932037, "ce_loss_2": 4.36877521276474, "ce_loss_3": 4.171533620357513, "ce_loss_7": 3.787107288837433, "epoch": 0.243, "grad_norm": 664.0, "kl_loss_10": 144.3328182220459, "kl_loss_2": 1632.8966247558594, "kl_loss_3": 1233.8675537109375, "kl_loss_7": 410.8569137573242, "learning_rate": 0.0008694415740600988, "loss": 860.5134, "step": 2430 }, { "ce_loss_10": 3.5181034803390503, "ce_loss_13": 3.4537795782089233, "ce_loss_2": 4.249938941001892, "ce_loss_3": 4.046365630626679, "ce_loss_7": 3.6414516806602477, "epoch": 0.244, "grad_norm": 704.0, "kl_loss_10": 145.3531593322754, "kl_loss_2": 1684.775213623047, "kl_loss_3": 1287.4376159667968, "kl_loss_7": 412.328303527832, "learning_rate": 0.0008683705689382025, "loss": 869.7711, "step": 2440 }, { "ce_loss_10": 3.6008686304092405, "ce_loss_13": 3.5355240225791933, "ce_loss_2": 4.291689145565033, "ce_loss_3": 4.094740498065948, "ce_loss_7": 3.7141705632209776, "epoch": 0.245, "grad_norm": 760.0, "kl_loss_10": 143.44989089965821, "kl_loss_2": 1616.675146484375, "kl_loss_3": 1220.3111206054687, "kl_loss_7": 396.51316223144534, "learning_rate": 0.0008672958543287666, "loss": 868.0814, "step": 2450 }, { "ce_loss_10": 3.6143836855888365, "ce_loss_13": 3.544294059276581, "ce_loss_2": 4.300798118114471, "ce_loss_3": 4.102933740615844, "ce_loss_7": 3.72817702293396, "epoch": 0.246, "grad_norm": 736.0, "kl_loss_10": 147.29279327392578, "kl_loss_2": 1613.1561401367187, "kl_loss_3": 1222.6258117675782, "kl_loss_7": 401.4141021728516, "learning_rate": 0.0008662174410541554, "loss": 850.6215, "step": 2460 }, { "ce_loss_10": 3.574166786670685, "ce_loss_13": 3.5087246656417848, "ce_loss_2": 4.2659319877624515, "ce_loss_3": 4.068793642520904, "ce_loss_7": 3.686735737323761, "epoch": 0.247, "grad_norm": 604.0, "kl_loss_10": 145.85303649902343, "kl_loss_2": 1618.3893676757812, "kl_loss_3": 1220.1310791015626, "kl_loss_7": 400.6472534179687, "learning_rate": 0.0008651353399739787, "loss": 863.9098, "step": 2470 }, { "ce_loss_10": 3.6058215737342834, "ce_loss_13": 3.539613950252533, "ce_loss_2": 4.302924489974975, "ce_loss_3": 4.098112308979035, "ce_loss_7": 3.71759934425354, "epoch": 0.248, "grad_norm": 548.0, "kl_loss_10": 147.60094833374023, "kl_loss_2": 1624.6452575683593, "kl_loss_3": 1222.2023498535157, "kl_loss_7": 401.4662857055664, "learning_rate": 0.0008640495619849821, "loss": 849.9955, "step": 2480 }, { "ce_loss_10": 3.567488098144531, "ce_loss_13": 3.501059103012085, "ce_loss_2": 4.264333772659302, "ce_loss_3": 4.056195986270905, "ce_loss_7": 3.6804752707481385, "epoch": 0.249, "grad_norm": 712.0, "kl_loss_10": 147.02945251464843, "kl_loss_2": 1627.2623229980468, "kl_loss_3": 1233.1324645996094, "kl_loss_7": 402.8577346801758, "learning_rate": 0.0008629601180209381, "loss": 850.3543, "step": 2490 }, { "ce_loss_10": 3.5611565947532653, "ce_loss_13": 3.492980194091797, "ce_loss_2": 4.257801342010498, "ce_loss_3": 4.052403700351715, "ce_loss_7": 3.6730109333992003, "epoch": 0.25, "grad_norm": 800.0, "kl_loss_10": 150.6272880554199, "kl_loss_2": 1621.1947387695313, "kl_loss_3": 1217.2358276367188, "kl_loss_7": 403.2303207397461, "learning_rate": 0.000861867019052535, "loss": 856.4564, "step": 2500 }, { "ce_loss_10": 3.472098398208618, "ce_loss_13": 3.402975833415985, "ce_loss_2": 4.218045926094055, "ce_loss_3": 3.995682156085968, "ce_loss_7": 3.5920477628707888, "epoch": 0.251, "grad_norm": 756.0, "kl_loss_10": 148.6118423461914, "kl_loss_2": 1683.9299011230469, "kl_loss_3": 1261.464288330078, "kl_loss_7": 407.4490676879883, "learning_rate": 0.0008607702760872678, "loss": 870.0967, "step": 2510 }, { "ce_loss_10": 3.5914869785308836, "ce_loss_13": 3.5267413854599, "ce_loss_2": 4.272609853744507, "ce_loss_3": 4.077222716808319, "ce_loss_7": 3.702296590805054, "epoch": 0.252, "grad_norm": 824.0, "kl_loss_10": 142.76109085083007, "kl_loss_2": 1589.925390625, "kl_loss_3": 1204.9856811523437, "kl_loss_7": 391.77073822021487, "learning_rate": 0.0008596699001693256, "loss": 856.7367, "step": 2520 }, { "ce_loss_10": 3.6065261363983154, "ce_loss_13": 3.542446482181549, "ce_loss_2": 4.280361306667328, "ce_loss_3": 4.078390645980835, "ce_loss_7": 3.713554584980011, "epoch": 0.253, "grad_norm": 580.0, "kl_loss_10": 146.36550064086913, "kl_loss_2": 1602.360205078125, "kl_loss_3": 1201.24404296875, "kl_loss_7": 391.7824478149414, "learning_rate": 0.0008585659023794818, "loss": 855.3571, "step": 2530 }, { "ce_loss_10": 3.556603026390076, "ce_loss_13": 3.488174021244049, "ce_loss_2": 4.2837646961212155, "ce_loss_3": 4.073796629905701, "ce_loss_7": 3.669671726226807, "epoch": 0.254, "grad_norm": 568.0, "kl_loss_10": 150.27775192260742, "kl_loss_2": 1671.5778869628907, "kl_loss_3": 1258.693133544922, "kl_loss_7": 409.5189697265625, "learning_rate": 0.0008574582938349817, "loss": 864.698, "step": 2540 }, { "ce_loss_10": 3.5615516781806944, "ce_loss_13": 3.48288893699646, "ce_loss_2": 4.289009380340576, "ce_loss_3": 4.085076451301575, "ce_loss_7": 3.6758302092552184, "epoch": 0.255, "grad_norm": 636.0, "kl_loss_10": 162.17492446899413, "kl_loss_2": 1689.662127685547, "kl_loss_3": 1286.3527221679688, "kl_loss_7": 415.7776596069336, "learning_rate": 0.0008563470856894315, "loss": 856.5075, "step": 2550 }, { "ce_loss_10": 3.5482829809188843, "ce_loss_13": 3.4777650833129883, "ce_loss_2": 4.255946707725525, "ce_loss_3": 4.048075020313263, "ce_loss_7": 3.656572926044464, "epoch": 0.256, "grad_norm": 784.0, "kl_loss_10": 158.6891746520996, "kl_loss_2": 1642.1806335449219, "kl_loss_3": 1246.0239196777343, "kl_loss_7": 399.031315612793, "learning_rate": 0.0008552322891326845, "loss": 858.266, "step": 2560 }, { "ce_loss_10": 3.5313944816589355, "ce_loss_13": 3.450025427341461, "ce_loss_2": 4.224313974380493, "ce_loss_3": 4.01665427684784, "ce_loss_7": 3.628642737865448, "epoch": 0.257, "grad_norm": 756.0, "kl_loss_10": 179.5360984802246, "kl_loss_2": 1649.637109375, "kl_loss_3": 1240.5775268554687, "kl_loss_7": 401.7601791381836, "learning_rate": 0.0008541139153907296, "loss": 857.8164, "step": 2570 }, { "ce_loss_10": 3.486237919330597, "ce_loss_13": 3.406646740436554, "ce_loss_2": 4.1815930843353275, "ce_loss_3": 3.9679081320762633, "ce_loss_7": 3.589344394207001, "epoch": 0.258, "grad_norm": 580.0, "kl_loss_10": 172.96023330688476, "kl_loss_2": 1636.2860229492187, "kl_loss_3": 1219.9958557128907, "kl_loss_7": 400.6501800537109, "learning_rate": 0.0008529919757255782, "loss": 859.2726, "step": 2580 }, { "ce_loss_10": 3.514637219905853, "ce_loss_13": 3.442402172088623, "ce_loss_2": 4.176178085803985, "ce_loss_3": 3.971681094169617, "ce_loss_7": 3.6139083743095397, "epoch": 0.259, "grad_norm": 592.0, "kl_loss_10": 162.93561325073242, "kl_loss_2": 1568.500262451172, "kl_loss_3": 1166.9389770507812, "kl_loss_7": 392.92857513427737, "learning_rate": 0.0008518664814351503, "loss": 832.6326, "step": 2590 }, { "ce_loss_10": 3.481887364387512, "ce_loss_13": 3.4060272455215452, "ce_loss_2": 4.2031479477882385, "ce_loss_3": 3.989342713356018, "ce_loss_7": 3.5949572801589964, "epoch": 0.26, "grad_norm": 812.0, "kl_loss_10": 160.07810974121094, "kl_loss_2": 1678.1281677246093, "kl_loss_3": 1261.206689453125, "kl_loss_7": 422.9075729370117, "learning_rate": 0.0008507374438531607, "loss": 893.9762, "step": 2600 }, { "ce_loss_10": 3.457890176773071, "ce_loss_13": 3.3876611232757567, "ce_loss_2": 4.156854557991028, "ce_loss_3": 3.951684260368347, "ce_loss_7": 3.5687990188598633, "epoch": 0.261, "grad_norm": 652.0, "kl_loss_10": 151.19789123535156, "kl_loss_2": 1623.104559326172, "kl_loss_3": 1217.5732849121093, "kl_loss_7": 405.58522033691406, "learning_rate": 0.0008496048743490053, "loss": 847.8467, "step": 2610 }, { "ce_loss_10": 3.606453371047974, "ce_loss_13": 3.5347030401229858, "ce_loss_2": 4.287022602558136, "ce_loss_3": 4.085285770893097, "ce_loss_7": 3.7209227800369264, "epoch": 0.262, "grad_norm": 740.0, "kl_loss_10": 149.46137313842775, "kl_loss_2": 1597.4227478027344, "kl_loss_3": 1202.5909423828125, "kl_loss_7": 406.67049102783204, "learning_rate": 0.0008484687843276469, "loss": 841.793, "step": 2620 }, { "ce_loss_10": 3.5375467777252196, "ce_loss_13": 3.4692538261413572, "ce_loss_2": 4.234356260299682, "ce_loss_3": 4.030397474765778, "ce_loss_7": 3.665135991573334, "epoch": 0.263, "grad_norm": 756.0, "kl_loss_10": 148.52831802368163, "kl_loss_2": 1626.5234375, "kl_loss_3": 1230.9505981445313, "kl_loss_7": 421.18434143066406, "learning_rate": 0.0008473291852294987, "loss": 864.2529, "step": 2630 }, { "ce_loss_10": 3.5409929156303406, "ce_loss_13": 3.474526059627533, "ce_loss_2": 4.241577196121216, "ce_loss_3": 4.039238381385803, "ce_loss_7": 3.658675765991211, "epoch": 0.264, "grad_norm": 752.0, "kl_loss_10": 146.5441551208496, "kl_loss_2": 1638.5023559570313, "kl_loss_3": 1236.1968872070313, "kl_loss_7": 412.63101959228516, "learning_rate": 0.0008461860885303114, "loss": 847.5814, "step": 2640 }, { "ce_loss_10": 3.5672938942909242, "ce_loss_13": 3.5024590492248535, "ce_loss_2": 4.25400961637497, "ce_loss_3": 4.04265718460083, "ce_loss_7": 3.682517182826996, "epoch": 0.265, "grad_norm": 532.0, "kl_loss_10": 143.3494441986084, "kl_loss_2": 1592.2557861328125, "kl_loss_3": 1193.9100402832032, "kl_loss_7": 399.56408233642577, "learning_rate": 0.000845039505741056, "loss": 840.8137, "step": 2650 }, { "ce_loss_10": 3.554959547519684, "ce_loss_13": 3.4861936926841737, "ce_loss_2": 4.258109152317047, "ce_loss_3": 4.058207333087921, "ce_loss_7": 3.673966348171234, "epoch": 0.266, "grad_norm": 668.0, "kl_loss_10": 145.93713989257813, "kl_loss_2": 1655.0006469726563, "kl_loss_3": 1257.4821166992188, "kl_loss_7": 414.46795959472655, "learning_rate": 0.0008438894484078086, "loss": 882.0232, "step": 2660 }, { "ce_loss_10": 3.5612363696098326, "ce_loss_13": 3.495659518241882, "ce_loss_2": 4.250771713256836, "ce_loss_3": 4.0548901677131655, "ce_loss_7": 3.6741419315338133, "epoch": 0.267, "grad_norm": 664.0, "kl_loss_10": 143.76045303344728, "kl_loss_2": 1608.0756408691407, "kl_loss_3": 1221.9843811035157, "kl_loss_7": 403.46703948974607, "learning_rate": 0.0008427359281116334, "loss": 850.6983, "step": 2670 }, { "ce_loss_10": 3.4647863864898683, "ce_loss_13": 3.4007498025894165, "ce_loss_2": 4.182628095149994, "ce_loss_3": 3.980172896385193, "ce_loss_7": 3.581213617324829, "epoch": 0.268, "grad_norm": 592.0, "kl_loss_10": 142.86443824768065, "kl_loss_2": 1652.2051391601562, "kl_loss_3": 1244.7330932617188, "kl_loss_7": 401.0715042114258, "learning_rate": 0.0008415789564684673, "loss": 856.052, "step": 2680 }, { "ce_loss_10": 3.7105233311653136, "ce_loss_13": 3.6397859692573546, "ce_loss_2": 4.390112090110779, "ce_loss_3": 4.1913481712341305, "ce_loss_7": 3.827344071865082, "epoch": 0.269, "grad_norm": 616.0, "kl_loss_10": 151.1160675048828, "kl_loss_2": 1576.1515380859375, "kl_loss_3": 1196.6380004882812, "kl_loss_7": 409.57061157226565, "learning_rate": 0.0008404185451290017, "loss": 832.6562, "step": 2690 }, { "ce_loss_10": 3.575976026058197, "ce_loss_13": 3.509383165836334, "ce_loss_2": 4.262010288238526, "ce_loss_3": 4.061827218532562, "ce_loss_7": 3.6923381447792054, "epoch": 0.27, "grad_norm": 828.0, "kl_loss_10": 145.46635818481445, "kl_loss_2": 1614.2012756347656, "kl_loss_3": 1211.5230590820313, "kl_loss_7": 400.46296844482424, "learning_rate": 0.0008392547057785661, "loss": 840.9771, "step": 2700 }, { "ce_loss_10": 3.504245734214783, "ce_loss_13": 3.4360852599143983, "ce_loss_2": 4.231841135025024, "ce_loss_3": 4.012355697154999, "ce_loss_7": 3.6181442975997924, "epoch": 0.271, "grad_norm": 732.0, "kl_loss_10": 148.68926086425782, "kl_loss_2": 1695.421563720703, "kl_loss_3": 1262.842755126953, "kl_loss_7": 405.0101089477539, "learning_rate": 0.0008380874501370098, "loss": 846.7374, "step": 2710 }, { "ce_loss_10": 3.4979911923408507, "ce_loss_13": 3.430197703838348, "ce_loss_2": 4.2191136360168455, "ce_loss_3": 4.007521188259124, "ce_loss_7": 3.6158869743347166, "epoch": 0.272, "grad_norm": 816.0, "kl_loss_10": 150.7740036010742, "kl_loss_2": 1669.1359069824218, "kl_loss_3": 1252.4361511230468, "kl_loss_7": 410.4910232543945, "learning_rate": 0.0008369167899585841, "loss": 863.3943, "step": 2720 }, { "ce_loss_10": 3.622397780418396, "ce_loss_13": 3.5552627205848695, "ce_loss_2": 4.291873097419739, "ce_loss_3": 4.092212498188019, "ce_loss_7": 3.728976047039032, "epoch": 0.273, "grad_norm": 532.0, "kl_loss_10": 145.40375938415528, "kl_loss_2": 1578.4437866210938, "kl_loss_3": 1183.6333801269532, "kl_loss_7": 390.3755645751953, "learning_rate": 0.0008357427370318238, "loss": 851.5875, "step": 2730 }, { "ce_loss_10": 3.57169429063797, "ce_loss_13": 3.5056095838546755, "ce_loss_2": 4.271306252479553, "ce_loss_3": 4.060312724113464, "ce_loss_7": 3.684804010391235, "epoch": 0.274, "grad_norm": 936.0, "kl_loss_10": 145.32760620117188, "kl_loss_2": 1629.5275451660157, "kl_loss_3": 1221.0515014648438, "kl_loss_7": 395.46714477539064, "learning_rate": 0.0008345653031794292, "loss": 853.4608, "step": 2740 }, { "ce_loss_10": 3.5738168597221374, "ce_loss_13": 3.507299304008484, "ce_loss_2": 4.262202572822571, "ce_loss_3": 4.064274084568024, "ce_loss_7": 3.6900595307350157, "epoch": 0.275, "grad_norm": 664.0, "kl_loss_10": 146.08263397216797, "kl_loss_2": 1609.2401916503907, "kl_loss_3": 1212.829638671875, "kl_loss_7": 400.2176742553711, "learning_rate": 0.0008333845002581458, "loss": 847.099, "step": 2750 }, { "ce_loss_10": 3.4940132975578306, "ce_loss_13": 3.4292171597480774, "ce_loss_2": 4.21322615146637, "ce_loss_3": 4.004913711547852, "ce_loss_7": 3.6087831974029543, "epoch": 0.276, "grad_norm": 588.0, "kl_loss_10": 146.3518180847168, "kl_loss_2": 1678.6763549804687, "kl_loss_3": 1262.9351531982422, "kl_loss_7": 407.4271759033203, "learning_rate": 0.0008322003401586462, "loss": 867.0438, "step": 2760 }, { "ce_loss_10": 3.5370134353637694, "ce_loss_13": 3.472352921962738, "ce_loss_2": 4.210755240917206, "ce_loss_3": 4.011643362045288, "ce_loss_7": 3.643345367908478, "epoch": 0.277, "grad_norm": 520.0, "kl_loss_10": 141.4054153442383, "kl_loss_2": 1575.5725219726562, "kl_loss_3": 1185.9700073242188, "kl_loss_7": 382.99049072265626, "learning_rate": 0.0008310128348054094, "loss": 814.8721, "step": 2770 }, { "ce_loss_10": 3.503373074531555, "ce_loss_13": 3.4395504236221313, "ce_loss_2": 4.196349406242371, "ce_loss_3": 3.9976847887039186, "ce_loss_7": 3.612117648124695, "epoch": 0.278, "grad_norm": 684.0, "kl_loss_10": 144.36290817260743, "kl_loss_2": 1611.1840881347657, "kl_loss_3": 1214.7904235839844, "kl_loss_7": 393.8703353881836, "learning_rate": 0.0008298219961566008, "loss": 840.2912, "step": 2780 }, { "ce_loss_10": 3.4721336245536802, "ce_loss_13": 3.4057936549186705, "ce_loss_2": 4.1955530643463135, "ce_loss_3": 3.995982491970062, "ce_loss_7": 3.588437759876251, "epoch": 0.279, "grad_norm": 664.0, "kl_loss_10": 145.45478706359864, "kl_loss_2": 1683.0884338378905, "kl_loss_3": 1281.4284118652345, "kl_loss_7": 402.82340850830076, "learning_rate": 0.0008286278362039527, "loss": 855.1207, "step": 2790 }, { "ce_loss_10": 3.499285411834717, "ce_loss_13": 3.432967686653137, "ce_loss_2": 4.224206006526947, "ce_loss_3": 4.018722629547119, "ce_loss_7": 3.612668144702911, "epoch": 0.28, "grad_norm": 544.0, "kl_loss_10": 142.82978515625, "kl_loss_2": 1658.8205200195312, "kl_loss_3": 1255.1792053222657, "kl_loss_7": 392.1188400268555, "learning_rate": 0.0008274303669726426, "loss": 841.1881, "step": 2800 }, { "ce_loss_10": 3.4018381118774412, "ce_loss_13": 3.3346792578697206, "ce_loss_2": 4.129230046272278, "ce_loss_3": 3.9258937001228333, "ce_loss_7": 3.518822467327118, "epoch": 0.281, "grad_norm": 764.0, "kl_loss_10": 143.8448387145996, "kl_loss_2": 1673.1943664550781, "kl_loss_3": 1270.9249450683594, "kl_loss_7": 403.2898895263672, "learning_rate": 0.0008262296005211721, "loss": 846.9486, "step": 2810 }, { "ce_loss_10": 3.5292460680007935, "ce_loss_13": 3.4640193819999694, "ce_loss_2": 4.239890122413636, "ce_loss_3": 4.038429474830627, "ce_loss_7": 3.641891372203827, "epoch": 0.282, "grad_norm": 624.0, "kl_loss_10": 143.57638244628907, "kl_loss_2": 1645.4285217285155, "kl_loss_3": 1244.0456237792969, "kl_loss_7": 397.29553985595703, "learning_rate": 0.0008250255489412463, "loss": 846.6036, "step": 2820 }, { "ce_loss_10": 3.62887202501297, "ce_loss_13": 3.561253750324249, "ce_loss_2": 4.315714979171753, "ce_loss_3": 4.123977625370026, "ce_loss_7": 3.738538372516632, "epoch": 0.283, "grad_norm": 728.0, "kl_loss_10": 148.4043426513672, "kl_loss_2": 1612.185321044922, "kl_loss_3": 1224.0401733398437, "kl_loss_7": 394.00487060546874, "learning_rate": 0.0008238182243576511, "loss": 846.8261, "step": 2830 }, { "ce_loss_10": 3.5985996246337892, "ce_loss_13": 3.5341097950935363, "ce_loss_2": 4.251928305625915, "ce_loss_3": 4.060415625572205, "ce_loss_7": 3.698906183242798, "epoch": 0.284, "grad_norm": 772.0, "kl_loss_10": 147.80193328857422, "kl_loss_2": 1545.2418884277345, "kl_loss_3": 1180.9795532226562, "kl_loss_7": 381.7768493652344, "learning_rate": 0.0008226076389281315, "loss": 823.0694, "step": 2840 }, { "ce_loss_10": 3.637994980812073, "ce_loss_13": 3.5726293325424194, "ce_loss_2": 4.302945876121521, "ce_loss_3": 4.106851041316986, "ce_loss_7": 3.74611839056015, "epoch": 0.285, "grad_norm": 728.0, "kl_loss_10": 148.46692581176757, "kl_loss_2": 1581.9210205078125, "kl_loss_3": 1188.579412841797, "kl_loss_7": 392.20827026367186, "learning_rate": 0.0008213938048432696, "loss": 821.8449, "step": 2850 }, { "ce_loss_10": 3.5638878107070924, "ce_loss_13": 3.497787523269653, "ce_loss_2": 4.24253523349762, "ce_loss_3": 4.041992700099945, "ce_loss_7": 3.676758587360382, "epoch": 0.286, "grad_norm": 672.0, "kl_loss_10": 148.57295265197754, "kl_loss_2": 1586.4780151367188, "kl_loss_3": 1201.8769653320312, "kl_loss_7": 398.1691589355469, "learning_rate": 0.0008201767343263612, "loss": 837.8469, "step": 2860 }, { "ce_loss_10": 3.5040563464164736, "ce_loss_13": 3.438361716270447, "ce_loss_2": 4.212529039382934, "ce_loss_3": 4.008230900764465, "ce_loss_7": 3.61529027223587, "epoch": 0.287, "grad_norm": 656.0, "kl_loss_10": 142.38912506103514, "kl_loss_2": 1630.1037292480469, "kl_loss_3": 1231.1203369140626, "kl_loss_7": 393.3270919799805, "learning_rate": 0.0008189564396332927, "loss": 822.9645, "step": 2870 }, { "ce_loss_10": 3.4819291472434997, "ce_loss_13": 3.419506084918976, "ce_loss_2": 4.196909439563751, "ce_loss_3": 3.9896247029304504, "ce_loss_7": 3.5970585227012633, "epoch": 0.288, "grad_norm": 820.0, "kl_loss_10": 143.1298400878906, "kl_loss_2": 1632.7102294921874, "kl_loss_3": 1227.3682495117187, "kl_loss_7": 391.58729553222656, "learning_rate": 0.0008177329330524181, "loss": 846.0508, "step": 2880 }, { "ce_loss_10": 3.552425742149353, "ce_loss_13": 3.4808704257011414, "ce_loss_2": 4.2266720056533815, "ce_loss_3": 4.03522971868515, "ce_loss_7": 3.6583752155303957, "epoch": 0.289, "grad_norm": 704.0, "kl_loss_10": 147.06133499145508, "kl_loss_2": 1568.7399963378907, "kl_loss_3": 1192.5995971679688, "kl_loss_7": 385.310205078125, "learning_rate": 0.0008165062269044352, "loss": 830.767, "step": 2890 }, { "ce_loss_10": 3.5040358662605287, "ce_loss_13": 3.4337138772010802, "ce_loss_2": 4.194976377487182, "ce_loss_3": 3.9983848929405212, "ce_loss_7": 3.6155543684959413, "epoch": 0.29, "grad_norm": 588.0, "kl_loss_10": 152.81832695007324, "kl_loss_2": 1609.0662231445312, "kl_loss_3": 1219.3178833007812, "kl_loss_7": 397.04918212890624, "learning_rate": 0.0008152763335422613, "loss": 843.7021, "step": 2900 }, { "ce_loss_10": 3.4909900784492494, "ce_loss_13": 3.419775998592377, "ce_loss_2": 4.196218192577362, "ce_loss_3": 3.98809734582901, "ce_loss_7": 3.596997547149658, "epoch": 0.291, "grad_norm": 680.0, "kl_loss_10": 158.0971206665039, "kl_loss_2": 1636.2816589355468, "kl_loss_3": 1234.2775512695312, "kl_loss_7": 399.34343872070315, "learning_rate": 0.0008140432653509088, "loss": 842.7512, "step": 2910 }, { "ce_loss_10": 3.5415608644485475, "ce_loss_13": 3.474157154560089, "ce_loss_2": 4.221889722347259, "ce_loss_3": 4.020258843898773, "ce_loss_7": 3.649514615535736, "epoch": 0.292, "grad_norm": 692.0, "kl_loss_10": 148.66607284545898, "kl_loss_2": 1602.7043029785157, "kl_loss_3": 1203.1907165527343, "kl_loss_7": 393.12267150878904, "learning_rate": 0.0008128070347473608, "loss": 827.3531, "step": 2920 }, { "ce_loss_10": 3.544040846824646, "ce_loss_13": 3.477999973297119, "ce_loss_2": 4.254761123657227, "ce_loss_3": 4.041979575157166, "ce_loss_7": 3.658432722091675, "epoch": 0.293, "grad_norm": 808.0, "kl_loss_10": 146.20843963623048, "kl_loss_2": 1651.2572692871095, "kl_loss_3": 1232.2542541503906, "kl_loss_7": 402.30601654052737, "learning_rate": 0.0008115676541804455, "loss": 844.0389, "step": 2930 }, { "ce_loss_10": 3.552527105808258, "ce_loss_13": 3.4877038478851317, "ce_loss_2": 4.229338979721069, "ce_loss_3": 4.030285179615021, "ce_loss_7": 3.6580453157424926, "epoch": 0.294, "grad_norm": 596.0, "kl_loss_10": 143.29071197509765, "kl_loss_2": 1590.7006225585938, "kl_loss_3": 1193.8719604492187, "kl_loss_7": 391.36986846923827, "learning_rate": 0.0008103251361307119, "loss": 836.4414, "step": 2940 }, { "ce_loss_10": 3.5813042759895324, "ce_loss_13": 3.5158876180648804, "ce_loss_2": 4.2632251381874084, "ce_loss_3": 4.065558528900146, "ce_loss_7": 3.694361913204193, "epoch": 0.295, "grad_norm": 952.0, "kl_loss_10": 144.15830841064454, "kl_loss_2": 1593.5725402832031, "kl_loss_3": 1203.2418884277345, "kl_loss_7": 400.3397613525391, "learning_rate": 0.0008090794931103026, "loss": 828.5039, "step": 2950 }, { "ce_loss_10": 3.5701854705810545, "ce_loss_13": 3.506050479412079, "ce_loss_2": 4.251671302318573, "ce_loss_3": 4.048157429695129, "ce_loss_7": 3.6768277049064637, "epoch": 0.296, "grad_norm": 704.0, "kl_loss_10": 140.38467445373536, "kl_loss_2": 1584.526300048828, "kl_loss_3": 1190.709698486328, "kl_loss_7": 385.96351318359376, "learning_rate": 0.0008078307376628291, "loss": 831.3865, "step": 2960 }, { "ce_loss_10": 3.6296083092689515, "ce_loss_13": 3.5671839475631715, "ce_loss_2": 4.282579398155212, "ce_loss_3": 4.087380516529083, "ce_loss_7": 3.736238217353821, "epoch": 0.297, "grad_norm": 652.0, "kl_loss_10": 137.90597343444824, "kl_loss_2": 1528.18271484375, "kl_loss_3": 1148.9516845703124, "kl_loss_7": 376.7103607177734, "learning_rate": 0.000806578882363245, "loss": 801.1323, "step": 2970 }, { "ce_loss_10": 3.5410609245300293, "ce_loss_13": 3.480309045314789, "ce_loss_2": 4.21635273694992, "ce_loss_3": 4.023106849193573, "ce_loss_7": 3.651861608028412, "epoch": 0.298, "grad_norm": 872.0, "kl_loss_10": 137.76350517272948, "kl_loss_2": 1573.1726318359374, "kl_loss_3": 1191.347589111328, "kl_loss_7": 386.14882202148436, "learning_rate": 0.0008053239398177191, "loss": 838.2662, "step": 2980 }, { "ce_loss_10": 3.521118640899658, "ce_loss_13": 3.4576277256011965, "ce_loss_2": 4.211665558815002, "ce_loss_3": 4.007085061073303, "ce_loss_7": 3.6337268471717836, "epoch": 0.299, "grad_norm": 804.0, "kl_loss_10": 139.79225463867186, "kl_loss_2": 1595.8759216308595, "kl_loss_3": 1195.4937316894532, "kl_loss_7": 385.7449325561523, "learning_rate": 0.0008040659226635089, "loss": 850.0258, "step": 2990 }, { "ce_loss_10": 3.65621532201767, "ce_loss_13": 3.589854049682617, "ce_loss_2": 4.332050681114197, "ce_loss_3": 4.135171377658844, "ce_loss_7": 3.7766780138015745, "epoch": 0.3, "grad_norm": 716.0, "kl_loss_10": 145.6563461303711, "kl_loss_2": 1576.9345825195312, "kl_loss_3": 1194.2649353027343, "kl_loss_7": 413.7483200073242, "learning_rate": 0.0008028048435688333, "loss": 829.777, "step": 3000 }, { "ce_loss_10": 3.5251790165901182, "ce_loss_13": 3.4619635701179505, "ce_loss_2": 4.2210460782051085, "ce_loss_3": 4.019052767753601, "ce_loss_7": 3.637720024585724, "epoch": 0.301, "grad_norm": 780.0, "kl_loss_10": 141.6140563964844, "kl_loss_2": 1625.1482543945312, "kl_loss_3": 1220.7144775390625, "kl_loss_7": 397.0341278076172, "learning_rate": 0.0008015407152327448, "loss": 838.2924, "step": 3010 }, { "ce_loss_10": 3.5773096442222596, "ce_loss_13": 3.510933578014374, "ce_loss_2": 4.2585627913475035, "ce_loss_3": 4.061378359794617, "ce_loss_7": 3.693008613586426, "epoch": 0.302, "grad_norm": 640.0, "kl_loss_10": 141.85217247009277, "kl_loss_2": 1606.789813232422, "kl_loss_3": 1214.533203125, "kl_loss_7": 400.54812774658205, "learning_rate": 0.0008002735503850016, "loss": 839.8877, "step": 3020 }, { "ce_loss_10": 3.4636507511138914, "ce_loss_13": 3.397989869117737, "ce_loss_2": 4.174388039112091, "ce_loss_3": 3.9605722427368164, "ce_loss_7": 3.577491307258606, "epoch": 0.303, "grad_norm": 600.0, "kl_loss_10": 143.30892028808594, "kl_loss_2": 1645.3674255371093, "kl_loss_3": 1230.3723571777343, "kl_loss_7": 406.163623046875, "learning_rate": 0.0007990033617859396, "loss": 850.1339, "step": 3030 }, { "ce_loss_10": 3.514539110660553, "ce_loss_13": 3.451056456565857, "ce_loss_2": 4.192648077011109, "ce_loss_3": 3.9946502685546874, "ce_loss_7": 3.623524785041809, "epoch": 0.304, "grad_norm": 636.0, "kl_loss_10": 141.17939987182618, "kl_loss_2": 1581.641424560547, "kl_loss_3": 1189.4458251953124, "kl_loss_7": 394.9202285766602, "learning_rate": 0.000797730162226344, "loss": 811.3634, "step": 3040 }, { "ce_loss_10": 3.5422164678573607, "ce_loss_13": 3.474979078769684, "ce_loss_2": 4.236926698684693, "ce_loss_3": 4.0294880151748655, "ce_loss_7": 3.663298499584198, "epoch": 0.305, "grad_norm": 800.0, "kl_loss_10": 146.4817584991455, "kl_loss_2": 1609.1203186035157, "kl_loss_3": 1205.9356170654296, "kl_loss_7": 413.7532440185547, "learning_rate": 0.0007964539645273203, "loss": 829.1729, "step": 3050 }, { "ce_loss_10": 3.553938126564026, "ce_loss_13": 3.4921163439750673, "ce_loss_2": 4.224380815029145, "ce_loss_3": 4.028742516040802, "ce_loss_7": 3.6613959074020386, "epoch": 0.306, "grad_norm": 588.0, "kl_loss_10": 142.35839805603027, "kl_loss_2": 1565.0873718261719, "kl_loss_3": 1175.3519958496095, "kl_loss_7": 399.386003112793, "learning_rate": 0.000795174781540165, "loss": 830.5109, "step": 3060 }, { "ce_loss_10": 3.631449246406555, "ce_loss_13": 3.5652127861976624, "ce_loss_2": 4.277838742733001, "ce_loss_3": 4.087875485420227, "ce_loss_7": 3.7482593059539795, "epoch": 0.307, "grad_norm": 948.0, "kl_loss_10": 145.9558292388916, "kl_loss_2": 1522.917333984375, "kl_loss_3": 1148.0942016601562, "kl_loss_7": 410.9433837890625, "learning_rate": 0.0007938926261462366, "loss": 827.8861, "step": 3070 }, { "ce_loss_10": 3.5865222454071044, "ce_loss_13": 3.5132097244262694, "ce_loss_2": 4.221575832366943, "ce_loss_3": 4.03163822889328, "ce_loss_7": 3.709211730957031, "epoch": 0.308, "grad_norm": 644.0, "kl_loss_10": 149.5517364501953, "kl_loss_2": 1554.0492431640625, "kl_loss_3": 1164.7528442382813, "kl_loss_7": 426.5201721191406, "learning_rate": 0.0007926075112568258, "loss": 842.9685, "step": 3080 }, { "ce_loss_10": 3.5743375420570374, "ce_loss_13": 3.506422483921051, "ce_loss_2": 4.235567331314087, "ce_loss_3": 4.042410182952881, "ce_loss_7": 3.6804218649864198, "epoch": 0.309, "grad_norm": 576.0, "kl_loss_10": 144.69867630004882, "kl_loss_2": 1571.8233825683594, "kl_loss_3": 1187.3203979492187, "kl_loss_7": 398.75867767333983, "learning_rate": 0.0007913194498130252, "loss": 817.3697, "step": 3090 }, { "ce_loss_10": 3.494604206085205, "ce_loss_13": 3.4323593616485595, "ce_loss_2": 4.202052474021912, "ce_loss_3": 3.988065266609192, "ce_loss_7": 3.6171945691108705, "epoch": 0.31, "grad_norm": 740.0, "kl_loss_10": 144.33132781982422, "kl_loss_2": 1625.1172668457032, "kl_loss_3": 1206.2160430908202, "kl_loss_7": 405.6477508544922, "learning_rate": 0.0007900284547855992, "loss": 844.1217, "step": 3100 }, { "ce_loss_10": 3.505799424648285, "ce_loss_13": 3.44124299287796, "ce_loss_2": 4.186880040168762, "ce_loss_3": 3.9723469614982605, "ce_loss_7": 3.6141058206558228, "epoch": 0.311, "grad_norm": 588.0, "kl_loss_10": 142.34576911926268, "kl_loss_2": 1592.635546875, "kl_loss_3": 1175.9765869140624, "kl_loss_7": 387.8748123168945, "learning_rate": 0.0007887345391748532, "loss": 841.3018, "step": 3110 }, { "ce_loss_10": 3.641534376144409, "ce_loss_13": 3.5780982255935667, "ce_loss_2": 4.291218495368957, "ce_loss_3": 4.0910943150520325, "ce_loss_7": 3.742597687244415, "epoch": 0.312, "grad_norm": 712.0, "kl_loss_10": 144.0367401123047, "kl_loss_2": 1544.438739013672, "kl_loss_3": 1151.2371490478515, "kl_loss_7": 377.4181396484375, "learning_rate": 0.0007874377160105036, "loss": 801.789, "step": 3120 }, { "ce_loss_10": 3.531910240650177, "ce_loss_13": 3.468705189228058, "ce_loss_2": 4.221284866333008, "ce_loss_3": 4.0103159785270694, "ce_loss_7": 3.6348586559295653, "epoch": 0.313, "grad_norm": 628.0, "kl_loss_10": 147.38395767211915, "kl_loss_2": 1606.8059204101562, "kl_loss_3": 1190.6296630859374, "kl_loss_7": 377.1212661743164, "learning_rate": 0.0007861379983515449, "loss": 844.5914, "step": 3130 }, { "ce_loss_10": 3.622577941417694, "ce_loss_13": 3.55368732213974, "ce_loss_2": 4.281805419921875, "ce_loss_3": 4.091654586791992, "ce_loss_7": 3.7212623953819275, "epoch": 0.314, "grad_norm": 656.0, "kl_loss_10": 151.8330436706543, "kl_loss_2": 1578.883233642578, "kl_loss_3": 1199.2917419433593, "kl_loss_7": 384.7697189331055, "learning_rate": 0.0007848353992861195, "loss": 819.7404, "step": 3140 }, { "ce_loss_10": 3.7075061440467834, "ce_loss_13": 3.6256630778312684, "ce_loss_2": 4.37855339050293, "ce_loss_3": 4.184061086177826, "ce_loss_7": 3.806570255756378, "epoch": 0.315, "grad_norm": 568.0, "kl_loss_10": 167.983447265625, "kl_loss_2": 1588.1090698242188, "kl_loss_3": 1201.9598571777344, "kl_loss_7": 396.2051040649414, "learning_rate": 0.0007835299319313853, "loss": 837.6727, "step": 3150 }, { "ce_loss_10": 3.5817773222923277, "ce_loss_13": 3.5118511438369753, "ce_loss_2": 4.233609986305237, "ce_loss_3": 4.0398486137390135, "ce_loss_7": 3.679852533340454, "epoch": 0.316, "grad_norm": 792.0, "kl_loss_10": 157.4121223449707, "kl_loss_2": 1557.2783630371093, "kl_loss_3": 1171.8280517578125, "kl_loss_7": 382.1309753417969, "learning_rate": 0.0007822216094333848, "loss": 839.4477, "step": 3160 }, { "ce_loss_10": 3.582335615158081, "ce_loss_13": 3.514190638065338, "ce_loss_2": 4.262095773220063, "ce_loss_3": 4.0640422105789185, "ce_loss_7": 3.687114107608795, "epoch": 0.317, "grad_norm": 752.0, "kl_loss_10": 149.11183700561523, "kl_loss_2": 1582.70458984375, "kl_loss_3": 1193.9826293945312, "kl_loss_7": 384.67308807373047, "learning_rate": 0.0007809104449669101, "loss": 818.48, "step": 3170 }, { "ce_loss_10": 3.5347620606422425, "ce_loss_13": 3.469420051574707, "ce_loss_2": 4.189749908447266, "ce_loss_3": 3.9996961116790772, "ce_loss_7": 3.639148008823395, "epoch": 0.318, "grad_norm": 612.0, "kl_loss_10": 145.1158645629883, "kl_loss_2": 1538.3842163085938, "kl_loss_3": 1168.4396179199218, "kl_loss_7": 377.5471496582031, "learning_rate": 0.0007795964517353734, "loss": 813.6468, "step": 3180 }, { "ce_loss_10": 3.5253086924552917, "ce_loss_13": 3.4601287484169005, "ce_loss_2": 4.204790925979614, "ce_loss_3": 4.0041629552841185, "ce_loss_7": 3.633437788486481, "epoch": 0.319, "grad_norm": 556.0, "kl_loss_10": 145.9946216583252, "kl_loss_2": 1610.9086120605468, "kl_loss_3": 1214.3668518066406, "kl_loss_7": 388.49003143310546, "learning_rate": 0.000778279642970672, "loss": 816.3545, "step": 3190 }, { "ce_loss_10": 3.5291069030761717, "ce_loss_13": 3.465493679046631, "ce_loss_2": 4.189659130573273, "ce_loss_3": 3.9933029651641845, "ce_loss_7": 3.637093019485474, "epoch": 0.32, "grad_norm": 720.0, "kl_loss_10": 142.4067527770996, "kl_loss_2": 1555.5811950683594, "kl_loss_3": 1166.4454162597656, "kl_loss_7": 380.82061767578125, "learning_rate": 0.0007769600319330552, "loss": 803.8339, "step": 3200 }, { "ce_loss_10": 3.562716245651245, "ce_loss_13": 3.499916505813599, "ce_loss_2": 4.260833311080932, "ce_loss_3": 4.057473051548004, "ce_loss_7": 3.6713879346847533, "epoch": 0.321, "grad_norm": 992.0, "kl_loss_10": 141.0684398651123, "kl_loss_2": 1608.2053955078125, "kl_loss_3": 1204.6956695556642, "kl_loss_7": 382.2118606567383, "learning_rate": 0.0007756376319109917, "loss": 822.6916, "step": 3210 }, { "ce_loss_10": 3.6111506104469298, "ce_loss_13": 3.5478773951530456, "ce_loss_2": 4.274733674526215, "ce_loss_3": 4.077989876270294, "ce_loss_7": 3.716237735748291, "epoch": 0.322, "grad_norm": 592.0, "kl_loss_10": 142.35418815612792, "kl_loss_2": 1549.4132995605469, "kl_loss_3": 1170.5071533203125, "kl_loss_7": 383.4806655883789, "learning_rate": 0.0007743124562210351, "loss": 802.4669, "step": 3220 }, { "ce_loss_10": 3.6234222531318663, "ce_loss_13": 3.5583123326301576, "ce_loss_2": 4.2762122631073, "ce_loss_3": 4.079313564300537, "ce_loss_7": 3.725733482837677, "epoch": 0.323, "grad_norm": 696.0, "kl_loss_10": 141.49731979370117, "kl_loss_2": 1560.1914489746093, "kl_loss_3": 1169.5012756347655, "kl_loss_7": 380.71566619873045, "learning_rate": 0.0007729845182076895, "loss": 818.9047, "step": 3230 }, { "ce_loss_10": 3.54907089471817, "ce_loss_13": 3.488371527194977, "ce_loss_2": 4.202688992023468, "ce_loss_3": 4.013319063186645, "ce_loss_7": 3.6541411519050597, "epoch": 0.324, "grad_norm": 772.0, "kl_loss_10": 135.94298095703124, "kl_loss_2": 1538.9539428710937, "kl_loss_3": 1161.7360473632812, "kl_loss_7": 374.43616180419923, "learning_rate": 0.0007716538312432765, "loss": 820.5339, "step": 3240 }, { "ce_loss_10": 3.5088236331939697, "ce_loss_13": 3.4442797899246216, "ce_loss_2": 4.2046965718269345, "ce_loss_3": 3.9894727945327757, "ce_loss_7": 3.6167898058891295, "epoch": 0.325, "grad_norm": 760.0, "kl_loss_10": 142.3162754058838, "kl_loss_2": 1605.5329223632812, "kl_loss_3": 1201.896401977539, "kl_loss_7": 388.1252075195313, "learning_rate": 0.0007703204087277988, "loss": 826.9689, "step": 3250 }, { "ce_loss_10": 3.609957015514374, "ce_loss_13": 3.5482339024543763, "ce_loss_2": 4.249799501895905, "ce_loss_3": 4.058551073074341, "ce_loss_7": 3.7137101888656616, "epoch": 0.326, "grad_norm": 620.0, "kl_loss_10": 137.27352027893068, "kl_loss_2": 1501.4761779785156, "kl_loss_3": 1131.705010986328, "kl_loss_7": 374.86394653320315, "learning_rate": 0.0007689842640888063, "loss": 797.6071, "step": 3260 }, { "ce_loss_10": 3.6040314555168154, "ce_loss_13": 3.5402695417404173, "ce_loss_2": 4.258551788330078, "ce_loss_3": 4.068223142623902, "ce_loss_7": 3.7120986700057985, "epoch": 0.327, "grad_norm": 584.0, "kl_loss_10": 139.66932640075683, "kl_loss_2": 1524.4914489746093, "kl_loss_3": 1163.2306518554688, "kl_loss_7": 381.93678894042966, "learning_rate": 0.0007676454107812607, "loss": 811.0578, "step": 3270 }, { "ce_loss_10": 3.53929922580719, "ce_loss_13": 3.4770522236824037, "ce_loss_2": 4.213063597679138, "ce_loss_3": 4.026750934123993, "ce_loss_7": 3.6522504925727843, "epoch": 0.328, "grad_norm": 784.0, "kl_loss_10": 140.27813835144042, "kl_loss_2": 1568.5742309570312, "kl_loss_3": 1193.110528564453, "kl_loss_7": 393.0425598144531, "learning_rate": 0.0007663038622873999, "loss": 813.8143, "step": 3280 }, { "ce_loss_10": 3.578033113479614, "ce_loss_13": 3.516423726081848, "ce_loss_2": 4.246110367774963, "ce_loss_3": 4.043308067321777, "ce_loss_7": 3.6857515454292296, "epoch": 0.329, "grad_norm": 824.0, "kl_loss_10": 139.4735927581787, "kl_loss_2": 1560.1141662597656, "kl_loss_3": 1172.8965942382813, "kl_loss_7": 398.01239624023435, "learning_rate": 0.0007649596321166025, "loss": 805.2687, "step": 3290 }, { "ce_loss_10": 3.4800352215766908, "ce_loss_13": 3.421827828884125, "ce_loss_2": 4.145622873306275, "ce_loss_3": 3.954922652244568, "ce_loss_7": 3.5925204157829285, "epoch": 0.33, "grad_norm": 680.0, "kl_loss_10": 134.23374557495117, "kl_loss_2": 1532.8545349121093, "kl_loss_3": 1164.9057250976562, "kl_loss_7": 383.72555084228514, "learning_rate": 0.0007636127338052513, "loss": 811.8129, "step": 3300 }, { "ce_loss_10": 3.5890893220901487, "ce_loss_13": 3.5242863059043885, "ce_loss_2": 4.25543829202652, "ce_loss_3": 4.057149302959442, "ce_loss_7": 3.698796546459198, "epoch": 0.331, "grad_norm": 616.0, "kl_loss_10": 140.55243148803712, "kl_loss_2": 1560.8628784179687, "kl_loss_3": 1176.2081298828125, "kl_loss_7": 390.31951293945315, "learning_rate": 0.0007622631809165971, "loss": 803.85, "step": 3310 }, { "ce_loss_10": 3.5855463981628417, "ce_loss_13": 3.5262409806251527, "ce_loss_2": 4.216806030273437, "ce_loss_3": 4.027826583385467, "ce_loss_7": 3.6855634450912476, "epoch": 0.332, "grad_norm": 458.0, "kl_loss_10": 131.1049461364746, "kl_loss_2": 1471.740216064453, "kl_loss_3": 1107.5941833496095, "kl_loss_7": 363.5269348144531, "learning_rate": 0.000760910987040623, "loss": 786.43, "step": 3320 }, { "ce_loss_10": 3.5629027485847473, "ce_loss_13": 3.502229619026184, "ce_loss_2": 4.248357820510864, "ce_loss_3": 4.046060919761658, "ce_loss_7": 3.6763986110687257, "epoch": 0.333, "grad_norm": 580.0, "kl_loss_10": 138.26232109069824, "kl_loss_2": 1589.9033935546875, "kl_loss_3": 1202.1307739257813, "kl_loss_7": 389.39125366210936, "learning_rate": 0.000759556165793906, "loss": 806.7218, "step": 3330 }, { "ce_loss_10": 3.593952786922455, "ce_loss_13": 3.532469391822815, "ce_loss_2": 4.24890683889389, "ce_loss_3": 4.057988488674164, "ce_loss_7": 3.69755597114563, "epoch": 0.334, "grad_norm": 632.0, "kl_loss_10": 136.70615882873534, "kl_loss_2": 1542.6665771484375, "kl_loss_3": 1160.2529327392579, "kl_loss_7": 375.4426513671875, "learning_rate": 0.000758198730819481, "loss": 814.5036, "step": 3340 }, { "ce_loss_10": 3.5359851241111757, "ce_loss_13": 3.4757973551750183, "ce_loss_2": 4.205133056640625, "ce_loss_3": 4.011187362670898, "ce_loss_7": 3.6422808527946473, "epoch": 0.335, "grad_norm": 676.0, "kl_loss_10": 134.6895553588867, "kl_loss_2": 1573.4298767089845, "kl_loss_3": 1188.663296508789, "kl_loss_7": 378.12191619873045, "learning_rate": 0.0007568386957867032, "loss": 813.6836, "step": 3350 }, { "ce_loss_10": 3.6098239421844482, "ce_loss_13": 3.5456187248229982, "ce_loss_2": 4.260682666301728, "ce_loss_3": 4.070243191719055, "ce_loss_7": 3.7122027039527894, "epoch": 0.336, "grad_norm": 780.0, "kl_loss_10": 136.96125411987305, "kl_loss_2": 1524.2330810546875, "kl_loss_3": 1148.0459899902344, "kl_loss_7": 377.05298767089846, "learning_rate": 0.0007554760743911103, "loss": 810.5187, "step": 3360 }, { "ce_loss_10": 3.507435417175293, "ce_loss_13": 3.44707133769989, "ce_loss_2": 4.168538379669189, "ce_loss_3": 3.969741427898407, "ce_loss_7": 3.6143924474716185, "epoch": 0.337, "grad_norm": 800.0, "kl_loss_10": 133.15070648193358, "kl_loss_2": 1558.3870727539063, "kl_loss_3": 1166.5326843261719, "kl_loss_7": 373.68982391357423, "learning_rate": 0.0007541108803542846, "loss": 823.2562, "step": 3370 }, { "ce_loss_10": 3.556701052188873, "ce_loss_13": 3.4952089309692385, "ce_loss_2": 4.213332033157348, "ce_loss_3": 4.01972188949585, "ce_loss_7": 3.666098403930664, "epoch": 0.338, "grad_norm": 632.0, "kl_loss_10": 137.10250129699708, "kl_loss_2": 1548.2002746582032, "kl_loss_3": 1163.8560943603516, "kl_loss_7": 378.4952331542969, "learning_rate": 0.0007527431274237149, "loss": 839.8433, "step": 3380 }, { "ce_loss_10": 3.5281825184822084, "ce_loss_13": 3.469517374038696, "ce_loss_2": 4.175353538990021, "ce_loss_3": 3.982305443286896, "ce_loss_7": 3.630697858333588, "epoch": 0.339, "grad_norm": 604.0, "kl_loss_10": 134.46246299743652, "kl_loss_2": 1533.5295776367188, "kl_loss_3": 1153.648422241211, "kl_loss_7": 372.5039978027344, "learning_rate": 0.0007513728293726579, "loss": 803.5898, "step": 3390 }, { "ce_loss_10": 3.6488207459449766, "ce_loss_13": 3.587409019470215, "ce_loss_2": 4.293999433517456, "ce_loss_3": 4.103942286968231, "ce_loss_7": 3.7538707733154295, "epoch": 0.34, "grad_norm": 568.0, "kl_loss_10": 137.6070526123047, "kl_loss_2": 1523.785009765625, "kl_loss_3": 1146.2487243652345, "kl_loss_7": 378.4647155761719, "learning_rate": 0.00075, "loss": 796.8509, "step": 3400 }, { "ce_loss_10": 3.635476815700531, "ce_loss_13": 3.5716666340827943, "ce_loss_2": 4.305250811576843, "ce_loss_3": 4.109015083312988, "ce_loss_7": 3.745475196838379, "epoch": 0.341, "grad_norm": 608.0, "kl_loss_10": 140.0908172607422, "kl_loss_2": 1556.6490234375, "kl_loss_3": 1162.7167419433595, "kl_loss_7": 384.58277435302733, "learning_rate": 0.0007486246531301177, "loss": 802.0264, "step": 3410 }, { "ce_loss_10": 3.4416700124740602, "ce_loss_13": 3.3813814163208007, "ce_loss_2": 4.116898477077484, "ce_loss_3": 3.9193392276763914, "ce_loss_7": 3.5528218507766725, "epoch": 0.342, "grad_norm": 668.0, "kl_loss_10": 134.79825401306152, "kl_loss_2": 1559.6565307617188, "kl_loss_3": 1172.1594482421874, "kl_loss_7": 377.6139343261719, "learning_rate": 0.0007472468026127384, "loss": 799.9335, "step": 3420 }, { "ce_loss_10": 3.577793312072754, "ce_loss_13": 3.511690676212311, "ce_loss_2": 4.266387677192688, "ce_loss_3": 4.059280645847321, "ce_loss_7": 3.6900326371192933, "epoch": 0.343, "grad_norm": 592.0, "kl_loss_10": 141.6757396697998, "kl_loss_2": 1606.4993103027343, "kl_loss_3": 1208.6977478027343, "kl_loss_7": 394.94551849365234, "learning_rate": 0.000745866462322802, "loss": 828.0714, "step": 3430 }, { "ce_loss_10": 3.563853549957275, "ce_loss_13": 3.505466651916504, "ce_loss_2": 4.219634628295898, "ce_loss_3": 4.022562730312347, "ce_loss_7": 3.6712709188461305, "epoch": 0.344, "grad_norm": 576.0, "kl_loss_10": 133.0882568359375, "kl_loss_2": 1511.8997375488282, "kl_loss_3": 1135.5877319335937, "kl_loss_7": 369.4819305419922, "learning_rate": 0.0007444836461603195, "loss": 797.3501, "step": 3440 }, { "ce_loss_10": 3.6264248490333557, "ce_loss_13": 3.5626481413841247, "ce_loss_2": 4.290192425251007, "ce_loss_3": 4.099602663516999, "ce_loss_7": 3.734038972854614, "epoch": 0.345, "grad_norm": 592.0, "kl_loss_10": 139.87619667053224, "kl_loss_2": 1568.7929321289062, "kl_loss_3": 1192.3937042236328, "kl_loss_7": 387.790544128418, "learning_rate": 0.0007430983680502344, "loss": 820.2707, "step": 3450 }, { "ce_loss_10": 3.468415367603302, "ce_loss_13": 3.4085907101631165, "ce_loss_2": 4.147791481018066, "ce_loss_3": 3.9522446393966675, "ce_loss_7": 3.5773945450782776, "epoch": 0.346, "grad_norm": 524.0, "kl_loss_10": 138.47050895690919, "kl_loss_2": 1571.893621826172, "kl_loss_3": 1183.226983642578, "kl_loss_7": 380.1432510375977, "learning_rate": 0.0007417106419422819, "loss": 814.8158, "step": 3460 }, { "ce_loss_10": 3.578550028800964, "ce_loss_13": 3.51279159784317, "ce_loss_2": 4.240513134002685, "ce_loss_3": 4.041714072227478, "ce_loss_7": 3.684761953353882, "epoch": 0.347, "grad_norm": 708.0, "kl_loss_10": 139.78108291625978, "kl_loss_2": 1533.4877563476562, "kl_loss_3": 1153.673779296875, "kl_loss_7": 374.2180770874023, "learning_rate": 0.0007403204818108486, "loss": 807.0902, "step": 3470 }, { "ce_loss_10": 3.548770797252655, "ce_loss_13": 3.486237347126007, "ce_loss_2": 4.2026319146156315, "ce_loss_3": 4.003607368469238, "ce_loss_7": 3.6532763123512266, "epoch": 0.348, "grad_norm": 596.0, "kl_loss_10": 144.19391098022462, "kl_loss_2": 1553.6705322265625, "kl_loss_3": 1160.6052368164062, "kl_loss_7": 378.1271469116211, "learning_rate": 0.0007389279016548316, "loss": 788.2532, "step": 3480 }, { "ce_loss_10": 3.5647180557250975, "ce_loss_13": 3.492380142211914, "ce_loss_2": 4.253465700149536, "ce_loss_3": 4.037442588806153, "ce_loss_7": 3.6645753383636475, "epoch": 0.349, "grad_norm": 732.0, "kl_loss_10": 149.7018730163574, "kl_loss_2": 1606.8699096679688, "kl_loss_3": 1193.213555908203, "kl_loss_7": 386.1459732055664, "learning_rate": 0.0007375329154974975, "loss": 825.1197, "step": 3490 }, { "ce_loss_10": 3.5201268196105957, "ce_loss_13": 3.4533039331436157, "ce_loss_2": 4.166223227977753, "ce_loss_3": 3.9767157316207884, "ce_loss_7": 3.6191452860832216, "epoch": 0.35, "grad_norm": 584.0, "kl_loss_10": 144.2131031036377, "kl_loss_2": 1530.7996887207032, "kl_loss_3": 1156.2872650146485, "kl_loss_7": 371.8103332519531, "learning_rate": 0.0007361355373863414, "loss": 814.7808, "step": 3500 }, { "ce_loss_10": 3.5710195899009705, "ce_loss_13": 3.508971703052521, "ce_loss_2": 4.213377046585083, "ce_loss_3": 4.024645984172821, "ce_loss_7": 3.67548463344574, "epoch": 0.351, "grad_norm": 736.0, "kl_loss_10": 140.72288818359374, "kl_loss_2": 1512.9147888183593, "kl_loss_3": 1137.2003234863282, "kl_loss_7": 372.58856811523435, "learning_rate": 0.0007347357813929454, "loss": 814.5176, "step": 3510 }, { "ce_loss_10": 3.5162337183952332, "ce_loss_13": 3.4520907759666444, "ce_loss_2": 4.1670368075370785, "ce_loss_3": 3.9769131183624267, "ce_loss_7": 3.61603764295578, "epoch": 0.352, "grad_norm": 656.0, "kl_loss_10": 138.93255500793458, "kl_loss_2": 1512.7831420898438, "kl_loss_3": 1140.8003845214844, "kl_loss_7": 368.38318023681643, "learning_rate": 0.0007333336616128369, "loss": 806.3783, "step": 3520 }, { "ce_loss_10": 3.488769805431366, "ce_loss_13": 3.4252532839775087, "ce_loss_2": 4.161493599414825, "ce_loss_3": 3.9646050333976746, "ce_loss_7": 3.595223593711853, "epoch": 0.353, "grad_norm": 548.0, "kl_loss_10": 138.03040084838867, "kl_loss_2": 1568.9273254394532, "kl_loss_3": 1179.4151916503906, "kl_loss_7": 383.01140747070315, "learning_rate": 0.0007319291921653463, "loss": 814.4657, "step": 3530 }, { "ce_loss_10": 3.5761617183685304, "ce_loss_13": 3.5123104214668275, "ce_loss_2": 4.247880482673645, "ce_loss_3": 4.050547051429748, "ce_loss_7": 3.68228440284729, "epoch": 0.354, "grad_norm": 808.0, "kl_loss_10": 141.07060623168945, "kl_loss_2": 1567.1333923339844, "kl_loss_3": 1178.535693359375, "kl_loss_7": 381.32129669189453, "learning_rate": 0.0007305223871934656, "loss": 802.0609, "step": 3540 }, { "ce_loss_10": 3.540350914001465, "ce_loss_13": 3.4772907376289366, "ce_loss_2": 4.197093963623047, "ce_loss_3": 4.001269197463989, "ce_loss_7": 3.647739040851593, "epoch": 0.355, "grad_norm": 580.0, "kl_loss_10": 140.28063926696777, "kl_loss_2": 1533.3375122070313, "kl_loss_3": 1148.1182403564453, "kl_loss_7": 375.75494995117185, "learning_rate": 0.0007291132608637052, "loss": 801.6683, "step": 3550 }, { "ce_loss_10": 3.4978664398193358, "ce_loss_13": 3.4386332392692567, "ce_loss_2": 4.202150619029998, "ce_loss_3": 3.97188538312912, "ce_loss_7": 3.6044459462165834, "epoch": 0.356, "grad_norm": 676.0, "kl_loss_10": 133.73454742431642, "kl_loss_2": 1596.5189636230468, "kl_loss_3": 1160.679183959961, "kl_loss_7": 371.6182601928711, "learning_rate": 0.0007277018273659516, "loss": 819.2582, "step": 3560 }, { "ce_loss_10": 3.628795838356018, "ce_loss_13": 3.56366685628891, "ce_loss_2": 4.295041692256928, "ce_loss_3": 4.1010064601898195, "ce_loss_7": 3.7392033100128175, "epoch": 0.357, "grad_norm": 536.0, "kl_loss_10": 141.76525268554687, "kl_loss_2": 1568.0928771972656, "kl_loss_3": 1175.8153198242187, "kl_loss_7": 388.54504241943357, "learning_rate": 0.0007262881009133242, "loss": 816.7139, "step": 3570 }, { "ce_loss_10": 3.5417507767677305, "ce_loss_13": 3.4833286881446837, "ce_loss_2": 4.192662954330444, "ce_loss_3": 4.004778635501862, "ce_loss_7": 3.6475730895996095, "epoch": 0.358, "grad_norm": 576.0, "kl_loss_10": 134.09114761352538, "kl_loss_2": 1537.5412170410157, "kl_loss_3": 1149.5142639160156, "kl_loss_7": 372.7196243286133, "learning_rate": 0.0007248720957420329, "loss": 793.2854, "step": 3580 }, { "ce_loss_10": 3.558194160461426, "ce_loss_13": 3.499031662940979, "ce_loss_2": 4.207183480262756, "ce_loss_3": 4.008278286457061, "ce_loss_7": 3.6595579862594603, "epoch": 0.359, "grad_norm": 668.0, "kl_loss_10": 134.52191429138185, "kl_loss_2": 1505.1361145019532, "kl_loss_3": 1117.103219604492, "kl_loss_7": 369.4859024047852, "learning_rate": 0.0007234538261112341, "loss": 793.0623, "step": 3590 }, { "ce_loss_10": 3.590154302120209, "ce_loss_13": 3.5281407237052917, "ce_loss_2": 4.254630589485169, "ce_loss_3": 4.051980185508728, "ce_loss_7": 3.6962651371955872, "epoch": 0.36, "grad_norm": 462.0, "kl_loss_10": 136.44585952758788, "kl_loss_2": 1546.1867126464845, "kl_loss_3": 1153.344775390625, "kl_loss_7": 380.6938873291016, "learning_rate": 0.0007220333063028871, "loss": 793.3124, "step": 3600 }, { "ce_loss_10": 3.618264949321747, "ce_loss_13": 3.5577764391899107, "ce_loss_2": 4.310294914245605, "ce_loss_3": 4.092818439006805, "ce_loss_7": 3.725991404056549, "epoch": 0.361, "grad_norm": 892.0, "kl_loss_10": 137.56422424316406, "kl_loss_2": 1628.0519165039063, "kl_loss_3": 1198.5521118164063, "kl_loss_7": 399.71742095947263, "learning_rate": 0.0007206105506216106, "loss": 830.3656, "step": 3610 }, { "ce_loss_10": 3.500383186340332, "ce_loss_13": 3.4409209847450257, "ce_loss_2": 4.152735877037048, "ce_loss_3": 3.957272839546204, "ce_loss_7": 3.6082523345947264, "epoch": 0.362, "grad_norm": 768.0, "kl_loss_10": 133.25403366088867, "kl_loss_2": 1518.7402770996093, "kl_loss_3": 1139.5292907714843, "kl_loss_7": 378.18980712890624, "learning_rate": 0.0007191855733945387, "loss": 786.9895, "step": 3620 }, { "ce_loss_10": 3.5967095613479616, "ce_loss_13": 3.5339751839637756, "ce_loss_2": 4.247948789596558, "ce_loss_3": 4.048540914058686, "ce_loss_7": 3.702574074268341, "epoch": 0.363, "grad_norm": 672.0, "kl_loss_10": 134.42433967590333, "kl_loss_2": 1527.0834228515625, "kl_loss_3": 1137.2749816894532, "kl_loss_7": 374.51196441650393, "learning_rate": 0.0007177583889711762, "loss": 793.3341, "step": 3630 }, { "ce_loss_10": 3.512792682647705, "ce_loss_13": 3.450398051738739, "ce_loss_2": 4.175726044178009, "ce_loss_3": 3.969858479499817, "ce_loss_7": 3.6202203273773192, "epoch": 0.364, "grad_norm": 536.0, "kl_loss_10": 136.62388954162597, "kl_loss_2": 1562.967547607422, "kl_loss_3": 1163.4830383300782, "kl_loss_7": 382.96795043945315, "learning_rate": 0.0007163290117232541, "loss": 807.9524, "step": 3640 }, { "ce_loss_10": 3.6289470553398133, "ce_loss_13": 3.5686827301979065, "ce_loss_2": 4.252556777000427, "ce_loss_3": 4.061027491092682, "ce_loss_7": 3.728983438014984, "epoch": 0.365, "grad_norm": 676.0, "kl_loss_10": 134.2694351196289, "kl_loss_2": 1484.0365478515625, "kl_loss_3": 1115.571533203125, "kl_loss_7": 372.05673828125, "learning_rate": 0.0007148974560445859, "loss": 788.3101, "step": 3650 }, { "ce_loss_10": 3.549181044101715, "ce_loss_13": 3.487704300880432, "ce_loss_2": 4.190627813339233, "ce_loss_3": 3.9991440176963806, "ce_loss_7": 3.650784492492676, "epoch": 0.366, "grad_norm": 588.0, "kl_loss_10": 133.14555854797362, "kl_loss_2": 1493.0444396972657, "kl_loss_3": 1131.270849609375, "kl_loss_7": 370.2608352661133, "learning_rate": 0.0007134637363509209, "loss": 781.269, "step": 3660 }, { "ce_loss_10": 3.656008231639862, "ce_loss_13": 3.5968250274658202, "ce_loss_2": 4.288893938064575, "ce_loss_3": 4.1020159244537355, "ce_loss_7": 3.758394181728363, "epoch": 0.367, "grad_norm": 624.0, "kl_loss_10": 132.44244766235352, "kl_loss_2": 1477.2258239746093, "kl_loss_3": 1114.760919189453, "kl_loss_7": 362.33375549316406, "learning_rate": 0.0007120278670798009, "loss": 789.3051, "step": 3670 }, { "ce_loss_10": 3.4513864398002623, "ce_loss_13": 3.390084111690521, "ce_loss_2": 4.156805229187012, "ce_loss_3": 3.942430257797241, "ce_loss_7": 3.5631762027740477, "epoch": 0.368, "grad_norm": 852.0, "kl_loss_10": 136.21654624938964, "kl_loss_2": 1617.9169982910157, "kl_loss_3": 1207.7436462402343, "kl_loss_7": 385.1312530517578, "learning_rate": 0.0007105898626904133, "loss": 833.6849, "step": 3680 }, { "ce_loss_10": 3.557909631729126, "ce_loss_13": 3.496003878116608, "ce_loss_2": 4.215323185920715, "ce_loss_3": 4.025267434120178, "ce_loss_7": 3.660618233680725, "epoch": 0.369, "grad_norm": 486.0, "kl_loss_10": 136.01496505737305, "kl_loss_2": 1527.9191650390626, "kl_loss_3": 1153.4224945068358, "kl_loss_7": 371.0873062133789, "learning_rate": 0.0007091497376634463, "loss": 787.2614, "step": 3690 }, { "ce_loss_10": 3.5013809204101562, "ce_loss_13": 3.4402984261512755, "ce_loss_2": 4.1529758214950565, "ce_loss_3": 3.961398553848267, "ce_loss_7": 3.6025787115097048, "epoch": 0.37, "grad_norm": 684.0, "kl_loss_10": 136.102490234375, "kl_loss_2": 1516.0378479003907, "kl_loss_3": 1141.0336791992188, "kl_loss_7": 368.23791046142577, "learning_rate": 0.0007077075065009433, "loss": 806.8564, "step": 3700 }, { "ce_loss_10": 3.6074369311332704, "ce_loss_13": 3.5442083716392516, "ce_loss_2": 4.27118090391159, "ce_loss_3": 4.079712843894958, "ce_loss_7": 3.7118129253387453, "epoch": 0.371, "grad_norm": 616.0, "kl_loss_10": 141.17327156066895, "kl_loss_2": 1545.3048278808594, "kl_loss_3": 1174.181103515625, "kl_loss_7": 378.79654998779296, "learning_rate": 0.0007062631837261557, "loss": 803.7765, "step": 3710 }, { "ce_loss_10": 3.4776635646820067, "ce_loss_13": 3.417153787612915, "ce_loss_2": 4.138115549087525, "ce_loss_3": 3.947688353061676, "ce_loss_7": 3.583261823654175, "epoch": 0.372, "grad_norm": 912.0, "kl_loss_10": 136.18754692077636, "kl_loss_2": 1542.702392578125, "kl_loss_3": 1155.0931549072266, "kl_loss_7": 374.73779144287107, "learning_rate": 0.0007048167838833977, "loss": 812.6596, "step": 3720 }, { "ce_loss_10": 3.5752769351005553, "ce_loss_13": 3.513002848625183, "ce_loss_2": 4.210619521141052, "ce_loss_3": 4.022019147872925, "ce_loss_7": 3.6798208355903625, "epoch": 0.373, "grad_norm": 768.0, "kl_loss_10": 136.63083267211914, "kl_loss_2": 1506.4640258789063, "kl_loss_3": 1132.6720397949218, "kl_loss_7": 377.5881011962891, "learning_rate": 0.0007033683215379002, "loss": 791.4403, "step": 3730 }, { "ce_loss_10": 3.5619895219802857, "ce_loss_13": 3.4989688754081727, "ce_loss_2": 4.2153314590454105, "ce_loss_3": 4.0190078020095825, "ce_loss_7": 3.6668317794799803, "epoch": 0.374, "grad_norm": 728.0, "kl_loss_10": 134.1514114379883, "kl_loss_2": 1513.4870727539062, "kl_loss_3": 1134.2889343261718, "kl_loss_7": 369.0950271606445, "learning_rate": 0.0007019178112756625, "loss": 803.4245, "step": 3740 }, { "ce_loss_10": 3.52027747631073, "ce_loss_13": 3.4614068984985353, "ce_loss_2": 4.172631430625915, "ce_loss_3": 3.9826321721076967, "ce_loss_7": 3.623573863506317, "epoch": 0.375, "grad_norm": 720.0, "kl_loss_10": 133.24607543945314, "kl_loss_2": 1508.3497802734375, "kl_loss_3": 1138.9531616210938, "kl_loss_7": 371.3535675048828, "learning_rate": 0.0007004652677033068, "loss": 797.0561, "step": 3750 }, { "ce_loss_10": 3.598753345012665, "ce_loss_13": 3.5414743185043336, "ce_loss_2": 4.221561062335968, "ce_loss_3": 4.034064853191376, "ce_loss_7": 3.697866952419281, "epoch": 0.376, "grad_norm": 620.0, "kl_loss_10": 131.1568790435791, "kl_loss_2": 1477.522637939453, "kl_loss_3": 1113.0271392822265, "kl_loss_7": 362.1108154296875, "learning_rate": 0.0006990107054479312, "loss": 785.3042, "step": 3760 }, { "ce_loss_10": 3.5856411337852476, "ce_loss_13": 3.523606741428375, "ce_loss_2": 4.220624828338623, "ce_loss_3": 4.039202105998993, "ce_loss_7": 3.686489188671112, "epoch": 0.377, "grad_norm": 784.0, "kl_loss_10": 134.56854705810548, "kl_loss_2": 1499.6362426757812, "kl_loss_3": 1135.1793060302734, "kl_loss_7": 367.8611801147461, "learning_rate": 0.000697554139156961, "loss": 789.6146, "step": 3770 }, { "ce_loss_10": 3.571768081188202, "ce_loss_13": 3.5090900897979735, "ce_loss_2": 4.22368232011795, "ce_loss_3": 4.029348587989807, "ce_loss_7": 3.6724517226219175, "epoch": 0.378, "grad_norm": 652.0, "kl_loss_10": 139.37470092773438, "kl_loss_2": 1539.3553405761718, "kl_loss_3": 1153.9761901855468, "kl_loss_7": 376.85098724365236, "learning_rate": 0.0006960955834980027, "loss": 789.6849, "step": 3780 }, { "ce_loss_10": 3.5432674288749695, "ce_loss_13": 3.481638765335083, "ce_loss_2": 4.195074439048767, "ce_loss_3": 4.005574572086334, "ce_loss_7": 3.645065152645111, "epoch": 0.379, "grad_norm": 668.0, "kl_loss_10": 137.8487949371338, "kl_loss_2": 1516.3669982910155, "kl_loss_3": 1146.334066772461, "kl_loss_7": 368.8575042724609, "learning_rate": 0.0006946350531586958, "loss": 794.4263, "step": 3790 }, { "ce_loss_10": 3.563885974884033, "ce_loss_13": 3.5041411876678468, "ce_loss_2": 4.217035782337189, "ce_loss_3": 4.025569212436676, "ce_loss_7": 3.670181393623352, "epoch": 0.38, "grad_norm": 872.0, "kl_loss_10": 134.99261245727538, "kl_loss_2": 1515.9988891601563, "kl_loss_3": 1138.6263427734375, "kl_loss_7": 365.9822494506836, "learning_rate": 0.0006931725628465643, "loss": 804.7092, "step": 3800 }, { "ce_loss_10": 3.5913591384887695, "ce_loss_13": 3.527244985103607, "ce_loss_2": 4.242334771156311, "ce_loss_3": 4.055222499370575, "ce_loss_7": 3.6929625034332276, "epoch": 0.381, "grad_norm": 864.0, "kl_loss_10": 138.56078910827637, "kl_loss_2": 1516.233056640625, "kl_loss_3": 1147.4771270751953, "kl_loss_7": 371.2011444091797, "learning_rate": 0.0006917081272888696, "loss": 799.2829, "step": 3810 }, { "ce_loss_10": 3.483885133266449, "ce_loss_13": 3.423719954490662, "ce_loss_2": 4.133236110210419, "ce_loss_3": 3.9456497192382813, "ce_loss_7": 3.592355155944824, "epoch": 0.382, "grad_norm": 596.0, "kl_loss_10": 137.18110198974608, "kl_loss_2": 1511.9404541015624, "kl_loss_3": 1152.5890838623047, "kl_loss_7": 371.8784881591797, "learning_rate": 0.0006902417612324615, "loss": 790.0969, "step": 3820 }, { "ce_loss_10": 3.6188692688941955, "ce_loss_13": 3.555316996574402, "ce_loss_2": 4.287074863910675, "ce_loss_3": 4.092138230800629, "ce_loss_7": 3.7254763722419737, "epoch": 0.383, "grad_norm": 792.0, "kl_loss_10": 141.66006164550782, "kl_loss_2": 1560.3333679199218, "kl_loss_3": 1187.6506469726562, "kl_loss_7": 385.8511306762695, "learning_rate": 0.00068877347944363, "loss": 808.5316, "step": 3830 }, { "ce_loss_10": 3.6141200184822084, "ce_loss_13": 3.5523874282836916, "ce_loss_2": 4.2468698740005495, "ce_loss_3": 4.064640355110169, "ce_loss_7": 3.715350341796875, "epoch": 0.384, "grad_norm": 708.0, "kl_loss_10": 136.68106803894042, "kl_loss_2": 1492.917822265625, "kl_loss_3": 1138.3358154296875, "kl_loss_7": 369.60352630615233, "learning_rate": 0.0006873032967079561, "loss": 799.188, "step": 3840 }, { "ce_loss_10": 3.5983062267303465, "ce_loss_13": 3.54010808467865, "ce_loss_2": 4.215501749515534, "ce_loss_3": 4.038501214981079, "ce_loss_7": 3.696172285079956, "epoch": 0.385, "grad_norm": 700.0, "kl_loss_10": 132.26292190551757, "kl_loss_2": 1474.6833190917969, "kl_loss_3": 1124.775860595703, "kl_loss_7": 362.6510437011719, "learning_rate": 0.0006858312278301637, "loss": 777.6832, "step": 3850 }, { "ce_loss_10": 3.6401477217674256, "ce_loss_13": 3.5812854051589964, "ce_loss_2": 4.2578874111175535, "ce_loss_3": 4.075009536743164, "ce_loss_7": 3.7349116444587707, "epoch": 0.386, "grad_norm": 716.0, "kl_loss_10": 133.9440372467041, "kl_loss_2": 1480.8558410644532, "kl_loss_3": 1115.9467712402343, "kl_loss_7": 363.8341888427734, "learning_rate": 0.0006843572876339704, "loss": 778.8414, "step": 3860 }, { "ce_loss_10": 3.5566078424453735, "ce_loss_13": 3.499259579181671, "ce_loss_2": 4.1666911244392395, "ce_loss_3": 3.9863228678703306, "ce_loss_7": 3.651654100418091, "epoch": 0.387, "grad_norm": 712.0, "kl_loss_10": 128.78639640808106, "kl_loss_2": 1448.728778076172, "kl_loss_3": 1092.9980010986328, "kl_loss_7": 353.82066497802737, "learning_rate": 0.0006828814909619373, "loss": 789.1794, "step": 3870 }, { "ce_loss_10": 3.68144371509552, "ce_loss_13": 3.618389356136322, "ce_loss_2": 4.313943779468536, "ce_loss_3": 4.125034952163697, "ce_loss_7": 3.7789146065711976, "epoch": 0.388, "grad_norm": 564.0, "kl_loss_10": 137.6860321044922, "kl_loss_2": 1484.8064819335937, "kl_loss_3": 1117.9655303955078, "kl_loss_7": 368.4295623779297, "learning_rate": 0.0006814038526753205, "loss": 776.0895, "step": 3880 }, { "ce_loss_10": 3.5763523101806642, "ce_loss_13": 3.5149505019187925, "ce_loss_2": 4.210803210735321, "ce_loss_3": 4.024750709533691, "ce_loss_7": 3.6787024259567263, "epoch": 0.389, "grad_norm": 540.0, "kl_loss_10": 134.68067474365233, "kl_loss_2": 1505.5961059570313, "kl_loss_3": 1129.528253173828, "kl_loss_7": 365.9979309082031, "learning_rate": 0.0006799243876539213, "loss": 785.1848, "step": 3890 }, { "ce_loss_10": 3.4995123624801634, "ce_loss_13": 3.4395276427268984, "ce_loss_2": 4.165191233158112, "ce_loss_3": 3.963883662223816, "ce_loss_7": 3.6026421189308167, "epoch": 0.39, "grad_norm": 856.0, "kl_loss_10": 132.61898651123047, "kl_loss_2": 1536.6189819335937, "kl_loss_3": 1135.7314849853515, "kl_loss_7": 364.9299591064453, "learning_rate": 0.0006784431107959359, "loss": 796.0479, "step": 3900 }, { "ce_loss_10": 3.5600151419639587, "ce_loss_13": 3.4963993072509765, "ce_loss_2": 4.227058172225952, "ce_loss_3": 4.032915997505188, "ce_loss_7": 3.669356656074524, "epoch": 0.391, "grad_norm": 804.0, "kl_loss_10": 136.25931625366212, "kl_loss_2": 1560.3639404296875, "kl_loss_3": 1170.0307891845703, "kl_loss_7": 379.51966705322263, "learning_rate": 0.0006769600370178059, "loss": 800.6438, "step": 3910 }, { "ce_loss_10": 3.5234851360321047, "ce_loss_13": 3.463622975349426, "ce_loss_2": 4.184793496131897, "ce_loss_3": 3.9945266962051393, "ce_loss_7": 3.634400510787964, "epoch": 0.392, "grad_norm": 572.0, "kl_loss_10": 132.96168327331543, "kl_loss_2": 1528.766912841797, "kl_loss_3": 1149.3914581298827, "kl_loss_7": 369.23984985351564, "learning_rate": 0.0006754751812540679, "loss": 781.9199, "step": 3920 }, { "ce_loss_10": 3.573040223121643, "ce_loss_13": 3.5096162438392637, "ce_loss_2": 4.2254300832748415, "ce_loss_3": 4.032295274734497, "ce_loss_7": 3.6753496289253236, "epoch": 0.393, "grad_norm": 776.0, "kl_loss_10": 137.39977340698243, "kl_loss_2": 1524.5934692382812, "kl_loss_3": 1142.2673461914062, "kl_loss_7": 372.6196746826172, "learning_rate": 0.0006739885584572025, "loss": 799.1489, "step": 3930 }, { "ce_loss_10": 3.6011941909790037, "ce_loss_13": 3.5389833211898805, "ce_loss_2": 4.258283352851867, "ce_loss_3": 4.056079113483429, "ce_loss_7": 3.706939959526062, "epoch": 0.394, "grad_norm": 760.0, "kl_loss_10": 138.09807777404785, "kl_loss_2": 1554.7301696777345, "kl_loss_3": 1153.3549835205079, "kl_loss_7": 372.54356536865237, "learning_rate": 0.0006725001835974853, "loss": 791.1359, "step": 3940 }, { "ce_loss_10": 3.5847023248672487, "ce_loss_13": 3.5242226362228393, "ce_loss_2": 4.236932027339935, "ce_loss_3": 4.044707441329956, "ce_loss_7": 3.6880187392234802, "epoch": 0.395, "grad_norm": 604.0, "kl_loss_10": 137.06708946228028, "kl_loss_2": 1531.090771484375, "kl_loss_3": 1148.1961975097656, "kl_loss_7": 372.88367767333983, "learning_rate": 0.0006710100716628344, "loss": 781.9915, "step": 3950 }, { "ce_loss_10": 3.568317210674286, "ce_loss_13": 3.5080092191696166, "ce_loss_2": 4.221521747112274, "ce_loss_3": 4.030601763725281, "ce_loss_7": 3.673730731010437, "epoch": 0.396, "grad_norm": 732.0, "kl_loss_10": 134.04078521728516, "kl_loss_2": 1522.4079895019531, "kl_loss_3": 1151.196096801758, "kl_loss_7": 372.5210220336914, "learning_rate": 0.0006695182376586602, "loss": 800.3014, "step": 3960 }, { "ce_loss_10": 3.6057994961738586, "ce_loss_13": 3.5476833462715147, "ce_loss_2": 4.220119166374206, "ce_loss_3": 4.040166866779328, "ce_loss_7": 3.705691361427307, "epoch": 0.397, "grad_norm": 940.0, "kl_loss_10": 128.9409210205078, "kl_loss_2": 1444.9400512695313, "kl_loss_3": 1091.2126403808593, "kl_loss_7": 357.9466751098633, "learning_rate": 0.000668024696607715, "loss": 783.9521, "step": 3970 }, { "ce_loss_10": 3.5564019203186037, "ce_loss_13": 3.4956687688827515, "ce_loss_2": 4.194620299339294, "ce_loss_3": 4.004274892807007, "ce_loss_7": 3.6527443528175354, "epoch": 0.398, "grad_norm": 704.0, "kl_loss_10": 134.44026298522948, "kl_loss_2": 1512.640576171875, "kl_loss_3": 1141.9819427490233, "kl_loss_7": 367.26869049072263, "learning_rate": 0.0006665294635499404, "loss": 789.8477, "step": 3980 }, { "ce_loss_10": 3.567546045780182, "ce_loss_13": 3.5039931416511534, "ce_loss_2": 4.231116080284119, "ce_loss_3": 4.034705054759979, "ce_loss_7": 3.672286367416382, "epoch": 0.399, "grad_norm": 700.0, "kl_loss_10": 143.2273063659668, "kl_loss_2": 1571.5681518554688, "kl_loss_3": 1178.7588897705077, "kl_loss_7": 382.66470794677736, "learning_rate": 0.0006650325535423167, "loss": 806.0485, "step": 3990 }, { "ce_loss_10": 3.589041221141815, "ce_loss_13": 3.5297691583633424, "ce_loss_2": 4.205903816223144, "ce_loss_3": 4.024567484855652, "ce_loss_7": 3.6903899908065796, "epoch": 0.4, "grad_norm": 716.0, "kl_loss_10": 134.77462882995604, "kl_loss_2": 1448.7763793945312, "kl_loss_3": 1097.182077026367, "kl_loss_7": 360.74890594482423, "learning_rate": 0.0006635339816587109, "loss": 774.6992, "step": 4000 }, { "ce_loss_10": 3.534158933162689, "ce_loss_13": 3.4722786784172057, "ce_loss_2": 4.18008325099945, "ce_loss_3": 3.9927919030189516, "ce_loss_7": 3.636194169521332, "epoch": 0.401, "grad_norm": 608.0, "kl_loss_10": 137.7214611053467, "kl_loss_2": 1517.0520935058594, "kl_loss_3": 1144.4528839111329, "kl_loss_7": 367.4022415161133, "learning_rate": 0.0006620337629897252, "loss": 785.9456, "step": 4010 }, { "ce_loss_10": 3.5372211933135986, "ce_loss_13": 3.475364565849304, "ce_loss_2": 4.187776136398315, "ce_loss_3": 3.995320773124695, "ce_loss_7": 3.6415831565856935, "epoch": 0.402, "grad_norm": 544.0, "kl_loss_10": 137.00096588134767, "kl_loss_2": 1516.9021362304688, "kl_loss_3": 1134.6023742675782, "kl_loss_7": 368.9509552001953, "learning_rate": 0.0006605319126425454, "loss": 802.8082, "step": 4020 }, { "ce_loss_10": 3.438814675807953, "ce_loss_13": 3.379668688774109, "ce_loss_2": 4.109059810638428, "ce_loss_3": 3.909671998023987, "ce_loss_7": 3.544471001625061, "epoch": 0.403, "grad_norm": 560.0, "kl_loss_10": 136.3015495300293, "kl_loss_2": 1560.7927734375, "kl_loss_3": 1171.8411346435546, "kl_loss_7": 373.3894378662109, "learning_rate": 0.0006590284457407876, "loss": 802.5854, "step": 4030 }, { "ce_loss_10": 3.544812524318695, "ce_loss_13": 3.481894314289093, "ce_loss_2": 4.181082665920258, "ce_loss_3": 3.9941136717796324, "ce_loss_7": 3.6471312403678895, "epoch": 0.404, "grad_norm": 504.0, "kl_loss_10": 135.4376022338867, "kl_loss_2": 1496.4591674804688, "kl_loss_3": 1126.9239776611328, "kl_loss_7": 368.8154357910156, "learning_rate": 0.0006575233774243465, "loss": 785.2985, "step": 4040 }, { "ce_loss_10": 3.529277968406677, "ce_loss_13": 3.469450843334198, "ce_loss_2": 4.183178901672363, "ce_loss_3": 3.989623689651489, "ce_loss_7": 3.634640073776245, "epoch": 0.405, "grad_norm": 760.0, "kl_loss_10": 134.75451889038087, "kl_loss_2": 1538.0418823242187, "kl_loss_3": 1155.4544311523437, "kl_loss_7": 372.7149856567383, "learning_rate": 0.0006560167228492435, "loss": 793.4811, "step": 4050 }, { "ce_loss_10": 3.577071988582611, "ce_loss_13": 3.5211926221847536, "ce_loss_2": 4.202403485774994, "ce_loss_3": 4.0193228960037235, "ce_loss_7": 3.6783261060714723, "epoch": 0.406, "grad_norm": 792.0, "kl_loss_10": 129.85480155944825, "kl_loss_2": 1467.7122497558594, "kl_loss_3": 1107.2084655761719, "kl_loss_7": 358.02198791503906, "learning_rate": 0.0006545084971874737, "loss": 784.8675, "step": 4060 }, { "ce_loss_10": 3.5425363302230837, "ce_loss_13": 3.481105864048004, "ce_loss_2": 4.214594578742981, "ce_loss_3": 4.014482605457306, "ce_loss_7": 3.651681327819824, "epoch": 0.407, "grad_norm": 604.0, "kl_loss_10": 137.89012451171874, "kl_loss_2": 1563.3667907714844, "kl_loss_3": 1169.847265625, "kl_loss_7": 378.33221740722655, "learning_rate": 0.0006529987156268526, "loss": 790.6762, "step": 4070 }, { "ce_loss_10": 3.461014246940613, "ce_loss_13": 3.3993191599845884, "ce_loss_2": 4.127788650989532, "ce_loss_3": 3.9290434598922728, "ce_loss_7": 3.5692273139953614, "epoch": 0.408, "grad_norm": 692.0, "kl_loss_10": 135.76864738464354, "kl_loss_2": 1537.1478942871095, "kl_loss_3": 1151.7357269287108, "kl_loss_7": 372.04627532958983, "learning_rate": 0.0006514873933708637, "loss": 806.6379, "step": 4080 }, { "ce_loss_10": 3.5668829321861266, "ce_loss_13": 3.508492851257324, "ce_loss_2": 4.2085763812065125, "ce_loss_3": 4.0170141696929935, "ce_loss_7": 3.6687933683395384, "epoch": 0.409, "grad_norm": 680.0, "kl_loss_10": 133.2143711090088, "kl_loss_2": 1492.4684814453126, "kl_loss_3": 1121.5162017822265, "kl_loss_7": 364.1954345703125, "learning_rate": 0.0006499745456385053, "loss": 779.6072, "step": 4090 }, { "ce_loss_10": 3.5439602375030517, "ce_loss_13": 3.481731653213501, "ce_loss_2": 4.193144726753235, "ce_loss_3": 4.0000463128089905, "ce_loss_7": 3.650206482410431, "epoch": 0.41, "grad_norm": 720.0, "kl_loss_10": 138.83553466796874, "kl_loss_2": 1504.902880859375, "kl_loss_3": 1128.8948608398437, "kl_loss_7": 372.4293869018555, "learning_rate": 0.0006484601876641375, "loss": 796.3825, "step": 4100 }, { "ce_loss_10": 3.524893641471863, "ce_loss_13": 3.4652820467948913, "ce_loss_2": 4.154143571853638, "ce_loss_3": 3.963244378566742, "ce_loss_7": 3.6235713839530943, "epoch": 0.411, "grad_norm": 524.0, "kl_loss_10": 135.7376163482666, "kl_loss_2": 1475.3457275390624, "kl_loss_3": 1102.1329620361328, "kl_loss_7": 360.7025970458984, "learning_rate": 0.000646944334697328, "loss": 776.625, "step": 4110 }, { "ce_loss_10": 3.6424561977386474, "ce_loss_13": 3.582131230831146, "ce_loss_2": 4.258833718299866, "ce_loss_3": 4.075766789913177, "ce_loss_7": 3.7454846620559694, "epoch": 0.412, "grad_norm": 520.0, "kl_loss_10": 134.410147857666, "kl_loss_2": 1442.3484252929688, "kl_loss_3": 1087.205389404297, "kl_loss_7": 362.8587707519531, "learning_rate": 0.0006454270020026995, "loss": 761.3288, "step": 4120 }, { "ce_loss_10": 3.6111577272415163, "ce_loss_13": 3.5512704968452455, "ce_loss_2": 4.226312971115112, "ce_loss_3": 4.035128366947174, "ce_loss_7": 3.707612764835358, "epoch": 0.413, "grad_norm": 556.0, "kl_loss_10": 134.3211742401123, "kl_loss_2": 1434.7612060546876, "kl_loss_3": 1081.237615966797, "kl_loss_7": 354.78483428955076, "learning_rate": 0.0006439082048597755, "loss": 762.4907, "step": 4130 }, { "ce_loss_10": 3.5981813311576842, "ce_loss_13": 3.5334428906440736, "ce_loss_2": 4.23305733203888, "ce_loss_3": 4.0418706178665165, "ce_loss_7": 3.6992475748062135, "epoch": 0.414, "grad_norm": 752.0, "kl_loss_10": 138.56320190429688, "kl_loss_2": 1493.8901489257812, "kl_loss_3": 1121.1268218994142, "kl_loss_7": 366.90565490722656, "learning_rate": 0.0006423879585628261, "loss": 783.5715, "step": 4140 }, { "ce_loss_10": 3.560960817337036, "ce_loss_13": 3.495613181591034, "ce_loss_2": 4.226023101806641, "ce_loss_3": 4.0232850313186646, "ce_loss_7": 3.6655412554740905, "epoch": 0.415, "grad_norm": 1056.0, "kl_loss_10": 140.29082679748535, "kl_loss_2": 1548.172344970703, "kl_loss_3": 1152.182876586914, "kl_loss_7": 374.3309783935547, "learning_rate": 0.0006408662784207149, "loss": 802.2298, "step": 4150 }, { "ce_loss_10": 3.5164562225341798, "ce_loss_13": 3.456049418449402, "ce_loss_2": 4.159878623485565, "ce_loss_3": 3.973027527332306, "ce_loss_7": 3.6125931262969972, "epoch": 0.416, "grad_norm": 708.0, "kl_loss_10": 132.27476196289064, "kl_loss_2": 1515.6659545898438, "kl_loss_3": 1142.7359313964844, "kl_loss_7": 364.6348907470703, "learning_rate": 0.0006393431797567439, "loss": 789.1856, "step": 4160 }, { "ce_loss_10": 3.6022548198699953, "ce_loss_13": 3.5429869413375856, "ce_loss_2": 4.207081604003906, "ce_loss_3": 4.019421660900116, "ce_loss_7": 3.6976672649383544, "epoch": 0.417, "grad_norm": 612.0, "kl_loss_10": 132.8531795501709, "kl_loss_2": 1457.9248413085938, "kl_loss_3": 1092.8853637695313, "kl_loss_7": 359.7133728027344, "learning_rate": 0.0006378186779084996, "loss": 753.2417, "step": 4170 }, { "ce_loss_10": 3.4348825335502626, "ce_loss_13": 3.3747984290122988, "ce_loss_2": 4.09944007396698, "ce_loss_3": 3.9020219922065733, "ce_loss_7": 3.5400782108306883, "epoch": 0.418, "grad_norm": 680.0, "kl_loss_10": 132.64606018066405, "kl_loss_2": 1522.9734741210937, "kl_loss_3": 1138.7015899658204, "kl_loss_7": 366.7797393798828, "learning_rate": 0.0006362927882276989, "loss": 789.6667, "step": 4180 }, { "ce_loss_10": 3.6310564041137696, "ce_loss_13": 3.5738319396972655, "ce_loss_2": 4.248906910419464, "ce_loss_3": 4.05875905752182, "ce_loss_7": 3.7278629899024964, "epoch": 0.419, "grad_norm": 508.0, "kl_loss_10": 132.2452365875244, "kl_loss_2": 1463.0255004882813, "kl_loss_3": 1091.5722778320312, "kl_loss_7": 354.3390426635742, "learning_rate": 0.000634765526080034, "loss": 756.9509, "step": 4190 }, { "ce_loss_10": 3.633085823059082, "ce_loss_13": 3.572413682937622, "ce_loss_2": 4.256156611442566, "ce_loss_3": 4.07630068063736, "ce_loss_7": 3.7311901450157166, "epoch": 0.42, "grad_norm": 680.0, "kl_loss_10": 136.76210670471193, "kl_loss_2": 1473.4716735839843, "kl_loss_3": 1115.9601440429688, "kl_loss_7": 365.41827697753905, "learning_rate": 0.0006332369068450174, "loss": 766.6168, "step": 4200 }, { "ce_loss_10": 3.5673229098320007, "ce_loss_13": 3.5074177622795104, "ce_loss_2": 4.202928698062896, "ce_loss_3": 4.015132880210876, "ce_loss_7": 3.6681143999099732, "epoch": 0.421, "grad_norm": 628.0, "kl_loss_10": 134.72608184814453, "kl_loss_2": 1491.3523498535155, "kl_loss_3": 1126.61123046875, "kl_loss_7": 365.6973648071289, "learning_rate": 0.0006317069459158283, "loss": 775.0173, "step": 4210 }, { "ce_loss_10": 3.6810522437095643, "ce_loss_13": 3.6207136154174804, "ce_loss_2": 4.278917360305786, "ce_loss_3": 4.101971006393432, "ce_loss_7": 3.776351547241211, "epoch": 0.422, "grad_norm": 560.0, "kl_loss_10": 138.53412284851075, "kl_loss_2": 1443.6548767089844, "kl_loss_3": 1091.453189086914, "kl_loss_7": 359.41749114990233, "learning_rate": 0.0006301756586991561, "loss": 771.2455, "step": 4220 }, { "ce_loss_10": 3.453490364551544, "ce_loss_13": 3.39296897649765, "ce_loss_2": 4.11007170677185, "ce_loss_3": 3.9121878266334535, "ce_loss_7": 3.5533955574035643, "epoch": 0.423, "grad_norm": 620.0, "kl_loss_10": 141.56931991577147, "kl_loss_2": 1538.1698364257813, "kl_loss_3": 1154.8234252929688, "kl_loss_7": 367.4682983398437, "learning_rate": 0.0006286430606150459, "loss": 792.7883, "step": 4230 }, { "ce_loss_10": 3.6686601042747498, "ce_loss_13": 3.592938470840454, "ce_loss_2": 4.279926109313965, "ce_loss_3": 4.093138873577118, "ce_loss_7": 3.750678813457489, "epoch": 0.424, "grad_norm": 644.0, "kl_loss_10": 158.1731170654297, "kl_loss_2": 1484.5655151367187, "kl_loss_3": 1109.7966552734374, "kl_loss_7": 363.6700012207031, "learning_rate": 0.0006271091670967436, "loss": 774.7598, "step": 4240 }, { "ce_loss_10": 3.5760006070137025, "ce_loss_13": 3.508677899837494, "ce_loss_2": 4.227059412002563, "ce_loss_3": 4.0314107775688175, "ce_loss_7": 3.6780433177948, "epoch": 0.425, "grad_norm": 740.0, "kl_loss_10": 156.61751861572264, "kl_loss_2": 1553.6643676757812, "kl_loss_3": 1164.257012939453, "kl_loss_7": 379.240168762207, "learning_rate": 0.0006255739935905395, "loss": 794.0103, "step": 4250 }, { "ce_loss_10": 3.6080758333206178, "ce_loss_13": 3.5441837668418885, "ce_loss_2": 4.217771422863007, "ce_loss_3": 4.030789840221405, "ce_loss_7": 3.696694552898407, "epoch": 0.426, "grad_norm": 700.0, "kl_loss_10": 147.6225784301758, "kl_loss_2": 1459.3168395996095, "kl_loss_3": 1092.8426239013672, "kl_loss_7": 360.9449523925781, "learning_rate": 0.0006240375555556145, "loss": 787.5354, "step": 4260 }, { "ce_loss_10": 3.619337785243988, "ce_loss_13": 3.5525267839431764, "ce_loss_2": 4.272982287406921, "ce_loss_3": 4.074936735630035, "ce_loss_7": 3.7203184485435488, "epoch": 0.427, "grad_norm": 580.0, "kl_loss_10": 143.41627883911133, "kl_loss_2": 1519.2254211425782, "kl_loss_3": 1136.8392669677735, "kl_loss_7": 366.1272567749023, "learning_rate": 0.000622499868463882, "loss": 784.6523, "step": 4270 }, { "ce_loss_10": 3.583260440826416, "ce_loss_13": 3.522217357158661, "ce_loss_2": 4.191686594486237, "ce_loss_3": 4.004029047489166, "ce_loss_7": 3.6739280343055727, "epoch": 0.428, "grad_norm": 760.0, "kl_loss_10": 137.14938316345214, "kl_loss_2": 1458.7350463867188, "kl_loss_3": 1091.8113159179688, "kl_loss_7": 355.58583679199216, "learning_rate": 0.0006209609477998338, "loss": 772.4823, "step": 4280 }, { "ce_loss_10": 3.6303748726844787, "ce_loss_13": 3.5688055872917177, "ce_loss_2": 4.266738796234131, "ce_loss_3": 4.077020514011383, "ce_loss_7": 3.73097505569458, "epoch": 0.429, "grad_norm": 728.0, "kl_loss_10": 138.04970626831056, "kl_loss_2": 1487.9487243652343, "kl_loss_3": 1125.538851928711, "kl_loss_7": 364.74159851074216, "learning_rate": 0.0006194208090603844, "loss": 785.8367, "step": 4290 }, { "ce_loss_10": 3.5521617889404298, "ce_loss_13": 3.492956447601318, "ce_loss_2": 4.177777218818664, "ce_loss_3": 3.9935933470726015, "ce_loss_7": 3.6513041496276855, "epoch": 0.43, "grad_norm": 628.0, "kl_loss_10": 130.21162300109864, "kl_loss_2": 1462.7013305664063, "kl_loss_3": 1094.481967163086, "kl_loss_7": 351.07578735351564, "learning_rate": 0.0006178794677547138, "loss": 761.1579, "step": 4300 }, { "ce_loss_10": 3.580393135547638, "ce_loss_13": 3.5223410606384276, "ce_loss_2": 4.22029185295105, "ce_loss_3": 4.0296910285949705, "ce_loss_7": 3.6796607255935667, "epoch": 0.431, "grad_norm": 804.0, "kl_loss_10": 135.95439834594725, "kl_loss_2": 1507.2907592773438, "kl_loss_3": 1135.5215759277344, "kl_loss_7": 368.5531372070312, "learning_rate": 0.0006163369394041111, "loss": 777.0869, "step": 4310 }, { "ce_loss_10": 3.5184757232666017, "ce_loss_13": 3.4593456625938415, "ce_loss_2": 4.166491711139679, "ce_loss_3": 3.9752755165100098, "ce_loss_7": 3.6229493021965027, "epoch": 0.432, "grad_norm": 816.0, "kl_loss_10": 132.72277908325196, "kl_loss_2": 1510.2736328125, "kl_loss_3": 1137.5515747070312, "kl_loss_7": 362.1718246459961, "learning_rate": 0.0006147932395418205, "loss": 797.0466, "step": 4320 }, { "ce_loss_10": 3.5534854769706725, "ce_loss_13": 3.493700551986694, "ce_loss_2": 4.1732647776603695, "ce_loss_3": 3.9909292578697206, "ce_loss_7": 3.652637302875519, "epoch": 0.433, "grad_norm": 532.0, "kl_loss_10": 131.40403900146484, "kl_loss_2": 1471.334698486328, "kl_loss_3": 1111.2522705078125, "kl_loss_7": 360.8691162109375, "learning_rate": 0.0006132483837128823, "loss": 767.8431, "step": 4330 }, { "ce_loss_10": 3.534538817405701, "ce_loss_13": 3.4763117671012878, "ce_loss_2": 4.177856540679931, "ce_loss_3": 3.977602541446686, "ce_loss_7": 3.6365644097328187, "epoch": 0.434, "grad_norm": 576.0, "kl_loss_10": 132.02191047668458, "kl_loss_2": 1511.0365112304687, "kl_loss_3": 1116.52158203125, "kl_loss_7": 363.1332717895508, "learning_rate": 0.0006117023874739772, "loss": 782.3545, "step": 4340 }, { "ce_loss_10": 3.523367393016815, "ce_loss_13": 3.465071129798889, "ce_loss_2": 4.167889106273651, "ce_loss_3": 3.9776375889778137, "ce_loss_7": 3.6235153794288637, "epoch": 0.435, "grad_norm": 560.0, "kl_loss_10": 132.62512550354003, "kl_loss_2": 1522.6255187988281, "kl_loss_3": 1134.744940185547, "kl_loss_7": 364.6108734130859, "learning_rate": 0.0006101552663932703, "loss": 787.945, "step": 4350 }, { "ce_loss_10": 3.560639572143555, "ce_loss_13": 3.4985696911811828, "ce_loss_2": 4.184593963623047, "ce_loss_3": 3.9949865102767945, "ce_loss_7": 3.657104122638702, "epoch": 0.436, "grad_norm": 652.0, "kl_loss_10": 134.92124710083007, "kl_loss_2": 1480.7232177734375, "kl_loss_3": 1106.3843566894532, "kl_loss_7": 362.02510833740234, "learning_rate": 0.0006086070360502539, "loss": 776.0767, "step": 4360 }, { "ce_loss_10": 3.5607874393463135, "ce_loss_13": 3.5032904148101807, "ce_loss_2": 4.19012690782547, "ce_loss_3": 4.001884508132934, "ce_loss_7": 3.661303186416626, "epoch": 0.437, "grad_norm": 652.0, "kl_loss_10": 131.70704956054686, "kl_loss_2": 1494.450860595703, "kl_loss_3": 1114.0662811279296, "kl_loss_7": 359.0336669921875, "learning_rate": 0.0006070577120355903, "loss": 773.2331, "step": 4370 }, { "ce_loss_10": 3.5673134207725523, "ce_loss_13": 3.5069605112075806, "ce_loss_2": 4.194201278686523, "ce_loss_3": 4.008713376522064, "ce_loss_7": 3.6672159075737, "epoch": 0.438, "grad_norm": 724.0, "kl_loss_10": 129.65911521911622, "kl_loss_2": 1461.538018798828, "kl_loss_3": 1099.2910552978515, "kl_loss_7": 357.59047546386716, "learning_rate": 0.0006055073099509549, "loss": 765.2788, "step": 4380 }, { "ce_loss_10": 3.6260691046714784, "ce_loss_13": 3.566242527961731, "ce_loss_2": 4.243451619148255, "ce_loss_3": 4.051405000686645, "ce_loss_7": 3.723850154876709, "epoch": 0.439, "grad_norm": 708.0, "kl_loss_10": 131.5600498199463, "kl_loss_2": 1464.825506591797, "kl_loss_3": 1095.515121459961, "kl_loss_7": 358.8866928100586, "learning_rate": 0.0006039558454088796, "loss": 777.1961, "step": 4390 }, { "ce_loss_10": 3.600092685222626, "ce_loss_13": 3.5382148027420044, "ce_loss_2": 4.238782167434692, "ce_loss_3": 4.047287583351135, "ce_loss_7": 3.7017881989479067, "epoch": 0.44, "grad_norm": 720.0, "kl_loss_10": 134.45111694335938, "kl_loss_2": 1488.527685546875, "kl_loss_3": 1123.7609832763671, "kl_loss_7": 363.68369445800784, "learning_rate": 0.0006024033340325954, "loss": 766.9879, "step": 4400 }, { "ce_loss_10": 3.6631633281707763, "ce_loss_13": 3.6062058329582216, "ce_loss_2": 4.263676905632019, "ce_loss_3": 4.082637584209442, "ce_loss_7": 3.75854674577713, "epoch": 0.441, "grad_norm": 492.0, "kl_loss_10": 127.09016380310058, "kl_loss_2": 1417.7144897460937, "kl_loss_3": 1058.3302947998047, "kl_loss_7": 343.7723358154297, "learning_rate": 0.0006008497914558743, "loss": 752.2297, "step": 4410 }, { "ce_loss_10": 3.601701498031616, "ce_loss_13": 3.5420748829841613, "ce_loss_2": 4.236417984962463, "ce_loss_3": 4.046699476242066, "ce_loss_7": 3.7043898940086364, "epoch": 0.442, "grad_norm": 584.0, "kl_loss_10": 135.9376647949219, "kl_loss_2": 1491.8550354003905, "kl_loss_3": 1120.2072509765626, "kl_loss_7": 366.13590545654296, "learning_rate": 0.0005992952333228728, "loss": 773.4489, "step": 4420 }, { "ce_loss_10": 3.5400898575782778, "ce_loss_13": 3.4842668890953066, "ce_loss_2": 4.171782422065735, "ce_loss_3": 3.9820830821990967, "ce_loss_7": 3.6369754314422607, "epoch": 0.443, "grad_norm": 660.0, "kl_loss_10": 129.43466796875, "kl_loss_2": 1496.3381286621093, "kl_loss_3": 1116.0555755615235, "kl_loss_7": 355.4880676269531, "learning_rate": 0.0005977396752879741, "loss": 771.3094, "step": 4430 }, { "ce_loss_10": 3.4704235672950743, "ce_loss_13": 3.4127361297607424, "ce_loss_2": 4.106459307670593, "ce_loss_3": 3.920796346664429, "ce_loss_7": 3.573098838329315, "epoch": 0.444, "grad_norm": 732.0, "kl_loss_10": 126.1293731689453, "kl_loss_2": 1490.1605346679687, "kl_loss_3": 1120.4780059814452, "kl_loss_7": 354.9955078125, "learning_rate": 0.0005961831330156305, "loss": 764.2076, "step": 4440 }, { "ce_loss_10": 3.615150511264801, "ce_loss_13": 3.5547899127006533, "ce_loss_2": 4.245680320262909, "ce_loss_3": 4.054165327548981, "ce_loss_7": 3.712630808353424, "epoch": 0.445, "grad_norm": 628.0, "kl_loss_10": 131.35802993774413, "kl_loss_2": 1499.3917602539063, "kl_loss_3": 1117.8766357421875, "kl_loss_7": 361.68542938232423, "learning_rate": 0.0005946256221802051, "loss": 786.0435, "step": 4450 }, { "ce_loss_10": 3.591351556777954, "ce_loss_13": 3.535250651836395, "ce_loss_2": 4.191380190849304, "ce_loss_3": 4.004255092144012, "ce_loss_7": 3.6844709634780886, "epoch": 0.446, "grad_norm": 708.0, "kl_loss_10": 129.50046005249024, "kl_loss_2": 1424.1330322265626, "kl_loss_3": 1067.6145050048829, "kl_loss_7": 349.01904144287107, "learning_rate": 0.0005930671584658151, "loss": 778.2988, "step": 4460 }, { "ce_loss_10": 3.590542936325073, "ce_loss_13": 3.5338905096054076, "ce_loss_2": 4.219100630283355, "ce_loss_3": 4.026160931587219, "ce_loss_7": 3.687147891521454, "epoch": 0.447, "grad_norm": 572.0, "kl_loss_10": 130.2479160308838, "kl_loss_2": 1492.514990234375, "kl_loss_3": 1115.7035186767578, "kl_loss_7": 358.6115783691406, "learning_rate": 0.0005915077575661722, "loss": 782.5174, "step": 4470 }, { "ce_loss_10": 3.6091010570526123, "ce_loss_13": 3.548303461074829, "ce_loss_2": 4.243482124805451, "ce_loss_3": 4.051424252986908, "ce_loss_7": 3.709373152256012, "epoch": 0.448, "grad_norm": 692.0, "kl_loss_10": 133.69428749084472, "kl_loss_2": 1503.4253295898438, "kl_loss_3": 1123.9545928955079, "kl_loss_7": 367.3063217163086, "learning_rate": 0.000589947435184427, "loss": 770.9538, "step": 4480 }, { "ce_loss_10": 3.67550984621048, "ce_loss_13": 3.61795197725296, "ce_loss_2": 4.270373678207397, "ce_loss_3": 4.090293383598327, "ce_loss_7": 3.769421923160553, "epoch": 0.449, "grad_norm": 548.0, "kl_loss_10": 132.92403182983398, "kl_loss_2": 1453.855810546875, "kl_loss_3": 1089.5215789794922, "kl_loss_7": 359.96496124267577, "learning_rate": 0.0005883862070330078, "loss": 768.3579, "step": 4490 }, { "ce_loss_10": 3.6023089408874513, "ce_loss_13": 3.5429613828659057, "ce_loss_2": 4.2273586869239805, "ce_loss_3": 4.0474681735038756, "ce_loss_7": 3.7016926527023317, "epoch": 0.45, "grad_norm": 640.0, "kl_loss_10": 131.65977668762207, "kl_loss_2": 1477.8073486328126, "kl_loss_3": 1112.020849609375, "kl_loss_7": 363.57453002929685, "learning_rate": 0.0005868240888334653, "loss": 768.102, "step": 4500 }, { "ce_loss_10": 3.4881216049194337, "ce_loss_13": 3.4285590648651123, "ce_loss_2": 4.145088362693786, "ce_loss_3": 3.9443087697029116, "ce_loss_7": 3.5946366786956787, "epoch": 0.451, "grad_norm": 860.0, "kl_loss_10": 132.0642234802246, "kl_loss_2": 1523.9566467285156, "kl_loss_3": 1139.5087677001952, "kl_loss_7": 370.548567199707, "learning_rate": 0.0005852610963163119, "loss": 782.7128, "step": 4510 }, { "ce_loss_10": 3.509654688835144, "ce_loss_13": 3.453027904033661, "ce_loss_2": 4.135330331325531, "ce_loss_3": 3.948072779178619, "ce_loss_7": 3.6065049171447754, "epoch": 0.452, "grad_norm": 576.0, "kl_loss_10": 127.13306007385253, "kl_loss_2": 1474.840264892578, "kl_loss_3": 1103.8429443359375, "kl_loss_7": 352.3040481567383, "learning_rate": 0.0005836972452208654, "loss": 758.6461, "step": 4520 }, { "ce_loss_10": 3.516415762901306, "ce_loss_13": 3.4608123779296873, "ce_loss_2": 4.151223230361938, "ce_loss_3": 3.964005374908447, "ce_loss_7": 3.6154285073280334, "epoch": 0.453, "grad_norm": 692.0, "kl_loss_10": 129.6707332611084, "kl_loss_2": 1490.4531555175781, "kl_loss_3": 1112.0751403808595, "kl_loss_7": 360.926252746582, "learning_rate": 0.0005821325512950885, "loss": 774.0762, "step": 4530 }, { "ce_loss_10": 3.5449029922485353, "ce_loss_13": 3.4901923894882203, "ce_loss_2": 4.175939702987671, "ce_loss_3": 3.9821668744087217, "ce_loss_7": 3.6469666838645933, "epoch": 0.454, "grad_norm": 536.0, "kl_loss_10": 126.68943367004394, "kl_loss_2": 1457.47294921875, "kl_loss_3": 1089.1407897949218, "kl_loss_7": 353.19891662597655, "learning_rate": 0.0005805670302954321, "loss": 773.5169, "step": 4540 }, { "ce_loss_10": 3.551515507698059, "ce_loss_13": 3.496657633781433, "ce_loss_2": 4.165827226638794, "ce_loss_3": 3.9793267846107483, "ce_loss_7": 3.6467077493667603, "epoch": 0.455, "grad_norm": 632.0, "kl_loss_10": 124.99089202880859, "kl_loss_2": 1454.5718872070313, "kl_loss_3": 1086.5160583496095, "kl_loss_7": 349.37010803222654, "learning_rate": 0.000579000697986675, "loss": 757.4347, "step": 4550 }, { "ce_loss_10": 3.51110919713974, "ce_loss_13": 3.4499236941337585, "ce_loss_2": 4.169062435626984, "ce_loss_3": 3.9678101181983947, "ce_loss_7": 3.6167086601257323, "epoch": 0.456, "grad_norm": 688.0, "kl_loss_10": 133.06291236877442, "kl_loss_2": 1530.2515380859375, "kl_loss_3": 1140.2865753173828, "kl_loss_7": 365.97554779052734, "learning_rate": 0.0005774335701417662, "loss": 779.1915, "step": 4560 }, { "ce_loss_10": 3.495447027683258, "ce_loss_13": 3.440790295600891, "ce_loss_2": 4.137791275978088, "ce_loss_3": 3.9482330679893494, "ce_loss_7": 3.5958903670310973, "epoch": 0.457, "grad_norm": 628.0, "kl_loss_10": 127.81274681091308, "kl_loss_2": 1513.1394897460937, "kl_loss_3": 1131.4115112304687, "kl_loss_7": 359.4502899169922, "learning_rate": 0.0005758656625416658, "loss": 779.9761, "step": 4570 }, { "ce_loss_10": 3.5505786299705506, "ce_loss_13": 3.4906317949295045, "ce_loss_2": 4.1901858925819395, "ce_loss_3": 3.997506558895111, "ce_loss_7": 3.6519751071929933, "epoch": 0.458, "grad_norm": 684.0, "kl_loss_10": 130.90405158996583, "kl_loss_2": 1497.9955810546876, "kl_loss_3": 1116.8375152587892, "kl_loss_7": 361.80901489257815, "learning_rate": 0.0005742969909751859, "loss": 764.1115, "step": 4580 }, { "ce_loss_10": 3.5638930439949035, "ce_loss_13": 3.503375542163849, "ce_loss_2": 4.186403441429138, "ce_loss_3": 3.9995208024978637, "ce_loss_7": 3.6641741752624513, "epoch": 0.459, "grad_norm": 580.0, "kl_loss_10": 130.00192375183104, "kl_loss_2": 1475.8618774414062, "kl_loss_3": 1099.7632873535156, "kl_loss_7": 358.99674835205076, "learning_rate": 0.0005727275712388318, "loss": 769.1571, "step": 4590 }, { "ce_loss_10": 3.593214011192322, "ce_loss_13": 3.53840229511261, "ce_loss_2": 4.204052042961121, "ce_loss_3": 4.022337186336517, "ce_loss_7": 3.6885414600372313, "epoch": 0.46, "grad_norm": 688.0, "kl_loss_10": 126.89060325622559, "kl_loss_2": 1444.349786376953, "kl_loss_3": 1080.8706604003905, "kl_loss_7": 347.9618377685547, "learning_rate": 0.0005711574191366427, "loss": 759.1991, "step": 4600 }, { "ce_loss_10": 3.542857563495636, "ce_loss_13": 3.4855061650276182, "ce_loss_2": 4.162867796421051, "ce_loss_3": 3.973730170726776, "ce_loss_7": 3.6373270750045776, "epoch": 0.461, "grad_norm": 498.0, "kl_loss_10": 127.64135208129883, "kl_loss_2": 1482.562158203125, "kl_loss_3": 1108.3277709960937, "kl_loss_7": 353.4364410400391, "learning_rate": 0.0005695865504800327, "loss": 763.6274, "step": 4610 }, { "ce_loss_10": 3.480057179927826, "ce_loss_13": 3.4188697218894957, "ce_loss_2": 4.161020743846893, "ce_loss_3": 3.9625292778015138, "ce_loss_7": 3.5874672174453734, "epoch": 0.462, "grad_norm": 636.0, "kl_loss_10": 132.61270484924316, "kl_loss_2": 1565.8641235351563, "kl_loss_3": 1171.7663940429688, "kl_loss_7": 371.96593170166017, "learning_rate": 0.0005680149810876322, "loss": 786.1608, "step": 4620 }, { "ce_loss_10": 3.5343680024147033, "ce_loss_13": 3.476098108291626, "ce_loss_2": 4.15952113866806, "ce_loss_3": 3.974844920635223, "ce_loss_7": 3.633632469177246, "epoch": 0.463, "grad_norm": 916.0, "kl_loss_10": 129.65257568359374, "kl_loss_2": 1473.4421691894531, "kl_loss_3": 1102.603567504883, "kl_loss_7": 355.8689987182617, "learning_rate": 0.0005664427267851271, "loss": 764.6609, "step": 4630 }, { "ce_loss_10": 3.4518208742141723, "ce_loss_13": 3.395127773284912, "ce_loss_2": 4.078986668586731, "ce_loss_3": 3.8934460520744323, "ce_loss_7": 3.550265383720398, "epoch": 0.464, "grad_norm": 788.0, "kl_loss_10": 128.43332138061524, "kl_loss_2": 1471.1863525390625, "kl_loss_3": 1105.277377319336, "kl_loss_7": 355.1179916381836, "learning_rate": 0.0005648698034051009, "loss": 759.5711, "step": 4640 }, { "ce_loss_10": 3.5719098687171935, "ce_loss_13": 3.513967990875244, "ce_loss_2": 4.204645431041717, "ce_loss_3": 4.020779967308044, "ce_loss_7": 3.6722232818603517, "epoch": 0.465, "grad_norm": 1056.0, "kl_loss_10": 129.50131530761718, "kl_loss_2": 1492.4452270507813, "kl_loss_3": 1118.05078125, "kl_loss_7": 355.465641784668, "learning_rate": 0.0005632962267868747, "loss": 760.3559, "step": 4650 }, { "ce_loss_10": 3.503978407382965, "ce_loss_13": 3.4484734773635863, "ce_loss_2": 4.13207323551178, "ce_loss_3": 3.944869303703308, "ce_loss_7": 3.6067972064018248, "epoch": 0.466, "grad_norm": 604.0, "kl_loss_10": 124.18871879577637, "kl_loss_2": 1470.0651733398438, "kl_loss_3": 1095.9672973632812, "kl_loss_7": 351.40113372802733, "learning_rate": 0.0005617220127763474, "loss": 767.9368, "step": 4660 }, { "ce_loss_10": 3.585794413089752, "ce_loss_13": 3.5305840611457824, "ce_loss_2": 4.204195821285248, "ce_loss_3": 4.019779980182648, "ce_loss_7": 3.682017946243286, "epoch": 0.467, "grad_norm": 696.0, "kl_loss_10": 127.49123001098633, "kl_loss_2": 1450.9377075195312, "kl_loss_3": 1090.0952362060548, "kl_loss_7": 354.78733215332034, "learning_rate": 0.0005601471772258368, "loss": 762.7567, "step": 4670 }, { "ce_loss_10": 3.5694589257240295, "ce_loss_13": 3.5142530679702757, "ce_loss_2": 4.177708339691162, "ce_loss_3": 3.9971248507499695, "ce_loss_7": 3.668432116508484, "epoch": 0.468, "grad_norm": 604.0, "kl_loss_10": 127.1932876586914, "kl_loss_2": 1430.6669982910157, "kl_loss_3": 1074.753436279297, "kl_loss_7": 350.61267242431643, "learning_rate": 0.0005585717359939192, "loss": 765.1635, "step": 4680 }, { "ce_loss_10": 3.482089602947235, "ce_loss_13": 3.425822353363037, "ce_loss_2": 4.101116871833801, "ce_loss_3": 3.9152937650680544, "ce_loss_7": 3.5779839038848875, "epoch": 0.469, "grad_norm": 528.0, "kl_loss_10": 126.95637588500976, "kl_loss_2": 1451.6751831054687, "kl_loss_3": 1094.0453521728516, "kl_loss_7": 351.15742645263674, "learning_rate": 0.0005569957049452703, "loss": 770.4322, "step": 4690 }, { "ce_loss_10": 3.5357295274734497, "ce_loss_13": 3.4797881722450255, "ce_loss_2": 4.170310592651367, "ce_loss_3": 3.9769333004951477, "ce_loss_7": 3.635083317756653, "epoch": 0.47, "grad_norm": 628.0, "kl_loss_10": 130.350581741333, "kl_loss_2": 1499.3314453125, "kl_loss_3": 1120.0059173583984, "kl_loss_7": 359.9276611328125, "learning_rate": 0.0005554190999505056, "loss": 773.8736, "step": 4700 }, { "ce_loss_10": 3.6618139266967775, "ce_loss_13": 3.603938400745392, "ce_loss_2": 4.291402506828308, "ce_loss_3": 4.10056711435318, "ce_loss_7": 3.7629532337188722, "epoch": 0.471, "grad_norm": 552.0, "kl_loss_10": 132.34991912841798, "kl_loss_2": 1483.7266418457032, "kl_loss_3": 1113.9899047851563, "kl_loss_7": 363.98974151611327, "learning_rate": 0.0005538419368860196, "loss": 745.9498, "step": 4710 }, { "ce_loss_10": 3.58713721036911, "ce_loss_13": 3.5296577334403993, "ce_loss_2": 4.205131685733795, "ce_loss_3": 4.02278323173523, "ce_loss_7": 3.683391070365906, "epoch": 0.472, "grad_norm": 478.0, "kl_loss_10": 128.9550354003906, "kl_loss_2": 1460.061590576172, "kl_loss_3": 1097.231591796875, "kl_loss_7": 353.49449615478517, "learning_rate": 0.0005522642316338268, "loss": 777.6082, "step": 4720 }, { "ce_loss_10": 3.59045729637146, "ce_loss_13": 3.535309398174286, "ce_loss_2": 4.197298634052276, "ce_loss_3": 4.012663578987121, "ce_loss_7": 3.685203659534454, "epoch": 0.473, "grad_norm": 836.0, "kl_loss_10": 129.07300338745117, "kl_loss_2": 1434.7744934082032, "kl_loss_3": 1076.9549468994142, "kl_loss_7": 353.2419830322266, "learning_rate": 0.0005506860000814017, "loss": 772.4581, "step": 4730 }, { "ce_loss_10": 3.6118584752082823, "ce_loss_13": 3.556533193588257, "ce_loss_2": 4.223218786716461, "ce_loss_3": 4.040239870548248, "ce_loss_7": 3.7073376536369325, "epoch": 0.474, "grad_norm": 588.0, "kl_loss_10": 126.57029228210449, "kl_loss_2": 1431.9671630859375, "kl_loss_3": 1080.1793304443358, "kl_loss_7": 348.32355346679685, "learning_rate": 0.0005491072581215186, "loss": 759.6759, "step": 4740 }, { "ce_loss_10": 3.6188451647758484, "ce_loss_13": 3.5581385135650634, "ce_loss_2": 4.235405123233795, "ce_loss_3": 4.047661685943604, "ce_loss_7": 3.7187539458274843, "epoch": 0.475, "grad_norm": 1024.0, "kl_loss_10": 131.5200958251953, "kl_loss_2": 1475.39228515625, "kl_loss_3": 1103.3305755615233, "kl_loss_7": 361.21947174072267, "learning_rate": 0.0005475280216520913, "loss": 750.9313, "step": 4750 }, { "ce_loss_10": 3.5320704579353333, "ce_loss_13": 3.4781441688537598, "ce_loss_2": 4.1474240064620975, "ce_loss_3": 3.960626220703125, "ce_loss_7": 3.629132354259491, "epoch": 0.476, "grad_norm": 636.0, "kl_loss_10": 126.30528450012207, "kl_loss_2": 1437.6857177734375, "kl_loss_3": 1077.3333221435546, "kl_loss_7": 347.41858367919923, "learning_rate": 0.0005459483065760138, "loss": 766.0214, "step": 4760 }, { "ce_loss_10": 3.466313195228577, "ce_loss_13": 3.4118001461029053, "ce_loss_2": 4.126975905895233, "ce_loss_3": 3.9336133003234863, "ce_loss_7": 3.566939985752106, "epoch": 0.477, "grad_norm": 916.0, "kl_loss_10": 127.09881401062012, "kl_loss_2": 1513.8791442871093, "kl_loss_3": 1142.7632537841796, "kl_loss_7": 354.5154052734375, "learning_rate": 0.0005443681288009991, "loss": 767.701, "step": 4770 }, { "ce_loss_10": 3.530055844783783, "ce_loss_13": 3.473679745197296, "ce_loss_2": 4.15294862985611, "ce_loss_3": 3.9638696193695067, "ce_loss_7": 3.6289584755897524, "epoch": 0.478, "grad_norm": 732.0, "kl_loss_10": 128.42002296447754, "kl_loss_2": 1474.6412109375, "kl_loss_3": 1102.924478149414, "kl_loss_7": 356.6063034057617, "learning_rate": 0.0005427875042394199, "loss": 768.8391, "step": 4780 }, { "ce_loss_10": 3.559061658382416, "ce_loss_13": 3.499286782741547, "ce_loss_2": 4.178021502494812, "ce_loss_3": 3.9880162835121156, "ce_loss_7": 3.6537997484207154, "epoch": 0.479, "grad_norm": 604.0, "kl_loss_10": 129.98453598022462, "kl_loss_2": 1476.697979736328, "kl_loss_3": 1103.0448516845704, "kl_loss_7": 357.54207611083984, "learning_rate": 0.0005412064488081482, "loss": 775.8501, "step": 4790 }, { "ce_loss_10": 3.5636435866355898, "ce_loss_13": 3.507556414604187, "ce_loss_2": 4.178312647342682, "ce_loss_3": 3.994305968284607, "ce_loss_7": 3.659174859523773, "epoch": 0.48, "grad_norm": 712.0, "kl_loss_10": 126.99441261291504, "kl_loss_2": 1456.738299560547, "kl_loss_3": 1086.8102722167969, "kl_loss_7": 351.37431640625, "learning_rate": 0.0005396249784283942, "loss": 754.4715, "step": 4800 }, { "ce_loss_10": 3.582900881767273, "ce_loss_13": 3.5235010981559753, "ce_loss_2": 4.229977750778199, "ce_loss_3": 4.035599565505981, "ce_loss_7": 3.682912456989288, "epoch": 0.481, "grad_norm": 788.0, "kl_loss_10": 132.2354824066162, "kl_loss_2": 1513.726251220703, "kl_loss_3": 1136.702828979492, "kl_loss_7": 365.539323425293, "learning_rate": 0.0005380431090255476, "loss": 777.0931, "step": 4810 }, { "ce_loss_10": 3.5710787892341616, "ce_loss_13": 3.518178606033325, "ce_loss_2": 4.170691752433777, "ce_loss_3": 3.9865909218788147, "ce_loss_7": 3.667452025413513, "epoch": 0.482, "grad_norm": 552.0, "kl_loss_10": 124.38290710449219, "kl_loss_2": 1423.2920043945312, "kl_loss_3": 1067.5982177734375, "kl_loss_7": 345.50093231201174, "learning_rate": 0.0005364608565290155, "loss": 749.5516, "step": 4820 }, { "ce_loss_10": 3.5861640691757204, "ce_loss_13": 3.527508783340454, "ce_loss_2": 4.206353557109833, "ce_loss_3": 4.020480215549469, "ce_loss_7": 3.6830241322517394, "epoch": 0.483, "grad_norm": 716.0, "kl_loss_10": 129.12178649902344, "kl_loss_2": 1457.7945068359375, "kl_loss_3": 1095.74462890625, "kl_loss_7": 357.8998229980469, "learning_rate": 0.0005348782368720626, "loss": 759.7926, "step": 4830 }, { "ce_loss_10": 3.512594223022461, "ce_loss_13": 3.4565085053443907, "ce_loss_2": 4.119910812377929, "ce_loss_3": 3.9370038151741027, "ce_loss_7": 3.6123961448669433, "epoch": 0.484, "grad_norm": 892.0, "kl_loss_10": 124.94047012329102, "kl_loss_2": 1436.2225341796875, "kl_loss_3": 1076.01171875, "kl_loss_7": 350.6201461791992, "learning_rate": 0.000533295265991652, "loss": 760.0171, "step": 4840 }, { "ce_loss_10": 3.597572553157806, "ce_loss_13": 3.537912893295288, "ce_loss_2": 4.200307357311249, "ce_loss_3": 4.0205553531646725, "ce_loss_7": 3.6955727219581602, "epoch": 0.485, "grad_norm": 660.0, "kl_loss_10": 127.27426147460938, "kl_loss_2": 1432.4733581542969, "kl_loss_3": 1076.2284576416016, "kl_loss_7": 357.1815933227539, "learning_rate": 0.0005317119598282822, "loss": 752.409, "step": 4850 }, { "ce_loss_10": 3.5897864937782287, "ce_loss_13": 3.5321272253990172, "ce_loss_2": 4.2048394799232485, "ce_loss_3": 4.0231526613235475, "ce_loss_7": 3.697403919696808, "epoch": 0.486, "grad_norm": 740.0, "kl_loss_10": 127.06500816345215, "kl_loss_2": 1436.3693542480469, "kl_loss_3": 1081.5646484375, "kl_loss_7": 361.1262634277344, "learning_rate": 0.0005301283343258293, "loss": 758.6568, "step": 4860 }, { "ce_loss_10": 3.65613032579422, "ce_loss_13": 3.5975700855255126, "ce_loss_2": 4.251663959026336, "ce_loss_3": 4.07744791507721, "ce_loss_7": 3.7647886872291565, "epoch": 0.487, "grad_norm": 724.0, "kl_loss_10": 127.67147789001464, "kl_loss_2": 1417.543603515625, "kl_loss_3": 1072.8577423095703, "kl_loss_7": 370.50387420654295, "learning_rate": 0.000528544405431384, "loss": 748.367, "step": 4870 }, { "ce_loss_10": 3.53450231552124, "ce_loss_13": 3.4755603075027466, "ce_loss_2": 4.1686626195907595, "ce_loss_3": 3.9780558943748474, "ce_loss_7": 3.639273762702942, "epoch": 0.488, "grad_norm": 772.0, "kl_loss_10": 128.8699951171875, "kl_loss_2": 1486.9147888183593, "kl_loss_3": 1119.2680114746095, "kl_loss_7": 374.7581512451172, "learning_rate": 0.000526960189095093, "loss": 773.1109, "step": 4880 }, { "ce_loss_10": 3.5063798785209657, "ce_loss_13": 3.452955174446106, "ce_loss_2": 4.127981126308441, "ce_loss_3": 3.9387494206428526, "ce_loss_7": 3.6084472298622132, "epoch": 0.489, "grad_norm": 588.0, "kl_loss_10": 124.4889362335205, "kl_loss_2": 1436.8860290527343, "kl_loss_3": 1073.7127777099608, "kl_loss_7": 349.7402374267578, "learning_rate": 0.0005253757012699972, "loss": 752.2137, "step": 4890 }, { "ce_loss_10": 3.596962869167328, "ce_loss_13": 3.541905701160431, "ce_loss_2": 4.204607903957367, "ce_loss_3": 4.02187534570694, "ce_loss_7": 3.695280838012695, "epoch": 0.49, "grad_norm": 524.0, "kl_loss_10": 127.72086906433105, "kl_loss_2": 1445.986083984375, "kl_loss_3": 1085.8021209716796, "kl_loss_7": 353.85157623291013, "learning_rate": 0.0005237909579118712, "loss": 766.9711, "step": 4900 }, { "ce_loss_10": 3.5621119856834413, "ce_loss_13": 3.5039254426956177, "ce_loss_2": 4.190770697593689, "ce_loss_3": 4.0025376081466675, "ce_loss_7": 3.6638841152191164, "epoch": 0.491, "grad_norm": 676.0, "kl_loss_10": 130.21649017333985, "kl_loss_2": 1476.865899658203, "kl_loss_3": 1108.4342376708985, "kl_loss_7": 365.27405700683596, "learning_rate": 0.0005222059749790631, "loss": 766.6848, "step": 4910 }, { "ce_loss_10": 3.627886402606964, "ce_loss_13": 3.568757712841034, "ce_loss_2": 4.211319518089295, "ce_loss_3": 4.034268498420715, "ce_loss_7": 3.7232242226600647, "epoch": 0.492, "grad_norm": 704.0, "kl_loss_10": 128.84821739196778, "kl_loss_2": 1416.5603820800782, "kl_loss_3": 1061.9581909179688, "kl_loss_7": 352.0233520507812, "learning_rate": 0.0005206207684323337, "loss": 737.0755, "step": 4920 }, { "ce_loss_10": 3.6068161606788633, "ce_loss_13": 3.5479695200920105, "ce_loss_2": 4.220036661624908, "ce_loss_3": 4.033611464500427, "ce_loss_7": 3.707039773464203, "epoch": 0.493, "grad_norm": 752.0, "kl_loss_10": 130.1327449798584, "kl_loss_2": 1452.3597900390625, "kl_loss_3": 1088.4027435302735, "kl_loss_7": 358.82303619384766, "learning_rate": 0.000519035354234695, "loss": 768.5039, "step": 4930 }, { "ce_loss_10": 3.585032618045807, "ce_loss_13": 3.5267568826675415, "ce_loss_2": 4.200621557235718, "ce_loss_3": 4.019805324077606, "ce_loss_7": 3.6854267716407776, "epoch": 0.494, "grad_norm": 648.0, "kl_loss_10": 130.55963859558105, "kl_loss_2": 1452.412860107422, "kl_loss_3": 1089.5958099365234, "kl_loss_7": 358.87390594482423, "learning_rate": 0.0005174497483512506, "loss": 745.6609, "step": 4940 }, { "ce_loss_10": 3.6264762759208677, "ce_loss_13": 3.57160165309906, "ce_loss_2": 4.232844221591949, "ce_loss_3": 4.046636259555816, "ce_loss_7": 3.7188863515853883, "epoch": 0.495, "grad_norm": 624.0, "kl_loss_10": 126.8898696899414, "kl_loss_2": 1443.1926330566407, "kl_loss_3": 1079.7430755615235, "kl_loss_7": 347.55773010253904, "learning_rate": 0.0005158639667490339, "loss": 757.0432, "step": 4950 }, { "ce_loss_10": 3.522357964515686, "ce_loss_13": 3.4666619300842285, "ce_loss_2": 4.146045255661011, "ce_loss_3": 3.9596054553985596, "ce_loss_7": 3.626363182067871, "epoch": 0.496, "grad_norm": 660.0, "kl_loss_10": 127.58676567077637, "kl_loss_2": 1464.100274658203, "kl_loss_3": 1098.3539154052735, "kl_loss_7": 353.0430084228516, "learning_rate": 0.0005142780253968481, "loss": 757.5217, "step": 4960 }, { "ce_loss_10": 3.4769110679626465, "ce_loss_13": 3.420508313179016, "ce_loss_2": 4.0726398229599, "ce_loss_3": 3.8918757796287538, "ce_loss_7": 3.5705711126327513, "epoch": 0.497, "grad_norm": 712.0, "kl_loss_10": 123.46647834777832, "kl_loss_2": 1419.8298278808593, "kl_loss_3": 1063.150827026367, "kl_loss_7": 341.79971771240236, "learning_rate": 0.0005126919402651053, "loss": 732.2201, "step": 4970 }, { "ce_loss_10": 3.5448826789855956, "ce_loss_13": 3.4868510246276854, "ce_loss_2": 4.179246997833252, "ce_loss_3": 3.9946623921394346, "ce_loss_7": 3.648559832572937, "epoch": 0.498, "grad_norm": 664.0, "kl_loss_10": 129.68025093078614, "kl_loss_2": 1467.3921997070313, "kl_loss_3": 1101.7997802734376, "kl_loss_7": 355.3198486328125, "learning_rate": 0.0005111057273256647, "loss": 763.329, "step": 4980 }, { "ce_loss_10": 3.6479685425758364, "ce_loss_13": 3.593002438545227, "ce_loss_2": 4.218882548809051, "ce_loss_3": 4.0441102385520935, "ce_loss_7": 3.736710476875305, "epoch": 0.499, "grad_norm": 524.0, "kl_loss_10": 124.3959129333496, "kl_loss_2": 1369.187322998047, "kl_loss_3": 1028.7687194824218, "kl_loss_7": 336.4957000732422, "learning_rate": 0.0005095194025516733, "loss": 724.7756, "step": 4990 }, { "ce_loss_10": 3.5743329763412475, "ce_loss_13": 3.5195638060569765, "ce_loss_2": 4.169176626205444, "ce_loss_3": 3.9893115639686583, "ce_loss_7": 3.6650718688964843, "epoch": 0.5, "grad_norm": 636.0, "kl_loss_10": 124.31196022033691, "kl_loss_2": 1414.2773315429688, "kl_loss_3": 1058.442919921875, "kl_loss_7": 342.9336791992188, "learning_rate": 0.000507932981917404, "loss": 758.1979, "step": 5000 }, { "ce_loss_10": 3.527126336097717, "ce_loss_13": 3.4682271480560303, "ce_loss_2": 4.170087909698486, "ce_loss_3": 3.977930450439453, "ce_loss_7": 3.6243650317192078, "epoch": 0.501, "grad_norm": 860.0, "kl_loss_10": 130.30797805786133, "kl_loss_2": 1503.0117431640624, "kl_loss_3": 1119.1187561035156, "kl_loss_7": 358.08606872558596, "learning_rate": 0.0005063464813980949, "loss": 777.8597, "step": 5010 }, { "ce_loss_10": 3.512421131134033, "ce_loss_13": 3.45424964427948, "ce_loss_2": 4.135089802742004, "ce_loss_3": 3.9414800643920898, "ce_loss_7": 3.606937575340271, "epoch": 0.502, "grad_norm": 556.0, "kl_loss_10": 127.28718528747558, "kl_loss_2": 1467.612969970703, "kl_loss_3": 1098.6405395507813, "kl_loss_7": 348.034928894043, "learning_rate": 0.0005047599169697884, "loss": 752.1499, "step": 5020 }, { "ce_loss_10": 3.4468389987945556, "ce_loss_13": 3.3897311687469482, "ce_loss_2": 4.070673036575317, "ce_loss_3": 3.880372130870819, "ce_loss_7": 3.544992959499359, "epoch": 0.503, "grad_norm": 916.0, "kl_loss_10": 124.79916343688964, "kl_loss_2": 1447.2315368652344, "kl_loss_3": 1079.9921875, "kl_loss_7": 346.10303649902346, "learning_rate": 0.000503173304609171, "loss": 739.2281, "step": 5030 }, { "ce_loss_10": 3.572928011417389, "ce_loss_13": 3.5145941257476805, "ce_loss_2": 4.187036621570587, "ce_loss_3": 4.007000887393952, "ce_loss_7": 3.668638730049133, "epoch": 0.504, "grad_norm": 588.0, "kl_loss_10": 126.00835227966309, "kl_loss_2": 1440.8026000976563, "kl_loss_3": 1078.6716094970702, "kl_loss_7": 344.7953277587891, "learning_rate": 0.0005015866602934111, "loss": 744.6847, "step": 5040 }, { "ce_loss_10": 3.5366175055503843, "ce_loss_13": 3.478814089298248, "ce_loss_2": 4.180455148220062, "ce_loss_3": 3.9842303514480593, "ce_loss_7": 3.634747123718262, "epoch": 0.505, "grad_norm": 660.0, "kl_loss_10": 130.4676456451416, "kl_loss_2": 1497.6227783203126, "kl_loss_3": 1125.589825439453, "kl_loss_7": 363.67038879394534, "learning_rate": 0.0005, "loss": 762.9002, "step": 5050 }, { "ce_loss_10": 3.526559591293335, "ce_loss_13": 3.471008539199829, "ce_loss_2": 4.1430164813995365, "ce_loss_3": 3.9600774168968202, "ce_loss_7": 3.6214876055717466, "epoch": 0.506, "grad_norm": 596.0, "kl_loss_10": 127.42528266906739, "kl_loss_2": 1458.6119445800782, "kl_loss_3": 1100.3540649414062, "kl_loss_7": 351.1239486694336, "learning_rate": 0.0004984133397065889, "loss": 745.3238, "step": 5060 }, { "ce_loss_10": 3.538331460952759, "ce_loss_13": 3.48160400390625, "ce_loss_2": 4.170895993709564, "ce_loss_3": 3.9798762083053587, "ce_loss_7": 3.6342212200164794, "epoch": 0.507, "grad_norm": 628.0, "kl_loss_10": 126.94361305236816, "kl_loss_2": 1458.0782897949218, "kl_loss_3": 1096.8780242919922, "kl_loss_7": 351.65411987304685, "learning_rate": 0.0004968266953908291, "loss": 748.5638, "step": 5070 }, { "ce_loss_10": 3.578177201747894, "ce_loss_13": 3.523463523387909, "ce_loss_2": 4.192970585823059, "ce_loss_3": 4.007475554943085, "ce_loss_7": 3.6718322396278382, "epoch": 0.508, "grad_norm": 712.0, "kl_loss_10": 125.75382385253906, "kl_loss_2": 1448.2949584960938, "kl_loss_3": 1085.0182800292969, "kl_loss_7": 344.33011016845705, "learning_rate": 0.0004952400830302117, "loss": 750.5558, "step": 5080 }, { "ce_loss_10": 3.505996084213257, "ce_loss_13": 3.44895259141922, "ce_loss_2": 4.14078243970871, "ce_loss_3": 3.951820456981659, "ce_loss_7": 3.6068713068962097, "epoch": 0.509, "grad_norm": 540.0, "kl_loss_10": 128.75954895019532, "kl_loss_2": 1493.3599365234375, "kl_loss_3": 1111.6652374267578, "kl_loss_7": 355.9008483886719, "learning_rate": 0.0004936535186019053, "loss": 759.8867, "step": 5090 }, { "ce_loss_10": 3.6047815322875976, "ce_loss_13": 3.5503739714622498, "ce_loss_2": 4.190906155109405, "ce_loss_3": 4.0159718751907345, "ce_loss_7": 3.6974563717842104, "epoch": 0.51, "grad_norm": 520.0, "kl_loss_10": 124.55697784423828, "kl_loss_2": 1396.7164306640625, "kl_loss_3": 1052.4375610351562, "kl_loss_7": 340.2462951660156, "learning_rate": 0.000492067018082596, "loss": 740.6732, "step": 5100 }, { "ce_loss_10": 3.5405318260192873, "ce_loss_13": 3.481382191181183, "ce_loss_2": 4.190889728069306, "ce_loss_3": 3.9940460085868836, "ce_loss_7": 3.6407326340675352, "epoch": 0.511, "grad_norm": 748.0, "kl_loss_10": 129.94738388061523, "kl_loss_2": 1511.246484375, "kl_loss_3": 1130.2775482177735, "kl_loss_7": 359.2450241088867, "learning_rate": 0.0004904805974483267, "loss": 784.2884, "step": 5110 }, { "ce_loss_10": 3.6526175856590273, "ce_loss_13": 3.592953288555145, "ce_loss_2": 4.278130650520325, "ce_loss_3": 4.098746013641358, "ce_loss_7": 3.7574565052986144, "epoch": 0.512, "grad_norm": 620.0, "kl_loss_10": 133.8547565460205, "kl_loss_2": 1478.876611328125, "kl_loss_3": 1118.5677642822266, "kl_loss_7": 366.43187561035154, "learning_rate": 0.0004888942726743353, "loss": 780.2876, "step": 5120 }, { "ce_loss_10": 3.5215405344963076, "ce_loss_13": 3.4650914430618287, "ce_loss_2": 4.143480885028839, "ce_loss_3": 3.9585989832878115, "ce_loss_7": 3.6219504952430723, "epoch": 0.513, "grad_norm": 752.0, "kl_loss_10": 128.29964866638184, "kl_loss_2": 1475.1177978515625, "kl_loss_3": 1104.3501556396484, "kl_loss_7": 355.6892623901367, "learning_rate": 0.0004873080597348947, "loss": 764.5013, "step": 5130 }, { "ce_loss_10": 3.410134506225586, "ce_loss_13": 3.352971625328064, "ce_loss_2": 4.06082159280777, "ce_loss_3": 3.866491210460663, "ce_loss_7": 3.5091195464134217, "epoch": 0.514, "grad_norm": 644.0, "kl_loss_10": 126.35911636352539, "kl_loss_2": 1516.3789794921875, "kl_loss_3": 1137.2391021728515, "kl_loss_7": 355.16859741210936, "learning_rate": 0.0004857219746031519, "loss": 770.8493, "step": 5140 }, { "ce_loss_10": 3.584864628314972, "ce_loss_13": 3.529874527454376, "ce_loss_2": 4.1941790103912355, "ce_loss_3": 4.00935822725296, "ce_loss_7": 3.6780989289283754, "epoch": 0.515, "grad_norm": 780.0, "kl_loss_10": 128.64718170166014, "kl_loss_2": 1438.0651428222657, "kl_loss_3": 1078.5996124267579, "kl_loss_7": 349.5986953735352, "learning_rate": 0.0004841360332509663, "loss": 752.5851, "step": 5150 }, { "ce_loss_10": 3.535393476486206, "ce_loss_13": 3.481099021434784, "ce_loss_2": 4.140241587162018, "ce_loss_3": 3.9574079871177674, "ce_loss_7": 3.634384071826935, "epoch": 0.516, "grad_norm": 640.0, "kl_loss_10": 122.83691864013672, "kl_loss_2": 1426.9576843261718, "kl_loss_3": 1065.5508270263672, "kl_loss_7": 341.7165832519531, "learning_rate": 0.0004825502516487497, "loss": 727.2895, "step": 5160 }, { "ce_loss_10": 3.4977298855781553, "ce_loss_13": 3.441702198982239, "ce_loss_2": 4.130070972442627, "ce_loss_3": 3.942211651802063, "ce_loss_7": 3.5965057730674745, "epoch": 0.517, "grad_norm": 760.0, "kl_loss_10": 127.02273902893066, "kl_loss_2": 1477.1372314453124, "kl_loss_3": 1105.9117126464844, "kl_loss_7": 351.33150787353514, "learning_rate": 0.00048096464576530507, "loss": 761.6553, "step": 5170 }, { "ce_loss_10": 3.6062949657440186, "ce_loss_13": 3.5500340938568113, "ce_loss_2": 4.195667040348053, "ce_loss_3": 4.018487000465393, "ce_loss_7": 3.6985435009002687, "epoch": 0.518, "grad_norm": 624.0, "kl_loss_10": 126.56138648986817, "kl_loss_2": 1408.2973205566407, "kl_loss_3": 1057.3329833984376, "kl_loss_7": 341.4838348388672, "learning_rate": 0.00047937923156766646, "loss": 734.8762, "step": 5180 }, { "ce_loss_10": 3.6485862135887146, "ce_loss_13": 3.5955163717269896, "ce_loss_2": 4.232599627971649, "ce_loss_3": 4.060397815704346, "ce_loss_7": 3.738468253612518, "epoch": 0.519, "grad_norm": 620.0, "kl_loss_10": 126.33506202697754, "kl_loss_2": 1394.4201049804688, "kl_loss_3": 1048.218344116211, "kl_loss_7": 340.96579895019534, "learning_rate": 0.00047779402502093696, "loss": 736.9193, "step": 5190 }, { "ce_loss_10": 3.6157581567764283, "ce_loss_13": 3.558749091625214, "ce_loss_2": 4.207399034500122, "ce_loss_3": 4.03502504825592, "ce_loss_7": 3.7083640336990356, "epoch": 0.52, "grad_norm": 672.0, "kl_loss_10": 126.87294883728028, "kl_loss_2": 1406.3830688476562, "kl_loss_3": 1064.0467315673827, "kl_loss_7": 343.44087677001954, "learning_rate": 0.0004762090420881289, "loss": 745.0921, "step": 5200 }, { "ce_loss_10": 3.535387361049652, "ce_loss_13": 3.4797078251838682, "ce_loss_2": 4.134657156467438, "ce_loss_3": 3.951842713356018, "ce_loss_7": 3.625101017951965, "epoch": 0.521, "grad_norm": 568.0, "kl_loss_10": 125.83340911865234, "kl_loss_2": 1426.3083251953126, "kl_loss_3": 1067.8335723876953, "kl_loss_7": 343.47533721923827, "learning_rate": 0.00047462429873000296, "loss": 735.2042, "step": 5210 }, { "ce_loss_10": 3.616153085231781, "ce_loss_13": 3.561689925193787, "ce_loss_2": 4.214185404777527, "ce_loss_3": 4.027488625049591, "ce_loss_7": 3.7073824644088744, "epoch": 0.522, "grad_norm": 556.0, "kl_loss_10": 127.8383171081543, "kl_loss_2": 1424.687451171875, "kl_loss_3": 1065.997296142578, "kl_loss_7": 346.1371368408203, "learning_rate": 0.0004730398109049071, "loss": 741.2838, "step": 5220 }, { "ce_loss_10": 3.54775093793869, "ce_loss_13": 3.4891308307647706, "ce_loss_2": 4.180230689048767, "ce_loss_3": 3.993181014060974, "ce_loss_7": 3.6464683175086976, "epoch": 0.523, "grad_norm": 612.0, "kl_loss_10": 129.67901077270508, "kl_loss_2": 1474.5384399414063, "kl_loss_3": 1110.3589935302734, "kl_loss_7": 357.5801712036133, "learning_rate": 0.000471455594568616, "loss": 754.4792, "step": 5230 }, { "ce_loss_10": 3.6236977100372316, "ce_loss_13": 3.567158377170563, "ce_loss_2": 4.210835099220276, "ce_loss_3": 4.030972874164581, "ce_loss_7": 3.713850724697113, "epoch": 0.524, "grad_norm": 488.0, "kl_loss_10": 127.79978103637696, "kl_loss_2": 1409.9825317382813, "kl_loss_3": 1052.4463439941405, "kl_loss_7": 343.3757125854492, "learning_rate": 0.00046987166567417086, "loss": 746.4875, "step": 5240 }, { "ce_loss_10": 3.5389233589172364, "ce_loss_13": 3.4855753421783446, "ce_loss_2": 4.151636373996735, "ce_loss_3": 3.963833916187286, "ce_loss_7": 3.633453297615051, "epoch": 0.525, "grad_norm": 700.0, "kl_loss_10": 124.83961982727051, "kl_loss_2": 1432.0410766601562, "kl_loss_3": 1068.8124633789062, "kl_loss_7": 344.24831237792966, "learning_rate": 0.00046828804017171776, "loss": 730.1019, "step": 5250 }, { "ce_loss_10": 3.5789129376411437, "ce_loss_13": 3.52143075466156, "ce_loss_2": 4.213813376426697, "ce_loss_3": 4.024446547031403, "ce_loss_7": 3.6814927458763123, "epoch": 0.526, "grad_norm": 556.0, "kl_loss_10": 128.2565517425537, "kl_loss_2": 1458.2886047363281, "kl_loss_3": 1090.4730590820313, "kl_loss_7": 352.8787322998047, "learning_rate": 0.00046670473400834805, "loss": 759.958, "step": 5260 }, { "ce_loss_10": 3.5171929001808167, "ce_loss_13": 3.4629207491874694, "ce_loss_2": 4.1135843873023985, "ce_loss_3": 3.9294076561927795, "ce_loss_7": 3.6082523465156555, "epoch": 0.527, "grad_norm": 712.0, "kl_loss_10": 123.60066146850586, "kl_loss_2": 1415.8293395996093, "kl_loss_3": 1053.1150390625, "kl_loss_7": 339.89840545654295, "learning_rate": 0.00046512176312793734, "loss": 756.1812, "step": 5270 }, { "ce_loss_10": 3.5107405304908754, "ce_loss_13": 3.453246533870697, "ce_loss_2": 4.126374876499176, "ce_loss_3": 3.936051630973816, "ce_loss_7": 3.6049073815345762, "epoch": 0.528, "grad_norm": 612.0, "kl_loss_10": 125.6848876953125, "kl_loss_2": 1449.2553955078124, "kl_loss_3": 1081.2026336669921, "kl_loss_7": 345.4993423461914, "learning_rate": 0.00046353914347098467, "loss": 756.4846, "step": 5280 }, { "ce_loss_10": 3.604662263393402, "ce_loss_13": 3.5488700747489927, "ce_loss_2": 4.218853032588958, "ce_loss_3": 4.034527897834778, "ce_loss_7": 3.6985684871673583, "epoch": 0.529, "grad_norm": 908.0, "kl_loss_10": 126.48544654846191, "kl_loss_2": 1444.1267395019531, "kl_loss_3": 1078.7888458251953, "kl_loss_7": 346.7707778930664, "learning_rate": 0.0004619568909744524, "loss": 752.0158, "step": 5290 }, { "ce_loss_10": 3.612137234210968, "ce_loss_13": 3.5561481952667235, "ce_loss_2": 4.217017912864685, "ce_loss_3": 4.034022784233093, "ce_loss_7": 3.7084673762321474, "epoch": 0.53, "grad_norm": 704.0, "kl_loss_10": 128.3290657043457, "kl_loss_2": 1421.7739501953124, "kl_loss_3": 1070.839599609375, "kl_loss_7": 349.6264083862305, "learning_rate": 0.00046037502157160573, "loss": 754.9794, "step": 5300 }, { "ce_loss_10": 3.4819830536842344, "ce_loss_13": 3.42584068775177, "ce_loss_2": 4.099390125274658, "ce_loss_3": 3.917300546169281, "ce_loss_7": 3.580989933013916, "epoch": 0.531, "grad_norm": 628.0, "kl_loss_10": 125.62411231994629, "kl_loss_2": 1453.1245910644532, "kl_loss_3": 1098.5252807617187, "kl_loss_7": 353.93557891845705, "learning_rate": 0.00045879355119185207, "loss": 756.8294, "step": 5310 }, { "ce_loss_10": 3.5613385915756224, "ce_loss_13": 3.5066535234451295, "ce_loss_2": 4.183086156845093, "ce_loss_3": 3.9961040735244753, "ce_loss_7": 3.6569605588912966, "epoch": 0.532, "grad_norm": 696.0, "kl_loss_10": 127.47045364379883, "kl_loss_2": 1474.839111328125, "kl_loss_3": 1106.3402404785156, "kl_loss_7": 356.02258758544923, "learning_rate": 0.0004572124957605803, "loss": 763.853, "step": 5320 }, { "ce_loss_10": 3.580208718776703, "ce_loss_13": 3.523788559436798, "ce_loss_2": 4.186931335926056, "ce_loss_3": 4.0057693243026735, "ce_loss_7": 3.677249050140381, "epoch": 0.533, "grad_norm": 428.0, "kl_loss_10": 125.33763389587402, "kl_loss_2": 1450.7080810546875, "kl_loss_3": 1089.869924926758, "kl_loss_7": 352.4146255493164, "learning_rate": 0.00045563187119900103, "loss": 745.5743, "step": 5330 }, { "ce_loss_10": 3.424732136726379, "ce_loss_13": 3.3700300335884092, "ce_loss_2": 4.053750395774841, "ce_loss_3": 3.866895389556885, "ce_loss_7": 3.5216084599494932, "epoch": 0.534, "grad_norm": 1168.0, "kl_loss_10": 125.33371696472167, "kl_loss_2": 1466.25302734375, "kl_loss_3": 1098.1051971435547, "kl_loss_7": 349.4919769287109, "learning_rate": 0.00045405169342398633, "loss": 760.794, "step": 5340 }, { "ce_loss_10": 3.516588735580444, "ce_loss_13": 3.456972897052765, "ce_loss_2": 4.132767844200134, "ce_loss_3": 3.951627218723297, "ce_loss_7": 3.6111021041870117, "epoch": 0.535, "grad_norm": 720.0, "kl_loss_10": 127.6759262084961, "kl_loss_2": 1454.3429931640626, "kl_loss_3": 1093.5691711425782, "kl_loss_7": 349.7331176757813, "learning_rate": 0.0004524719783479088, "loss": 745.902, "step": 5350 }, { "ce_loss_10": 3.4667383193969727, "ce_loss_13": 3.4120450973510743, "ce_loss_2": 4.107578992843628, "ce_loss_3": 3.907571184635162, "ce_loss_7": 3.5639479041099547, "epoch": 0.536, "grad_norm": 620.0, "kl_loss_10": 126.3938491821289, "kl_loss_2": 1486.8213806152344, "kl_loss_3": 1107.0321716308595, "kl_loss_7": 352.4698181152344, "learning_rate": 0.00045089274187848144, "loss": 748.4564, "step": 5360 }, { "ce_loss_10": 3.5862554907798767, "ce_loss_13": 3.5314642190933228, "ce_loss_2": 4.182982349395752, "ce_loss_3": 4.007098364830017, "ce_loss_7": 3.6815426349639893, "epoch": 0.537, "grad_norm": 780.0, "kl_loss_10": 125.15973091125488, "kl_loss_2": 1417.1767822265624, "kl_loss_3": 1063.112875366211, "kl_loss_7": 344.52098083496094, "learning_rate": 0.00044931399991859835, "loss": 740.4288, "step": 5370 }, { "ce_loss_10": 3.4537264466285706, "ce_loss_13": 3.397633194923401, "ce_loss_2": 4.068418169021607, "ce_loss_3": 3.877337634563446, "ce_loss_7": 3.548009955883026, "epoch": 0.538, "grad_norm": 536.0, "kl_loss_10": 125.85848808288574, "kl_loss_2": 1456.3653503417968, "kl_loss_3": 1089.4956604003905, "kl_loss_7": 347.38047180175784, "learning_rate": 0.00044773576836617336, "loss": 740.3193, "step": 5380 }, { "ce_loss_10": 3.5413371920585632, "ce_loss_13": 3.4863479495048524, "ce_loss_2": 4.166609585285187, "ce_loss_3": 3.9833678722381594, "ce_loss_7": 3.641672730445862, "epoch": 0.539, "grad_norm": 548.0, "kl_loss_10": 127.76419677734376, "kl_loss_2": 1480.62939453125, "kl_loss_3": 1109.371633911133, "kl_loss_7": 357.3611831665039, "learning_rate": 0.00044615806311398056, "loss": 767.7122, "step": 5390 }, { "ce_loss_10": 3.6181453466415405, "ce_loss_13": 3.567015016078949, "ce_loss_2": 4.187848663330078, "ce_loss_3": 4.017346155643463, "ce_loss_7": 3.7096580266952515, "epoch": 0.54, "grad_norm": 652.0, "kl_loss_10": 125.5275634765625, "kl_loss_2": 1384.711865234375, "kl_loss_3": 1042.3586151123047, "kl_loss_7": 339.9911834716797, "learning_rate": 0.00044458090004949454, "loss": 745.8007, "step": 5400 }, { "ce_loss_10": 3.47789705991745, "ce_loss_13": 3.418693256378174, "ce_loss_2": 4.127793288230896, "ce_loss_3": 3.9291198015213014, "ce_loss_7": 3.57582768201828, "epoch": 0.541, "grad_norm": 628.0, "kl_loss_10": 129.80282020568848, "kl_loss_2": 1531.2485473632812, "kl_loss_3": 1138.2959686279296, "kl_loss_7": 360.45466766357424, "learning_rate": 0.0004430042950547297, "loss": 759.4717, "step": 5410 }, { "ce_loss_10": 3.573679769039154, "ce_loss_13": 3.515029692649841, "ce_loss_2": 4.1977743268013, "ce_loss_3": 4.015309143066406, "ce_loss_7": 3.6747510194778443, "epoch": 0.542, "grad_norm": 624.0, "kl_loss_10": 130.55573616027831, "kl_loss_2": 1466.3621948242187, "kl_loss_3": 1100.1870544433593, "kl_loss_7": 357.17127685546876, "learning_rate": 0.0004414282640060809, "loss": 755.4526, "step": 5420 }, { "ce_loss_10": 3.665824794769287, "ce_loss_13": 3.607898008823395, "ce_loss_2": 4.26334011554718, "ce_loss_3": 4.0842081785202025, "ce_loss_7": 3.765260875225067, "epoch": 0.543, "grad_norm": 680.0, "kl_loss_10": 130.16803436279298, "kl_loss_2": 1410.027520751953, "kl_loss_3": 1066.1269775390624, "kl_loss_7": 359.3704772949219, "learning_rate": 0.0004398528227741633, "loss": 755.4893, "step": 5430 }, { "ce_loss_10": 3.533736264705658, "ce_loss_13": 3.4744524717330934, "ce_loss_2": 4.148015642166138, "ce_loss_3": 3.9667993783950806, "ce_loss_7": 3.6348182201385497, "epoch": 0.544, "grad_norm": 676.0, "kl_loss_10": 133.03388061523438, "kl_loss_2": 1434.2911987304688, "kl_loss_3": 1079.9505981445313, "kl_loss_7": 359.5822357177734, "learning_rate": 0.00043827798722365264, "loss": 762.0716, "step": 5440 }, { "ce_loss_10": 3.6589300632476807, "ce_loss_13": 3.6001421213150024, "ce_loss_2": 4.238667392730713, "ce_loss_3": 4.060091936588288, "ce_loss_7": 3.750708055496216, "epoch": 0.545, "grad_norm": 494.0, "kl_loss_10": 131.36463470458983, "kl_loss_2": 1393.9716003417968, "kl_loss_3": 1045.8557373046874, "kl_loss_7": 350.9205383300781, "learning_rate": 0.00043670377321312535, "loss": 729.8014, "step": 5450 }, { "ce_loss_10": 3.656389832496643, "ce_loss_13": 3.600922691822052, "ce_loss_2": 4.24760650396347, "ce_loss_3": 4.063620638847351, "ce_loss_7": 3.7452564239501953, "epoch": 0.546, "grad_norm": 556.0, "kl_loss_10": 128.25653953552245, "kl_loss_2": 1405.696063232422, "kl_loss_3": 1049.458319091797, "kl_loss_7": 344.41868896484374, "learning_rate": 0.0004351301965948991, "loss": 746.7757, "step": 5460 }, { "ce_loss_10": 3.5661354064941406, "ce_loss_13": 3.508555901050568, "ce_loss_2": 4.156990563869476, "ce_loss_3": 3.9813518643379213, "ce_loss_7": 3.6595167994499205, "epoch": 0.547, "grad_norm": 600.0, "kl_loss_10": 126.90561714172364, "kl_loss_2": 1396.3604797363282, "kl_loss_3": 1049.8896087646485, "kl_loss_7": 342.8857620239258, "learning_rate": 0.000433557273214873, "loss": 740.9387, "step": 5470 }, { "ce_loss_10": 3.5527311325073243, "ce_loss_13": 3.497834849357605, "ce_loss_2": 4.150702881813049, "ce_loss_3": 3.9698083996772766, "ce_loss_7": 3.6472991704940796, "epoch": 0.548, "grad_norm": 644.0, "kl_loss_10": 126.52342948913574, "kl_loss_2": 1407.9827697753906, "kl_loss_3": 1056.515103149414, "kl_loss_7": 345.3242782592773, "learning_rate": 0.000431985018912368, "loss": 732.1462, "step": 5480 }, { "ce_loss_10": 3.5216315269470213, "ce_loss_13": 3.4642876982688904, "ce_loss_2": 4.148514151573181, "ce_loss_3": 3.9579554200172424, "ce_loss_7": 3.61717346906662, "epoch": 0.549, "grad_norm": 458.0, "kl_loss_10": 129.01484336853028, "kl_loss_2": 1464.8551086425782, "kl_loss_3": 1100.7810821533203, "kl_loss_7": 353.8451156616211, "learning_rate": 0.0004304134495199674, "loss": 742.2356, "step": 5490 }, { "ce_loss_10": 3.551531457901001, "ce_loss_13": 3.4940086603164673, "ce_loss_2": 4.170977103710174, "ce_loss_3": 3.9918709278106688, "ce_loss_7": 3.6487022042274475, "epoch": 0.55, "grad_norm": 696.0, "kl_loss_10": 129.0471164703369, "kl_loss_2": 1483.5442321777343, "kl_loss_3": 1116.8151824951171, "kl_loss_7": 357.02617797851565, "learning_rate": 0.0004288425808633575, "loss": 757.0911, "step": 5500 }, { "ce_loss_10": 3.524256336688995, "ce_loss_13": 3.4696091651916503, "ce_loss_2": 4.134656190872192, "ce_loss_3": 3.948019301891327, "ce_loss_7": 3.6168912172317507, "epoch": 0.551, "grad_norm": 764.0, "kl_loss_10": 124.78674278259277, "kl_loss_2": 1451.5693908691405, "kl_loss_3": 1084.9963470458983, "kl_loss_7": 345.0725372314453, "learning_rate": 0.0004272724287611684, "loss": 748.5045, "step": 5510 }, { "ce_loss_10": 3.5044045448303223, "ce_loss_13": 3.4458850502967833, "ce_loss_2": 4.11513135433197, "ce_loss_3": 3.923069179058075, "ce_loss_7": 3.598716700077057, "epoch": 0.552, "grad_norm": 564.0, "kl_loss_10": 128.57932319641114, "kl_loss_2": 1450.3947509765626, "kl_loss_3": 1084.4042419433595, "kl_loss_7": 349.6862258911133, "learning_rate": 0.00042570300902481425, "loss": 748.2176, "step": 5520 }, { "ce_loss_10": 3.5342564582824707, "ce_loss_13": 3.4802250385284426, "ce_loss_2": 4.127187561988831, "ce_loss_3": 3.9438385248184202, "ce_loss_7": 3.6231723546981813, "epoch": 0.553, "grad_norm": 684.0, "kl_loss_10": 125.04237670898438, "kl_loss_2": 1423.2647521972656, "kl_loss_3": 1067.049887084961, "kl_loss_7": 344.51769561767577, "learning_rate": 0.00042413433745833423, "loss": 740.4593, "step": 5530 }, { "ce_loss_10": 3.537394309043884, "ce_loss_13": 3.4792711973190307, "ce_loss_2": 4.149854254722595, "ce_loss_3": 3.968058681488037, "ce_loss_7": 3.6317521929740906, "epoch": 0.554, "grad_norm": 592.0, "kl_loss_10": 126.85282897949219, "kl_loss_2": 1431.3300415039062, "kl_loss_3": 1076.83173828125, "kl_loss_7": 346.4241668701172, "learning_rate": 0.0004225664298582339, "loss": 727.5832, "step": 5540 }, { "ce_loss_10": 3.61503586769104, "ce_loss_13": 3.5593451619148255, "ce_loss_2": 4.2083780169487, "ce_loss_3": 4.028390157222748, "ce_loss_7": 3.706908369064331, "epoch": 0.555, "grad_norm": 472.0, "kl_loss_10": 124.98624839782715, "kl_loss_2": 1399.5473022460938, "kl_loss_3": 1052.0424438476562, "kl_loss_7": 341.1046676635742, "learning_rate": 0.000420999302013325, "loss": 731.5678, "step": 5550 }, { "ce_loss_10": 3.5158777952194216, "ce_loss_13": 3.4559564113616945, "ce_loss_2": 4.1452751636505125, "ce_loss_3": 3.9481736540794374, "ce_loss_7": 3.6144383549690247, "epoch": 0.556, "grad_norm": 696.0, "kl_loss_10": 130.04578018188477, "kl_loss_2": 1463.335516357422, "kl_loss_3": 1083.2877349853516, "kl_loss_7": 355.2161560058594, "learning_rate": 0.000419432969704568, "loss": 741.9638, "step": 5560 }, { "ce_loss_10": 3.5586655020713804, "ce_loss_13": 3.503348696231842, "ce_loss_2": 4.1602645993232725, "ce_loss_3": 3.976087248325348, "ce_loss_7": 3.6495119094848634, "epoch": 0.557, "grad_norm": 478.0, "kl_loss_10": 125.02511405944824, "kl_loss_2": 1410.6633605957031, "kl_loss_3": 1053.2783630371093, "kl_loss_7": 341.9718505859375, "learning_rate": 0.00041786744870491154, "loss": 750.6796, "step": 5570 }, { "ce_loss_10": 3.49469313621521, "ce_loss_13": 3.4374850153923036, "ce_loss_2": 4.115650498867035, "ce_loss_3": 3.9304126381874083, "ce_loss_7": 3.589133381843567, "epoch": 0.558, "grad_norm": 636.0, "kl_loss_10": 128.60980453491212, "kl_loss_2": 1454.5861450195312, "kl_loss_3": 1099.7731384277345, "kl_loss_7": 352.64312438964845, "learning_rate": 0.0004163027547791347, "loss": 750.0387, "step": 5580 }, { "ce_loss_10": 3.4733791589736938, "ce_loss_13": 3.417029893398285, "ce_loss_2": 4.109368133544922, "ce_loss_3": 3.916108226776123, "ce_loss_7": 3.57048362493515, "epoch": 0.559, "grad_norm": 544.0, "kl_loss_10": 126.46923561096192, "kl_loss_2": 1481.6322875976562, "kl_loss_3": 1110.3733978271484, "kl_loss_7": 353.67320861816404, "learning_rate": 0.0004147389036836881, "loss": 753.4516, "step": 5590 }, { "ce_loss_10": 3.522750961780548, "ce_loss_13": 3.4660881876945497, "ce_loss_2": 4.150948774814606, "ce_loss_3": 3.9613837599754333, "ce_loss_7": 3.6164920568466186, "epoch": 0.56, "grad_norm": 1008.0, "kl_loss_10": 127.6525722503662, "kl_loss_2": 1456.663018798828, "kl_loss_3": 1093.7019775390625, "kl_loss_7": 348.2412384033203, "learning_rate": 0.00041317591116653486, "loss": 760.9835, "step": 5600 }, { "ce_loss_10": 3.5656349301338195, "ce_loss_13": 3.506862556934357, "ce_loss_2": 4.182165312767029, "ce_loss_3": 3.996174454689026, "ce_loss_7": 3.6606045246124266, "epoch": 0.561, "grad_norm": 544.0, "kl_loss_10": 130.3198528289795, "kl_loss_2": 1451.4231079101562, "kl_loss_3": 1085.5545135498046, "kl_loss_7": 355.61197052001955, "learning_rate": 0.0004116137929669921, "loss": 742.7544, "step": 5610 }, { "ce_loss_10": 3.550603926181793, "ce_loss_13": 3.4951741695404053, "ce_loss_2": 4.154799246788025, "ce_loss_3": 3.9741489410400392, "ce_loss_7": 3.642624282836914, "epoch": 0.562, "grad_norm": 612.0, "kl_loss_10": 125.04610176086426, "kl_loss_2": 1434.1698974609376, "kl_loss_3": 1079.523110961914, "kl_loss_7": 345.24672241210936, "learning_rate": 0.00041005256481557305, "loss": 736.6574, "step": 5620 }, { "ce_loss_10": 3.6528918623924254, "ce_loss_13": 3.599192976951599, "ce_loss_2": 4.227750253677368, "ce_loss_3": 4.054849910736084, "ce_loss_7": 3.7427762031555174, "epoch": 0.563, "grad_norm": 960.0, "kl_loss_10": 123.06231956481933, "kl_loss_2": 1355.2447143554687, "kl_loss_3": 1017.4669219970704, "kl_loss_7": 334.08680114746096, "learning_rate": 0.00040849224243382767, "loss": 721.732, "step": 5630 }, { "ce_loss_10": 3.5077744126319885, "ce_loss_13": 3.4527820706367494, "ce_loss_2": 4.120105528831482, "ce_loss_3": 3.935258185863495, "ce_loss_7": 3.6035592913627625, "epoch": 0.564, "grad_norm": 572.0, "kl_loss_10": 124.89362907409668, "kl_loss_2": 1440.2459045410155, "kl_loss_3": 1077.581704711914, "kl_loss_7": 345.66795349121094, "learning_rate": 0.000406932841534185, "loss": 734.8656, "step": 5640 }, { "ce_loss_10": 3.4617214798927307, "ce_loss_13": 3.404375874996185, "ce_loss_2": 4.0868846535682675, "ce_loss_3": 3.9023816704750063, "ce_loss_7": 3.5591673016548158, "epoch": 0.565, "grad_norm": 736.0, "kl_loss_10": 126.24258880615234, "kl_loss_2": 1461.9494995117188, "kl_loss_3": 1098.3101470947265, "kl_loss_7": 351.8350204467773, "learning_rate": 0.0004053743778197951, "loss": 763.8107, "step": 5650 }, { "ce_loss_10": 3.5757392168045046, "ce_loss_13": 3.516366720199585, "ce_loss_2": 4.176673400402069, "ce_loss_3": 3.9969864964485167, "ce_loss_7": 3.6699735283851624, "epoch": 0.566, "grad_norm": 556.0, "kl_loss_10": 129.16880264282227, "kl_loss_2": 1421.8304565429687, "kl_loss_3": 1070.1267395019531, "kl_loss_7": 347.0413360595703, "learning_rate": 0.0004038168669843697, "loss": 753.9809, "step": 5660 }, { "ce_loss_10": 3.5368810892105103, "ce_loss_13": 3.480051815509796, "ce_loss_2": 4.12183108329773, "ce_loss_3": 3.947605645656586, "ce_loss_7": 3.6282246232032778, "epoch": 0.567, "grad_norm": 700.0, "kl_loss_10": 124.853763961792, "kl_loss_2": 1403.9633483886719, "kl_loss_3": 1055.4384826660157, "kl_loss_7": 340.73865661621096, "learning_rate": 0.000402260324712026, "loss": 742.4323, "step": 5670 }, { "ce_loss_10": 3.5802783489227297, "ce_loss_13": 3.5246822237968445, "ce_loss_2": 4.191337883472443, "ce_loss_3": 4.009469735622406, "ce_loss_7": 3.674550974369049, "epoch": 0.568, "grad_norm": 704.0, "kl_loss_10": 125.56253471374512, "kl_loss_2": 1435.6087524414063, "kl_loss_3": 1077.026220703125, "kl_loss_7": 344.2860565185547, "learning_rate": 0.00040070476667712743, "loss": 736.5972, "step": 5680 }, { "ce_loss_10": 3.6075830340385435, "ce_loss_13": 3.5494349122047426, "ce_loss_2": 4.2079323649406435, "ce_loss_3": 4.023375844955444, "ce_loss_7": 3.7011712551116944, "epoch": 0.569, "grad_norm": 652.0, "kl_loss_10": 127.4619140625, "kl_loss_2": 1424.6428833007812, "kl_loss_3": 1060.6977600097657, "kl_loss_7": 344.1405715942383, "learning_rate": 0.0003991502085441259, "loss": 745.1114, "step": 5690 }, { "ce_loss_10": 3.6421966791152953, "ce_loss_13": 3.5865265488624574, "ce_loss_2": 4.215503621101379, "ce_loss_3": 4.038523530960083, "ce_loss_7": 3.7318795323371887, "epoch": 0.57, "grad_norm": 552.0, "kl_loss_10": 124.02103271484376, "kl_loss_2": 1365.1710144042968, "kl_loss_3": 1022.5380310058594, "kl_loss_7": 335.7863967895508, "learning_rate": 0.0003975966659674047, "loss": 734.2836, "step": 5700 }, { "ce_loss_10": 3.6048430800437927, "ce_loss_13": 3.549668312072754, "ce_loss_2": 4.207078647613526, "ce_loss_3": 4.021631062030792, "ce_loss_7": 3.695444619655609, "epoch": 0.571, "grad_norm": 680.0, "kl_loss_10": 126.19835929870605, "kl_loss_2": 1414.5069702148437, "kl_loss_3": 1054.5915283203126, "kl_loss_7": 343.2915603637695, "learning_rate": 0.0003960441545911204, "loss": 733.8609, "step": 5710 }, { "ce_loss_10": 3.6052708625793457, "ce_loss_13": 3.5485535979270937, "ce_loss_2": 4.193977308273316, "ce_loss_3": 4.016455709934235, "ce_loss_7": 3.696723520755768, "epoch": 0.572, "grad_norm": 652.0, "kl_loss_10": 125.34993591308594, "kl_loss_2": 1414.3463562011718, "kl_loss_3": 1059.2802703857421, "kl_loss_7": 344.37230987548827, "learning_rate": 0.0003944926900490452, "loss": 732.3417, "step": 5720 }, { "ce_loss_10": 3.519063436985016, "ce_loss_13": 3.4620590448379516, "ce_loss_2": 4.135345363616944, "ce_loss_3": 3.9530657052993776, "ce_loss_7": 3.614231622219086, "epoch": 0.573, "grad_norm": 588.0, "kl_loss_10": 125.74856452941894, "kl_loss_2": 1450.9848693847657, "kl_loss_3": 1088.1497314453125, "kl_loss_7": 348.66001892089844, "learning_rate": 0.0003929422879644099, "loss": 736.7084, "step": 5730 }, { "ce_loss_10": 3.5202754855155947, "ce_loss_13": 3.466830384731293, "ce_loss_2": 4.112395560741424, "ce_loss_3": 3.926817226409912, "ce_loss_7": 3.6115634441375732, "epoch": 0.574, "grad_norm": 780.0, "kl_loss_10": 122.84286041259766, "kl_loss_2": 1409.6117370605468, "kl_loss_3": 1058.0203704833984, "kl_loss_7": 337.2828994750977, "learning_rate": 0.0003913929639497462, "loss": 725.4092, "step": 5740 }, { "ce_loss_10": 3.4744406223297117, "ce_loss_13": 3.417355275154114, "ce_loss_2": 4.097714972496033, "ce_loss_3": 3.902831184864044, "ce_loss_7": 3.5695701360702516, "epoch": 0.575, "grad_norm": 524.0, "kl_loss_10": 124.28501434326172, "kl_loss_2": 1441.8303283691407, "kl_loss_3": 1071.458563232422, "kl_loss_7": 341.63841705322267, "learning_rate": 0.00038984473360672965, "loss": 732.2352, "step": 5750 }, { "ce_loss_10": 3.4835135340690613, "ce_loss_13": 3.42867751121521, "ce_loss_2": 4.105578863620758, "ce_loss_3": 3.919316029548645, "ce_loss_7": 3.576983118057251, "epoch": 0.576, "grad_norm": 604.0, "kl_loss_10": 122.92881278991699, "kl_loss_2": 1439.3748413085937, "kl_loss_3": 1074.1585357666015, "kl_loss_7": 339.31846008300784, "learning_rate": 0.0003882976125260229, "loss": 730.0831, "step": 5760 }, { "ce_loss_10": 3.5526882290840147, "ce_loss_13": 3.4981236934661863, "ce_loss_2": 4.157402575016022, "ce_loss_3": 3.970548963546753, "ce_loss_7": 3.6471086144447327, "epoch": 0.577, "grad_norm": 556.0, "kl_loss_10": 125.73483810424804, "kl_loss_2": 1407.3273315429688, "kl_loss_3": 1052.3861572265625, "kl_loss_7": 341.37246856689455, "learning_rate": 0.00038675161628711776, "loss": 735.7479, "step": 5770 }, { "ce_loss_10": 3.58783597946167, "ce_loss_13": 3.533314561843872, "ce_loss_2": 4.171427321434021, "ce_loss_3": 3.994054675102234, "ce_loss_7": 3.681211507320404, "epoch": 0.578, "grad_norm": 560.0, "kl_loss_10": 125.05509986877442, "kl_loss_2": 1389.438739013672, "kl_loss_3": 1037.8104064941406, "kl_loss_7": 339.4297866821289, "learning_rate": 0.0003852067604581794, "loss": 745.1567, "step": 5780 }, { "ce_loss_10": 3.533677875995636, "ce_loss_13": 3.4786616802215575, "ce_loss_2": 4.1458081841468815, "ce_loss_3": 3.950434994697571, "ce_loss_7": 3.6267109632492067, "epoch": 0.579, "grad_norm": 660.0, "kl_loss_10": 123.52711334228516, "kl_loss_2": 1443.4204162597657, "kl_loss_3": 1072.433172607422, "kl_loss_7": 341.32163391113284, "learning_rate": 0.0003836630605958888, "loss": 735.883, "step": 5790 }, { "ce_loss_10": 3.590009105205536, "ce_loss_13": 3.5354955911636354, "ce_loss_2": 4.183343994617462, "ce_loss_3": 4.006175303459168, "ce_loss_7": 3.6826868414878846, "epoch": 0.58, "grad_norm": 724.0, "kl_loss_10": 125.50635833740235, "kl_loss_2": 1422.5762878417968, "kl_loss_3": 1067.5176635742187, "kl_loss_7": 343.88026275634763, "learning_rate": 0.0003821205322452863, "loss": 758.9688, "step": 5800 }, { "ce_loss_10": 3.5736894965171815, "ce_loss_13": 3.518394339084625, "ce_loss_2": 4.166361927986145, "ce_loss_3": 3.9780752897262572, "ce_loss_7": 3.658141016960144, "epoch": 0.581, "grad_norm": 520.0, "kl_loss_10": 124.10846366882325, "kl_loss_2": 1410.343994140625, "kl_loss_3": 1048.025698852539, "kl_loss_7": 337.55668487548826, "learning_rate": 0.0003805791909396155, "loss": 735.6956, "step": 5810 }, { "ce_loss_10": 3.524351119995117, "ce_loss_13": 3.471441614627838, "ce_loss_2": 4.123242568969727, "ce_loss_3": 3.9424999237060545, "ce_loss_7": 3.6170790791511536, "epoch": 0.582, "grad_norm": 656.0, "kl_loss_10": 123.72361793518067, "kl_loss_2": 1420.7679870605468, "kl_loss_3": 1060.7876434326172, "kl_loss_7": 339.7288070678711, "learning_rate": 0.0003790390522001662, "loss": 741.0435, "step": 5820 }, { "ce_loss_10": 3.455526554584503, "ce_loss_13": 3.4017743825912476, "ce_loss_2": 4.064185953140258, "ce_loss_3": 3.875770378112793, "ce_loss_7": 3.5457238078117372, "epoch": 0.583, "grad_norm": 668.0, "kl_loss_10": 122.62833824157715, "kl_loss_2": 1442.9026000976562, "kl_loss_3": 1073.4856384277343, "kl_loss_7": 338.7090118408203, "learning_rate": 0.0003775001315361183, "loss": 731.9279, "step": 5830 }, { "ce_loss_10": 3.573053014278412, "ce_loss_13": 3.5164970636367796, "ce_loss_2": 4.179701626300812, "ce_loss_3": 3.9943284034729003, "ce_loss_7": 3.6686230897903442, "epoch": 0.584, "grad_norm": 532.0, "kl_loss_10": 126.27806777954102, "kl_loss_2": 1439.9238403320312, "kl_loss_3": 1072.7950469970704, "kl_loss_7": 343.28225860595705, "learning_rate": 0.0003759624444443858, "loss": 741.2989, "step": 5840 }, { "ce_loss_10": 3.6071199655532835, "ce_loss_13": 3.5526736259460447, "ce_loss_2": 4.190285313129425, "ce_loss_3": 4.009751296043396, "ce_loss_7": 3.695155990123749, "epoch": 0.585, "grad_norm": 628.0, "kl_loss_10": 124.63733139038087, "kl_loss_2": 1396.041973876953, "kl_loss_3": 1037.517807006836, "kl_loss_7": 335.564924621582, "learning_rate": 0.00037442600640946044, "loss": 725.2555, "step": 5850 }, { "ce_loss_10": 3.5607337236404417, "ce_loss_13": 3.5095672369003297, "ce_loss_2": 4.148695635795593, "ce_loss_3": 3.9706854939460756, "ce_loss_7": 3.6532173633575438, "epoch": 0.586, "grad_norm": 672.0, "kl_loss_10": 123.19213104248047, "kl_loss_2": 1393.3970886230468, "kl_loss_3": 1047.943051147461, "kl_loss_7": 339.57422332763673, "learning_rate": 0.00037289083290325663, "loss": 720.2672, "step": 5860 }, { "ce_loss_10": 3.5448028206825257, "ce_loss_13": 3.488830196857452, "ce_loss_2": 4.1354421257972716, "ce_loss_3": 3.9604512453079224, "ce_loss_7": 3.6347436904907227, "epoch": 0.587, "grad_norm": 572.0, "kl_loss_10": 126.071875, "kl_loss_2": 1390.8989135742188, "kl_loss_3": 1037.6858978271484, "kl_loss_7": 339.9308090209961, "learning_rate": 0.0003713569393849543, "loss": 722.7879, "step": 5870 }, { "ce_loss_10": 3.597252070903778, "ce_loss_13": 3.5426252484321594, "ce_loss_2": 4.190739142894745, "ce_loss_3": 4.007934546470642, "ce_loss_7": 3.68798006772995, "epoch": 0.588, "grad_norm": 616.0, "kl_loss_10": 124.46675567626953, "kl_loss_2": 1411.17705078125, "kl_loss_3": 1055.4470306396483, "kl_loss_7": 337.98778839111327, "learning_rate": 0.00036982434130084397, "loss": 734.5103, "step": 5880 }, { "ce_loss_10": 3.508778083324432, "ce_loss_13": 3.4500787973403932, "ce_loss_2": 4.1147748827934265, "ce_loss_3": 3.9282889366149902, "ce_loss_7": 3.6036253452301024, "epoch": 0.589, "grad_norm": 732.0, "kl_loss_10": 127.70808792114258, "kl_loss_2": 1428.9704956054688, "kl_loss_3": 1069.9872589111328, "kl_loss_7": 346.86595458984374, "learning_rate": 0.00036829305408417166, "loss": 744.0622, "step": 5890 }, { "ce_loss_10": 3.4944640517234804, "ce_loss_13": 3.439568829536438, "ce_loss_2": 4.117368769645691, "ce_loss_3": 3.928926873207092, "ce_loss_7": 3.5905642032623293, "epoch": 0.59, "grad_norm": 920.0, "kl_loss_10": 127.46843223571777, "kl_loss_2": 1459.3504028320312, "kl_loss_3": 1088.396890258789, "kl_loss_7": 350.43089599609374, "learning_rate": 0.0003667630931549826, "loss": 743.2958, "step": 5900 }, { "ce_loss_10": 3.4649301290512087, "ce_loss_13": 3.407567024230957, "ce_loss_2": 4.097899031639099, "ce_loss_3": 3.909872758388519, "ce_loss_7": 3.560973751544952, "epoch": 0.591, "grad_norm": 820.0, "kl_loss_10": 125.37057266235351, "kl_loss_2": 1485.4420349121094, "kl_loss_3": 1107.6814758300782, "kl_loss_7": 347.6374969482422, "learning_rate": 0.00036523447391996613, "loss": 752.9677, "step": 5910 }, { "ce_loss_10": 3.5579336881637573, "ce_loss_13": 3.5037983298301696, "ce_loss_2": 4.148412072658539, "ce_loss_3": 3.9693949699401854, "ce_loss_7": 3.6502652764320374, "epoch": 0.592, "grad_norm": 540.0, "kl_loss_10": 122.69281196594238, "kl_loss_2": 1394.9801696777345, "kl_loss_3": 1040.2122680664063, "kl_loss_7": 341.23377685546876, "learning_rate": 0.00036370721177230114, "loss": 725.8651, "step": 5920 }, { "ce_loss_10": 3.553625154495239, "ce_loss_13": 3.4975630164146425, "ce_loss_2": 4.162805891036987, "ce_loss_3": 3.9766315698623655, "ce_loss_7": 3.648140561580658, "epoch": 0.593, "grad_norm": 506.0, "kl_loss_10": 125.43215103149414, "kl_loss_2": 1439.8947875976562, "kl_loss_3": 1073.3429077148437, "kl_loss_7": 345.2577102661133, "learning_rate": 0.00036218132209150044, "loss": 740.7089, "step": 5930 }, { "ce_loss_10": 3.5076088428497316, "ce_loss_13": 3.4480642318725585, "ce_loss_2": 4.140249872207642, "ce_loss_3": 3.9551444053649902, "ce_loss_7": 3.609627366065979, "epoch": 0.594, "grad_norm": 512.0, "kl_loss_10": 129.52795867919923, "kl_loss_2": 1482.7498962402344, "kl_loss_3": 1115.7573791503905, "kl_loss_7": 355.26904449462893, "learning_rate": 0.0003606568202432562, "loss": 752.0646, "step": 5940 }, { "ce_loss_10": 3.57774213552475, "ce_loss_13": 3.5228155970573427, "ce_loss_2": 4.19405552148819, "ce_loss_3": 4.006729650497436, "ce_loss_7": 3.6705931544303896, "epoch": 0.595, "grad_norm": 796.0, "kl_loss_10": 127.42050399780274, "kl_loss_2": 1457.0114624023438, "kl_loss_3": 1078.6867706298829, "kl_loss_7": 347.5483764648437, "learning_rate": 0.0003591337215792851, "loss": 738.7038, "step": 5950 }, { "ce_loss_10": 3.6211448907852173, "ce_loss_13": 3.567346215248108, "ce_loss_2": 4.192822194099426, "ce_loss_3": 4.018707859516144, "ce_loss_7": 3.7064581990242003, "epoch": 0.596, "grad_norm": 504.0, "kl_loss_10": 124.89567108154297, "kl_loss_2": 1390.8341674804688, "kl_loss_3": 1045.144497680664, "kl_loss_7": 337.26280822753904, "learning_rate": 0.00035761204143717383, "loss": 735.5005, "step": 5960 }, { "ce_loss_10": 3.5706104040145874, "ce_loss_13": 3.514435362815857, "ce_loss_2": 4.1687785387039185, "ce_loss_3": 3.9898008584976195, "ce_loss_7": 3.6620924711227416, "epoch": 0.597, "grad_norm": 752.0, "kl_loss_10": 125.53282928466797, "kl_loss_2": 1417.2740112304687, "kl_loss_3": 1066.1643615722655, "kl_loss_7": 341.81075897216795, "learning_rate": 0.0003560917951402245, "loss": 752.2635, "step": 5970 }, { "ce_loss_10": 3.5460765242576597, "ce_loss_13": 3.4938451290130614, "ce_loss_2": 4.134422564506531, "ce_loss_3": 3.9595829844474792, "ce_loss_7": 3.6350401043891907, "epoch": 0.598, "grad_norm": 732.0, "kl_loss_10": 123.55417900085449, "kl_loss_2": 1403.7479370117187, "kl_loss_3": 1060.9109008789062, "kl_loss_7": 338.4989807128906, "learning_rate": 0.00035457299799730046, "loss": 729.4276, "step": 5980 }, { "ce_loss_10": 3.613729405403137, "ce_loss_13": 3.560684585571289, "ce_loss_2": 4.205018150806427, "ce_loss_3": 4.024239468574524, "ce_loss_7": 3.7021196484565735, "epoch": 0.599, "grad_norm": 506.0, "kl_loss_10": 123.72282943725585, "kl_loss_2": 1406.9813415527344, "kl_loss_3": 1052.2019958496094, "kl_loss_7": 339.51587066650393, "learning_rate": 0.0003530556653026721, "loss": 741.729, "step": 5990 }, { "ce_loss_10": 3.523304808139801, "ce_loss_13": 3.4713462948799134, "ce_loss_2": 4.125553596019745, "ce_loss_3": 3.935919165611267, "ce_loss_7": 3.6131145000457763, "epoch": 0.6, "grad_norm": 1792.0, "kl_loss_10": 122.18126983642578, "kl_loss_2": 1418.4766357421875, "kl_loss_3": 1045.9818206787108, "kl_loss_7": 333.5164306640625, "learning_rate": 0.00035153981233586274, "loss": 736.1904, "step": 6000 }, { "ce_loss_10": 3.503693675994873, "ce_loss_13": 3.449203038215637, "ce_loss_2": 4.1076519846916195, "ce_loss_3": 3.9302319645881654, "ce_loss_7": 3.59748318195343, "epoch": 0.601, "grad_norm": 624.0, "kl_loss_10": 123.16584014892578, "kl_loss_2": 1427.8430603027343, "kl_loss_3": 1067.8656494140625, "kl_loss_7": 338.2985580444336, "learning_rate": 0.00035002545436149473, "loss": 755.88, "step": 6010 }, { "ce_loss_10": 3.5142654895782472, "ce_loss_13": 3.4579913139343263, "ce_loss_2": 4.128756499290466, "ce_loss_3": 3.9431054472923277, "ce_loss_7": 3.6089595675468447, "epoch": 0.602, "grad_norm": 732.0, "kl_loss_10": 128.0988456726074, "kl_loss_2": 1457.7831787109376, "kl_loss_3": 1094.435125732422, "kl_loss_7": 346.98584442138673, "learning_rate": 0.0003485126066291364, "loss": 739.3451, "step": 6020 }, { "ce_loss_10": 3.560540997982025, "ce_loss_13": 3.5067087769508363, "ce_loss_2": 4.174036264419556, "ce_loss_3": 3.9827836632728575, "ce_loss_7": 3.651438629627228, "epoch": 0.603, "grad_norm": 540.0, "kl_loss_10": 123.4724838256836, "kl_loss_2": 1428.8510314941407, "kl_loss_3": 1061.573553466797, "kl_loss_7": 336.35887298583987, "learning_rate": 0.0003470012843731476, "loss": 740.6739, "step": 6030 }, { "ce_loss_10": 3.50463045835495, "ce_loss_13": 3.4489762425422668, "ce_loss_2": 4.112862968444825, "ce_loss_3": 3.9258066773414613, "ce_loss_7": 3.596073019504547, "epoch": 0.604, "grad_norm": 640.0, "kl_loss_10": 122.90189971923829, "kl_loss_2": 1436.7977661132813, "kl_loss_3": 1067.8268920898438, "kl_loss_7": 338.7447814941406, "learning_rate": 0.00034549150281252633, "loss": 752.0526, "step": 6040 }, { "ce_loss_10": 3.4799697160720826, "ce_loss_13": 3.424366593360901, "ce_loss_2": 4.0825390934944155, "ce_loss_3": 3.8963089108467104, "ce_loss_7": 3.5723769664764404, "epoch": 0.605, "grad_norm": 516.0, "kl_loss_10": 123.8424732208252, "kl_loss_2": 1400.4283142089844, "kl_loss_3": 1041.9283935546875, "kl_loss_7": 336.8849105834961, "learning_rate": 0.0003439832771507565, "loss": 727.6076, "step": 6050 }, { "ce_loss_10": 3.486681044101715, "ce_loss_13": 3.432289206981659, "ce_loss_2": 4.096179568767548, "ce_loss_3": 3.9118806958198546, "ce_loss_7": 3.5778406381607057, "epoch": 0.606, "grad_norm": 524.0, "kl_loss_10": 123.29789390563965, "kl_loss_2": 1440.8687683105468, "kl_loss_3": 1077.3507263183594, "kl_loss_7": 339.76096496582034, "learning_rate": 0.0003424766225756537, "loss": 735.1911, "step": 6060 }, { "ce_loss_10": 3.5489431023597717, "ce_loss_13": 3.4944167613983153, "ce_loss_2": 4.150770962238312, "ce_loss_3": 3.9680647253990173, "ce_loss_7": 3.642049860954285, "epoch": 0.607, "grad_norm": 544.0, "kl_loss_10": 125.39007148742675, "kl_loss_2": 1412.7550415039063, "kl_loss_3": 1051.9062286376952, "kl_loss_7": 340.649674987793, "learning_rate": 0.00034097155425921255, "loss": 725.0725, "step": 6070 }, { "ce_loss_10": 3.4372221708297728, "ce_loss_13": 3.381313109397888, "ce_loss_2": 4.052776217460632, "ce_loss_3": 3.8650416135787964, "ce_loss_7": 3.5328315615653993, "epoch": 0.608, "grad_norm": 1456.0, "kl_loss_10": 124.18506164550782, "kl_loss_2": 1445.7065368652343, "kl_loss_3": 1072.234182739258, "kl_loss_7": 342.8041061401367, "learning_rate": 0.0003394680873574546, "loss": 736.824, "step": 6080 }, { "ce_loss_10": 3.553036856651306, "ce_loss_13": 3.4948946237564087, "ce_loss_2": 4.163804018497467, "ce_loss_3": 3.9761640787124635, "ce_loss_7": 3.6420689702033995, "epoch": 0.609, "grad_norm": 640.0, "kl_loss_10": 125.32937316894531, "kl_loss_2": 1441.000421142578, "kl_loss_3": 1071.1825592041016, "kl_loss_7": 339.07115783691404, "learning_rate": 0.0003379662370102747, "loss": 733.7026, "step": 6090 }, { "ce_loss_10": 3.556735038757324, "ce_loss_13": 3.5061814308166506, "ce_loss_2": 4.14713134765625, "ce_loss_3": 3.96578129529953, "ce_loss_7": 3.649851453304291, "epoch": 0.61, "grad_norm": 708.0, "kl_loss_10": 123.27509727478028, "kl_loss_2": 1418.8950134277343, "kl_loss_3": 1054.7578002929688, "kl_loss_7": 338.9109802246094, "learning_rate": 0.0003364660183412892, "loss": 735.4294, "step": 6100 }, { "ce_loss_10": 3.538415002822876, "ce_loss_13": 3.4844126224517824, "ce_loss_2": 4.1343903064727785, "ce_loss_3": 3.9533764481544496, "ce_loss_7": 3.6281407356262205, "epoch": 0.611, "grad_norm": 568.0, "kl_loss_10": 124.66803207397462, "kl_loss_2": 1415.106756591797, "kl_loss_3": 1056.7506927490235, "kl_loss_7": 340.27066802978516, "learning_rate": 0.0003349674464576834, "loss": 741.5298, "step": 6110 }, { "ce_loss_10": 3.491645836830139, "ce_loss_13": 3.43500040769577, "ce_loss_2": 4.0973351955413815, "ce_loss_3": 3.9120434165000915, "ce_loss_7": 3.5810877084732056, "epoch": 0.612, "grad_norm": 676.0, "kl_loss_10": 124.53080596923829, "kl_loss_2": 1429.85146484375, "kl_loss_3": 1068.896337890625, "kl_loss_7": 338.6284439086914, "learning_rate": 0.00033347053645005966, "loss": 725.7744, "step": 6120 }, { "ce_loss_10": 3.6020973324775696, "ce_loss_13": 3.54690443277359, "ce_loss_2": 4.190545213222504, "ce_loss_3": 4.01407161951065, "ce_loss_7": 3.6918676257133485, "epoch": 0.613, "grad_norm": 1016.0, "kl_loss_10": 123.26982383728027, "kl_loss_2": 1385.2622436523438, "kl_loss_3": 1041.1985900878906, "kl_loss_7": 335.70687713623045, "learning_rate": 0.00033197530339228485, "loss": 735.7329, "step": 6130 }, { "ce_loss_10": 3.5564875841140746, "ce_loss_13": 3.5000383257865906, "ce_loss_2": 4.153977084159851, "ce_loss_3": 3.976674497127533, "ce_loss_7": 3.650752902030945, "epoch": 0.614, "grad_norm": 520.0, "kl_loss_10": 125.21984558105468, "kl_loss_2": 1410.2340087890625, "kl_loss_3": 1060.819757080078, "kl_loss_7": 342.1903564453125, "learning_rate": 0.00033048176234133967, "loss": 730.7129, "step": 6140 }, { "ce_loss_10": 3.5401206254959106, "ce_loss_13": 3.48342844247818, "ce_loss_2": 4.13830029964447, "ce_loss_3": 3.9539244890213014, "ce_loss_7": 3.632445764541626, "epoch": 0.615, "grad_norm": 600.0, "kl_loss_10": 124.45763664245605, "kl_loss_2": 1419.0204040527344, "kl_loss_3": 1063.9102508544922, "kl_loss_7": 341.1919967651367, "learning_rate": 0.0003289899283371657, "loss": 739.3029, "step": 6150 }, { "ce_loss_10": 3.5686771631240846, "ce_loss_13": 3.5131590008735656, "ce_loss_2": 4.169058787822723, "ce_loss_3": 3.9850056529045106, "ce_loss_7": 3.6635145783424377, "epoch": 0.616, "grad_norm": 828.0, "kl_loss_10": 123.31291618347169, "kl_loss_2": 1402.3692749023437, "kl_loss_3": 1052.1495544433594, "kl_loss_7": 335.0513031005859, "learning_rate": 0.0003274998164025148, "loss": 738.5246, "step": 6160 }, { "ce_loss_10": 3.5952057123184202, "ce_loss_13": 3.538980412483215, "ce_loss_2": 4.18984272480011, "ce_loss_3": 4.006945097446442, "ce_loss_7": 3.684834325313568, "epoch": 0.617, "grad_norm": 508.0, "kl_loss_10": 126.31275444030761, "kl_loss_2": 1409.8781188964845, "kl_loss_3": 1055.6673553466796, "kl_loss_7": 341.4982223510742, "learning_rate": 0.0003260114415427975, "loss": 748.9741, "step": 6170 }, { "ce_loss_10": 3.5180846571922304, "ce_loss_13": 3.4631958842277526, "ce_loss_2": 4.117404592037201, "ce_loss_3": 3.942760634422302, "ce_loss_7": 3.6095527529716493, "epoch": 0.618, "grad_norm": 544.0, "kl_loss_10": 123.8319091796875, "kl_loss_2": 1425.0959899902343, "kl_loss_3": 1070.6471069335937, "kl_loss_7": 336.5234832763672, "learning_rate": 0.0003245248187459323, "loss": 747.7899, "step": 6180 }, { "ce_loss_10": 3.503745806217194, "ce_loss_13": 3.452336239814758, "ce_loss_2": 4.087360787391662, "ce_loss_3": 3.902265763282776, "ce_loss_7": 3.589401423931122, "epoch": 0.619, "grad_norm": 812.0, "kl_loss_10": 119.91349830627442, "kl_loss_2": 1387.6965454101562, "kl_loss_3": 1031.423031616211, "kl_loss_7": 329.9644256591797, "learning_rate": 0.00032303996298219416, "loss": 722.3549, "step": 6190 }, { "ce_loss_10": 3.5912120580673217, "ce_loss_13": 3.532735550403595, "ce_loss_2": 4.175826632976532, "ce_loss_3": 3.990143024921417, "ce_loss_7": 3.678599214553833, "epoch": 0.62, "grad_norm": 572.0, "kl_loss_10": 123.47973709106445, "kl_loss_2": 1384.380682373047, "kl_loss_3": 1031.3387481689454, "kl_loss_7": 333.18008575439455, "learning_rate": 0.00032155688920406414, "loss": 721.7089, "step": 6200 }, { "ce_loss_10": 3.5028671979904176, "ce_loss_13": 3.4443759083747865, "ce_loss_2": 4.133521604537964, "ce_loss_3": 3.941824531555176, "ce_loss_7": 3.5987909913063048, "epoch": 0.621, "grad_norm": 628.0, "kl_loss_10": 128.0587142944336, "kl_loss_2": 1455.4984985351562, "kl_loss_3": 1085.7472259521485, "kl_loss_7": 344.9437423706055, "learning_rate": 0.0003200756123460788, "loss": 757.9494, "step": 6210 }, { "ce_loss_10": 3.53128182888031, "ce_loss_13": 3.473754036426544, "ce_loss_2": 4.1455615043640135, "ce_loss_3": 3.960498309135437, "ce_loss_7": 3.627154862880707, "epoch": 0.622, "grad_norm": 852.0, "kl_loss_10": 126.65972442626953, "kl_loss_2": 1452.3799194335938, "kl_loss_3": 1080.0731689453125, "kl_loss_7": 345.12744140625, "learning_rate": 0.00031859614732467957, "loss": 747.3707, "step": 6220 }, { "ce_loss_10": 3.580655241012573, "ce_loss_13": 3.5249094009399413, "ce_loss_2": 4.172132050991058, "ce_loss_3": 3.9869516491889954, "ce_loss_7": 3.672192335128784, "epoch": 0.623, "grad_norm": 704.0, "kl_loss_10": 122.48365020751953, "kl_loss_2": 1384.416357421875, "kl_loss_3": 1029.3415832519531, "kl_loss_7": 332.85533294677737, "learning_rate": 0.00031711850903806275, "loss": 722.4154, "step": 6230 }, { "ce_loss_10": 3.4871063590049745, "ce_loss_13": 3.433118486404419, "ce_loss_2": 4.09981507062912, "ce_loss_3": 3.9184054851531984, "ce_loss_7": 3.580829381942749, "epoch": 0.624, "grad_norm": 564.0, "kl_loss_10": 127.4908618927002, "kl_loss_2": 1445.4068542480468, "kl_loss_3": 1080.8368865966797, "kl_loss_7": 346.9587997436523, "learning_rate": 0.0003156427123660297, "loss": 733.6306, "step": 6240 }, { "ce_loss_10": 3.577600693702698, "ce_loss_13": 3.520812726020813, "ce_loss_2": 4.167683470249176, "ce_loss_3": 3.9879754543304444, "ce_loss_7": 3.6715123414993287, "epoch": 0.625, "grad_norm": 580.0, "kl_loss_10": 123.75414009094239, "kl_loss_2": 1393.0920532226562, "kl_loss_3": 1043.1118560791015, "kl_loss_7": 336.03589630126953, "learning_rate": 0.0003141687721698363, "loss": 735.9893, "step": 6250 }, { "ce_loss_10": 3.5385371804237367, "ce_loss_13": 3.4868279933929442, "ce_loss_2": 4.114449465274811, "ce_loss_3": 3.9369269490242003, "ce_loss_7": 3.626781237125397, "epoch": 0.626, "grad_norm": 474.0, "kl_loss_10": 120.09895820617676, "kl_loss_2": 1350.9141357421875, "kl_loss_3": 1009.9356262207032, "kl_loss_7": 325.1434753417969, "learning_rate": 0.00031269670329204396, "loss": 718.769, "step": 6260 }, { "ce_loss_10": 3.582829785346985, "ce_loss_13": 3.528691279888153, "ce_loss_2": 4.164977276325226, "ce_loss_3": 3.9852482438087464, "ce_loss_7": 3.668261468410492, "epoch": 0.627, "grad_norm": 648.0, "kl_loss_10": 124.36363410949707, "kl_loss_2": 1384.6681701660157, "kl_loss_3": 1036.161444091797, "kl_loss_7": 336.17731018066405, "learning_rate": 0.00031122652055637015, "loss": 724.7516, "step": 6270 }, { "ce_loss_10": 3.543643081188202, "ce_loss_13": 3.4886670470237733, "ce_loss_2": 4.15304182767868, "ce_loss_3": 3.9659879207611084, "ce_loss_7": 3.6350444078445436, "epoch": 0.628, "grad_norm": 744.0, "kl_loss_10": 124.99579772949218, "kl_loss_2": 1439.9531860351562, "kl_loss_3": 1072.6888000488282, "kl_loss_7": 339.7438629150391, "learning_rate": 0.0003097582387675385, "loss": 726.9059, "step": 6280 }, { "ce_loss_10": 3.5834500670433043, "ce_loss_13": 3.5285070896148683, "ce_loss_2": 4.176322209835052, "ce_loss_3": 3.994720828533173, "ce_loss_7": 3.670943582057953, "epoch": 0.629, "grad_norm": 720.0, "kl_loss_10": 124.72322578430176, "kl_loss_2": 1416.1133972167968, "kl_loss_3": 1054.6082000732422, "kl_loss_7": 338.46196899414065, "learning_rate": 0.00030829187271113034, "loss": 727.2747, "step": 6290 }, { "ce_loss_10": 3.5845503926277162, "ce_loss_13": 3.5288984060287474, "ce_loss_2": 4.167012679576874, "ce_loss_3": 3.9864045858383177, "ce_loss_7": 3.6704161047935484, "epoch": 0.63, "grad_norm": 520.0, "kl_loss_10": 122.14001274108887, "kl_loss_2": 1381.4854248046875, "kl_loss_3": 1031.7789672851563, "kl_loss_7": 329.18092193603513, "learning_rate": 0.00030682743715343565, "loss": 729.9382, "step": 6300 }, { "ce_loss_10": 3.530526852607727, "ce_loss_13": 3.4727389931678774, "ce_loss_2": 4.135389792919159, "ce_loss_3": 3.9555342316627504, "ce_loss_7": 3.6228522419929505, "epoch": 0.631, "grad_norm": 608.0, "kl_loss_10": 126.8146873474121, "kl_loss_2": 1414.6725708007812, "kl_loss_3": 1060.110418701172, "kl_loss_7": 343.28450164794924, "learning_rate": 0.0003053649468413043, "loss": 735.6304, "step": 6310 }, { "ce_loss_10": 3.6406059503555297, "ce_loss_13": 3.584047770500183, "ce_loss_2": 4.235766565799713, "ce_loss_3": 4.052544438838959, "ce_loss_7": 3.7318387985229493, "epoch": 0.632, "grad_norm": 872.0, "kl_loss_10": 126.71429443359375, "kl_loss_2": 1412.1578735351563, "kl_loss_3": 1059.7192962646484, "kl_loss_7": 343.02261505126955, "learning_rate": 0.00030390441650199725, "loss": 726.2171, "step": 6320 }, { "ce_loss_10": 3.53815176486969, "ce_loss_13": 3.4841132283210756, "ce_loss_2": 4.135061252117157, "ce_loss_3": 3.953271949291229, "ce_loss_7": 3.6305129885673524, "epoch": 0.633, "grad_norm": 580.0, "kl_loss_10": 122.51690254211425, "kl_loss_2": 1406.480645751953, "kl_loss_3": 1045.3659637451171, "kl_loss_7": 338.1310546875, "learning_rate": 0.00030244586084303903, "loss": 723.4697, "step": 6330 }, { "ce_loss_10": 3.5056158542633056, "ce_loss_13": 3.45035959482193, "ce_loss_2": 4.1218892455101015, "ce_loss_3": 3.9383804321289064, "ce_loss_7": 3.598428511619568, "epoch": 0.634, "grad_norm": 524.0, "kl_loss_10": 125.22274055480958, "kl_loss_2": 1454.4839721679687, "kl_loss_3": 1091.4578918457032, "kl_loss_7": 343.3879928588867, "learning_rate": 0.00030098929455206903, "loss": 732.6389, "step": 6340 }, { "ce_loss_10": 3.5128793120384216, "ce_loss_13": 3.458734905719757, "ce_loss_2": 4.109455585479736, "ce_loss_3": 3.921981763839722, "ce_loss_7": 3.603852927684784, "epoch": 0.635, "grad_norm": 660.0, "kl_loss_10": 123.21167640686035, "kl_loss_2": 1429.5969665527343, "kl_loss_3": 1065.2174682617188, "kl_loss_7": 343.878678894043, "learning_rate": 0.00029953473229669324, "loss": 754.7498, "step": 6350 }, { "ce_loss_10": 3.5429938197135926, "ce_loss_13": 3.4890798926353455, "ce_loss_2": 4.142077577114105, "ce_loss_3": 3.9637051105499266, "ce_loss_7": 3.633872401714325, "epoch": 0.636, "grad_norm": 532.0, "kl_loss_10": 122.68165473937988, "kl_loss_2": 1408.9635009765625, "kl_loss_3": 1054.9124420166015, "kl_loss_7": 341.2626647949219, "learning_rate": 0.00029808218872433767, "loss": 726.679, "step": 6360 }, { "ce_loss_10": 3.6060853004455566, "ce_loss_13": 3.550333082675934, "ce_loss_2": 4.186102759838104, "ce_loss_3": 4.006532001495361, "ce_loss_7": 3.6955028772354126, "epoch": 0.637, "grad_norm": 488.0, "kl_loss_10": 123.89888572692871, "kl_loss_2": 1393.8600280761718, "kl_loss_3": 1041.2120056152344, "kl_loss_7": 339.00569610595704, "learning_rate": 0.0002966316784621, "loss": 721.496, "step": 6370 }, { "ce_loss_10": 3.5151076436042787, "ce_loss_13": 3.460108757019043, "ce_loss_2": 4.134497022628784, "ce_loss_3": 3.949837851524353, "ce_loss_7": 3.610580575466156, "epoch": 0.638, "grad_norm": 556.0, "kl_loss_10": 124.86854286193848, "kl_loss_2": 1453.7235046386718, "kl_loss_3": 1082.8961395263673, "kl_loss_7": 346.0769012451172, "learning_rate": 0.0002951832161166024, "loss": 732.5327, "step": 6380 }, { "ce_loss_10": 3.59385507106781, "ce_loss_13": 3.5390156865119935, "ce_loss_2": 4.197193539142608, "ce_loss_3": 4.0121661901474, "ce_loss_7": 3.6833563446998596, "epoch": 0.639, "grad_norm": 780.0, "kl_loss_10": 125.91835517883301, "kl_loss_2": 1404.7720581054687, "kl_loss_3": 1048.3076721191405, "kl_loss_7": 340.255078125, "learning_rate": 0.0002937368162738445, "loss": 721.7094, "step": 6390 }, { "ce_loss_10": 3.531213629245758, "ce_loss_13": 3.4822975039482116, "ce_loss_2": 4.113115763664245, "ce_loss_3": 3.9376835942268373, "ce_loss_7": 3.6169563055038454, "epoch": 0.64, "grad_norm": 800.0, "kl_loss_10": 120.01525421142578, "kl_loss_2": 1385.8341735839845, "kl_loss_3": 1040.7418151855468, "kl_loss_7": 330.25692291259764, "learning_rate": 0.0002922924934990568, "loss": 729.1814, "step": 6400 }, { "ce_loss_10": 3.470765709877014, "ce_loss_13": 3.4150401830673216, "ce_loss_2": 4.09455554485321, "ce_loss_3": 3.90150500535965, "ce_loss_7": 3.5624879002571106, "epoch": 0.641, "grad_norm": 532.0, "kl_loss_10": 123.4526496887207, "kl_loss_2": 1456.2535888671875, "kl_loss_3": 1080.2837280273438, "kl_loss_7": 339.2045959472656, "learning_rate": 0.0002908502623365536, "loss": 736.9762, "step": 6410 }, { "ce_loss_10": 3.4067123413085936, "ce_loss_13": 3.350449573993683, "ce_loss_2": 4.032287085056305, "ce_loss_3": 3.845944118499756, "ce_loss_7": 3.5039458990097048, "epoch": 0.642, "grad_norm": 660.0, "kl_loss_10": 122.2471866607666, "kl_loss_2": 1452.7192321777343, "kl_loss_3": 1079.5542907714844, "kl_loss_7": 340.34538116455076, "learning_rate": 0.0002894101373095867, "loss": 734.8263, "step": 6420 }, { "ce_loss_10": 3.617260241508484, "ce_loss_13": 3.561350774765015, "ce_loss_2": 4.2049798488616945, "ce_loss_3": 4.030479991436005, "ce_loss_7": 3.7099608182907104, "epoch": 0.643, "grad_norm": 580.0, "kl_loss_10": 126.8806884765625, "kl_loss_2": 1398.0826477050782, "kl_loss_3": 1052.9947052001953, "kl_loss_7": 344.0747329711914, "learning_rate": 0.00028797213292019926, "loss": 731.6678, "step": 6430 }, { "ce_loss_10": 3.597985827922821, "ce_loss_13": 3.5433950066566466, "ce_loss_2": 4.183076286315918, "ce_loss_3": 4.004205846786499, "ce_loss_7": 3.6886964678764342, "epoch": 0.644, "grad_norm": 498.0, "kl_loss_10": 124.60531654357911, "kl_loss_2": 1393.4992248535157, "kl_loss_3": 1047.5234649658203, "kl_loss_7": 339.99706268310547, "learning_rate": 0.0002865362636490791, "loss": 738.58, "step": 6440 }, { "ce_loss_10": 3.6034607529640197, "ce_loss_13": 3.551686775684357, "ce_loss_2": 4.19627479314804, "ce_loss_3": 4.010957944393158, "ce_loss_7": 3.695608949661255, "epoch": 0.645, "grad_norm": 532.0, "kl_loss_10": 122.10445442199708, "kl_loss_2": 1399.2243957519531, "kl_loss_3": 1038.563070678711, "kl_loss_7": 334.98720092773436, "learning_rate": 0.0002851025439554142, "loss": 722.2279, "step": 6450 }, { "ce_loss_10": 3.604773259162903, "ce_loss_13": 3.547994613647461, "ce_loss_2": 4.190113079547882, "ce_loss_3": 4.01669454574585, "ce_loss_7": 3.6950499534606935, "epoch": 0.646, "grad_norm": 568.0, "kl_loss_10": 124.84522972106933, "kl_loss_2": 1393.757257080078, "kl_loss_3": 1045.5576049804688, "kl_loss_7": 340.6378997802734, "learning_rate": 0.00028367098827674573, "loss": 721.3065, "step": 6460 }, { "ce_loss_10": 3.5277416110038757, "ce_loss_13": 3.4715929985046388, "ce_loss_2": 4.1209129095077515, "ce_loss_3": 3.936191809177399, "ce_loss_7": 3.617918372154236, "epoch": 0.647, "grad_norm": 644.0, "kl_loss_10": 121.93290557861329, "kl_loss_2": 1392.9058166503905, "kl_loss_3": 1035.490902709961, "kl_loss_7": 330.90175476074216, "learning_rate": 0.00028224161102882397, "loss": 724.9437, "step": 6470 }, { "ce_loss_10": 3.506662678718567, "ce_loss_13": 3.4533620953559874, "ce_loss_2": 4.092417252063751, "ce_loss_3": 3.9158310890197754, "ce_loss_7": 3.596454584598541, "epoch": 0.648, "grad_norm": 700.0, "kl_loss_10": 122.67344360351562, "kl_loss_2": 1380.4693481445313, "kl_loss_3": 1032.2998260498048, "kl_loss_7": 331.9904846191406, "learning_rate": 0.00028081442660546124, "loss": 724.6898, "step": 6480 }, { "ce_loss_10": 3.561033821105957, "ce_loss_13": 3.5084841728210447, "ce_loss_2": 4.147166728973389, "ce_loss_3": 3.9707638025283813, "ce_loss_7": 3.6506178617477416, "epoch": 0.649, "grad_norm": 612.0, "kl_loss_10": 125.22979164123535, "kl_loss_2": 1398.154718017578, "kl_loss_3": 1043.1712646484375, "kl_loss_7": 336.69880828857424, "learning_rate": 0.0002793894493783892, "loss": 728.1853, "step": 6490 }, { "ce_loss_10": 3.5835307121276854, "ce_loss_13": 3.5306628704071046, "ce_loss_2": 4.164869272708893, "ce_loss_3": 3.9835118532180784, "ce_loss_7": 3.6689149260520937, "epoch": 0.65, "grad_norm": 648.0, "kl_loss_10": 121.68083915710449, "kl_loss_2": 1374.832666015625, "kl_loss_3": 1024.9227661132813, "kl_loss_7": 329.3937103271484, "learning_rate": 0.0002779666936971129, "loss": 715.8627, "step": 6500 }, { "ce_loss_10": 3.5850860357284544, "ce_loss_13": 3.5323161482810974, "ce_loss_2": 4.190707218647003, "ce_loss_3": 4.01264888048172, "ce_loss_7": 3.678597128391266, "epoch": 0.651, "grad_norm": 588.0, "kl_loss_10": 123.99790267944336, "kl_loss_2": 1415.28037109375, "kl_loss_3": 1063.5095092773438, "kl_loss_7": 339.4371505737305, "learning_rate": 0.00027654617388876614, "loss": 737.8925, "step": 6510 }, { "ce_loss_10": 3.618337428569794, "ce_loss_13": 3.5666980028152464, "ce_loss_2": 4.202663445472718, "ce_loss_3": 4.029495453834533, "ce_loss_7": 3.7100385665893554, "epoch": 0.652, "grad_norm": 672.0, "kl_loss_10": 125.23239784240722, "kl_loss_2": 1392.5545654296875, "kl_loss_3": 1036.998681640625, "kl_loss_7": 336.28130493164065, "learning_rate": 0.0002751279042579672, "loss": 726.2184, "step": 6520 }, { "ce_loss_10": 3.5651236891746523, "ce_loss_13": 3.510008442401886, "ce_loss_2": 4.147745668888092, "ce_loss_3": 3.967809629440308, "ce_loss_7": 3.655534052848816, "epoch": 0.653, "grad_norm": 596.0, "kl_loss_10": 122.48806457519531, "kl_loss_2": 1376.0337341308593, "kl_loss_3": 1025.2164947509766, "kl_loss_7": 331.06346282958987, "learning_rate": 0.00027371189908667604, "loss": 727.9293, "step": 6530 }, { "ce_loss_10": 3.610418975353241, "ce_loss_13": 3.5547208905220034, "ce_loss_2": 4.223685729503631, "ce_loss_3": 4.037129259109497, "ce_loss_7": 3.7057825326919556, "epoch": 0.654, "grad_norm": 664.0, "kl_loss_10": 127.65673484802247, "kl_loss_2": 1437.9926696777343, "kl_loss_3": 1063.241262817383, "kl_loss_7": 346.38170166015624, "learning_rate": 0.00027229817263404863, "loss": 746.9211, "step": 6540 }, { "ce_loss_10": 3.594118654727936, "ce_loss_13": 3.5403075575828553, "ce_loss_2": 4.165842056274414, "ce_loss_3": 3.9902518033981322, "ce_loss_7": 3.6817231297492983, "epoch": 0.655, "grad_norm": 612.0, "kl_loss_10": 123.36601295471192, "kl_loss_2": 1372.2304443359376, "kl_loss_3": 1023.0949890136719, "kl_loss_7": 330.6029968261719, "learning_rate": 0.0002708867391362948, "loss": 721.8018, "step": 6550 }, { "ce_loss_10": 3.5721522688865663, "ce_loss_13": 3.5191043853759765, "ce_loss_2": 4.140277779102325, "ce_loss_3": 3.9575397610664367, "ce_loss_7": 3.658042335510254, "epoch": 0.656, "grad_norm": 494.0, "kl_loss_10": 121.24272346496582, "kl_loss_2": 1340.1691162109375, "kl_loss_3": 1000.573861694336, "kl_loss_7": 325.7244140625, "learning_rate": 0.0002694776128065345, "loss": 714.533, "step": 6560 }, { "ce_loss_10": 3.503811073303223, "ce_loss_13": 3.450370526313782, "ce_loss_2": 4.107607281208038, "ce_loss_3": 3.92315833568573, "ce_loss_7": 3.597831392288208, "epoch": 0.657, "grad_norm": 450.0, "kl_loss_10": 122.38468208312989, "kl_loss_2": 1424.4636169433593, "kl_loss_3": 1064.6302703857423, "kl_loss_7": 339.14308166503906, "learning_rate": 0.00026807080783465374, "loss": 721.975, "step": 6570 }, { "ce_loss_10": 3.61981600522995, "ce_loss_13": 3.5655640721321107, "ce_loss_2": 4.2158555626869205, "ce_loss_3": 4.038338911533356, "ce_loss_7": 3.7099917411804197, "epoch": 0.658, "grad_norm": 696.0, "kl_loss_10": 124.21554260253906, "kl_loss_2": 1399.7592712402343, "kl_loss_3": 1044.859848022461, "kl_loss_7": 337.2662155151367, "learning_rate": 0.00026666633838716316, "loss": 734.2561, "step": 6580 }, { "ce_loss_10": 3.5148606061935426, "ce_loss_13": 3.4572384357452393, "ce_loss_2": 4.128946197032929, "ce_loss_3": 3.9375492334365845, "ce_loss_7": 3.6097939372062684, "epoch": 0.659, "grad_norm": 524.0, "kl_loss_10": 127.11134567260743, "kl_loss_2": 1438.689013671875, "kl_loss_3": 1068.9808227539063, "kl_loss_7": 344.35106506347654, "learning_rate": 0.00026526421860705474, "loss": 741.562, "step": 6590 }, { "ce_loss_10": 3.537730169296265, "ce_loss_13": 3.4823685765266417, "ce_loss_2": 4.134520995616913, "ce_loss_3": 3.9507094264030456, "ce_loss_7": 3.6302355527877808, "epoch": 0.66, "grad_norm": 780.0, "kl_loss_10": 124.95889053344726, "kl_loss_2": 1415.88623046875, "kl_loss_3": 1054.4846405029298, "kl_loss_7": 341.83483123779297, "learning_rate": 0.0002638644626136587, "loss": 726.3774, "step": 6600 }, { "ce_loss_10": 3.5502317190170287, "ce_loss_13": 3.495832419395447, "ce_loss_2": 4.1422311663627625, "ce_loss_3": 3.9583224058151245, "ce_loss_7": 3.6375673055648803, "epoch": 0.661, "grad_norm": 540.0, "kl_loss_10": 122.10346221923828, "kl_loss_2": 1403.828271484375, "kl_loss_3": 1050.9346862792968, "kl_loss_7": 334.5736999511719, "learning_rate": 0.00026246708450250255, "loss": 731.0771, "step": 6610 }, { "ce_loss_10": 3.5466328024864198, "ce_loss_13": 3.4922005414962767, "ce_loss_2": 4.128600871562957, "ce_loss_3": 3.9507038831710815, "ce_loss_7": 3.633044409751892, "epoch": 0.662, "grad_norm": 836.0, "kl_loss_10": 122.05910720825196, "kl_loss_2": 1380.4447082519532, "kl_loss_3": 1030.7735382080077, "kl_loss_7": 329.82720336914065, "learning_rate": 0.00026107209834516854, "loss": 719.4666, "step": 6620 }, { "ce_loss_10": 3.4943259835243223, "ce_loss_13": 3.441339361667633, "ce_loss_2": 4.105875480175018, "ce_loss_3": 3.918928301334381, "ce_loss_7": 3.5848689913749694, "epoch": 0.663, "grad_norm": 604.0, "kl_loss_10": 123.39703712463378, "kl_loss_2": 1449.5614074707032, "kl_loss_3": 1074.3954467773438, "kl_loss_7": 340.9826889038086, "learning_rate": 0.0002596795181891514, "loss": 745.5657, "step": 6630 }, { "ce_loss_10": 3.5065298676490784, "ce_loss_13": 3.4482388257980348, "ce_loss_2": 4.110616528987885, "ce_loss_3": 3.921921765804291, "ce_loss_7": 3.597921073436737, "epoch": 0.664, "grad_norm": 624.0, "kl_loss_10": 126.37582130432129, "kl_loss_2": 1431.846942138672, "kl_loss_3": 1067.7791381835937, "kl_loss_7": 343.4202651977539, "learning_rate": 0.000258289358057718, "loss": 756.1383, "step": 6640 }, { "ce_loss_10": 3.577939677238464, "ce_loss_13": 3.5215064764022825, "ce_loss_2": 4.174340093135834, "ce_loss_3": 3.993846929073334, "ce_loss_7": 3.6716681122779846, "epoch": 0.665, "grad_norm": 668.0, "kl_loss_10": 125.94303131103516, "kl_loss_2": 1418.1614074707031, "kl_loss_3": 1063.662789916992, "kl_loss_7": 345.09018859863284, "learning_rate": 0.0002569016319497657, "loss": 737.0033, "step": 6650 }, { "ce_loss_10": 3.558946442604065, "ce_loss_13": 3.504122722148895, "ce_loss_2": 4.157543361186981, "ce_loss_3": 3.9731531977653503, "ce_loss_7": 3.6553978085517884, "epoch": 0.666, "grad_norm": 456.0, "kl_loss_10": 126.41628189086914, "kl_loss_2": 1425.6318298339843, "kl_loss_3": 1065.6677124023438, "kl_loss_7": 344.7531311035156, "learning_rate": 0.00025551635383968066, "loss": 743.7271, "step": 6660 }, { "ce_loss_10": 3.475124168395996, "ce_loss_13": 3.4190633296966553, "ce_loss_2": 4.0756109118461605, "ce_loss_3": 3.892716574668884, "ce_loss_7": 3.5657395362854003, "epoch": 0.667, "grad_norm": 776.0, "kl_loss_10": 125.15830154418946, "kl_loss_2": 1434.4202819824218, "kl_loss_3": 1067.0250885009766, "kl_loss_7": 342.79358825683596, "learning_rate": 0.00025413353767719804, "loss": 737.7381, "step": 6670 }, { "ce_loss_10": 3.5303653359413145, "ce_loss_13": 3.47825140953064, "ce_loss_2": 4.123199880123138, "ce_loss_3": 3.9427443981170653, "ce_loss_7": 3.6180224299430845, "epoch": 0.668, "grad_norm": 624.0, "kl_loss_10": 121.16463623046874, "kl_loss_2": 1412.6630737304688, "kl_loss_3": 1057.0863677978516, "kl_loss_7": 333.19127655029297, "learning_rate": 0.0002527531973872617, "loss": 734.8999, "step": 6680 }, { "ce_loss_10": 3.547012138366699, "ce_loss_13": 3.4931538343429565, "ce_loss_2": 4.134020876884461, "ce_loss_3": 3.961080086231232, "ce_loss_7": 3.6357810616493227, "epoch": 0.669, "grad_norm": 470.0, "kl_loss_10": 121.4739860534668, "kl_loss_2": 1401.3072937011718, "kl_loss_3": 1050.0212188720702, "kl_loss_7": 335.34568786621094, "learning_rate": 0.0002513753468698826, "loss": 727.7887, "step": 6690 }, { "ce_loss_10": 3.5165226101875304, "ce_loss_13": 3.460037863254547, "ce_loss_2": 4.116587007045746, "ce_loss_3": 3.9380706429481505, "ce_loss_7": 3.6079202771186827, "epoch": 0.67, "grad_norm": 604.0, "kl_loss_10": 124.7790916442871, "kl_loss_2": 1438.8750366210938, "kl_loss_3": 1075.9765594482421, "kl_loss_7": 341.0475021362305, "learning_rate": 0.0002500000000000001, "loss": 739.6845, "step": 6700 }, { "ce_loss_10": 3.6302000522613525, "ce_loss_13": 3.5780060410499575, "ce_loss_2": 4.188638615608215, "ce_loss_3": 4.019280982017517, "ce_loss_7": 3.71583753824234, "epoch": 0.671, "grad_norm": 548.0, "kl_loss_10": 120.78085708618164, "kl_loss_2": 1356.7482360839845, "kl_loss_3": 1016.2201110839844, "kl_loss_7": 327.5862182617187, "learning_rate": 0.0002486271706273421, "loss": 736.1638, "step": 6710 }, { "ce_loss_10": 3.569162166118622, "ce_loss_13": 3.5176132678985597, "ce_loss_2": 4.128765881061554, "ce_loss_3": 3.955933618545532, "ce_loss_7": 3.6550265192985534, "epoch": 0.672, "grad_norm": 540.0, "kl_loss_10": 120.67655563354492, "kl_loss_2": 1342.1685119628905, "kl_loss_3": 1003.4603668212891, "kl_loss_7": 326.5387390136719, "learning_rate": 0.0002472568725762853, "loss": 721.6279, "step": 6720 }, { "ce_loss_10": 3.5551081538200378, "ce_loss_13": 3.5027125597000124, "ce_loss_2": 4.122924709320069, "ce_loss_3": 3.947139251232147, "ce_loss_7": 3.640329647064209, "epoch": 0.673, "grad_norm": 498.0, "kl_loss_10": 120.39876937866211, "kl_loss_2": 1360.6974243164063, "kl_loss_3": 1011.3546844482422, "kl_loss_7": 324.81715393066406, "learning_rate": 0.00024588911964571554, "loss": 714.1743, "step": 6730 }, { "ce_loss_10": 3.5764050483703613, "ce_loss_13": 3.5180741786956786, "ce_loss_2": 4.191099977493286, "ce_loss_3": 4.004332780838013, "ce_loss_7": 3.6720454335212707, "epoch": 0.674, "grad_norm": 580.0, "kl_loss_10": 128.467537689209, "kl_loss_2": 1438.534490966797, "kl_loss_3": 1073.315625, "kl_loss_7": 347.5059188842773, "learning_rate": 0.00024452392560888974, "loss": 732.8049, "step": 6740 }, { "ce_loss_10": 3.463340771198273, "ce_loss_13": 3.4112897157669066, "ce_loss_2": 4.061108565330505, "ce_loss_3": 3.8762802481651306, "ce_loss_7": 3.55279803276062, "epoch": 0.675, "grad_norm": 536.0, "kl_loss_10": 121.23651008605957, "kl_loss_2": 1415.8220092773438, "kl_loss_3": 1050.3499908447266, "kl_loss_7": 334.4798278808594, "learning_rate": 0.00024316130421329695, "loss": 724.3385, "step": 6750 }, { "ce_loss_10": 3.544707751274109, "ce_loss_13": 3.4911407709121702, "ce_loss_2": 4.13702780008316, "ce_loss_3": 3.952805519104004, "ce_loss_7": 3.6338755965232847, "epoch": 0.676, "grad_norm": 504.0, "kl_loss_10": 122.65901947021484, "kl_loss_2": 1379.1199523925782, "kl_loss_3": 1026.6421783447265, "kl_loss_7": 329.13787536621095, "learning_rate": 0.00024180126918051909, "loss": 720.2172, "step": 6760 }, { "ce_loss_10": 3.585362899303436, "ce_loss_13": 3.533327639102936, "ce_loss_2": 4.1589976906776425, "ce_loss_3": 3.9885815143585206, "ce_loss_7": 3.674212193489075, "epoch": 0.677, "grad_norm": 580.0, "kl_loss_10": 122.34142112731934, "kl_loss_2": 1375.4806579589845, "kl_loss_3": 1026.1968933105468, "kl_loss_7": 332.3818161010742, "learning_rate": 0.00024044383420609406, "loss": 716.5443, "step": 6770 }, { "ce_loss_10": 3.5967800617218018, "ce_loss_13": 3.54542738199234, "ce_loss_2": 4.162489807605743, "ce_loss_3": 3.9877678751945496, "ce_loss_7": 3.6823439836502074, "epoch": 0.678, "grad_norm": 684.0, "kl_loss_10": 121.3688591003418, "kl_loss_2": 1374.671405029297, "kl_loss_3": 1025.4278350830077, "kl_loss_7": 330.73584136962893, "learning_rate": 0.00023908901295937712, "loss": 726.931, "step": 6780 }, { "ce_loss_10": 3.5975115299224854, "ce_loss_13": 3.541917252540588, "ce_loss_2": 4.178954315185547, "ce_loss_3": 3.9986441016197203, "ce_loss_7": 3.685844695568085, "epoch": 0.679, "grad_norm": 796.0, "kl_loss_10": 123.49897994995118, "kl_loss_2": 1371.78173828125, "kl_loss_3": 1025.5135620117187, "kl_loss_7": 330.7898513793945, "learning_rate": 0.00023773681908340283, "loss": 734.601, "step": 6790 }, { "ce_loss_10": 3.570222854614258, "ce_loss_13": 3.5119346141815186, "ce_loss_2": 4.171153092384339, "ce_loss_3": 3.988275647163391, "ce_loss_7": 3.661699855327606, "epoch": 0.68, "grad_norm": 648.0, "kl_loss_10": 128.40531120300292, "kl_loss_2": 1437.0543151855468, "kl_loss_3": 1074.290087890625, "kl_loss_7": 346.53758697509767, "learning_rate": 0.00023638726619474876, "loss": 749.3265, "step": 6800 }, { "ce_loss_10": 3.555301105976105, "ce_loss_13": 3.50036598443985, "ce_loss_2": 4.171612620353699, "ce_loss_3": 3.9867984652519226, "ce_loss_7": 3.6493401288986207, "epoch": 0.681, "grad_norm": 628.0, "kl_loss_10": 123.96326141357422, "kl_loss_2": 1442.6337036132813, "kl_loss_3": 1080.4893859863282, "kl_loss_7": 341.9077606201172, "learning_rate": 0.0002350403678833976, "loss": 737.603, "step": 6810 }, { "ce_loss_10": 3.4832152485847474, "ce_loss_13": 3.4290732622146605, "ce_loss_2": 4.0865050792694095, "ce_loss_3": 3.9012642741203307, "ce_loss_7": 3.5707468390464783, "epoch": 0.682, "grad_norm": 388.0, "kl_loss_10": 121.88200225830079, "kl_loss_2": 1425.3134704589843, "kl_loss_3": 1067.0139251708983, "kl_loss_7": 333.2817916870117, "learning_rate": 0.00023369613771260007, "loss": 730.0828, "step": 6820 }, { "ce_loss_10": 3.602006256580353, "ce_loss_13": 3.546574425697327, "ce_loss_2": 4.19493260383606, "ce_loss_3": 4.014666783809662, "ce_loss_7": 3.694399046897888, "epoch": 0.683, "grad_norm": 896.0, "kl_loss_10": 124.15231056213379, "kl_loss_2": 1416.5041259765626, "kl_loss_3": 1057.1513946533203, "kl_loss_7": 338.50926971435547, "learning_rate": 0.00023235458921873925, "loss": 737.5669, "step": 6830 }, { "ce_loss_10": 3.5514315247535704, "ce_loss_13": 3.4963643193244933, "ce_loss_2": 4.175014972686768, "ce_loss_3": 3.987934386730194, "ce_loss_7": 3.649992787837982, "epoch": 0.684, "grad_norm": 780.0, "kl_loss_10": 126.71570472717285, "kl_loss_2": 1460.5936950683595, "kl_loss_3": 1091.2405181884765, "kl_loss_7": 351.1172622680664, "learning_rate": 0.0002310157359111938, "loss": 752.2999, "step": 6840 }, { "ce_loss_10": 3.4418938040733336, "ce_loss_13": 3.3859288573265074, "ce_loss_2": 4.088224470615387, "ce_loss_3": 3.8917805433273314, "ce_loss_7": 3.539370059967041, "epoch": 0.685, "grad_norm": 932.0, "kl_loss_10": 125.29055137634278, "kl_loss_2": 1493.244873046875, "kl_loss_3": 1106.4089385986329, "kl_loss_7": 346.971989440918, "learning_rate": 0.0002296795912722014, "loss": 747.3044, "step": 6850 }, { "ce_loss_10": 3.582219159603119, "ce_loss_13": 3.5297583818435667, "ce_loss_2": 4.163276970386505, "ce_loss_3": 3.9838827729225157, "ce_loss_7": 3.673186790943146, "epoch": 0.686, "grad_norm": 688.0, "kl_loss_10": 122.75707702636718, "kl_loss_2": 1381.4411376953126, "kl_loss_3": 1023.4174530029297, "kl_loss_7": 331.40368804931643, "learning_rate": 0.0002283461687567236, "loss": 713.3312, "step": 6860 }, { "ce_loss_10": 3.64503128528595, "ce_loss_13": 3.5887781262397764, "ce_loss_2": 4.219438052177429, "ce_loss_3": 4.043782579898834, "ce_loss_7": 3.7344509243965147, "epoch": 0.687, "grad_norm": 516.0, "kl_loss_10": 123.2198699951172, "kl_loss_2": 1361.601385498047, "kl_loss_3": 1017.4406616210938, "kl_loss_7": 328.3076904296875, "learning_rate": 0.00022701548179231045, "loss": 725.902, "step": 6870 }, { "ce_loss_10": 3.5936806321144106, "ce_loss_13": 3.538178300857544, "ce_loss_2": 4.177520775794983, "ce_loss_3": 3.9925229072570803, "ce_loss_7": 3.6811250925064085, "epoch": 0.688, "grad_norm": 812.0, "kl_loss_10": 124.56343841552734, "kl_loss_2": 1398.6163452148437, "kl_loss_3": 1039.188638305664, "kl_loss_7": 333.23457489013674, "learning_rate": 0.00022568754377896516, "loss": 717.3196, "step": 6880 }, { "ce_loss_10": 3.5798384070396425, "ce_loss_13": 3.52534077167511, "ce_loss_2": 4.165230524539948, "ce_loss_3": 3.9855322360992433, "ce_loss_7": 3.669637417793274, "epoch": 0.689, "grad_norm": 636.0, "kl_loss_10": 122.47865982055664, "kl_loss_2": 1400.101580810547, "kl_loss_3": 1045.2517852783203, "kl_loss_7": 337.63702697753905, "learning_rate": 0.00022436236808900844, "loss": 724.2279, "step": 6890 }, { "ce_loss_10": 3.4826380729675295, "ce_loss_13": 3.4284149289131163, "ce_loss_2": 4.081106758117675, "ce_loss_3": 3.8968387603759767, "ce_loss_7": 3.5732897400856016, "epoch": 0.69, "grad_norm": 720.0, "kl_loss_10": 123.13472061157226, "kl_loss_2": 1424.4590087890624, "kl_loss_3": 1061.0906616210937, "kl_loss_7": 336.4748291015625, "learning_rate": 0.00022303996806694487, "loss": 726.4395, "step": 6900 }, { "ce_loss_10": 3.5565951108932494, "ce_loss_13": 3.5028126001358033, "ce_loss_2": 4.153330898284912, "ce_loss_3": 3.972140097618103, "ce_loss_7": 3.6470712184906007, "epoch": 0.691, "grad_norm": 620.0, "kl_loss_10": 121.5474639892578, "kl_loss_2": 1406.192333984375, "kl_loss_3": 1054.1312591552735, "kl_loss_7": 333.32630310058596, "learning_rate": 0.00022172035702932823, "loss": 725.3298, "step": 6910 }, { "ce_loss_10": 3.596930432319641, "ce_loss_13": 3.5424793004989623, "ce_loss_2": 4.174435448646546, "ce_loss_3": 4.0008144736289974, "ce_loss_7": 3.6854524970054627, "epoch": 0.692, "grad_norm": 588.0, "kl_loss_10": 122.45778541564941, "kl_loss_2": 1364.4023864746093, "kl_loss_3": 1023.2470947265625, "kl_loss_7": 330.3191375732422, "learning_rate": 0.00022040354826462666, "loss": 715.1212, "step": 6920 }, { "ce_loss_10": 3.5372806906700136, "ce_loss_13": 3.484300172328949, "ce_loss_2": 4.122054016590118, "ce_loss_3": 3.9410375475883486, "ce_loss_7": 3.6267031908035277, "epoch": 0.693, "grad_norm": 592.0, "kl_loss_10": 121.36230354309082, "kl_loss_2": 1387.7280639648438, "kl_loss_3": 1035.8701171875, "kl_loss_7": 328.8681671142578, "learning_rate": 0.0002190895550330899, "loss": 723.1688, "step": 6930 }, { "ce_loss_10": 3.4684043288230897, "ce_loss_13": 3.4110892057418822, "ce_loss_2": 4.0784489750862125, "ce_loss_3": 3.8970334768295287, "ce_loss_7": 3.5636763691902162, "epoch": 0.694, "grad_norm": 708.0, "kl_loss_10": 125.41013412475586, "kl_loss_2": 1438.7724853515624, "kl_loss_3": 1074.242709350586, "kl_loss_7": 343.83753967285156, "learning_rate": 0.00021777839056661552, "loss": 726.9423, "step": 6940 }, { "ce_loss_10": 3.5493281960487364, "ce_loss_13": 3.4966490268707275, "ce_loss_2": 4.138473987579346, "ce_loss_3": 3.9581478118896483, "ce_loss_7": 3.6378383159637453, "epoch": 0.695, "grad_norm": 482.0, "kl_loss_10": 121.87666625976563, "kl_loss_2": 1388.4705749511718, "kl_loss_3": 1035.8258056640625, "kl_loss_7": 330.15142211914065, "learning_rate": 0.0002164700680686147, "loss": 714.1528, "step": 6950 }, { "ce_loss_10": 3.5943965315818787, "ce_loss_13": 3.5415783882141114, "ce_loss_2": 4.166309404373169, "ce_loss_3": 3.9927005410194396, "ce_loss_7": 3.6852437376976015, "epoch": 0.696, "grad_norm": 524.0, "kl_loss_10": 123.10262947082519, "kl_loss_2": 1358.2594055175782, "kl_loss_3": 1022.4194183349609, "kl_loss_7": 332.60595855712893, "learning_rate": 0.0002151646007138806, "loss": 711.1066, "step": 6960 }, { "ce_loss_10": 3.4674277782440184, "ce_loss_13": 3.4147743582725525, "ce_loss_2": 4.0764969229698185, "ce_loss_3": 3.8895933270454406, "ce_loss_7": 3.559154045581818, "epoch": 0.697, "grad_norm": 506.0, "kl_loss_10": 124.91875114440919, "kl_loss_2": 1433.1934448242187, "kl_loss_3": 1072.8756225585937, "kl_loss_7": 340.41393890380857, "learning_rate": 0.00021386200164845526, "loss": 732.8288, "step": 6970 }, { "ce_loss_10": 3.655595052242279, "ce_loss_13": 3.600779819488525, "ce_loss_2": 4.214467906951905, "ce_loss_3": 4.045685410499573, "ce_loss_7": 3.7411547183990477, "epoch": 0.698, "grad_norm": 492.0, "kl_loss_10": 123.51595115661621, "kl_loss_2": 1358.5181579589844, "kl_loss_3": 1018.3159729003906, "kl_loss_7": 331.73258514404296, "learning_rate": 0.0002125622839894964, "loss": 717.1124, "step": 6980 }, { "ce_loss_10": 3.5993648409843444, "ce_loss_13": 3.5441166758537292, "ce_loss_2": 4.181045114994049, "ce_loss_3": 4.002468681335449, "ce_loss_7": 3.6823740720748903, "epoch": 0.699, "grad_norm": 470.0, "kl_loss_10": 122.90305519104004, "kl_loss_2": 1383.0861511230469, "kl_loss_3": 1029.9532836914063, "kl_loss_7": 329.35023193359376, "learning_rate": 0.00021126546082514663, "loss": 715.5651, "step": 6990 }, { "ce_loss_10": 3.6185239911079408, "ce_loss_13": 3.565263593196869, "ce_loss_2": 4.191958248615265, "ce_loss_3": 4.013105678558349, "ce_loss_7": 3.705930233001709, "epoch": 0.7, "grad_norm": 600.0, "kl_loss_10": 123.65934333801269, "kl_loss_2": 1374.6581237792968, "kl_loss_3": 1025.8725860595703, "kl_loss_7": 331.70873565673827, "learning_rate": 0.00020997154521440098, "loss": 715.4568, "step": 7000 }, { "ce_loss_10": 3.557955777645111, "ce_loss_13": 3.503411018848419, "ce_loss_2": 4.137540674209594, "ce_loss_3": 3.959644913673401, "ce_loss_7": 3.6458826899528503, "epoch": 0.701, "grad_norm": 536.0, "kl_loss_10": 120.48597831726075, "kl_loss_2": 1374.6299133300781, "kl_loss_3": 1030.6099670410156, "kl_loss_7": 329.32201385498047, "learning_rate": 0.0002086805501869749, "loss": 710.937, "step": 7010 }, { "ce_loss_10": 3.526408576965332, "ce_loss_13": 3.4718175530433655, "ce_loss_2": 4.136154270172119, "ce_loss_3": 3.9502137660980225, "ce_loss_7": 3.618171179294586, "epoch": 0.702, "grad_norm": 676.0, "kl_loss_10": 123.665576171875, "kl_loss_2": 1445.9505981445313, "kl_loss_3": 1081.7786437988282, "kl_loss_7": 344.37613677978516, "learning_rate": 0.0002073924887431744, "loss": 737.9115, "step": 7020 }, { "ce_loss_10": 3.537488567829132, "ce_loss_13": 3.4824650168418883, "ce_loss_2": 4.120364391803742, "ce_loss_3": 3.942833948135376, "ce_loss_7": 3.628614103794098, "epoch": 0.703, "grad_norm": 580.0, "kl_loss_10": 122.33941802978515, "kl_loss_2": 1393.7160217285157, "kl_loss_3": 1044.1484924316405, "kl_loss_7": 335.0018844604492, "learning_rate": 0.00020610737385376348, "loss": 733.2367, "step": 7030 }, { "ce_loss_10": 3.6018667101860045, "ce_loss_13": 3.547727274894714, "ce_loss_2": 4.1699677348136905, "ce_loss_3": 3.9950947046279905, "ce_loss_7": 3.6891037821769714, "epoch": 0.704, "grad_norm": 664.0, "kl_loss_10": 122.15450096130371, "kl_loss_2": 1356.7330383300782, "kl_loss_3": 1011.3340454101562, "kl_loss_7": 326.781640625, "learning_rate": 0.00020482521845983521, "loss": 721.0978, "step": 7040 }, { "ce_loss_10": 3.59405198097229, "ce_loss_13": 3.538354456424713, "ce_loss_2": 4.1835708022117615, "ce_loss_3": 3.999073255062103, "ce_loss_7": 3.6849095940589907, "epoch": 0.705, "grad_norm": 688.0, "kl_loss_10": 126.76047058105469, "kl_loss_2": 1398.0797119140625, "kl_loss_3": 1040.0777679443358, "kl_loss_7": 339.4572814941406, "learning_rate": 0.00020354603547267987, "loss": 733.3687, "step": 7050 }, { "ce_loss_10": 3.579659843444824, "ce_loss_13": 3.521292781829834, "ce_loss_2": 4.17962476015091, "ce_loss_3": 3.9999419927597044, "ce_loss_7": 3.672910511493683, "epoch": 0.706, "grad_norm": 506.0, "kl_loss_10": 125.07297286987304, "kl_loss_2": 1407.0430908203125, "kl_loss_3": 1054.5457977294923, "kl_loss_7": 338.6408721923828, "learning_rate": 0.00020226983777365604, "loss": 743.0122, "step": 7060 }, { "ce_loss_10": 3.4797715306282044, "ce_loss_13": 3.4280895590782166, "ce_loss_2": 4.093261420726776, "ce_loss_3": 3.9058175683021545, "ce_loss_7": 3.5675304889678956, "epoch": 0.707, "grad_norm": 460.0, "kl_loss_10": 118.65347862243652, "kl_loss_2": 1437.4217590332032, "kl_loss_3": 1060.7939331054688, "kl_loss_7": 327.5643600463867, "learning_rate": 0.00020099663821406056, "loss": 725.6605, "step": 7070 }, { "ce_loss_10": 3.584267723560333, "ce_loss_13": 3.5309558272361756, "ce_loss_2": 4.158235001564026, "ce_loss_3": 3.9820183753967284, "ce_loss_7": 3.6715306878089904, "epoch": 0.708, "grad_norm": 736.0, "kl_loss_10": 120.5715232849121, "kl_loss_2": 1372.4082824707032, "kl_loss_3": 1020.9255096435547, "kl_loss_7": 326.9998489379883, "learning_rate": 0.00019972644961499853, "loss": 723.5401, "step": 7080 }, { "ce_loss_10": 3.5517083406448364, "ce_loss_13": 3.498913753032684, "ce_loss_2": 4.1585370898246765, "ce_loss_3": 3.977364408969879, "ce_loss_7": 3.6439939856529238, "epoch": 0.709, "grad_norm": 596.0, "kl_loss_10": 123.60837135314941, "kl_loss_2": 1431.0212951660155, "kl_loss_3": 1069.8634002685546, "kl_loss_7": 340.6724349975586, "learning_rate": 0.00019845928476725522, "loss": 732.3908, "step": 7090 }, { "ce_loss_10": 3.626398813724518, "ce_loss_13": 3.572152090072632, "ce_loss_2": 4.209421014785766, "ce_loss_3": 4.036573505401611, "ce_loss_7": 3.7170314311981203, "epoch": 0.71, "grad_norm": 596.0, "kl_loss_10": 123.35403976440429, "kl_loss_2": 1387.5173217773438, "kl_loss_3": 1038.8995300292968, "kl_loss_7": 335.42689056396483, "learning_rate": 0.00019719515643116677, "loss": 738.7718, "step": 7100 }, { "ce_loss_10": 3.572767961025238, "ce_loss_13": 3.515077757835388, "ce_loss_2": 4.155257606506348, "ce_loss_3": 3.9734201788902284, "ce_loss_7": 3.6621560573577883, "epoch": 0.711, "grad_norm": 580.0, "kl_loss_10": 123.21557540893555, "kl_loss_2": 1376.1987548828124, "kl_loss_3": 1026.8055908203125, "kl_loss_7": 331.16675262451173, "learning_rate": 0.0001959340773364911, "loss": 723.44, "step": 7110 }, { "ce_loss_10": 3.5861273288726805, "ce_loss_13": 3.5300360202789305, "ce_loss_2": 4.173725187778473, "ce_loss_3": 3.996710407733917, "ce_loss_7": 3.674662780761719, "epoch": 0.712, "grad_norm": 482.0, "kl_loss_10": 123.5694019317627, "kl_loss_2": 1397.5086608886718, "kl_loss_3": 1041.5564971923827, "kl_loss_7": 333.47118682861327, "learning_rate": 0.0001946760601822809, "loss": 715.5152, "step": 7120 }, { "ce_loss_10": 3.638036513328552, "ce_loss_13": 3.585539197921753, "ce_loss_2": 4.208897590637207, "ce_loss_3": 4.033627045154572, "ce_loss_7": 3.7303940296173095, "epoch": 0.713, "grad_norm": 588.0, "kl_loss_10": 120.84820327758788, "kl_loss_2": 1364.6588439941406, "kl_loss_3": 1017.2987609863281, "kl_loss_7": 331.6714385986328, "learning_rate": 0.00019342111763675512, "loss": 705.8723, "step": 7130 }, { "ce_loss_10": 3.6417887210845947, "ce_loss_13": 3.5866674900054933, "ce_loss_2": 4.206362402439117, "ce_loss_3": 4.032591104507446, "ce_loss_7": 3.726455843448639, "epoch": 0.714, "grad_norm": 488.0, "kl_loss_10": 125.14588623046875, "kl_loss_2": 1367.4994873046876, "kl_loss_3": 1022.9796356201172, "kl_loss_7": 332.9434066772461, "learning_rate": 0.00019216926233717085, "loss": 711.7196, "step": 7140 }, { "ce_loss_10": 3.5203476548194885, "ce_loss_13": 3.4688742399215697, "ce_loss_2": 4.134868347644806, "ce_loss_3": 3.9454983115196227, "ce_loss_7": 3.6101885557174684, "epoch": 0.715, "grad_norm": 536.0, "kl_loss_10": 121.2591423034668, "kl_loss_2": 1433.3612915039062, "kl_loss_3": 1061.6283813476562, "kl_loss_7": 328.62769012451173, "learning_rate": 0.00019092050688969737, "loss": 730.5997, "step": 7150 }, { "ce_loss_10": 3.5955933809280394, "ce_loss_13": 3.544034016132355, "ce_loss_2": 4.171285545825958, "ce_loss_3": 3.988926124572754, "ce_loss_7": 3.680844259262085, "epoch": 0.716, "grad_norm": 552.0, "kl_loss_10": 121.48760719299317, "kl_loss_2": 1380.6668640136718, "kl_loss_3": 1031.5187957763671, "kl_loss_7": 330.45118255615233, "learning_rate": 0.00018967486386928817, "loss": 714.2, "step": 7160 }, { "ce_loss_10": 3.469675064086914, "ce_loss_13": 3.414101004600525, "ce_loss_2": 4.0789219498634335, "ce_loss_3": 3.894315481185913, "ce_loss_7": 3.5637245774269104, "epoch": 0.717, "grad_norm": 656.0, "kl_loss_10": 121.65066604614258, "kl_loss_2": 1434.495782470703, "kl_loss_3": 1071.64462890625, "kl_loss_7": 339.27454376220703, "learning_rate": 0.00018843234581955443, "loss": 752.8244, "step": 7170 }, { "ce_loss_10": 3.485479485988617, "ce_loss_13": 3.429234707355499, "ce_loss_2": 4.093677091598511, "ce_loss_3": 3.906477701663971, "ce_loss_7": 3.5766185998916624, "epoch": 0.718, "grad_norm": 608.0, "kl_loss_10": 124.83513031005859, "kl_loss_2": 1436.6897827148437, "kl_loss_3": 1067.1097259521484, "kl_loss_7": 339.108544921875, "learning_rate": 0.00018719296525263924, "loss": 735.575, "step": 7180 }, { "ce_loss_10": 3.5846765637397766, "ce_loss_13": 3.529562759399414, "ce_loss_2": 4.150809407234192, "ce_loss_3": 3.9718990206718443, "ce_loss_7": 3.6702256917953493, "epoch": 0.719, "grad_norm": 442.0, "kl_loss_10": 122.53091278076172, "kl_loss_2": 1356.4698181152344, "kl_loss_3": 1008.6680480957032, "kl_loss_7": 327.7504058837891, "learning_rate": 0.0001859567346490913, "loss": 712.9806, "step": 7190 }, { "ce_loss_10": 3.554144012928009, "ce_loss_13": 3.4977105259895325, "ce_loss_2": 4.150786626338959, "ce_loss_3": 3.9735541462898256, "ce_loss_7": 3.6462294340133665, "epoch": 0.72, "grad_norm": 676.0, "kl_loss_10": 123.88136787414551, "kl_loss_2": 1414.2480834960938, "kl_loss_3": 1060.4930938720704, "kl_loss_7": 338.99412231445314, "learning_rate": 0.0001847236664577389, "loss": 719.9778, "step": 7200 }, { "ce_loss_10": 3.5858793020248414, "ce_loss_13": 3.5322806358337404, "ce_loss_2": 4.153151452541351, "ce_loss_3": 3.978500175476074, "ce_loss_7": 3.6706031560897827, "epoch": 0.721, "grad_norm": 430.0, "kl_loss_10": 123.21329269409179, "kl_loss_2": 1359.6646850585937, "kl_loss_3": 1009.3902221679688, "kl_loss_7": 329.09681243896483, "learning_rate": 0.00018349377309556487, "loss": 702.6776, "step": 7210 }, { "ce_loss_10": 3.525311827659607, "ce_loss_13": 3.4706099390983582, "ce_loss_2": 4.130126202106476, "ce_loss_3": 3.9440832138061523, "ce_loss_7": 3.615574586391449, "epoch": 0.722, "grad_norm": 684.0, "kl_loss_10": 122.96382064819336, "kl_loss_2": 1434.9597778320312, "kl_loss_3": 1071.8526824951173, "kl_loss_7": 338.4400665283203, "learning_rate": 0.00018226706694758193, "loss": 733.9826, "step": 7220 }, { "ce_loss_10": 3.6010716795921325, "ce_loss_13": 3.5477463483810423, "ce_loss_2": 4.1720555305480955, "ce_loss_3": 3.999657225608826, "ce_loss_7": 3.6874852776527405, "epoch": 0.723, "grad_norm": 596.0, "kl_loss_10": 122.04865837097168, "kl_loss_2": 1376.6821655273438, "kl_loss_3": 1035.7532440185546, "kl_loss_7": 331.5156555175781, "learning_rate": 0.0001810435603667075, "loss": 733.5101, "step": 7230 }, { "ce_loss_10": 3.448229801654816, "ce_loss_13": 3.395207440853119, "ce_loss_2": 4.045393764972687, "ce_loss_3": 3.8576236844062803, "ce_loss_7": 3.538518488407135, "epoch": 0.724, "grad_norm": 568.0, "kl_loss_10": 118.53696784973144, "kl_loss_2": 1409.9154846191407, "kl_loss_3": 1048.6354064941406, "kl_loss_7": 329.32737426757814, "learning_rate": 0.0001798232656736389, "loss": 731.7912, "step": 7240 }, { "ce_loss_10": 3.6248636484146117, "ce_loss_13": 3.5711260557174684, "ce_loss_2": 4.182618510723114, "ce_loss_3": 4.011233007907867, "ce_loss_7": 3.7107828497886657, "epoch": 0.725, "grad_norm": 648.0, "kl_loss_10": 123.01177520751953, "kl_loss_2": 1340.389141845703, "kl_loss_3": 1003.9724822998047, "kl_loss_7": 328.3069091796875, "learning_rate": 0.0001786061951567303, "loss": 717.0013, "step": 7250 }, { "ce_loss_10": 3.5421255350112917, "ce_loss_13": 3.4872742772102354, "ce_loss_2": 4.13176680803299, "ce_loss_3": 3.9531715869903565, "ce_loss_7": 3.6325071692466735, "epoch": 0.726, "grad_norm": 672.0, "kl_loss_10": 124.13465118408203, "kl_loss_2": 1389.2992919921876, "kl_loss_3": 1037.6413665771483, "kl_loss_7": 334.9740844726563, "learning_rate": 0.00017739236107186857, "loss": 725.9827, "step": 7260 }, { "ce_loss_10": 3.628635025024414, "ce_loss_13": 3.5775145173072813, "ce_loss_2": 4.183088374137879, "ce_loss_3": 4.009625935554505, "ce_loss_7": 3.7096946001052857, "epoch": 0.727, "grad_norm": 484.0, "kl_loss_10": 119.83437309265136, "kl_loss_2": 1335.5197631835938, "kl_loss_3": 996.1497222900391, "kl_loss_7": 322.9276321411133, "learning_rate": 0.00017618177564234904, "loss": 706.5093, "step": 7270 }, { "ce_loss_10": 3.606297266483307, "ce_loss_13": 3.5551859974861144, "ce_loss_2": 4.166692161560059, "ce_loss_3": 3.9936567068099977, "ce_loss_7": 3.6888177514076235, "epoch": 0.728, "grad_norm": 510.0, "kl_loss_10": 120.30095024108887, "kl_loss_2": 1330.818670654297, "kl_loss_3": 1000.4637176513672, "kl_loss_7": 321.7369613647461, "learning_rate": 0.00017497445105875377, "loss": 706.7148, "step": 7280 }, { "ce_loss_10": 3.513353967666626, "ce_loss_13": 3.4601715326309206, "ce_loss_2": 4.118406116962433, "ce_loss_3": 3.934498977661133, "ce_loss_7": 3.60476815700531, "epoch": 0.729, "grad_norm": 716.0, "kl_loss_10": 122.56389846801758, "kl_loss_2": 1424.7031311035157, "kl_loss_3": 1066.4053436279296, "kl_loss_7": 335.16686553955077, "learning_rate": 0.000173770399478828, "loss": 727.812, "step": 7290 }, { "ce_loss_10": 3.435167062282562, "ce_loss_13": 3.3832804918289185, "ce_loss_2": 4.021002113819122, "ce_loss_3": 3.841395652294159, "ce_loss_7": 3.519324839115143, "epoch": 0.73, "grad_norm": 636.0, "kl_loss_10": 121.18411865234376, "kl_loss_2": 1399.8541625976563, "kl_loss_3": 1044.4448303222657, "kl_loss_7": 331.79268341064454, "learning_rate": 0.0001725696330273575, "loss": 737.3452, "step": 7300 }, { "ce_loss_10": 3.6242716908454895, "ce_loss_13": 3.5716004967689514, "ce_loss_2": 4.191676688194275, "ce_loss_3": 4.013358986377716, "ce_loss_7": 3.712622547149658, "epoch": 0.731, "grad_norm": 652.0, "kl_loss_10": 119.75424537658691, "kl_loss_2": 1345.9469665527345, "kl_loss_3": 1000.0272003173828, "kl_loss_7": 325.01279907226564, "learning_rate": 0.00017137216379604724, "loss": 701.3725, "step": 7310 }, { "ce_loss_10": 3.506552290916443, "ce_loss_13": 3.452127659320831, "ce_loss_2": 4.097395420074463, "ce_loss_3": 3.9160946011543274, "ce_loss_7": 3.5951048493385316, "epoch": 0.732, "grad_norm": 884.0, "kl_loss_10": 121.7713752746582, "kl_loss_2": 1385.7196105957032, "kl_loss_3": 1034.3794219970703, "kl_loss_7": 329.5286666870117, "learning_rate": 0.00017017800384339925, "loss": 717.8389, "step": 7320 }, { "ce_loss_10": 3.4542035102844237, "ce_loss_13": 3.398160481452942, "ce_loss_2": 4.066942834854126, "ce_loss_3": 3.8794593691825865, "ce_loss_7": 3.5470592975616455, "epoch": 0.733, "grad_norm": 548.0, "kl_loss_10": 122.54219818115234, "kl_loss_2": 1439.6918151855468, "kl_loss_3": 1065.2620758056642, "kl_loss_7": 336.47518768310545, "learning_rate": 0.00016898716519459073, "loss": 717.2488, "step": 7330 }, { "ce_loss_10": 3.581343674659729, "ce_loss_13": 3.526875352859497, "ce_loss_2": 4.190316534042358, "ce_loss_3": 4.004525983333588, "ce_loss_7": 3.6749117851257322, "epoch": 0.734, "grad_norm": 624.0, "kl_loss_10": 125.29887886047364, "kl_loss_2": 1416.729052734375, "kl_loss_3": 1057.2286193847656, "kl_loss_7": 342.11007537841795, "learning_rate": 0.00016779965984135375, "loss": 727.1093, "step": 7340 }, { "ce_loss_10": 3.48441618680954, "ce_loss_13": 3.431960880756378, "ce_loss_2": 4.071764206886291, "ce_loss_3": 3.887656939029694, "ce_loss_7": 3.571474778652191, "epoch": 0.735, "grad_norm": 652.0, "kl_loss_10": 118.69619903564453, "kl_loss_2": 1381.495343017578, "kl_loss_3": 1023.2910827636719, "kl_loss_7": 324.31610412597655, "learning_rate": 0.00016661549974185424, "loss": 716.1001, "step": 7350 }, { "ce_loss_10": 3.5254514336586, "ce_loss_13": 3.4723365902900696, "ce_loss_2": 4.108010959625244, "ce_loss_3": 3.933693265914917, "ce_loss_7": 3.613771104812622, "epoch": 0.736, "grad_norm": 510.0, "kl_loss_10": 123.67498970031738, "kl_loss_2": 1388.5188293457031, "kl_loss_3": 1037.3818817138672, "kl_loss_7": 333.49166717529295, "learning_rate": 0.00016543469682057105, "loss": 711.0732, "step": 7360 }, { "ce_loss_10": 3.5577150702476503, "ce_loss_13": 3.5016186833381653, "ce_loss_2": 4.143563580513001, "ce_loss_3": 3.9626933932304382, "ce_loss_7": 3.6453551292419433, "epoch": 0.737, "grad_norm": 486.0, "kl_loss_10": 124.5495719909668, "kl_loss_2": 1398.1186157226562, "kl_loss_3": 1044.5123626708985, "kl_loss_7": 336.8351364135742, "learning_rate": 0.00016425726296817632, "loss": 723.8395, "step": 7370 }, { "ce_loss_10": 3.570967364311218, "ce_loss_13": 3.519097864627838, "ce_loss_2": 4.144868564605713, "ce_loss_3": 3.9664488077163695, "ce_loss_7": 3.6589640259742735, "epoch": 0.738, "grad_norm": 564.0, "kl_loss_10": 120.96943435668945, "kl_loss_2": 1359.615216064453, "kl_loss_3": 1016.4945587158203, "kl_loss_7": 326.19647216796875, "learning_rate": 0.00016308321004141607, "loss": 710.8186, "step": 7380 }, { "ce_loss_10": 3.5232744455337524, "ce_loss_13": 3.467752158641815, "ce_loss_2": 4.123220896720886, "ce_loss_3": 3.9425734519958495, "ce_loss_7": 3.615010941028595, "epoch": 0.739, "grad_norm": 528.0, "kl_loss_10": 124.12866439819337, "kl_loss_2": 1396.836865234375, "kl_loss_3": 1039.4474639892578, "kl_loss_7": 335.5626510620117, "learning_rate": 0.00016191254986299043, "loss": 719.3172, "step": 7390 }, { "ce_loss_10": 3.5758928298950194, "ce_loss_13": 3.523856747150421, "ce_loss_2": 4.139862871170044, "ce_loss_3": 3.963184416294098, "ce_loss_7": 3.658868145942688, "epoch": 0.74, "grad_norm": 656.0, "kl_loss_10": 120.1721866607666, "kl_loss_2": 1373.3153991699219, "kl_loss_3": 1020.2515319824219, "kl_loss_7": 326.2480270385742, "learning_rate": 0.00016074529422143398, "loss": 722.962, "step": 7400 }, { "ce_loss_10": 3.512345218658447, "ce_loss_13": 3.460979771614075, "ce_loss_2": 4.098102223873139, "ce_loss_3": 3.9215418100357056, "ce_loss_7": 3.603636908531189, "epoch": 0.741, "grad_norm": 736.0, "kl_loss_10": 122.76387939453124, "kl_loss_2": 1396.6786743164062, "kl_loss_3": 1042.0954986572265, "kl_loss_7": 331.67714538574216, "learning_rate": 0.0001595814548709983, "loss": 725.9221, "step": 7410 }, { "ce_loss_10": 3.5902742028236387, "ce_loss_13": 3.537025344371796, "ce_loss_2": 4.177208817005157, "ce_loss_3": 3.997780406475067, "ce_loss_7": 3.6821145176887513, "epoch": 0.742, "grad_norm": 564.0, "kl_loss_10": 124.67025566101074, "kl_loss_2": 1403.5627502441407, "kl_loss_3": 1044.860205078125, "kl_loss_7": 338.93174896240237, "learning_rate": 0.00015842104353153285, "loss": 727.9301, "step": 7420 }, { "ce_loss_10": 3.604291892051697, "ce_loss_13": 3.548382747173309, "ce_loss_2": 4.191494596004486, "ce_loss_3": 4.010533785820007, "ce_loss_7": 3.6923877358436585, "epoch": 0.743, "grad_norm": 472.0, "kl_loss_10": 124.3033836364746, "kl_loss_2": 1400.2394775390626, "kl_loss_3": 1049.6304718017577, "kl_loss_7": 337.5371841430664, "learning_rate": 0.0001572640718883667, "loss": 738.9812, "step": 7430 }, { "ce_loss_10": 3.538733124732971, "ce_loss_13": 3.4872957110404967, "ce_loss_2": 4.113576173782349, "ce_loss_3": 3.934070587158203, "ce_loss_7": 3.625633692741394, "epoch": 0.744, "grad_norm": 572.0, "kl_loss_10": 119.72441368103027, "kl_loss_2": 1360.4320068359375, "kl_loss_3": 1016.749853515625, "kl_loss_7": 324.85569152832034, "learning_rate": 0.0001561105515921915, "loss": 723.982, "step": 7440 }, { "ce_loss_10": 3.3811042308807373, "ce_loss_13": 3.330593299865723, "ce_loss_2": 3.999517023563385, "ce_loss_3": 3.807965099811554, "ce_loss_7": 3.472578394412994, "epoch": 0.745, "grad_norm": 536.0, "kl_loss_10": 118.95663032531738, "kl_loss_2": 1440.7939697265624, "kl_loss_3": 1063.4994506835938, "kl_loss_7": 330.3991363525391, "learning_rate": 0.0001549604942589441, "loss": 720.67, "step": 7450 }, { "ce_loss_10": 3.574368488788605, "ce_loss_13": 3.523599100112915, "ce_loss_2": 4.1251343250274655, "ce_loss_3": 3.9507637143135073, "ce_loss_7": 3.656884014606476, "epoch": 0.746, "grad_norm": 544.0, "kl_loss_10": 118.31760025024414, "kl_loss_2": 1313.1225219726562, "kl_loss_3": 979.9062561035156, "kl_loss_7": 317.6555862426758, "learning_rate": 0.00015381391146968864, "loss": 701.5394, "step": 7460 }, { "ce_loss_10": 3.5501048445701597, "ce_loss_13": 3.498226988315582, "ce_loss_2": 4.138570165634155, "ce_loss_3": 3.95827853679657, "ce_loss_7": 3.636888933181763, "epoch": 0.747, "grad_norm": 496.0, "kl_loss_10": 119.63836708068848, "kl_loss_2": 1383.4101440429688, "kl_loss_3": 1029.2711975097657, "kl_loss_7": 327.9986038208008, "learning_rate": 0.00015267081477050133, "loss": 722.7866, "step": 7470 }, { "ce_loss_10": 3.651386225223541, "ce_loss_13": 3.595274817943573, "ce_loss_2": 4.220568442344666, "ce_loss_3": 4.041546940803528, "ce_loss_7": 3.73706601858139, "epoch": 0.748, "grad_norm": 640.0, "kl_loss_10": 124.5992389678955, "kl_loss_2": 1365.5502685546876, "kl_loss_3": 1015.7865600585938, "kl_loss_7": 333.7725234985352, "learning_rate": 0.00015153121567235335, "loss": 706.2273, "step": 7480 }, { "ce_loss_10": 3.543039095401764, "ce_loss_13": 3.4888150691986084, "ce_loss_2": 4.1341440916061405, "ce_loss_3": 3.9522986888885496, "ce_loss_7": 3.628537285327911, "epoch": 0.749, "grad_norm": 506.0, "kl_loss_10": 122.94704780578613, "kl_loss_2": 1414.9927001953124, "kl_loss_3": 1049.655908203125, "kl_loss_7": 334.1303375244141, "learning_rate": 0.00015039512565099468, "loss": 708.8927, "step": 7490 }, { "ce_loss_10": 3.6088897585868835, "ce_loss_13": 3.5548375248908997, "ce_loss_2": 4.185053658485413, "ce_loss_3": 4.00367146730423, "ce_loss_7": 3.693446183204651, "epoch": 0.75, "grad_norm": 592.0, "kl_loss_10": 123.0703842163086, "kl_loss_2": 1378.3978271484375, "kl_loss_3": 1023.3384765625, "kl_loss_7": 330.60157012939453, "learning_rate": 0.00014926255614683932, "loss": 733.1395, "step": 7500 }, { "ce_loss_10": 3.5458021998405456, "ce_loss_13": 3.4934080958366396, "ce_loss_2": 4.122628927230835, "ce_loss_3": 3.943239653110504, "ce_loss_7": 3.6338084936141968, "epoch": 0.751, "grad_norm": 576.0, "kl_loss_10": 121.31806373596191, "kl_loss_2": 1371.3551818847657, "kl_loss_3": 1020.9572631835938, "kl_loss_7": 328.8893646240234, "learning_rate": 0.0001481335185648498, "loss": 724.9404, "step": 7510 }, { "ce_loss_10": 3.557645547389984, "ce_loss_13": 3.5049761295318604, "ce_loss_2": 4.132593739032745, "ce_loss_3": 3.9585251092910765, "ce_loss_7": 3.6444791197776794, "epoch": 0.752, "grad_norm": 716.0, "kl_loss_10": 120.7141185760498, "kl_loss_2": 1376.1685791015625, "kl_loss_3": 1027.3751647949218, "kl_loss_7": 331.6845397949219, "learning_rate": 0.0001470080242744218, "loss": 711.1242, "step": 7520 }, { "ce_loss_10": 3.5526819705963133, "ce_loss_13": 3.5017164587974547, "ce_loss_2": 4.13426855802536, "ce_loss_3": 3.950691211223602, "ce_loss_7": 3.637460446357727, "epoch": 0.753, "grad_norm": 752.0, "kl_loss_10": 118.99393234252929, "kl_loss_2": 1376.8853271484375, "kl_loss_3": 1021.0036926269531, "kl_loss_7": 324.0994644165039, "learning_rate": 0.0001458860846092705, "loss": 721.2631, "step": 7530 }, { "ce_loss_10": 3.587147796154022, "ce_loss_13": 3.534837579727173, "ce_loss_2": 4.153036820888519, "ce_loss_3": 3.982726287841797, "ce_loss_7": 3.6733571171760557, "epoch": 0.754, "grad_norm": 568.0, "kl_loss_10": 120.59756584167481, "kl_loss_2": 1358.3469970703125, "kl_loss_3": 1018.7931213378906, "kl_loss_7": 325.6051193237305, "learning_rate": 0.00014476771086731566, "loss": 702.7048, "step": 7540 }, { "ce_loss_10": 3.6986522316932677, "ce_loss_13": 3.640228807926178, "ce_loss_2": 4.267730498313904, "ce_loss_3": 4.08877055644989, "ce_loss_7": 3.7881681442260744, "epoch": 0.755, "grad_norm": 716.0, "kl_loss_10": 125.66169509887695, "kl_loss_2": 1361.5933959960937, "kl_loss_3": 1006.495458984375, "kl_loss_7": 334.8022232055664, "learning_rate": 0.00014365291431056872, "loss": 725.5867, "step": 7550 }, { "ce_loss_10": 3.526888978481293, "ce_loss_13": 3.4713703632354735, "ce_loss_2": 4.122870457172394, "ce_loss_3": 3.9425498366355898, "ce_loss_7": 3.622254657745361, "epoch": 0.756, "grad_norm": 804.0, "kl_loss_10": 124.94528083801269, "kl_loss_2": 1422.8938781738282, "kl_loss_3": 1061.9186767578126, "kl_loss_7": 342.73841705322263, "learning_rate": 0.00014254170616501827, "loss": 726.9622, "step": 7560 }, { "ce_loss_10": 3.4563034534454347, "ce_loss_13": 3.399739706516266, "ce_loss_2": 4.073044645786285, "ce_loss_3": 3.8895306468009947, "ce_loss_7": 3.552928638458252, "epoch": 0.757, "grad_norm": 756.0, "kl_loss_10": 123.76986045837403, "kl_loss_2": 1446.2292724609374, "kl_loss_3": 1085.1556243896484, "kl_loss_7": 342.7120895385742, "learning_rate": 0.0001414340976205183, "loss": 745.9122, "step": 7570 }, { "ce_loss_10": 3.4699150562286376, "ce_loss_13": 3.4161347150802612, "ce_loss_2": 4.070275318622589, "ce_loss_3": 3.888191211223602, "ce_loss_7": 3.563317656517029, "epoch": 0.758, "grad_norm": 502.0, "kl_loss_10": 121.47369956970215, "kl_loss_2": 1404.6524047851562, "kl_loss_3": 1044.3208923339844, "kl_loss_7": 333.030908203125, "learning_rate": 0.00014033009983067452, "loss": 723.4891, "step": 7580 }, { "ce_loss_10": 3.635603678226471, "ce_loss_13": 3.5829265475273133, "ce_loss_2": 4.192306113243103, "ce_loss_3": 4.022082531452179, "ce_loss_7": 3.7206236600875853, "epoch": 0.759, "grad_norm": 464.0, "kl_loss_10": 119.94292831420898, "kl_loss_2": 1340.960137939453, "kl_loss_3": 1001.6939056396484, "kl_loss_7": 322.9109375, "learning_rate": 0.00013922972391273224, "loss": 710.9632, "step": 7590 }, { "ce_loss_10": 3.635016143321991, "ce_loss_13": 3.582967531681061, "ce_loss_2": 4.220010781288147, "ce_loss_3": 4.042008626461029, "ce_loss_7": 3.722532641887665, "epoch": 0.76, "grad_norm": 476.0, "kl_loss_10": 121.22842979431152, "kl_loss_2": 1370.4319946289063, "kl_loss_3": 1019.5878387451172, "kl_loss_7": 326.00676879882815, "learning_rate": 0.0001381329809474649, "loss": 718.1754, "step": 7600 }, { "ce_loss_10": 3.5380415320396423, "ce_loss_13": 3.4831011414527895, "ce_loss_2": 4.144773721694946, "ce_loss_3": 3.96180077791214, "ce_loss_7": 3.6306992173194885, "epoch": 0.761, "grad_norm": 744.0, "kl_loss_10": 122.81468658447265, "kl_loss_2": 1420.2211853027343, "kl_loss_3": 1060.393051147461, "kl_loss_7": 335.4973602294922, "learning_rate": 0.0001370398819790621, "loss": 732.0773, "step": 7610 }, { "ce_loss_10": 3.678859758377075, "ce_loss_13": 3.6265331745147704, "ce_loss_2": 4.236479330062866, "ce_loss_3": 4.066269385814667, "ce_loss_7": 3.7672220945358275, "epoch": 0.762, "grad_norm": 4000.0, "kl_loss_10": 122.85980682373047, "kl_loss_2": 1334.2934326171876, "kl_loss_3": 996.7666259765625, "kl_loss_7": 327.2108581542969, "learning_rate": 0.00013595043801501794, "loss": 697.9223, "step": 7620 }, { "ce_loss_10": 3.472164535522461, "ce_loss_13": 3.4196288228034972, "ce_loss_2": 4.094062793254852, "ce_loss_3": 3.90572075843811, "ce_loss_7": 3.565885674953461, "epoch": 0.763, "grad_norm": 940.0, "kl_loss_10": 120.75509605407714, "kl_loss_2": 1461.8281555175781, "kl_loss_3": 1080.7115509033204, "kl_loss_7": 336.56712799072267, "learning_rate": 0.00013486466002602133, "loss": 732.8728, "step": 7630 }, { "ce_loss_10": 3.594163513183594, "ce_loss_13": 3.538478171825409, "ce_loss_2": 4.157584226131439, "ce_loss_3": 3.9850178241729735, "ce_loss_7": 3.678713285923004, "epoch": 0.764, "grad_norm": 520.0, "kl_loss_10": 122.31045188903809, "kl_loss_2": 1349.2873657226562, "kl_loss_3": 1010.3630493164062, "kl_loss_7": 327.035498046875, "learning_rate": 0.00013378255894584462, "loss": 726.2445, "step": 7640 }, { "ce_loss_10": 3.524076557159424, "ce_loss_13": 3.467902934551239, "ce_loss_2": 4.118615138530731, "ce_loss_3": 3.9384886622428894, "ce_loss_7": 3.6175344228744506, "epoch": 0.765, "grad_norm": 572.0, "kl_loss_10": 123.03233604431152, "kl_loss_2": 1402.026629638672, "kl_loss_3": 1045.6179962158203, "kl_loss_7": 336.69920959472654, "learning_rate": 0.0001327041456712334, "loss": 726.6062, "step": 7650 }, { "ce_loss_10": 3.564585876464844, "ce_loss_13": 3.5087494015693665, "ce_loss_2": 4.150593364238739, "ce_loss_3": 3.973892557621002, "ce_loss_7": 3.6529523134231567, "epoch": 0.766, "grad_norm": 572.0, "kl_loss_10": 123.00491752624512, "kl_loss_2": 1400.3231262207032, "kl_loss_3": 1047.99375, "kl_loss_7": 335.2826919555664, "learning_rate": 0.00013162943106179747, "loss": 728.8339, "step": 7660 }, { "ce_loss_10": 3.545767056941986, "ce_loss_13": 3.491313898563385, "ce_loss_2": 4.129991114139557, "ce_loss_3": 3.9483101606369018, "ce_loss_7": 3.6297065734863283, "epoch": 0.767, "grad_norm": 680.0, "kl_loss_10": 123.4360237121582, "kl_loss_2": 1383.0119750976562, "kl_loss_3": 1028.4972290039063, "kl_loss_7": 329.80051574707034, "learning_rate": 0.00013055842593990132, "loss": 717.3344, "step": 7670 }, { "ce_loss_10": 3.484251117706299, "ce_loss_13": 3.4325303077697753, "ce_loss_2": 4.067324638366699, "ce_loss_3": 3.8889448761940004, "ce_loss_7": 3.5747691988945007, "epoch": 0.768, "grad_norm": 520.0, "kl_loss_10": 119.67072257995605, "kl_loss_2": 1369.2867492675782, "kl_loss_3": 1022.6093109130859, "kl_loss_7": 327.1118728637695, "learning_rate": 0.00012949114109055414, "loss": 725.7104, "step": 7680 }, { "ce_loss_10": 3.531809902191162, "ce_loss_13": 3.47885684967041, "ce_loss_2": 4.124879586696625, "ce_loss_3": 3.9439189553260805, "ce_loss_7": 3.622943699359894, "epoch": 0.769, "grad_norm": 624.0, "kl_loss_10": 122.69542655944824, "kl_loss_2": 1398.2911865234375, "kl_loss_3": 1038.9411651611329, "kl_loss_7": 336.0517639160156, "learning_rate": 0.00012842758726130281, "loss": 730.0852, "step": 7690 }, { "ce_loss_10": 3.56878023147583, "ce_loss_13": 3.5146379590034487, "ce_loss_2": 4.161990666389466, "ce_loss_3": 3.9848286867141725, "ce_loss_7": 3.663095223903656, "epoch": 0.77, "grad_norm": 644.0, "kl_loss_10": 123.06643791198731, "kl_loss_2": 1403.577325439453, "kl_loss_3": 1048.3497802734375, "kl_loss_7": 337.41247100830077, "learning_rate": 0.00012736777516212267, "loss": 719.0299, "step": 7700 }, { "ce_loss_10": 3.564795804023743, "ce_loss_13": 3.5100102066993712, "ce_loss_2": 4.15794951915741, "ce_loss_3": 3.9753509163856506, "ce_loss_7": 3.656312108039856, "epoch": 0.771, "grad_norm": 540.0, "kl_loss_10": 123.37502822875976, "kl_loss_2": 1401.978204345703, "kl_loss_3": 1048.225601196289, "kl_loss_7": 339.4851440429687, "learning_rate": 0.00012631171546530968, "loss": 716.8339, "step": 7710 }, { "ce_loss_10": 3.5768836736679077, "ce_loss_13": 3.521056818962097, "ce_loss_2": 4.168219780921936, "ce_loss_3": 3.9888468265533445, "ce_loss_7": 3.6682852506637573, "epoch": 0.772, "grad_norm": 556.0, "kl_loss_10": 124.41245651245117, "kl_loss_2": 1394.1679321289062, "kl_loss_3": 1046.6662353515626, "kl_loss_7": 335.3405014038086, "learning_rate": 0.00012525941880537307, "loss": 729.6016, "step": 7720 }, { "ce_loss_10": 3.615005624294281, "ce_loss_13": 3.5605417847633363, "ce_loss_2": 4.187514328956604, "ce_loss_3": 4.013498651981354, "ce_loss_7": 3.7059258341789247, "epoch": 0.773, "grad_norm": 430.0, "kl_loss_10": 121.39755172729492, "kl_loss_2": 1357.1861572265625, "kl_loss_3": 1015.121484375, "kl_loss_7": 327.748600769043, "learning_rate": 0.00012421089577892869, "loss": 711.3485, "step": 7730 }, { "ce_loss_10": 3.56311240196228, "ce_loss_13": 3.5084868669509888, "ce_loss_2": 4.148178255558014, "ce_loss_3": 3.9718670725822447, "ce_loss_7": 3.6510336875915526, "epoch": 0.774, "grad_norm": 700.0, "kl_loss_10": 123.63254661560059, "kl_loss_2": 1398.957989501953, "kl_loss_3": 1048.010546875, "kl_loss_7": 337.9348663330078, "learning_rate": 0.0001231661569445919, "loss": 727.9016, "step": 7740 }, { "ce_loss_10": 3.4179163813591003, "ce_loss_13": 3.3657287478446962, "ce_loss_2": 4.01623170375824, "ce_loss_3": 3.8275793313980104, "ce_loss_7": 3.5108714938163756, "epoch": 0.775, "grad_norm": 512.0, "kl_loss_10": 120.1868335723877, "kl_loss_2": 1396.8933349609374, "kl_loss_3": 1037.240249633789, "kl_loss_7": 331.2000259399414, "learning_rate": 0.00012212521282287093, "loss": 730.0172, "step": 7750 }, { "ce_loss_10": 3.578013074398041, "ce_loss_13": 3.521529698371887, "ce_loss_2": 4.159704029560089, "ce_loss_3": 3.979478430747986, "ce_loss_7": 3.667074370384216, "epoch": 0.776, "grad_norm": 636.0, "kl_loss_10": 125.35750999450684, "kl_loss_2": 1375.0942077636719, "kl_loss_3": 1026.904037475586, "kl_loss_7": 334.176286315918, "learning_rate": 0.00012108807389606158, "loss": 729.955, "step": 7760 }, { "ce_loss_10": 3.5703899383544924, "ce_loss_13": 3.5183567881584166, "ce_loss_2": 4.152936434745788, "ce_loss_3": 3.975999045372009, "ce_loss_7": 3.6579775333404543, "epoch": 0.777, "grad_norm": 564.0, "kl_loss_10": 119.99107208251954, "kl_loss_2": 1371.4642456054687, "kl_loss_3": 1021.7969329833984, "kl_loss_7": 325.3572662353516, "learning_rate": 0.00012005475060814159, "loss": 713.174, "step": 7770 }, { "ce_loss_10": 3.5056790947914123, "ce_loss_13": 3.452779543399811, "ce_loss_2": 4.10383517742157, "ce_loss_3": 3.917849028110504, "ce_loss_7": 3.595883679389954, "epoch": 0.778, "grad_norm": 880.0, "kl_loss_10": 123.71958351135254, "kl_loss_2": 1422.6599975585937, "kl_loss_3": 1058.5932189941407, "kl_loss_7": 337.1080978393555, "learning_rate": 0.00011902525336466464, "loss": 729.9841, "step": 7780 }, { "ce_loss_10": 3.498458230495453, "ce_loss_13": 3.4422589898109437, "ce_loss_2": 4.102750754356384, "ce_loss_3": 3.920209753513336, "ce_loss_7": 3.590150833129883, "epoch": 0.779, "grad_norm": 616.0, "kl_loss_10": 125.57383270263672, "kl_loss_2": 1428.4908935546875, "kl_loss_3": 1067.4621215820312, "kl_loss_7": 342.1434066772461, "learning_rate": 0.00011799959253265668, "loss": 725.2893, "step": 7790 }, { "ce_loss_10": 3.5606491565704346, "ce_loss_13": 3.504223346710205, "ce_loss_2": 4.145915222167969, "ce_loss_3": 3.9612861037254334, "ce_loss_7": 3.6475781321525576, "epoch": 0.78, "grad_norm": 552.0, "kl_loss_10": 124.7652572631836, "kl_loss_2": 1399.7738220214844, "kl_loss_3": 1040.1979217529297, "kl_loss_7": 335.10644683837893, "learning_rate": 0.00011697777844051105, "loss": 725.9237, "step": 7800 }, { "ce_loss_10": 3.5388702630996702, "ce_loss_13": 3.4825316429138184, "ce_loss_2": 4.142555356025696, "ce_loss_3": 3.955347108840942, "ce_loss_7": 3.6329334139823914, "epoch": 0.781, "grad_norm": 664.0, "kl_loss_10": 123.95801277160645, "kl_loss_2": 1431.2548095703125, "kl_loss_3": 1059.7420715332032, "kl_loss_7": 337.7406295776367, "learning_rate": 0.00011595982137788402, "loss": 730.6986, "step": 7810 }, { "ce_loss_10": 3.5142834782600403, "ce_loss_13": 3.462408125400543, "ce_loss_2": 4.0907470941543576, "ce_loss_3": 3.9156432271003725, "ce_loss_7": 3.6036725878715514, "epoch": 0.782, "grad_norm": 572.0, "kl_loss_10": 120.28751068115234, "kl_loss_2": 1360.7383850097656, "kl_loss_3": 1015.1698822021484, "kl_loss_7": 327.2494552612305, "learning_rate": 0.00011494573159559212, "loss": 716.929, "step": 7820 }, { "ce_loss_10": 3.4999680519104004, "ce_loss_13": 3.444619429111481, "ce_loss_2": 4.088768780231476, "ce_loss_3": 3.917073929309845, "ce_loss_7": 3.5901222705841063, "epoch": 0.783, "grad_norm": 520.0, "kl_loss_10": 121.69903526306152, "kl_loss_2": 1381.9976928710937, "kl_loss_3": 1044.1288269042968, "kl_loss_7": 330.6407958984375, "learning_rate": 0.00011393551930550828, "loss": 732.0409, "step": 7830 }, { "ce_loss_10": 3.6485252261161802, "ce_loss_13": 3.591708707809448, "ce_loss_2": 4.217781341075897, "ce_loss_3": 4.040521049499512, "ce_loss_7": 3.7363662481307984, "epoch": 0.784, "grad_norm": 588.0, "kl_loss_10": 124.61389122009277, "kl_loss_2": 1368.7540710449218, "kl_loss_3": 1018.7755676269531, "kl_loss_7": 332.2383743286133, "learning_rate": 0.00011292919468045875, "loss": 715.6162, "step": 7840 }, { "ce_loss_10": 3.594165802001953, "ce_loss_13": 3.5395636916160584, "ce_loss_2": 4.171285450458527, "ce_loss_3": 3.9935011982917787, "ce_loss_7": 3.679953920841217, "epoch": 0.785, "grad_norm": 492.0, "kl_loss_10": 122.30264816284179, "kl_loss_2": 1382.2249145507812, "kl_loss_3": 1033.9700012207031, "kl_loss_7": 332.99535675048827, "learning_rate": 0.00011192676785412154, "loss": 713.0011, "step": 7850 }, { "ce_loss_10": 3.5345438003540037, "ce_loss_13": 3.4770457625389097, "ce_loss_2": 4.13993616104126, "ce_loss_3": 3.951429307460785, "ce_loss_7": 3.627079486846924, "epoch": 0.786, "grad_norm": 588.0, "kl_loss_10": 123.3083724975586, "kl_loss_2": 1404.34658203125, "kl_loss_3": 1041.594400024414, "kl_loss_7": 334.9306671142578, "learning_rate": 0.00011092824892092374, "loss": 727.4104, "step": 7860 }, { "ce_loss_10": 3.459770083427429, "ce_loss_13": 3.408163917064667, "ce_loss_2": 4.069538617134095, "ce_loss_3": 3.8821940660476684, "ce_loss_7": 3.5510602235794066, "epoch": 0.787, "grad_norm": 568.0, "kl_loss_10": 120.58967247009278, "kl_loss_2": 1421.5660217285156, "kl_loss_3": 1058.3918731689453, "kl_loss_7": 332.33583374023436, "learning_rate": 0.0001099336479359398, "loss": 722.3462, "step": 7870 }, { "ce_loss_10": 3.5867392897605894, "ce_loss_13": 3.5377102971076964, "ce_loss_2": 4.162244534492492, "ce_loss_3": 3.9819667458534242, "ce_loss_7": 3.6743969202041624, "epoch": 0.788, "grad_norm": 556.0, "kl_loss_10": 120.32511672973632, "kl_loss_2": 1368.990167236328, "kl_loss_3": 1015.7321105957031, "kl_loss_7": 326.9243728637695, "learning_rate": 0.00010894297491479043, "loss": 718.1454, "step": 7880 }, { "ce_loss_10": 3.587001371383667, "ce_loss_13": 3.53232136964798, "ce_loss_2": 4.164430761337281, "ce_loss_3": 3.9850462794303896, "ce_loss_7": 3.6721792697906492, "epoch": 0.789, "grad_norm": 484.0, "kl_loss_10": 122.12056427001953, "kl_loss_2": 1371.6335876464843, "kl_loss_3": 1019.5728363037109, "kl_loss_7": 329.8185546875, "learning_rate": 0.00010795623983354214, "loss": 710.1288, "step": 7890 }, { "ce_loss_10": 3.467512822151184, "ce_loss_13": 3.415497064590454, "ce_loss_2": 4.068468177318573, "ce_loss_3": 3.88867791891098, "ce_loss_7": 3.561111843585968, "epoch": 0.79, "grad_norm": 684.0, "kl_loss_10": 123.31999626159669, "kl_loss_2": 1425.7031616210938, "kl_loss_3": 1062.748342895508, "kl_loss_7": 341.4196151733398, "learning_rate": 0.00010697345262860636, "loss": 728.0182, "step": 7900 }, { "ce_loss_10": 3.6126829385757446, "ce_loss_13": 3.5593294978141783, "ce_loss_2": 4.182345855236053, "ce_loss_3": 4.000946879386902, "ce_loss_7": 3.7014246940612794, "epoch": 0.791, "grad_norm": 704.0, "kl_loss_10": 123.03226280212402, "kl_loss_2": 1358.3401000976562, "kl_loss_3": 1010.9559631347656, "kl_loss_7": 329.75187530517576, "learning_rate": 0.00010599462319663906, "loss": 708.3709, "step": 7910 }, { "ce_loss_10": 3.5866196155548096, "ce_loss_13": 3.532234954833984, "ce_loss_2": 4.136071729660034, "ce_loss_3": 3.971488630771637, "ce_loss_7": 3.672000765800476, "epoch": 0.792, "grad_norm": 472.0, "kl_loss_10": 120.42074203491211, "kl_loss_2": 1328.9012939453125, "kl_loss_3": 996.4815002441406, "kl_loss_7": 325.10135955810546, "learning_rate": 0.00010501976139444191, "loss": 702.0556, "step": 7920 }, { "ce_loss_10": 3.6193451285362244, "ce_loss_13": 3.564842689037323, "ce_loss_2": 4.190324020385742, "ce_loss_3": 4.011660039424896, "ce_loss_7": 3.700591838359833, "epoch": 0.793, "grad_norm": 640.0, "kl_loss_10": 122.40432777404786, "kl_loss_2": 1363.9416870117188, "kl_loss_3": 1013.281216430664, "kl_loss_7": 326.6585952758789, "learning_rate": 0.0001040488770388625, "loss": 717.8754, "step": 7930 }, { "ce_loss_10": 3.5575379133224487, "ce_loss_13": 3.5070634841918946, "ce_loss_2": 4.132549190521241, "ce_loss_3": 3.9550219655036924, "ce_loss_7": 3.642544448375702, "epoch": 0.794, "grad_norm": 640.0, "kl_loss_10": 122.45821609497071, "kl_loss_2": 1379.8712585449218, "kl_loss_3": 1030.4774230957032, "kl_loss_7": 332.6387283325195, "learning_rate": 0.00010308197990669538, "loss": 717.4487, "step": 7940 }, { "ce_loss_10": 3.67416570186615, "ce_loss_13": 3.6173601388931274, "ce_loss_2": 4.2498343110084535, "ce_loss_3": 4.069960105419159, "ce_loss_7": 3.7626309990882874, "epoch": 0.795, "grad_norm": 640.0, "kl_loss_10": 126.44907455444336, "kl_loss_2": 1373.5286987304687, "kl_loss_3": 1024.227099609375, "kl_loss_7": 334.73041381835935, "learning_rate": 0.0001021190797345839, "loss": 713.8917, "step": 7950 }, { "ce_loss_10": 3.3944215536117555, "ce_loss_13": 3.3384058237075807, "ce_loss_2": 4.020748329162598, "ce_loss_3": 3.8264388918876646, "ce_loss_7": 3.4902202129364013, "epoch": 0.796, "grad_norm": 628.0, "kl_loss_10": 124.78897056579589, "kl_loss_2": 1471.5206115722656, "kl_loss_3": 1092.6566253662108, "kl_loss_7": 346.82877960205076, "learning_rate": 0.00010116018621892236, "loss": 734.37, "step": 7960 }, { "ce_loss_10": 3.6051715135574343, "ce_loss_13": 3.548674774169922, "ce_loss_2": 4.196666061878204, "ce_loss_3": 4.012264215946198, "ce_loss_7": 3.6960227131843566, "epoch": 0.797, "grad_norm": 660.0, "kl_loss_10": 127.20088500976563, "kl_loss_2": 1406.4417114257812, "kl_loss_3": 1049.11025390625, "kl_loss_7": 343.34503784179685, "learning_rate": 0.00010020530901575753, "loss": 714.2687, "step": 7970 }, { "ce_loss_10": 3.640721297264099, "ce_loss_13": 3.5854681611061094, "ce_loss_2": 4.2142220616340635, "ce_loss_3": 4.037451386451721, "ce_loss_7": 3.727248787879944, "epoch": 0.798, "grad_norm": 536.0, "kl_loss_10": 124.72403526306152, "kl_loss_2": 1381.4070373535155, "kl_loss_3": 1030.5914398193358, "kl_loss_7": 332.8453369140625, "learning_rate": 9.925445774069231e-05, "loss": 705.9745, "step": 7980 }, { "ce_loss_10": 3.591674566268921, "ce_loss_13": 3.5351518869400023, "ce_loss_2": 4.173741042613983, "ce_loss_3": 3.9968824982643127, "ce_loss_7": 3.682232987880707, "epoch": 0.799, "grad_norm": 640.0, "kl_loss_10": 123.57306632995605, "kl_loss_2": 1377.6592651367187, "kl_loss_3": 1026.3024841308593, "kl_loss_7": 331.9126678466797, "learning_rate": 9.830764196878872e-05, "loss": 704.6161, "step": 7990 }, { "ce_loss_10": 3.523613679409027, "ce_loss_13": 3.470061206817627, "ce_loss_2": 4.108954405784607, "ce_loss_3": 3.929824376106262, "ce_loss_7": 3.61134192943573, "epoch": 0.8, "grad_norm": 470.0, "kl_loss_10": 120.64988784790039, "kl_loss_2": 1404.6622375488282, "kl_loss_3": 1043.9930480957032, "kl_loss_7": 329.1615982055664, "learning_rate": 9.736487123447069e-05, "loss": 721.8352, "step": 8000 }, { "ce_loss_10": 3.4729248166084288, "ce_loss_13": 3.41879620552063, "ce_loss_2": 4.0886385679245, "ce_loss_3": 3.8962788820266723, "ce_loss_7": 3.5611618995666503, "epoch": 0.801, "grad_norm": 560.0, "kl_loss_10": 123.18882446289062, "kl_loss_2": 1455.8741088867187, "kl_loss_3": 1067.2413635253906, "kl_loss_7": 335.87705688476564, "learning_rate": 9.642615503142926e-05, "loss": 732.712, "step": 8010 }, { "ce_loss_10": 3.5466567516326903, "ce_loss_13": 3.4922700762748717, "ce_loss_2": 4.133945047855377, "ce_loss_3": 3.9467738389968874, "ce_loss_7": 3.6311212301254274, "epoch": 0.802, "grad_norm": 544.0, "kl_loss_10": 121.13310089111329, "kl_loss_2": 1388.1513732910157, "kl_loss_3": 1021.6431732177734, "kl_loss_7": 326.09789276123047, "learning_rate": 9.549150281252633e-05, "loss": 714.0566, "step": 8020 }, { "ce_loss_10": 3.574864935874939, "ce_loss_13": 3.5205947041511534, "ce_loss_2": 4.153928911685943, "ce_loss_3": 3.979640507698059, "ce_loss_7": 3.66382520198822, "epoch": 0.803, "grad_norm": 438.0, "kl_loss_10": 122.92463874816895, "kl_loss_2": 1376.0149536132812, "kl_loss_3": 1028.9556762695313, "kl_loss_7": 330.7016662597656, "learning_rate": 9.4560923989699e-05, "loss": 724.1703, "step": 8030 }, { "ce_loss_10": 3.566141927242279, "ce_loss_13": 3.5096715688705444, "ce_loss_2": 4.149633848667145, "ce_loss_3": 3.9718169212341308, "ce_loss_7": 3.6562745332717896, "epoch": 0.804, "grad_norm": 548.0, "kl_loss_10": 123.20092582702637, "kl_loss_2": 1395.099835205078, "kl_loss_3": 1034.7683349609374, "kl_loss_7": 333.7354141235352, "learning_rate": 9.363442793386607e-05, "loss": 730.3557, "step": 8040 }, { "ce_loss_10": 3.5398176670074464, "ce_loss_13": 3.483390522003174, "ce_loss_2": 4.144168162345887, "ce_loss_3": 3.963836133480072, "ce_loss_7": 3.629956769943237, "epoch": 0.805, "grad_norm": 708.0, "kl_loss_10": 123.99459381103516, "kl_loss_2": 1423.4671752929687, "kl_loss_3": 1066.5139556884765, "kl_loss_7": 341.58594970703126, "learning_rate": 9.271202397483213e-05, "loss": 716.4859, "step": 8050 }, { "ce_loss_10": 3.559576690196991, "ce_loss_13": 3.5078131318092347, "ce_loss_2": 4.1300270557403564, "ce_loss_3": 3.948986303806305, "ce_loss_7": 3.6449739813804625, "epoch": 0.806, "grad_norm": 616.0, "kl_loss_10": 120.9858283996582, "kl_loss_2": 1365.0379943847656, "kl_loss_3": 1009.7584777832031, "kl_loss_7": 326.2353118896484, "learning_rate": 9.179372140119524e-05, "loss": 720.3497, "step": 8060 }, { "ce_loss_10": 3.507103145122528, "ce_loss_13": 3.4538211464881896, "ce_loss_2": 4.078395879268646, "ce_loss_3": 3.9048713564872743, "ce_loss_7": 3.595319855213165, "epoch": 0.807, "grad_norm": 564.0, "kl_loss_10": 120.51027412414551, "kl_loss_2": 1373.4655517578126, "kl_loss_3": 1023.649853515625, "kl_loss_7": 328.5488845825195, "learning_rate": 9.087952946025175e-05, "loss": 723.5377, "step": 8070 }, { "ce_loss_10": 3.620259165763855, "ce_loss_13": 3.566833519935608, "ce_loss_2": 4.171628093719482, "ce_loss_3": 3.99437997341156, "ce_loss_7": 3.7069010019302366, "epoch": 0.808, "grad_norm": 564.0, "kl_loss_10": 121.02688407897949, "kl_loss_2": 1331.2975463867188, "kl_loss_3": 991.3489471435547, "kl_loss_7": 323.82676849365237, "learning_rate": 8.996945735790446e-05, "loss": 710.1383, "step": 8080 }, { "ce_loss_10": 3.5165679335594175, "ce_loss_13": 3.464854347705841, "ce_loss_2": 4.098834156990051, "ce_loss_3": 3.920664381980896, "ce_loss_7": 3.6038545966148376, "epoch": 0.809, "grad_norm": 692.0, "kl_loss_10": 120.93192329406739, "kl_loss_2": 1395.3546203613282, "kl_loss_3": 1041.6934967041016, "kl_loss_7": 328.9375244140625, "learning_rate": 8.906351425856951e-05, "loss": 724.6123, "step": 8090 }, { "ce_loss_10": 3.500584590435028, "ce_loss_13": 3.446643114089966, "ce_loss_2": 4.105693817138672, "ce_loss_3": 3.91541211605072, "ce_loss_7": 3.5908884406089783, "epoch": 0.81, "grad_norm": 528.0, "kl_loss_10": 122.57808074951171, "kl_loss_2": 1425.2539184570312, "kl_loss_3": 1056.913995361328, "kl_loss_7": 333.52864227294924, "learning_rate": 8.816170928508365e-05, "loss": 733.849, "step": 8100 }, { "ce_loss_10": 3.4640366792678834, "ce_loss_13": 3.408512365818024, "ce_loss_2": 4.075878214836121, "ce_loss_3": 3.885686254501343, "ce_loss_7": 3.555466377735138, "epoch": 0.811, "grad_norm": 488.0, "kl_loss_10": 122.87005310058593, "kl_loss_2": 1440.5128112792968, "kl_loss_3": 1072.4588012695312, "kl_loss_7": 337.0792922973633, "learning_rate": 8.7264051518613e-05, "loss": 731.1898, "step": 8110 }, { "ce_loss_10": 3.555766189098358, "ce_loss_13": 3.5061283111572266, "ce_loss_2": 4.124446415901184, "ce_loss_3": 3.9479679465293884, "ce_loss_7": 3.643243062496185, "epoch": 0.812, "grad_norm": 636.0, "kl_loss_10": 119.26363830566406, "kl_loss_2": 1364.3051879882812, "kl_loss_3": 1016.0982299804688, "kl_loss_7": 325.98375396728517, "learning_rate": 8.637054999856148e-05, "loss": 716.5947, "step": 8120 }, { "ce_loss_10": 3.5404892563819885, "ce_loss_13": 3.485300052165985, "ce_loss_2": 4.1336122989654545, "ce_loss_3": 3.949459767341614, "ce_loss_7": 3.632100021839142, "epoch": 0.813, "grad_norm": 732.0, "kl_loss_10": 122.73268966674804, "kl_loss_2": 1402.0976989746093, "kl_loss_3": 1043.5717376708985, "kl_loss_7": 334.2214889526367, "learning_rate": 8.548121372247918e-05, "loss": 731.7564, "step": 8130 }, { "ce_loss_10": 3.6156569719314575, "ce_loss_13": 3.5624505519866942, "ce_loss_2": 4.1817370533943174, "ce_loss_3": 4.006722009181976, "ce_loss_7": 3.7026910543441773, "epoch": 0.814, "grad_norm": 604.0, "kl_loss_10": 122.37059669494629, "kl_loss_2": 1370.1346496582032, "kl_loss_3": 1023.8347961425782, "kl_loss_7": 328.72179260253904, "learning_rate": 8.459605164597267e-05, "loss": 713.0094, "step": 8140 }, { "ce_loss_10": 3.4985602378845213, "ce_loss_13": 3.445442032814026, "ce_loss_2": 4.088584578037262, "ce_loss_3": 3.9099313855171203, "ce_loss_7": 3.5871086597442625, "epoch": 0.815, "grad_norm": 540.0, "kl_loss_10": 120.2810188293457, "kl_loss_2": 1388.320379638672, "kl_loss_3": 1032.2357604980468, "kl_loss_7": 326.0157501220703, "learning_rate": 8.371507268261436e-05, "loss": 722.8974, "step": 8150 }, { "ce_loss_10": 3.575037980079651, "ce_loss_13": 3.5237354397773744, "ce_loss_2": 4.15966123342514, "ce_loss_3": 3.9799288749694823, "ce_loss_7": 3.662469041347504, "epoch": 0.816, "grad_norm": 468.0, "kl_loss_10": 122.56170921325683, "kl_loss_2": 1387.7402465820312, "kl_loss_3": 1035.590023803711, "kl_loss_7": 331.77565307617186, "learning_rate": 8.283828570385238e-05, "loss": 702.9698, "step": 8160 }, { "ce_loss_10": 3.575086772441864, "ce_loss_13": 3.521743583679199, "ce_loss_2": 4.162600886821747, "ce_loss_3": 3.980288898944855, "ce_loss_7": 3.6681403875350953, "epoch": 0.817, "grad_norm": 536.0, "kl_loss_10": 121.89449920654297, "kl_loss_2": 1363.099432373047, "kl_loss_3": 1020.9603607177735, "kl_loss_7": 330.69257202148435, "learning_rate": 8.196569953892202e-05, "loss": 714.9846, "step": 8170 }, { "ce_loss_10": 3.48666273355484, "ce_loss_13": 3.4311928510665894, "ce_loss_2": 4.078561079502106, "ce_loss_3": 3.8965752482414246, "ce_loss_7": 3.574566900730133, "epoch": 0.818, "grad_norm": 564.0, "kl_loss_10": 121.38032493591308, "kl_loss_2": 1383.4040405273438, "kl_loss_3": 1031.3620330810547, "kl_loss_7": 329.9718292236328, "learning_rate": 8.109732297475635e-05, "loss": 716.3832, "step": 8180 }, { "ce_loss_10": 3.4624683260917664, "ce_loss_13": 3.4071059107780455, "ce_loss_2": 4.093664598464966, "ce_loss_3": 3.904293656349182, "ce_loss_7": 3.5597102522850035, "epoch": 0.819, "grad_norm": 612.0, "kl_loss_10": 124.19370193481446, "kl_loss_2": 1449.606427001953, "kl_loss_3": 1083.9475494384765, "kl_loss_7": 344.92893371582034, "learning_rate": 8.023316475589754e-05, "loss": 736.8709, "step": 8190 }, { "ce_loss_10": 3.426418936252594, "ce_loss_13": 3.369425415992737, "ce_loss_2": 4.0705150127410885, "ce_loss_3": 3.8759565591812133, "ce_loss_7": 3.523312973976135, "epoch": 0.82, "grad_norm": 960.0, "kl_loss_10": 127.30840263366699, "kl_loss_2": 1489.4338317871093, "kl_loss_3": 1101.4859008789062, "kl_loss_7": 350.6812271118164, "learning_rate": 7.937323358440934e-05, "loss": 747.0156, "step": 8200 }, { "ce_loss_10": 3.5525304675102234, "ce_loss_13": 3.501710522174835, "ce_loss_2": 4.118105101585388, "ce_loss_3": 3.940403604507446, "ce_loss_7": 3.634892928600311, "epoch": 0.821, "grad_norm": 592.0, "kl_loss_10": 120.62562103271485, "kl_loss_2": 1362.5559814453125, "kl_loss_3": 1013.3167175292969, "kl_loss_7": 324.7458526611328, "learning_rate": 7.851753811978923e-05, "loss": 715.6536, "step": 8210 }, { "ce_loss_10": 3.572048854827881, "ce_loss_13": 3.51797137260437, "ce_loss_2": 4.168267369270325, "ce_loss_3": 3.9798033118247984, "ce_loss_7": 3.6613059639930725, "epoch": 0.822, "grad_norm": 584.0, "kl_loss_10": 123.35920181274415, "kl_loss_2": 1404.1062316894531, "kl_loss_3": 1032.806460571289, "kl_loss_7": 332.36991729736326, "learning_rate": 7.766608697888095e-05, "loss": 716.8437, "step": 8220 }, { "ce_loss_10": 3.5873825192451476, "ce_loss_13": 3.531443381309509, "ce_loss_2": 4.165439331531525, "ce_loss_3": 3.9851741552352906, "ce_loss_7": 3.677255082130432, "epoch": 0.823, "grad_norm": 596.0, "kl_loss_10": 125.00053062438965, "kl_loss_2": 1401.6642272949218, "kl_loss_3": 1042.9925048828125, "kl_loss_7": 337.75463714599607, "learning_rate": 7.681888873578785e-05, "loss": 727.6588, "step": 8230 }, { "ce_loss_10": 3.521757662296295, "ce_loss_13": 3.4622273206710816, "ce_loss_2": 4.119533801078797, "ce_loss_3": 3.934826338291168, "ce_loss_7": 3.613308382034302, "epoch": 0.824, "grad_norm": 528.0, "kl_loss_10": 124.3594253540039, "kl_loss_2": 1415.3692321777344, "kl_loss_3": 1054.1624694824218, "kl_loss_7": 337.27447814941405, "learning_rate": 7.597595192178702e-05, "loss": 720.58, "step": 8240 }, { "ce_loss_10": 3.510056436061859, "ce_loss_13": 3.4549400925636293, "ce_loss_2": 4.117804288864136, "ce_loss_3": 3.931263828277588, "ce_loss_7": 3.603116035461426, "epoch": 0.825, "grad_norm": 732.0, "kl_loss_10": 125.15355644226074, "kl_loss_2": 1448.2377319335938, "kl_loss_3": 1071.90830078125, "kl_loss_7": 342.34349822998047, "learning_rate": 7.513728502524286e-05, "loss": 737.9312, "step": 8250 }, { "ce_loss_10": 3.5135778069496153, "ce_loss_13": 3.4605449557304384, "ce_loss_2": 4.092071676254273, "ce_loss_3": 3.911950242519379, "ce_loss_7": 3.6011463046073913, "epoch": 0.826, "grad_norm": 724.0, "kl_loss_10": 119.03209075927734, "kl_loss_2": 1362.1486572265626, "kl_loss_3": 1014.1013732910156, "kl_loss_7": 323.3859298706055, "learning_rate": 7.430289649152156e-05, "loss": 719.2754, "step": 8260 }, { "ce_loss_10": 3.4184691548347472, "ce_loss_13": 3.366116726398468, "ce_loss_2": 4.024687492847443, "ce_loss_3": 3.840518391132355, "ce_loss_7": 3.5109126925468446, "epoch": 0.827, "grad_norm": 620.0, "kl_loss_10": 121.1826301574707, "kl_loss_2": 1442.715380859375, "kl_loss_3": 1078.2603820800782, "kl_loss_7": 337.94980773925784, "learning_rate": 7.347279472290646e-05, "loss": 725.4499, "step": 8270 }, { "ce_loss_10": 3.553591012954712, "ce_loss_13": 3.5011276602745056, "ce_loss_2": 4.143419885635376, "ce_loss_3": 3.963691782951355, "ce_loss_7": 3.6430840849876405, "epoch": 0.828, "grad_norm": 632.0, "kl_loss_10": 122.11264228820801, "kl_loss_2": 1390.1591186523438, "kl_loss_3": 1031.9824768066405, "kl_loss_7": 331.70887298583983, "learning_rate": 7.264698807851328e-05, "loss": 723.2428, "step": 8280 }, { "ce_loss_10": 3.5191181659698487, "ce_loss_13": 3.469066548347473, "ce_loss_2": 4.081725561618805, "ce_loss_3": 3.910724198818207, "ce_loss_7": 3.6018188834190368, "epoch": 0.829, "grad_norm": 588.0, "kl_loss_10": 118.70943183898926, "kl_loss_2": 1345.8255798339844, "kl_loss_3": 1008.5260772705078, "kl_loss_7": 322.22408599853514, "learning_rate": 7.182548487420554e-05, "loss": 711.3388, "step": 8290 }, { "ce_loss_10": 3.5789464712142944, "ce_loss_13": 3.5243069410324095, "ce_loss_2": 4.157199239730835, "ce_loss_3": 3.980182957649231, "ce_loss_7": 3.6695273160934447, "epoch": 0.83, "grad_norm": 532.0, "kl_loss_10": 123.22674674987793, "kl_loss_2": 1386.9601989746093, "kl_loss_3": 1033.9500579833984, "kl_loss_7": 333.202214050293, "learning_rate": 7.100829338251146e-05, "loss": 715.895, "step": 8300 }, { "ce_loss_10": 3.509700119495392, "ce_loss_13": 3.454431247711182, "ce_loss_2": 4.1180780053138735, "ce_loss_3": 3.9303313374519346, "ce_loss_7": 3.601513075828552, "epoch": 0.831, "grad_norm": 620.0, "kl_loss_10": 123.90524368286133, "kl_loss_2": 1420.9776611328125, "kl_loss_3": 1058.4737640380858, "kl_loss_7": 338.4593246459961, "learning_rate": 7.019542183254046e-05, "loss": 720.623, "step": 8310 }, { "ce_loss_10": 3.5525806307792664, "ce_loss_13": 3.4959965467453005, "ce_loss_2": 4.128799915313721, "ce_loss_3": 3.9509485840797423, "ce_loss_7": 3.6412771344184875, "epoch": 0.832, "grad_norm": 716.0, "kl_loss_10": 127.12583503723144, "kl_loss_2": 1391.4539794921875, "kl_loss_3": 1033.12021484375, "kl_loss_7": 336.7935516357422, "learning_rate": 6.938687840989971e-05, "loss": 719.1849, "step": 8320 }, { "ce_loss_10": 3.4853139400482176, "ce_loss_13": 3.429826629161835, "ce_loss_2": 4.072561550140381, "ce_loss_3": 3.893218147754669, "ce_loss_7": 3.576352059841156, "epoch": 0.833, "grad_norm": 652.0, "kl_loss_10": 123.63401412963867, "kl_loss_2": 1387.767022705078, "kl_loss_3": 1034.514959716797, "kl_loss_7": 333.90601654052733, "learning_rate": 6.858267125661271e-05, "loss": 724.1394, "step": 8330 }, { "ce_loss_10": 3.5483754873275757, "ce_loss_13": 3.4943613171577455, "ce_loss_2": 4.1425862312316895, "ce_loss_3": 3.9642663478851317, "ce_loss_7": 3.637398338317871, "epoch": 0.834, "grad_norm": 688.0, "kl_loss_10": 120.76418533325196, "kl_loss_2": 1390.6704711914062, "kl_loss_3": 1037.854913330078, "kl_loss_7": 329.1157623291016, "learning_rate": 6.778280847103668e-05, "loss": 734.3215, "step": 8340 }, { "ce_loss_10": 3.5596023082733153, "ce_loss_13": 3.505242204666138, "ce_loss_2": 4.147267746925354, "ce_loss_3": 3.966709625720978, "ce_loss_7": 3.6490352749824524, "epoch": 0.835, "grad_norm": 616.0, "kl_loss_10": 124.8666488647461, "kl_loss_2": 1410.322442626953, "kl_loss_3": 1046.9622619628906, "kl_loss_7": 337.84252166748047, "learning_rate": 6.698729810778065e-05, "loss": 721.4561, "step": 8350 }, { "ce_loss_10": 3.4651718974113463, "ce_loss_13": 3.4114996194839478, "ce_loss_2": 4.066473186016083, "ce_loss_3": 3.881610298156738, "ce_loss_7": 3.5564165115356445, "epoch": 0.836, "grad_norm": 744.0, "kl_loss_10": 119.62793006896973, "kl_loss_2": 1401.6785705566406, "kl_loss_3": 1042.894009399414, "kl_loss_7": 327.8875930786133, "learning_rate": 6.619614817762538e-05, "loss": 723.3062, "step": 8360 }, { "ce_loss_10": 3.4274404406547547, "ce_loss_13": 3.374155807495117, "ce_loss_2": 4.061884236335755, "ce_loss_3": 3.8700376033782957, "ce_loss_7": 3.5248605847358703, "epoch": 0.837, "grad_norm": 476.0, "kl_loss_10": 121.19938850402832, "kl_loss_2": 1469.314794921875, "kl_loss_3": 1094.0616088867187, "kl_loss_7": 343.3897903442383, "learning_rate": 6.540936664744196e-05, "loss": 736.0315, "step": 8370 }, { "ce_loss_10": 3.5783516645431517, "ce_loss_13": 3.5220268607139587, "ce_loss_2": 4.174437272548675, "ce_loss_3": 3.990994656085968, "ce_loss_7": 3.667269802093506, "epoch": 0.838, "grad_norm": 414.0, "kl_loss_10": 122.77630615234375, "kl_loss_2": 1392.2082885742188, "kl_loss_3": 1041.2133331298828, "kl_loss_7": 332.893977355957, "learning_rate": 6.462696144011149e-05, "loss": 716.4901, "step": 8380 }, { "ce_loss_10": 3.5278226017951964, "ce_loss_13": 3.4739001631736754, "ce_loss_2": 4.114171576499939, "ce_loss_3": 3.937883758544922, "ce_loss_7": 3.62037034034729, "epoch": 0.839, "grad_norm": 506.0, "kl_loss_10": 125.26272201538086, "kl_loss_2": 1392.125311279297, "kl_loss_3": 1040.5689788818358, "kl_loss_7": 336.6851501464844, "learning_rate": 6.384894043444567e-05, "loss": 715.348, "step": 8390 }, { "ce_loss_10": 3.5627179741859436, "ce_loss_13": 3.510275673866272, "ce_loss_2": 4.151006007194519, "ce_loss_3": 3.9754858732223513, "ce_loss_7": 3.650054228305817, "epoch": 0.84, "grad_norm": 840.0, "kl_loss_10": 122.64037246704102, "kl_loss_2": 1398.1174865722655, "kl_loss_3": 1038.6087371826172, "kl_loss_7": 331.8931655883789, "learning_rate": 6.307531146510753e-05, "loss": 716.8575, "step": 8400 }, { "ce_loss_10": 3.5370197057724, "ce_loss_13": 3.481518292427063, "ce_loss_2": 4.113574099540711, "ce_loss_3": 3.934196209907532, "ce_loss_7": 3.6256550788879394, "epoch": 0.841, "grad_norm": 564.0, "kl_loss_10": 122.6060775756836, "kl_loss_2": 1369.6577880859375, "kl_loss_3": 1022.7808959960937, "kl_loss_7": 330.5647766113281, "learning_rate": 6.230608232253226e-05, "loss": 710.1536, "step": 8410 }, { "ce_loss_10": 3.489914321899414, "ce_loss_13": 3.4370184540748596, "ce_loss_2": 4.1010636448860165, "ce_loss_3": 3.9170850038528444, "ce_loss_7": 3.5802398562431335, "epoch": 0.842, "grad_norm": 544.0, "kl_loss_10": 121.60591506958008, "kl_loss_2": 1432.9647521972656, "kl_loss_3": 1068.4068176269532, "kl_loss_7": 334.95867614746095, "learning_rate": 6.154126075284855e-05, "loss": 721.6619, "step": 8420 }, { "ce_loss_10": 3.583195209503174, "ce_loss_13": 3.530484902858734, "ce_loss_2": 4.156937325000763, "ce_loss_3": 3.9834988713264465, "ce_loss_7": 3.6713316440582275, "epoch": 0.843, "grad_norm": 548.0, "kl_loss_10": 119.82128868103027, "kl_loss_2": 1350.0745971679687, "kl_loss_3": 1013.26455078125, "kl_loss_7": 326.27203674316405, "learning_rate": 6.078085445780129e-05, "loss": 702.2121, "step": 8430 }, { "ce_loss_10": 3.5919320344924928, "ce_loss_13": 3.5380735635757445, "ce_loss_2": 4.185958683490753, "ce_loss_3": 4.000372779369354, "ce_loss_7": 3.680304741859436, "epoch": 0.844, "grad_norm": 624.0, "kl_loss_10": 122.76680603027344, "kl_loss_2": 1400.8627502441407, "kl_loss_3": 1034.3849639892578, "kl_loss_7": 332.1929397583008, "learning_rate": 6.002487109467347e-05, "loss": 712.4612, "step": 8440 }, { "ce_loss_10": 3.598021149635315, "ce_loss_13": 3.544245195388794, "ce_loss_2": 4.1716133713722225, "ce_loss_3": 3.996164381504059, "ce_loss_7": 3.684845209121704, "epoch": 0.845, "grad_norm": 624.0, "kl_loss_10": 122.97368698120117, "kl_loss_2": 1382.0722961425781, "kl_loss_3": 1033.296566772461, "kl_loss_7": 335.2268692016602, "learning_rate": 5.927331827620902e-05, "loss": 714.5446, "step": 8450 }, { "ce_loss_10": 3.583691966533661, "ce_loss_13": 3.532571244239807, "ce_loss_2": 4.150978291034699, "ce_loss_3": 3.980244314670563, "ce_loss_7": 3.668419122695923, "epoch": 0.846, "grad_norm": 442.0, "kl_loss_10": 119.68389167785645, "kl_loss_2": 1349.830810546875, "kl_loss_3": 1009.5769073486329, "kl_loss_7": 326.7773956298828, "learning_rate": 5.852620357053651e-05, "loss": 711.2381, "step": 8460 }, { "ce_loss_10": 3.621056246757507, "ce_loss_13": 3.566351020336151, "ce_loss_2": 4.186827552318573, "ce_loss_3": 4.011941504478455, "ce_loss_7": 3.707981014251709, "epoch": 0.847, "grad_norm": 604.0, "kl_loss_10": 120.51869049072266, "kl_loss_2": 1355.9027709960938, "kl_loss_3": 1009.8785064697265, "kl_loss_7": 326.8977752685547, "learning_rate": 5.778353450109286e-05, "loss": 709.6583, "step": 8470 }, { "ce_loss_10": 3.6638654470443726, "ce_loss_13": 3.6074997663497923, "ce_loss_2": 4.246403753757477, "ce_loss_3": 4.066155457496643, "ce_loss_7": 3.7529626727104186, "epoch": 0.848, "grad_norm": 496.0, "kl_loss_10": 124.99574241638183, "kl_loss_2": 1393.828564453125, "kl_loss_3": 1038.2093139648437, "kl_loss_7": 335.6842575073242, "learning_rate": 5.7045318546547206e-05, "loss": 718.2406, "step": 8480 }, { "ce_loss_10": 3.5572060704231263, "ce_loss_13": 3.502591860294342, "ce_loss_2": 4.141818737983703, "ce_loss_3": 3.964042770862579, "ce_loss_7": 3.6434488534927367, "epoch": 0.849, "grad_norm": 720.0, "kl_loss_10": 122.56028099060059, "kl_loss_2": 1399.0360961914062, "kl_loss_3": 1046.2178619384765, "kl_loss_7": 331.661442565918, "learning_rate": 5.631156314072605e-05, "loss": 715.0076, "step": 8490 }, { "ce_loss_10": 3.56727614402771, "ce_loss_13": 3.5156813502311706, "ce_loss_2": 4.136505460739135, "ce_loss_3": 3.9590030670166017, "ce_loss_7": 3.655721139907837, "epoch": 0.85, "grad_norm": 552.0, "kl_loss_10": 120.79662399291992, "kl_loss_2": 1366.066455078125, "kl_loss_3": 1009.6999877929687, "kl_loss_7": 325.6238540649414, "learning_rate": 5.5582275672538315e-05, "loss": 705.6439, "step": 8500 }, { "ce_loss_10": 3.489551877975464, "ce_loss_13": 3.4322364211082457, "ce_loss_2": 4.108343851566315, "ce_loss_3": 3.922130513191223, "ce_loss_7": 3.582332742214203, "epoch": 0.851, "grad_norm": 572.0, "kl_loss_10": 125.23014411926269, "kl_loss_2": 1452.216046142578, "kl_loss_3": 1083.0195220947267, "kl_loss_7": 340.2689270019531, "learning_rate": 5.4857463485900484e-05, "loss": 735.124, "step": 8510 }, { "ce_loss_10": 3.5416255474090574, "ce_loss_13": 3.4896072506904603, "ce_loss_2": 4.120888018608094, "ce_loss_3": 3.9421190857887267, "ce_loss_7": 3.631319212913513, "epoch": 0.852, "grad_norm": 492.0, "kl_loss_10": 121.33798141479492, "kl_loss_2": 1376.6540832519531, "kl_loss_3": 1028.2473266601562, "kl_loss_7": 331.88528594970705, "learning_rate": 5.413713387966329e-05, "loss": 712.989, "step": 8520 }, { "ce_loss_10": 3.56683589220047, "ce_loss_13": 3.511161148548126, "ce_loss_2": 4.154978930950165, "ce_loss_3": 3.97281414270401, "ce_loss_7": 3.656111001968384, "epoch": 0.853, "grad_norm": 908.0, "kl_loss_10": 124.57011680603027, "kl_loss_2": 1397.046124267578, "kl_loss_3": 1044.6578674316406, "kl_loss_7": 333.34014892578125, "learning_rate": 5.34212941075381e-05, "loss": 723.4795, "step": 8530 }, { "ce_loss_10": 3.571316683292389, "ce_loss_13": 3.518352448940277, "ce_loss_2": 4.139795422554016, "ce_loss_3": 3.96577330827713, "ce_loss_7": 3.6558568358421324, "epoch": 0.854, "grad_norm": 704.0, "kl_loss_10": 119.71459197998047, "kl_loss_2": 1355.7856689453124, "kl_loss_3": 1011.7955169677734, "kl_loss_7": 321.64742279052734, "learning_rate": 5.270995137802315e-05, "loss": 706.1101, "step": 8540 }, { "ce_loss_10": 3.502516198158264, "ce_loss_13": 3.4539687633514404, "ce_loss_2": 4.084729707241058, "ce_loss_3": 3.9072174072265624, "ce_loss_7": 3.5934065103530886, "epoch": 0.855, "grad_norm": 532.0, "kl_loss_10": 119.76500358581544, "kl_loss_2": 1392.7714599609376, "kl_loss_3": 1033.4823822021485, "kl_loss_7": 329.61717224121094, "learning_rate": 5.2003112854332125e-05, "loss": 718.4223, "step": 8550 }, { "ce_loss_10": 3.5103381156921385, "ce_loss_13": 3.457986581325531, "ce_loss_2": 4.087642765045166, "ce_loss_3": 3.9084141731262205, "ce_loss_7": 3.5956949472427366, "epoch": 0.856, "grad_norm": 724.0, "kl_loss_10": 120.56714744567871, "kl_loss_2": 1380.4932006835938, "kl_loss_3": 1032.8779724121093, "kl_loss_7": 327.5092468261719, "learning_rate": 5.130078565432089e-05, "loss": 704.4872, "step": 8560 }, { "ce_loss_10": 3.5799346208572387, "ce_loss_13": 3.5277039766311646, "ce_loss_2": 4.142898440361023, "ce_loss_3": 3.9674919128417967, "ce_loss_7": 3.662895882129669, "epoch": 0.857, "grad_norm": 492.0, "kl_loss_10": 120.35042152404785, "kl_loss_2": 1359.3675537109375, "kl_loss_3": 1015.0716613769531, "kl_loss_7": 324.0651107788086, "learning_rate": 5.060297685041659e-05, "loss": 701.0533, "step": 8570 }, { "ce_loss_10": 3.5102761030197143, "ce_loss_13": 3.4554622650146483, "ce_loss_2": 4.102907431125641, "ce_loss_3": 3.920087468624115, "ce_loss_7": 3.5963584423065185, "epoch": 0.858, "grad_norm": 560.0, "kl_loss_10": 123.81989250183105, "kl_loss_2": 1410.788916015625, "kl_loss_3": 1048.8169738769532, "kl_loss_7": 335.03162536621096, "learning_rate": 4.99096934695461e-05, "loss": 732.0039, "step": 8580 }, { "ce_loss_10": 3.571087384223938, "ce_loss_13": 3.5143581509590147, "ce_loss_2": 4.155477786064148, "ce_loss_3": 3.9760597348213196, "ce_loss_7": 3.659903717041016, "epoch": 0.859, "grad_norm": 524.0, "kl_loss_10": 122.09205551147461, "kl_loss_2": 1377.2748107910156, "kl_loss_3": 1023.4202880859375, "kl_loss_7": 329.93336029052733, "learning_rate": 4.922094249306558e-05, "loss": 708.5735, "step": 8590 }, { "ce_loss_10": 3.5948333024978636, "ce_loss_13": 3.542141377925873, "ce_loss_2": 4.168703019618988, "ce_loss_3": 3.9915787816047668, "ce_loss_7": 3.6865815997123716, "epoch": 0.86, "grad_norm": 512.0, "kl_loss_10": 122.46472663879395, "kl_loss_2": 1371.870458984375, "kl_loss_3": 1024.2403106689453, "kl_loss_7": 333.7798217773437, "learning_rate": 4.853673085668947e-05, "loss": 703.5446, "step": 8600 }, { "ce_loss_10": 3.621105194091797, "ce_loss_13": 3.5630958437919618, "ce_loss_2": 4.201367115974426, "ce_loss_3": 4.027955961227417, "ce_loss_7": 3.7082759618759153, "epoch": 0.861, "grad_norm": 560.0, "kl_loss_10": 123.08092155456544, "kl_loss_2": 1377.9781494140625, "kl_loss_3": 1025.2794342041016, "kl_loss_7": 328.5953765869141, "learning_rate": 4.78570654504214e-05, "loss": 717.6915, "step": 8610 }, { "ce_loss_10": 3.5619210958480836, "ce_loss_13": 3.507647895812988, "ce_loss_2": 4.149072754383087, "ce_loss_3": 3.970789170265198, "ce_loss_7": 3.648519480228424, "epoch": 0.862, "grad_norm": 516.0, "kl_loss_10": 120.92977561950684, "kl_loss_2": 1399.8869262695312, "kl_loss_3": 1044.365057373047, "kl_loss_7": 330.3472198486328, "learning_rate": 4.7181953118484556e-05, "loss": 723.0992, "step": 8620 }, { "ce_loss_10": 3.5900183796882628, "ce_loss_13": 3.5374162673950194, "ce_loss_2": 4.16554582118988, "ce_loss_3": 3.994710421562195, "ce_loss_7": 3.678675186634064, "epoch": 0.863, "grad_norm": 520.0, "kl_loss_10": 120.54650344848633, "kl_loss_2": 1357.929443359375, "kl_loss_3": 1018.2571746826172, "kl_loss_7": 328.08815002441406, "learning_rate": 4.651140065925269e-05, "loss": 721.2698, "step": 8630 }, { "ce_loss_10": 3.5209784507751465, "ce_loss_13": 3.4652896523475647, "ce_loss_2": 4.103803014755249, "ce_loss_3": 3.9223596453666687, "ce_loss_7": 3.611543357372284, "epoch": 0.864, "grad_norm": 604.0, "kl_loss_10": 123.07759132385254, "kl_loss_2": 1391.4273742675782, "kl_loss_3": 1032.6504333496093, "kl_loss_7": 332.56396484375, "learning_rate": 4.58454148251814e-05, "loss": 725.9105, "step": 8640 }, { "ce_loss_10": 3.5388185143470765, "ce_loss_13": 3.4819789290428163, "ce_loss_2": 4.147819340229034, "ce_loss_3": 3.9573601484298706, "ce_loss_7": 3.6289569973945617, "epoch": 0.865, "grad_norm": 600.0, "kl_loss_10": 122.27800941467285, "kl_loss_2": 1421.122607421875, "kl_loss_3": 1049.9566833496094, "kl_loss_7": 332.15319976806643, "learning_rate": 4.518400232274078e-05, "loss": 721.911, "step": 8650 }, { "ce_loss_10": 3.552417826652527, "ce_loss_13": 3.4967941880226134, "ce_loss_2": 4.138116705417633, "ce_loss_3": 3.959863018989563, "ce_loss_7": 3.642176163196564, "epoch": 0.866, "grad_norm": 536.0, "kl_loss_10": 123.07808303833008, "kl_loss_2": 1379.3301574707032, "kl_loss_3": 1030.6435760498048, "kl_loss_7": 333.7181594848633, "learning_rate": 4.452716981234745e-05, "loss": 702.8377, "step": 8660 }, { "ce_loss_10": 3.5320884346961976, "ce_loss_13": 3.480946350097656, "ce_loss_2": 4.1082984685897825, "ce_loss_3": 3.930745470523834, "ce_loss_7": 3.6193148374557493, "epoch": 0.867, "grad_norm": 486.0, "kl_loss_10": 119.37153434753418, "kl_loss_2": 1378.435809326172, "kl_loss_3": 1026.5518157958984, "kl_loss_7": 327.11744079589846, "learning_rate": 4.3874923908297335e-05, "loss": 706.7176, "step": 8670 }, { "ce_loss_10": 3.5820580959320067, "ce_loss_13": 3.5283179759979246, "ce_loss_2": 4.165662562847137, "ce_loss_3": 3.9848700284957888, "ce_loss_7": 3.668103575706482, "epoch": 0.868, "grad_norm": 576.0, "kl_loss_10": 123.53395080566406, "kl_loss_2": 1397.873388671875, "kl_loss_3": 1040.8485137939454, "kl_loss_7": 332.79543151855466, "learning_rate": 4.322727117869951e-05, "loss": 718.6055, "step": 8680 }, { "ce_loss_10": 3.5884267687797546, "ce_loss_13": 3.5351755380630494, "ce_loss_2": 4.178534460067749, "ce_loss_3": 3.9966900825500487, "ce_loss_7": 3.6785280823707582, "epoch": 0.869, "grad_norm": 628.0, "kl_loss_10": 123.76097946166992, "kl_loss_2": 1403.0090087890626, "kl_loss_3": 1049.369467163086, "kl_loss_7": 333.3337600708008, "learning_rate": 4.2584218145409916e-05, "loss": 715.6396, "step": 8690 }, { "ce_loss_10": 3.6362990498542787, "ce_loss_13": 3.5836689591407778, "ce_loss_2": 4.204011178016662, "ce_loss_3": 4.026763451099396, "ce_loss_7": 3.723234498500824, "epoch": 0.87, "grad_norm": 640.0, "kl_loss_10": 121.82669830322266, "kl_loss_2": 1356.0820190429688, "kl_loss_3": 1011.6945983886719, "kl_loss_7": 326.52610778808594, "learning_rate": 4.194577128396521e-05, "loss": 701.9309, "step": 8700 }, { "ce_loss_10": 3.509493625164032, "ce_loss_13": 3.45839284658432, "ce_loss_2": 4.095285260677338, "ce_loss_3": 3.9108598709106444, "ce_loss_7": 3.5953684210777284, "epoch": 0.871, "grad_norm": 466.0, "kl_loss_10": 119.7718490600586, "kl_loss_2": 1389.6183959960938, "kl_loss_3": 1027.7238983154298, "kl_loss_7": 325.0868804931641, "learning_rate": 4.1311937023518264e-05, "loss": 718.8018, "step": 8710 }, { "ce_loss_10": 3.529244434833527, "ce_loss_13": 3.4789538025856017, "ce_loss_2": 4.10749055147171, "ce_loss_3": 3.9206608176231383, "ce_loss_7": 3.613613450527191, "epoch": 0.872, "grad_norm": 552.0, "kl_loss_10": 118.38729095458984, "kl_loss_2": 1381.1460327148438, "kl_loss_3": 1020.1363098144532, "kl_loss_7": 319.3013519287109, "learning_rate": 4.0682721746773344e-05, "loss": 710.0534, "step": 8720 }, { "ce_loss_10": 3.3995256781578065, "ce_loss_13": 3.3480949640274047, "ce_loss_2": 4.007587265968323, "ce_loss_3": 3.8226790904998778, "ce_loss_7": 3.4936659812927244, "epoch": 0.873, "grad_norm": 680.0, "kl_loss_10": 119.34899826049805, "kl_loss_2": 1401.68115234375, "kl_loss_3": 1045.86484375, "kl_loss_7": 331.2798797607422, "learning_rate": 4.0058131789920904e-05, "loss": 709.8896, "step": 8730 }, { "ce_loss_10": 3.5508158564567567, "ce_loss_13": 3.4964456081390383, "ce_loss_2": 4.1352542519569395, "ce_loss_3": 3.9534544229507445, "ce_loss_7": 3.636772119998932, "epoch": 0.874, "grad_norm": 584.0, "kl_loss_10": 120.94258270263671, "kl_loss_2": 1390.5471740722655, "kl_loss_3": 1033.9399536132812, "kl_loss_7": 328.365657043457, "learning_rate": 3.9438173442575e-05, "loss": 736.2612, "step": 8740 }, { "ce_loss_10": 3.5804495930671694, "ce_loss_13": 3.5271941781044007, "ce_loss_2": 4.154019808769226, "ce_loss_3": 3.978409993648529, "ce_loss_7": 3.6670626044273376, "epoch": 0.875, "grad_norm": 660.0, "kl_loss_10": 121.4291778564453, "kl_loss_2": 1363.337957763672, "kl_loss_3": 1017.4170349121093, "kl_loss_7": 330.21262817382814, "learning_rate": 3.882285294770937e-05, "loss": 711.9646, "step": 8750 }, { "ce_loss_10": 3.548740530014038, "ce_loss_13": 3.4961506009101866, "ce_loss_2": 4.1233531594276425, "ce_loss_3": 3.9456629276275637, "ce_loss_7": 3.635119545459747, "epoch": 0.876, "grad_norm": 720.0, "kl_loss_10": 122.49603576660157, "kl_loss_2": 1370.9503845214845, "kl_loss_3": 1018.0048278808594, "kl_loss_7": 328.40975494384764, "learning_rate": 3.821217650159453e-05, "loss": 718.8667, "step": 8760 }, { "ce_loss_10": 3.418784809112549, "ce_loss_13": 3.365262305736542, "ce_loss_2": 4.043610215187073, "ce_loss_3": 3.850952887535095, "ce_loss_7": 3.512869107723236, "epoch": 0.877, "grad_norm": 864.0, "kl_loss_10": 121.31179428100586, "kl_loss_2": 1448.362078857422, "kl_loss_3": 1071.3075866699219, "kl_loss_7": 338.1376647949219, "learning_rate": 3.760615025373543e-05, "loss": 729.3138, "step": 8770 }, { "ce_loss_10": 3.600062382221222, "ce_loss_13": 3.5458203673362734, "ce_loss_2": 4.1950247406959535, "ce_loss_3": 4.014675962924957, "ce_loss_7": 3.6932862639427184, "epoch": 0.878, "grad_norm": 684.0, "kl_loss_10": 125.84054679870606, "kl_loss_2": 1408.6423461914062, "kl_loss_3": 1048.4514282226562, "kl_loss_7": 337.6252685546875, "learning_rate": 3.700478030680987e-05, "loss": 731.0535, "step": 8780 }, { "ce_loss_10": 3.588497185707092, "ce_loss_13": 3.5357969880104063, "ce_loss_2": 4.16150141954422, "ce_loss_3": 3.9861255884170532, "ce_loss_7": 3.6764004111289976, "epoch": 0.879, "grad_norm": 616.0, "kl_loss_10": 121.01314735412598, "kl_loss_2": 1369.610662841797, "kl_loss_3": 1017.0912567138672, "kl_loss_7": 326.87464447021483, "learning_rate": 3.6408072716606344e-05, "loss": 710.2985, "step": 8790 }, { "ce_loss_10": 3.511665999889374, "ce_loss_13": 3.455206000804901, "ce_loss_2": 4.111484944820404, "ce_loss_3": 3.9289029002189637, "ce_loss_7": 3.6025957107543944, "epoch": 0.88, "grad_norm": 780.0, "kl_loss_10": 123.63747024536133, "kl_loss_2": 1420.9652282714844, "kl_loss_3": 1057.6597961425782, "kl_loss_7": 336.1513076782227, "learning_rate": 3.5816033491963716e-05, "loss": 739.0035, "step": 8800 }, { "ce_loss_10": 3.371099352836609, "ce_loss_13": 3.316172993183136, "ce_loss_2": 3.9832777261734007, "ce_loss_3": 3.789150130748749, "ce_loss_7": 3.461447310447693, "epoch": 0.881, "grad_norm": 512.0, "kl_loss_10": 119.77749671936036, "kl_loss_2": 1419.2442565917968, "kl_loss_3": 1050.3526092529296, "kl_loss_7": 326.61090087890625, "learning_rate": 3.522866859471047e-05, "loss": 723.13, "step": 8810 }, { "ce_loss_10": 3.611319732666016, "ce_loss_13": 3.561573588848114, "ce_loss_2": 4.164506709575653, "ce_loss_3": 3.9919765472412108, "ce_loss_7": 3.6927991986274717, "epoch": 0.882, "grad_norm": 668.0, "kl_loss_10": 117.92159461975098, "kl_loss_2": 1322.9934326171874, "kl_loss_3": 989.4436096191406, "kl_loss_7": 318.43848724365233, "learning_rate": 3.46459839396045e-05, "loss": 702.0629, "step": 8820 }, { "ce_loss_10": 3.538902699947357, "ce_loss_13": 3.4826046943664553, "ce_loss_2": 4.136587750911713, "ce_loss_3": 3.952931213378906, "ce_loss_7": 3.6263251304626465, "epoch": 0.883, "grad_norm": 692.0, "kl_loss_10": 123.36521034240722, "kl_loss_2": 1392.0963928222657, "kl_loss_3": 1039.635934448242, "kl_loss_7": 335.2001617431641, "learning_rate": 3.406798539427386e-05, "loss": 735.3253, "step": 8830 }, { "ce_loss_10": 3.5910847425460815, "ce_loss_13": 3.537630581855774, "ce_loss_2": 4.170853447914124, "ce_loss_3": 3.988684332370758, "ce_loss_7": 3.6767850637435915, "epoch": 0.884, "grad_norm": 676.0, "kl_loss_10": 121.19235725402832, "kl_loss_2": 1382.8566284179688, "kl_loss_3": 1032.8232696533203, "kl_loss_7": 330.4065704345703, "learning_rate": 3.349467877915746e-05, "loss": 719.4985, "step": 8840 }, { "ce_loss_10": 3.549594223499298, "ce_loss_13": 3.4964571714401247, "ce_loss_2": 4.147552752494812, "ce_loss_3": 3.9666833519935607, "ce_loss_7": 3.6410465359687807, "epoch": 0.885, "grad_norm": 756.0, "kl_loss_10": 121.92766189575195, "kl_loss_2": 1412.680615234375, "kl_loss_3": 1051.8612976074219, "kl_loss_7": 334.56292724609375, "learning_rate": 3.292606986744667e-05, "loss": 738.1569, "step": 8850 }, { "ce_loss_10": 3.5065809965133665, "ce_loss_13": 3.4557624578475954, "ce_loss_2": 4.09570015668869, "ce_loss_3": 3.9213839888572695, "ce_loss_7": 3.5942333817481993, "epoch": 0.886, "grad_norm": 512.0, "kl_loss_10": 120.2051342010498, "kl_loss_2": 1395.1206481933593, "kl_loss_3": 1046.7202606201172, "kl_loss_7": 328.31476440429685, "learning_rate": 3.23621643850267e-05, "loss": 722.0618, "step": 8860 }, { "ce_loss_10": 3.5800418615341187, "ce_loss_13": 3.528806471824646, "ce_loss_2": 4.1604786038398744, "ce_loss_3": 3.9754431366920473, "ce_loss_7": 3.6690159320831297, "epoch": 0.887, "grad_norm": 668.0, "kl_loss_10": 122.70932960510254, "kl_loss_2": 1398.141259765625, "kl_loss_3": 1041.2169250488282, "kl_loss_7": 333.5382675170898, "learning_rate": 3.180296801041971e-05, "loss": 714.5479, "step": 8870 }, { "ce_loss_10": 3.605249297618866, "ce_loss_13": 3.552952218055725, "ce_loss_2": 4.17888309955597, "ce_loss_3": 4.001760041713714, "ce_loss_7": 3.6925482630729674, "epoch": 0.888, "grad_norm": 640.0, "kl_loss_10": 121.45921478271484, "kl_loss_2": 1367.8746765136718, "kl_loss_3": 1020.4184844970703, "kl_loss_7": 326.21177368164064, "learning_rate": 3.124848637472688e-05, "loss": 704.8254, "step": 8880 }, { "ce_loss_10": 3.425955653190613, "ce_loss_13": 3.372738444805145, "ce_loss_2": 4.030108571052551, "ce_loss_3": 3.8461776614189147, "ce_loss_7": 3.5158618688583374, "epoch": 0.889, "grad_norm": 836.0, "kl_loss_10": 119.03256759643554, "kl_loss_2": 1409.894512939453, "kl_loss_3": 1046.528024291992, "kl_loss_7": 325.74627990722655, "learning_rate": 3.069872506157212e-05, "loss": 719.6042, "step": 8890 }, { "ce_loss_10": 3.5282222867012023, "ce_loss_13": 3.475008749961853, "ce_loss_2": 4.108449482917786, "ce_loss_3": 3.9331199288368226, "ce_loss_7": 3.617098903656006, "epoch": 0.89, "grad_norm": 628.0, "kl_loss_10": 120.31341552734375, "kl_loss_2": 1379.0091857910156, "kl_loss_3": 1029.306707763672, "kl_loss_7": 330.0529190063477, "learning_rate": 3.0153689607045842e-05, "loss": 710.0303, "step": 8900 }, { "ce_loss_10": 3.4255795001983644, "ce_loss_13": 3.371835172176361, "ce_loss_2": 4.053711616992951, "ce_loss_3": 3.8632387042045595, "ce_loss_7": 3.523522126674652, "epoch": 0.891, "grad_norm": 580.0, "kl_loss_10": 124.12492294311524, "kl_loss_2": 1471.6921020507812, "kl_loss_3": 1089.004214477539, "kl_loss_7": 339.9063247680664, "learning_rate": 2.9613385499648926e-05, "loss": 724.8125, "step": 8910 }, { "ce_loss_10": 3.47619286775589, "ce_loss_13": 3.424519944190979, "ce_loss_2": 4.070773077011109, "ce_loss_3": 3.896732985973358, "ce_loss_7": 3.571604323387146, "epoch": 0.892, "grad_norm": 532.0, "kl_loss_10": 120.07063331604004, "kl_loss_2": 1390.2006958007812, "kl_loss_3": 1039.9530181884766, "kl_loss_7": 328.3143585205078, "learning_rate": 2.9077818180237692e-05, "loss": 718.9485, "step": 8920 }, { "ce_loss_10": 3.5255975484848023, "ce_loss_13": 3.4710716724395754, "ce_loss_2": 4.129261720180511, "ce_loss_3": 3.9440025210380556, "ce_loss_7": 3.6209316849708557, "epoch": 0.893, "grad_norm": 796.0, "kl_loss_10": 121.16016159057617, "kl_loss_2": 1394.0591796875, "kl_loss_3": 1037.3413208007812, "kl_loss_7": 329.095556640625, "learning_rate": 2.8546993041969172e-05, "loss": 716.5755, "step": 8930 }, { "ce_loss_10": 3.5639485001564024, "ce_loss_13": 3.5113102793693542, "ce_loss_2": 4.132089781761169, "ce_loss_3": 3.958901858329773, "ce_loss_7": 3.651428020000458, "epoch": 0.894, "grad_norm": 584.0, "kl_loss_10": 119.52723693847656, "kl_loss_2": 1364.812921142578, "kl_loss_3": 1022.4147399902344, "kl_loss_7": 326.37430267333986, "learning_rate": 2.802091543024671e-05, "loss": 717.1699, "step": 8940 }, { "ce_loss_10": 3.557297170162201, "ce_loss_13": 3.5066465854644777, "ce_loss_2": 4.156519591808319, "ce_loss_3": 3.973790967464447, "ce_loss_7": 3.6468861937522887, "epoch": 0.895, "grad_norm": 584.0, "kl_loss_10": 122.12691040039063, "kl_loss_2": 1419.7261474609375, "kl_loss_3": 1054.6295593261718, "kl_loss_7": 334.19554290771487, "learning_rate": 2.7499590642665774e-05, "loss": 738.7192, "step": 8950 }, { "ce_loss_10": 3.573615825176239, "ce_loss_13": 3.519201862812042, "ce_loss_2": 4.15412825345993, "ce_loss_3": 3.969862473011017, "ce_loss_7": 3.6663325428962708, "epoch": 0.896, "grad_norm": 494.0, "kl_loss_10": 123.42191467285156, "kl_loss_2": 1381.5768737792969, "kl_loss_3": 1020.5846405029297, "kl_loss_7": 340.43305053710935, "learning_rate": 2.6983023928961405e-05, "loss": 711.4102, "step": 8960 }, { "ce_loss_10": 3.5397876501083374, "ce_loss_13": 3.488338351249695, "ce_loss_2": 4.125940537452697, "ce_loss_3": 3.9469223976135255, "ce_loss_7": 3.6297765612602233, "epoch": 0.897, "grad_norm": 728.0, "kl_loss_10": 122.37606697082519, "kl_loss_2": 1384.7419189453126, "kl_loss_3": 1034.7682739257812, "kl_loss_7": 330.70338439941406, "learning_rate": 2.6471220490954628e-05, "loss": 723.2938, "step": 8970 }, { "ce_loss_10": 3.5269408941268923, "ce_loss_13": 3.4752776265144347, "ce_loss_2": 4.100160777568817, "ce_loss_3": 3.9212745785713197, "ce_loss_7": 3.610729730129242, "epoch": 0.898, "grad_norm": 628.0, "kl_loss_10": 119.57647438049317, "kl_loss_2": 1371.9164184570313, "kl_loss_3": 1019.5086395263672, "kl_loss_7": 324.32104339599607, "learning_rate": 2.596418548250029e-05, "loss": 718.0098, "step": 8980 }, { "ce_loss_10": 3.571890485286713, "ce_loss_13": 3.515963816642761, "ce_loss_2": 4.1442801594734195, "ce_loss_3": 3.9694687128067017, "ce_loss_7": 3.659953308105469, "epoch": 0.899, "grad_norm": 580.0, "kl_loss_10": 123.79312667846679, "kl_loss_2": 1387.2481201171875, "kl_loss_3": 1035.6228240966798, "kl_loss_7": 332.8598129272461, "learning_rate": 2.5461924009435368e-05, "loss": 711.2326, "step": 8990 }, { "ce_loss_10": 3.565682017803192, "ce_loss_13": 3.5124054074287416, "ce_loss_2": 4.145227205753327, "ce_loss_3": 3.971540629863739, "ce_loss_7": 3.652699661254883, "epoch": 0.9, "grad_norm": 700.0, "kl_loss_10": 122.56795883178711, "kl_loss_2": 1366.3777648925782, "kl_loss_3": 1030.5671783447265, "kl_loss_7": 328.8155014038086, "learning_rate": 2.4964441129527336e-05, "loss": 725.1452, "step": 9000 }, { "ce_loss_10": 3.5651490688323975, "ce_loss_13": 3.51036376953125, "ce_loss_2": 4.134415197372436, "ce_loss_3": 3.954487180709839, "ce_loss_7": 3.6518504142761232, "epoch": 0.901, "grad_norm": 700.0, "kl_loss_10": 120.3508358001709, "kl_loss_2": 1357.658221435547, "kl_loss_3": 1008.9297454833984, "kl_loss_7": 324.7009643554687, "learning_rate": 2.4471741852423235e-05, "loss": 704.2774, "step": 9010 }, { "ce_loss_10": 3.6071965098381042, "ce_loss_13": 3.5544245719909666, "ce_loss_2": 4.1868168115615845, "ce_loss_3": 4.0108413934707645, "ce_loss_7": 3.695762050151825, "epoch": 0.902, "grad_norm": 704.0, "kl_loss_10": 122.07797813415527, "kl_loss_2": 1359.7636352539062, "kl_loss_3": 1012.6970703125, "kl_loss_7": 327.2878921508789, "learning_rate": 2.3983831139599287e-05, "loss": 709.9809, "step": 9020 }, { "ce_loss_10": 3.5311748027801513, "ce_loss_13": 3.47797931432724, "ce_loss_2": 4.098857629299164, "ce_loss_3": 3.923769438266754, "ce_loss_7": 3.61720005273819, "epoch": 0.903, "grad_norm": 1040.0, "kl_loss_10": 119.62764778137208, "kl_loss_2": 1359.2027770996094, "kl_loss_3": 1016.2866149902344, "kl_loss_7": 322.86070251464844, "learning_rate": 2.3500713904311022e-05, "loss": 696.3648, "step": 9030 }, { "ce_loss_10": 3.572555947303772, "ce_loss_13": 3.522153615951538, "ce_loss_2": 4.132451498508454, "ce_loss_3": 3.9519538998603823, "ce_loss_7": 3.65519403219223, "epoch": 0.904, "grad_norm": 540.0, "kl_loss_10": 119.08038482666015, "kl_loss_2": 1330.948681640625, "kl_loss_3": 984.9072479248047, "kl_loss_7": 319.1572540283203, "learning_rate": 2.3022395011543685e-05, "loss": 697.6727, "step": 9040 }, { "ce_loss_10": 3.6023966908454894, "ce_loss_13": 3.549352025985718, "ce_loss_2": 4.193000841140747, "ce_loss_3": 4.014123034477234, "ce_loss_7": 3.6947824120521546, "epoch": 0.905, "grad_norm": 640.0, "kl_loss_10": 123.01276931762695, "kl_loss_2": 1397.0471923828125, "kl_loss_3": 1045.0749206542969, "kl_loss_7": 336.0963409423828, "learning_rate": 2.2548879277963063e-05, "loss": 729.8438, "step": 9050 }, { "ce_loss_10": 3.517413592338562, "ce_loss_13": 3.4641988396644594, "ce_loss_2": 4.097688150405884, "ce_loss_3": 3.9156952857971192, "ce_loss_7": 3.603403353691101, "epoch": 0.906, "grad_norm": 548.0, "kl_loss_10": 120.84680671691895, "kl_loss_2": 1380.5550354003906, "kl_loss_3": 1028.3634216308594, "kl_loss_7": 326.4955429077148, "learning_rate": 2.208017147186736e-05, "loss": 700.2203, "step": 9060 }, { "ce_loss_10": 3.5140005707740785, "ce_loss_13": 3.461076188087463, "ce_loss_2": 4.10326977968216, "ce_loss_3": 3.9209877371788027, "ce_loss_7": 3.6043805360794066, "epoch": 0.907, "grad_norm": 492.0, "kl_loss_10": 121.33981552124024, "kl_loss_2": 1388.803759765625, "kl_loss_3": 1033.561016845703, "kl_loss_7": 330.547900390625, "learning_rate": 2.1616276313139227e-05, "loss": 708.1628, "step": 9070 }, { "ce_loss_10": 3.55505918264389, "ce_loss_13": 3.5002601861953737, "ce_loss_2": 4.133837890625, "ce_loss_3": 3.9574143290519714, "ce_loss_7": 3.645774781703949, "epoch": 0.908, "grad_norm": 548.0, "kl_loss_10": 121.9651065826416, "kl_loss_2": 1372.4083923339845, "kl_loss_3": 1024.9973175048829, "kl_loss_7": 329.2260681152344, "learning_rate": 2.1157198473197415e-05, "loss": 716.0005, "step": 9080 }, { "ce_loss_10": 3.6146026134490965, "ce_loss_13": 3.5627476572990417, "ce_loss_2": 4.199038410186768, "ce_loss_3": 4.024258577823639, "ce_loss_7": 3.7075974106788636, "epoch": 0.909, "grad_norm": 660.0, "kl_loss_10": 122.82046203613281, "kl_loss_2": 1376.7192138671876, "kl_loss_3": 1033.8393127441407, "kl_loss_7": 333.55931091308594, "learning_rate": 2.0702942574950812e-05, "loss": 717.5656, "step": 9090 }, { "ce_loss_10": 3.536907970905304, "ce_loss_13": 3.4830648064613343, "ce_loss_2": 4.130435681343078, "ce_loss_3": 3.9466469168663023, "ce_loss_7": 3.630732071399689, "epoch": 0.91, "grad_norm": 430.0, "kl_loss_10": 122.83297958374024, "kl_loss_2": 1402.052911376953, "kl_loss_3": 1039.2423034667968, "kl_loss_7": 334.35711669921875, "learning_rate": 2.025351319275137e-05, "loss": 720.0911, "step": 9100 }, { "ce_loss_10": 3.6753941059112547, "ce_loss_13": 3.6204377889633177, "ce_loss_2": 4.253736305236816, "ce_loss_3": 4.077165937423706, "ce_loss_7": 3.7643893718719483, "epoch": 0.911, "grad_norm": 684.0, "kl_loss_10": 127.13848838806152, "kl_loss_2": 1410.5498107910157, "kl_loss_3": 1052.9429016113281, "kl_loss_7": 338.90263214111326, "learning_rate": 1.9808914852347816e-05, "loss": 739.4578, "step": 9110 }, { "ce_loss_10": 3.5164944171905517, "ce_loss_13": 3.460708129405975, "ce_loss_2": 4.116264307498932, "ce_loss_3": 3.9366886615753174, "ce_loss_7": 3.608424687385559, "epoch": 0.912, "grad_norm": 466.0, "kl_loss_10": 122.47514915466309, "kl_loss_2": 1399.5153747558593, "kl_loss_3": 1043.8174133300781, "kl_loss_7": 330.74664306640625, "learning_rate": 1.9369152030840554e-05, "loss": 717.7691, "step": 9120 }, { "ce_loss_10": 3.5966818809509276, "ce_loss_13": 3.540822982788086, "ce_loss_2": 4.167646491527558, "ce_loss_3": 3.9919039726257326, "ce_loss_7": 3.6817273020744326, "epoch": 0.913, "grad_norm": 836.0, "kl_loss_10": 122.55923118591309, "kl_loss_2": 1386.7923583984375, "kl_loss_3": 1030.3927276611328, "kl_loss_7": 327.60221710205076, "learning_rate": 1.893422915663645e-05, "loss": 717.9165, "step": 9130 }, { "ce_loss_10": 3.463078701496124, "ce_loss_13": 3.4048054456710815, "ce_loss_2": 4.085853815078735, "ce_loss_3": 3.8942277312278746, "ce_loss_7": 3.555465543270111, "epoch": 0.914, "grad_norm": 532.0, "kl_loss_10": 123.98791694641113, "kl_loss_2": 1448.6031616210937, "kl_loss_3": 1079.3504547119142, "kl_loss_7": 336.90855102539064, "learning_rate": 1.850415060940386e-05, "loss": 735.5819, "step": 9140 }, { "ce_loss_10": 3.5885175228118897, "ce_loss_13": 3.5330852389335634, "ce_loss_2": 4.165771210193634, "ce_loss_3": 3.986107361316681, "ce_loss_7": 3.673333930969238, "epoch": 0.915, "grad_norm": 568.0, "kl_loss_10": 122.50748710632324, "kl_loss_2": 1369.035382080078, "kl_loss_3": 1022.5911560058594, "kl_loss_7": 329.97228851318357, "learning_rate": 1.8078920720028978e-05, "loss": 713.4122, "step": 9150 }, { "ce_loss_10": 3.513621473312378, "ce_loss_13": 3.461913502216339, "ce_loss_2": 4.0891550302505495, "ce_loss_3": 3.914201045036316, "ce_loss_7": 3.5989736318588257, "epoch": 0.916, "grad_norm": 608.0, "kl_loss_10": 119.31071891784669, "kl_loss_2": 1368.6764831542969, "kl_loss_3": 1018.941748046875, "kl_loss_7": 323.95599975585935, "learning_rate": 1.765854377057219e-05, "loss": 724.9803, "step": 9160 }, { "ce_loss_10": 3.4946448802948, "ce_loss_13": 3.444287621974945, "ce_loss_2": 4.072016930580139, "ce_loss_3": 3.891819429397583, "ce_loss_7": 3.581692707538605, "epoch": 0.917, "grad_norm": 568.0, "kl_loss_10": 118.33210716247558, "kl_loss_2": 1371.4457458496095, "kl_loss_3": 1019.0162200927734, "kl_loss_7": 322.0634735107422, "learning_rate": 1.724302399422456e-05, "loss": 713.6212, "step": 9170 }, { "ce_loss_10": 3.443930947780609, "ce_loss_13": 3.391281247138977, "ce_loss_2": 4.039050686359405, "ce_loss_3": 3.8598959922790526, "ce_loss_7": 3.5371329545974732, "epoch": 0.918, "grad_norm": 552.0, "kl_loss_10": 122.66972045898437, "kl_loss_2": 1399.7767211914063, "kl_loss_3": 1046.084487915039, "kl_loss_7": 337.6794631958008, "learning_rate": 1.683236557526574e-05, "loss": 725.2953, "step": 9180 }, { "ce_loss_10": 3.5698832750320433, "ce_loss_13": 3.517447316646576, "ce_loss_2": 4.126532173156738, "ce_loss_3": 3.9537552118301393, "ce_loss_7": 3.65188353061676, "epoch": 0.919, "grad_norm": 484.0, "kl_loss_10": 118.96306114196777, "kl_loss_2": 1337.912353515625, "kl_loss_3": 994.4553466796875, "kl_loss_7": 319.3814407348633, "learning_rate": 1.6426572649021475e-05, "loss": 706.8777, "step": 9190 }, { "ce_loss_10": 3.60268212556839, "ce_loss_13": 3.55240513086319, "ce_loss_2": 4.154273760318756, "ce_loss_3": 3.984792101383209, "ce_loss_7": 3.685652470588684, "epoch": 0.92, "grad_norm": 652.0, "kl_loss_10": 122.0446949005127, "kl_loss_2": 1341.5455627441406, "kl_loss_3": 996.6870147705079, "kl_loss_7": 324.77650756835936, "learning_rate": 1.6025649301821876e-05, "loss": 705.4512, "step": 9200 }, { "ce_loss_10": 3.5914915084838865, "ce_loss_13": 3.5390833497047423, "ce_loss_2": 4.147667050361633, "ce_loss_3": 3.9771674513816833, "ce_loss_7": 3.677826452255249, "epoch": 0.921, "grad_norm": 672.0, "kl_loss_10": 122.41096229553223, "kl_loss_2": 1365.7819885253907, "kl_loss_3": 1026.9352416992188, "kl_loss_7": 331.3459762573242, "learning_rate": 1.5629599570960716e-05, "loss": 710.4124, "step": 9210 }, { "ce_loss_10": 3.4989588618278504, "ce_loss_13": 3.443503034114838, "ce_loss_2": 4.072719764709473, "ce_loss_3": 3.895606553554535, "ce_loss_7": 3.583188033103943, "epoch": 0.922, "grad_norm": 564.0, "kl_loss_10": 121.58853492736816, "kl_loss_2": 1383.309326171875, "kl_loss_3": 1029.440249633789, "kl_loss_7": 329.04571533203125, "learning_rate": 1.5238427444654367e-05, "loss": 715.6756, "step": 9220 }, { "ce_loss_10": 3.5566193222999574, "ce_loss_13": 3.5052598237991335, "ce_loss_2": 4.12589042186737, "ce_loss_3": 3.9515591621398927, "ce_loss_7": 3.6445566058158874, "epoch": 0.923, "grad_norm": 656.0, "kl_loss_10": 120.25884666442872, "kl_loss_2": 1353.9432434082032, "kl_loss_3": 1004.0840515136719, "kl_loss_7": 324.49686737060546, "learning_rate": 1.4852136862001764e-05, "loss": 707.9903, "step": 9230 }, { "ce_loss_10": 3.519503819942474, "ce_loss_13": 3.4680001974105834, "ce_loss_2": 4.090194070339203, "ce_loss_3": 3.9192580938339234, "ce_loss_7": 3.603424859046936, "epoch": 0.924, "grad_norm": 536.0, "kl_loss_10": 116.57286224365234, "kl_loss_2": 1356.3960021972657, "kl_loss_3": 1013.3616424560547, "kl_loss_7": 322.25772247314455, "learning_rate": 1.4470731712944884e-05, "loss": 712.4052, "step": 9240 }, { "ce_loss_10": 3.546650528907776, "ce_loss_13": 3.4919941663742065, "ce_loss_2": 4.125348937511444, "ce_loss_3": 3.9468334317207336, "ce_loss_7": 3.637234890460968, "epoch": 0.925, "grad_norm": 560.0, "kl_loss_10": 121.30828056335449, "kl_loss_2": 1376.0818115234374, "kl_loss_3": 1026.3591033935547, "kl_loss_7": 330.9017593383789, "learning_rate": 1.4094215838229174e-05, "loss": 725.0938, "step": 9250 }, { "ce_loss_10": 3.506890153884888, "ce_loss_13": 3.4531724214553834, "ce_loss_2": 4.09957047700882, "ce_loss_3": 3.9183789253234864, "ce_loss_7": 3.595683777332306, "epoch": 0.926, "grad_norm": 668.0, "kl_loss_10": 120.98923149108887, "kl_loss_2": 1405.4196044921875, "kl_loss_3": 1046.142251586914, "kl_loss_7": 330.8429779052734, "learning_rate": 1.372259302936546e-05, "loss": 741.1663, "step": 9260 }, { "ce_loss_10": 3.6240168809890747, "ce_loss_13": 3.5658324003219604, "ce_loss_2": 4.201930189132691, "ce_loss_3": 4.022644340991974, "ce_loss_7": 3.7125780820846557, "epoch": 0.927, "grad_norm": 458.0, "kl_loss_10": 125.30646934509278, "kl_loss_2": 1376.9204162597657, "kl_loss_3": 1027.8259399414062, "kl_loss_7": 335.74131164550784, "learning_rate": 1.3355867028591206e-05, "loss": 709.1274, "step": 9270 }, { "ce_loss_10": 3.5262920260429382, "ce_loss_13": 3.4725812315940856, "ce_loss_2": 4.089487946033477, "ce_loss_3": 3.917177438735962, "ce_loss_7": 3.6126119017601015, "epoch": 0.928, "grad_norm": 528.0, "kl_loss_10": 120.4309455871582, "kl_loss_2": 1373.5199279785156, "kl_loss_3": 1023.1070373535156, "kl_loss_7": 328.47514190673826, "learning_rate": 1.2994041528833267e-05, "loss": 708.8351, "step": 9280 }, { "ce_loss_10": 3.5236886620521544, "ce_loss_13": 3.470105516910553, "ce_loss_2": 4.10210679769516, "ce_loss_3": 3.92024689912796, "ce_loss_7": 3.6086822271347048, "epoch": 0.929, "grad_norm": 576.0, "kl_loss_10": 120.04236526489258, "kl_loss_2": 1388.3319274902344, "kl_loss_3": 1022.3574737548828, "kl_loss_7": 325.6973571777344, "learning_rate": 1.2637120173670358e-05, "loss": 713.4749, "step": 9290 }, { "ce_loss_10": 3.5528993606567383, "ce_loss_13": 3.4979908227920533, "ce_loss_2": 4.1398141264915465, "ce_loss_3": 3.960055100917816, "ce_loss_7": 3.6429410934448243, "epoch": 0.93, "grad_norm": 752.0, "kl_loss_10": 121.99558181762696, "kl_loss_2": 1397.67568359375, "kl_loss_3": 1041.351336669922, "kl_loss_7": 332.56233978271484, "learning_rate": 1.2285106557296478e-05, "loss": 717.6662, "step": 9300 }, { "ce_loss_10": 3.4253716588020326, "ce_loss_13": 3.374079155921936, "ce_loss_2": 4.050317943096161, "ce_loss_3": 3.85725314617157, "ce_loss_7": 3.5170278906822205, "epoch": 0.931, "grad_norm": 856.0, "kl_loss_10": 120.44239501953125, "kl_loss_2": 1437.18662109375, "kl_loss_3": 1066.2714141845704, "kl_loss_7": 331.22974700927733, "learning_rate": 1.1938004224484989e-05, "loss": 725.9393, "step": 9310 }, { "ce_loss_10": 3.6618238210678102, "ce_loss_13": 3.606413471698761, "ce_loss_2": 4.2395406603813175, "ce_loss_3": 4.060720527172089, "ce_loss_7": 3.7527721166610717, "epoch": 0.932, "grad_norm": 620.0, "kl_loss_10": 124.90289039611817, "kl_loss_2": 1384.36318359375, "kl_loss_3": 1029.1616516113281, "kl_loss_7": 333.3983581542969, "learning_rate": 1.1595816670552429e-05, "loss": 726.8374, "step": 9320 }, { "ce_loss_10": 3.595258188247681, "ce_loss_13": 3.538734793663025, "ce_loss_2": 4.16660704612732, "ce_loss_3": 3.9852279901504515, "ce_loss_7": 3.6792800307273863, "epoch": 0.933, "grad_norm": 620.0, "kl_loss_10": 122.25293235778808, "kl_loss_2": 1363.7237548828125, "kl_loss_3": 1008.4813812255859, "kl_loss_7": 325.02245025634767, "learning_rate": 1.1258547341323699e-05, "loss": 704.2615, "step": 9330 }, { "ce_loss_10": 3.619642269611359, "ce_loss_13": 3.5654753684997558, "ce_loss_2": 4.190784621238708, "ce_loss_3": 4.012927258014679, "ce_loss_7": 3.707143473625183, "epoch": 0.934, "grad_norm": 704.0, "kl_loss_10": 122.84645042419433, "kl_loss_2": 1390.2992065429687, "kl_loss_3": 1031.7191131591796, "kl_loss_7": 331.35271606445315, "learning_rate": 1.0926199633097156e-05, "loss": 714.1544, "step": 9340 }, { "ce_loss_10": 3.6207199335098266, "ce_loss_13": 3.5691072225570677, "ce_loss_2": 4.169072163105011, "ce_loss_3": 3.9992053508758545, "ce_loss_7": 3.7047256231307983, "epoch": 0.935, "grad_norm": 524.0, "kl_loss_10": 119.55591087341308, "kl_loss_2": 1335.967266845703, "kl_loss_3": 1000.2812530517579, "kl_loss_7": 321.08820190429685, "learning_rate": 1.0598776892610684e-05, "loss": 714.8448, "step": 9350 }, { "ce_loss_10": 3.431315243244171, "ce_loss_13": 3.381504237651825, "ce_loss_2": 4.028350937366485, "ce_loss_3": 3.8452399015426635, "ce_loss_7": 3.5204859137535096, "epoch": 0.936, "grad_norm": 492.0, "kl_loss_10": 118.58314819335938, "kl_loss_2": 1395.6468383789063, "kl_loss_3": 1031.9574981689452, "kl_loss_7": 323.2079360961914, "learning_rate": 1.0276282417007399e-05, "loss": 710.134, "step": 9360 }, { "ce_loss_10": 3.593991827964783, "ce_loss_13": 3.543764090538025, "ce_loss_2": 4.150844514369965, "ce_loss_3": 3.976518452167511, "ce_loss_7": 3.680371618270874, "epoch": 0.937, "grad_norm": 608.0, "kl_loss_10": 119.22171020507812, "kl_loss_2": 1340.3917053222656, "kl_loss_3": 996.455322265625, "kl_loss_7": 321.72610626220705, "learning_rate": 9.958719453803277e-06, "loss": 703.7472, "step": 9370 }, { "ce_loss_10": 3.5886499643325807, "ce_loss_13": 3.535153257846832, "ce_loss_2": 4.169439589977264, "ce_loss_3": 3.9961138367652893, "ce_loss_7": 3.681587278842926, "epoch": 0.938, "grad_norm": 632.0, "kl_loss_10": 121.95464172363282, "kl_loss_2": 1374.1223999023437, "kl_loss_3": 1033.233740234375, "kl_loss_7": 331.1907165527344, "learning_rate": 9.646091200853802e-06, "loss": 715.3208, "step": 9380 }, { "ce_loss_10": 3.5463305473327638, "ce_loss_13": 3.4947377681732177, "ce_loss_2": 4.119847238063812, "ce_loss_3": 3.9456714272499083, "ce_loss_7": 3.632569658756256, "epoch": 0.939, "grad_norm": 600.0, "kl_loss_10": 118.46955299377441, "kl_loss_2": 1357.3059997558594, "kl_loss_3": 1010.5910614013671, "kl_loss_7": 323.0857559204102, "learning_rate": 9.338400806321978e-06, "loss": 692.6078, "step": 9390 }, { "ce_loss_10": 3.58269464969635, "ce_loss_13": 3.5270420789718626, "ce_loss_2": 4.153971230983734, "ce_loss_3": 3.977994203567505, "ce_loss_7": 3.6744091510772705, "epoch": 0.94, "grad_norm": 576.0, "kl_loss_10": 122.99457702636718, "kl_loss_2": 1362.3225341796874, "kl_loss_3": 1014.1672058105469, "kl_loss_7": 329.80276641845705, "learning_rate": 9.035651368646646e-06, "loss": 707.3777, "step": 9400 }, { "ce_loss_10": 3.583444893360138, "ce_loss_13": 3.5304938554763794, "ce_loss_2": 4.144635462760926, "ce_loss_3": 3.973416244983673, "ce_loss_7": 3.669160747528076, "epoch": 0.941, "grad_norm": 612.0, "kl_loss_10": 120.20222663879395, "kl_loss_2": 1350.0137756347656, "kl_loss_3": 1008.0319641113281, "kl_loss_7": 325.5893859863281, "learning_rate": 8.737845936511335e-06, "loss": 711.1381, "step": 9410 }, { "ce_loss_10": 3.534907364845276, "ce_loss_13": 3.4788982629776, "ce_loss_2": 4.116820418834687, "ce_loss_3": 3.93233003616333, "ce_loss_7": 3.6241695284843445, "epoch": 0.942, "grad_norm": 552.0, "kl_loss_10": 123.2990665435791, "kl_loss_2": 1381.4971130371093, "kl_loss_3": 1029.372280883789, "kl_loss_7": 332.8606323242187, "learning_rate": 8.444987508813451e-06, "loss": 712.9717, "step": 9420 }, { "ce_loss_10": 3.486678731441498, "ce_loss_13": 3.4316638946533202, "ce_loss_2": 4.079938578605652, "ce_loss_3": 3.897518050670624, "ce_loss_7": 3.5741005659103395, "epoch": 0.943, "grad_norm": 592.0, "kl_loss_10": 122.28645133972168, "kl_loss_2": 1428.6765991210937, "kl_loss_3": 1060.144955444336, "kl_loss_7": 334.1707931518555, "learning_rate": 8.157079034633974e-06, "loss": 725.4578, "step": 9430 }, { "ce_loss_10": 3.487192440032959, "ce_loss_13": 3.434910070896149, "ce_loss_2": 4.076284384727478, "ce_loss_3": 3.895754504203796, "ce_loss_7": 3.5757576704025267, "epoch": 0.944, "grad_norm": 438.0, "kl_loss_10": 120.67500839233398, "kl_loss_2": 1416.8783447265625, "kl_loss_3": 1052.7425445556642, "kl_loss_7": 329.4114395141602, "learning_rate": 7.874123413208145e-06, "loss": 718.8903, "step": 9440 }, { "ce_loss_10": 3.4529663920402527, "ce_loss_13": 3.4014953494071962, "ce_loss_2": 4.053472077846527, "ce_loss_3": 3.86897349357605, "ce_loss_7": 3.5433228611946106, "epoch": 0.945, "grad_norm": 434.0, "kl_loss_10": 119.67061233520508, "kl_loss_2": 1402.618475341797, "kl_loss_3": 1041.5067199707032, "kl_loss_7": 330.63839569091795, "learning_rate": 7.59612349389599e-06, "loss": 719.7231, "step": 9450 }, { "ce_loss_10": 3.547183060646057, "ce_loss_13": 3.4943931818008425, "ce_loss_2": 4.110193264484406, "ce_loss_3": 3.9380640506744387, "ce_loss_7": 3.6319218397140505, "epoch": 0.946, "grad_norm": 752.0, "kl_loss_10": 118.77492752075196, "kl_loss_2": 1337.786553955078, "kl_loss_3": 997.432504272461, "kl_loss_7": 323.5429290771484, "learning_rate": 7.323082076153509e-06, "loss": 706.6428, "step": 9460 }, { "ce_loss_10": 3.5930521965026854, "ce_loss_13": 3.538786733150482, "ce_loss_2": 4.154218971729279, "ce_loss_3": 3.979544770717621, "ce_loss_7": 3.6779901146888734, "epoch": 0.947, "grad_norm": 548.0, "kl_loss_10": 122.71770210266114, "kl_loss_2": 1348.3024658203126, "kl_loss_3": 1007.2475158691407, "kl_loss_7": 330.79364318847655, "learning_rate": 7.055001909504755e-06, "loss": 715.8398, "step": 9470 }, { "ce_loss_10": 3.6217771649360655, "ce_loss_13": 3.567501354217529, "ce_loss_2": 4.193489670753479, "ce_loss_3": 4.013223147392273, "ce_loss_7": 3.7104034662246703, "epoch": 0.948, "grad_norm": 436.0, "kl_loss_10": 123.1129165649414, "kl_loss_2": 1368.2989318847656, "kl_loss_3": 1020.2081787109375, "kl_loss_7": 330.82146606445315, "learning_rate": 6.791885693514133e-06, "loss": 715.8734, "step": 9480 }, { "ce_loss_10": 3.52802118062973, "ce_loss_13": 3.475515973567963, "ce_loss_2": 4.114694261550904, "ce_loss_3": 3.9377319693565367, "ce_loss_7": 3.617742133140564, "epoch": 0.949, "grad_norm": 1104.0, "kl_loss_10": 122.59672470092774, "kl_loss_2": 1394.1083374023438, "kl_loss_3": 1037.977471923828, "kl_loss_7": 331.45179290771483, "learning_rate": 6.533736077758867e-06, "loss": 721.0864, "step": 9490 }, { "ce_loss_10": 3.489359438419342, "ce_loss_13": 3.4337912678718565, "ce_loss_2": 4.0956980109214784, "ce_loss_3": 3.90721480846405, "ce_loss_7": 3.581552469730377, "epoch": 0.95, "grad_norm": 840.0, "kl_loss_10": 123.24414253234863, "kl_loss_2": 1427.8860778808594, "kl_loss_3": 1052.8961486816406, "kl_loss_7": 336.8455841064453, "learning_rate": 6.2805556618028556e-06, "loss": 722.0676, "step": 9500 }, { "ce_loss_10": 3.5827051520347597, "ce_loss_13": 3.5319561839103697, "ce_loss_2": 4.139924156665802, "ce_loss_3": 3.966154897212982, "ce_loss_7": 3.6660770177841187, "epoch": 0.951, "grad_norm": 640.0, "kl_loss_10": 119.18858680725097, "kl_loss_2": 1329.3287841796875, "kl_loss_3": 987.8519744873047, "kl_loss_7": 318.80920562744143, "learning_rate": 6.032346995169968e-06, "loss": 686.1183, "step": 9510 }, { "ce_loss_10": 3.588587963581085, "ce_loss_13": 3.5356736183166504, "ce_loss_2": 4.163759100437164, "ce_loss_3": 3.986702060699463, "ce_loss_7": 3.6727798104286196, "epoch": 0.952, "grad_norm": 612.0, "kl_loss_10": 121.3857364654541, "kl_loss_2": 1371.2152526855468, "kl_loss_3": 1021.5204376220703, "kl_loss_7": 325.9972854614258, "learning_rate": 5.789112577318789e-06, "loss": 707.1283, "step": 9520 }, { "ce_loss_10": 3.566323149204254, "ce_loss_13": 3.5122985243797302, "ce_loss_2": 4.147273254394531, "ce_loss_3": 3.9658782124519347, "ce_loss_7": 3.650079107284546, "epoch": 0.953, "grad_norm": 624.0, "kl_loss_10": 122.64759712219238, "kl_loss_2": 1404.9125610351562, "kl_loss_3": 1038.5805755615233, "kl_loss_7": 331.40319061279297, "learning_rate": 5.550854857617194e-06, "loss": 709.3621, "step": 9530 }, { "ce_loss_10": 3.5508513927459715, "ce_loss_13": 3.49852135181427, "ce_loss_2": 4.146372210979462, "ce_loss_3": 3.9630183458328245, "ce_loss_7": 3.6422205328941346, "epoch": 0.954, "grad_norm": 808.0, "kl_loss_10": 124.43129844665528, "kl_loss_2": 1406.9052978515624, "kl_loss_3": 1050.3708709716798, "kl_loss_7": 337.19909210205077, "learning_rate": 5.317576235317756e-06, "loss": 722.1983, "step": 9540 }, { "ce_loss_10": 3.579137623310089, "ce_loss_13": 3.525528633594513, "ce_loss_2": 4.13328732252121, "ce_loss_3": 3.9567643284797667, "ce_loss_7": 3.6605886578559876, "epoch": 0.955, "grad_norm": 672.0, "kl_loss_10": 120.62237091064453, "kl_loss_2": 1320.1938232421876, "kl_loss_3": 981.861474609375, "kl_loss_7": 320.4243621826172, "learning_rate": 5.089279059533658e-06, "loss": 709.8652, "step": 9550 }, { "ce_loss_10": 3.6386606693267822, "ce_loss_13": 3.5838342666625977, "ce_loss_2": 4.210740327835083, "ce_loss_3": 4.036587131023407, "ce_loss_7": 3.7290936589241026, "epoch": 0.956, "grad_norm": 564.0, "kl_loss_10": 126.81226425170898, "kl_loss_2": 1377.3177856445313, "kl_loss_3": 1028.8962188720702, "kl_loss_7": 336.68345794677737, "learning_rate": 4.865965629214819e-06, "loss": 712.0689, "step": 9560 }, { "ce_loss_10": 3.5826368927955627, "ce_loss_13": 3.5299017906188963, "ce_loss_2": 4.156905472278595, "ce_loss_3": 3.9820918679237365, "ce_loss_7": 3.6713940501213074, "epoch": 0.957, "grad_norm": 540.0, "kl_loss_10": 122.74875144958496, "kl_loss_2": 1392.4383850097656, "kl_loss_3": 1042.0474884033204, "kl_loss_7": 333.6945205688477, "learning_rate": 4.6476381931251366e-06, "loss": 708.1864, "step": 9570 }, { "ce_loss_10": 3.5621810436248778, "ce_loss_13": 3.509333276748657, "ce_loss_2": 4.1382688164711, "ce_loss_3": 3.9572407245635985, "ce_loss_7": 3.6505929470062255, "epoch": 0.958, "grad_norm": 588.0, "kl_loss_10": 120.34364318847656, "kl_loss_2": 1372.2328552246095, "kl_loss_3": 1022.0481781005859, "kl_loss_7": 328.2518478393555, "learning_rate": 4.434298949819449e-06, "loss": 710.7535, "step": 9580 }, { "ce_loss_10": 3.517833399772644, "ce_loss_13": 3.4627076268196104, "ce_loss_2": 4.125231349468232, "ce_loss_3": 3.937967097759247, "ce_loss_7": 3.6109151244163513, "epoch": 0.959, "grad_norm": 496.0, "kl_loss_10": 124.38531723022462, "kl_loss_2": 1447.3932067871094, "kl_loss_3": 1073.26435546875, "kl_loss_7": 339.64784240722656, "learning_rate": 4.2259500476214406e-06, "loss": 729.0812, "step": 9590 }, { "ce_loss_10": 3.5029053330421447, "ce_loss_13": 3.4484299540519716, "ce_loss_2": 4.084228038787842, "ce_loss_3": 3.901972544193268, "ce_loss_7": 3.5915270805358888, "epoch": 0.96, "grad_norm": 482.0, "kl_loss_10": 121.2360626220703, "kl_loss_2": 1395.7050842285157, "kl_loss_3": 1035.7142211914063, "kl_loss_7": 330.5919525146484, "learning_rate": 4.02259358460233e-06, "loss": 712.7003, "step": 9600 }, { "ce_loss_10": 3.569297027587891, "ce_loss_13": 3.5153488874435426, "ce_loss_2": 4.138071930408477, "ce_loss_3": 3.9621680974960327, "ce_loss_7": 3.655005395412445, "epoch": 0.961, "grad_norm": 520.0, "kl_loss_10": 122.30363540649414, "kl_loss_2": 1365.015313720703, "kl_loss_3": 1017.6215026855468, "kl_loss_7": 330.8979751586914, "learning_rate": 3.8242316085594916e-06, "loss": 706.2896, "step": 9610 }, { "ce_loss_10": 3.453506600856781, "ce_loss_13": 3.4016806364059446, "ce_loss_2": 4.060185205936432, "ce_loss_3": 3.874277877807617, "ce_loss_7": 3.5428122758865355, "epoch": 0.962, "grad_norm": 548.0, "kl_loss_10": 121.87925300598144, "kl_loss_2": 1436.2672180175782, "kl_loss_3": 1071.4315460205078, "kl_loss_7": 334.0261703491211, "learning_rate": 3.630866116995757e-06, "loss": 735.1148, "step": 9620 }, { "ce_loss_10": 3.608195972442627, "ce_loss_13": 3.555492627620697, "ce_loss_2": 4.172922539710998, "ce_loss_3": 3.999432122707367, "ce_loss_7": 3.695437788963318, "epoch": 0.963, "grad_norm": 480.0, "kl_loss_10": 120.99373207092285, "kl_loss_2": 1338.744921875, "kl_loss_3": 1001.2767883300781, "kl_loss_7": 323.61705017089844, "learning_rate": 3.4424990570994797e-06, "loss": 713.4098, "step": 9630 }, { "ce_loss_10": 3.6004083514213563, "ce_loss_13": 3.545426595211029, "ce_loss_2": 4.163943660259247, "ce_loss_3": 3.9917925119400026, "ce_loss_7": 3.683776152133942, "epoch": 0.964, "grad_norm": 422.0, "kl_loss_10": 120.70787086486817, "kl_loss_2": 1359.9990783691405, "kl_loss_3": 1013.7219421386719, "kl_loss_7": 325.9754806518555, "learning_rate": 3.2591323257248896e-06, "loss": 708.6987, "step": 9640 }, { "ce_loss_10": 3.4461316108703612, "ce_loss_13": 3.3950406193733214, "ce_loss_2": 4.035059344768524, "ce_loss_3": 3.8544202208518983, "ce_loss_7": 3.5323742747306826, "epoch": 0.965, "grad_norm": 588.0, "kl_loss_10": 119.44026069641113, "kl_loss_2": 1390.1842041015625, "kl_loss_3": 1035.5374267578125, "kl_loss_7": 326.7879272460938, "learning_rate": 3.0807677693729385e-06, "loss": 718.7271, "step": 9650 }, { "ce_loss_10": 3.6339459300041197, "ce_loss_13": 3.5821001648902895, "ce_loss_2": 4.195254683494568, "ce_loss_3": 4.029372644424439, "ce_loss_7": 3.721749794483185, "epoch": 0.966, "grad_norm": 624.0, "kl_loss_10": 120.16477317810059, "kl_loss_2": 1345.0911743164063, "kl_loss_3": 1009.0331939697265, "kl_loss_7": 326.2617385864258, "learning_rate": 2.9074071841727055e-06, "loss": 700.3471, "step": 9660 }, { "ce_loss_10": 3.564396059513092, "ce_loss_13": 3.5104949116706847, "ce_loss_2": 4.14932644367218, "ce_loss_3": 3.9688431143760683, "ce_loss_7": 3.655857837200165, "epoch": 0.967, "grad_norm": 824.0, "kl_loss_10": 121.51194686889649, "kl_loss_2": 1391.90537109375, "kl_loss_3": 1034.157763671875, "kl_loss_7": 333.8142547607422, "learning_rate": 2.739052315863355e-06, "loss": 705.9752, "step": 9670 }, { "ce_loss_10": 3.547489809989929, "ce_loss_13": 3.4908099293708803, "ce_loss_2": 4.124406385421753, "ce_loss_3": 3.952444839477539, "ce_loss_7": 3.6360405683517456, "epoch": 0.968, "grad_norm": 708.0, "kl_loss_10": 123.83374938964843, "kl_loss_2": 1374.874462890625, "kl_loss_3": 1031.4533416748047, "kl_loss_7": 329.6816864013672, "learning_rate": 2.5757048597765396e-06, "loss": 709.6677, "step": 9680 }, { "ce_loss_10": 3.5581902265548706, "ce_loss_13": 3.5045996785163878, "ce_loss_2": 4.142367708683014, "ce_loss_3": 3.960366427898407, "ce_loss_7": 3.6433528900146483, "epoch": 0.969, "grad_norm": 716.0, "kl_loss_10": 121.9698501586914, "kl_loss_2": 1398.1140563964843, "kl_loss_3": 1040.0749725341798, "kl_loss_7": 330.96569061279297, "learning_rate": 2.417366460819359e-06, "loss": 716.8742, "step": 9690 }, { "ce_loss_10": 3.5652561664581297, "ce_loss_13": 3.5113066673278808, "ce_loss_2": 4.170096576213837, "ce_loss_3": 3.9840278029441833, "ce_loss_7": 3.657517433166504, "epoch": 0.97, "grad_norm": 648.0, "kl_loss_10": 124.85320587158203, "kl_loss_2": 1419.4852294921875, "kl_loss_3": 1056.5964263916017, "kl_loss_7": 336.44810638427737, "learning_rate": 2.2640387134577057e-06, "loss": 715.8505, "step": 9700 }, { "ce_loss_10": 3.4900317192077637, "ce_loss_13": 3.438812232017517, "ce_loss_2": 4.047373950481415, "ce_loss_3": 3.874912989139557, "ce_loss_7": 3.5746123671531675, "epoch": 0.971, "grad_norm": 552.0, "kl_loss_10": 115.83142395019532, "kl_loss_2": 1328.5251892089843, "kl_loss_3": 984.848501586914, "kl_loss_7": 316.37536163330077, "learning_rate": 2.115723161700278e-06, "loss": 696.8717, "step": 9710 }, { "ce_loss_10": 3.4717228293418883, "ce_loss_13": 3.4165940165519713, "ce_loss_2": 4.07243583202362, "ce_loss_3": 3.8875492215156555, "ce_loss_7": 3.5627353191375732, "epoch": 0.972, "grad_norm": 588.0, "kl_loss_10": 124.52442169189453, "kl_loss_2": 1418.6435485839843, "kl_loss_3": 1056.855615234375, "kl_loss_7": 337.8177230834961, "learning_rate": 1.9724212990830937e-06, "loss": 729.082, "step": 9720 }, { "ce_loss_10": 3.620276391506195, "ce_loss_13": 3.5666480660438538, "ce_loss_2": 4.208401417732238, "ce_loss_3": 4.026053476333618, "ce_loss_7": 3.710793709754944, "epoch": 0.973, "grad_norm": 464.0, "kl_loss_10": 123.11025886535644, "kl_loss_2": 1394.9627868652344, "kl_loss_3": 1036.4825164794922, "kl_loss_7": 333.15184173583987, "learning_rate": 1.8341345686543331e-06, "loss": 717.9429, "step": 9730 }, { "ce_loss_10": 3.6050823092460633, "ce_loss_13": 3.5513722777366636, "ce_loss_2": 4.157528936862946, "ce_loss_3": 3.9856823086738586, "ce_loss_7": 3.689386820793152, "epoch": 0.974, "grad_norm": 804.0, "kl_loss_10": 120.63820838928223, "kl_loss_2": 1337.2397399902343, "kl_loss_3": 1002.3361602783203, "kl_loss_7": 325.78345794677733, "learning_rate": 1.7008643629596864e-06, "loss": 711.823, "step": 9740 }, { "ce_loss_10": 3.586062693595886, "ce_loss_13": 3.53137663602829, "ce_loss_2": 4.1578493475914, "ce_loss_3": 3.981482672691345, "ce_loss_7": 3.676142621040344, "epoch": 0.975, "grad_norm": 604.0, "kl_loss_10": 122.01227416992188, "kl_loss_2": 1380.4110900878907, "kl_loss_3": 1023.8995727539062, "kl_loss_7": 329.3552581787109, "learning_rate": 1.5726120240288633e-06, "loss": 721.4617, "step": 9750 }, { "ce_loss_10": 3.4836260080337524, "ce_loss_13": 3.431521987915039, "ce_loss_2": 4.062640523910522, "ce_loss_3": 3.8828797817230223, "ce_loss_7": 3.5693193793296816, "epoch": 0.976, "grad_norm": 528.0, "kl_loss_10": 119.90267982482911, "kl_loss_2": 1373.1053588867187, "kl_loss_3": 1020.3870666503906, "kl_loss_7": 327.03973999023435, "learning_rate": 1.4493788433612708e-06, "loss": 706.3937, "step": 9760 }, { "ce_loss_10": 3.603860354423523, "ce_loss_13": 3.5489115476608277, "ce_loss_2": 4.185849332809449, "ce_loss_3": 4.005133509635925, "ce_loss_7": 3.6924179434776305, "epoch": 0.977, "grad_norm": 516.0, "kl_loss_10": 121.7234733581543, "kl_loss_2": 1390.346942138672, "kl_loss_3": 1030.5024047851562, "kl_loss_7": 329.0796890258789, "learning_rate": 1.3311660619138578e-06, "loss": 718.6364, "step": 9770 }, { "ce_loss_10": 3.598566448688507, "ce_loss_13": 3.546519470214844, "ce_loss_2": 4.149194121360779, "ce_loss_3": 3.9783671855926515, "ce_loss_7": 3.6831716775894163, "epoch": 0.978, "grad_norm": 548.0, "kl_loss_10": 120.76995811462402, "kl_loss_2": 1329.148126220703, "kl_loss_3": 995.5872283935547, "kl_loss_7": 324.982649230957, "learning_rate": 1.2179748700879012e-06, "loss": 702.9616, "step": 9780 }, { "ce_loss_10": 3.527452754974365, "ce_loss_13": 3.4756274223327637, "ce_loss_2": 4.103048396110535, "ce_loss_3": 3.9268922090530394, "ce_loss_7": 3.6169739961624146, "epoch": 0.979, "grad_norm": 576.0, "kl_loss_10": 121.63832931518554, "kl_loss_2": 1367.4532653808594, "kl_loss_3": 1023.8312957763671, "kl_loss_7": 327.45165252685547, "learning_rate": 1.1098064077174619e-06, "loss": 710.6072, "step": 9790 }, { "ce_loss_10": 3.560258626937866, "ce_loss_13": 3.5036699771881104, "ce_loss_2": 4.162828862667084, "ce_loss_3": 3.9741065382957457, "ce_loss_7": 3.651784634590149, "epoch": 0.98, "grad_norm": 732.0, "kl_loss_10": 120.97579002380371, "kl_loss_2": 1405.2699890136719, "kl_loss_3": 1040.5084320068358, "kl_loss_7": 329.4268035888672, "learning_rate": 1.006661764057837e-06, "loss": 717.7347, "step": 9800 }, { "ce_loss_10": 3.564870071411133, "ce_loss_13": 3.5106621384620667, "ce_loss_2": 4.146739649772644, "ce_loss_3": 3.9645097851753235, "ce_loss_7": 3.6526604771614073, "epoch": 0.981, "grad_norm": 624.0, "kl_loss_10": 120.16670303344726, "kl_loss_2": 1385.8602905273438, "kl_loss_3": 1033.0024688720703, "kl_loss_7": 325.0816711425781, "learning_rate": 9.085419777743465e-07, "loss": 712.9294, "step": 9810 }, { "ce_loss_10": 3.502484345436096, "ce_loss_13": 3.4494829535484315, "ce_loss_2": 4.077840411663056, "ce_loss_3": 3.901638996601105, "ce_loss_7": 3.5905481934547425, "epoch": 0.982, "grad_norm": 430.0, "kl_loss_10": 117.31855773925781, "kl_loss_2": 1365.9148986816406, "kl_loss_3": 1018.6503601074219, "kl_loss_7": 321.9003936767578, "learning_rate": 8.15448036932176e-07, "loss": 700.3924, "step": 9820 }, { "ce_loss_10": 3.553510880470276, "ce_loss_13": 3.5014522314071654, "ce_loss_2": 4.126162922382354, "ce_loss_3": 3.9481998801231386, "ce_loss_7": 3.6414800763130186, "epoch": 0.983, "grad_norm": 704.0, "kl_loss_10": 120.78939666748047, "kl_loss_2": 1375.8851440429687, "kl_loss_3": 1031.1501098632812, "kl_loss_7": 328.82582702636716, "learning_rate": 7.273808789862724e-07, "loss": 719.6496, "step": 9830 }, { "ce_loss_10": 3.6392180204391478, "ce_loss_13": 3.5844750881195067, "ce_loss_2": 4.2080818772315975, "ce_loss_3": 4.02874116897583, "ce_loss_7": 3.7243886232376098, "epoch": 0.984, "grad_norm": 588.0, "kl_loss_10": 122.5202766418457, "kl_loss_2": 1367.7145080566406, "kl_loss_3": 1019.624478149414, "kl_loss_7": 330.5028793334961, "learning_rate": 6.443413907720186e-07, "loss": 706.112, "step": 9840 }, { "ce_loss_10": 3.5670676469802856, "ce_loss_13": 3.5155144810676573, "ce_loss_2": 4.136582219600678, "ce_loss_3": 3.958021545410156, "ce_loss_7": 3.6541135787963865, "epoch": 0.985, "grad_norm": 572.0, "kl_loss_10": 120.43879356384278, "kl_loss_2": 1362.252947998047, "kl_loss_3": 1014.570947265625, "kl_loss_7": 327.1987045288086, "learning_rate": 5.663304084960185e-07, "loss": 705.8062, "step": 9850 }, { "ce_loss_10": 3.493795156478882, "ce_loss_13": 3.439519703388214, "ce_loss_2": 4.085382175445557, "ce_loss_3": 3.902758014202118, "ce_loss_7": 3.5809911727905273, "epoch": 0.986, "grad_norm": 664.0, "kl_loss_10": 121.81441802978516, "kl_loss_2": 1393.3001831054687, "kl_loss_3": 1030.4414154052733, "kl_loss_7": 327.58775482177737, "learning_rate": 4.933487177280482e-07, "loss": 703.9349, "step": 9860 }, { "ce_loss_10": 3.5870001196861265, "ce_loss_13": 3.534350836277008, "ce_loss_2": 4.159269857406616, "ce_loss_3": 3.9821672320365904, "ce_loss_7": 3.67519474029541, "epoch": 0.987, "grad_norm": 608.0, "kl_loss_10": 119.21302146911621, "kl_loss_2": 1369.2319030761719, "kl_loss_3": 1017.3517211914062, "kl_loss_7": 324.8195739746094, "learning_rate": 4.2539705339295075e-07, "loss": 704.4329, "step": 9870 }, { "ce_loss_10": 3.4422147393226625, "ce_loss_13": 3.3882473349571227, "ce_loss_2": 4.03389185667038, "ce_loss_3": 3.855961525440216, "ce_loss_7": 3.5283602476119995, "epoch": 0.988, "grad_norm": 616.0, "kl_loss_10": 120.49687614440919, "kl_loss_2": 1390.164501953125, "kl_loss_3": 1040.5754180908202, "kl_loss_7": 328.028791809082, "learning_rate": 3.6247609976319816e-07, "loss": 708.3993, "step": 9880 }, { "ce_loss_10": 3.5487411856651305, "ce_loss_13": 3.4932268500328063, "ce_loss_2": 4.1378997445106505, "ce_loss_3": 3.956749749183655, "ce_loss_7": 3.6394209384918215, "epoch": 0.989, "grad_norm": 684.0, "kl_loss_10": 122.58170623779297, "kl_loss_2": 1392.1090087890625, "kl_loss_3": 1035.2212493896484, "kl_loss_7": 332.9962554931641, "learning_rate": 3.0458649045211895e-07, "loss": 730.05, "step": 9890 }, { "ce_loss_10": 3.5127838611602784, "ce_loss_13": 3.4589962124824525, "ce_loss_2": 4.101286160945892, "ce_loss_3": 3.9235021352767943, "ce_loss_7": 3.6055259227752687, "epoch": 0.99, "grad_norm": 624.0, "kl_loss_10": 123.06077766418457, "kl_loss_2": 1395.3993530273438, "kl_loss_3": 1043.9721893310548, "kl_loss_7": 335.08056182861327, "learning_rate": 2.517288084074587e-07, "loss": 727.1166, "step": 9900 }, { "ce_loss_10": 3.5486058712005617, "ce_loss_13": 3.494112956523895, "ce_loss_2": 4.158055460453033, "ce_loss_3": 3.9726677179336547, "ce_loss_7": 3.642500126361847, "epoch": 0.991, "grad_norm": 604.0, "kl_loss_10": 123.90233879089355, "kl_loss_2": 1423.9731567382812, "kl_loss_3": 1061.412161254883, "kl_loss_7": 340.47805938720705, "learning_rate": 2.0390358590538505e-07, "loss": 725.7766, "step": 9910 }, { "ce_loss_10": 3.554875147342682, "ce_loss_13": 3.5004802942276, "ce_loss_2": 4.137669503688812, "ce_loss_3": 3.9679285287857056, "ce_loss_7": 3.645340549945831, "epoch": 0.992, "grad_norm": 516.0, "kl_loss_10": 122.4406665802002, "kl_loss_2": 1388.8013732910156, "kl_loss_3": 1043.7246154785157, "kl_loss_7": 334.8171920776367, "learning_rate": 1.61111304545436e-07, "loss": 713.2598, "step": 9920 }, { "ce_loss_10": 3.5292787551879883, "ce_loss_13": 3.4751243948936463, "ce_loss_2": 4.103587174415589, "ce_loss_3": 3.930684947967529, "ce_loss_7": 3.6189408540725707, "epoch": 0.993, "grad_norm": 524.0, "kl_loss_10": 120.616743850708, "kl_loss_2": 1377.230780029297, "kl_loss_3": 1032.888330078125, "kl_loss_7": 326.32662811279295, "learning_rate": 1.2335239524541298e-07, "loss": 705.4175, "step": 9930 }, { "ce_loss_10": 3.4969447016716004, "ce_loss_13": 3.444041609764099, "ce_loss_2": 4.070138645172119, "ce_loss_3": 3.897006869316101, "ce_loss_7": 3.5882808804512023, "epoch": 0.994, "grad_norm": 480.0, "kl_loss_10": 119.86849899291992, "kl_loss_2": 1356.5659057617188, "kl_loss_3": 1016.5262145996094, "kl_loss_7": 323.5000595092773, "learning_rate": 9.06272382371065e-08, "loss": 711.4588, "step": 9940 }, { "ce_loss_10": 3.562340235710144, "ce_loss_13": 3.509819734096527, "ce_loss_2": 4.1450182557106015, "ce_loss_3": 3.96974858045578, "ce_loss_7": 3.6521814823150636, "epoch": 0.995, "grad_norm": 572.0, "kl_loss_10": 121.95822410583496, "kl_loss_2": 1386.1470336914062, "kl_loss_3": 1036.3678161621094, "kl_loss_7": 331.8135269165039, "learning_rate": 6.293616306246586e-08, "loss": 711.5672, "step": 9950 }, { "ce_loss_10": 3.560911405086517, "ce_loss_13": 3.5092976689338684, "ce_loss_2": 4.1187317132949826, "ce_loss_3": 3.945353388786316, "ce_loss_7": 3.6452075004577638, "epoch": 0.996, "grad_norm": 532.0, "kl_loss_10": 118.4669750213623, "kl_loss_2": 1347.5411254882813, "kl_loss_3": 1008.2919952392579, "kl_loss_7": 321.9423858642578, "learning_rate": 4.027944857032395e-08, "loss": 694.5431, "step": 9960 }, { "ce_loss_10": 3.5510011553764342, "ce_loss_13": 3.4982069730758667, "ce_loss_2": 4.102385640144348, "ce_loss_3": 3.9248210430145263, "ce_loss_7": 3.6324383854866027, "epoch": 0.997, "grad_norm": 596.0, "kl_loss_10": 118.21413650512696, "kl_loss_2": 1309.8837768554688, "kl_loss_3": 973.4349212646484, "kl_loss_7": 314.49192199707034, "learning_rate": 2.265732291356626e-08, "loss": 688.6318, "step": 9970 }, { "ce_loss_10": 3.5984220147132873, "ce_loss_13": 3.544767773151398, "ce_loss_2": 4.165575993061066, "ce_loss_3": 3.98673597574234, "ce_loss_7": 3.684629225730896, "epoch": 0.998, "grad_norm": 492.0, "kl_loss_10": 121.18408889770508, "kl_loss_2": 1353.7459533691406, "kl_loss_3": 1007.0253448486328, "kl_loss_7": 326.501042175293, "learning_rate": 1.0069963546743833e-08, "loss": 717.6554, "step": 9980 }, { "ce_loss_10": 3.578536665439606, "ce_loss_13": 3.5262981772422792, "ce_loss_2": 4.151448047161102, "ce_loss_3": 3.9757529973983763, "ce_loss_7": 3.666281545162201, "epoch": 0.999, "grad_norm": 490.0, "kl_loss_10": 121.82665138244629, "kl_loss_2": 1373.672137451172, "kl_loss_3": 1028.6984802246093, "kl_loss_7": 330.99612579345705, "learning_rate": 2.517497224463483e-09, "loss": 710.4597, "step": 9990 }, { "ce_loss_10": 3.5323142886161802, "ce_loss_13": 3.4774991631507874, "ce_loss_2": 4.149592983722687, "ce_loss_3": 3.9584404826164246, "ce_loss_7": 3.6246480464935305, "epoch": 1.0, "grad_norm": 576.0, "kl_loss_10": 122.81040077209472, "kl_loss_2": 1444.6247863769531, "kl_loss_3": 1063.938232421875, "kl_loss_7": 337.08654937744143, "learning_rate": 0.0, "loss": 730.4086, "step": 10000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.177819035608023e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }