diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16049 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "ce_loss_10": 5.865575313568115, + "ce_loss_13": 3.509598731994629, + "ce_loss_2": 10.007837295532227, + "ce_loss_3": 13.285265445709229, + "ce_loss_7": 6.684143781661987, + "epoch": 0.0001, + "grad_norm": 110592.0, + "kl_loss_10": 5785.801025390625, + "kl_loss_2": 13150.353515625, + "kl_loss_3": 19835.314453125, + "kl_loss_7": 7173.60595703125, + "learning_rate": 1e-05, + "loss": 11523.9297, + "step": 1 + }, + { + "ce_loss_10": 5.085373534096612, + "ce_loss_13": 3.570544626977709, + "ce_loss_2": 7.954865243699816, + "ce_loss_3": 9.156357712215847, + "ce_loss_7": 5.791120873557197, + "epoch": 0.001, + "grad_norm": 12160.0, + "kl_loss_10": 3681.9522908528647, + "kl_loss_2": 8514.069715711805, + "kl_loss_3": 11012.32679578993, + "kl_loss_7": 4671.0064697265625, + "learning_rate": 0.0001, + "loss": 7057.7209, + "step": 10 + }, + { + "ce_loss_10": 4.197324633598328, + "ce_loss_13": 3.5781216859817504, + "ce_loss_2": 6.396872496604919, + "ce_loss_3": 6.317626190185547, + "ce_loss_7": 4.718231725692749, + "epoch": 0.002, + "grad_norm": 3088.0, + "kl_loss_10": 1178.2664031982422, + "kl_loss_2": 5194.5411376953125, + "kl_loss_3": 5041.807763671875, + "kl_loss_7": 2107.8896606445314, + "learning_rate": 0.0002, + "loss": 3433.318, + "step": 20 + }, + { + "ce_loss_10": 3.7206726551055906, + "ce_loss_13": 3.368276393413544, + "ce_loss_2": 5.781760859489441, + "ce_loss_3": 5.545315575599671, + "ce_loss_7": 4.186028468608856, + "epoch": 0.003, + "grad_norm": 4032.0, + "kl_loss_10": 674.2754943847656, + "kl_loss_2": 4511.11181640625, + "kl_loss_3": 4054.1059814453124, + "kl_loss_7": 1528.1455383300781, + "learning_rate": 0.0003, + "loss": 2640.5512, + "step": 30 + }, + { + "ce_loss_10": 3.821107840538025, + "ce_loss_13": 3.536966252326965, + "ce_loss_2": 5.572521734237671, + "ce_loss_3": 5.341288280487061, + "ce_loss_7": 4.234801423549652, + "epoch": 0.004, + "grad_norm": 2960.0, + "kl_loss_10": 524.9487045288085, + "kl_loss_2": 3838.2613525390625, + "kl_loss_3": 3402.8488647460936, + "kl_loss_7": 1309.348095703125, + "learning_rate": 0.0004, + "loss": 2283.1129, + "step": 40 + }, + { + "ce_loss_10": 3.7567438006401064, + "ce_loss_13": 3.5144933104515075, + "ce_loss_2": 5.419973969459534, + "ce_loss_3": 5.193652868270874, + "ce_loss_7": 4.1413051843643185, + "epoch": 0.005, + "grad_norm": 6016.0, + "kl_loss_10": 445.8336715698242, + "kl_loss_2": 3600.4189453125, + "kl_loss_3": 3176.044384765625, + "kl_loss_7": 1185.156512451172, + "learning_rate": 0.0005, + "loss": 2103.5049, + "step": 50 + }, + { + "ce_loss_10": 3.722773575782776, + "ce_loss_13": 3.5224708676338197, + "ce_loss_2": 5.278621530532837, + "ce_loss_3": 5.080061268806458, + "ce_loss_7": 4.097856783866883, + "epoch": 0.006, + "grad_norm": 3824.0, + "kl_loss_10": 390.2220687866211, + "kl_loss_2": 3350.347863769531, + "kl_loss_3": 2998.066064453125, + "kl_loss_7": 1115.832781982422, + "learning_rate": 0.0006, + "loss": 1964.402, + "step": 60 + }, + { + "ce_loss_10": 3.6256401419639586, + "ce_loss_13": 3.4409272193908693, + "ce_loss_2": 5.1505759954452515, + "ce_loss_3": 4.97669529914856, + "ce_loss_7": 3.983938765525818, + "epoch": 0.007, + "grad_norm": 3920.0, + "kl_loss_10": 355.79547729492185, + "kl_loss_2": 3279.6125610351564, + "kl_loss_3": 2951.422497558594, + "kl_loss_7": 1050.2557006835937, + "learning_rate": 0.0007, + "loss": 1892.3121, + "step": 70 + }, + { + "ce_loss_10": 3.611355257034302, + "ce_loss_13": 3.4418101072311402, + "ce_loss_2": 5.105751895904541, + "ce_loss_3": 4.90991735458374, + "ce_loss_7": 3.9785196661949156, + "epoch": 0.008, + "grad_norm": 3488.0, + "kl_loss_10": 338.34486999511716, + "kl_loss_2": 3190.1597534179687, + "kl_loss_3": 2825.821044921875, + "kl_loss_7": 1014.8152435302734, + "learning_rate": 0.0008, + "loss": 1848.4, + "step": 80 + }, + { + "ce_loss_10": 3.5723747968673707, + "ce_loss_13": 3.4033101916313173, + "ce_loss_2": 5.05491304397583, + "ce_loss_3": 4.844810819625854, + "ce_loss_7": 3.8869771361351013, + "epoch": 0.009, + "grad_norm": 3504.0, + "kl_loss_10": 351.16760406494143, + "kl_loss_2": 3202.9417724609375, + "kl_loss_3": 2806.134655761719, + "kl_loss_7": 954.3875213623047, + "learning_rate": 0.0009000000000000001, + "loss": 1819.1223, + "step": 90 + }, + { + "ce_loss_10": 3.714704382419586, + "ce_loss_13": 3.524587631225586, + "ce_loss_2": 5.140144395828247, + "ce_loss_3": 4.962791919708252, + "ce_loss_7": 4.010744750499725, + "epoch": 0.01, + "grad_norm": 4832.0, + "kl_loss_10": 371.28753662109375, + "kl_loss_2": 3117.2229125976564, + "kl_loss_3": 2795.0933227539062, + "kl_loss_7": 947.7742279052734, + "learning_rate": 0.001, + "loss": 1805.8203, + "step": 100 + }, + { + "ce_loss_10": 3.6942928433418274, + "ce_loss_13": 3.4817589998245237, + "ce_loss_2": 5.091573238372803, + "ce_loss_3": 4.877256274223328, + "ce_loss_7": 3.9481292366981506, + "epoch": 0.011, + "grad_norm": 2008.0, + "kl_loss_10": 429.9922073364258, + "kl_loss_2": 3126.605187988281, + "kl_loss_3": 2695.8526123046877, + "kl_loss_7": 906.8043029785156, + "learning_rate": 0.0009999974825027757, + "loss": 1785.3453, + "step": 110 + }, + { + "ce_loss_10": 3.7456037163734437, + "ce_loss_13": 3.5361623644828795, + "ce_loss_2": 5.1227661848068236, + "ce_loss_3": 4.809029960632325, + "ce_loss_7": 4.0057451844215395, + "epoch": 0.012, + "grad_norm": 2736.0, + "kl_loss_10": 421.653271484375, + "kl_loss_2": 3041.225866699219, + "kl_loss_3": 2468.0310668945312, + "kl_loss_7": 901.2433288574218, + "learning_rate": 0.0009999899300364532, + "loss": 1691.2215, + "step": 120 + }, + { + "ce_loss_10": 3.6967467427253724, + "ce_loss_13": 3.5110474586486817, + "ce_loss_2": 5.076847052574157, + "ce_loss_3": 4.790580677986145, + "ce_loss_7": 3.9812671184539794, + "epoch": 0.013, + "grad_norm": 1792.0, + "kl_loss_10": 366.0322860717773, + "kl_loss_2": 3049.608190917969, + "kl_loss_3": 2474.2186645507813, + "kl_loss_7": 909.4364349365235, + "learning_rate": 0.0009999773426770863, + "loss": 1717.7953, + "step": 130 + }, + { + "ce_loss_10": 3.7232070088386537, + "ce_loss_13": 3.543111026287079, + "ce_loss_2": 5.047617030143738, + "ce_loss_3": 4.763497018814087, + "ce_loss_7": 3.990681600570679, + "epoch": 0.014, + "grad_norm": 1920.0, + "kl_loss_10": 359.95825347900393, + "kl_loss_2": 2908.210729980469, + "kl_loss_3": 2355.444958496094, + "kl_loss_7": 888.9608703613281, + "learning_rate": 0.0009999597205514296, + "loss": 1639.8703, + "step": 140 + }, + { + "ce_loss_10": 3.676049864292145, + "ce_loss_13": 3.505241572856903, + "ce_loss_2": 4.947909688949585, + "ce_loss_3": 4.669061374664307, + "ce_loss_7": 3.9511622548103333, + "epoch": 0.015, + "grad_norm": 1376.0, + "kl_loss_10": 345.57527313232424, + "kl_loss_2": 2795.33154296875, + "kl_loss_3": 2267.452508544922, + "kl_loss_7": 859.9151733398437, + "learning_rate": 0.0009999370638369377, + "loss": 1577.1158, + "step": 150 + }, + { + "ce_loss_10": 3.7071213841438295, + "ce_loss_13": 3.5414621829986572, + "ce_loss_2": 4.982771682739258, + "ce_loss_3": 4.697902798652649, + "ce_loss_7": 3.964942920207977, + "epoch": 0.016, + "grad_norm": 1568.0, + "kl_loss_10": 323.99631652832034, + "kl_loss_2": 2816.620886230469, + "kl_loss_3": 2258.00849609375, + "kl_loss_7": 828.0151275634765, + "learning_rate": 0.000999909372761763, + "loss": 1565.1712, + "step": 160 + }, + { + "ce_loss_10": 3.629442536830902, + "ce_loss_13": 3.475699579715729, + "ce_loss_2": 4.9288132905960085, + "ce_loss_3": 4.636562061309815, + "ce_loss_7": 3.8839596271514893, + "epoch": 0.017, + "grad_norm": 1512.0, + "kl_loss_10": 310.6042221069336, + "kl_loss_2": 2849.280310058594, + "kl_loss_3": 2291.288397216797, + "kl_loss_7": 820.7699920654297, + "learning_rate": 0.0009998766476047546, + "loss": 1575.1025, + "step": 170 + }, + { + "ce_loss_10": 3.6930266618728638, + "ce_loss_13": 3.519435966014862, + "ce_loss_2": 4.993418955802918, + "ce_loss_3": 4.673493552207947, + "ce_loss_7": 3.9580948114395142, + "epoch": 0.018, + "grad_norm": 3152.0, + "kl_loss_10": 337.58184814453125, + "kl_loss_2": 2880.2222290039062, + "kl_loss_3": 2268.8982666015627, + "kl_loss_7": 865.2954742431641, + "learning_rate": 0.0009998388886954545, + "loss": 1604.3529, + "step": 180 + }, + { + "ce_loss_10": 3.6799558639526366, + "ce_loss_13": 3.486136960983276, + "ce_loss_2": 4.946302032470703, + "ce_loss_3": 4.601687026023865, + "ce_loss_7": 3.9099194288253782, + "epoch": 0.019, + "grad_norm": 1672.0, + "kl_loss_10": 402.4949157714844, + "kl_loss_2": 2886.513952636719, + "kl_loss_3": 2211.9050720214846, + "kl_loss_7": 841.9078552246094, + "learning_rate": 0.0009997960964140947, + "loss": 1577.3238, + "step": 190 + }, + { + "ce_loss_10": 3.650504744052887, + "ce_loss_13": 3.4824187994003295, + "ce_loss_2": 4.945061421394348, + "ce_loss_3": 4.60343816280365, + "ce_loss_7": 3.8988500118255613, + "epoch": 0.02, + "grad_norm": 1872.0, + "kl_loss_10": 343.68775634765626, + "kl_loss_2": 2847.6089111328124, + "kl_loss_3": 2192.5098266601562, + "kl_loss_7": 843.3558349609375, + "learning_rate": 0.0009997482711915926, + "loss": 1555.5152, + "step": 200 + }, + { + "ce_loss_10": 3.6082719564437866, + "ce_loss_13": 3.44773451089859, + "ce_loss_2": 4.884588956832886, + "ce_loss_3": 4.508144104480744, + "ce_loss_7": 3.8779943227767943, + "epoch": 0.021, + "grad_norm": 1736.0, + "kl_loss_10": 324.77989807128904, + "kl_loss_2": 2826.437780761719, + "kl_loss_3": 2092.716345214844, + "kl_loss_7": 846.5946807861328, + "learning_rate": 0.0009996954135095479, + "loss": 1513.0088, + "step": 210 + }, + { + "ce_loss_10": 3.6795857906341554, + "ce_loss_13": 3.5324608206748964, + "ce_loss_2": 4.862332582473755, + "ce_loss_3": 4.554595494270325, + "ce_loss_7": 3.907832646369934, + "epoch": 0.022, + "grad_norm": 1032.0, + "kl_loss_10": 296.77941131591797, + "kl_loss_2": 2593.2522094726564, + "kl_loss_3": 2006.6829711914063, + "kl_loss_7": 745.140835571289, + "learning_rate": 0.0009996375239002368, + "loss": 1414.3565, + "step": 220 + }, + { + "ce_loss_10": 3.740398120880127, + "ce_loss_13": 3.5988372921943665, + "ce_loss_2": 4.867113184928894, + "ce_loss_3": 4.597586512565613, + "ce_loss_7": 3.96889066696167, + "epoch": 0.023, + "grad_norm": 1048.0, + "kl_loss_10": 276.71729202270507, + "kl_loss_2": 2492.076037597656, + "kl_loss_3": 1982.453546142578, + "kl_loss_7": 727.5907012939454, + "learning_rate": 0.0009995746029466072, + "loss": 1380.723, + "step": 230 + }, + { + "ce_loss_10": 3.5279888272285462, + "ce_loss_13": 3.3874544382095335, + "ce_loss_2": 4.788827538490295, + "ce_loss_3": 4.472659921646118, + "ce_loss_7": 3.778586721420288, + "epoch": 0.024, + "grad_norm": 1296.0, + "kl_loss_10": 279.43826217651366, + "kl_loss_2": 2758.438586425781, + "kl_loss_3": 2143.7233947753907, + "kl_loss_7": 779.3903015136718, + "learning_rate": 0.0009995066512822719, + "loss": 1441.9243, + "step": 240 + }, + { + "ce_loss_10": 3.6201495051383974, + "ce_loss_13": 3.4905712485313414, + "ce_loss_2": 4.891209101676941, + "ce_loss_3": 4.570740580558777, + "ce_loss_7": 3.865015411376953, + "epoch": 0.025, + "grad_norm": 1432.0, + "kl_loss_10": 261.5669853210449, + "kl_loss_2": 2739.434948730469, + "kl_loss_3": 2116.1525390625, + "kl_loss_7": 737.9495910644531, + "learning_rate": 0.000999433669591504, + "loss": 1414.2406, + "step": 250 + }, + { + "ce_loss_10": 3.5210766077041624, + "ce_loss_13": 3.3931028485298156, + "ce_loss_2": 4.796879243850708, + "ce_loss_3": 4.466079044342041, + "ce_loss_7": 3.767536473274231, + "epoch": 0.026, + "grad_norm": 1368.0, + "kl_loss_10": 264.8755844116211, + "kl_loss_2": 2785.579797363281, + "kl_loss_3": 2137.9736206054686, + "kl_loss_7": 744.8723297119141, + "learning_rate": 0.000999355658609228, + "loss": 1444.816, + "step": 260 + }, + { + "ce_loss_10": 3.5607513666152952, + "ce_loss_13": 3.4221261620521544, + "ce_loss_2": 4.833321261405945, + "ce_loss_3": 4.517770767211914, + "ce_loss_7": 3.800622284412384, + "epoch": 0.027, + "grad_norm": 932.0, + "kl_loss_10": 273.0821258544922, + "kl_loss_2": 2761.8063842773436, + "kl_loss_3": 2143.9581298828125, + "kl_loss_7": 740.5351409912109, + "learning_rate": 0.0009992726191210138, + "loss": 1461.2188, + "step": 270 + }, + { + "ce_loss_10": 3.5995601177215577, + "ce_loss_13": 3.465478837490082, + "ce_loss_2": 4.800075507164001, + "ce_loss_3": 4.5197283864021305, + "ce_loss_7": 3.8348299264907837, + "epoch": 0.028, + "grad_norm": 1432.0, + "kl_loss_10": 281.3691596984863, + "kl_loss_2": 2633.5447509765627, + "kl_loss_3": 2094.4590698242187, + "kl_loss_7": 738.63564453125, + "learning_rate": 0.0009991845519630679, + "loss": 1420.4679, + "step": 280 + }, + { + "ce_loss_10": 3.4883801102638246, + "ce_loss_13": 3.3483877182006836, + "ce_loss_2": 4.6885244131088255, + "ce_loss_3": 4.424971508979797, + "ce_loss_7": 3.7516568541526794, + "epoch": 0.029, + "grad_norm": 1696.0, + "kl_loss_10": 289.22730712890626, + "kl_loss_2": 2620.5162719726563, + "kl_loss_3": 2117.432470703125, + "kl_loss_7": 775.0425170898437, + "learning_rate": 0.0009990914580222257, + "loss": 1449.7931, + "step": 290 + }, + { + "ce_loss_10": 3.6296466469764708, + "ce_loss_13": 3.491527271270752, + "ce_loss_2": 4.723770475387573, + "ce_loss_3": 4.4687070608139035, + "ce_loss_7": 3.8463655352592467, + "epoch": 0.03, + "grad_norm": 960.0, + "kl_loss_10": 275.84105758666993, + "kl_loss_2": 2456.5258056640623, + "kl_loss_3": 1969.6472045898438, + "kl_loss_7": 744.4771392822265, + "learning_rate": 0.0009989933382359422, + "loss": 1393.9777, + "step": 300 + }, + { + "ce_loss_10": 3.6187907338142393, + "ce_loss_13": 3.494164001941681, + "ce_loss_2": 4.727794432640076, + "ce_loss_3": 4.440771436691284, + "ce_loss_7": 3.837166500091553, + "epoch": 0.031, + "grad_norm": 1020.0, + "kl_loss_10": 256.4068969726562, + "kl_loss_2": 2450.5587768554688, + "kl_loss_3": 1899.8815795898438, + "kl_loss_7": 696.280160522461, + "learning_rate": 0.0009988901935922825, + "loss": 1338.8556, + "step": 310 + }, + { + "ce_loss_10": 3.475846517086029, + "ce_loss_13": 3.3469168901443482, + "ce_loss_2": 4.662670254707336, + "ce_loss_3": 4.363593912124633, + "ce_loss_7": 3.6978800535202025, + "epoch": 0.032, + "grad_norm": 1064.0, + "kl_loss_10": 265.57069702148436, + "kl_loss_2": 2603.8465576171875, + "kl_loss_3": 2019.1532775878907, + "kl_loss_7": 702.3248809814453, + "learning_rate": 0.0009987820251299122, + "loss": 1359.8287, + "step": 320 + }, + { + "ce_loss_10": 3.6019670248031614, + "ce_loss_13": 3.4775232672691345, + "ce_loss_2": 4.698803400993347, + "ce_loss_3": 4.42151095867157, + "ce_loss_7": 3.817472517490387, + "epoch": 0.033, + "grad_norm": 940.0, + "kl_loss_10": 252.79367218017578, + "kl_loss_2": 2444.330114746094, + "kl_loss_3": 1906.024169921875, + "kl_loss_7": 675.2368347167969, + "learning_rate": 0.0009986688339380862, + "loss": 1315.1269, + "step": 330 + }, + { + "ce_loss_10": 3.5493945360183714, + "ce_loss_13": 3.430993151664734, + "ce_loss_2": 4.632452750205994, + "ce_loss_3": 4.36405119895935, + "ce_loss_7": 3.7596432328224183, + "epoch": 0.034, + "grad_norm": 964.0, + "kl_loss_10": 238.95554962158204, + "kl_loss_2": 2376.1999267578126, + "kl_loss_3": 1854.2742736816406, + "kl_loss_7": 652.8402496337891, + "learning_rate": 0.0009985506211566387, + "loss": 1294.451, + "step": 340 + }, + { + "ce_loss_10": 3.5777671217918394, + "ce_loss_13": 3.459764850139618, + "ce_loss_2": 4.638717436790467, + "ce_loss_3": 4.377784371376038, + "ce_loss_7": 3.7803191184997558, + "epoch": 0.035, + "grad_norm": 1072.0, + "kl_loss_10": 247.259854888916, + "kl_loss_2": 2338.356652832031, + "kl_loss_3": 1833.6623596191407, + "kl_loss_7": 643.2027313232422, + "learning_rate": 0.0009984273879759713, + "loss": 1274.0854, + "step": 350 + }, + { + "ce_loss_10": 3.615658330917358, + "ce_loss_13": 3.4873095512390138, + "ce_loss_2": 4.692537188529968, + "ce_loss_3": 4.444826865196228, + "ce_loss_7": 3.812762463092804, + "epoch": 0.036, + "grad_norm": 944.0, + "kl_loss_10": 267.5936080932617, + "kl_loss_2": 2401.089453125, + "kl_loss_3": 1906.0577697753906, + "kl_loss_7": 656.432861328125, + "learning_rate": 0.0009982991356370402, + "loss": 1322.0952, + "step": 360 + }, + { + "ce_loss_10": 3.60038241147995, + "ce_loss_13": 3.4653908014297485, + "ce_loss_2": 4.659529018402099, + "ce_loss_3": 4.398728227615356, + "ce_loss_7": 3.788052773475647, + "epoch": 0.037, + "grad_norm": 1264.0, + "kl_loss_10": 274.8468948364258, + "kl_loss_2": 2378.8714233398437, + "kl_loss_3": 1863.3742370605469, + "kl_loss_7": 654.2035675048828, + "learning_rate": 0.0009981658654313456, + "loss": 1294.735, + "step": 370 + }, + { + "ce_loss_10": 3.6825244545936586, + "ce_loss_13": 3.5485115528106688, + "ce_loss_2": 4.6979457378387455, + "ce_loss_3": 4.432817983627319, + "ce_loss_7": 3.8632850527763365, + "epoch": 0.038, + "grad_norm": 776.0, + "kl_loss_10": 272.49184265136716, + "kl_loss_2": 2301.4927673339844, + "kl_loss_3": 1785.1670227050781, + "kl_loss_7": 634.1265563964844, + "learning_rate": 0.000998027578700917, + "loss": 1269.1893, + "step": 380 + }, + { + "ce_loss_10": 3.605938124656677, + "ce_loss_13": 3.478776490688324, + "ce_loss_2": 4.650116562843323, + "ce_loss_3": 4.38636953830719, + "ce_loss_7": 3.796186101436615, + "epoch": 0.039, + "grad_norm": 884.0, + "kl_loss_10": 259.15301818847655, + "kl_loss_2": 2337.7537109375, + "kl_loss_3": 1833.4333435058593, + "kl_loss_7": 650.2792358398438, + "learning_rate": 0.0009978842768382998, + "loss": 1277.8982, + "step": 390 + }, + { + "ce_loss_10": 3.619408428668976, + "ce_loss_13": 3.5009591698646547, + "ce_loss_2": 4.620783948898316, + "ce_loss_3": 4.36586412191391, + "ce_loss_7": 3.8007230043411253, + "epoch": 0.04, + "grad_norm": 924.0, + "kl_loss_10": 239.69640197753907, + "kl_loss_2": 2237.8325927734377, + "kl_loss_3": 1755.0042358398437, + "kl_loss_7": 615.1319396972656, + "learning_rate": 0.0009977359612865424, + "loss": 1226.1033, + "step": 400 + }, + { + "ce_loss_10": 3.6167723059654238, + "ce_loss_13": 3.5048955202102663, + "ce_loss_2": 4.640660691261291, + "ce_loss_3": 4.3846115350723265, + "ce_loss_7": 3.8166274309158323, + "epoch": 0.041, + "grad_norm": 724.0, + "kl_loss_10": 232.37744522094727, + "kl_loss_2": 2279.1153564453125, + "kl_loss_3": 1781.2996765136718, + "kl_loss_7": 644.5638702392578, + "learning_rate": 0.0009975826335391806, + "loss": 1223.2463, + "step": 410 + }, + { + "ce_loss_10": 3.6286866068840027, + "ce_loss_13": 3.52371027469635, + "ce_loss_2": 4.6422699928283695, + "ce_loss_3": 4.3733536958694454, + "ce_loss_7": 3.8216816902160646, + "epoch": 0.042, + "grad_norm": 920.0, + "kl_loss_10": 219.2363723754883, + "kl_loss_2": 2237.4999084472656, + "kl_loss_3": 1720.3533020019531, + "kl_loss_7": 611.6994201660157, + "learning_rate": 0.0009974242951402235, + "loss": 1211.1291, + "step": 420 + }, + { + "ce_loss_10": 3.638279104232788, + "ce_loss_13": 3.528469812870026, + "ce_loss_2": 4.666049456596374, + "ce_loss_3": 4.395814538002014, + "ce_loss_7": 3.8397316694259644, + "epoch": 0.043, + "grad_norm": 980.0, + "kl_loss_10": 225.09888305664063, + "kl_loss_2": 2284.1164428710936, + "kl_loss_3": 1757.2524719238281, + "kl_loss_7": 629.9028503417969, + "learning_rate": 0.0009972609476841367, + "loss": 1210.4814, + "step": 430 + }, + { + "ce_loss_10": 3.549589216709137, + "ce_loss_13": 3.4418772578239443, + "ce_loss_2": 4.606719565391541, + "ce_loss_3": 4.324760735034943, + "ce_loss_7": 3.7483399391174315, + "epoch": 0.044, + "grad_norm": 932.0, + "kl_loss_10": 216.09697952270508, + "kl_loss_2": 2312.0109741210936, + "kl_loss_3": 1769.2410278320312, + "kl_loss_7": 615.9786346435546, + "learning_rate": 0.0009970925928158272, + "loss": 1228.8015, + "step": 440 + }, + { + "ce_loss_10": 3.4980531454086305, + "ce_loss_13": 3.385247766971588, + "ce_loss_2": 4.560380291938782, + "ce_loss_3": 4.278938281536102, + "ce_loss_7": 3.7021584153175353, + "epoch": 0.045, + "grad_norm": 908.0, + "kl_loss_10": 226.3798355102539, + "kl_loss_2": 2369.1811340332033, + "kl_loss_3": 1809.7959411621093, + "kl_loss_7": 638.1731658935547, + "learning_rate": 0.000996919232230627, + "loss": 1239.8368, + "step": 450 + }, + { + "ce_loss_10": 3.576229965686798, + "ce_loss_13": 3.473420023918152, + "ce_loss_2": 4.5887164831161495, + "ce_loss_3": 4.321652209758758, + "ce_loss_7": 3.77212575674057, + "epoch": 0.046, + "grad_norm": 1024.0, + "kl_loss_10": 210.70022430419922, + "kl_loss_2": 2231.498858642578, + "kl_loss_3": 1716.9997863769531, + "kl_loss_7": 612.2136322021485, + "learning_rate": 0.0009967408676742752, + "loss": 1172.7754, + "step": 460 + }, + { + "ce_loss_10": 3.734071063995361, + "ce_loss_13": 3.618434226512909, + "ce_loss_2": 4.695290613174438, + "ce_loss_3": 4.43647825717926, + "ce_loss_7": 3.918177044391632, + "epoch": 0.047, + "grad_norm": 908.0, + "kl_loss_10": 231.6582176208496, + "kl_loss_2": 2180.8106018066405, + "kl_loss_3": 1692.2353576660157, + "kl_loss_7": 612.3778228759766, + "learning_rate": 0.0009965575009429006, + "loss": 1209.0801, + "step": 470 + }, + { + "ce_loss_10": 3.5073325395584107, + "ce_loss_13": 3.397393560409546, + "ce_loss_2": 4.541721534729004, + "ce_loss_3": 4.268522250652313, + "ce_loss_7": 3.696891689300537, + "epoch": 0.048, + "grad_norm": 888.0, + "kl_loss_10": 227.73510971069337, + "kl_loss_2": 2294.750048828125, + "kl_loss_3": 1770.7283081054688, + "kl_loss_7": 619.1150939941406, + "learning_rate": 0.0009963691338830043, + "loss": 1207.0088, + "step": 480 + }, + { + "ce_loss_10": 3.598436427116394, + "ce_loss_13": 3.498102140426636, + "ce_loss_2": 4.597862339019775, + "ce_loss_3": 4.321764397621155, + "ce_loss_7": 3.779419946670532, + "epoch": 0.049, + "grad_norm": 820.0, + "kl_loss_10": 216.79126739501953, + "kl_loss_2": 2232.1513854980467, + "kl_loss_3": 1698.4708862304688, + "kl_loss_7": 586.6665161132812, + "learning_rate": 0.0009961757683914405, + "loss": 1170.0627, + "step": 490 + }, + { + "ce_loss_10": 3.587793219089508, + "ce_loss_13": 3.479810047149658, + "ce_loss_2": 4.565227222442627, + "ce_loss_3": 4.309310102462769, + "ce_loss_7": 3.773853075504303, + "epoch": 0.05, + "grad_norm": 1760.0, + "kl_loss_10": 222.1615104675293, + "kl_loss_2": 2198.9348083496093, + "kl_loss_3": 1694.7262390136718, + "kl_loss_7": 612.1928680419921, + "learning_rate": 0.0009959774064153978, + "loss": 1195.7824, + "step": 500 + }, + { + "ce_loss_10": 3.590281581878662, + "ce_loss_13": 3.492150938510895, + "ce_loss_2": 4.57180380821228, + "ce_loss_3": 4.302522945404053, + "ce_loss_7": 3.772382390499115, + "epoch": 0.051, + "grad_norm": 972.0, + "kl_loss_10": 210.51349639892578, + "kl_loss_2": 2180.4048400878905, + "kl_loss_3": 1653.8468933105469, + "kl_loss_7": 588.0678894042969, + "learning_rate": 0.0009957740499523787, + "loss": 1178.0232, + "step": 510 + }, + { + "ce_loss_10": 3.6132477045059206, + "ce_loss_13": 3.511445939540863, + "ce_loss_2": 4.587271404266358, + "ce_loss_3": 4.3159163236618046, + "ce_loss_7": 3.796973693370819, + "epoch": 0.052, + "grad_norm": 720.0, + "kl_loss_10": 204.7069091796875, + "kl_loss_2": 2151.1836120605467, + "kl_loss_3": 1634.1124877929688, + "kl_loss_7": 572.2468353271485, + "learning_rate": 0.0009955657010501807, + "loss": 1149.754, + "step": 520 + }, + { + "ce_loss_10": 3.5784901857376097, + "ce_loss_13": 3.4709136962890623, + "ce_loss_2": 4.551413655281067, + "ce_loss_3": 4.286619782447815, + "ce_loss_7": 3.7552420496940613, + "epoch": 0.053, + "grad_norm": 1032.0, + "kl_loss_10": 223.27496261596679, + "kl_loss_2": 2177.6577087402343, + "kl_loss_3": 1660.8929260253906, + "kl_loss_7": 572.54072265625, + "learning_rate": 0.000995352361806875, + "loss": 1147.3114, + "step": 530 + }, + { + "ce_loss_10": 3.625127899646759, + "ce_loss_13": 3.511006486415863, + "ce_loss_2": 4.581828641891479, + "ce_loss_3": 4.31304669380188, + "ce_loss_7": 3.7988196134567263, + "epoch": 0.054, + "grad_norm": 732.0, + "kl_loss_10": 231.79150848388673, + "kl_loss_2": 2171.6323669433596, + "kl_loss_3": 1659.2912536621093, + "kl_loss_7": 599.0842193603515, + "learning_rate": 0.0009951340343707852, + "loss": 1174.3984, + "step": 540 + }, + { + "ce_loss_10": 3.676268827915192, + "ce_loss_13": 3.5701403856277465, + "ce_loss_2": 4.638545846939087, + "ce_loss_3": 4.371236753463745, + "ce_loss_7": 3.8460424661636354, + "epoch": 0.055, + "grad_norm": 700.0, + "kl_loss_10": 216.45612487792968, + "kl_loss_2": 2133.3610229492188, + "kl_loss_3": 1642.1666015625, + "kl_loss_7": 563.3070922851563, + "learning_rate": 0.0009949107209404665, + "loss": 1152.1754, + "step": 550 + }, + { + "ce_loss_10": 3.587022233009338, + "ce_loss_13": 3.474756634235382, + "ce_loss_2": 4.525298738479615, + "ce_loss_3": 4.273183763027191, + "ce_loss_7": 3.750283181667328, + "epoch": 0.056, + "grad_norm": 676.0, + "kl_loss_10": 231.15026779174804, + "kl_loss_2": 2132.3562866210937, + "kl_loss_3": 1646.3475463867187, + "kl_loss_7": 568.931118774414, + "learning_rate": 0.0009946824237646824, + "loss": 1145.0641, + "step": 560 + }, + { + "ce_loss_10": 3.5445329308509828, + "ce_loss_13": 3.4262341499328612, + "ce_loss_2": 4.4978625774383545, + "ce_loss_3": 4.245725357532502, + "ce_loss_7": 3.7067874550819395, + "epoch": 0.057, + "grad_norm": 824.0, + "kl_loss_10": 238.33600463867188, + "kl_loss_2": 2178.645184326172, + "kl_loss_3": 1689.5344848632812, + "kl_loss_7": 583.5655319213868, + "learning_rate": 0.0009944491451423828, + "loss": 1186.293, + "step": 570 + }, + { + "ce_loss_10": 3.53601359128952, + "ce_loss_13": 3.420380687713623, + "ce_loss_2": 4.508120799064637, + "ce_loss_3": 4.24744416475296, + "ce_loss_7": 3.7096508502960206, + "epoch": 0.058, + "grad_norm": 956.0, + "kl_loss_10": 240.11019058227538, + "kl_loss_2": 2193.1629943847656, + "kl_loss_3": 1688.384765625, + "kl_loss_7": 592.8175323486328, + "learning_rate": 0.0009942108874226813, + "loss": 1155.7771, + "step": 580 + }, + { + "ce_loss_10": 3.657444155216217, + "ce_loss_13": 3.5424296021461488, + "ce_loss_2": 4.5656781673431395, + "ce_loss_3": 4.320976912975311, + "ce_loss_7": 3.8200041532516478, + "epoch": 0.059, + "grad_norm": 820.0, + "kl_loss_10": 237.33242263793946, + "kl_loss_2": 2063.7497314453126, + "kl_loss_3": 1590.3793151855468, + "kl_loss_7": 574.3437225341797, + "learning_rate": 0.00099396765300483, + "loss": 1113.7337, + "step": 590 + }, + { + "ce_loss_10": 3.6340256094932557, + "ce_loss_13": 3.519163191318512, + "ce_loss_2": 4.549298119544983, + "ce_loss_3": 4.307851886749267, + "ce_loss_7": 3.8015130519866944, + "epoch": 0.06, + "grad_norm": 948.0, + "kl_loss_10": 239.48185348510742, + "kl_loss_2": 2090.356060791016, + "kl_loss_3": 1616.7106872558593, + "kl_loss_7": 579.2961456298829, + "learning_rate": 0.0009937194443381972, + "loss": 1128.8246, + "step": 600 + }, + { + "ce_loss_10": 3.646886777877808, + "ce_loss_13": 3.5430930137634276, + "ce_loss_2": 4.552362751960755, + "ce_loss_3": 4.3045818567276, + "ce_loss_7": 3.812400245666504, + "epoch": 0.061, + "grad_norm": 836.0, + "kl_loss_10": 216.1532455444336, + "kl_loss_2": 2053.682244873047, + "kl_loss_3": 1576.81474609375, + "kl_loss_7": 571.3107208251953, + "learning_rate": 0.0009934662639222412, + "loss": 1128.2942, + "step": 610 + }, + { + "ce_loss_10": 3.602829623222351, + "ce_loss_13": 3.4988005995750426, + "ce_loss_2": 4.552669429779053, + "ce_loss_3": 4.288874197006225, + "ce_loss_7": 3.771229422092438, + "epoch": 0.062, + "grad_norm": 752.0, + "kl_loss_10": 211.5402572631836, + "kl_loss_2": 2146.809149169922, + "kl_loss_3": 1635.902978515625, + "kl_loss_7": 572.7010498046875, + "learning_rate": 0.000993208114306486, + "loss": 1128.1789, + "step": 620 + }, + { + "ce_loss_10": 3.513095796108246, + "ce_loss_13": 3.41504830121994, + "ce_loss_2": 4.476359033584595, + "ce_loss_3": 4.210507690906525, + "ce_loss_7": 3.692080223560333, + "epoch": 0.063, + "grad_norm": 840.0, + "kl_loss_10": 207.13141326904298, + "kl_loss_2": 2149.41875, + "kl_loss_3": 1630.2548095703125, + "kl_loss_7": 566.9929428100586, + "learning_rate": 0.0009929449980904952, + "loss": 1107.0104, + "step": 630 + }, + { + "ce_loss_10": 3.5761404514312742, + "ce_loss_13": 3.479358458518982, + "ce_loss_2": 4.512639951705933, + "ce_loss_3": 4.251035594940186, + "ce_loss_7": 3.746527075767517, + "epoch": 0.064, + "grad_norm": 704.0, + "kl_loss_10": 199.16628189086913, + "kl_loss_2": 2115.853399658203, + "kl_loss_3": 1620.3352111816407, + "kl_loss_7": 563.6727096557618, + "learning_rate": 0.0009926769179238466, + "loss": 1116.6029, + "step": 640 + }, + { + "ce_loss_10": 3.6297390460968018, + "ce_loss_13": 3.526135504245758, + "ce_loss_2": 4.559930968284607, + "ce_loss_3": 4.2981769919395445, + "ce_loss_7": 3.8018287062644958, + "epoch": 0.065, + "grad_norm": 812.0, + "kl_loss_10": 217.01969451904296, + "kl_loss_2": 2101.874133300781, + "kl_loss_3": 1596.4249572753906, + "kl_loss_7": 579.9847625732422, + "learning_rate": 0.000992403876506104, + "loss": 1118.0791, + "step": 650 + }, + { + "ce_loss_10": 3.5638173818588257, + "ce_loss_13": 3.4600673079490663, + "ce_loss_2": 4.492022132873535, + "ce_loss_3": 4.226557815074921, + "ce_loss_7": 3.737081015110016, + "epoch": 0.066, + "grad_norm": 636.0, + "kl_loss_10": 213.38376693725587, + "kl_loss_2": 2105.9776245117187, + "kl_loss_3": 1588.3972534179688, + "kl_loss_7": 565.9338088989258, + "learning_rate": 0.0009921258765867918, + "loss": 1115.5061, + "step": 660 + }, + { + "ce_loss_10": 3.524133229255676, + "ce_loss_13": 3.424080693721771, + "ce_loss_2": 4.483368253707885, + "ce_loss_3": 4.2175681591033936, + "ce_loss_7": 3.6960334897041323, + "epoch": 0.067, + "grad_norm": 824.0, + "kl_loss_10": 213.54936981201172, + "kl_loss_2": 2168.286993408203, + "kl_loss_3": 1640.3900268554687, + "kl_loss_7": 567.8847198486328, + "learning_rate": 0.0009918429209653662, + "loss": 1124.8729, + "step": 670 + }, + { + "ce_loss_10": 3.5795446634292603, + "ce_loss_13": 3.479706287384033, + "ce_loss_2": 4.519145917892456, + "ce_loss_3": 4.256680989265442, + "ce_loss_7": 3.746138072013855, + "epoch": 0.068, + "grad_norm": 820.0, + "kl_loss_10": 211.06802139282226, + "kl_loss_2": 2113.3779052734376, + "kl_loss_3": 1607.1892822265625, + "kl_loss_7": 560.476708984375, + "learning_rate": 0.0009915550124911866, + "loss": 1099.4275, + "step": 680 + }, + { + "ce_loss_10": 3.5930914521217345, + "ce_loss_13": 3.4938461661338804, + "ce_loss_2": 4.51025116443634, + "ce_loss_3": 4.262886941432953, + "ce_loss_7": 3.7600345849990844, + "epoch": 0.069, + "grad_norm": 960.0, + "kl_loss_10": 200.71142120361327, + "kl_loss_2": 2064.7206604003904, + "kl_loss_3": 1576.7652465820313, + "kl_loss_7": 544.939192199707, + "learning_rate": 0.0009912621540634887, + "loss": 1096.3648, + "step": 690 + }, + { + "ce_loss_10": 3.6200631141662596, + "ce_loss_13": 3.5290600538253782, + "ce_loss_2": 4.496898746490478, + "ce_loss_3": 4.253770506381988, + "ce_loss_7": 3.775084400177002, + "epoch": 0.07, + "grad_norm": 952.0, + "kl_loss_10": 188.68883590698243, + "kl_loss_2": 2000.6690612792968, + "kl_loss_3": 1528.578564453125, + "kl_loss_7": 525.6970199584961, + "learning_rate": 0.0009909643486313534, + "loss": 1075.2783, + "step": 700 + }, + { + "ce_loss_10": 3.491303098201752, + "ce_loss_13": 3.3972670793533326, + "ce_loss_2": 4.432852864265442, + "ce_loss_3": 4.169692039489746, + "ce_loss_7": 3.659318673610687, + "epoch": 0.071, + "grad_norm": 772.0, + "kl_loss_10": 195.57371444702147, + "kl_loss_2": 2111.2869506835937, + "kl_loss_3": 1599.3862426757812, + "kl_loss_7": 541.3287918090821, + "learning_rate": 0.000990661599193678, + "loss": 1124.3994, + "step": 710 + }, + { + "ce_loss_10": 3.630624604225159, + "ce_loss_13": 3.5385274052619935, + "ce_loss_2": 4.531144857406616, + "ce_loss_3": 4.287478506565094, + "ce_loss_7": 3.7991448640823364, + "epoch": 0.072, + "grad_norm": 636.0, + "kl_loss_10": 192.84549560546876, + "kl_loss_2": 2026.7384338378906, + "kl_loss_3": 1547.2736206054688, + "kl_loss_7": 541.6422912597657, + "learning_rate": 0.0009903539087991462, + "loss": 1076.785, + "step": 720 + }, + { + "ce_loss_10": 3.594977593421936, + "ce_loss_13": 3.50269376039505, + "ce_loss_2": 4.492258667945862, + "ce_loss_3": 4.242715549468994, + "ce_loss_7": 3.7583904504776, + "epoch": 0.073, + "grad_norm": 732.0, + "kl_loss_10": 189.20598907470702, + "kl_loss_2": 2021.3861267089844, + "kl_loss_3": 1532.5231994628907, + "kl_loss_7": 542.5753295898437, + "learning_rate": 0.0009900412805461966, + "loss": 1080.9737, + "step": 730 + }, + { + "ce_loss_10": 3.6703630805015566, + "ce_loss_13": 3.5826353311538695, + "ce_loss_2": 4.54851381778717, + "ce_loss_3": 4.306425213813782, + "ce_loss_7": 3.843156623840332, + "epoch": 0.074, + "grad_norm": 920.0, + "kl_loss_10": 187.67292861938478, + "kl_loss_2": 1985.2287841796874, + "kl_loss_3": 1507.3379516601562, + "kl_loss_7": 538.3505889892579, + "learning_rate": 0.0009897237175829927, + "loss": 1072.161, + "step": 740 + }, + { + "ce_loss_10": 3.5649651885032654, + "ce_loss_13": 3.4684099912643434, + "ce_loss_2": 4.487882328033447, + "ce_loss_3": 4.228008484840393, + "ce_loss_7": 3.731177771091461, + "epoch": 0.075, + "grad_norm": 768.0, + "kl_loss_10": 193.74180603027344, + "kl_loss_2": 2084.8017517089843, + "kl_loss_3": 1566.5948913574218, + "kl_loss_7": 554.2125396728516, + "learning_rate": 0.0009894012231073895, + "loss": 1088.093, + "step": 750 + }, + { + "ce_loss_10": 3.6071489572525026, + "ce_loss_13": 3.515170168876648, + "ce_loss_2": 4.504528665542603, + "ce_loss_3": 4.2520447134971615, + "ce_loss_7": 3.7718453645706176, + "epoch": 0.076, + "grad_norm": 620.0, + "kl_loss_10": 189.55669250488282, + "kl_loss_2": 2013.1507141113282, + "kl_loss_3": 1520.1027954101562, + "kl_loss_7": 526.1563827514649, + "learning_rate": 0.0009890738003669028, + "loss": 1077.2552, + "step": 760 + }, + { + "ce_loss_10": 3.5843887448310854, + "ce_loss_13": 3.489227271080017, + "ce_loss_2": 4.5111163854599, + "ce_loss_3": 4.255188155174255, + "ce_loss_7": 3.7455286383628845, + "epoch": 0.077, + "grad_norm": 728.0, + "kl_loss_10": 207.77494277954102, + "kl_loss_2": 2098.0493408203124, + "kl_loss_3": 1588.9514770507812, + "kl_loss_7": 542.3229370117188, + "learning_rate": 0.0009887414526586764, + "loss": 1081.4613, + "step": 770 + }, + { + "ce_loss_10": 3.6421250104904175, + "ce_loss_13": 3.5463628649711607, + "ce_loss_2": 4.539319705963135, + "ce_loss_3": 4.268165516853332, + "ce_loss_7": 3.799858510494232, + "epoch": 0.078, + "grad_norm": 924.0, + "kl_loss_10": 198.3345474243164, + "kl_loss_2": 2029.2047729492188, + "kl_loss_3": 1506.253399658203, + "kl_loss_7": 526.1126602172851, + "learning_rate": 0.0009884041833294476, + "loss": 1050.2004, + "step": 780 + }, + { + "ce_loss_10": 3.6439041018486025, + "ce_loss_13": 3.546130394935608, + "ce_loss_2": 4.5299879789352415, + "ce_loss_3": 4.267681610584259, + "ce_loss_7": 3.798111093044281, + "epoch": 0.079, + "grad_norm": 1072.0, + "kl_loss_10": 193.93799591064453, + "kl_loss_2": 2009.9286010742187, + "kl_loss_3": 1509.10400390625, + "kl_loss_7": 542.0801528930664, + "learning_rate": 0.000988061995775515, + "loss": 1085.4477, + "step": 790 + }, + { + "ce_loss_10": 3.5596301794052123, + "ce_loss_13": 3.470755398273468, + "ce_loss_2": 4.442834830284118, + "ce_loss_3": 4.188168525695801, + "ce_loss_7": 3.741673398017883, + "epoch": 0.08, + "grad_norm": 640.0, + "kl_loss_10": 202.93540420532227, + "kl_loss_2": 2011.5775634765625, + "kl_loss_3": 1495.8062683105468, + "kl_loss_7": 546.9228302001953, + "learning_rate": 0.0009877148934427035, + "loss": 1066.8678, + "step": 800 + }, + { + "ce_loss_10": 3.6398961186408996, + "ce_loss_13": 3.5154946088790893, + "ce_loss_2": 4.5010058879852295, + "ce_loss_3": 4.234906959533691, + "ce_loss_7": 3.7647638440132143, + "epoch": 0.081, + "grad_norm": 784.0, + "kl_loss_10": 239.83477783203125, + "kl_loss_2": 2027.1837829589845, + "kl_loss_3": 1514.4463195800781, + "kl_loss_7": 519.1964233398437, + "learning_rate": 0.0009873628798263297, + "loss": 1059.5939, + "step": 810 + }, + { + "ce_loss_10": 3.5738606095314025, + "ce_loss_13": 3.468097412586212, + "ce_loss_2": 4.424917078018188, + "ce_loss_3": 4.169192314147949, + "ce_loss_7": 3.707214820384979, + "epoch": 0.082, + "grad_norm": 708.0, + "kl_loss_10": 242.0955726623535, + "kl_loss_2": 1952.9846618652343, + "kl_loss_3": 1470.680194091797, + "kl_loss_7": 500.8814727783203, + "learning_rate": 0.0009870059584711668, + "loss": 1069.6682, + "step": 820 + }, + { + "ce_loss_10": 3.58900762796402, + "ce_loss_13": 3.4884706497192384, + "ce_loss_2": 4.43571412563324, + "ce_loss_3": 4.198841071128845, + "ce_loss_7": 3.7228142976760865, + "epoch": 0.083, + "grad_norm": 788.0, + "kl_loss_10": 211.60461807250977, + "kl_loss_2": 1966.7832885742187, + "kl_loss_3": 1492.8640014648438, + "kl_loss_7": 499.06214294433596, + "learning_rate": 0.000986644132971409, + "loss": 1044.2496, + "step": 830 + }, + { + "ce_loss_10": 3.5730356097221376, + "ce_loss_13": 3.475196421146393, + "ce_loss_2": 4.4566532373428345, + "ce_loss_3": 4.211660146713257, + "ce_loss_7": 3.717496383190155, + "epoch": 0.084, + "grad_norm": 856.0, + "kl_loss_10": 204.7541816711426, + "kl_loss_2": 2003.5655700683594, + "kl_loss_3": 1527.1750061035157, + "kl_loss_7": 514.9196395874023, + "learning_rate": 0.0009862774069706345, + "loss": 1053.708, + "step": 840 + }, + { + "ce_loss_10": 3.6906569719314577, + "ce_loss_13": 3.601027488708496, + "ce_loss_2": 4.525540113449097, + "ce_loss_3": 4.283427906036377, + "ce_loss_7": 3.8413923501968386, + "epoch": 0.085, + "grad_norm": 844.0, + "kl_loss_10": 194.00841827392577, + "kl_loss_2": 1934.2958251953125, + "kl_loss_3": 1471.3841796875, + "kl_loss_7": 520.3793319702148, + "learning_rate": 0.000985905784161771, + "loss": 1041.0401, + "step": 850 + }, + { + "ce_loss_10": 3.6237664222717285, + "ce_loss_13": 3.5309490084648134, + "ce_loss_2": 4.46936445236206, + "ce_loss_3": 4.220357573032379, + "ce_loss_7": 3.779926073551178, + "epoch": 0.086, + "grad_norm": 836.0, + "kl_loss_10": 189.39860305786132, + "kl_loss_2": 1951.8812316894532, + "kl_loss_3": 1472.02509765625, + "kl_loss_7": 520.433024597168, + "learning_rate": 0.000985529268287055, + "loss": 1032.3361, + "step": 860 + }, + { + "ce_loss_10": 3.5477117776870726, + "ce_loss_13": 3.4556665897369383, + "ce_loss_2": 4.441434073448181, + "ce_loss_3": 4.170016729831696, + "ce_loss_7": 3.7197394251823424, + "epoch": 0.087, + "grad_norm": 956.0, + "kl_loss_10": 188.84106826782227, + "kl_loss_2": 2015.502880859375, + "kl_loss_3": 1487.3405334472657, + "kl_loss_7": 539.0837844848633, + "learning_rate": 0.0009851478631379982, + "loss": 1053.5276, + "step": 870 + }, + { + "ce_loss_10": 3.609230947494507, + "ce_loss_13": 3.516581404209137, + "ce_loss_2": 4.48051495552063, + "ce_loss_3": 4.213979172706604, + "ce_loss_7": 3.765811729431152, + "epoch": 0.088, + "grad_norm": 908.0, + "kl_loss_10": 186.26299972534179, + "kl_loss_2": 1983.9146301269532, + "kl_loss_3": 1461.4365661621093, + "kl_loss_7": 506.38514251708983, + "learning_rate": 0.0009847615725553456, + "loss": 1027.144, + "step": 880 + }, + { + "ce_loss_10": 3.6631253480911257, + "ce_loss_13": 3.580364799499512, + "ce_loss_2": 4.489781618118286, + "ce_loss_3": 4.246252822875976, + "ce_loss_7": 3.813025879859924, + "epoch": 0.089, + "grad_norm": 820.0, + "kl_loss_10": 177.18833389282227, + "kl_loss_2": 1872.1732604980468, + "kl_loss_3": 1411.1392700195313, + "kl_loss_7": 495.2686401367188, + "learning_rate": 0.0009843704004290394, + "loss": 1029.3286, + "step": 890 + }, + { + "ce_loss_10": 3.565847933292389, + "ce_loss_13": 3.477725636959076, + "ce_loss_2": 4.422844636440277, + "ce_loss_3": 4.1976773858070375, + "ce_loss_7": 3.7395910024642944, + "epoch": 0.09, + "grad_norm": 1012.0, + "kl_loss_10": 186.12151031494142, + "kl_loss_2": 1965.6116821289063, + "kl_loss_3": 1515.8754577636719, + "kl_loss_7": 555.1820693969727, + "learning_rate": 0.0009839743506981783, + "loss": 1049.7934, + "step": 900 + }, + { + "ce_loss_10": 3.490706205368042, + "ce_loss_13": 3.4019437432289124, + "ce_loss_2": 4.38123586177826, + "ce_loss_3": 4.134083950519562, + "ce_loss_7": 3.6646034359931945, + "epoch": 0.091, + "grad_norm": 700.0, + "kl_loss_10": 189.97202682495117, + "kl_loss_2": 2033.5506591796875, + "kl_loss_3": 1553.0788024902345, + "kl_loss_7": 550.8684387207031, + "learning_rate": 0.0009835734273509786, + "loss": 1058.8781, + "step": 910 + }, + { + "ce_loss_10": 3.5909106731414795, + "ce_loss_13": 3.4989862561225893, + "ce_loss_2": 4.468850481510162, + "ce_loss_3": 4.242748379707336, + "ce_loss_7": 3.7557740807533264, + "epoch": 0.092, + "grad_norm": 1200.0, + "kl_loss_10": 187.22658081054686, + "kl_loss_2": 1967.8191162109374, + "kl_loss_3": 1530.7474304199218, + "kl_loss_7": 525.3208801269532, + "learning_rate": 0.0009831676344247342, + "loss": 1050.8719, + "step": 920 + }, + { + "ce_loss_10": 3.601709270477295, + "ce_loss_13": 3.517729616165161, + "ce_loss_2": 4.460277819633484, + "ce_loss_3": 4.208156263828277, + "ce_loss_7": 3.750774598121643, + "epoch": 0.093, + "grad_norm": 976.0, + "kl_loss_10": 178.7038887023926, + "kl_loss_2": 1956.5622802734374, + "kl_loss_3": 1466.5545593261718, + "kl_loss_7": 497.9136016845703, + "learning_rate": 0.0009827569760057755, + "loss": 1037.5467, + "step": 930 + }, + { + "ce_loss_10": 3.524882364273071, + "ce_loss_13": 3.433571922779083, + "ce_loss_2": 4.444187712669373, + "ce_loss_3": 4.170401692390442, + "ce_loss_7": 3.6811744213104247, + "epoch": 0.094, + "grad_norm": 784.0, + "kl_loss_10": 187.684228515625, + "kl_loss_2": 2098.4013122558595, + "kl_loss_3": 1541.9158447265625, + "kl_loss_7": 517.6738510131836, + "learning_rate": 0.000982341456229428, + "loss": 1051.2325, + "step": 940 + }, + { + "ce_loss_10": 3.615868926048279, + "ce_loss_13": 3.5291592121124267, + "ce_loss_2": 4.486113166809082, + "ce_loss_3": 4.232404887676239, + "ce_loss_7": 3.76742799282074, + "epoch": 0.095, + "grad_norm": 676.0, + "kl_loss_10": 183.69078369140624, + "kl_loss_2": 1981.9986938476563, + "kl_loss_3": 1486.5308166503905, + "kl_loss_7": 510.1406478881836, + "learning_rate": 0.000981921079279971, + "loss": 1019.1191, + "step": 950 + }, + { + "ce_loss_10": 3.630085599422455, + "ce_loss_13": 3.547876071929932, + "ce_loss_2": 4.444245457649231, + "ce_loss_3": 4.2063503623008724, + "ce_loss_7": 3.7743829011917116, + "epoch": 0.096, + "grad_norm": 688.0, + "kl_loss_10": 177.08920364379884, + "kl_loss_2": 1876.4136657714844, + "kl_loss_3": 1407.2470703125, + "kl_loss_7": 490.83606872558596, + "learning_rate": 0.0009814958493905962, + "loss": 998.0566, + "step": 960 + }, + { + "ce_loss_10": 3.58119341135025, + "ce_loss_13": 3.4952746510505674, + "ce_loss_2": 4.4530457019805905, + "ce_loss_3": 4.1936848878860475, + "ce_loss_7": 3.745371329784393, + "epoch": 0.097, + "grad_norm": 856.0, + "kl_loss_10": 180.93770446777344, + "kl_loss_2": 1972.7642639160156, + "kl_loss_3": 1475.1272094726562, + "kl_loss_7": 518.6202331542969, + "learning_rate": 0.0009810657708433637, + "loss": 1046.3625, + "step": 970 + }, + { + "ce_loss_10": 3.6587608098983764, + "ce_loss_13": 3.5761932730674744, + "ce_loss_2": 4.4908324718475345, + "ce_loss_3": 4.249144637584687, + "ce_loss_7": 3.8146904349327087, + "epoch": 0.098, + "grad_norm": 1040.0, + "kl_loss_10": 175.9536979675293, + "kl_loss_2": 1896.2690856933593, + "kl_loss_3": 1430.7713623046875, + "kl_loss_7": 520.7419006347657, + "learning_rate": 0.0009806308479691594, + "loss": 1012.5152, + "step": 980 + }, + { + "ce_loss_10": 3.6701141357421876, + "ce_loss_13": 3.5853224396705627, + "ce_loss_2": 4.52058436870575, + "ce_loss_3": 4.304493975639343, + "ce_loss_7": 3.8337016344070434, + "epoch": 0.099, + "grad_norm": 1064.0, + "kl_loss_10": 182.05279541015625, + "kl_loss_2": 1942.8283203125, + "kl_loss_3": 1504.5979553222655, + "kl_loss_7": 525.28359375, + "learning_rate": 0.0009801910851476522, + "loss": 1029.841, + "step": 990 + }, + { + "ce_loss_10": 3.5780128598213197, + "ce_loss_13": 3.4931989908218384, + "ce_loss_2": 4.455501818656922, + "ce_loss_3": 4.210770845413208, + "ce_loss_7": 3.7347821235656737, + "epoch": 0.1, + "grad_norm": 668.0, + "kl_loss_10": 179.7102798461914, + "kl_loss_2": 1998.8098388671874, + "kl_loss_3": 1528.9624084472657, + "kl_loss_7": 516.746615600586, + "learning_rate": 0.0009797464868072487, + "loss": 1031.5818, + "step": 1000 + }, + { + "ce_loss_10": 3.568189251422882, + "ce_loss_13": 3.4829827189445495, + "ce_loss_2": 4.416838443279266, + "ce_loss_3": 4.186929941177368, + "ce_loss_7": 3.7205358743667603, + "epoch": 0.101, + "grad_norm": 732.0, + "kl_loss_10": 179.53496170043945, + "kl_loss_2": 1933.2125244140625, + "kl_loss_3": 1477.44189453125, + "kl_loss_7": 505.2583862304688, + "learning_rate": 0.0009792970574250492, + "loss": 1018.5471, + "step": 1010 + }, + { + "ce_loss_10": 3.6072539567947386, + "ce_loss_13": 3.519195032119751, + "ce_loss_2": 4.433947956562042, + "ce_loss_3": 4.211443436145783, + "ce_loss_7": 3.7644609808921814, + "epoch": 0.102, + "grad_norm": 824.0, + "kl_loss_10": 179.83254623413086, + "kl_loss_2": 1919.0231384277345, + "kl_loss_3": 1470.111962890625, + "kl_loss_7": 523.1709747314453, + "learning_rate": 0.0009788428015268028, + "loss": 1009.9572, + "step": 1020 + }, + { + "ce_loss_10": 3.595908534526825, + "ce_loss_13": 3.5105677366256716, + "ce_loss_2": 4.409725475311279, + "ce_loss_3": 4.1861141443252565, + "ce_loss_7": 3.7467220067977904, + "epoch": 0.103, + "grad_norm": 708.0, + "kl_loss_10": 184.79271774291993, + "kl_loss_2": 1885.4566284179687, + "kl_loss_3": 1448.107196044922, + "kl_loss_7": 506.4451049804687, + "learning_rate": 0.0009783837236868609, + "loss": 1001.9834, + "step": 1030 + }, + { + "ce_loss_10": 3.5803792238235475, + "ce_loss_13": 3.48011519908905, + "ce_loss_2": 4.398702096939087, + "ce_loss_3": 4.170737850666046, + "ce_loss_7": 3.7171101450920103, + "epoch": 0.104, + "grad_norm": 652.0, + "kl_loss_10": 198.62368392944336, + "kl_loss_2": 1895.2196655273438, + "kl_loss_3": 1456.3298950195312, + "kl_loss_7": 493.37102966308595, + "learning_rate": 0.0009779198285281327, + "loss": 1003.9096, + "step": 1040 + }, + { + "ce_loss_10": 3.5599252581596375, + "ce_loss_13": 3.472754955291748, + "ce_loss_2": 4.411727952957153, + "ce_loss_3": 4.179691898822784, + "ce_loss_7": 3.7037505507469177, + "epoch": 0.105, + "grad_norm": 612.0, + "kl_loss_10": 185.47133712768556, + "kl_loss_2": 1925.2284606933595, + "kl_loss_3": 1470.763250732422, + "kl_loss_7": 480.902815246582, + "learning_rate": 0.0009774511207220368, + "loss": 1008.8404, + "step": 1050 + }, + { + "ce_loss_10": 3.5997629284858705, + "ce_loss_13": 3.5105605721473694, + "ce_loss_2": 4.449126553535462, + "ce_loss_3": 4.208902645111084, + "ce_loss_7": 3.7385777592658997, + "epoch": 0.106, + "grad_norm": 664.0, + "kl_loss_10": 188.41523284912108, + "kl_loss_2": 1931.9002990722656, + "kl_loss_3": 1463.5917114257813, + "kl_loss_7": 488.90601348876953, + "learning_rate": 0.0009769776049884564, + "loss": 1011.1688, + "step": 1060 + }, + { + "ce_loss_10": 3.512972867488861, + "ce_loss_13": 3.4215759873390197, + "ce_loss_2": 4.383116137981415, + "ce_loss_3": 4.136074674129486, + "ce_loss_7": 3.658175325393677, + "epoch": 0.107, + "grad_norm": 852.0, + "kl_loss_10": 190.86334686279298, + "kl_loss_2": 1988.4344360351563, + "kl_loss_3": 1500.9677978515624, + "kl_loss_7": 497.2473831176758, + "learning_rate": 0.0009764992860956889, + "loss": 1046.9855, + "step": 1070 + }, + { + "ce_loss_10": 3.664934504032135, + "ce_loss_13": 3.581758964061737, + "ce_loss_2": 4.456975436210632, + "ce_loss_3": 4.2267944574356076, + "ce_loss_7": 3.7991317749023437, + "epoch": 0.108, + "grad_norm": 692.0, + "kl_loss_10": 194.86636047363282, + "kl_loss_2": 1828.549542236328, + "kl_loss_3": 1386.0209167480468, + "kl_loss_7": 470.81641540527346, + "learning_rate": 0.0009760161688604008, + "loss": 983.4062, + "step": 1080 + }, + { + "ce_loss_10": 3.699925255775452, + "ce_loss_13": 3.581090009212494, + "ce_loss_2": 4.5030299663543705, + "ce_loss_3": 4.258689761161804, + "ce_loss_7": 3.8088807463645935, + "epoch": 0.109, + "grad_norm": 932.0, + "kl_loss_10": 231.67974853515625, + "kl_loss_2": 1899.6866027832032, + "kl_loss_3": 1422.8416687011718, + "kl_loss_7": 478.274723815918, + "learning_rate": 0.0009755282581475768, + "loss": 1018.6061, + "step": 1090 + }, + { + "ce_loss_10": 3.7407832741737366, + "ce_loss_13": 3.629633975028992, + "ce_loss_2": 4.543957281112671, + "ce_loss_3": 4.291879677772522, + "ce_loss_7": 3.859333908557892, + "epoch": 0.11, + "grad_norm": 840.0, + "kl_loss_10": 242.35417098999022, + "kl_loss_2": 1901.089776611328, + "kl_loss_3": 1411.2118530273438, + "kl_loss_7": 486.46076049804685, + "learning_rate": 0.0009750355588704727, + "loss": 1002.7542, + "step": 1100 + }, + { + "ce_loss_10": 3.5715940475463865, + "ce_loss_13": 3.4648202657699585, + "ce_loss_2": 4.382298231124878, + "ce_loss_3": 4.144306838512421, + "ce_loss_7": 3.693096125125885, + "epoch": 0.111, + "grad_norm": 724.0, + "kl_loss_10": 228.16529388427733, + "kl_loss_2": 1889.4921142578125, + "kl_loss_3": 1417.6652526855469, + "kl_loss_7": 475.9224227905273, + "learning_rate": 0.0009745380759905647, + "loss": 1021.5175, + "step": 1110 + }, + { + "ce_loss_10": 3.518705630302429, + "ce_loss_13": 3.418720316886902, + "ce_loss_2": 4.3428931593894955, + "ce_loss_3": 4.101069831848145, + "ce_loss_7": 3.6415586709976195, + "epoch": 0.112, + "grad_norm": 704.0, + "kl_loss_10": 208.09901504516603, + "kl_loss_2": 1914.6993408203125, + "kl_loss_3": 1443.0666809082031, + "kl_loss_7": 475.36561584472656, + "learning_rate": 0.0009740358145174998, + "loss": 1021.0135, + "step": 1120 + }, + { + "ce_loss_10": 3.6743900656700133, + "ce_loss_13": 3.5760027527809144, + "ce_loss_2": 4.446346926689148, + "ce_loss_3": 4.222055697441101, + "ce_loss_7": 3.7923503518104553, + "epoch": 0.113, + "grad_norm": 568.0, + "kl_loss_10": 194.76330795288087, + "kl_loss_2": 1822.0701782226563, + "kl_loss_3": 1381.2394104003906, + "kl_loss_7": 467.96456146240234, + "learning_rate": 0.0009735287795090455, + "loss": 980.3441, + "step": 1130 + }, + { + "ce_loss_10": 3.5504071354866027, + "ce_loss_13": 3.462277662754059, + "ce_loss_2": 4.376504588127136, + "ce_loss_3": 4.138739550113678, + "ce_loss_7": 3.69127904176712, + "epoch": 0.114, + "grad_norm": 924.0, + "kl_loss_10": 182.32659149169922, + "kl_loss_2": 1886.9593383789063, + "kl_loss_3": 1422.0450927734375, + "kl_loss_7": 478.68931579589844, + "learning_rate": 0.0009730169760710386, + "loss": 988.1342, + "step": 1140 + }, + { + "ce_loss_10": 3.62770699262619, + "ce_loss_13": 3.5463622212409973, + "ce_loss_2": 4.45663468837738, + "ce_loss_3": 4.212454223632813, + "ce_loss_7": 3.7987569212913512, + "epoch": 0.115, + "grad_norm": 904.0, + "kl_loss_10": 178.53827209472655, + "kl_loss_2": 1875.3674560546874, + "kl_loss_3": 1405.993963623047, + "kl_loss_7": 538.1174468994141, + "learning_rate": 0.0009725004093573342, + "loss": 1006.2083, + "step": 1150 + }, + { + "ce_loss_10": 3.5690181612968446, + "ce_loss_13": 3.4807565331459047, + "ce_loss_2": 4.384287440776825, + "ce_loss_3": 4.156945848464966, + "ce_loss_7": 3.752134454250336, + "epoch": 0.116, + "grad_norm": 716.0, + "kl_loss_10": 176.73938064575196, + "kl_loss_2": 1855.7882263183594, + "kl_loss_3": 1408.193621826172, + "kl_loss_7": 553.3539993286133, + "learning_rate": 0.0009719790845697534, + "loss": 992.4009, + "step": 1160 + }, + { + "ce_loss_10": 3.5152013182640074, + "ce_loss_13": 3.4388429045677187, + "ce_loss_2": 4.306136679649353, + "ce_loss_3": 4.084867370128632, + "ce_loss_7": 3.685141396522522, + "epoch": 0.117, + "grad_norm": 696.0, + "kl_loss_10": 168.1632064819336, + "kl_loss_2": 1813.9858947753905, + "kl_loss_3": 1386.7823852539063, + "kl_loss_7": 525.0696212768555, + "learning_rate": 0.0009714530069580309, + "loss": 973.7273, + "step": 1170 + }, + { + "ce_loss_10": 3.62287323474884, + "ce_loss_13": 3.5366265535354615, + "ce_loss_2": 4.438341093063355, + "ce_loss_3": 4.2517277002334595, + "ce_loss_7": 3.7834308385849, + "epoch": 0.118, + "grad_norm": 668.0, + "kl_loss_10": 180.73734664916992, + "kl_loss_2": 1856.9345581054688, + "kl_loss_3": 1479.8200439453126, + "kl_loss_7": 530.3196792602539, + "learning_rate": 0.0009709221818197624, + "loss": 1003.6437, + "step": 1180 + }, + { + "ce_loss_10": 3.6476974844932557, + "ce_loss_13": 3.566649878025055, + "ce_loss_2": 4.460924696922302, + "ce_loss_3": 4.266936922073365, + "ce_loss_7": 3.8050862193107604, + "epoch": 0.119, + "grad_norm": 684.0, + "kl_loss_10": 169.83945999145507, + "kl_loss_2": 1853.9904663085938, + "kl_loss_3": 1484.227764892578, + "kl_loss_7": 493.99244689941406, + "learning_rate": 0.0009703866145003512, + "loss": 1003.36, + "step": 1190 + }, + { + "ce_loss_10": 3.6218933939933775, + "ce_loss_13": 3.5399347305297852, + "ce_loss_2": 4.422274470329285, + "ce_loss_3": 4.231696951389313, + "ce_loss_7": 3.76218581199646, + "epoch": 0.12, + "grad_norm": 860.0, + "kl_loss_10": 167.10036239624023, + "kl_loss_2": 1845.4068298339844, + "kl_loss_3": 1456.3514770507813, + "kl_loss_7": 478.3939971923828, + "learning_rate": 0.0009698463103929542, + "loss": 1001.8003, + "step": 1200 + }, + { + "ce_loss_10": 3.586829674243927, + "ce_loss_13": 3.5053379774093627, + "ce_loss_2": 4.3998222827911375, + "ce_loss_3": 4.1850836157798765, + "ce_loss_7": 3.728915822505951, + "epoch": 0.121, + "grad_norm": 780.0, + "kl_loss_10": 169.9067398071289, + "kl_loss_2": 1841.5256103515626, + "kl_loss_3": 1426.9917419433593, + "kl_loss_7": 468.1813171386719, + "learning_rate": 0.0009693012749384279, + "loss": 985.2267, + "step": 1210 + }, + { + "ce_loss_10": 3.601468288898468, + "ce_loss_13": 3.5188393354415894, + "ce_loss_2": 4.404597902297974, + "ce_loss_3": 4.181776678562164, + "ce_loss_7": 3.7396764159202576, + "epoch": 0.122, + "grad_norm": 644.0, + "kl_loss_10": 173.15536575317384, + "kl_loss_2": 1847.19716796875, + "kl_loss_3": 1416.7066284179687, + "kl_loss_7": 463.79361419677736, + "learning_rate": 0.0009687515136252732, + "loss": 967.898, + "step": 1220 + }, + { + "ce_loss_10": 3.5564996123313906, + "ce_loss_13": 3.473362135887146, + "ce_loss_2": 4.387398433685303, + "ce_loss_3": 4.149928534030915, + "ce_loss_7": 3.689453649520874, + "epoch": 0.123, + "grad_norm": 740.0, + "kl_loss_10": 174.67308807373047, + "kl_loss_2": 1905.6107360839844, + "kl_loss_3": 1438.5151733398438, + "kl_loss_7": 470.0431854248047, + "learning_rate": 0.0009681970319895803, + "loss": 999.513, + "step": 1230 + }, + { + "ce_loss_10": 3.6379595756530763, + "ce_loss_13": 3.558580422401428, + "ce_loss_2": 4.434055185317993, + "ce_loss_3": 4.211921918392181, + "ce_loss_7": 3.768270802497864, + "epoch": 0.124, + "grad_norm": 604.0, + "kl_loss_10": 171.7199905395508, + "kl_loss_2": 1822.535498046875, + "kl_loss_3": 1383.5216369628906, + "kl_loss_7": 455.1578430175781, + "learning_rate": 0.0009676378356149733, + "loss": 961.3066, + "step": 1240 + }, + { + "ce_loss_10": 3.605825924873352, + "ce_loss_13": 3.526325762271881, + "ce_loss_2": 4.392473816871643, + "ce_loss_3": 4.1692427635192875, + "ce_loss_7": 3.7352627873420716, + "epoch": 0.125, + "grad_norm": 720.0, + "kl_loss_10": 167.6490280151367, + "kl_loss_2": 1810.04775390625, + "kl_loss_3": 1368.414892578125, + "kl_loss_7": 448.32765197753906, + "learning_rate": 0.0009670739301325534, + "loss": 957.2052, + "step": 1250 + }, + { + "ce_loss_10": 3.571504831314087, + "ce_loss_13": 3.4888750672340394, + "ce_loss_2": 4.378832995891571, + "ce_loss_3": 4.150575721263886, + "ce_loss_7": 3.7100158095359803, + "epoch": 0.126, + "grad_norm": 584.0, + "kl_loss_10": 173.1241600036621, + "kl_loss_2": 1845.0594482421875, + "kl_loss_3": 1403.9748901367188, + "kl_loss_7": 459.8861801147461, + "learning_rate": 0.0009665053212208426, + "loss": 977.427, + "step": 1260 + }, + { + "ce_loss_10": 3.618936550617218, + "ce_loss_13": 3.536702239513397, + "ce_loss_2": 4.445125913619995, + "ce_loss_3": 4.200946152210236, + "ce_loss_7": 3.752047967910767, + "epoch": 0.127, + "grad_norm": 636.0, + "kl_loss_10": 175.11898193359374, + "kl_loss_2": 1894.717462158203, + "kl_loss_3": 1411.8616638183594, + "kl_loss_7": 466.6258575439453, + "learning_rate": 0.0009659320146057262, + "loss": 985.5969, + "step": 1270 + }, + { + "ce_loss_10": 3.615454339981079, + "ce_loss_13": 3.536350691318512, + "ce_loss_2": 4.4296933054924015, + "ce_loss_3": 4.193294942378998, + "ce_loss_7": 3.7456225991249084, + "epoch": 0.128, + "grad_norm": 684.0, + "kl_loss_10": 172.33256301879882, + "kl_loss_2": 1863.494482421875, + "kl_loss_3": 1382.5830627441405, + "kl_loss_7": 454.3094543457031, + "learning_rate": 0.0009653540160603955, + "loss": 967.9032, + "step": 1280 + }, + { + "ce_loss_10": 3.618008828163147, + "ce_loss_13": 3.540120279788971, + "ce_loss_2": 4.412742114067077, + "ce_loss_3": 4.178766667842865, + "ce_loss_7": 3.7450068473815916, + "epoch": 0.129, + "grad_norm": 728.0, + "kl_loss_10": 168.77175369262696, + "kl_loss_2": 1848.1599365234374, + "kl_loss_3": 1392.0753234863282, + "kl_loss_7": 450.8601531982422, + "learning_rate": 0.0009647713314052896, + "loss": 957.1789, + "step": 1290 + }, + { + "ce_loss_10": 3.570509469509125, + "ce_loss_13": 3.489526689052582, + "ce_loss_2": 4.4025511026382445, + "ce_loss_3": 4.168629574775696, + "ce_loss_7": 3.7077707052230835, + "epoch": 0.13, + "grad_norm": 676.0, + "kl_loss_10": 173.87704010009764, + "kl_loss_2": 1909.4092407226562, + "kl_loss_3": 1439.0013549804687, + "kl_loss_7": 467.00162200927736, + "learning_rate": 0.0009641839665080363, + "loss": 987.4625, + "step": 1300 + }, + { + "ce_loss_10": 3.5321548104286196, + "ce_loss_13": 3.45308997631073, + "ce_loss_2": 4.342683684825897, + "ce_loss_3": 4.104823327064514, + "ce_loss_7": 3.6708693146705627, + "epoch": 0.131, + "grad_norm": 760.0, + "kl_loss_10": 166.72612762451172, + "kl_loss_2": 1841.438250732422, + "kl_loss_3": 1384.6276489257812, + "kl_loss_7": 460.2369979858398, + "learning_rate": 0.0009635919272833937, + "loss": 954.8711, + "step": 1310 + }, + { + "ce_loss_10": 3.5722994804382324, + "ce_loss_13": 3.4894865989685058, + "ce_loss_2": 4.388064694404602, + "ce_loss_3": 4.155776739120483, + "ce_loss_7": 3.710488736629486, + "epoch": 0.132, + "grad_norm": 752.0, + "kl_loss_10": 172.25881423950196, + "kl_loss_2": 1844.984326171875, + "kl_loss_3": 1399.1760986328125, + "kl_loss_7": 472.22783203125, + "learning_rate": 0.0009629952196931902, + "loss": 955.9352, + "step": 1320 + }, + { + "ce_loss_10": 3.551358866691589, + "ce_loss_13": 3.472510349750519, + "ce_loss_2": 4.355191397666931, + "ce_loss_3": 4.118129765987396, + "ce_loss_7": 3.6837236404418947, + "epoch": 0.133, + "grad_norm": 588.0, + "kl_loss_10": 171.36332321166992, + "kl_loss_2": 1841.2326904296874, + "kl_loss_3": 1376.0937561035157, + "kl_loss_7": 456.13024139404297, + "learning_rate": 0.0009623938497462645, + "loss": 960.5266, + "step": 1330 + }, + { + "ce_loss_10": 3.5559371590614317, + "ce_loss_13": 3.4709559440612794, + "ce_loss_2": 4.358719682693481, + "ce_loss_3": 4.123664891719818, + "ce_loss_7": 3.688221287727356, + "epoch": 0.134, + "grad_norm": 652.0, + "kl_loss_10": 176.0792022705078, + "kl_loss_2": 1841.4331909179687, + "kl_loss_3": 1396.2080993652344, + "kl_loss_7": 467.3533462524414, + "learning_rate": 0.0009617878234984055, + "loss": 977.5735, + "step": 1340 + }, + { + "ce_loss_10": 3.6392318964004517, + "ce_loss_13": 3.5607362031936645, + "ce_loss_2": 4.411078405380249, + "ce_loss_3": 4.175350475311279, + "ce_loss_7": 3.7691015005111694, + "epoch": 0.135, + "grad_norm": 680.0, + "kl_loss_10": 167.24838333129884, + "kl_loss_2": 1775.0880249023437, + "kl_loss_3": 1327.7960632324218, + "kl_loss_7": 444.50580139160155, + "learning_rate": 0.0009611771470522907, + "loss": 944.3117, + "step": 1350 + }, + { + "ce_loss_10": 3.571557867527008, + "ce_loss_13": 3.4895546078681945, + "ce_loss_2": 4.367475354671479, + "ce_loss_3": 4.138877761363983, + "ce_loss_7": 3.70872939825058, + "epoch": 0.136, + "grad_norm": 620.0, + "kl_loss_10": 168.35062713623046, + "kl_loss_2": 1814.3583923339843, + "kl_loss_3": 1366.009014892578, + "kl_loss_7": 453.19874114990233, + "learning_rate": 0.0009605618265574251, + "loss": 946.0445, + "step": 1360 + }, + { + "ce_loss_10": 3.5265390634536744, + "ce_loss_13": 3.4456461548805235, + "ce_loss_2": 4.336602091789246, + "ce_loss_3": 4.115667700767517, + "ce_loss_7": 3.666548228263855, + "epoch": 0.137, + "grad_norm": 640.0, + "kl_loss_10": 170.2979606628418, + "kl_loss_2": 1872.8250915527344, + "kl_loss_3": 1430.012762451172, + "kl_loss_7": 472.1897598266602, + "learning_rate": 0.0009599418682100792, + "loss": 971.0635, + "step": 1370 + }, + { + "ce_loss_10": 3.5678892135620117, + "ce_loss_13": 3.4896570086479186, + "ce_loss_2": 4.374086010456085, + "ce_loss_3": 4.133544516563416, + "ce_loss_7": 3.7014646172523498, + "epoch": 0.138, + "grad_norm": 724.0, + "kl_loss_10": 164.6707275390625, + "kl_loss_2": 1831.634326171875, + "kl_loss_3": 1376.7487731933593, + "kl_loss_7": 454.62243194580077, + "learning_rate": 0.0009593172782532268, + "loss": 962.5875, + "step": 1380 + }, + { + "ce_loss_10": 3.6105551958084106, + "ce_loss_13": 3.5338037848472594, + "ce_loss_2": 4.406553602218628, + "ce_loss_3": 4.176796317100525, + "ce_loss_7": 3.7485893249511717, + "epoch": 0.139, + "grad_norm": 640.0, + "kl_loss_10": 166.68616943359376, + "kl_loss_2": 1814.0467590332032, + "kl_loss_3": 1370.630841064453, + "kl_loss_7": 459.292626953125, + "learning_rate": 0.0009586880629764817, + "loss": 950.4029, + "step": 1390 + }, + { + "ce_loss_10": 3.5382327914237974, + "ce_loss_13": 3.4606364369392395, + "ce_loss_2": 4.352329158782959, + "ce_loss_3": 4.119093835353851, + "ce_loss_7": 3.6758073806762694, + "epoch": 0.14, + "grad_norm": 604.0, + "kl_loss_10": 168.50987167358397, + "kl_loss_2": 1828.4553466796874, + "kl_loss_3": 1383.6268249511718, + "kl_loss_7": 462.06499633789065, + "learning_rate": 0.0009580542287160348, + "loss": 947.015, + "step": 1400 + }, + { + "ce_loss_10": 3.505202662944794, + "ce_loss_13": 3.4246376156806946, + "ce_loss_2": 4.3068211555480955, + "ce_loss_3": 4.070422291755676, + "ce_loss_7": 3.6390760779380797, + "epoch": 0.141, + "grad_norm": 736.0, + "kl_loss_10": 165.31348571777343, + "kl_loss_2": 1827.4319946289063, + "kl_loss_3": 1369.9763732910155, + "kl_loss_7": 452.67235260009767, + "learning_rate": 0.0009574157818545901, + "loss": 940.6614, + "step": 1410 + }, + { + "ce_loss_10": 3.573702847957611, + "ce_loss_13": 3.498333144187927, + "ce_loss_2": 4.353998446464539, + "ce_loss_3": 4.126765632629395, + "ce_loss_7": 3.70104296207428, + "epoch": 0.142, + "grad_norm": 672.0, + "kl_loss_10": 160.25886917114258, + "kl_loss_2": 1778.8529113769532, + "kl_loss_3": 1335.792724609375, + "kl_loss_7": 436.3431121826172, + "learning_rate": 0.0009567727288213005, + "loss": 948.7402, + "step": 1420 + }, + { + "ce_loss_10": 3.552593457698822, + "ce_loss_13": 3.4785722613334658, + "ce_loss_2": 4.346270668506622, + "ce_loss_3": 4.128087055683136, + "ce_loss_7": 3.691338813304901, + "epoch": 0.143, + "grad_norm": 720.0, + "kl_loss_10": 169.80655975341796, + "kl_loss_2": 1820.7370666503907, + "kl_loss_3": 1388.6684265136719, + "kl_loss_7": 470.6744079589844, + "learning_rate": 0.0009561250760917027, + "loss": 952.012, + "step": 1430 + }, + { + "ce_loss_10": 3.5761927485466005, + "ce_loss_13": 3.4948782205581663, + "ce_loss_2": 4.358104157447815, + "ce_loss_3": 4.14318814277649, + "ce_loss_7": 3.7130213379859924, + "epoch": 0.144, + "grad_norm": 656.0, + "kl_loss_10": 169.0357551574707, + "kl_loss_2": 1829.9590637207032, + "kl_loss_3": 1398.402197265625, + "kl_loss_7": 465.52441864013673, + "learning_rate": 0.0009554728301876525, + "loss": 941.6771, + "step": 1440 + }, + { + "ce_loss_10": 3.6277766704559324, + "ce_loss_13": 3.5470379829406737, + "ce_loss_2": 4.396523261070252, + "ce_loss_3": 4.18031120300293, + "ce_loss_7": 3.7636173605918883, + "epoch": 0.145, + "grad_norm": 572.0, + "kl_loss_10": 168.25755767822267, + "kl_loss_2": 1775.9608276367187, + "kl_loss_3": 1358.2267517089845, + "kl_loss_7": 455.221989440918, + "learning_rate": 0.0009548159976772592, + "loss": 965.8246, + "step": 1450 + }, + { + "ce_loss_10": 3.5671639442443848, + "ce_loss_13": 3.488686537742615, + "ce_loss_2": 4.366501688957214, + "ce_loss_3": 4.148012197017669, + "ce_loss_7": 3.7019433975219727, + "epoch": 0.146, + "grad_norm": 948.0, + "kl_loss_10": 167.73931274414062, + "kl_loss_2": 1823.703350830078, + "kl_loss_3": 1385.1528442382812, + "kl_loss_7": 466.3150634765625, + "learning_rate": 0.0009541545851748186, + "loss": 952.6289, + "step": 1460 + }, + { + "ce_loss_10": 3.438965713977814, + "ce_loss_13": 3.356391704082489, + "ce_loss_2": 4.271179831027984, + "ce_loss_3": 4.023587942123413, + "ce_loss_7": 3.5836508512496947, + "epoch": 0.147, + "grad_norm": 728.0, + "kl_loss_10": 166.18173599243164, + "kl_loss_2": 1871.1487426757812, + "kl_loss_3": 1398.6961181640625, + "kl_loss_7": 475.6040863037109, + "learning_rate": 0.0009534885993407473, + "loss": 965.3007, + "step": 1470 + }, + { + "ce_loss_10": 3.6119082808494567, + "ce_loss_13": 3.5323118925094605, + "ce_loss_2": 4.421308779716492, + "ce_loss_3": 4.1795473337173465, + "ce_loss_7": 3.7527047395706177, + "epoch": 0.148, + "grad_norm": 924.0, + "kl_loss_10": 169.8495635986328, + "kl_loss_2": 1843.5690612792969, + "kl_loss_3": 1385.9791320800782, + "kl_loss_7": 477.3311019897461, + "learning_rate": 0.0009528180468815154, + "loss": 969.2416, + "step": 1480 + }, + { + "ce_loss_10": 3.6473464369773865, + "ce_loss_13": 3.5708807826042177, + "ce_loss_2": 4.4276422500610355, + "ce_loss_3": 4.19535973072052, + "ce_loss_7": 3.780378723144531, + "epoch": 0.149, + "grad_norm": 716.0, + "kl_loss_10": 168.1454620361328, + "kl_loss_2": 1800.059014892578, + "kl_loss_3": 1349.663555908203, + "kl_loss_7": 474.4862854003906, + "learning_rate": 0.0009521429345495787, + "loss": 948.5005, + "step": 1490 + }, + { + "ce_loss_10": 3.6274529099464417, + "ce_loss_13": 3.5506494641304016, + "ce_loss_2": 4.379093813896179, + "ce_loss_3": 4.163582825660706, + "ce_loss_7": 3.767805981636047, + "epoch": 0.15, + "grad_norm": 544.0, + "kl_loss_10": 161.9274887084961, + "kl_loss_2": 1749.5603088378907, + "kl_loss_3": 1322.4438842773438, + "kl_loss_7": 461.2112503051758, + "learning_rate": 0.0009514632691433108, + "loss": 934.5367, + "step": 1500 + }, + { + "ce_loss_10": 3.589273178577423, + "ce_loss_13": 3.5116019368171694, + "ce_loss_2": 4.371138715744019, + "ce_loss_3": 4.152551782131195, + "ce_loss_7": 3.737976813316345, + "epoch": 0.151, + "grad_norm": 756.0, + "kl_loss_10": 165.94934310913087, + "kl_loss_2": 1800.9721557617188, + "kl_loss_3": 1379.0521667480468, + "kl_loss_7": 496.21505279541014, + "learning_rate": 0.0009507790575069346, + "loss": 965.8927, + "step": 1510 + }, + { + "ce_loss_10": 3.567319369316101, + "ce_loss_13": 3.4871225714683534, + "ce_loss_2": 4.369505047798157, + "ce_loss_3": 4.1454179883003235, + "ce_loss_7": 3.7245999336242677, + "epoch": 0.152, + "grad_norm": 800.0, + "kl_loss_10": 168.54553909301757, + "kl_loss_2": 1831.139581298828, + "kl_loss_3": 1381.621405029297, + "kl_loss_7": 492.096989440918, + "learning_rate": 0.0009500903065304539, + "loss": 972.4712, + "step": 1520 + }, + { + "ce_loss_10": 3.6029422521591186, + "ce_loss_13": 3.529670035839081, + "ce_loss_2": 4.370370721817016, + "ce_loss_3": 4.140894186496735, + "ce_loss_7": 3.73762069940567, + "epoch": 0.153, + "grad_norm": 828.0, + "kl_loss_10": 160.99332580566406, + "kl_loss_2": 1756.960675048828, + "kl_loss_3": 1307.8462524414062, + "kl_loss_7": 457.64794006347654, + "learning_rate": 0.0009493970231495835, + "loss": 935.4037, + "step": 1530 + }, + { + "ce_loss_10": 3.5418770790100096, + "ce_loss_13": 3.470135521888733, + "ce_loss_2": 4.307833838462829, + "ce_loss_3": 4.0862520098686215, + "ce_loss_7": 3.6741525650024416, + "epoch": 0.154, + "grad_norm": 788.0, + "kl_loss_10": 161.51265563964844, + "kl_loss_2": 1777.391455078125, + "kl_loss_3": 1322.6101928710937, + "kl_loss_7": 442.0227294921875, + "learning_rate": 0.0009486992143456792, + "loss": 925.9713, + "step": 1540 + }, + { + "ce_loss_10": 3.572007155418396, + "ce_loss_13": 3.4902480483055114, + "ce_loss_2": 4.407237410545349, + "ce_loss_3": 4.155784142017365, + "ce_loss_7": 3.711955487728119, + "epoch": 0.155, + "grad_norm": 804.0, + "kl_loss_10": 173.13835983276368, + "kl_loss_2": 1896.4215270996094, + "kl_loss_3": 1404.887060546875, + "kl_loss_7": 488.79372253417966, + "learning_rate": 0.0009479968871456679, + "loss": 969.3572, + "step": 1550 + }, + { + "ce_loss_10": 3.5374878644943237, + "ce_loss_13": 3.4610541582107546, + "ce_loss_2": 4.35056471824646, + "ce_loss_3": 4.090835416316986, + "ce_loss_7": 3.67384033203125, + "epoch": 0.156, + "grad_norm": 664.0, + "kl_loss_10": 168.38916778564453, + "kl_loss_2": 1863.9793334960937, + "kl_loss_3": 1362.9484497070312, + "kl_loss_7": 457.0409851074219, + "learning_rate": 0.0009472900486219768, + "loss": 942.6081, + "step": 1560 + }, + { + "ce_loss_10": 3.5255000948905946, + "ce_loss_13": 3.4513633012771607, + "ce_loss_2": 4.313361859321594, + "ce_loss_3": 4.075027894973755, + "ce_loss_7": 3.655648243427277, + "epoch": 0.157, + "grad_norm": 632.0, + "kl_loss_10": 163.33478240966798, + "kl_loss_2": 1801.9673278808593, + "kl_loss_3": 1334.2445068359375, + "kl_loss_7": 440.6134338378906, + "learning_rate": 0.000946578705892462, + "loss": 935.4695, + "step": 1570 + }, + { + "ce_loss_10": 3.566776442527771, + "ce_loss_13": 3.4891526818275453, + "ce_loss_2": 4.334450674057007, + "ce_loss_3": 4.097967481613159, + "ce_loss_7": 3.6950459837913514, + "epoch": 0.158, + "grad_norm": 596.0, + "kl_loss_10": 161.22595672607423, + "kl_loss_2": 1756.149267578125, + "kl_loss_3": 1303.3608947753905, + "kl_loss_7": 433.9689010620117, + "learning_rate": 0.0009458628661203367, + "loss": 927.5395, + "step": 1580 + }, + { + "ce_loss_10": 3.5715178489685058, + "ce_loss_13": 3.493246281147003, + "ce_loss_2": 4.381004786491394, + "ce_loss_3": 4.15310035943985, + "ce_loss_7": 3.7095054388046265, + "epoch": 0.159, + "grad_norm": 772.0, + "kl_loss_10": 166.47358245849608, + "kl_loss_2": 1829.0434631347657, + "kl_loss_3": 1386.59541015625, + "kl_loss_7": 455.2687271118164, + "learning_rate": 0.0009451425365140996, + "loss": 931.1994, + "step": 1590 + }, + { + "ce_loss_10": 3.644000780582428, + "ce_loss_13": 3.5690673708915712, + "ce_loss_2": 4.3989935398101805, + "ce_loss_3": 4.183095216751099, + "ce_loss_7": 3.770583391189575, + "epoch": 0.16, + "grad_norm": 684.0, + "kl_loss_10": 164.29436340332032, + "kl_loss_2": 1741.866064453125, + "kl_loss_3": 1315.065985107422, + "kl_loss_7": 435.75640869140625, + "learning_rate": 0.0009444177243274617, + "loss": 913.077, + "step": 1600 + }, + { + "ce_loss_10": 3.5015690445899965, + "ce_loss_13": 3.420224165916443, + "ce_loss_2": 4.298003304004669, + "ce_loss_3": 4.071039080619812, + "ce_loss_7": 3.6376882791519165, + "epoch": 0.161, + "grad_norm": 648.0, + "kl_loss_10": 170.34925384521483, + "kl_loss_2": 1819.6341918945313, + "kl_loss_3": 1383.1305297851563, + "kl_loss_7": 454.9949661254883, + "learning_rate": 0.0009436884368592739, + "loss": 949.2351, + "step": 1610 + }, + { + "ce_loss_10": 3.553143525123596, + "ce_loss_13": 3.4774653792381285, + "ce_loss_2": 4.3112345933914185, + "ce_loss_3": 4.104372417926788, + "ce_loss_7": 3.6796785831451415, + "epoch": 0.162, + "grad_norm": 728.0, + "kl_loss_10": 164.4239158630371, + "kl_loss_2": 1752.8823547363281, + "kl_loss_3": 1345.055731201172, + "kl_loss_7": 438.5689453125, + "learning_rate": 0.0009429546814534529, + "loss": 940.1898, + "step": 1620 + }, + { + "ce_loss_10": 3.5688347697257994, + "ce_loss_13": 3.4930466771125794, + "ce_loss_2": 4.32410843372345, + "ce_loss_3": 4.122909438610077, + "ce_loss_7": 3.6923286437988283, + "epoch": 0.163, + "grad_norm": 596.0, + "kl_loss_10": 162.7445167541504, + "kl_loss_2": 1746.289862060547, + "kl_loss_3": 1339.1979248046875, + "kl_loss_7": 437.2148178100586, + "learning_rate": 0.0009422164654989072, + "loss": 912.9286, + "step": 1630 + }, + { + "ce_loss_10": 3.678850030899048, + "ce_loss_13": 3.6019165754318236, + "ce_loss_2": 4.419181323051452, + "ce_loss_3": 4.2116731882095335, + "ce_loss_7": 3.801846480369568, + "epoch": 0.164, + "grad_norm": 688.0, + "kl_loss_10": 162.59844360351562, + "kl_loss_2": 1724.923516845703, + "kl_loss_3": 1329.3218505859375, + "kl_loss_7": 438.8222412109375, + "learning_rate": 0.0009414737964294635, + "loss": 921.2891, + "step": 1640 + }, + { + "ce_loss_10": 3.603172779083252, + "ce_loss_13": 3.5337018847465513, + "ce_loss_2": 4.340193152427673, + "ce_loss_3": 4.136861097812653, + "ce_loss_7": 3.7259446382522583, + "epoch": 0.165, + "grad_norm": 732.0, + "kl_loss_10": 157.09877319335936, + "kl_loss_2": 1687.0253173828125, + "kl_loss_3": 1302.9964721679687, + "kl_loss_7": 423.92798919677733, + "learning_rate": 0.000940726681723791, + "loss": 916.6791, + "step": 1650 + }, + { + "ce_loss_10": 3.440978169441223, + "ce_loss_13": 3.3671964764595033, + "ce_loss_2": 4.236228859424591, + "ce_loss_3": 4.024500918388367, + "ce_loss_7": 3.5734368205070495, + "epoch": 0.166, + "grad_norm": 692.0, + "kl_loss_10": 159.3915771484375, + "kl_loss_2": 1815.1759338378906, + "kl_loss_3": 1393.582989501953, + "kl_loss_7": 444.88841857910154, + "learning_rate": 0.0009399751289053266, + "loss": 922.4507, + "step": 1660 + }, + { + "ce_loss_10": 3.6614871978759767, + "ce_loss_13": 3.588225471973419, + "ce_loss_2": 4.408766531944275, + "ce_loss_3": 4.200141429901123, + "ce_loss_7": 3.7855040192604066, + "epoch": 0.167, + "grad_norm": 780.0, + "kl_loss_10": 157.96069717407227, + "kl_loss_2": 1723.715997314453, + "kl_loss_3": 1322.4010009765625, + "kl_loss_7": 429.9161651611328, + "learning_rate": 0.0009392191455421988, + "loss": 917.9642, + "step": 1670 + }, + { + "ce_loss_10": 3.63707195520401, + "ce_loss_13": 3.5556543111801147, + "ce_loss_2": 4.380373930931091, + "ce_loss_3": 4.166925942897796, + "ce_loss_7": 3.758769381046295, + "epoch": 0.168, + "grad_norm": 660.0, + "kl_loss_10": 175.24741973876954, + "kl_loss_2": 1761.8118225097655, + "kl_loss_3": 1344.4787719726562, + "kl_loss_7": 450.10546875, + "learning_rate": 0.0009384587392471515, + "loss": 916.3633, + "step": 1680 + }, + { + "ce_loss_10": 3.622085762023926, + "ce_loss_13": 3.5469981908798216, + "ce_loss_2": 4.356770062446595, + "ce_loss_3": 4.151288604736328, + "ce_loss_7": 3.746310067176819, + "epoch": 0.169, + "grad_norm": 608.0, + "kl_loss_10": 163.2817695617676, + "kl_loss_2": 1704.9851623535155, + "kl_loss_3": 1297.997540283203, + "kl_loss_7": 428.83862762451173, + "learning_rate": 0.0009376939176774678, + "loss": 905.5219, + "step": 1690 + }, + { + "ce_loss_10": 3.5977450728416445, + "ce_loss_13": 3.519981324672699, + "ce_loss_2": 4.352665519714355, + "ce_loss_3": 4.143550324440002, + "ce_loss_7": 3.7202346324920654, + "epoch": 0.17, + "grad_norm": 632.0, + "kl_loss_10": 162.28709259033204, + "kl_loss_2": 1725.159112548828, + "kl_loss_3": 1313.8654235839845, + "kl_loss_7": 435.5428039550781, + "learning_rate": 0.0009369246885348925, + "loss": 921.765, + "step": 1700 + }, + { + "ce_loss_10": 3.59084609746933, + "ce_loss_13": 3.5151766538619995, + "ce_loss_2": 4.375664782524109, + "ce_loss_3": 4.156036603450775, + "ce_loss_7": 3.7246397972106933, + "epoch": 0.171, + "grad_norm": 684.0, + "kl_loss_10": 164.49726409912108, + "kl_loss_2": 1801.033349609375, + "kl_loss_3": 1370.6250915527344, + "kl_loss_7": 453.40001373291017, + "learning_rate": 0.0009361510595655545, + "loss": 935.6853, + "step": 1710 + }, + { + "ce_loss_10": 3.549048590660095, + "ce_loss_13": 3.4653629899024962, + "ce_loss_2": 4.307630968093872, + "ce_loss_3": 4.094715774059296, + "ce_loss_7": 3.6762614488601684, + "epoch": 0.172, + "grad_norm": 680.0, + "kl_loss_10": 170.3953384399414, + "kl_loss_2": 1762.9258972167968, + "kl_loss_3": 1346.3614562988282, + "kl_loss_7": 449.2826202392578, + "learning_rate": 0.0009353730385598887, + "loss": 922.2734, + "step": 1720 + }, + { + "ce_loss_10": 3.472779655456543, + "ce_loss_13": 3.3950839400291444, + "ce_loss_2": 4.263340127468109, + "ce_loss_3": 4.03793773651123, + "ce_loss_7": 3.6020253181457518, + "epoch": 0.173, + "grad_norm": 656.0, + "kl_loss_10": 163.52980422973633, + "kl_loss_2": 1797.2368774414062, + "kl_loss_3": 1351.431103515625, + "kl_loss_7": 441.22152252197264, + "learning_rate": 0.0009345906333525581, + "loss": 935.6812, + "step": 1730 + }, + { + "ce_loss_10": 3.5214235901832582, + "ce_loss_13": 3.4371795892715453, + "ce_loss_2": 4.292250299453736, + "ce_loss_3": 4.065243577957153, + "ce_loss_7": 3.642275357246399, + "epoch": 0.174, + "grad_norm": 756.0, + "kl_loss_10": 178.52879638671874, + "kl_loss_2": 1812.4777038574218, + "kl_loss_3": 1354.8654235839845, + "kl_loss_7": 448.5812484741211, + "learning_rate": 0.0009338038518223745, + "loss": 926.7967, + "step": 1740 + }, + { + "ce_loss_10": 3.579227292537689, + "ce_loss_13": 3.497506093978882, + "ce_loss_2": 4.353535735607148, + "ce_loss_3": 4.1314038872718815, + "ce_loss_7": 3.712595534324646, + "epoch": 0.175, + "grad_norm": 684.0, + "kl_loss_10": 170.88160781860353, + "kl_loss_2": 1791.0358825683593, + "kl_loss_3": 1358.0457336425782, + "kl_loss_7": 456.49011383056643, + "learning_rate": 0.0009330127018922195, + "loss": 955.6061, + "step": 1750 + }, + { + "ce_loss_10": 3.534073221683502, + "ce_loss_13": 3.4557726621627807, + "ce_loss_2": 4.298114824295044, + "ce_loss_3": 4.071936726570129, + "ce_loss_7": 3.6692795515060426, + "epoch": 0.176, + "grad_norm": 940.0, + "kl_loss_10": 162.63356018066406, + "kl_loss_2": 1777.0078063964843, + "kl_loss_3": 1322.5951354980468, + "kl_loss_7": 452.01353607177737, + "learning_rate": 0.0009322171915289634, + "loss": 925.2979, + "step": 1760 + }, + { + "ce_loss_10": 3.562859356403351, + "ce_loss_13": 3.4936991572380065, + "ce_loss_2": 4.311712801456451, + "ce_loss_3": 4.096149682998657, + "ce_loss_7": 3.695814371109009, + "epoch": 0.177, + "grad_norm": 672.0, + "kl_loss_10": 159.3963638305664, + "kl_loss_2": 1739.3221557617187, + "kl_loss_3": 1308.4884643554688, + "kl_loss_7": 450.83116302490237, + "learning_rate": 0.0009314173287433873, + "loss": 905.2543, + "step": 1770 + }, + { + "ce_loss_10": 3.5641183972358705, + "ce_loss_13": 3.4850690603256225, + "ce_loss_2": 4.308147180080414, + "ce_loss_3": 4.097426617145539, + "ce_loss_7": 3.6933220624923706, + "epoch": 0.178, + "grad_norm": 848.0, + "kl_loss_10": 167.73994827270508, + "kl_loss_2": 1746.2864990234375, + "kl_loss_3": 1316.532891845703, + "kl_loss_7": 455.0190155029297, + "learning_rate": 0.0009306131215901003, + "loss": 906.8408, + "step": 1780 + }, + { + "ce_loss_10": 3.596917653083801, + "ce_loss_13": 3.516416549682617, + "ce_loss_2": 4.341592967510223, + "ce_loss_3": 4.1163853168487545, + "ce_loss_7": 3.717692863941193, + "epoch": 0.179, + "grad_norm": 696.0, + "kl_loss_10": 170.9574432373047, + "kl_loss_2": 1727.8181701660155, + "kl_loss_3": 1299.9059143066406, + "kl_loss_7": 441.5744827270508, + "learning_rate": 0.0009298045781674596, + "loss": 897.6227, + "step": 1790 + }, + { + "ce_loss_10": 3.57572420835495, + "ce_loss_13": 3.4945407032966616, + "ce_loss_2": 4.302236485481262, + "ce_loss_3": 4.08895423412323, + "ce_loss_7": 3.6926202178001404, + "epoch": 0.18, + "grad_norm": 664.0, + "kl_loss_10": 179.89295349121093, + "kl_loss_2": 1700.9044799804688, + "kl_loss_3": 1284.20361328125, + "kl_loss_7": 431.50037536621096, + "learning_rate": 0.0009289917066174886, + "loss": 914.1196, + "step": 1800 + }, + { + "ce_loss_10": 3.574003517627716, + "ce_loss_13": 3.4940902471542357, + "ce_loss_2": 4.277863013744354, + "ce_loss_3": 4.078153216838837, + "ce_loss_7": 3.683695209026337, + "epoch": 0.181, + "grad_norm": 664.0, + "kl_loss_10": 173.87937088012694, + "kl_loss_2": 1662.8412353515625, + "kl_loss_3": 1261.6159118652345, + "kl_loss_7": 416.16939239501954, + "learning_rate": 0.0009281745151257945, + "loss": 889.3567, + "step": 1810 + }, + { + "ce_loss_10": 3.587869346141815, + "ce_loss_13": 3.50655517578125, + "ce_loss_2": 4.322592568397522, + "ce_loss_3": 4.116343176364898, + "ce_loss_7": 3.7045819759368896, + "epoch": 0.182, + "grad_norm": 688.0, + "kl_loss_10": 170.37163772583008, + "kl_loss_2": 1707.6110900878907, + "kl_loss_3": 1315.0409118652344, + "kl_loss_7": 429.47176055908204, + "learning_rate": 0.0009273530119214868, + "loss": 911.8113, + "step": 1820 + }, + { + "ce_loss_10": 3.696093189716339, + "ce_loss_13": 3.6190614104270935, + "ce_loss_2": 4.3983154296875, + "ce_loss_3": 4.195645475387574, + "ce_loss_7": 3.807457995414734, + "epoch": 0.183, + "grad_norm": 800.0, + "kl_loss_10": 165.97553634643555, + "kl_loss_2": 1682.4634155273438, + "kl_loss_3": 1278.163592529297, + "kl_loss_7": 420.69031982421876, + "learning_rate": 0.0009265272052770935, + "loss": 880.8881, + "step": 1830 + }, + { + "ce_loss_10": 3.5015958070755007, + "ce_loss_13": 3.424196720123291, + "ce_loss_2": 4.267678773403167, + "ce_loss_3": 4.0475095987319945, + "ce_loss_7": 3.6268508434295654, + "epoch": 0.184, + "grad_norm": 704.0, + "kl_loss_10": 163.908536529541, + "kl_loss_2": 1749.7696533203125, + "kl_loss_3": 1320.0984130859374, + "kl_loss_7": 427.6091049194336, + "learning_rate": 0.0009256971035084784, + "loss": 912.5401, + "step": 1840 + }, + { + "ce_loss_10": 3.44786741733551, + "ce_loss_13": 3.3659385561943056, + "ce_loss_2": 4.2315644264221195, + "ce_loss_3": 4.008389139175415, + "ce_loss_7": 3.574787473678589, + "epoch": 0.185, + "grad_norm": 708.0, + "kl_loss_10": 172.1600601196289, + "kl_loss_2": 1787.1378540039063, + "kl_loss_3": 1351.4346496582032, + "kl_loss_7": 442.25659637451173, + "learning_rate": 0.0009248627149747573, + "loss": 920.3978, + "step": 1850 + }, + { + "ce_loss_10": 3.6512333154678345, + "ce_loss_13": 3.5744757533073424, + "ce_loss_2": 4.376823902130127, + "ce_loss_3": 4.171487009525299, + "ce_loss_7": 3.772753894329071, + "epoch": 0.186, + "grad_norm": 580.0, + "kl_loss_10": 179.92476654052734, + "kl_loss_2": 1699.939630126953, + "kl_loss_3": 1295.8851440429687, + "kl_loss_7": 434.7665496826172, + "learning_rate": 0.0009240240480782129, + "loss": 904.4956, + "step": 1860 + }, + { + "ce_loss_10": 3.5657613396644594, + "ce_loss_13": 3.4775595903396606, + "ce_loss_2": 4.310595238208771, + "ce_loss_3": 4.096242725849152, + "ce_loss_7": 3.6834920406341554, + "epoch": 0.187, + "grad_norm": 564.0, + "kl_loss_10": 176.3750358581543, + "kl_loss_2": 1732.9331481933593, + "kl_loss_3": 1312.267645263672, + "kl_loss_7": 431.05884704589846, + "learning_rate": 0.0009231811112642122, + "loss": 904.4842, + "step": 1870 + }, + { + "ce_loss_10": 3.5995773315429687, + "ce_loss_13": 3.523776340484619, + "ce_loss_2": 4.318611538410186, + "ce_loss_3": 4.111791932582856, + "ce_loss_7": 3.7252105116844176, + "epoch": 0.188, + "grad_norm": 800.0, + "kl_loss_10": 166.8633888244629, + "kl_loss_2": 1689.3792785644532, + "kl_loss_3": 1285.7919128417968, + "kl_loss_7": 438.99388275146487, + "learning_rate": 0.0009223339130211192, + "loss": 895.5117, + "step": 1880 + }, + { + "ce_loss_10": 3.447600543498993, + "ce_loss_13": 3.3741417050361635, + "ce_loss_2": 4.2147566199302675, + "ce_loss_3": 3.9939645886421205, + "ce_loss_7": 3.574213218688965, + "epoch": 0.189, + "grad_norm": 744.0, + "kl_loss_10": 155.9600372314453, + "kl_loss_2": 1748.6449157714844, + "kl_loss_3": 1322.1059143066407, + "kl_loss_7": 423.45860900878904, + "learning_rate": 0.0009214824618802108, + "loss": 908.6522, + "step": 1890 + }, + { + "ce_loss_10": 3.6377331376075746, + "ce_loss_13": 3.5647199988365172, + "ce_loss_2": 4.387881731986999, + "ce_loss_3": 4.164871621131897, + "ce_loss_7": 3.76009886264801, + "epoch": 0.19, + "grad_norm": 684.0, + "kl_loss_10": 159.65029067993163, + "kl_loss_2": 1726.2060913085938, + "kl_loss_3": 1294.0604736328125, + "kl_loss_7": 432.2201461791992, + "learning_rate": 0.0009206267664155906, + "loss": 913.599, + "step": 1900 + }, + { + "ce_loss_10": 3.547088313102722, + "ce_loss_13": 3.4745959639549255, + "ce_loss_2": 4.296490705013275, + "ce_loss_3": 4.079526424407959, + "ce_loss_7": 3.669068491458893, + "epoch": 0.191, + "grad_norm": 772.0, + "kl_loss_10": 158.69638366699218, + "kl_loss_2": 1713.6190551757813, + "kl_loss_3": 1288.2169921875, + "kl_loss_7": 425.7963638305664, + "learning_rate": 0.0009197668352441024, + "loss": 897.9576, + "step": 1910 + }, + { + "ce_loss_10": 3.604101300239563, + "ce_loss_13": 3.530220317840576, + "ce_loss_2": 4.342615389823914, + "ce_loss_3": 4.1288164258003235, + "ce_loss_7": 3.7257459282875063, + "epoch": 0.192, + "grad_norm": 808.0, + "kl_loss_10": 155.30001373291014, + "kl_loss_2": 1698.1914672851562, + "kl_loss_3": 1284.7894897460938, + "kl_loss_7": 421.06340942382815, + "learning_rate": 0.0009189026770252437, + "loss": 894.5027, + "step": 1920 + }, + { + "ce_loss_10": 3.6317692160606385, + "ce_loss_13": 3.5578647017478944, + "ce_loss_2": 4.357654023170471, + "ce_loss_3": 4.154970502853393, + "ce_loss_7": 3.754297876358032, + "epoch": 0.193, + "grad_norm": 672.0, + "kl_loss_10": 156.60808029174805, + "kl_loss_2": 1681.9849914550782, + "kl_loss_3": 1293.0608825683594, + "kl_loss_7": 425.2352691650391, + "learning_rate": 0.000918034300461078, + "loss": 914.7908, + "step": 1930 + }, + { + "ce_loss_10": 3.661780071258545, + "ce_loss_13": 3.5855299711227415, + "ce_loss_2": 4.3866435289382935, + "ce_loss_3": 4.176734316349029, + "ce_loss_7": 3.780391752719879, + "epoch": 0.194, + "grad_norm": 508.0, + "kl_loss_10": 158.01766357421874, + "kl_loss_2": 1686.9480285644531, + "kl_loss_3": 1278.583349609375, + "kl_loss_7": 423.1416976928711, + "learning_rate": 0.0009171617142961477, + "loss": 886.9112, + "step": 1940 + }, + { + "ce_loss_10": 3.619071674346924, + "ce_loss_13": 3.544144856929779, + "ce_loss_2": 4.3446578741073605, + "ce_loss_3": 4.132830607891083, + "ce_loss_7": 3.7368661999702453, + "epoch": 0.195, + "grad_norm": 736.0, + "kl_loss_10": 156.3160285949707, + "kl_loss_2": 1687.3851745605468, + "kl_loss_3": 1275.4538269042969, + "kl_loss_7": 421.2539489746094, + "learning_rate": 0.0009162849273173857, + "loss": 885.9303, + "step": 1950 + }, + { + "ce_loss_10": 3.553014504909515, + "ce_loss_13": 3.4827070951461794, + "ce_loss_2": 4.291935205459595, + "ce_loss_3": 4.0778828144073485, + "ce_loss_7": 3.6772719860076903, + "epoch": 0.196, + "grad_norm": 628.0, + "kl_loss_10": 154.84951171875, + "kl_loss_2": 1692.2017944335937, + "kl_loss_3": 1271.0808410644531, + "kl_loss_7": 426.93184356689454, + "learning_rate": 0.0009154039483540273, + "loss": 894.1963, + "step": 1960 + }, + { + "ce_loss_10": 3.5380710124969483, + "ce_loss_13": 3.4637695431709288, + "ce_loss_2": 4.2753846645355225, + "ce_loss_3": 4.052942824363709, + "ce_loss_7": 3.656904327869415, + "epoch": 0.197, + "grad_norm": 632.0, + "kl_loss_10": 159.71602325439454, + "kl_loss_2": 1709.4674560546875, + "kl_loss_3": 1272.3471313476562, + "kl_loss_7": 423.2468658447266, + "learning_rate": 0.0009145187862775209, + "loss": 892.0132, + "step": 1970 + }, + { + "ce_loss_10": 3.573553669452667, + "ce_loss_13": 3.503090190887451, + "ce_loss_2": 4.311598265171051, + "ce_loss_3": 4.097398114204407, + "ce_loss_7": 3.6970279932022097, + "epoch": 0.198, + "grad_norm": 640.0, + "kl_loss_10": 162.31361923217773, + "kl_loss_2": 1708.26845703125, + "kl_loss_3": 1289.0806701660156, + "kl_loss_7": 427.25081634521484, + "learning_rate": 0.0009136294500014386, + "loss": 892.4137, + "step": 1980 + }, + { + "ce_loss_10": 3.519877481460571, + "ce_loss_13": 3.445008838176727, + "ce_loss_2": 4.2790512323379515, + "ce_loss_3": 4.054595351219177, + "ce_loss_7": 3.6438548445701597, + "epoch": 0.199, + "grad_norm": 828.0, + "kl_loss_10": 163.45086746215821, + "kl_loss_2": 1735.9828308105468, + "kl_loss_3": 1307.2176513671875, + "kl_loss_7": 429.4031814575195, + "learning_rate": 0.000912735948481387, + "loss": 908.3786, + "step": 1990 + }, + { + "ce_loss_10": 3.553472650051117, + "ce_loss_13": 3.478245162963867, + "ce_loss_2": 4.28654375076294, + "ce_loss_3": 4.0802433609962465, + "ce_loss_7": 3.677069938182831, + "epoch": 0.2, + "grad_norm": 628.0, + "kl_loss_10": 160.29014053344727, + "kl_loss_2": 1707.9494140625, + "kl_loss_3": 1296.2968994140624, + "kl_loss_7": 428.4298263549805, + "learning_rate": 0.0009118382907149164, + "loss": 885.6779, + "step": 2000 + }, + { + "ce_loss_10": 3.5850682854652405, + "ce_loss_13": 3.50952627658844, + "ce_loss_2": 4.315432548522949, + "ce_loss_3": 4.100254940986633, + "ce_loss_7": 3.7033676624298097, + "epoch": 0.201, + "grad_norm": 708.0, + "kl_loss_10": 157.8212142944336, + "kl_loss_2": 1702.1678955078125, + "kl_loss_3": 1284.286181640625, + "kl_loss_7": 423.3880950927734, + "learning_rate": 0.0009109364857414306, + "loss": 884.5561, + "step": 2010 + }, + { + "ce_loss_10": 3.5489532589912414, + "ce_loss_13": 3.4744242310523985, + "ce_loss_2": 4.2670130610466, + "ce_loss_3": 4.060766589641571, + "ce_loss_7": 3.668097496032715, + "epoch": 0.202, + "grad_norm": 728.0, + "kl_loss_10": 156.86791915893554, + "kl_loss_2": 1700.6970947265625, + "kl_loss_3": 1274.6605895996095, + "kl_loss_7": 423.90224609375, + "learning_rate": 0.0009100305426420956, + "loss": 905.0367, + "step": 2020 + }, + { + "ce_loss_10": 3.5054296493530273, + "ce_loss_13": 3.4360422253608705, + "ce_loss_2": 4.28641676902771, + "ce_loss_3": 4.060827207565308, + "ce_loss_7": 3.6314488530158995, + "epoch": 0.203, + "grad_norm": 568.0, + "kl_loss_10": 154.1449821472168, + "kl_loss_2": 1787.4573059082031, + "kl_loss_3": 1334.0317810058593, + "kl_loss_7": 424.69750823974607, + "learning_rate": 0.0009091204705397484, + "loss": 899.0799, + "step": 2030 + }, + { + "ce_loss_10": 3.500208306312561, + "ce_loss_13": 3.4300543308258056, + "ce_loss_2": 4.267745566368103, + "ce_loss_3": 4.045078694820404, + "ce_loss_7": 3.619711148738861, + "epoch": 0.204, + "grad_norm": 668.0, + "kl_loss_10": 156.37296295166016, + "kl_loss_2": 1764.6390991210938, + "kl_loss_3": 1327.2704162597656, + "kl_loss_7": 422.20280303955076, + "learning_rate": 0.0009082062785988049, + "loss": 907.7636, + "step": 2040 + }, + { + "ce_loss_10": 3.639115536212921, + "ce_loss_13": 3.570273053646088, + "ce_loss_2": 4.345433187484741, + "ce_loss_3": 4.141461956501007, + "ce_loss_7": 3.7540995121002196, + "epoch": 0.205, + "grad_norm": 684.0, + "kl_loss_10": 150.40593338012695, + "kl_loss_2": 1673.0178833007812, + "kl_loss_3": 1264.1800903320313, + "kl_loss_7": 411.66319274902344, + "learning_rate": 0.0009072879760251679, + "loss": 886.5711, + "step": 2050 + }, + { + "ce_loss_10": 3.5780452609062197, + "ce_loss_13": 3.506811273097992, + "ce_loss_2": 4.32657071352005, + "ce_loss_3": 4.111942863464355, + "ce_loss_7": 3.7021100521087646, + "epoch": 0.206, + "grad_norm": 848.0, + "kl_loss_10": 153.30594024658203, + "kl_loss_2": 1718.6283569335938, + "kl_loss_3": 1295.268243408203, + "kl_loss_7": 424.54795989990237, + "learning_rate": 0.0009063655720661341, + "loss": 891.0267, + "step": 2060 + }, + { + "ce_loss_10": 3.6308677077293394, + "ce_loss_13": 3.559994864463806, + "ce_loss_2": 4.349410057067871, + "ce_loss_3": 4.144072437286377, + "ce_loss_7": 3.752864146232605, + "epoch": 0.207, + "grad_norm": 656.0, + "kl_loss_10": 154.1874885559082, + "kl_loss_2": 1672.9453002929688, + "kl_loss_3": 1275.2094665527343, + "kl_loss_7": 426.11539459228516, + "learning_rate": 0.000905439076010301, + "loss": 882.46, + "step": 2070 + }, + { + "ce_loss_10": 3.5808544397354125, + "ce_loss_13": 3.509160017967224, + "ce_loss_2": 4.326603198051453, + "ce_loss_3": 4.109063744544983, + "ce_loss_7": 3.705178952217102, + "epoch": 0.208, + "grad_norm": 692.0, + "kl_loss_10": 153.3233413696289, + "kl_loss_2": 1708.8538391113282, + "kl_loss_3": 1284.7677429199218, + "kl_loss_7": 424.9750183105469, + "learning_rate": 0.0009045084971874737, + "loss": 878.8373, + "step": 2080 + }, + { + "ce_loss_10": 3.5615262746810914, + "ce_loss_13": 3.489412307739258, + "ce_loss_2": 4.29884420633316, + "ce_loss_3": 4.0808792352676395, + "ce_loss_7": 3.6813034057617187, + "epoch": 0.209, + "grad_norm": 676.0, + "kl_loss_10": 155.00849456787108, + "kl_loss_2": 1709.5053344726562, + "kl_loss_3": 1283.1550598144531, + "kl_loss_7": 423.63720855712893, + "learning_rate": 0.0009035738449685707, + "loss": 901.9238, + "step": 2090 + }, + { + "ce_loss_10": 3.500594878196716, + "ce_loss_13": 3.4269418120384216, + "ce_loss_2": 4.261061310768127, + "ce_loss_3": 4.0436607837677006, + "ce_loss_7": 3.6208752751350404, + "epoch": 0.21, + "grad_norm": 664.0, + "kl_loss_10": 154.60431098937988, + "kl_loss_2": 1738.2481689453125, + "kl_loss_3": 1307.3330932617187, + "kl_loss_7": 419.5928253173828, + "learning_rate": 0.0009026351287655293, + "loss": 886.3637, + "step": 2100 + }, + { + "ce_loss_10": 3.6964518427848816, + "ce_loss_13": 3.628730046749115, + "ce_loss_2": 4.3766416072845455, + "ce_loss_3": 4.168106377124786, + "ce_loss_7": 3.810632276535034, + "epoch": 0.211, + "grad_norm": 892.0, + "kl_loss_10": 147.90714263916016, + "kl_loss_2": 1608.2051940917968, + "kl_loss_3": 1206.7733154296875, + "kl_loss_7": 407.6394790649414, + "learning_rate": 0.0009016923580312113, + "loss": 848.7596, + "step": 2110 + }, + { + "ce_loss_10": 3.551041543483734, + "ce_loss_13": 3.4838817596435545, + "ce_loss_2": 4.263743901252747, + "ce_loss_3": 4.0499477744102474, + "ce_loss_7": 3.675402212142944, + "epoch": 0.212, + "grad_norm": 560.0, + "kl_loss_10": 151.607958984375, + "kl_loss_2": 1656.1309509277344, + "kl_loss_3": 1250.3619262695313, + "kl_loss_7": 422.76923828125, + "learning_rate": 0.0009007455422593077, + "loss": 885.3857, + "step": 2120 + }, + { + "ce_loss_10": 3.562802243232727, + "ce_loss_13": 3.4917763471603394, + "ce_loss_2": 4.310350394248962, + "ce_loss_3": 4.089577150344849, + "ce_loss_7": 3.6881680369377134, + "epoch": 0.213, + "grad_norm": 596.0, + "kl_loss_10": 154.58464736938475, + "kl_loss_2": 1729.4122314453125, + "kl_loss_3": 1302.9043884277344, + "kl_loss_7": 434.224055480957, + "learning_rate": 0.0008997946909842425, + "loss": 899.9531, + "step": 2130 + }, + { + "ce_loss_10": 3.5820153951644897, + "ce_loss_13": 3.5091649174690245, + "ce_loss_2": 4.356534934043884, + "ce_loss_3": 4.134391415119171, + "ce_loss_7": 3.712181234359741, + "epoch": 0.214, + "grad_norm": 868.0, + "kl_loss_10": 160.32343063354492, + "kl_loss_2": 1772.4649536132813, + "kl_loss_3": 1337.5961608886719, + "kl_loss_7": 449.59600219726565, + "learning_rate": 0.0008988398137810777, + "loss": 897.9156, + "step": 2140 + }, + { + "ce_loss_10": 3.6153340220451353, + "ce_loss_13": 3.5469638228416445, + "ce_loss_2": 4.333649778366089, + "ce_loss_3": 4.1305704593658445, + "ce_loss_7": 3.746591365337372, + "epoch": 0.215, + "grad_norm": 744.0, + "kl_loss_10": 152.21541442871094, + "kl_loss_2": 1670.5354553222655, + "kl_loss_3": 1258.0333435058594, + "kl_loss_7": 436.6368347167969, + "learning_rate": 0.0008978809202654162, + "loss": 878.6867, + "step": 2150 + }, + { + "ce_loss_10": 3.5994900941848753, + "ce_loss_13": 3.523406147956848, + "ce_loss_2": 4.32131108045578, + "ce_loss_3": 4.106508493423462, + "ce_loss_7": 3.71969929933548, + "epoch": 0.216, + "grad_norm": 680.0, + "kl_loss_10": 156.3335433959961, + "kl_loss_2": 1665.5125793457032, + "kl_loss_3": 1255.7060974121093, + "kl_loss_7": 430.11304168701173, + "learning_rate": 0.0008969180200933046, + "loss": 882.8723, + "step": 2160 + }, + { + "ce_loss_10": 3.5530227780342103, + "ce_loss_13": 3.4805894613265993, + "ce_loss_2": 4.314852666854859, + "ce_loss_3": 4.093249773979187, + "ce_loss_7": 3.68630450963974, + "epoch": 0.217, + "grad_norm": 884.0, + "kl_loss_10": 159.62687911987305, + "kl_loss_2": 1730.9878967285156, + "kl_loss_3": 1299.03779296875, + "kl_loss_7": 449.18050384521484, + "learning_rate": 0.0008959511229611376, + "loss": 904.2477, + "step": 2170 + }, + { + "ce_loss_10": 3.6367149114608766, + "ce_loss_13": 3.5638245701789857, + "ce_loss_2": 4.346567583084107, + "ce_loss_3": 4.133716177940369, + "ce_loss_7": 3.7738547563552856, + "epoch": 0.218, + "grad_norm": 1040.0, + "kl_loss_10": 154.5606544494629, + "kl_loss_2": 1663.1614990234375, + "kl_loss_3": 1249.802392578125, + "kl_loss_7": 464.9475952148438, + "learning_rate": 0.0008949802386055581, + "loss": 886.1459, + "step": 2180 + }, + { + "ce_loss_10": 3.4948283553123476, + "ce_loss_13": 3.422482490539551, + "ce_loss_2": 4.231684553623199, + "ce_loss_3": 4.012719178199768, + "ce_loss_7": 3.651959717273712, + "epoch": 0.219, + "grad_norm": 848.0, + "kl_loss_10": 152.99597930908203, + "kl_loss_2": 1675.9751953125, + "kl_loss_3": 1256.631854248047, + "kl_loss_7": 486.07566986083987, + "learning_rate": 0.0008940053768033609, + "loss": 903.9787, + "step": 2190 + }, + { + "ce_loss_10": 3.5814868688583372, + "ce_loss_13": 3.511065399646759, + "ce_loss_2": 4.282150137424469, + "ce_loss_3": 4.074921286106109, + "ce_loss_7": 3.7124979138374328, + "epoch": 0.22, + "grad_norm": 612.0, + "kl_loss_10": 151.5022117614746, + "kl_loss_2": 1646.2027404785156, + "kl_loss_3": 1238.9055969238282, + "kl_loss_7": 446.9908981323242, + "learning_rate": 0.0008930265473713938, + "loss": 875.279, + "step": 2200 + }, + { + "ce_loss_10": 3.545922613143921, + "ce_loss_13": 3.4695589780807494, + "ce_loss_2": 4.269280636310578, + "ce_loss_3": 4.051465404033661, + "ce_loss_7": 3.678086221218109, + "epoch": 0.221, + "grad_norm": 720.0, + "kl_loss_10": 159.6601760864258, + "kl_loss_2": 1673.8038024902344, + "kl_loss_3": 1248.3465393066406, + "kl_loss_7": 435.1474182128906, + "learning_rate": 0.0008920437601664579, + "loss": 865.5202, + "step": 2210 + }, + { + "ce_loss_10": 3.540095329284668, + "ce_loss_13": 3.461512506008148, + "ce_loss_2": 4.25678983926773, + "ce_loss_3": 4.043842458724976, + "ce_loss_7": 3.6578521966934203, + "epoch": 0.222, + "grad_norm": 696.0, + "kl_loss_10": 164.2334945678711, + "kl_loss_2": 1687.953759765625, + "kl_loss_3": 1269.6444458007813, + "kl_loss_7": 430.08106689453126, + "learning_rate": 0.0008910570250852097, + "loss": 872.3879, + "step": 2220 + }, + { + "ce_loss_10": 3.645900297164917, + "ce_loss_13": 3.5746286392211912, + "ce_loss_2": 4.329179430007935, + "ce_loss_3": 4.126194024085999, + "ce_loss_7": 3.7611724615097044, + "epoch": 0.223, + "grad_norm": 580.0, + "kl_loss_10": 161.01494750976562, + "kl_loss_2": 1616.48173828125, + "kl_loss_3": 1212.9592407226562, + "kl_loss_7": 411.58105926513673, + "learning_rate": 0.0008900663520640604, + "loss": 857.1105, + "step": 2230 + }, + { + "ce_loss_10": 3.5964877367019654, + "ce_loss_13": 3.51711448431015, + "ce_loss_2": 4.304993438720703, + "ce_loss_3": 4.098519861698151, + "ce_loss_7": 3.7116694688796996, + "epoch": 0.224, + "grad_norm": 736.0, + "kl_loss_10": 163.17066192626953, + "kl_loss_2": 1651.7649719238282, + "kl_loss_3": 1242.3382141113282, + "kl_loss_7": 411.6787506103516, + "learning_rate": 0.0008890717510790764, + "loss": 876.1222, + "step": 2240 + }, + { + "ce_loss_10": 3.5471703886985777, + "ce_loss_13": 3.4758081436157227, + "ce_loss_2": 4.273530685901642, + "ce_loss_3": 4.067912590503693, + "ce_loss_7": 3.6660157322883604, + "epoch": 0.225, + "grad_norm": 708.0, + "kl_loss_10": 157.27454681396483, + "kl_loss_2": 1677.1010131835938, + "kl_loss_3": 1277.8482238769532, + "kl_loss_7": 413.5911529541016, + "learning_rate": 0.0008880732321458784, + "loss": 886.1244, + "step": 2250 + }, + { + "ce_loss_10": 3.584555244445801, + "ce_loss_13": 3.5100926637649534, + "ce_loss_2": 4.294793701171875, + "ce_loss_3": 4.089890396595001, + "ce_loss_7": 3.6978877186775208, + "epoch": 0.226, + "grad_norm": 668.0, + "kl_loss_10": 156.83258056640625, + "kl_loss_2": 1654.9069946289062, + "kl_loss_3": 1258.4203796386719, + "kl_loss_7": 411.3527374267578, + "learning_rate": 0.0008870708053195413, + "loss": 882.6709, + "step": 2260 + }, + { + "ce_loss_10": 3.6060186505317686, + "ce_loss_13": 3.537421989440918, + "ce_loss_2": 4.294502913951874, + "ce_loss_3": 4.09798412322998, + "ce_loss_7": 3.7177546858787536, + "epoch": 0.227, + "grad_norm": 752.0, + "kl_loss_10": 151.7200439453125, + "kl_loss_2": 1629.2985412597657, + "kl_loss_3": 1232.4538330078126, + "kl_loss_7": 405.0216552734375, + "learning_rate": 0.0008860644806944918, + "loss": 858.0595, + "step": 2270 + }, + { + "ce_loss_10": 3.5466419100761413, + "ce_loss_13": 3.471850037574768, + "ce_loss_2": 4.279071676731109, + "ce_loss_3": 4.0602316617965695, + "ce_loss_7": 3.6642366647720337, + "epoch": 0.228, + "grad_norm": 732.0, + "kl_loss_10": 159.87943267822266, + "kl_loss_2": 1702.7948913574219, + "kl_loss_3": 1281.67705078125, + "kl_loss_7": 420.01871795654296, + "learning_rate": 0.0008850542684044079, + "loss": 867.6785, + "step": 2280 + }, + { + "ce_loss_10": 3.5172786831855776, + "ce_loss_13": 3.442570173740387, + "ce_loss_2": 4.280114269256591, + "ce_loss_3": 4.0603335976600645, + "ce_loss_7": 3.642946183681488, + "epoch": 0.229, + "grad_norm": 868.0, + "kl_loss_10": 165.0769485473633, + "kl_loss_2": 1756.4314086914062, + "kl_loss_3": 1316.273486328125, + "kl_loss_7": 426.0765625, + "learning_rate": 0.0008840401786221159, + "loss": 888.6179, + "step": 2290 + }, + { + "ce_loss_10": 3.653991627693176, + "ce_loss_13": 3.587805616855621, + "ce_loss_2": 4.348642802238464, + "ce_loss_3": 4.1388364911079405, + "ce_loss_7": 3.767448306083679, + "epoch": 0.23, + "grad_norm": 660.0, + "kl_loss_10": 150.6523193359375, + "kl_loss_2": 1617.4279296875, + "kl_loss_3": 1219.0333129882813, + "kl_loss_7": 399.7216049194336, + "learning_rate": 0.000883022221559489, + "loss": 847.182, + "step": 2300 + }, + { + "ce_loss_10": 3.6083556771278382, + "ce_loss_13": 3.5367133378982545, + "ce_loss_2": 4.322761964797974, + "ce_loss_3": 4.111302089691162, + "ce_loss_7": 3.720748817920685, + "epoch": 0.231, + "grad_norm": 700.0, + "kl_loss_10": 152.93154830932616, + "kl_loss_2": 1665.5345397949218, + "kl_loss_3": 1251.20830078125, + "kl_loss_7": 409.15943145751953, + "learning_rate": 0.0008820004074673434, + "loss": 890.3789, + "step": 2310 + }, + { + "ce_loss_10": 3.51333087682724, + "ce_loss_13": 3.4452512502670287, + "ce_loss_2": 4.2334395289421085, + "ce_loss_3": 4.0242817282676695, + "ce_loss_7": 3.6292155742645265, + "epoch": 0.232, + "grad_norm": 852.0, + "kl_loss_10": 146.69402542114258, + "kl_loss_2": 1683.8106201171875, + "kl_loss_3": 1269.85615234375, + "kl_loss_7": 408.7549362182617, + "learning_rate": 0.0008809747466353355, + "loss": 861.5091, + "step": 2320 + }, + { + "ce_loss_10": 3.5223829627037047, + "ce_loss_13": 3.4536111116409303, + "ce_loss_2": 4.234097373485565, + "ce_loss_3": 4.026098394393921, + "ce_loss_7": 3.636929678916931, + "epoch": 0.233, + "grad_norm": 840.0, + "kl_loss_10": 149.8192039489746, + "kl_loss_2": 1664.0373168945312, + "kl_loss_3": 1257.5840637207032, + "kl_loss_7": 404.71526489257815, + "learning_rate": 0.0008799452493918585, + "loss": 871.5133, + "step": 2330 + }, + { + "ce_loss_10": 3.60318318605423, + "ce_loss_13": 3.535007989406586, + "ce_loss_2": 4.313662338256836, + "ce_loss_3": 4.121019208431244, + "ce_loss_7": 3.7191484212875365, + "epoch": 0.234, + "grad_norm": 620.0, + "kl_loss_10": 147.37739295959472, + "kl_loss_2": 1648.7771301269531, + "kl_loss_3": 1272.6883178710937, + "kl_loss_7": 401.2149230957031, + "learning_rate": 0.0008789119261039385, + "loss": 889.3826, + "step": 2340 + }, + { + "ce_loss_10": 3.512122702598572, + "ce_loss_13": 3.444090461730957, + "ce_loss_2": 4.230511236190796, + "ce_loss_3": 4.033337998390198, + "ce_loss_7": 3.623887372016907, + "epoch": 0.235, + "grad_norm": 584.0, + "kl_loss_10": 146.48653411865234, + "kl_loss_2": 1640.4592712402343, + "kl_loss_3": 1250.2205444335937, + "kl_loss_7": 398.50293121337893, + "learning_rate": 0.0008778747871771292, + "loss": 851.5988, + "step": 2350 + }, + { + "ce_loss_10": 3.560721528530121, + "ce_loss_13": 3.495018112659454, + "ce_loss_2": 4.254593050479889, + "ce_loss_3": 4.062426352500916, + "ce_loss_7": 3.675720953941345, + "epoch": 0.236, + "grad_norm": 704.0, + "kl_loss_10": 143.95477371215821, + "kl_loss_2": 1612.37734375, + "kl_loss_3": 1231.095751953125, + "kl_loss_7": 391.27414855957034, + "learning_rate": 0.0008768338430554083, + "loss": 843.9201, + "step": 2360 + }, + { + "ce_loss_10": 3.5749622344970704, + "ce_loss_13": 3.5057023644447325, + "ce_loss_2": 4.284319353103638, + "ce_loss_3": 4.084056878089905, + "ce_loss_7": 3.691650938987732, + "epoch": 0.237, + "grad_norm": 748.0, + "kl_loss_10": 149.22578506469728, + "kl_loss_2": 1642.1105163574218, + "kl_loss_3": 1249.0818786621094, + "kl_loss_7": 417.6405731201172, + "learning_rate": 0.0008757891042210713, + "loss": 868.1691, + "step": 2370 + }, + { + "ce_loss_10": 3.593665659427643, + "ce_loss_13": 3.52440140247345, + "ce_loss_2": 4.300392985343933, + "ce_loss_3": 4.098441755771637, + "ce_loss_7": 3.7135939955711366, + "epoch": 0.238, + "grad_norm": 668.0, + "kl_loss_10": 148.62272415161132, + "kl_loss_2": 1634.145733642578, + "kl_loss_3": 1239.5517639160157, + "kl_loss_7": 415.1738784790039, + "learning_rate": 0.0008747405811946271, + "loss": 862.5363, + "step": 2380 + }, + { + "ce_loss_10": 3.490070474147797, + "ce_loss_13": 3.4224871158599854, + "ce_loss_2": 4.224261665344239, + "ce_loss_3": 4.004424059391022, + "ce_loss_7": 3.6100102066993713, + "epoch": 0.239, + "grad_norm": 652.0, + "kl_loss_10": 147.28631515502929, + "kl_loss_2": 1686.8346069335937, + "kl_loss_3": 1270.0725341796874, + "kl_loss_7": 419.5466110229492, + "learning_rate": 0.0008736882845346905, + "loss": 852.6951, + "step": 2390 + }, + { + "ce_loss_10": 3.585322046279907, + "ce_loss_13": 3.5153682231903076, + "ce_loss_2": 4.304745924472809, + "ce_loss_3": 4.094923424720764, + "ce_loss_7": 3.7078854203224183, + "epoch": 0.24, + "grad_norm": 692.0, + "kl_loss_10": 151.89797973632812, + "kl_loss_2": 1647.5736877441407, + "kl_loss_3": 1242.0455993652345, + "kl_loss_7": 421.9617263793945, + "learning_rate": 0.0008726322248378774, + "loss": 857.3443, + "step": 2400 + }, + { + "ce_loss_10": 3.583414626121521, + "ce_loss_13": 3.515112745761871, + "ce_loss_2": 4.311495113372803, + "ce_loss_3": 4.099529230594635, + "ce_loss_7": 3.7044310569763184, + "epoch": 0.241, + "grad_norm": 652.0, + "kl_loss_10": 147.43828582763672, + "kl_loss_2": 1680.8733215332031, + "kl_loss_3": 1264.9752563476563, + "kl_loss_7": 411.56260681152344, + "learning_rate": 0.0008715724127386971, + "loss": 878.9705, + "step": 2410 + }, + { + "ce_loss_10": 3.6531842708587647, + "ce_loss_13": 3.587493336200714, + "ce_loss_2": 4.3451045751571655, + "ce_loss_3": 4.142902874946595, + "ce_loss_7": 3.767136585712433, + "epoch": 0.242, + "grad_norm": 620.0, + "kl_loss_10": 146.30834274291993, + "kl_loss_2": 1624.2251770019532, + "kl_loss_3": 1225.6600646972656, + "kl_loss_7": 408.7913787841797, + "learning_rate": 0.0008705088589094458, + "loss": 860.6354, + "step": 2420 + }, + { + "ce_loss_10": 3.665231490135193, + "ce_loss_13": 3.599679338932037, + "ce_loss_2": 4.36877521276474, + "ce_loss_3": 4.171533620357513, + "ce_loss_7": 3.787107288837433, + "epoch": 0.243, + "grad_norm": 664.0, + "kl_loss_10": 144.3328182220459, + "kl_loss_2": 1632.8966247558594, + "kl_loss_3": 1233.8675537109375, + "kl_loss_7": 410.8569137573242, + "learning_rate": 0.0008694415740600988, + "loss": 860.5134, + "step": 2430 + }, + { + "ce_loss_10": 3.5181034803390503, + "ce_loss_13": 3.4537795782089233, + "ce_loss_2": 4.249938941001892, + "ce_loss_3": 4.046365630626679, + "ce_loss_7": 3.6414516806602477, + "epoch": 0.244, + "grad_norm": 704.0, + "kl_loss_10": 145.3531593322754, + "kl_loss_2": 1684.775213623047, + "kl_loss_3": 1287.4376159667968, + "kl_loss_7": 412.328303527832, + "learning_rate": 0.0008683705689382025, + "loss": 869.7711, + "step": 2440 + }, + { + "ce_loss_10": 3.6008686304092405, + "ce_loss_13": 3.5355240225791933, + "ce_loss_2": 4.291689145565033, + "ce_loss_3": 4.094740498065948, + "ce_loss_7": 3.7141705632209776, + "epoch": 0.245, + "grad_norm": 760.0, + "kl_loss_10": 143.44989089965821, + "kl_loss_2": 1616.675146484375, + "kl_loss_3": 1220.3111206054687, + "kl_loss_7": 396.51316223144534, + "learning_rate": 0.0008672958543287666, + "loss": 868.0814, + "step": 2450 + }, + { + "ce_loss_10": 3.6143836855888365, + "ce_loss_13": 3.544294059276581, + "ce_loss_2": 4.300798118114471, + "ce_loss_3": 4.102933740615844, + "ce_loss_7": 3.72817702293396, + "epoch": 0.246, + "grad_norm": 736.0, + "kl_loss_10": 147.29279327392578, + "kl_loss_2": 1613.1561401367187, + "kl_loss_3": 1222.6258117675782, + "kl_loss_7": 401.4141021728516, + "learning_rate": 0.0008662174410541554, + "loss": 850.6215, + "step": 2460 + }, + { + "ce_loss_10": 3.574166786670685, + "ce_loss_13": 3.5087246656417848, + "ce_loss_2": 4.2659319877624515, + "ce_loss_3": 4.068793642520904, + "ce_loss_7": 3.686735737323761, + "epoch": 0.247, + "grad_norm": 604.0, + "kl_loss_10": 145.85303649902343, + "kl_loss_2": 1618.3893676757812, + "kl_loss_3": 1220.1310791015626, + "kl_loss_7": 400.6472534179687, + "learning_rate": 0.0008651353399739787, + "loss": 863.9098, + "step": 2470 + }, + { + "ce_loss_10": 3.6058215737342834, + "ce_loss_13": 3.539613950252533, + "ce_loss_2": 4.302924489974975, + "ce_loss_3": 4.098112308979035, + "ce_loss_7": 3.71759934425354, + "epoch": 0.248, + "grad_norm": 548.0, + "kl_loss_10": 147.60094833374023, + "kl_loss_2": 1624.6452575683593, + "kl_loss_3": 1222.2023498535157, + "kl_loss_7": 401.4662857055664, + "learning_rate": 0.0008640495619849821, + "loss": 849.9955, + "step": 2480 + }, + { + "ce_loss_10": 3.567488098144531, + "ce_loss_13": 3.501059103012085, + "ce_loss_2": 4.264333772659302, + "ce_loss_3": 4.056195986270905, + "ce_loss_7": 3.6804752707481385, + "epoch": 0.249, + "grad_norm": 712.0, + "kl_loss_10": 147.02945251464843, + "kl_loss_2": 1627.2623229980468, + "kl_loss_3": 1233.1324645996094, + "kl_loss_7": 402.8577346801758, + "learning_rate": 0.0008629601180209381, + "loss": 850.3543, + "step": 2490 + }, + { + "ce_loss_10": 3.5611565947532653, + "ce_loss_13": 3.492980194091797, + "ce_loss_2": 4.257801342010498, + "ce_loss_3": 4.052403700351715, + "ce_loss_7": 3.6730109333992003, + "epoch": 0.25, + "grad_norm": 800.0, + "kl_loss_10": 150.6272880554199, + "kl_loss_2": 1621.1947387695313, + "kl_loss_3": 1217.2358276367188, + "kl_loss_7": 403.2303207397461, + "learning_rate": 0.000861867019052535, + "loss": 856.4564, + "step": 2500 + }, + { + "ce_loss_10": 3.472098398208618, + "ce_loss_13": 3.402975833415985, + "ce_loss_2": 4.218045926094055, + "ce_loss_3": 3.995682156085968, + "ce_loss_7": 3.5920477628707888, + "epoch": 0.251, + "grad_norm": 756.0, + "kl_loss_10": 148.6118423461914, + "kl_loss_2": 1683.9299011230469, + "kl_loss_3": 1261.464288330078, + "kl_loss_7": 407.4490676879883, + "learning_rate": 0.0008607702760872678, + "loss": 870.0967, + "step": 2510 + }, + { + "ce_loss_10": 3.5914869785308836, + "ce_loss_13": 3.5267413854599, + "ce_loss_2": 4.272609853744507, + "ce_loss_3": 4.077222716808319, + "ce_loss_7": 3.702296590805054, + "epoch": 0.252, + "grad_norm": 824.0, + "kl_loss_10": 142.76109085083007, + "kl_loss_2": 1589.925390625, + "kl_loss_3": 1204.9856811523437, + "kl_loss_7": 391.77073822021487, + "learning_rate": 0.0008596699001693256, + "loss": 856.7367, + "step": 2520 + }, + { + "ce_loss_10": 3.6065261363983154, + "ce_loss_13": 3.542446482181549, + "ce_loss_2": 4.280361306667328, + "ce_loss_3": 4.078390645980835, + "ce_loss_7": 3.713554584980011, + "epoch": 0.253, + "grad_norm": 580.0, + "kl_loss_10": 146.36550064086913, + "kl_loss_2": 1602.360205078125, + "kl_loss_3": 1201.24404296875, + "kl_loss_7": 391.7824478149414, + "learning_rate": 0.0008585659023794818, + "loss": 855.3571, + "step": 2530 + }, + { + "ce_loss_10": 3.556603026390076, + "ce_loss_13": 3.488174021244049, + "ce_loss_2": 4.2837646961212155, + "ce_loss_3": 4.073796629905701, + "ce_loss_7": 3.669671726226807, + "epoch": 0.254, + "grad_norm": 568.0, + "kl_loss_10": 150.27775192260742, + "kl_loss_2": 1671.5778869628907, + "kl_loss_3": 1258.693133544922, + "kl_loss_7": 409.5189697265625, + "learning_rate": 0.0008574582938349817, + "loss": 864.698, + "step": 2540 + }, + { + "ce_loss_10": 3.5615516781806944, + "ce_loss_13": 3.48288893699646, + "ce_loss_2": 4.289009380340576, + "ce_loss_3": 4.085076451301575, + "ce_loss_7": 3.6758302092552184, + "epoch": 0.255, + "grad_norm": 636.0, + "kl_loss_10": 162.17492446899413, + "kl_loss_2": 1689.662127685547, + "kl_loss_3": 1286.3527221679688, + "kl_loss_7": 415.7776596069336, + "learning_rate": 0.0008563470856894315, + "loss": 856.5075, + "step": 2550 + }, + { + "ce_loss_10": 3.5482829809188843, + "ce_loss_13": 3.4777650833129883, + "ce_loss_2": 4.255946707725525, + "ce_loss_3": 4.048075020313263, + "ce_loss_7": 3.656572926044464, + "epoch": 0.256, + "grad_norm": 784.0, + "kl_loss_10": 158.6891746520996, + "kl_loss_2": 1642.1806335449219, + "kl_loss_3": 1246.0239196777343, + "kl_loss_7": 399.031315612793, + "learning_rate": 0.0008552322891326845, + "loss": 858.266, + "step": 2560 + }, + { + "ce_loss_10": 3.5313944816589355, + "ce_loss_13": 3.450025427341461, + "ce_loss_2": 4.224313974380493, + "ce_loss_3": 4.01665427684784, + "ce_loss_7": 3.628642737865448, + "epoch": 0.257, + "grad_norm": 756.0, + "kl_loss_10": 179.5360984802246, + "kl_loss_2": 1649.637109375, + "kl_loss_3": 1240.5775268554687, + "kl_loss_7": 401.7601791381836, + "learning_rate": 0.0008541139153907296, + "loss": 857.8164, + "step": 2570 + }, + { + "ce_loss_10": 3.486237919330597, + "ce_loss_13": 3.406646740436554, + "ce_loss_2": 4.1815930843353275, + "ce_loss_3": 3.9679081320762633, + "ce_loss_7": 3.589344394207001, + "epoch": 0.258, + "grad_norm": 580.0, + "kl_loss_10": 172.96023330688476, + "kl_loss_2": 1636.2860229492187, + "kl_loss_3": 1219.9958557128907, + "kl_loss_7": 400.6501800537109, + "learning_rate": 0.0008529919757255782, + "loss": 859.2726, + "step": 2580 + }, + { + "ce_loss_10": 3.514637219905853, + "ce_loss_13": 3.442402172088623, + "ce_loss_2": 4.176178085803985, + "ce_loss_3": 3.971681094169617, + "ce_loss_7": 3.6139083743095397, + "epoch": 0.259, + "grad_norm": 592.0, + "kl_loss_10": 162.93561325073242, + "kl_loss_2": 1568.500262451172, + "kl_loss_3": 1166.9389770507812, + "kl_loss_7": 392.92857513427737, + "learning_rate": 0.0008518664814351503, + "loss": 832.6326, + "step": 2590 + }, + { + "ce_loss_10": 3.481887364387512, + "ce_loss_13": 3.4060272455215452, + "ce_loss_2": 4.2031479477882385, + "ce_loss_3": 3.989342713356018, + "ce_loss_7": 3.5949572801589964, + "epoch": 0.26, + "grad_norm": 812.0, + "kl_loss_10": 160.07810974121094, + "kl_loss_2": 1678.1281677246093, + "kl_loss_3": 1261.206689453125, + "kl_loss_7": 422.9075729370117, + "learning_rate": 0.0008507374438531607, + "loss": 893.9762, + "step": 2600 + }, + { + "ce_loss_10": 3.457890176773071, + "ce_loss_13": 3.3876611232757567, + "ce_loss_2": 4.156854557991028, + "ce_loss_3": 3.951684260368347, + "ce_loss_7": 3.5687990188598633, + "epoch": 0.261, + "grad_norm": 652.0, + "kl_loss_10": 151.19789123535156, + "kl_loss_2": 1623.104559326172, + "kl_loss_3": 1217.5732849121093, + "kl_loss_7": 405.58522033691406, + "learning_rate": 0.0008496048743490053, + "loss": 847.8467, + "step": 2610 + }, + { + "ce_loss_10": 3.606453371047974, + "ce_loss_13": 3.5347030401229858, + "ce_loss_2": 4.287022602558136, + "ce_loss_3": 4.085285770893097, + "ce_loss_7": 3.7209227800369264, + "epoch": 0.262, + "grad_norm": 740.0, + "kl_loss_10": 149.46137313842775, + "kl_loss_2": 1597.4227478027344, + "kl_loss_3": 1202.5909423828125, + "kl_loss_7": 406.67049102783204, + "learning_rate": 0.0008484687843276469, + "loss": 841.793, + "step": 2620 + }, + { + "ce_loss_10": 3.5375467777252196, + "ce_loss_13": 3.4692538261413572, + "ce_loss_2": 4.234356260299682, + "ce_loss_3": 4.030397474765778, + "ce_loss_7": 3.665135991573334, + "epoch": 0.263, + "grad_norm": 756.0, + "kl_loss_10": 148.52831802368163, + "kl_loss_2": 1626.5234375, + "kl_loss_3": 1230.9505981445313, + "kl_loss_7": 421.18434143066406, + "learning_rate": 0.0008473291852294987, + "loss": 864.2529, + "step": 2630 + }, + { + "ce_loss_10": 3.5409929156303406, + "ce_loss_13": 3.474526059627533, + "ce_loss_2": 4.241577196121216, + "ce_loss_3": 4.039238381385803, + "ce_loss_7": 3.658675765991211, + "epoch": 0.264, + "grad_norm": 752.0, + "kl_loss_10": 146.5441551208496, + "kl_loss_2": 1638.5023559570313, + "kl_loss_3": 1236.1968872070313, + "kl_loss_7": 412.63101959228516, + "learning_rate": 0.0008461860885303114, + "loss": 847.5814, + "step": 2640 + }, + { + "ce_loss_10": 3.5672938942909242, + "ce_loss_13": 3.5024590492248535, + "ce_loss_2": 4.25400961637497, + "ce_loss_3": 4.04265718460083, + "ce_loss_7": 3.682517182826996, + "epoch": 0.265, + "grad_norm": 532.0, + "kl_loss_10": 143.3494441986084, + "kl_loss_2": 1592.2557861328125, + "kl_loss_3": 1193.9100402832032, + "kl_loss_7": 399.56408233642577, + "learning_rate": 0.000845039505741056, + "loss": 840.8137, + "step": 2650 + }, + { + "ce_loss_10": 3.554959547519684, + "ce_loss_13": 3.4861936926841737, + "ce_loss_2": 4.258109152317047, + "ce_loss_3": 4.058207333087921, + "ce_loss_7": 3.673966348171234, + "epoch": 0.266, + "grad_norm": 668.0, + "kl_loss_10": 145.93713989257813, + "kl_loss_2": 1655.0006469726563, + "kl_loss_3": 1257.4821166992188, + "kl_loss_7": 414.46795959472655, + "learning_rate": 0.0008438894484078086, + "loss": 882.0232, + "step": 2660 + }, + { + "ce_loss_10": 3.5612363696098326, + "ce_loss_13": 3.495659518241882, + "ce_loss_2": 4.250771713256836, + "ce_loss_3": 4.0548901677131655, + "ce_loss_7": 3.6741419315338133, + "epoch": 0.267, + "grad_norm": 664.0, + "kl_loss_10": 143.76045303344728, + "kl_loss_2": 1608.0756408691407, + "kl_loss_3": 1221.9843811035157, + "kl_loss_7": 403.46703948974607, + "learning_rate": 0.0008427359281116334, + "loss": 850.6983, + "step": 2670 + }, + { + "ce_loss_10": 3.4647863864898683, + "ce_loss_13": 3.4007498025894165, + "ce_loss_2": 4.182628095149994, + "ce_loss_3": 3.980172896385193, + "ce_loss_7": 3.581213617324829, + "epoch": 0.268, + "grad_norm": 592.0, + "kl_loss_10": 142.86443824768065, + "kl_loss_2": 1652.2051391601562, + "kl_loss_3": 1244.7330932617188, + "kl_loss_7": 401.0715042114258, + "learning_rate": 0.0008415789564684673, + "loss": 856.052, + "step": 2680 + }, + { + "ce_loss_10": 3.7105233311653136, + "ce_loss_13": 3.6397859692573546, + "ce_loss_2": 4.390112090110779, + "ce_loss_3": 4.1913481712341305, + "ce_loss_7": 3.827344071865082, + "epoch": 0.269, + "grad_norm": 616.0, + "kl_loss_10": 151.1160675048828, + "kl_loss_2": 1576.1515380859375, + "kl_loss_3": 1196.6380004882812, + "kl_loss_7": 409.57061157226565, + "learning_rate": 0.0008404185451290017, + "loss": 832.6562, + "step": 2690 + }, + { + "ce_loss_10": 3.575976026058197, + "ce_loss_13": 3.509383165836334, + "ce_loss_2": 4.262010288238526, + "ce_loss_3": 4.061827218532562, + "ce_loss_7": 3.6923381447792054, + "epoch": 0.27, + "grad_norm": 828.0, + "kl_loss_10": 145.46635818481445, + "kl_loss_2": 1614.2012756347656, + "kl_loss_3": 1211.5230590820313, + "kl_loss_7": 400.46296844482424, + "learning_rate": 0.0008392547057785661, + "loss": 840.9771, + "step": 2700 + }, + { + "ce_loss_10": 3.504245734214783, + "ce_loss_13": 3.4360852599143983, + "ce_loss_2": 4.231841135025024, + "ce_loss_3": 4.012355697154999, + "ce_loss_7": 3.6181442975997924, + "epoch": 0.271, + "grad_norm": 732.0, + "kl_loss_10": 148.68926086425782, + "kl_loss_2": 1695.421563720703, + "kl_loss_3": 1262.842755126953, + "kl_loss_7": 405.0101089477539, + "learning_rate": 0.0008380874501370098, + "loss": 846.7374, + "step": 2710 + }, + { + "ce_loss_10": 3.4979911923408507, + "ce_loss_13": 3.430197703838348, + "ce_loss_2": 4.2191136360168455, + "ce_loss_3": 4.007521188259124, + "ce_loss_7": 3.6158869743347166, + "epoch": 0.272, + "grad_norm": 816.0, + "kl_loss_10": 150.7740036010742, + "kl_loss_2": 1669.1359069824218, + "kl_loss_3": 1252.4361511230468, + "kl_loss_7": 410.4910232543945, + "learning_rate": 0.0008369167899585841, + "loss": 863.3943, + "step": 2720 + }, + { + "ce_loss_10": 3.622397780418396, + "ce_loss_13": 3.5552627205848695, + "ce_loss_2": 4.291873097419739, + "ce_loss_3": 4.092212498188019, + "ce_loss_7": 3.728976047039032, + "epoch": 0.273, + "grad_norm": 532.0, + "kl_loss_10": 145.40375938415528, + "kl_loss_2": 1578.4437866210938, + "kl_loss_3": 1183.6333801269532, + "kl_loss_7": 390.3755645751953, + "learning_rate": 0.0008357427370318238, + "loss": 851.5875, + "step": 2730 + }, + { + "ce_loss_10": 3.57169429063797, + "ce_loss_13": 3.5056095838546755, + "ce_loss_2": 4.271306252479553, + "ce_loss_3": 4.060312724113464, + "ce_loss_7": 3.684804010391235, + "epoch": 0.274, + "grad_norm": 936.0, + "kl_loss_10": 145.32760620117188, + "kl_loss_2": 1629.5275451660157, + "kl_loss_3": 1221.0515014648438, + "kl_loss_7": 395.46714477539064, + "learning_rate": 0.0008345653031794292, + "loss": 853.4608, + "step": 2740 + }, + { + "ce_loss_10": 3.5738168597221374, + "ce_loss_13": 3.507299304008484, + "ce_loss_2": 4.262202572822571, + "ce_loss_3": 4.064274084568024, + "ce_loss_7": 3.6900595307350157, + "epoch": 0.275, + "grad_norm": 664.0, + "kl_loss_10": 146.08263397216797, + "kl_loss_2": 1609.2401916503907, + "kl_loss_3": 1212.829638671875, + "kl_loss_7": 400.2176742553711, + "learning_rate": 0.0008333845002581458, + "loss": 847.099, + "step": 2750 + }, + { + "ce_loss_10": 3.4940132975578306, + "ce_loss_13": 3.4292171597480774, + "ce_loss_2": 4.21322615146637, + "ce_loss_3": 4.004913711547852, + "ce_loss_7": 3.6087831974029543, + "epoch": 0.276, + "grad_norm": 588.0, + "kl_loss_10": 146.3518180847168, + "kl_loss_2": 1678.6763549804687, + "kl_loss_3": 1262.9351531982422, + "kl_loss_7": 407.4271759033203, + "learning_rate": 0.0008322003401586462, + "loss": 867.0438, + "step": 2760 + }, + { + "ce_loss_10": 3.5370134353637694, + "ce_loss_13": 3.472352921962738, + "ce_loss_2": 4.210755240917206, + "ce_loss_3": 4.011643362045288, + "ce_loss_7": 3.643345367908478, + "epoch": 0.277, + "grad_norm": 520.0, + "kl_loss_10": 141.4054153442383, + "kl_loss_2": 1575.5725219726562, + "kl_loss_3": 1185.9700073242188, + "kl_loss_7": 382.99049072265626, + "learning_rate": 0.0008310128348054094, + "loss": 814.8721, + "step": 2770 + }, + { + "ce_loss_10": 3.503373074531555, + "ce_loss_13": 3.4395504236221313, + "ce_loss_2": 4.196349406242371, + "ce_loss_3": 3.9976847887039186, + "ce_loss_7": 3.612117648124695, + "epoch": 0.278, + "grad_norm": 684.0, + "kl_loss_10": 144.36290817260743, + "kl_loss_2": 1611.1840881347657, + "kl_loss_3": 1214.7904235839844, + "kl_loss_7": 393.8703353881836, + "learning_rate": 0.0008298219961566008, + "loss": 840.2912, + "step": 2780 + }, + { + "ce_loss_10": 3.4721336245536802, + "ce_loss_13": 3.4057936549186705, + "ce_loss_2": 4.1955530643463135, + "ce_loss_3": 3.995982491970062, + "ce_loss_7": 3.588437759876251, + "epoch": 0.279, + "grad_norm": 664.0, + "kl_loss_10": 145.45478706359864, + "kl_loss_2": 1683.0884338378905, + "kl_loss_3": 1281.4284118652345, + "kl_loss_7": 402.82340850830076, + "learning_rate": 0.0008286278362039527, + "loss": 855.1207, + "step": 2790 + }, + { + "ce_loss_10": 3.499285411834717, + "ce_loss_13": 3.432967686653137, + "ce_loss_2": 4.224206006526947, + "ce_loss_3": 4.018722629547119, + "ce_loss_7": 3.612668144702911, + "epoch": 0.28, + "grad_norm": 544.0, + "kl_loss_10": 142.82978515625, + "kl_loss_2": 1658.8205200195312, + "kl_loss_3": 1255.1792053222657, + "kl_loss_7": 392.1188400268555, + "learning_rate": 0.0008274303669726426, + "loss": 841.1881, + "step": 2800 + }, + { + "ce_loss_10": 3.4018381118774412, + "ce_loss_13": 3.3346792578697206, + "ce_loss_2": 4.129230046272278, + "ce_loss_3": 3.9258937001228333, + "ce_loss_7": 3.518822467327118, + "epoch": 0.281, + "grad_norm": 764.0, + "kl_loss_10": 143.8448387145996, + "kl_loss_2": 1673.1943664550781, + "kl_loss_3": 1270.9249450683594, + "kl_loss_7": 403.2898895263672, + "learning_rate": 0.0008262296005211721, + "loss": 846.9486, + "step": 2810 + }, + { + "ce_loss_10": 3.5292460680007935, + "ce_loss_13": 3.4640193819999694, + "ce_loss_2": 4.239890122413636, + "ce_loss_3": 4.038429474830627, + "ce_loss_7": 3.641891372203827, + "epoch": 0.282, + "grad_norm": 624.0, + "kl_loss_10": 143.57638244628907, + "kl_loss_2": 1645.4285217285155, + "kl_loss_3": 1244.0456237792969, + "kl_loss_7": 397.29553985595703, + "learning_rate": 0.0008250255489412463, + "loss": 846.6036, + "step": 2820 + }, + { + "ce_loss_10": 3.62887202501297, + "ce_loss_13": 3.561253750324249, + "ce_loss_2": 4.315714979171753, + "ce_loss_3": 4.123977625370026, + "ce_loss_7": 3.738538372516632, + "epoch": 0.283, + "grad_norm": 728.0, + "kl_loss_10": 148.4043426513672, + "kl_loss_2": 1612.185321044922, + "kl_loss_3": 1224.0401733398437, + "kl_loss_7": 394.00487060546874, + "learning_rate": 0.0008238182243576511, + "loss": 846.8261, + "step": 2830 + }, + { + "ce_loss_10": 3.5985996246337892, + "ce_loss_13": 3.5341097950935363, + "ce_loss_2": 4.251928305625915, + "ce_loss_3": 4.060415625572205, + "ce_loss_7": 3.698906183242798, + "epoch": 0.284, + "grad_norm": 772.0, + "kl_loss_10": 147.80193328857422, + "kl_loss_2": 1545.2418884277345, + "kl_loss_3": 1180.9795532226562, + "kl_loss_7": 381.7768493652344, + "learning_rate": 0.0008226076389281315, + "loss": 823.0694, + "step": 2840 + }, + { + "ce_loss_10": 3.637994980812073, + "ce_loss_13": 3.5726293325424194, + "ce_loss_2": 4.302945876121521, + "ce_loss_3": 4.106851041316986, + "ce_loss_7": 3.74611839056015, + "epoch": 0.285, + "grad_norm": 728.0, + "kl_loss_10": 148.46692581176757, + "kl_loss_2": 1581.9210205078125, + "kl_loss_3": 1188.579412841797, + "kl_loss_7": 392.20827026367186, + "learning_rate": 0.0008213938048432696, + "loss": 821.8449, + "step": 2850 + }, + { + "ce_loss_10": 3.5638878107070924, + "ce_loss_13": 3.497787523269653, + "ce_loss_2": 4.24253523349762, + "ce_loss_3": 4.041992700099945, + "ce_loss_7": 3.676758587360382, + "epoch": 0.286, + "grad_norm": 672.0, + "kl_loss_10": 148.57295265197754, + "kl_loss_2": 1586.4780151367188, + "kl_loss_3": 1201.8769653320312, + "kl_loss_7": 398.1691589355469, + "learning_rate": 0.0008201767343263612, + "loss": 837.8469, + "step": 2860 + }, + { + "ce_loss_10": 3.5040563464164736, + "ce_loss_13": 3.438361716270447, + "ce_loss_2": 4.212529039382934, + "ce_loss_3": 4.008230900764465, + "ce_loss_7": 3.61529027223587, + "epoch": 0.287, + "grad_norm": 656.0, + "kl_loss_10": 142.38912506103514, + "kl_loss_2": 1630.1037292480469, + "kl_loss_3": 1231.1203369140626, + "kl_loss_7": 393.3270919799805, + "learning_rate": 0.0008189564396332927, + "loss": 822.9645, + "step": 2870 + }, + { + "ce_loss_10": 3.4819291472434997, + "ce_loss_13": 3.419506084918976, + "ce_loss_2": 4.196909439563751, + "ce_loss_3": 3.9896247029304504, + "ce_loss_7": 3.5970585227012633, + "epoch": 0.288, + "grad_norm": 820.0, + "kl_loss_10": 143.1298400878906, + "kl_loss_2": 1632.7102294921874, + "kl_loss_3": 1227.3682495117187, + "kl_loss_7": 391.58729553222656, + "learning_rate": 0.0008177329330524181, + "loss": 846.0508, + "step": 2880 + }, + { + "ce_loss_10": 3.552425742149353, + "ce_loss_13": 3.4808704257011414, + "ce_loss_2": 4.2266720056533815, + "ce_loss_3": 4.03522971868515, + "ce_loss_7": 3.6583752155303957, + "epoch": 0.289, + "grad_norm": 704.0, + "kl_loss_10": 147.06133499145508, + "kl_loss_2": 1568.7399963378907, + "kl_loss_3": 1192.5995971679688, + "kl_loss_7": 385.310205078125, + "learning_rate": 0.0008165062269044352, + "loss": 830.767, + "step": 2890 + }, + { + "ce_loss_10": 3.5040358662605287, + "ce_loss_13": 3.4337138772010802, + "ce_loss_2": 4.194976377487182, + "ce_loss_3": 3.9983848929405212, + "ce_loss_7": 3.6155543684959413, + "epoch": 0.29, + "grad_norm": 588.0, + "kl_loss_10": 152.81832695007324, + "kl_loss_2": 1609.0662231445312, + "kl_loss_3": 1219.3178833007812, + "kl_loss_7": 397.04918212890624, + "learning_rate": 0.0008152763335422613, + "loss": 843.7021, + "step": 2900 + }, + { + "ce_loss_10": 3.4909900784492494, + "ce_loss_13": 3.419775998592377, + "ce_loss_2": 4.196218192577362, + "ce_loss_3": 3.98809734582901, + "ce_loss_7": 3.596997547149658, + "epoch": 0.291, + "grad_norm": 680.0, + "kl_loss_10": 158.0971206665039, + "kl_loss_2": 1636.2816589355468, + "kl_loss_3": 1234.2775512695312, + "kl_loss_7": 399.34343872070315, + "learning_rate": 0.0008140432653509088, + "loss": 842.7512, + "step": 2910 + }, + { + "ce_loss_10": 3.5415608644485475, + "ce_loss_13": 3.474157154560089, + "ce_loss_2": 4.221889722347259, + "ce_loss_3": 4.020258843898773, + "ce_loss_7": 3.649514615535736, + "epoch": 0.292, + "grad_norm": 692.0, + "kl_loss_10": 148.66607284545898, + "kl_loss_2": 1602.7043029785157, + "kl_loss_3": 1203.1907165527343, + "kl_loss_7": 393.12267150878904, + "learning_rate": 0.0008128070347473608, + "loss": 827.3531, + "step": 2920 + }, + { + "ce_loss_10": 3.544040846824646, + "ce_loss_13": 3.477999973297119, + "ce_loss_2": 4.254761123657227, + "ce_loss_3": 4.041979575157166, + "ce_loss_7": 3.658432722091675, + "epoch": 0.293, + "grad_norm": 808.0, + "kl_loss_10": 146.20843963623048, + "kl_loss_2": 1651.2572692871095, + "kl_loss_3": 1232.2542541503906, + "kl_loss_7": 402.30601654052737, + "learning_rate": 0.0008115676541804455, + "loss": 844.0389, + "step": 2930 + }, + { + "ce_loss_10": 3.552527105808258, + "ce_loss_13": 3.4877038478851317, + "ce_loss_2": 4.229338979721069, + "ce_loss_3": 4.030285179615021, + "ce_loss_7": 3.6580453157424926, + "epoch": 0.294, + "grad_norm": 596.0, + "kl_loss_10": 143.29071197509765, + "kl_loss_2": 1590.7006225585938, + "kl_loss_3": 1193.8719604492187, + "kl_loss_7": 391.36986846923827, + "learning_rate": 0.0008103251361307119, + "loss": 836.4414, + "step": 2940 + }, + { + "ce_loss_10": 3.5813042759895324, + "ce_loss_13": 3.5158876180648804, + "ce_loss_2": 4.2632251381874084, + "ce_loss_3": 4.065558528900146, + "ce_loss_7": 3.694361913204193, + "epoch": 0.295, + "grad_norm": 952.0, + "kl_loss_10": 144.15830841064454, + "kl_loss_2": 1593.5725402832031, + "kl_loss_3": 1203.2418884277345, + "kl_loss_7": 400.3397613525391, + "learning_rate": 0.0008090794931103026, + "loss": 828.5039, + "step": 2950 + }, + { + "ce_loss_10": 3.5701854705810545, + "ce_loss_13": 3.506050479412079, + "ce_loss_2": 4.251671302318573, + "ce_loss_3": 4.048157429695129, + "ce_loss_7": 3.6768277049064637, + "epoch": 0.296, + "grad_norm": 704.0, + "kl_loss_10": 140.38467445373536, + "kl_loss_2": 1584.526300048828, + "kl_loss_3": 1190.709698486328, + "kl_loss_7": 385.96351318359376, + "learning_rate": 0.0008078307376628291, + "loss": 831.3865, + "step": 2960 + }, + { + "ce_loss_10": 3.6296083092689515, + "ce_loss_13": 3.5671839475631715, + "ce_loss_2": 4.282579398155212, + "ce_loss_3": 4.087380516529083, + "ce_loss_7": 3.736238217353821, + "epoch": 0.297, + "grad_norm": 652.0, + "kl_loss_10": 137.90597343444824, + "kl_loss_2": 1528.18271484375, + "kl_loss_3": 1148.9516845703124, + "kl_loss_7": 376.7103607177734, + "learning_rate": 0.000806578882363245, + "loss": 801.1323, + "step": 2970 + }, + { + "ce_loss_10": 3.5410609245300293, + "ce_loss_13": 3.480309045314789, + "ce_loss_2": 4.21635273694992, + "ce_loss_3": 4.023106849193573, + "ce_loss_7": 3.651861608028412, + "epoch": 0.298, + "grad_norm": 872.0, + "kl_loss_10": 137.76350517272948, + "kl_loss_2": 1573.1726318359374, + "kl_loss_3": 1191.347589111328, + "kl_loss_7": 386.14882202148436, + "learning_rate": 0.0008053239398177191, + "loss": 838.2662, + "step": 2980 + }, + { + "ce_loss_10": 3.521118640899658, + "ce_loss_13": 3.4576277256011965, + "ce_loss_2": 4.211665558815002, + "ce_loss_3": 4.007085061073303, + "ce_loss_7": 3.6337268471717836, + "epoch": 0.299, + "grad_norm": 804.0, + "kl_loss_10": 139.79225463867186, + "kl_loss_2": 1595.8759216308595, + "kl_loss_3": 1195.4937316894532, + "kl_loss_7": 385.7449325561523, + "learning_rate": 0.0008040659226635089, + "loss": 850.0258, + "step": 2990 + }, + { + "ce_loss_10": 3.65621532201767, + "ce_loss_13": 3.589854049682617, + "ce_loss_2": 4.332050681114197, + "ce_loss_3": 4.135171377658844, + "ce_loss_7": 3.7766780138015745, + "epoch": 0.3, + "grad_norm": 716.0, + "kl_loss_10": 145.6563461303711, + "kl_loss_2": 1576.9345825195312, + "kl_loss_3": 1194.2649353027343, + "kl_loss_7": 413.7483200073242, + "learning_rate": 0.0008028048435688333, + "loss": 829.777, + "step": 3000 + }, + { + "ce_loss_10": 3.5251790165901182, + "ce_loss_13": 3.4619635701179505, + "ce_loss_2": 4.2210460782051085, + "ce_loss_3": 4.019052767753601, + "ce_loss_7": 3.637720024585724, + "epoch": 0.301, + "grad_norm": 780.0, + "kl_loss_10": 141.6140563964844, + "kl_loss_2": 1625.1482543945312, + "kl_loss_3": 1220.7144775390625, + "kl_loss_7": 397.0341278076172, + "learning_rate": 0.0008015407152327448, + "loss": 838.2924, + "step": 3010 + }, + { + "ce_loss_10": 3.5773096442222596, + "ce_loss_13": 3.510933578014374, + "ce_loss_2": 4.2585627913475035, + "ce_loss_3": 4.061378359794617, + "ce_loss_7": 3.693008613586426, + "epoch": 0.302, + "grad_norm": 640.0, + "kl_loss_10": 141.85217247009277, + "kl_loss_2": 1606.789813232422, + "kl_loss_3": 1214.533203125, + "kl_loss_7": 400.54812774658205, + "learning_rate": 0.0008002735503850016, + "loss": 839.8877, + "step": 3020 + }, + { + "ce_loss_10": 3.4636507511138914, + "ce_loss_13": 3.397989869117737, + "ce_loss_2": 4.174388039112091, + "ce_loss_3": 3.9605722427368164, + "ce_loss_7": 3.577491307258606, + "epoch": 0.303, + "grad_norm": 600.0, + "kl_loss_10": 143.30892028808594, + "kl_loss_2": 1645.3674255371093, + "kl_loss_3": 1230.3723571777343, + "kl_loss_7": 406.163623046875, + "learning_rate": 0.0007990033617859396, + "loss": 850.1339, + "step": 3030 + }, + { + "ce_loss_10": 3.514539110660553, + "ce_loss_13": 3.451056456565857, + "ce_loss_2": 4.192648077011109, + "ce_loss_3": 3.9946502685546874, + "ce_loss_7": 3.623524785041809, + "epoch": 0.304, + "grad_norm": 636.0, + "kl_loss_10": 141.17939987182618, + "kl_loss_2": 1581.641424560547, + "kl_loss_3": 1189.4458251953124, + "kl_loss_7": 394.9202285766602, + "learning_rate": 0.000797730162226344, + "loss": 811.3634, + "step": 3040 + }, + { + "ce_loss_10": 3.5422164678573607, + "ce_loss_13": 3.474979078769684, + "ce_loss_2": 4.236926698684693, + "ce_loss_3": 4.0294880151748655, + "ce_loss_7": 3.663298499584198, + "epoch": 0.305, + "grad_norm": 800.0, + "kl_loss_10": 146.4817584991455, + "kl_loss_2": 1609.1203186035157, + "kl_loss_3": 1205.9356170654296, + "kl_loss_7": 413.7532440185547, + "learning_rate": 0.0007964539645273203, + "loss": 829.1729, + "step": 3050 + }, + { + "ce_loss_10": 3.553938126564026, + "ce_loss_13": 3.4921163439750673, + "ce_loss_2": 4.224380815029145, + "ce_loss_3": 4.028742516040802, + "ce_loss_7": 3.6613959074020386, + "epoch": 0.306, + "grad_norm": 588.0, + "kl_loss_10": 142.35839805603027, + "kl_loss_2": 1565.0873718261719, + "kl_loss_3": 1175.3519958496095, + "kl_loss_7": 399.386003112793, + "learning_rate": 0.000795174781540165, + "loss": 830.5109, + "step": 3060 + }, + { + "ce_loss_10": 3.631449246406555, + "ce_loss_13": 3.5652127861976624, + "ce_loss_2": 4.277838742733001, + "ce_loss_3": 4.087875485420227, + "ce_loss_7": 3.7482593059539795, + "epoch": 0.307, + "grad_norm": 948.0, + "kl_loss_10": 145.9558292388916, + "kl_loss_2": 1522.917333984375, + "kl_loss_3": 1148.0942016601562, + "kl_loss_7": 410.9433837890625, + "learning_rate": 0.0007938926261462366, + "loss": 827.8861, + "step": 3070 + }, + { + "ce_loss_10": 3.5865222454071044, + "ce_loss_13": 3.5132097244262694, + "ce_loss_2": 4.221575832366943, + "ce_loss_3": 4.03163822889328, + "ce_loss_7": 3.709211730957031, + "epoch": 0.308, + "grad_norm": 644.0, + "kl_loss_10": 149.5517364501953, + "kl_loss_2": 1554.0492431640625, + "kl_loss_3": 1164.7528442382813, + "kl_loss_7": 426.5201721191406, + "learning_rate": 0.0007926075112568258, + "loss": 842.9685, + "step": 3080 + }, + { + "ce_loss_10": 3.5743375420570374, + "ce_loss_13": 3.506422483921051, + "ce_loss_2": 4.235567331314087, + "ce_loss_3": 4.042410182952881, + "ce_loss_7": 3.6804218649864198, + "epoch": 0.309, + "grad_norm": 576.0, + "kl_loss_10": 144.69867630004882, + "kl_loss_2": 1571.8233825683594, + "kl_loss_3": 1187.3203979492187, + "kl_loss_7": 398.75867767333983, + "learning_rate": 0.0007913194498130252, + "loss": 817.3697, + "step": 3090 + }, + { + "ce_loss_10": 3.494604206085205, + "ce_loss_13": 3.4323593616485595, + "ce_loss_2": 4.202052474021912, + "ce_loss_3": 3.988065266609192, + "ce_loss_7": 3.6171945691108705, + "epoch": 0.31, + "grad_norm": 740.0, + "kl_loss_10": 144.33132781982422, + "kl_loss_2": 1625.1172668457032, + "kl_loss_3": 1206.2160430908202, + "kl_loss_7": 405.6477508544922, + "learning_rate": 0.0007900284547855992, + "loss": 844.1217, + "step": 3100 + }, + { + "ce_loss_10": 3.505799424648285, + "ce_loss_13": 3.44124299287796, + "ce_loss_2": 4.186880040168762, + "ce_loss_3": 3.9723469614982605, + "ce_loss_7": 3.6141058206558228, + "epoch": 0.311, + "grad_norm": 588.0, + "kl_loss_10": 142.34576911926268, + "kl_loss_2": 1592.635546875, + "kl_loss_3": 1175.9765869140624, + "kl_loss_7": 387.8748123168945, + "learning_rate": 0.0007887345391748532, + "loss": 841.3018, + "step": 3110 + }, + { + "ce_loss_10": 3.641534376144409, + "ce_loss_13": 3.5780982255935667, + "ce_loss_2": 4.291218495368957, + "ce_loss_3": 4.0910943150520325, + "ce_loss_7": 3.742597687244415, + "epoch": 0.312, + "grad_norm": 712.0, + "kl_loss_10": 144.0367401123047, + "kl_loss_2": 1544.438739013672, + "kl_loss_3": 1151.2371490478515, + "kl_loss_7": 377.4181396484375, + "learning_rate": 0.0007874377160105036, + "loss": 801.789, + "step": 3120 + }, + { + "ce_loss_10": 3.531910240650177, + "ce_loss_13": 3.468705189228058, + "ce_loss_2": 4.221284866333008, + "ce_loss_3": 4.0103159785270694, + "ce_loss_7": 3.6348586559295653, + "epoch": 0.313, + "grad_norm": 628.0, + "kl_loss_10": 147.38395767211915, + "kl_loss_2": 1606.8059204101562, + "kl_loss_3": 1190.6296630859374, + "kl_loss_7": 377.1212661743164, + "learning_rate": 0.0007861379983515449, + "loss": 844.5914, + "step": 3130 + }, + { + "ce_loss_10": 3.622577941417694, + "ce_loss_13": 3.55368732213974, + "ce_loss_2": 4.281805419921875, + "ce_loss_3": 4.091654586791992, + "ce_loss_7": 3.7212623953819275, + "epoch": 0.314, + "grad_norm": 656.0, + "kl_loss_10": 151.8330436706543, + "kl_loss_2": 1578.883233642578, + "kl_loss_3": 1199.2917419433593, + "kl_loss_7": 384.7697189331055, + "learning_rate": 0.0007848353992861195, + "loss": 819.7404, + "step": 3140 + }, + { + "ce_loss_10": 3.7075061440467834, + "ce_loss_13": 3.6256630778312684, + "ce_loss_2": 4.37855339050293, + "ce_loss_3": 4.184061086177826, + "ce_loss_7": 3.806570255756378, + "epoch": 0.315, + "grad_norm": 568.0, + "kl_loss_10": 167.983447265625, + "kl_loss_2": 1588.1090698242188, + "kl_loss_3": 1201.9598571777344, + "kl_loss_7": 396.2051040649414, + "learning_rate": 0.0007835299319313853, + "loss": 837.6727, + "step": 3150 + }, + { + "ce_loss_10": 3.5817773222923277, + "ce_loss_13": 3.5118511438369753, + "ce_loss_2": 4.233609986305237, + "ce_loss_3": 4.0398486137390135, + "ce_loss_7": 3.679852533340454, + "epoch": 0.316, + "grad_norm": 792.0, + "kl_loss_10": 157.4121223449707, + "kl_loss_2": 1557.2783630371093, + "kl_loss_3": 1171.8280517578125, + "kl_loss_7": 382.1309753417969, + "learning_rate": 0.0007822216094333848, + "loss": 839.4477, + "step": 3160 + }, + { + "ce_loss_10": 3.582335615158081, + "ce_loss_13": 3.514190638065338, + "ce_loss_2": 4.262095773220063, + "ce_loss_3": 4.0640422105789185, + "ce_loss_7": 3.687114107608795, + "epoch": 0.317, + "grad_norm": 752.0, + "kl_loss_10": 149.11183700561523, + "kl_loss_2": 1582.70458984375, + "kl_loss_3": 1193.9826293945312, + "kl_loss_7": 384.67308807373047, + "learning_rate": 0.0007809104449669101, + "loss": 818.48, + "step": 3170 + }, + { + "ce_loss_10": 3.5347620606422425, + "ce_loss_13": 3.469420051574707, + "ce_loss_2": 4.189749908447266, + "ce_loss_3": 3.9996961116790772, + "ce_loss_7": 3.639148008823395, + "epoch": 0.318, + "grad_norm": 612.0, + "kl_loss_10": 145.1158645629883, + "kl_loss_2": 1538.3842163085938, + "kl_loss_3": 1168.4396179199218, + "kl_loss_7": 377.5471496582031, + "learning_rate": 0.0007795964517353734, + "loss": 813.6468, + "step": 3180 + }, + { + "ce_loss_10": 3.5253086924552917, + "ce_loss_13": 3.4601287484169005, + "ce_loss_2": 4.204790925979614, + "ce_loss_3": 4.0041629552841185, + "ce_loss_7": 3.633437788486481, + "epoch": 0.319, + "grad_norm": 556.0, + "kl_loss_10": 145.9946216583252, + "kl_loss_2": 1610.9086120605468, + "kl_loss_3": 1214.3668518066406, + "kl_loss_7": 388.49003143310546, + "learning_rate": 0.000778279642970672, + "loss": 816.3545, + "step": 3190 + }, + { + "ce_loss_10": 3.5291069030761717, + "ce_loss_13": 3.465493679046631, + "ce_loss_2": 4.189659130573273, + "ce_loss_3": 3.9933029651641845, + "ce_loss_7": 3.637093019485474, + "epoch": 0.32, + "grad_norm": 720.0, + "kl_loss_10": 142.4067527770996, + "kl_loss_2": 1555.5811950683594, + "kl_loss_3": 1166.4454162597656, + "kl_loss_7": 380.82061767578125, + "learning_rate": 0.0007769600319330552, + "loss": 803.8339, + "step": 3200 + }, + { + "ce_loss_10": 3.562716245651245, + "ce_loss_13": 3.499916505813599, + "ce_loss_2": 4.260833311080932, + "ce_loss_3": 4.057473051548004, + "ce_loss_7": 3.6713879346847533, + "epoch": 0.321, + "grad_norm": 992.0, + "kl_loss_10": 141.0684398651123, + "kl_loss_2": 1608.2053955078125, + "kl_loss_3": 1204.6956695556642, + "kl_loss_7": 382.2118606567383, + "learning_rate": 0.0007756376319109917, + "loss": 822.6916, + "step": 3210 + }, + { + "ce_loss_10": 3.6111506104469298, + "ce_loss_13": 3.5478773951530456, + "ce_loss_2": 4.274733674526215, + "ce_loss_3": 4.077989876270294, + "ce_loss_7": 3.716237735748291, + "epoch": 0.322, + "grad_norm": 592.0, + "kl_loss_10": 142.35418815612792, + "kl_loss_2": 1549.4132995605469, + "kl_loss_3": 1170.5071533203125, + "kl_loss_7": 383.4806655883789, + "learning_rate": 0.0007743124562210351, + "loss": 802.4669, + "step": 3220 + }, + { + "ce_loss_10": 3.6234222531318663, + "ce_loss_13": 3.5583123326301576, + "ce_loss_2": 4.2762122631073, + "ce_loss_3": 4.079313564300537, + "ce_loss_7": 3.725733482837677, + "epoch": 0.323, + "grad_norm": 696.0, + "kl_loss_10": 141.49731979370117, + "kl_loss_2": 1560.1914489746093, + "kl_loss_3": 1169.5012756347655, + "kl_loss_7": 380.71566619873045, + "learning_rate": 0.0007729845182076895, + "loss": 818.9047, + "step": 3230 + }, + { + "ce_loss_10": 3.54907089471817, + "ce_loss_13": 3.488371527194977, + "ce_loss_2": 4.202688992023468, + "ce_loss_3": 4.013319063186645, + "ce_loss_7": 3.6541411519050597, + "epoch": 0.324, + "grad_norm": 772.0, + "kl_loss_10": 135.94298095703124, + "kl_loss_2": 1538.9539428710937, + "kl_loss_3": 1161.7360473632812, + "kl_loss_7": 374.43616180419923, + "learning_rate": 0.0007716538312432765, + "loss": 820.5339, + "step": 3240 + }, + { + "ce_loss_10": 3.5088236331939697, + "ce_loss_13": 3.4442797899246216, + "ce_loss_2": 4.2046965718269345, + "ce_loss_3": 3.9894727945327757, + "ce_loss_7": 3.6167898058891295, + "epoch": 0.325, + "grad_norm": 760.0, + "kl_loss_10": 142.3162754058838, + "kl_loss_2": 1605.5329223632812, + "kl_loss_3": 1201.896401977539, + "kl_loss_7": 388.1252075195313, + "learning_rate": 0.0007703204087277988, + "loss": 826.9689, + "step": 3250 + }, + { + "ce_loss_10": 3.609957015514374, + "ce_loss_13": 3.5482339024543763, + "ce_loss_2": 4.249799501895905, + "ce_loss_3": 4.058551073074341, + "ce_loss_7": 3.7137101888656616, + "epoch": 0.326, + "grad_norm": 620.0, + "kl_loss_10": 137.27352027893068, + "kl_loss_2": 1501.4761779785156, + "kl_loss_3": 1131.705010986328, + "kl_loss_7": 374.86394653320315, + "learning_rate": 0.0007689842640888063, + "loss": 797.6071, + "step": 3260 + }, + { + "ce_loss_10": 3.6040314555168154, + "ce_loss_13": 3.5402695417404173, + "ce_loss_2": 4.258551788330078, + "ce_loss_3": 4.068223142623902, + "ce_loss_7": 3.7120986700057985, + "epoch": 0.327, + "grad_norm": 584.0, + "kl_loss_10": 139.66932640075683, + "kl_loss_2": 1524.4914489746093, + "kl_loss_3": 1163.2306518554688, + "kl_loss_7": 381.93678894042966, + "learning_rate": 0.0007676454107812607, + "loss": 811.0578, + "step": 3270 + }, + { + "ce_loss_10": 3.53929922580719, + "ce_loss_13": 3.4770522236824037, + "ce_loss_2": 4.213063597679138, + "ce_loss_3": 4.026750934123993, + "ce_loss_7": 3.6522504925727843, + "epoch": 0.328, + "grad_norm": 784.0, + "kl_loss_10": 140.27813835144042, + "kl_loss_2": 1568.5742309570312, + "kl_loss_3": 1193.110528564453, + "kl_loss_7": 393.0425598144531, + "learning_rate": 0.0007663038622873999, + "loss": 813.8143, + "step": 3280 + }, + { + "ce_loss_10": 3.578033113479614, + "ce_loss_13": 3.516423726081848, + "ce_loss_2": 4.246110367774963, + "ce_loss_3": 4.043308067321777, + "ce_loss_7": 3.6857515454292296, + "epoch": 0.329, + "grad_norm": 824.0, + "kl_loss_10": 139.4735927581787, + "kl_loss_2": 1560.1141662597656, + "kl_loss_3": 1172.8965942382813, + "kl_loss_7": 398.01239624023435, + "learning_rate": 0.0007649596321166025, + "loss": 805.2687, + "step": 3290 + }, + { + "ce_loss_10": 3.4800352215766908, + "ce_loss_13": 3.421827828884125, + "ce_loss_2": 4.145622873306275, + "ce_loss_3": 3.954922652244568, + "ce_loss_7": 3.5925204157829285, + "epoch": 0.33, + "grad_norm": 680.0, + "kl_loss_10": 134.23374557495117, + "kl_loss_2": 1532.8545349121093, + "kl_loss_3": 1164.9057250976562, + "kl_loss_7": 383.72555084228514, + "learning_rate": 0.0007636127338052513, + "loss": 811.8129, + "step": 3300 + }, + { + "ce_loss_10": 3.5890893220901487, + "ce_loss_13": 3.5242863059043885, + "ce_loss_2": 4.25543829202652, + "ce_loss_3": 4.057149302959442, + "ce_loss_7": 3.698796546459198, + "epoch": 0.331, + "grad_norm": 616.0, + "kl_loss_10": 140.55243148803712, + "kl_loss_2": 1560.8628784179687, + "kl_loss_3": 1176.2081298828125, + "kl_loss_7": 390.31951293945315, + "learning_rate": 0.0007622631809165971, + "loss": 803.85, + "step": 3310 + }, + { + "ce_loss_10": 3.5855463981628417, + "ce_loss_13": 3.5262409806251527, + "ce_loss_2": 4.216806030273437, + "ce_loss_3": 4.027826583385467, + "ce_loss_7": 3.6855634450912476, + "epoch": 0.332, + "grad_norm": 458.0, + "kl_loss_10": 131.1049461364746, + "kl_loss_2": 1471.740216064453, + "kl_loss_3": 1107.5941833496095, + "kl_loss_7": 363.5269348144531, + "learning_rate": 0.000760910987040623, + "loss": 786.43, + "step": 3320 + }, + { + "ce_loss_10": 3.5629027485847473, + "ce_loss_13": 3.502229619026184, + "ce_loss_2": 4.248357820510864, + "ce_loss_3": 4.046060919761658, + "ce_loss_7": 3.6763986110687257, + "epoch": 0.333, + "grad_norm": 580.0, + "kl_loss_10": 138.26232109069824, + "kl_loss_2": 1589.9033935546875, + "kl_loss_3": 1202.1307739257813, + "kl_loss_7": 389.39125366210936, + "learning_rate": 0.000759556165793906, + "loss": 806.7218, + "step": 3330 + }, + { + "ce_loss_10": 3.593952786922455, + "ce_loss_13": 3.532469391822815, + "ce_loss_2": 4.24890683889389, + "ce_loss_3": 4.057988488674164, + "ce_loss_7": 3.69755597114563, + "epoch": 0.334, + "grad_norm": 632.0, + "kl_loss_10": 136.70615882873534, + "kl_loss_2": 1542.6665771484375, + "kl_loss_3": 1160.2529327392579, + "kl_loss_7": 375.4426513671875, + "learning_rate": 0.000758198730819481, + "loss": 814.5036, + "step": 3340 + }, + { + "ce_loss_10": 3.5359851241111757, + "ce_loss_13": 3.4757973551750183, + "ce_loss_2": 4.205133056640625, + "ce_loss_3": 4.011187362670898, + "ce_loss_7": 3.6422808527946473, + "epoch": 0.335, + "grad_norm": 676.0, + "kl_loss_10": 134.6895553588867, + "kl_loss_2": 1573.4298767089845, + "kl_loss_3": 1188.663296508789, + "kl_loss_7": 378.12191619873045, + "learning_rate": 0.0007568386957867032, + "loss": 813.6836, + "step": 3350 + }, + { + "ce_loss_10": 3.6098239421844482, + "ce_loss_13": 3.5456187248229982, + "ce_loss_2": 4.260682666301728, + "ce_loss_3": 4.070243191719055, + "ce_loss_7": 3.7122027039527894, + "epoch": 0.336, + "grad_norm": 780.0, + "kl_loss_10": 136.96125411987305, + "kl_loss_2": 1524.2330810546875, + "kl_loss_3": 1148.0459899902344, + "kl_loss_7": 377.05298767089846, + "learning_rate": 0.0007554760743911103, + "loss": 810.5187, + "step": 3360 + }, + { + "ce_loss_10": 3.507435417175293, + "ce_loss_13": 3.44707133769989, + "ce_loss_2": 4.168538379669189, + "ce_loss_3": 3.969741427898407, + "ce_loss_7": 3.6143924474716185, + "epoch": 0.337, + "grad_norm": 800.0, + "kl_loss_10": 133.15070648193358, + "kl_loss_2": 1558.3870727539063, + "kl_loss_3": 1166.5326843261719, + "kl_loss_7": 373.68982391357423, + "learning_rate": 0.0007541108803542846, + "loss": 823.2562, + "step": 3370 + }, + { + "ce_loss_10": 3.556701052188873, + "ce_loss_13": 3.4952089309692385, + "ce_loss_2": 4.213332033157348, + "ce_loss_3": 4.01972188949585, + "ce_loss_7": 3.666098403930664, + "epoch": 0.338, + "grad_norm": 632.0, + "kl_loss_10": 137.10250129699708, + "kl_loss_2": 1548.2002746582032, + "kl_loss_3": 1163.8560943603516, + "kl_loss_7": 378.4952331542969, + "learning_rate": 0.0007527431274237149, + "loss": 839.8433, + "step": 3380 + }, + { + "ce_loss_10": 3.5281825184822084, + "ce_loss_13": 3.469517374038696, + "ce_loss_2": 4.175353538990021, + "ce_loss_3": 3.982305443286896, + "ce_loss_7": 3.630697858333588, + "epoch": 0.339, + "grad_norm": 604.0, + "kl_loss_10": 134.46246299743652, + "kl_loss_2": 1533.5295776367188, + "kl_loss_3": 1153.648422241211, + "kl_loss_7": 372.5039978027344, + "learning_rate": 0.0007513728293726579, + "loss": 803.5898, + "step": 3390 + }, + { + "ce_loss_10": 3.6488207459449766, + "ce_loss_13": 3.587409019470215, + "ce_loss_2": 4.293999433517456, + "ce_loss_3": 4.103942286968231, + "ce_loss_7": 3.7538707733154295, + "epoch": 0.34, + "grad_norm": 568.0, + "kl_loss_10": 137.6070526123047, + "kl_loss_2": 1523.785009765625, + "kl_loss_3": 1146.2487243652345, + "kl_loss_7": 378.4647155761719, + "learning_rate": 0.00075, + "loss": 796.8509, + "step": 3400 + }, + { + "ce_loss_10": 3.635476815700531, + "ce_loss_13": 3.5716666340827943, + "ce_loss_2": 4.305250811576843, + "ce_loss_3": 4.109015083312988, + "ce_loss_7": 3.745475196838379, + "epoch": 0.341, + "grad_norm": 608.0, + "kl_loss_10": 140.0908172607422, + "kl_loss_2": 1556.6490234375, + "kl_loss_3": 1162.7167419433595, + "kl_loss_7": 384.58277435302733, + "learning_rate": 0.0007486246531301177, + "loss": 802.0264, + "step": 3410 + }, + { + "ce_loss_10": 3.4416700124740602, + "ce_loss_13": 3.3813814163208007, + "ce_loss_2": 4.116898477077484, + "ce_loss_3": 3.9193392276763914, + "ce_loss_7": 3.5528218507766725, + "epoch": 0.342, + "grad_norm": 668.0, + "kl_loss_10": 134.79825401306152, + "kl_loss_2": 1559.6565307617188, + "kl_loss_3": 1172.1594482421874, + "kl_loss_7": 377.6139343261719, + "learning_rate": 0.0007472468026127384, + "loss": 799.9335, + "step": 3420 + }, + { + "ce_loss_10": 3.577793312072754, + "ce_loss_13": 3.511690676212311, + "ce_loss_2": 4.266387677192688, + "ce_loss_3": 4.059280645847321, + "ce_loss_7": 3.6900326371192933, + "epoch": 0.343, + "grad_norm": 592.0, + "kl_loss_10": 141.6757396697998, + "kl_loss_2": 1606.4993103027343, + "kl_loss_3": 1208.6977478027343, + "kl_loss_7": 394.94551849365234, + "learning_rate": 0.000745866462322802, + "loss": 828.0714, + "step": 3430 + }, + { + "ce_loss_10": 3.563853549957275, + "ce_loss_13": 3.505466651916504, + "ce_loss_2": 4.219634628295898, + "ce_loss_3": 4.022562730312347, + "ce_loss_7": 3.6712709188461305, + "epoch": 0.344, + "grad_norm": 576.0, + "kl_loss_10": 133.0882568359375, + "kl_loss_2": 1511.8997375488282, + "kl_loss_3": 1135.5877319335937, + "kl_loss_7": 369.4819305419922, + "learning_rate": 0.0007444836461603195, + "loss": 797.3501, + "step": 3440 + }, + { + "ce_loss_10": 3.6264248490333557, + "ce_loss_13": 3.5626481413841247, + "ce_loss_2": 4.290192425251007, + "ce_loss_3": 4.099602663516999, + "ce_loss_7": 3.734038972854614, + "epoch": 0.345, + "grad_norm": 592.0, + "kl_loss_10": 139.87619667053224, + "kl_loss_2": 1568.7929321289062, + "kl_loss_3": 1192.3937042236328, + "kl_loss_7": 387.790544128418, + "learning_rate": 0.0007430983680502344, + "loss": 820.2707, + "step": 3450 + }, + { + "ce_loss_10": 3.468415367603302, + "ce_loss_13": 3.4085907101631165, + "ce_loss_2": 4.147791481018066, + "ce_loss_3": 3.9522446393966675, + "ce_loss_7": 3.5773945450782776, + "epoch": 0.346, + "grad_norm": 524.0, + "kl_loss_10": 138.47050895690919, + "kl_loss_2": 1571.893621826172, + "kl_loss_3": 1183.226983642578, + "kl_loss_7": 380.1432510375977, + "learning_rate": 0.0007417106419422819, + "loss": 814.8158, + "step": 3460 + }, + { + "ce_loss_10": 3.578550028800964, + "ce_loss_13": 3.51279159784317, + "ce_loss_2": 4.240513134002685, + "ce_loss_3": 4.041714072227478, + "ce_loss_7": 3.684761953353882, + "epoch": 0.347, + "grad_norm": 708.0, + "kl_loss_10": 139.78108291625978, + "kl_loss_2": 1533.4877563476562, + "kl_loss_3": 1153.673779296875, + "kl_loss_7": 374.2180770874023, + "learning_rate": 0.0007403204818108486, + "loss": 807.0902, + "step": 3470 + }, + { + "ce_loss_10": 3.548770797252655, + "ce_loss_13": 3.486237347126007, + "ce_loss_2": 4.2026319146156315, + "ce_loss_3": 4.003607368469238, + "ce_loss_7": 3.6532763123512266, + "epoch": 0.348, + "grad_norm": 596.0, + "kl_loss_10": 144.19391098022462, + "kl_loss_2": 1553.6705322265625, + "kl_loss_3": 1160.6052368164062, + "kl_loss_7": 378.1271469116211, + "learning_rate": 0.0007389279016548316, + "loss": 788.2532, + "step": 3480 + }, + { + "ce_loss_10": 3.5647180557250975, + "ce_loss_13": 3.492380142211914, + "ce_loss_2": 4.253465700149536, + "ce_loss_3": 4.037442588806153, + "ce_loss_7": 3.6645753383636475, + "epoch": 0.349, + "grad_norm": 732.0, + "kl_loss_10": 149.7018730163574, + "kl_loss_2": 1606.8699096679688, + "kl_loss_3": 1193.213555908203, + "kl_loss_7": 386.1459732055664, + "learning_rate": 0.0007375329154974975, + "loss": 825.1197, + "step": 3490 + }, + { + "ce_loss_10": 3.5201268196105957, + "ce_loss_13": 3.4533039331436157, + "ce_loss_2": 4.166223227977753, + "ce_loss_3": 3.9767157316207884, + "ce_loss_7": 3.6191452860832216, + "epoch": 0.35, + "grad_norm": 584.0, + "kl_loss_10": 144.2131031036377, + "kl_loss_2": 1530.7996887207032, + "kl_loss_3": 1156.2872650146485, + "kl_loss_7": 371.8103332519531, + "learning_rate": 0.0007361355373863414, + "loss": 814.7808, + "step": 3500 + }, + { + "ce_loss_10": 3.5710195899009705, + "ce_loss_13": 3.508971703052521, + "ce_loss_2": 4.213377046585083, + "ce_loss_3": 4.024645984172821, + "ce_loss_7": 3.67548463344574, + "epoch": 0.351, + "grad_norm": 736.0, + "kl_loss_10": 140.72288818359374, + "kl_loss_2": 1512.9147888183593, + "kl_loss_3": 1137.2003234863282, + "kl_loss_7": 372.58856811523435, + "learning_rate": 0.0007347357813929454, + "loss": 814.5176, + "step": 3510 + }, + { + "ce_loss_10": 3.5162337183952332, + "ce_loss_13": 3.4520907759666444, + "ce_loss_2": 4.1670368075370785, + "ce_loss_3": 3.9769131183624267, + "ce_loss_7": 3.61603764295578, + "epoch": 0.352, + "grad_norm": 656.0, + "kl_loss_10": 138.93255500793458, + "kl_loss_2": 1512.7831420898438, + "kl_loss_3": 1140.8003845214844, + "kl_loss_7": 368.38318023681643, + "learning_rate": 0.0007333336616128369, + "loss": 806.3783, + "step": 3520 + }, + { + "ce_loss_10": 3.488769805431366, + "ce_loss_13": 3.4252532839775087, + "ce_loss_2": 4.161493599414825, + "ce_loss_3": 3.9646050333976746, + "ce_loss_7": 3.595223593711853, + "epoch": 0.353, + "grad_norm": 548.0, + "kl_loss_10": 138.03040084838867, + "kl_loss_2": 1568.9273254394532, + "kl_loss_3": 1179.4151916503906, + "kl_loss_7": 383.01140747070315, + "learning_rate": 0.0007319291921653463, + "loss": 814.4657, + "step": 3530 + }, + { + "ce_loss_10": 3.5761617183685304, + "ce_loss_13": 3.5123104214668275, + "ce_loss_2": 4.247880482673645, + "ce_loss_3": 4.050547051429748, + "ce_loss_7": 3.68228440284729, + "epoch": 0.354, + "grad_norm": 808.0, + "kl_loss_10": 141.07060623168945, + "kl_loss_2": 1567.1333923339844, + "kl_loss_3": 1178.535693359375, + "kl_loss_7": 381.32129669189453, + "learning_rate": 0.0007305223871934656, + "loss": 802.0609, + "step": 3540 + }, + { + "ce_loss_10": 3.540350914001465, + "ce_loss_13": 3.4772907376289366, + "ce_loss_2": 4.197093963623047, + "ce_loss_3": 4.001269197463989, + "ce_loss_7": 3.647739040851593, + "epoch": 0.355, + "grad_norm": 580.0, + "kl_loss_10": 140.28063926696777, + "kl_loss_2": 1533.3375122070313, + "kl_loss_3": 1148.1182403564453, + "kl_loss_7": 375.75494995117185, + "learning_rate": 0.0007291132608637052, + "loss": 801.6683, + "step": 3550 + }, + { + "ce_loss_10": 3.4978664398193358, + "ce_loss_13": 3.4386332392692567, + "ce_loss_2": 4.202150619029998, + "ce_loss_3": 3.97188538312912, + "ce_loss_7": 3.6044459462165834, + "epoch": 0.356, + "grad_norm": 676.0, + "kl_loss_10": 133.73454742431642, + "kl_loss_2": 1596.5189636230468, + "kl_loss_3": 1160.679183959961, + "kl_loss_7": 371.6182601928711, + "learning_rate": 0.0007277018273659516, + "loss": 819.2582, + "step": 3560 + }, + { + "ce_loss_10": 3.628795838356018, + "ce_loss_13": 3.56366685628891, + "ce_loss_2": 4.295041692256928, + "ce_loss_3": 4.1010064601898195, + "ce_loss_7": 3.7392033100128175, + "epoch": 0.357, + "grad_norm": 536.0, + "kl_loss_10": 141.76525268554687, + "kl_loss_2": 1568.0928771972656, + "kl_loss_3": 1175.8153198242187, + "kl_loss_7": 388.54504241943357, + "learning_rate": 0.0007262881009133242, + "loss": 816.7139, + "step": 3570 + }, + { + "ce_loss_10": 3.5417507767677305, + "ce_loss_13": 3.4833286881446837, + "ce_loss_2": 4.192662954330444, + "ce_loss_3": 4.004778635501862, + "ce_loss_7": 3.6475730895996095, + "epoch": 0.358, + "grad_norm": 576.0, + "kl_loss_10": 134.09114761352538, + "kl_loss_2": 1537.5412170410157, + "kl_loss_3": 1149.5142639160156, + "kl_loss_7": 372.7196243286133, + "learning_rate": 0.0007248720957420329, + "loss": 793.2854, + "step": 3580 + }, + { + "ce_loss_10": 3.558194160461426, + "ce_loss_13": 3.499031662940979, + "ce_loss_2": 4.207183480262756, + "ce_loss_3": 4.008278286457061, + "ce_loss_7": 3.6595579862594603, + "epoch": 0.359, + "grad_norm": 668.0, + "kl_loss_10": 134.52191429138185, + "kl_loss_2": 1505.1361145019532, + "kl_loss_3": 1117.103219604492, + "kl_loss_7": 369.4859024047852, + "learning_rate": 0.0007234538261112341, + "loss": 793.0623, + "step": 3590 + }, + { + "ce_loss_10": 3.590154302120209, + "ce_loss_13": 3.5281407237052917, + "ce_loss_2": 4.254630589485169, + "ce_loss_3": 4.051980185508728, + "ce_loss_7": 3.6962651371955872, + "epoch": 0.36, + "grad_norm": 462.0, + "kl_loss_10": 136.44585952758788, + "kl_loss_2": 1546.1867126464845, + "kl_loss_3": 1153.344775390625, + "kl_loss_7": 380.6938873291016, + "learning_rate": 0.0007220333063028871, + "loss": 793.3124, + "step": 3600 + }, + { + "ce_loss_10": 3.618264949321747, + "ce_loss_13": 3.5577764391899107, + "ce_loss_2": 4.310294914245605, + "ce_loss_3": 4.092818439006805, + "ce_loss_7": 3.725991404056549, + "epoch": 0.361, + "grad_norm": 892.0, + "kl_loss_10": 137.56422424316406, + "kl_loss_2": 1628.0519165039063, + "kl_loss_3": 1198.5521118164063, + "kl_loss_7": 399.71742095947263, + "learning_rate": 0.0007206105506216106, + "loss": 830.3656, + "step": 3610 + }, + { + "ce_loss_10": 3.500383186340332, + "ce_loss_13": 3.4409209847450257, + "ce_loss_2": 4.152735877037048, + "ce_loss_3": 3.957272839546204, + "ce_loss_7": 3.6082523345947264, + "epoch": 0.362, + "grad_norm": 768.0, + "kl_loss_10": 133.25403366088867, + "kl_loss_2": 1518.7402770996093, + "kl_loss_3": 1139.5292907714843, + "kl_loss_7": 378.18980712890624, + "learning_rate": 0.0007191855733945387, + "loss": 786.9895, + "step": 3620 + }, + { + "ce_loss_10": 3.5967095613479616, + "ce_loss_13": 3.5339751839637756, + "ce_loss_2": 4.247948789596558, + "ce_loss_3": 4.048540914058686, + "ce_loss_7": 3.702574074268341, + "epoch": 0.363, + "grad_norm": 672.0, + "kl_loss_10": 134.42433967590333, + "kl_loss_2": 1527.0834228515625, + "kl_loss_3": 1137.2749816894532, + "kl_loss_7": 374.51196441650393, + "learning_rate": 0.0007177583889711762, + "loss": 793.3341, + "step": 3630 + }, + { + "ce_loss_10": 3.512792682647705, + "ce_loss_13": 3.450398051738739, + "ce_loss_2": 4.175726044178009, + "ce_loss_3": 3.969858479499817, + "ce_loss_7": 3.6202203273773192, + "epoch": 0.364, + "grad_norm": 536.0, + "kl_loss_10": 136.62388954162597, + "kl_loss_2": 1562.967547607422, + "kl_loss_3": 1163.4830383300782, + "kl_loss_7": 382.96795043945315, + "learning_rate": 0.0007163290117232541, + "loss": 807.9524, + "step": 3640 + }, + { + "ce_loss_10": 3.6289470553398133, + "ce_loss_13": 3.5686827301979065, + "ce_loss_2": 4.252556777000427, + "ce_loss_3": 4.061027491092682, + "ce_loss_7": 3.728983438014984, + "epoch": 0.365, + "grad_norm": 676.0, + "kl_loss_10": 134.2694351196289, + "kl_loss_2": 1484.0365478515625, + "kl_loss_3": 1115.571533203125, + "kl_loss_7": 372.05673828125, + "learning_rate": 0.0007148974560445859, + "loss": 788.3101, + "step": 3650 + }, + { + "ce_loss_10": 3.549181044101715, + "ce_loss_13": 3.487704300880432, + "ce_loss_2": 4.190627813339233, + "ce_loss_3": 3.9991440176963806, + "ce_loss_7": 3.650784492492676, + "epoch": 0.366, + "grad_norm": 588.0, + "kl_loss_10": 133.14555854797362, + "kl_loss_2": 1493.0444396972657, + "kl_loss_3": 1131.270849609375, + "kl_loss_7": 370.2608352661133, + "learning_rate": 0.0007134637363509209, + "loss": 781.269, + "step": 3660 + }, + { + "ce_loss_10": 3.656008231639862, + "ce_loss_13": 3.5968250274658202, + "ce_loss_2": 4.288893938064575, + "ce_loss_3": 4.1020159244537355, + "ce_loss_7": 3.758394181728363, + "epoch": 0.367, + "grad_norm": 624.0, + "kl_loss_10": 132.44244766235352, + "kl_loss_2": 1477.2258239746093, + "kl_loss_3": 1114.760919189453, + "kl_loss_7": 362.33375549316406, + "learning_rate": 0.0007120278670798009, + "loss": 789.3051, + "step": 3670 + }, + { + "ce_loss_10": 3.4513864398002623, + "ce_loss_13": 3.390084111690521, + "ce_loss_2": 4.156805229187012, + "ce_loss_3": 3.942430257797241, + "ce_loss_7": 3.5631762027740477, + "epoch": 0.368, + "grad_norm": 852.0, + "kl_loss_10": 136.21654624938964, + "kl_loss_2": 1617.9169982910157, + "kl_loss_3": 1207.7436462402343, + "kl_loss_7": 385.1312530517578, + "learning_rate": 0.0007105898626904133, + "loss": 833.6849, + "step": 3680 + }, + { + "ce_loss_10": 3.557909631729126, + "ce_loss_13": 3.496003878116608, + "ce_loss_2": 4.215323185920715, + "ce_loss_3": 4.025267434120178, + "ce_loss_7": 3.660618233680725, + "epoch": 0.369, + "grad_norm": 486.0, + "kl_loss_10": 136.01496505737305, + "kl_loss_2": 1527.9191650390626, + "kl_loss_3": 1153.4224945068358, + "kl_loss_7": 371.0873062133789, + "learning_rate": 0.0007091497376634463, + "loss": 787.2614, + "step": 3690 + }, + { + "ce_loss_10": 3.5013809204101562, + "ce_loss_13": 3.4402984261512755, + "ce_loss_2": 4.1529758214950565, + "ce_loss_3": 3.961398553848267, + "ce_loss_7": 3.6025787115097048, + "epoch": 0.37, + "grad_norm": 684.0, + "kl_loss_10": 136.102490234375, + "kl_loss_2": 1516.0378479003907, + "kl_loss_3": 1141.0336791992188, + "kl_loss_7": 368.23791046142577, + "learning_rate": 0.0007077075065009433, + "loss": 806.8564, + "step": 3700 + }, + { + "ce_loss_10": 3.6074369311332704, + "ce_loss_13": 3.5442083716392516, + "ce_loss_2": 4.27118090391159, + "ce_loss_3": 4.079712843894958, + "ce_loss_7": 3.7118129253387453, + "epoch": 0.371, + "grad_norm": 616.0, + "kl_loss_10": 141.17327156066895, + "kl_loss_2": 1545.3048278808594, + "kl_loss_3": 1174.181103515625, + "kl_loss_7": 378.79654998779296, + "learning_rate": 0.0007062631837261557, + "loss": 803.7765, + "step": 3710 + }, + { + "ce_loss_10": 3.4776635646820067, + "ce_loss_13": 3.417153787612915, + "ce_loss_2": 4.138115549087525, + "ce_loss_3": 3.947688353061676, + "ce_loss_7": 3.583261823654175, + "epoch": 0.372, + "grad_norm": 912.0, + "kl_loss_10": 136.18754692077636, + "kl_loss_2": 1542.702392578125, + "kl_loss_3": 1155.0931549072266, + "kl_loss_7": 374.73779144287107, + "learning_rate": 0.0007048167838833977, + "loss": 812.6596, + "step": 3720 + }, + { + "ce_loss_10": 3.5752769351005553, + "ce_loss_13": 3.513002848625183, + "ce_loss_2": 4.210619521141052, + "ce_loss_3": 4.022019147872925, + "ce_loss_7": 3.6798208355903625, + "epoch": 0.373, + "grad_norm": 768.0, + "kl_loss_10": 136.63083267211914, + "kl_loss_2": 1506.4640258789063, + "kl_loss_3": 1132.6720397949218, + "kl_loss_7": 377.5881011962891, + "learning_rate": 0.0007033683215379002, + "loss": 791.4403, + "step": 3730 + }, + { + "ce_loss_10": 3.5619895219802857, + "ce_loss_13": 3.4989688754081727, + "ce_loss_2": 4.2153314590454105, + "ce_loss_3": 4.0190078020095825, + "ce_loss_7": 3.6668317794799803, + "epoch": 0.374, + "grad_norm": 728.0, + "kl_loss_10": 134.1514114379883, + "kl_loss_2": 1513.4870727539062, + "kl_loss_3": 1134.2889343261718, + "kl_loss_7": 369.0950271606445, + "learning_rate": 0.0007019178112756625, + "loss": 803.4245, + "step": 3740 + }, + { + "ce_loss_10": 3.52027747631073, + "ce_loss_13": 3.4614068984985353, + "ce_loss_2": 4.172631430625915, + "ce_loss_3": 3.9826321721076967, + "ce_loss_7": 3.623573863506317, + "epoch": 0.375, + "grad_norm": 720.0, + "kl_loss_10": 133.24607543945314, + "kl_loss_2": 1508.3497802734375, + "kl_loss_3": 1138.9531616210938, + "kl_loss_7": 371.3535675048828, + "learning_rate": 0.0007004652677033068, + "loss": 797.0561, + "step": 3750 + }, + { + "ce_loss_10": 3.598753345012665, + "ce_loss_13": 3.5414743185043336, + "ce_loss_2": 4.221561062335968, + "ce_loss_3": 4.034064853191376, + "ce_loss_7": 3.697866952419281, + "epoch": 0.376, + "grad_norm": 620.0, + "kl_loss_10": 131.1568790435791, + "kl_loss_2": 1477.522637939453, + "kl_loss_3": 1113.0271392822265, + "kl_loss_7": 362.1108154296875, + "learning_rate": 0.0006990107054479312, + "loss": 785.3042, + "step": 3760 + }, + { + "ce_loss_10": 3.5856411337852476, + "ce_loss_13": 3.523606741428375, + "ce_loss_2": 4.220624828338623, + "ce_loss_3": 4.039202105998993, + "ce_loss_7": 3.686489188671112, + "epoch": 0.377, + "grad_norm": 784.0, + "kl_loss_10": 134.56854705810548, + "kl_loss_2": 1499.6362426757812, + "kl_loss_3": 1135.1793060302734, + "kl_loss_7": 367.8611801147461, + "learning_rate": 0.000697554139156961, + "loss": 789.6146, + "step": 3770 + }, + { + "ce_loss_10": 3.571768081188202, + "ce_loss_13": 3.5090900897979735, + "ce_loss_2": 4.22368232011795, + "ce_loss_3": 4.029348587989807, + "ce_loss_7": 3.6724517226219175, + "epoch": 0.378, + "grad_norm": 652.0, + "kl_loss_10": 139.37470092773438, + "kl_loss_2": 1539.3553405761718, + "kl_loss_3": 1153.9761901855468, + "kl_loss_7": 376.85098724365236, + "learning_rate": 0.0006960955834980027, + "loss": 789.6849, + "step": 3780 + }, + { + "ce_loss_10": 3.5432674288749695, + "ce_loss_13": 3.481638765335083, + "ce_loss_2": 4.195074439048767, + "ce_loss_3": 4.005574572086334, + "ce_loss_7": 3.645065152645111, + "epoch": 0.379, + "grad_norm": 668.0, + "kl_loss_10": 137.8487949371338, + "kl_loss_2": 1516.3669982910155, + "kl_loss_3": 1146.334066772461, + "kl_loss_7": 368.8575042724609, + "learning_rate": 0.0006946350531586958, + "loss": 794.4263, + "step": 3790 + }, + { + "ce_loss_10": 3.563885974884033, + "ce_loss_13": 3.5041411876678468, + "ce_loss_2": 4.217035782337189, + "ce_loss_3": 4.025569212436676, + "ce_loss_7": 3.670181393623352, + "epoch": 0.38, + "grad_norm": 872.0, + "kl_loss_10": 134.99261245727538, + "kl_loss_2": 1515.9988891601563, + "kl_loss_3": 1138.6263427734375, + "kl_loss_7": 365.9822494506836, + "learning_rate": 0.0006931725628465643, + "loss": 804.7092, + "step": 3800 + }, + { + "ce_loss_10": 3.5913591384887695, + "ce_loss_13": 3.527244985103607, + "ce_loss_2": 4.242334771156311, + "ce_loss_3": 4.055222499370575, + "ce_loss_7": 3.6929625034332276, + "epoch": 0.381, + "grad_norm": 864.0, + "kl_loss_10": 138.56078910827637, + "kl_loss_2": 1516.233056640625, + "kl_loss_3": 1147.4771270751953, + "kl_loss_7": 371.2011444091797, + "learning_rate": 0.0006917081272888696, + "loss": 799.2829, + "step": 3810 + }, + { + "ce_loss_10": 3.483885133266449, + "ce_loss_13": 3.423719954490662, + "ce_loss_2": 4.133236110210419, + "ce_loss_3": 3.9456497192382813, + "ce_loss_7": 3.592355155944824, + "epoch": 0.382, + "grad_norm": 596.0, + "kl_loss_10": 137.18110198974608, + "kl_loss_2": 1511.9404541015624, + "kl_loss_3": 1152.5890838623047, + "kl_loss_7": 371.8784881591797, + "learning_rate": 0.0006902417612324615, + "loss": 790.0969, + "step": 3820 + }, + { + "ce_loss_10": 3.6188692688941955, + "ce_loss_13": 3.555316996574402, + "ce_loss_2": 4.287074863910675, + "ce_loss_3": 4.092138230800629, + "ce_loss_7": 3.7254763722419737, + "epoch": 0.383, + "grad_norm": 792.0, + "kl_loss_10": 141.66006164550782, + "kl_loss_2": 1560.3333679199218, + "kl_loss_3": 1187.6506469726562, + "kl_loss_7": 385.8511306762695, + "learning_rate": 0.00068877347944363, + "loss": 808.5316, + "step": 3830 + }, + { + "ce_loss_10": 3.6141200184822084, + "ce_loss_13": 3.5523874282836916, + "ce_loss_2": 4.2468698740005495, + "ce_loss_3": 4.064640355110169, + "ce_loss_7": 3.715350341796875, + "epoch": 0.384, + "grad_norm": 708.0, + "kl_loss_10": 136.68106803894042, + "kl_loss_2": 1492.917822265625, + "kl_loss_3": 1138.3358154296875, + "kl_loss_7": 369.60352630615233, + "learning_rate": 0.0006873032967079561, + "loss": 799.188, + "step": 3840 + }, + { + "ce_loss_10": 3.5983062267303465, + "ce_loss_13": 3.54010808467865, + "ce_loss_2": 4.215501749515534, + "ce_loss_3": 4.038501214981079, + "ce_loss_7": 3.696172285079956, + "epoch": 0.385, + "grad_norm": 700.0, + "kl_loss_10": 132.26292190551757, + "kl_loss_2": 1474.6833190917969, + "kl_loss_3": 1124.775860595703, + "kl_loss_7": 362.6510437011719, + "learning_rate": 0.0006858312278301637, + "loss": 777.6832, + "step": 3850 + }, + { + "ce_loss_10": 3.6401477217674256, + "ce_loss_13": 3.5812854051589964, + "ce_loss_2": 4.2578874111175535, + "ce_loss_3": 4.075009536743164, + "ce_loss_7": 3.7349116444587707, + "epoch": 0.386, + "grad_norm": 716.0, + "kl_loss_10": 133.9440372467041, + "kl_loss_2": 1480.8558410644532, + "kl_loss_3": 1115.9467712402343, + "kl_loss_7": 363.8341888427734, + "learning_rate": 0.0006843572876339704, + "loss": 778.8414, + "step": 3860 + }, + { + "ce_loss_10": 3.5566078424453735, + "ce_loss_13": 3.499259579181671, + "ce_loss_2": 4.1666911244392395, + "ce_loss_3": 3.9863228678703306, + "ce_loss_7": 3.651654100418091, + "epoch": 0.387, + "grad_norm": 712.0, + "kl_loss_10": 128.78639640808106, + "kl_loss_2": 1448.728778076172, + "kl_loss_3": 1092.9980010986328, + "kl_loss_7": 353.82066497802737, + "learning_rate": 0.0006828814909619373, + "loss": 789.1794, + "step": 3870 + }, + { + "ce_loss_10": 3.68144371509552, + "ce_loss_13": 3.618389356136322, + "ce_loss_2": 4.313943779468536, + "ce_loss_3": 4.125034952163697, + "ce_loss_7": 3.7789146065711976, + "epoch": 0.388, + "grad_norm": 564.0, + "kl_loss_10": 137.6860321044922, + "kl_loss_2": 1484.8064819335937, + "kl_loss_3": 1117.9655303955078, + "kl_loss_7": 368.4295623779297, + "learning_rate": 0.0006814038526753205, + "loss": 776.0895, + "step": 3880 + }, + { + "ce_loss_10": 3.5763523101806642, + "ce_loss_13": 3.5149505019187925, + "ce_loss_2": 4.210803210735321, + "ce_loss_3": 4.024750709533691, + "ce_loss_7": 3.6787024259567263, + "epoch": 0.389, + "grad_norm": 540.0, + "kl_loss_10": 134.68067474365233, + "kl_loss_2": 1505.5961059570313, + "kl_loss_3": 1129.528253173828, + "kl_loss_7": 365.9979309082031, + "learning_rate": 0.0006799243876539213, + "loss": 785.1848, + "step": 3890 + }, + { + "ce_loss_10": 3.4995123624801634, + "ce_loss_13": 3.4395276427268984, + "ce_loss_2": 4.165191233158112, + "ce_loss_3": 3.963883662223816, + "ce_loss_7": 3.6026421189308167, + "epoch": 0.39, + "grad_norm": 856.0, + "kl_loss_10": 132.61898651123047, + "kl_loss_2": 1536.6189819335937, + "kl_loss_3": 1135.7314849853515, + "kl_loss_7": 364.9299591064453, + "learning_rate": 0.0006784431107959359, + "loss": 796.0479, + "step": 3900 + }, + { + "ce_loss_10": 3.5600151419639587, + "ce_loss_13": 3.4963993072509765, + "ce_loss_2": 4.227058172225952, + "ce_loss_3": 4.032915997505188, + "ce_loss_7": 3.669356656074524, + "epoch": 0.391, + "grad_norm": 804.0, + "kl_loss_10": 136.25931625366212, + "kl_loss_2": 1560.3639404296875, + "kl_loss_3": 1170.0307891845703, + "kl_loss_7": 379.51966705322263, + "learning_rate": 0.0006769600370178059, + "loss": 800.6438, + "step": 3910 + }, + { + "ce_loss_10": 3.5234851360321047, + "ce_loss_13": 3.463622975349426, + "ce_loss_2": 4.184793496131897, + "ce_loss_3": 3.9945266962051393, + "ce_loss_7": 3.634400510787964, + "epoch": 0.392, + "grad_norm": 572.0, + "kl_loss_10": 132.96168327331543, + "kl_loss_2": 1528.766912841797, + "kl_loss_3": 1149.3914581298827, + "kl_loss_7": 369.23984985351564, + "learning_rate": 0.0006754751812540679, + "loss": 781.9199, + "step": 3920 + }, + { + "ce_loss_10": 3.573040223121643, + "ce_loss_13": 3.5096162438392637, + "ce_loss_2": 4.2254300832748415, + "ce_loss_3": 4.032295274734497, + "ce_loss_7": 3.6753496289253236, + "epoch": 0.393, + "grad_norm": 776.0, + "kl_loss_10": 137.39977340698243, + "kl_loss_2": 1524.5934692382812, + "kl_loss_3": 1142.2673461914062, + "kl_loss_7": 372.6196746826172, + "learning_rate": 0.0006739885584572025, + "loss": 799.1489, + "step": 3930 + }, + { + "ce_loss_10": 3.6011941909790037, + "ce_loss_13": 3.5389833211898805, + "ce_loss_2": 4.258283352851867, + "ce_loss_3": 4.056079113483429, + "ce_loss_7": 3.706939959526062, + "epoch": 0.394, + "grad_norm": 760.0, + "kl_loss_10": 138.09807777404785, + "kl_loss_2": 1554.7301696777345, + "kl_loss_3": 1153.3549835205079, + "kl_loss_7": 372.54356536865237, + "learning_rate": 0.0006725001835974853, + "loss": 791.1359, + "step": 3940 + }, + { + "ce_loss_10": 3.5847023248672487, + "ce_loss_13": 3.5242226362228393, + "ce_loss_2": 4.236932027339935, + "ce_loss_3": 4.044707441329956, + "ce_loss_7": 3.6880187392234802, + "epoch": 0.395, + "grad_norm": 604.0, + "kl_loss_10": 137.06708946228028, + "kl_loss_2": 1531.090771484375, + "kl_loss_3": 1148.1961975097656, + "kl_loss_7": 372.88367767333983, + "learning_rate": 0.0006710100716628344, + "loss": 781.9915, + "step": 3950 + }, + { + "ce_loss_10": 3.568317210674286, + "ce_loss_13": 3.5080092191696166, + "ce_loss_2": 4.221521747112274, + "ce_loss_3": 4.030601763725281, + "ce_loss_7": 3.673730731010437, + "epoch": 0.396, + "grad_norm": 732.0, + "kl_loss_10": 134.04078521728516, + "kl_loss_2": 1522.4079895019531, + "kl_loss_3": 1151.196096801758, + "kl_loss_7": 372.5210220336914, + "learning_rate": 0.0006695182376586602, + "loss": 800.3014, + "step": 3960 + }, + { + "ce_loss_10": 3.6057994961738586, + "ce_loss_13": 3.5476833462715147, + "ce_loss_2": 4.220119166374206, + "ce_loss_3": 4.040166866779328, + "ce_loss_7": 3.705691361427307, + "epoch": 0.397, + "grad_norm": 940.0, + "kl_loss_10": 128.9409210205078, + "kl_loss_2": 1444.9400512695313, + "kl_loss_3": 1091.2126403808593, + "kl_loss_7": 357.9466751098633, + "learning_rate": 0.000668024696607715, + "loss": 783.9521, + "step": 3970 + }, + { + "ce_loss_10": 3.5564019203186037, + "ce_loss_13": 3.4956687688827515, + "ce_loss_2": 4.194620299339294, + "ce_loss_3": 4.004274892807007, + "ce_loss_7": 3.6527443528175354, + "epoch": 0.398, + "grad_norm": 704.0, + "kl_loss_10": 134.44026298522948, + "kl_loss_2": 1512.640576171875, + "kl_loss_3": 1141.9819427490233, + "kl_loss_7": 367.26869049072263, + "learning_rate": 0.0006665294635499404, + "loss": 789.8477, + "step": 3980 + }, + { + "ce_loss_10": 3.567546045780182, + "ce_loss_13": 3.5039931416511534, + "ce_loss_2": 4.231116080284119, + "ce_loss_3": 4.034705054759979, + "ce_loss_7": 3.672286367416382, + "epoch": 0.399, + "grad_norm": 700.0, + "kl_loss_10": 143.2273063659668, + "kl_loss_2": 1571.5681518554688, + "kl_loss_3": 1178.7588897705077, + "kl_loss_7": 382.66470794677736, + "learning_rate": 0.0006650325535423167, + "loss": 806.0485, + "step": 3990 + }, + { + "ce_loss_10": 3.589041221141815, + "ce_loss_13": 3.5297691583633424, + "ce_loss_2": 4.205903816223144, + "ce_loss_3": 4.024567484855652, + "ce_loss_7": 3.6903899908065796, + "epoch": 0.4, + "grad_norm": 716.0, + "kl_loss_10": 134.77462882995604, + "kl_loss_2": 1448.7763793945312, + "kl_loss_3": 1097.182077026367, + "kl_loss_7": 360.74890594482423, + "learning_rate": 0.0006635339816587109, + "loss": 774.6992, + "step": 4000 + }, + { + "ce_loss_10": 3.534158933162689, + "ce_loss_13": 3.4722786784172057, + "ce_loss_2": 4.18008325099945, + "ce_loss_3": 3.9927919030189516, + "ce_loss_7": 3.636194169521332, + "epoch": 0.401, + "grad_norm": 608.0, + "kl_loss_10": 137.7214611053467, + "kl_loss_2": 1517.0520935058594, + "kl_loss_3": 1144.4528839111329, + "kl_loss_7": 367.4022415161133, + "learning_rate": 0.0006620337629897252, + "loss": 785.9456, + "step": 4010 + }, + { + "ce_loss_10": 3.5372211933135986, + "ce_loss_13": 3.475364565849304, + "ce_loss_2": 4.187776136398315, + "ce_loss_3": 3.995320773124695, + "ce_loss_7": 3.6415831565856935, + "epoch": 0.402, + "grad_norm": 544.0, + "kl_loss_10": 137.00096588134767, + "kl_loss_2": 1516.9021362304688, + "kl_loss_3": 1134.6023742675782, + "kl_loss_7": 368.9509552001953, + "learning_rate": 0.0006605319126425454, + "loss": 802.8082, + "step": 4020 + }, + { + "ce_loss_10": 3.438814675807953, + "ce_loss_13": 3.379668688774109, + "ce_loss_2": 4.109059810638428, + "ce_loss_3": 3.909671998023987, + "ce_loss_7": 3.544471001625061, + "epoch": 0.403, + "grad_norm": 560.0, + "kl_loss_10": 136.3015495300293, + "kl_loss_2": 1560.7927734375, + "kl_loss_3": 1171.8411346435546, + "kl_loss_7": 373.3894378662109, + "learning_rate": 0.0006590284457407876, + "loss": 802.5854, + "step": 4030 + }, + { + "ce_loss_10": 3.544812524318695, + "ce_loss_13": 3.481894314289093, + "ce_loss_2": 4.181082665920258, + "ce_loss_3": 3.9941136717796324, + "ce_loss_7": 3.6471312403678895, + "epoch": 0.404, + "grad_norm": 504.0, + "kl_loss_10": 135.4376022338867, + "kl_loss_2": 1496.4591674804688, + "kl_loss_3": 1126.9239776611328, + "kl_loss_7": 368.8154357910156, + "learning_rate": 0.0006575233774243465, + "loss": 785.2985, + "step": 4040 + }, + { + "ce_loss_10": 3.529277968406677, + "ce_loss_13": 3.469450843334198, + "ce_loss_2": 4.183178901672363, + "ce_loss_3": 3.989623689651489, + "ce_loss_7": 3.634640073776245, + "epoch": 0.405, + "grad_norm": 760.0, + "kl_loss_10": 134.75451889038087, + "kl_loss_2": 1538.0418823242187, + "kl_loss_3": 1155.4544311523437, + "kl_loss_7": 372.7149856567383, + "learning_rate": 0.0006560167228492435, + "loss": 793.4811, + "step": 4050 + }, + { + "ce_loss_10": 3.577071988582611, + "ce_loss_13": 3.5211926221847536, + "ce_loss_2": 4.202403485774994, + "ce_loss_3": 4.0193228960037235, + "ce_loss_7": 3.6783261060714723, + "epoch": 0.406, + "grad_norm": 792.0, + "kl_loss_10": 129.85480155944825, + "kl_loss_2": 1467.7122497558594, + "kl_loss_3": 1107.2084655761719, + "kl_loss_7": 358.02198791503906, + "learning_rate": 0.0006545084971874737, + "loss": 784.8675, + "step": 4060 + }, + { + "ce_loss_10": 3.5425363302230837, + "ce_loss_13": 3.481105864048004, + "ce_loss_2": 4.214594578742981, + "ce_loss_3": 4.014482605457306, + "ce_loss_7": 3.651681327819824, + "epoch": 0.407, + "grad_norm": 604.0, + "kl_loss_10": 137.89012451171874, + "kl_loss_2": 1563.3667907714844, + "kl_loss_3": 1169.847265625, + "kl_loss_7": 378.33221740722655, + "learning_rate": 0.0006529987156268526, + "loss": 790.6762, + "step": 4070 + }, + { + "ce_loss_10": 3.461014246940613, + "ce_loss_13": 3.3993191599845884, + "ce_loss_2": 4.127788650989532, + "ce_loss_3": 3.9290434598922728, + "ce_loss_7": 3.5692273139953614, + "epoch": 0.408, + "grad_norm": 692.0, + "kl_loss_10": 135.76864738464354, + "kl_loss_2": 1537.1478942871095, + "kl_loss_3": 1151.7357269287108, + "kl_loss_7": 372.04627532958983, + "learning_rate": 0.0006514873933708637, + "loss": 806.6379, + "step": 4080 + }, + { + "ce_loss_10": 3.5668829321861266, + "ce_loss_13": 3.508492851257324, + "ce_loss_2": 4.2085763812065125, + "ce_loss_3": 4.0170141696929935, + "ce_loss_7": 3.6687933683395384, + "epoch": 0.409, + "grad_norm": 680.0, + "kl_loss_10": 133.2143711090088, + "kl_loss_2": 1492.4684814453126, + "kl_loss_3": 1121.5162017822265, + "kl_loss_7": 364.1954345703125, + "learning_rate": 0.0006499745456385053, + "loss": 779.6072, + "step": 4090 + }, + { + "ce_loss_10": 3.5439602375030517, + "ce_loss_13": 3.481731653213501, + "ce_loss_2": 4.193144726753235, + "ce_loss_3": 4.0000463128089905, + "ce_loss_7": 3.650206482410431, + "epoch": 0.41, + "grad_norm": 720.0, + "kl_loss_10": 138.83553466796874, + "kl_loss_2": 1504.902880859375, + "kl_loss_3": 1128.8948608398437, + "kl_loss_7": 372.4293869018555, + "learning_rate": 0.0006484601876641375, + "loss": 796.3825, + "step": 4100 + }, + { + "ce_loss_10": 3.524893641471863, + "ce_loss_13": 3.4652820467948913, + "ce_loss_2": 4.154143571853638, + "ce_loss_3": 3.963244378566742, + "ce_loss_7": 3.6235713839530943, + "epoch": 0.411, + "grad_norm": 524.0, + "kl_loss_10": 135.7376163482666, + "kl_loss_2": 1475.3457275390624, + "kl_loss_3": 1102.1329620361328, + "kl_loss_7": 360.7025970458984, + "learning_rate": 0.000646944334697328, + "loss": 776.625, + "step": 4110 + }, + { + "ce_loss_10": 3.6424561977386474, + "ce_loss_13": 3.582131230831146, + "ce_loss_2": 4.258833718299866, + "ce_loss_3": 4.075766789913177, + "ce_loss_7": 3.7454846620559694, + "epoch": 0.412, + "grad_norm": 520.0, + "kl_loss_10": 134.410147857666, + "kl_loss_2": 1442.3484252929688, + "kl_loss_3": 1087.205389404297, + "kl_loss_7": 362.8587707519531, + "learning_rate": 0.0006454270020026995, + "loss": 761.3288, + "step": 4120 + }, + { + "ce_loss_10": 3.6111577272415163, + "ce_loss_13": 3.5512704968452455, + "ce_loss_2": 4.226312971115112, + "ce_loss_3": 4.035128366947174, + "ce_loss_7": 3.707612764835358, + "epoch": 0.413, + "grad_norm": 556.0, + "kl_loss_10": 134.3211742401123, + "kl_loss_2": 1434.7612060546876, + "kl_loss_3": 1081.237615966797, + "kl_loss_7": 354.78483428955076, + "learning_rate": 0.0006439082048597755, + "loss": 762.4907, + "step": 4130 + }, + { + "ce_loss_10": 3.5981813311576842, + "ce_loss_13": 3.5334428906440736, + "ce_loss_2": 4.23305733203888, + "ce_loss_3": 4.0418706178665165, + "ce_loss_7": 3.6992475748062135, + "epoch": 0.414, + "grad_norm": 752.0, + "kl_loss_10": 138.56320190429688, + "kl_loss_2": 1493.8901489257812, + "kl_loss_3": 1121.1268218994142, + "kl_loss_7": 366.90565490722656, + "learning_rate": 0.0006423879585628261, + "loss": 783.5715, + "step": 4140 + }, + { + "ce_loss_10": 3.560960817337036, + "ce_loss_13": 3.495613181591034, + "ce_loss_2": 4.226023101806641, + "ce_loss_3": 4.0232850313186646, + "ce_loss_7": 3.6655412554740905, + "epoch": 0.415, + "grad_norm": 1056.0, + "kl_loss_10": 140.29082679748535, + "kl_loss_2": 1548.172344970703, + "kl_loss_3": 1152.182876586914, + "kl_loss_7": 374.3309783935547, + "learning_rate": 0.0006408662784207149, + "loss": 802.2298, + "step": 4150 + }, + { + "ce_loss_10": 3.5164562225341798, + "ce_loss_13": 3.456049418449402, + "ce_loss_2": 4.159878623485565, + "ce_loss_3": 3.973027527332306, + "ce_loss_7": 3.6125931262969972, + "epoch": 0.416, + "grad_norm": 708.0, + "kl_loss_10": 132.27476196289064, + "kl_loss_2": 1515.6659545898438, + "kl_loss_3": 1142.7359313964844, + "kl_loss_7": 364.6348907470703, + "learning_rate": 0.0006393431797567439, + "loss": 789.1856, + "step": 4160 + }, + { + "ce_loss_10": 3.6022548198699953, + "ce_loss_13": 3.5429869413375856, + "ce_loss_2": 4.207081604003906, + "ce_loss_3": 4.019421660900116, + "ce_loss_7": 3.6976672649383544, + "epoch": 0.417, + "grad_norm": 612.0, + "kl_loss_10": 132.8531795501709, + "kl_loss_2": 1457.9248413085938, + "kl_loss_3": 1092.8853637695313, + "kl_loss_7": 359.7133728027344, + "learning_rate": 0.0006378186779084996, + "loss": 753.2417, + "step": 4170 + }, + { + "ce_loss_10": 3.4348825335502626, + "ce_loss_13": 3.3747984290122988, + "ce_loss_2": 4.09944007396698, + "ce_loss_3": 3.9020219922065733, + "ce_loss_7": 3.5400782108306883, + "epoch": 0.418, + "grad_norm": 680.0, + "kl_loss_10": 132.64606018066405, + "kl_loss_2": 1522.9734741210937, + "kl_loss_3": 1138.7015899658204, + "kl_loss_7": 366.7797393798828, + "learning_rate": 0.0006362927882276989, + "loss": 789.6667, + "step": 4180 + }, + { + "ce_loss_10": 3.6310564041137696, + "ce_loss_13": 3.5738319396972655, + "ce_loss_2": 4.248906910419464, + "ce_loss_3": 4.05875905752182, + "ce_loss_7": 3.7278629899024964, + "epoch": 0.419, + "grad_norm": 508.0, + "kl_loss_10": 132.2452365875244, + "kl_loss_2": 1463.0255004882813, + "kl_loss_3": 1091.5722778320312, + "kl_loss_7": 354.3390426635742, + "learning_rate": 0.000634765526080034, + "loss": 756.9509, + "step": 4190 + }, + { + "ce_loss_10": 3.633085823059082, + "ce_loss_13": 3.572413682937622, + "ce_loss_2": 4.256156611442566, + "ce_loss_3": 4.07630068063736, + "ce_loss_7": 3.7311901450157166, + "epoch": 0.42, + "grad_norm": 680.0, + "kl_loss_10": 136.76210670471193, + "kl_loss_2": 1473.4716735839843, + "kl_loss_3": 1115.9601440429688, + "kl_loss_7": 365.41827697753905, + "learning_rate": 0.0006332369068450174, + "loss": 766.6168, + "step": 4200 + }, + { + "ce_loss_10": 3.5673229098320007, + "ce_loss_13": 3.5074177622795104, + "ce_loss_2": 4.202928698062896, + "ce_loss_3": 4.015132880210876, + "ce_loss_7": 3.6681143999099732, + "epoch": 0.421, + "grad_norm": 628.0, + "kl_loss_10": 134.72608184814453, + "kl_loss_2": 1491.3523498535155, + "kl_loss_3": 1126.61123046875, + "kl_loss_7": 365.6973648071289, + "learning_rate": 0.0006317069459158283, + "loss": 775.0173, + "step": 4210 + }, + { + "ce_loss_10": 3.6810522437095643, + "ce_loss_13": 3.6207136154174804, + "ce_loss_2": 4.278917360305786, + "ce_loss_3": 4.101971006393432, + "ce_loss_7": 3.776351547241211, + "epoch": 0.422, + "grad_norm": 560.0, + "kl_loss_10": 138.53412284851075, + "kl_loss_2": 1443.6548767089844, + "kl_loss_3": 1091.453189086914, + "kl_loss_7": 359.41749114990233, + "learning_rate": 0.0006301756586991561, + "loss": 771.2455, + "step": 4220 + }, + { + "ce_loss_10": 3.453490364551544, + "ce_loss_13": 3.39296897649765, + "ce_loss_2": 4.11007170677185, + "ce_loss_3": 3.9121878266334535, + "ce_loss_7": 3.5533955574035643, + "epoch": 0.423, + "grad_norm": 620.0, + "kl_loss_10": 141.56931991577147, + "kl_loss_2": 1538.1698364257813, + "kl_loss_3": 1154.8234252929688, + "kl_loss_7": 367.4682983398437, + "learning_rate": 0.0006286430606150459, + "loss": 792.7883, + "step": 4230 + }, + { + "ce_loss_10": 3.6686601042747498, + "ce_loss_13": 3.592938470840454, + "ce_loss_2": 4.279926109313965, + "ce_loss_3": 4.093138873577118, + "ce_loss_7": 3.750678813457489, + "epoch": 0.424, + "grad_norm": 644.0, + "kl_loss_10": 158.1731170654297, + "kl_loss_2": 1484.5655151367187, + "kl_loss_3": 1109.7966552734374, + "kl_loss_7": 363.6700012207031, + "learning_rate": 0.0006271091670967436, + "loss": 774.7598, + "step": 4240 + }, + { + "ce_loss_10": 3.5760006070137025, + "ce_loss_13": 3.508677899837494, + "ce_loss_2": 4.227059412002563, + "ce_loss_3": 4.0314107775688175, + "ce_loss_7": 3.6780433177948, + "epoch": 0.425, + "grad_norm": 740.0, + "kl_loss_10": 156.61751861572264, + "kl_loss_2": 1553.6643676757812, + "kl_loss_3": 1164.257012939453, + "kl_loss_7": 379.240168762207, + "learning_rate": 0.0006255739935905395, + "loss": 794.0103, + "step": 4250 + }, + { + "ce_loss_10": 3.6080758333206178, + "ce_loss_13": 3.5441837668418885, + "ce_loss_2": 4.217771422863007, + "ce_loss_3": 4.030789840221405, + "ce_loss_7": 3.696694552898407, + "epoch": 0.426, + "grad_norm": 700.0, + "kl_loss_10": 147.6225784301758, + "kl_loss_2": 1459.3168395996095, + "kl_loss_3": 1092.8426239013672, + "kl_loss_7": 360.9449523925781, + "learning_rate": 0.0006240375555556145, + "loss": 787.5354, + "step": 4260 + }, + { + "ce_loss_10": 3.619337785243988, + "ce_loss_13": 3.5525267839431764, + "ce_loss_2": 4.272982287406921, + "ce_loss_3": 4.074936735630035, + "ce_loss_7": 3.7203184485435488, + "epoch": 0.427, + "grad_norm": 580.0, + "kl_loss_10": 143.41627883911133, + "kl_loss_2": 1519.2254211425782, + "kl_loss_3": 1136.8392669677735, + "kl_loss_7": 366.1272567749023, + "learning_rate": 0.000622499868463882, + "loss": 784.6523, + "step": 4270 + }, + { + "ce_loss_10": 3.583260440826416, + "ce_loss_13": 3.522217357158661, + "ce_loss_2": 4.191686594486237, + "ce_loss_3": 4.004029047489166, + "ce_loss_7": 3.6739280343055727, + "epoch": 0.428, + "grad_norm": 760.0, + "kl_loss_10": 137.14938316345214, + "kl_loss_2": 1458.7350463867188, + "kl_loss_3": 1091.8113159179688, + "kl_loss_7": 355.58583679199216, + "learning_rate": 0.0006209609477998338, + "loss": 772.4823, + "step": 4280 + }, + { + "ce_loss_10": 3.6303748726844787, + "ce_loss_13": 3.5688055872917177, + "ce_loss_2": 4.266738796234131, + "ce_loss_3": 4.077020514011383, + "ce_loss_7": 3.73097505569458, + "epoch": 0.429, + "grad_norm": 728.0, + "kl_loss_10": 138.04970626831056, + "kl_loss_2": 1487.9487243652343, + "kl_loss_3": 1125.538851928711, + "kl_loss_7": 364.74159851074216, + "learning_rate": 0.0006194208090603844, + "loss": 785.8367, + "step": 4290 + }, + { + "ce_loss_10": 3.5521617889404298, + "ce_loss_13": 3.492956447601318, + "ce_loss_2": 4.177777218818664, + "ce_loss_3": 3.9935933470726015, + "ce_loss_7": 3.6513041496276855, + "epoch": 0.43, + "grad_norm": 628.0, + "kl_loss_10": 130.21162300109864, + "kl_loss_2": 1462.7013305664063, + "kl_loss_3": 1094.481967163086, + "kl_loss_7": 351.07578735351564, + "learning_rate": 0.0006178794677547138, + "loss": 761.1579, + "step": 4300 + }, + { + "ce_loss_10": 3.580393135547638, + "ce_loss_13": 3.5223410606384276, + "ce_loss_2": 4.22029185295105, + "ce_loss_3": 4.0296910285949705, + "ce_loss_7": 3.6796607255935667, + "epoch": 0.431, + "grad_norm": 804.0, + "kl_loss_10": 135.95439834594725, + "kl_loss_2": 1507.2907592773438, + "kl_loss_3": 1135.5215759277344, + "kl_loss_7": 368.5531372070312, + "learning_rate": 0.0006163369394041111, + "loss": 777.0869, + "step": 4310 + }, + { + "ce_loss_10": 3.5184757232666017, + "ce_loss_13": 3.4593456625938415, + "ce_loss_2": 4.166491711139679, + "ce_loss_3": 3.9752755165100098, + "ce_loss_7": 3.6229493021965027, + "epoch": 0.432, + "grad_norm": 816.0, + "kl_loss_10": 132.72277908325196, + "kl_loss_2": 1510.2736328125, + "kl_loss_3": 1137.5515747070312, + "kl_loss_7": 362.1718246459961, + "learning_rate": 0.0006147932395418205, + "loss": 797.0466, + "step": 4320 + }, + { + "ce_loss_10": 3.5534854769706725, + "ce_loss_13": 3.493700551986694, + "ce_loss_2": 4.1732647776603695, + "ce_loss_3": 3.9909292578697206, + "ce_loss_7": 3.652637302875519, + "epoch": 0.433, + "grad_norm": 532.0, + "kl_loss_10": 131.40403900146484, + "kl_loss_2": 1471.334698486328, + "kl_loss_3": 1111.2522705078125, + "kl_loss_7": 360.8691162109375, + "learning_rate": 0.0006132483837128823, + "loss": 767.8431, + "step": 4330 + }, + { + "ce_loss_10": 3.534538817405701, + "ce_loss_13": 3.4763117671012878, + "ce_loss_2": 4.177856540679931, + "ce_loss_3": 3.977602541446686, + "ce_loss_7": 3.6365644097328187, + "epoch": 0.434, + "grad_norm": 576.0, + "kl_loss_10": 132.02191047668458, + "kl_loss_2": 1511.0365112304687, + "kl_loss_3": 1116.52158203125, + "kl_loss_7": 363.1332717895508, + "learning_rate": 0.0006117023874739772, + "loss": 782.3545, + "step": 4340 + }, + { + "ce_loss_10": 3.523367393016815, + "ce_loss_13": 3.465071129798889, + "ce_loss_2": 4.167889106273651, + "ce_loss_3": 3.9776375889778137, + "ce_loss_7": 3.6235153794288637, + "epoch": 0.435, + "grad_norm": 560.0, + "kl_loss_10": 132.62512550354003, + "kl_loss_2": 1522.6255187988281, + "kl_loss_3": 1134.744940185547, + "kl_loss_7": 364.6108734130859, + "learning_rate": 0.0006101552663932703, + "loss": 787.945, + "step": 4350 + }, + { + "ce_loss_10": 3.560639572143555, + "ce_loss_13": 3.4985696911811828, + "ce_loss_2": 4.184593963623047, + "ce_loss_3": 3.9949865102767945, + "ce_loss_7": 3.657104122638702, + "epoch": 0.436, + "grad_norm": 652.0, + "kl_loss_10": 134.92124710083007, + "kl_loss_2": 1480.7232177734375, + "kl_loss_3": 1106.3843566894532, + "kl_loss_7": 362.02510833740234, + "learning_rate": 0.0006086070360502539, + "loss": 776.0767, + "step": 4360 + }, + { + "ce_loss_10": 3.5607874393463135, + "ce_loss_13": 3.5032904148101807, + "ce_loss_2": 4.19012690782547, + "ce_loss_3": 4.001884508132934, + "ce_loss_7": 3.661303186416626, + "epoch": 0.437, + "grad_norm": 652.0, + "kl_loss_10": 131.70704956054686, + "kl_loss_2": 1494.450860595703, + "kl_loss_3": 1114.0662811279296, + "kl_loss_7": 359.0336669921875, + "learning_rate": 0.0006070577120355903, + "loss": 773.2331, + "step": 4370 + }, + { + "ce_loss_10": 3.5673134207725523, + "ce_loss_13": 3.5069605112075806, + "ce_loss_2": 4.194201278686523, + "ce_loss_3": 4.008713376522064, + "ce_loss_7": 3.6672159075737, + "epoch": 0.438, + "grad_norm": 724.0, + "kl_loss_10": 129.65911521911622, + "kl_loss_2": 1461.538018798828, + "kl_loss_3": 1099.2910552978515, + "kl_loss_7": 357.59047546386716, + "learning_rate": 0.0006055073099509549, + "loss": 765.2788, + "step": 4380 + }, + { + "ce_loss_10": 3.6260691046714784, + "ce_loss_13": 3.566242527961731, + "ce_loss_2": 4.243451619148255, + "ce_loss_3": 4.051405000686645, + "ce_loss_7": 3.723850154876709, + "epoch": 0.439, + "grad_norm": 708.0, + "kl_loss_10": 131.5600498199463, + "kl_loss_2": 1464.825506591797, + "kl_loss_3": 1095.515121459961, + "kl_loss_7": 358.8866928100586, + "learning_rate": 0.0006039558454088796, + "loss": 777.1961, + "step": 4390 + }, + { + "ce_loss_10": 3.600092685222626, + "ce_loss_13": 3.5382148027420044, + "ce_loss_2": 4.238782167434692, + "ce_loss_3": 4.047287583351135, + "ce_loss_7": 3.7017881989479067, + "epoch": 0.44, + "grad_norm": 720.0, + "kl_loss_10": 134.45111694335938, + "kl_loss_2": 1488.527685546875, + "kl_loss_3": 1123.7609832763671, + "kl_loss_7": 363.68369445800784, + "learning_rate": 0.0006024033340325954, + "loss": 766.9879, + "step": 4400 + }, + { + "ce_loss_10": 3.6631633281707763, + "ce_loss_13": 3.6062058329582216, + "ce_loss_2": 4.263676905632019, + "ce_loss_3": 4.082637584209442, + "ce_loss_7": 3.75854674577713, + "epoch": 0.441, + "grad_norm": 492.0, + "kl_loss_10": 127.09016380310058, + "kl_loss_2": 1417.7144897460937, + "kl_loss_3": 1058.3302947998047, + "kl_loss_7": 343.7723358154297, + "learning_rate": 0.0006008497914558743, + "loss": 752.2297, + "step": 4410 + }, + { + "ce_loss_10": 3.601701498031616, + "ce_loss_13": 3.5420748829841613, + "ce_loss_2": 4.236417984962463, + "ce_loss_3": 4.046699476242066, + "ce_loss_7": 3.7043898940086364, + "epoch": 0.442, + "grad_norm": 584.0, + "kl_loss_10": 135.9376647949219, + "kl_loss_2": 1491.8550354003905, + "kl_loss_3": 1120.2072509765626, + "kl_loss_7": 366.13590545654296, + "learning_rate": 0.0005992952333228728, + "loss": 773.4489, + "step": 4420 + }, + { + "ce_loss_10": 3.5400898575782778, + "ce_loss_13": 3.4842668890953066, + "ce_loss_2": 4.171782422065735, + "ce_loss_3": 3.9820830821990967, + "ce_loss_7": 3.6369754314422607, + "epoch": 0.443, + "grad_norm": 660.0, + "kl_loss_10": 129.43466796875, + "kl_loss_2": 1496.3381286621093, + "kl_loss_3": 1116.0555755615235, + "kl_loss_7": 355.4880676269531, + "learning_rate": 0.0005977396752879741, + "loss": 771.3094, + "step": 4430 + }, + { + "ce_loss_10": 3.4704235672950743, + "ce_loss_13": 3.4127361297607424, + "ce_loss_2": 4.106459307670593, + "ce_loss_3": 3.920796346664429, + "ce_loss_7": 3.573098838329315, + "epoch": 0.444, + "grad_norm": 732.0, + "kl_loss_10": 126.1293731689453, + "kl_loss_2": 1490.1605346679687, + "kl_loss_3": 1120.4780059814452, + "kl_loss_7": 354.9955078125, + "learning_rate": 0.0005961831330156305, + "loss": 764.2076, + "step": 4440 + }, + { + "ce_loss_10": 3.615150511264801, + "ce_loss_13": 3.5547899127006533, + "ce_loss_2": 4.245680320262909, + "ce_loss_3": 4.054165327548981, + "ce_loss_7": 3.712630808353424, + "epoch": 0.445, + "grad_norm": 628.0, + "kl_loss_10": 131.35802993774413, + "kl_loss_2": 1499.3917602539063, + "kl_loss_3": 1117.8766357421875, + "kl_loss_7": 361.68542938232423, + "learning_rate": 0.0005946256221802051, + "loss": 786.0435, + "step": 4450 + }, + { + "ce_loss_10": 3.591351556777954, + "ce_loss_13": 3.535250651836395, + "ce_loss_2": 4.191380190849304, + "ce_loss_3": 4.004255092144012, + "ce_loss_7": 3.6844709634780886, + "epoch": 0.446, + "grad_norm": 708.0, + "kl_loss_10": 129.50046005249024, + "kl_loss_2": 1424.1330322265626, + "kl_loss_3": 1067.6145050048829, + "kl_loss_7": 349.01904144287107, + "learning_rate": 0.0005930671584658151, + "loss": 778.2988, + "step": 4460 + }, + { + "ce_loss_10": 3.590542936325073, + "ce_loss_13": 3.5338905096054076, + "ce_loss_2": 4.219100630283355, + "ce_loss_3": 4.026160931587219, + "ce_loss_7": 3.687147891521454, + "epoch": 0.447, + "grad_norm": 572.0, + "kl_loss_10": 130.2479160308838, + "kl_loss_2": 1492.514990234375, + "kl_loss_3": 1115.7035186767578, + "kl_loss_7": 358.6115783691406, + "learning_rate": 0.0005915077575661722, + "loss": 782.5174, + "step": 4470 + }, + { + "ce_loss_10": 3.6091010570526123, + "ce_loss_13": 3.548303461074829, + "ce_loss_2": 4.243482124805451, + "ce_loss_3": 4.051424252986908, + "ce_loss_7": 3.709373152256012, + "epoch": 0.448, + "grad_norm": 692.0, + "kl_loss_10": 133.69428749084472, + "kl_loss_2": 1503.4253295898438, + "kl_loss_3": 1123.9545928955079, + "kl_loss_7": 367.3063217163086, + "learning_rate": 0.000589947435184427, + "loss": 770.9538, + "step": 4480 + }, + { + "ce_loss_10": 3.67550984621048, + "ce_loss_13": 3.61795197725296, + "ce_loss_2": 4.270373678207397, + "ce_loss_3": 4.090293383598327, + "ce_loss_7": 3.769421923160553, + "epoch": 0.449, + "grad_norm": 548.0, + "kl_loss_10": 132.92403182983398, + "kl_loss_2": 1453.855810546875, + "kl_loss_3": 1089.5215789794922, + "kl_loss_7": 359.96496124267577, + "learning_rate": 0.0005883862070330078, + "loss": 768.3579, + "step": 4490 + }, + { + "ce_loss_10": 3.6023089408874513, + "ce_loss_13": 3.5429613828659057, + "ce_loss_2": 4.2273586869239805, + "ce_loss_3": 4.0474681735038756, + "ce_loss_7": 3.7016926527023317, + "epoch": 0.45, + "grad_norm": 640.0, + "kl_loss_10": 131.65977668762207, + "kl_loss_2": 1477.8073486328126, + "kl_loss_3": 1112.020849609375, + "kl_loss_7": 363.57453002929685, + "learning_rate": 0.0005868240888334653, + "loss": 768.102, + "step": 4500 + }, + { + "ce_loss_10": 3.4881216049194337, + "ce_loss_13": 3.4285590648651123, + "ce_loss_2": 4.145088362693786, + "ce_loss_3": 3.9443087697029116, + "ce_loss_7": 3.5946366786956787, + "epoch": 0.451, + "grad_norm": 860.0, + "kl_loss_10": 132.0642234802246, + "kl_loss_2": 1523.9566467285156, + "kl_loss_3": 1139.5087677001952, + "kl_loss_7": 370.548567199707, + "learning_rate": 0.0005852610963163119, + "loss": 782.7128, + "step": 4510 + }, + { + "ce_loss_10": 3.509654688835144, + "ce_loss_13": 3.453027904033661, + "ce_loss_2": 4.135330331325531, + "ce_loss_3": 3.948072779178619, + "ce_loss_7": 3.6065049171447754, + "epoch": 0.452, + "grad_norm": 576.0, + "kl_loss_10": 127.13306007385253, + "kl_loss_2": 1474.840264892578, + "kl_loss_3": 1103.8429443359375, + "kl_loss_7": 352.3040481567383, + "learning_rate": 0.0005836972452208654, + "loss": 758.6461, + "step": 4520 + }, + { + "ce_loss_10": 3.516415762901306, + "ce_loss_13": 3.4608123779296873, + "ce_loss_2": 4.151223230361938, + "ce_loss_3": 3.964005374908447, + "ce_loss_7": 3.6154285073280334, + "epoch": 0.453, + "grad_norm": 692.0, + "kl_loss_10": 129.6707332611084, + "kl_loss_2": 1490.4531555175781, + "kl_loss_3": 1112.0751403808595, + "kl_loss_7": 360.926252746582, + "learning_rate": 0.0005821325512950885, + "loss": 774.0762, + "step": 4530 + }, + { + "ce_loss_10": 3.5449029922485353, + "ce_loss_13": 3.4901923894882203, + "ce_loss_2": 4.175939702987671, + "ce_loss_3": 3.9821668744087217, + "ce_loss_7": 3.6469666838645933, + "epoch": 0.454, + "grad_norm": 536.0, + "kl_loss_10": 126.68943367004394, + "kl_loss_2": 1457.47294921875, + "kl_loss_3": 1089.1407897949218, + "kl_loss_7": 353.19891662597655, + "learning_rate": 0.0005805670302954321, + "loss": 773.5169, + "step": 4540 + }, + { + "ce_loss_10": 3.551515507698059, + "ce_loss_13": 3.496657633781433, + "ce_loss_2": 4.165827226638794, + "ce_loss_3": 3.9793267846107483, + "ce_loss_7": 3.6467077493667603, + "epoch": 0.455, + "grad_norm": 632.0, + "kl_loss_10": 124.99089202880859, + "kl_loss_2": 1454.5718872070313, + "kl_loss_3": 1086.5160583496095, + "kl_loss_7": 349.37010803222654, + "learning_rate": 0.000579000697986675, + "loss": 757.4347, + "step": 4550 + }, + { + "ce_loss_10": 3.51110919713974, + "ce_loss_13": 3.4499236941337585, + "ce_loss_2": 4.169062435626984, + "ce_loss_3": 3.9678101181983947, + "ce_loss_7": 3.6167086601257323, + "epoch": 0.456, + "grad_norm": 688.0, + "kl_loss_10": 133.06291236877442, + "kl_loss_2": 1530.2515380859375, + "kl_loss_3": 1140.2865753173828, + "kl_loss_7": 365.97554779052734, + "learning_rate": 0.0005774335701417662, + "loss": 779.1915, + "step": 4560 + }, + { + "ce_loss_10": 3.495447027683258, + "ce_loss_13": 3.440790295600891, + "ce_loss_2": 4.137791275978088, + "ce_loss_3": 3.9482330679893494, + "ce_loss_7": 3.5958903670310973, + "epoch": 0.457, + "grad_norm": 628.0, + "kl_loss_10": 127.81274681091308, + "kl_loss_2": 1513.1394897460937, + "kl_loss_3": 1131.4115112304687, + "kl_loss_7": 359.4502899169922, + "learning_rate": 0.0005758656625416658, + "loss": 779.9761, + "step": 4570 + }, + { + "ce_loss_10": 3.5505786299705506, + "ce_loss_13": 3.4906317949295045, + "ce_loss_2": 4.1901858925819395, + "ce_loss_3": 3.997506558895111, + "ce_loss_7": 3.6519751071929933, + "epoch": 0.458, + "grad_norm": 684.0, + "kl_loss_10": 130.90405158996583, + "kl_loss_2": 1497.9955810546876, + "kl_loss_3": 1116.8375152587892, + "kl_loss_7": 361.80901489257815, + "learning_rate": 0.0005742969909751859, + "loss": 764.1115, + "step": 4580 + }, + { + "ce_loss_10": 3.5638930439949035, + "ce_loss_13": 3.503375542163849, + "ce_loss_2": 4.186403441429138, + "ce_loss_3": 3.9995208024978637, + "ce_loss_7": 3.6641741752624513, + "epoch": 0.459, + "grad_norm": 580.0, + "kl_loss_10": 130.00192375183104, + "kl_loss_2": 1475.8618774414062, + "kl_loss_3": 1099.7632873535156, + "kl_loss_7": 358.99674835205076, + "learning_rate": 0.0005727275712388318, + "loss": 769.1571, + "step": 4590 + }, + { + "ce_loss_10": 3.593214011192322, + "ce_loss_13": 3.53840229511261, + "ce_loss_2": 4.204052042961121, + "ce_loss_3": 4.022337186336517, + "ce_loss_7": 3.6885414600372313, + "epoch": 0.46, + "grad_norm": 688.0, + "kl_loss_10": 126.89060325622559, + "kl_loss_2": 1444.349786376953, + "kl_loss_3": 1080.8706604003905, + "kl_loss_7": 347.9618377685547, + "learning_rate": 0.0005711574191366427, + "loss": 759.1991, + "step": 4600 + }, + { + "ce_loss_10": 3.542857563495636, + "ce_loss_13": 3.4855061650276182, + "ce_loss_2": 4.162867796421051, + "ce_loss_3": 3.973730170726776, + "ce_loss_7": 3.6373270750045776, + "epoch": 0.461, + "grad_norm": 498.0, + "kl_loss_10": 127.64135208129883, + "kl_loss_2": 1482.562158203125, + "kl_loss_3": 1108.3277709960937, + "kl_loss_7": 353.4364410400391, + "learning_rate": 0.0005695865504800327, + "loss": 763.6274, + "step": 4610 + }, + { + "ce_loss_10": 3.480057179927826, + "ce_loss_13": 3.4188697218894957, + "ce_loss_2": 4.161020743846893, + "ce_loss_3": 3.9625292778015138, + "ce_loss_7": 3.5874672174453734, + "epoch": 0.462, + "grad_norm": 636.0, + "kl_loss_10": 132.61270484924316, + "kl_loss_2": 1565.8641235351563, + "kl_loss_3": 1171.7663940429688, + "kl_loss_7": 371.96593170166017, + "learning_rate": 0.0005680149810876322, + "loss": 786.1608, + "step": 4620 + }, + { + "ce_loss_10": 3.5343680024147033, + "ce_loss_13": 3.476098108291626, + "ce_loss_2": 4.15952113866806, + "ce_loss_3": 3.974844920635223, + "ce_loss_7": 3.633632469177246, + "epoch": 0.463, + "grad_norm": 916.0, + "kl_loss_10": 129.65257568359374, + "kl_loss_2": 1473.4421691894531, + "kl_loss_3": 1102.603567504883, + "kl_loss_7": 355.8689987182617, + "learning_rate": 0.0005664427267851271, + "loss": 764.6609, + "step": 4630 + }, + { + "ce_loss_10": 3.4518208742141723, + "ce_loss_13": 3.395127773284912, + "ce_loss_2": 4.078986668586731, + "ce_loss_3": 3.8934460520744323, + "ce_loss_7": 3.550265383720398, + "epoch": 0.464, + "grad_norm": 788.0, + "kl_loss_10": 128.43332138061524, + "kl_loss_2": 1471.1863525390625, + "kl_loss_3": 1105.277377319336, + "kl_loss_7": 355.1179916381836, + "learning_rate": 0.0005648698034051009, + "loss": 759.5711, + "step": 4640 + }, + { + "ce_loss_10": 3.5719098687171935, + "ce_loss_13": 3.513967990875244, + "ce_loss_2": 4.204645431041717, + "ce_loss_3": 4.020779967308044, + "ce_loss_7": 3.6722232818603517, + "epoch": 0.465, + "grad_norm": 1056.0, + "kl_loss_10": 129.50131530761718, + "kl_loss_2": 1492.4452270507813, + "kl_loss_3": 1118.05078125, + "kl_loss_7": 355.465641784668, + "learning_rate": 0.0005632962267868747, + "loss": 760.3559, + "step": 4650 + }, + { + "ce_loss_10": 3.503978407382965, + "ce_loss_13": 3.4484734773635863, + "ce_loss_2": 4.13207323551178, + "ce_loss_3": 3.944869303703308, + "ce_loss_7": 3.6067972064018248, + "epoch": 0.466, + "grad_norm": 604.0, + "kl_loss_10": 124.18871879577637, + "kl_loss_2": 1470.0651733398438, + "kl_loss_3": 1095.9672973632812, + "kl_loss_7": 351.40113372802733, + "learning_rate": 0.0005617220127763474, + "loss": 767.9368, + "step": 4660 + }, + { + "ce_loss_10": 3.585794413089752, + "ce_loss_13": 3.5305840611457824, + "ce_loss_2": 4.204195821285248, + "ce_loss_3": 4.019779980182648, + "ce_loss_7": 3.682017946243286, + "epoch": 0.467, + "grad_norm": 696.0, + "kl_loss_10": 127.49123001098633, + "kl_loss_2": 1450.9377075195312, + "kl_loss_3": 1090.0952362060548, + "kl_loss_7": 354.78733215332034, + "learning_rate": 0.0005601471772258368, + "loss": 762.7567, + "step": 4670 + }, + { + "ce_loss_10": 3.5694589257240295, + "ce_loss_13": 3.5142530679702757, + "ce_loss_2": 4.177708339691162, + "ce_loss_3": 3.9971248507499695, + "ce_loss_7": 3.668432116508484, + "epoch": 0.468, + "grad_norm": 604.0, + "kl_loss_10": 127.1932876586914, + "kl_loss_2": 1430.6669982910157, + "kl_loss_3": 1074.753436279297, + "kl_loss_7": 350.61267242431643, + "learning_rate": 0.0005585717359939192, + "loss": 765.1635, + "step": 4680 + }, + { + "ce_loss_10": 3.482089602947235, + "ce_loss_13": 3.425822353363037, + "ce_loss_2": 4.101116871833801, + "ce_loss_3": 3.9152937650680544, + "ce_loss_7": 3.5779839038848875, + "epoch": 0.469, + "grad_norm": 528.0, + "kl_loss_10": 126.95637588500976, + "kl_loss_2": 1451.6751831054687, + "kl_loss_3": 1094.0453521728516, + "kl_loss_7": 351.15742645263674, + "learning_rate": 0.0005569957049452703, + "loss": 770.4322, + "step": 4690 + }, + { + "ce_loss_10": 3.5357295274734497, + "ce_loss_13": 3.4797881722450255, + "ce_loss_2": 4.170310592651367, + "ce_loss_3": 3.9769333004951477, + "ce_loss_7": 3.635083317756653, + "epoch": 0.47, + "grad_norm": 628.0, + "kl_loss_10": 130.350581741333, + "kl_loss_2": 1499.3314453125, + "kl_loss_3": 1120.0059173583984, + "kl_loss_7": 359.9276611328125, + "learning_rate": 0.0005554190999505056, + "loss": 773.8736, + "step": 4700 + }, + { + "ce_loss_10": 3.6618139266967775, + "ce_loss_13": 3.603938400745392, + "ce_loss_2": 4.291402506828308, + "ce_loss_3": 4.10056711435318, + "ce_loss_7": 3.7629532337188722, + "epoch": 0.471, + "grad_norm": 552.0, + "kl_loss_10": 132.34991912841798, + "kl_loss_2": 1483.7266418457032, + "kl_loss_3": 1113.9899047851563, + "kl_loss_7": 363.98974151611327, + "learning_rate": 0.0005538419368860196, + "loss": 745.9498, + "step": 4710 + }, + { + "ce_loss_10": 3.58713721036911, + "ce_loss_13": 3.5296577334403993, + "ce_loss_2": 4.205131685733795, + "ce_loss_3": 4.02278323173523, + "ce_loss_7": 3.683391070365906, + "epoch": 0.472, + "grad_norm": 478.0, + "kl_loss_10": 128.9550354003906, + "kl_loss_2": 1460.061590576172, + "kl_loss_3": 1097.231591796875, + "kl_loss_7": 353.49449615478517, + "learning_rate": 0.0005522642316338268, + "loss": 777.6082, + "step": 4720 + }, + { + "ce_loss_10": 3.59045729637146, + "ce_loss_13": 3.535309398174286, + "ce_loss_2": 4.197298634052276, + "ce_loss_3": 4.012663578987121, + "ce_loss_7": 3.685203659534454, + "epoch": 0.473, + "grad_norm": 836.0, + "kl_loss_10": 129.07300338745117, + "kl_loss_2": 1434.7744934082032, + "kl_loss_3": 1076.9549468994142, + "kl_loss_7": 353.2419830322266, + "learning_rate": 0.0005506860000814017, + "loss": 772.4581, + "step": 4730 + }, + { + "ce_loss_10": 3.6118584752082823, + "ce_loss_13": 3.556533193588257, + "ce_loss_2": 4.223218786716461, + "ce_loss_3": 4.040239870548248, + "ce_loss_7": 3.7073376536369325, + "epoch": 0.474, + "grad_norm": 588.0, + "kl_loss_10": 126.57029228210449, + "kl_loss_2": 1431.9671630859375, + "kl_loss_3": 1080.1793304443358, + "kl_loss_7": 348.32355346679685, + "learning_rate": 0.0005491072581215186, + "loss": 759.6759, + "step": 4740 + }, + { + "ce_loss_10": 3.6188451647758484, + "ce_loss_13": 3.5581385135650634, + "ce_loss_2": 4.235405123233795, + "ce_loss_3": 4.047661685943604, + "ce_loss_7": 3.7187539458274843, + "epoch": 0.475, + "grad_norm": 1024.0, + "kl_loss_10": 131.5200958251953, + "kl_loss_2": 1475.39228515625, + "kl_loss_3": 1103.3305755615233, + "kl_loss_7": 361.21947174072267, + "learning_rate": 0.0005475280216520913, + "loss": 750.9313, + "step": 4750 + }, + { + "ce_loss_10": 3.5320704579353333, + "ce_loss_13": 3.4781441688537598, + "ce_loss_2": 4.1474240064620975, + "ce_loss_3": 3.960626220703125, + "ce_loss_7": 3.629132354259491, + "epoch": 0.476, + "grad_norm": 636.0, + "kl_loss_10": 126.30528450012207, + "kl_loss_2": 1437.6857177734375, + "kl_loss_3": 1077.3333221435546, + "kl_loss_7": 347.41858367919923, + "learning_rate": 0.0005459483065760138, + "loss": 766.0214, + "step": 4760 + }, + { + "ce_loss_10": 3.466313195228577, + "ce_loss_13": 3.4118001461029053, + "ce_loss_2": 4.126975905895233, + "ce_loss_3": 3.9336133003234863, + "ce_loss_7": 3.566939985752106, + "epoch": 0.477, + "grad_norm": 916.0, + "kl_loss_10": 127.09881401062012, + "kl_loss_2": 1513.8791442871093, + "kl_loss_3": 1142.7632537841796, + "kl_loss_7": 354.5154052734375, + "learning_rate": 0.0005443681288009991, + "loss": 767.701, + "step": 4770 + }, + { + "ce_loss_10": 3.530055844783783, + "ce_loss_13": 3.473679745197296, + "ce_loss_2": 4.15294862985611, + "ce_loss_3": 3.9638696193695067, + "ce_loss_7": 3.6289584755897524, + "epoch": 0.478, + "grad_norm": 732.0, + "kl_loss_10": 128.42002296447754, + "kl_loss_2": 1474.6412109375, + "kl_loss_3": 1102.924478149414, + "kl_loss_7": 356.6063034057617, + "learning_rate": 0.0005427875042394199, + "loss": 768.8391, + "step": 4780 + }, + { + "ce_loss_10": 3.559061658382416, + "ce_loss_13": 3.499286782741547, + "ce_loss_2": 4.178021502494812, + "ce_loss_3": 3.9880162835121156, + "ce_loss_7": 3.6537997484207154, + "epoch": 0.479, + "grad_norm": 604.0, + "kl_loss_10": 129.98453598022462, + "kl_loss_2": 1476.697979736328, + "kl_loss_3": 1103.0448516845704, + "kl_loss_7": 357.54207611083984, + "learning_rate": 0.0005412064488081482, + "loss": 775.8501, + "step": 4790 + }, + { + "ce_loss_10": 3.5636435866355898, + "ce_loss_13": 3.507556414604187, + "ce_loss_2": 4.178312647342682, + "ce_loss_3": 3.994305968284607, + "ce_loss_7": 3.659174859523773, + "epoch": 0.48, + "grad_norm": 712.0, + "kl_loss_10": 126.99441261291504, + "kl_loss_2": 1456.738299560547, + "kl_loss_3": 1086.8102722167969, + "kl_loss_7": 351.37431640625, + "learning_rate": 0.0005396249784283942, + "loss": 754.4715, + "step": 4800 + }, + { + "ce_loss_10": 3.582900881767273, + "ce_loss_13": 3.5235010981559753, + "ce_loss_2": 4.229977750778199, + "ce_loss_3": 4.035599565505981, + "ce_loss_7": 3.682912456989288, + "epoch": 0.481, + "grad_norm": 788.0, + "kl_loss_10": 132.2354824066162, + "kl_loss_2": 1513.726251220703, + "kl_loss_3": 1136.702828979492, + "kl_loss_7": 365.539323425293, + "learning_rate": 0.0005380431090255476, + "loss": 777.0931, + "step": 4810 + }, + { + "ce_loss_10": 3.5710787892341616, + "ce_loss_13": 3.518178606033325, + "ce_loss_2": 4.170691752433777, + "ce_loss_3": 3.9865909218788147, + "ce_loss_7": 3.667452025413513, + "epoch": 0.482, + "grad_norm": 552.0, + "kl_loss_10": 124.38290710449219, + "kl_loss_2": 1423.2920043945312, + "kl_loss_3": 1067.5982177734375, + "kl_loss_7": 345.50093231201174, + "learning_rate": 0.0005364608565290155, + "loss": 749.5516, + "step": 4820 + }, + { + "ce_loss_10": 3.5861640691757204, + "ce_loss_13": 3.527508783340454, + "ce_loss_2": 4.206353557109833, + "ce_loss_3": 4.020480215549469, + "ce_loss_7": 3.6830241322517394, + "epoch": 0.483, + "grad_norm": 716.0, + "kl_loss_10": 129.12178649902344, + "kl_loss_2": 1457.7945068359375, + "kl_loss_3": 1095.74462890625, + "kl_loss_7": 357.8998229980469, + "learning_rate": 0.0005348782368720626, + "loss": 759.7926, + "step": 4830 + }, + { + "ce_loss_10": 3.512594223022461, + "ce_loss_13": 3.4565085053443907, + "ce_loss_2": 4.119910812377929, + "ce_loss_3": 3.9370038151741027, + "ce_loss_7": 3.6123961448669433, + "epoch": 0.484, + "grad_norm": 892.0, + "kl_loss_10": 124.94047012329102, + "kl_loss_2": 1436.2225341796875, + "kl_loss_3": 1076.01171875, + "kl_loss_7": 350.6201461791992, + "learning_rate": 0.000533295265991652, + "loss": 760.0171, + "step": 4840 + }, + { + "ce_loss_10": 3.597572553157806, + "ce_loss_13": 3.537912893295288, + "ce_loss_2": 4.200307357311249, + "ce_loss_3": 4.0205553531646725, + "ce_loss_7": 3.6955727219581602, + "epoch": 0.485, + "grad_norm": 660.0, + "kl_loss_10": 127.27426147460938, + "kl_loss_2": 1432.4733581542969, + "kl_loss_3": 1076.2284576416016, + "kl_loss_7": 357.1815933227539, + "learning_rate": 0.0005317119598282822, + "loss": 752.409, + "step": 4850 + }, + { + "ce_loss_10": 3.5897864937782287, + "ce_loss_13": 3.5321272253990172, + "ce_loss_2": 4.2048394799232485, + "ce_loss_3": 4.0231526613235475, + "ce_loss_7": 3.697403919696808, + "epoch": 0.486, + "grad_norm": 740.0, + "kl_loss_10": 127.06500816345215, + "kl_loss_2": 1436.3693542480469, + "kl_loss_3": 1081.5646484375, + "kl_loss_7": 361.1262634277344, + "learning_rate": 0.0005301283343258293, + "loss": 758.6568, + "step": 4860 + }, + { + "ce_loss_10": 3.65613032579422, + "ce_loss_13": 3.5975700855255126, + "ce_loss_2": 4.251663959026336, + "ce_loss_3": 4.07744791507721, + "ce_loss_7": 3.7647886872291565, + "epoch": 0.487, + "grad_norm": 724.0, + "kl_loss_10": 127.67147789001464, + "kl_loss_2": 1417.543603515625, + "kl_loss_3": 1072.8577423095703, + "kl_loss_7": 370.50387420654295, + "learning_rate": 0.000528544405431384, + "loss": 748.367, + "step": 4870 + }, + { + "ce_loss_10": 3.53450231552124, + "ce_loss_13": 3.4755603075027466, + "ce_loss_2": 4.1686626195907595, + "ce_loss_3": 3.9780558943748474, + "ce_loss_7": 3.639273762702942, + "epoch": 0.488, + "grad_norm": 772.0, + "kl_loss_10": 128.8699951171875, + "kl_loss_2": 1486.9147888183593, + "kl_loss_3": 1119.2680114746095, + "kl_loss_7": 374.7581512451172, + "learning_rate": 0.000526960189095093, + "loss": 773.1109, + "step": 4880 + }, + { + "ce_loss_10": 3.5063798785209657, + "ce_loss_13": 3.452955174446106, + "ce_loss_2": 4.127981126308441, + "ce_loss_3": 3.9387494206428526, + "ce_loss_7": 3.6084472298622132, + "epoch": 0.489, + "grad_norm": 588.0, + "kl_loss_10": 124.4889362335205, + "kl_loss_2": 1436.8860290527343, + "kl_loss_3": 1073.7127777099608, + "kl_loss_7": 349.7402374267578, + "learning_rate": 0.0005253757012699972, + "loss": 752.2137, + "step": 4890 + }, + { + "ce_loss_10": 3.596962869167328, + "ce_loss_13": 3.541905701160431, + "ce_loss_2": 4.204607903957367, + "ce_loss_3": 4.02187534570694, + "ce_loss_7": 3.695280838012695, + "epoch": 0.49, + "grad_norm": 524.0, + "kl_loss_10": 127.72086906433105, + "kl_loss_2": 1445.986083984375, + "kl_loss_3": 1085.8021209716796, + "kl_loss_7": 353.85157623291013, + "learning_rate": 0.0005237909579118712, + "loss": 766.9711, + "step": 4900 + }, + { + "ce_loss_10": 3.5621119856834413, + "ce_loss_13": 3.5039254426956177, + "ce_loss_2": 4.190770697593689, + "ce_loss_3": 4.0025376081466675, + "ce_loss_7": 3.6638841152191164, + "epoch": 0.491, + "grad_norm": 676.0, + "kl_loss_10": 130.21649017333985, + "kl_loss_2": 1476.865899658203, + "kl_loss_3": 1108.4342376708985, + "kl_loss_7": 365.27405700683596, + "learning_rate": 0.0005222059749790631, + "loss": 766.6848, + "step": 4910 + }, + { + "ce_loss_10": 3.627886402606964, + "ce_loss_13": 3.568757712841034, + "ce_loss_2": 4.211319518089295, + "ce_loss_3": 4.034268498420715, + "ce_loss_7": 3.7232242226600647, + "epoch": 0.492, + "grad_norm": 704.0, + "kl_loss_10": 128.84821739196778, + "kl_loss_2": 1416.5603820800782, + "kl_loss_3": 1061.9581909179688, + "kl_loss_7": 352.0233520507812, + "learning_rate": 0.0005206207684323337, + "loss": 737.0755, + "step": 4920 + }, + { + "ce_loss_10": 3.6068161606788633, + "ce_loss_13": 3.5479695200920105, + "ce_loss_2": 4.220036661624908, + "ce_loss_3": 4.033611464500427, + "ce_loss_7": 3.707039773464203, + "epoch": 0.493, + "grad_norm": 752.0, + "kl_loss_10": 130.1327449798584, + "kl_loss_2": 1452.3597900390625, + "kl_loss_3": 1088.4027435302735, + "kl_loss_7": 358.82303619384766, + "learning_rate": 0.000519035354234695, + "loss": 768.5039, + "step": 4930 + }, + { + "ce_loss_10": 3.585032618045807, + "ce_loss_13": 3.5267568826675415, + "ce_loss_2": 4.200621557235718, + "ce_loss_3": 4.019805324077606, + "ce_loss_7": 3.6854267716407776, + "epoch": 0.494, + "grad_norm": 648.0, + "kl_loss_10": 130.55963859558105, + "kl_loss_2": 1452.412860107422, + "kl_loss_3": 1089.5958099365234, + "kl_loss_7": 358.87390594482423, + "learning_rate": 0.0005174497483512506, + "loss": 745.6609, + "step": 4940 + }, + { + "ce_loss_10": 3.6264762759208677, + "ce_loss_13": 3.57160165309906, + "ce_loss_2": 4.232844221591949, + "ce_loss_3": 4.046636259555816, + "ce_loss_7": 3.7188863515853883, + "epoch": 0.495, + "grad_norm": 624.0, + "kl_loss_10": 126.8898696899414, + "kl_loss_2": 1443.1926330566407, + "kl_loss_3": 1079.7430755615235, + "kl_loss_7": 347.55773010253904, + "learning_rate": 0.0005158639667490339, + "loss": 757.0432, + "step": 4950 + }, + { + "ce_loss_10": 3.522357964515686, + "ce_loss_13": 3.4666619300842285, + "ce_loss_2": 4.146045255661011, + "ce_loss_3": 3.9596054553985596, + "ce_loss_7": 3.626363182067871, + "epoch": 0.496, + "grad_norm": 660.0, + "kl_loss_10": 127.58676567077637, + "kl_loss_2": 1464.100274658203, + "kl_loss_3": 1098.3539154052735, + "kl_loss_7": 353.0430084228516, + "learning_rate": 0.0005142780253968481, + "loss": 757.5217, + "step": 4960 + }, + { + "ce_loss_10": 3.4769110679626465, + "ce_loss_13": 3.420508313179016, + "ce_loss_2": 4.0726398229599, + "ce_loss_3": 3.8918757796287538, + "ce_loss_7": 3.5705711126327513, + "epoch": 0.497, + "grad_norm": 712.0, + "kl_loss_10": 123.46647834777832, + "kl_loss_2": 1419.8298278808593, + "kl_loss_3": 1063.150827026367, + "kl_loss_7": 341.79971771240236, + "learning_rate": 0.0005126919402651053, + "loss": 732.2201, + "step": 4970 + }, + { + "ce_loss_10": 3.5448826789855956, + "ce_loss_13": 3.4868510246276854, + "ce_loss_2": 4.179246997833252, + "ce_loss_3": 3.9946623921394346, + "ce_loss_7": 3.648559832572937, + "epoch": 0.498, + "grad_norm": 664.0, + "kl_loss_10": 129.68025093078614, + "kl_loss_2": 1467.3921997070313, + "kl_loss_3": 1101.7997802734376, + "kl_loss_7": 355.3198486328125, + "learning_rate": 0.0005111057273256647, + "loss": 763.329, + "step": 4980 + }, + { + "ce_loss_10": 3.6479685425758364, + "ce_loss_13": 3.593002438545227, + "ce_loss_2": 4.218882548809051, + "ce_loss_3": 4.0441102385520935, + "ce_loss_7": 3.736710476875305, + "epoch": 0.499, + "grad_norm": 524.0, + "kl_loss_10": 124.3959129333496, + "kl_loss_2": 1369.187322998047, + "kl_loss_3": 1028.7687194824218, + "kl_loss_7": 336.4957000732422, + "learning_rate": 0.0005095194025516733, + "loss": 724.7756, + "step": 4990 + }, + { + "ce_loss_10": 3.5743329763412475, + "ce_loss_13": 3.5195638060569765, + "ce_loss_2": 4.169176626205444, + "ce_loss_3": 3.9893115639686583, + "ce_loss_7": 3.6650718688964843, + "epoch": 0.5, + "grad_norm": 636.0, + "kl_loss_10": 124.31196022033691, + "kl_loss_2": 1414.2773315429688, + "kl_loss_3": 1058.442919921875, + "kl_loss_7": 342.9336791992188, + "learning_rate": 0.000507932981917404, + "loss": 758.1979, + "step": 5000 + }, + { + "ce_loss_10": 3.527126336097717, + "ce_loss_13": 3.4682271480560303, + "ce_loss_2": 4.170087909698486, + "ce_loss_3": 3.977930450439453, + "ce_loss_7": 3.6243650317192078, + "epoch": 0.501, + "grad_norm": 860.0, + "kl_loss_10": 130.30797805786133, + "kl_loss_2": 1503.0117431640624, + "kl_loss_3": 1119.1187561035156, + "kl_loss_7": 358.08606872558596, + "learning_rate": 0.0005063464813980949, + "loss": 777.8597, + "step": 5010 + }, + { + "ce_loss_10": 3.512421131134033, + "ce_loss_13": 3.45424964427948, + "ce_loss_2": 4.135089802742004, + "ce_loss_3": 3.9414800643920898, + "ce_loss_7": 3.606937575340271, + "epoch": 0.502, + "grad_norm": 556.0, + "kl_loss_10": 127.28718528747558, + "kl_loss_2": 1467.612969970703, + "kl_loss_3": 1098.6405395507813, + "kl_loss_7": 348.034928894043, + "learning_rate": 0.0005047599169697884, + "loss": 752.1499, + "step": 5020 + }, + { + "ce_loss_10": 3.4468389987945556, + "ce_loss_13": 3.3897311687469482, + "ce_loss_2": 4.070673036575317, + "ce_loss_3": 3.880372130870819, + "ce_loss_7": 3.544992959499359, + "epoch": 0.503, + "grad_norm": 916.0, + "kl_loss_10": 124.79916343688964, + "kl_loss_2": 1447.2315368652344, + "kl_loss_3": 1079.9921875, + "kl_loss_7": 346.10303649902346, + "learning_rate": 0.000503173304609171, + "loss": 739.2281, + "step": 5030 + }, + { + "ce_loss_10": 3.572928011417389, + "ce_loss_13": 3.5145941257476805, + "ce_loss_2": 4.187036621570587, + "ce_loss_3": 4.007000887393952, + "ce_loss_7": 3.668638730049133, + "epoch": 0.504, + "grad_norm": 588.0, + "kl_loss_10": 126.00835227966309, + "kl_loss_2": 1440.8026000976563, + "kl_loss_3": 1078.6716094970702, + "kl_loss_7": 344.7953277587891, + "learning_rate": 0.0005015866602934111, + "loss": 744.6847, + "step": 5040 + }, + { + "ce_loss_10": 3.5366175055503843, + "ce_loss_13": 3.478814089298248, + "ce_loss_2": 4.180455148220062, + "ce_loss_3": 3.9842303514480593, + "ce_loss_7": 3.634747123718262, + "epoch": 0.505, + "grad_norm": 660.0, + "kl_loss_10": 130.4676456451416, + "kl_loss_2": 1497.6227783203126, + "kl_loss_3": 1125.589825439453, + "kl_loss_7": 363.67038879394534, + "learning_rate": 0.0005, + "loss": 762.9002, + "step": 5050 + }, + { + "ce_loss_10": 3.526559591293335, + "ce_loss_13": 3.471008539199829, + "ce_loss_2": 4.1430164813995365, + "ce_loss_3": 3.9600774168968202, + "ce_loss_7": 3.6214876055717466, + "epoch": 0.506, + "grad_norm": 596.0, + "kl_loss_10": 127.42528266906739, + "kl_loss_2": 1458.6119445800782, + "kl_loss_3": 1100.3540649414062, + "kl_loss_7": 351.1239486694336, + "learning_rate": 0.0004984133397065889, + "loss": 745.3238, + "step": 5060 + }, + { + "ce_loss_10": 3.538331460952759, + "ce_loss_13": 3.48160400390625, + "ce_loss_2": 4.170895993709564, + "ce_loss_3": 3.9798762083053587, + "ce_loss_7": 3.6342212200164794, + "epoch": 0.507, + "grad_norm": 628.0, + "kl_loss_10": 126.94361305236816, + "kl_loss_2": 1458.0782897949218, + "kl_loss_3": 1096.8780242919922, + "kl_loss_7": 351.65411987304685, + "learning_rate": 0.0004968266953908291, + "loss": 748.5638, + "step": 5070 + }, + { + "ce_loss_10": 3.578177201747894, + "ce_loss_13": 3.523463523387909, + "ce_loss_2": 4.192970585823059, + "ce_loss_3": 4.007475554943085, + "ce_loss_7": 3.6718322396278382, + "epoch": 0.508, + "grad_norm": 712.0, + "kl_loss_10": 125.75382385253906, + "kl_loss_2": 1448.2949584960938, + "kl_loss_3": 1085.0182800292969, + "kl_loss_7": 344.33011016845705, + "learning_rate": 0.0004952400830302117, + "loss": 750.5558, + "step": 5080 + }, + { + "ce_loss_10": 3.505996084213257, + "ce_loss_13": 3.44895259141922, + "ce_loss_2": 4.14078243970871, + "ce_loss_3": 3.951820456981659, + "ce_loss_7": 3.6068713068962097, + "epoch": 0.509, + "grad_norm": 540.0, + "kl_loss_10": 128.75954895019532, + "kl_loss_2": 1493.3599365234375, + "kl_loss_3": 1111.6652374267578, + "kl_loss_7": 355.9008483886719, + "learning_rate": 0.0004936535186019053, + "loss": 759.8867, + "step": 5090 + }, + { + "ce_loss_10": 3.6047815322875976, + "ce_loss_13": 3.5503739714622498, + "ce_loss_2": 4.190906155109405, + "ce_loss_3": 4.0159718751907345, + "ce_loss_7": 3.6974563717842104, + "epoch": 0.51, + "grad_norm": 520.0, + "kl_loss_10": 124.55697784423828, + "kl_loss_2": 1396.7164306640625, + "kl_loss_3": 1052.4375610351562, + "kl_loss_7": 340.2462951660156, + "learning_rate": 0.000492067018082596, + "loss": 740.6732, + "step": 5100 + }, + { + "ce_loss_10": 3.5405318260192873, + "ce_loss_13": 3.481382191181183, + "ce_loss_2": 4.190889728069306, + "ce_loss_3": 3.9940460085868836, + "ce_loss_7": 3.6407326340675352, + "epoch": 0.511, + "grad_norm": 748.0, + "kl_loss_10": 129.94738388061523, + "kl_loss_2": 1511.246484375, + "kl_loss_3": 1130.2775482177735, + "kl_loss_7": 359.2450241088867, + "learning_rate": 0.0004904805974483267, + "loss": 784.2884, + "step": 5110 + }, + { + "ce_loss_10": 3.6526175856590273, + "ce_loss_13": 3.592953288555145, + "ce_loss_2": 4.278130650520325, + "ce_loss_3": 4.098746013641358, + "ce_loss_7": 3.7574565052986144, + "epoch": 0.512, + "grad_norm": 620.0, + "kl_loss_10": 133.8547565460205, + "kl_loss_2": 1478.876611328125, + "kl_loss_3": 1118.5677642822266, + "kl_loss_7": 366.43187561035154, + "learning_rate": 0.0004888942726743353, + "loss": 780.2876, + "step": 5120 + }, + { + "ce_loss_10": 3.5215405344963076, + "ce_loss_13": 3.4650914430618287, + "ce_loss_2": 4.143480885028839, + "ce_loss_3": 3.9585989832878115, + "ce_loss_7": 3.6219504952430723, + "epoch": 0.513, + "grad_norm": 752.0, + "kl_loss_10": 128.29964866638184, + "kl_loss_2": 1475.1177978515625, + "kl_loss_3": 1104.3501556396484, + "kl_loss_7": 355.6892623901367, + "learning_rate": 0.0004873080597348947, + "loss": 764.5013, + "step": 5130 + }, + { + "ce_loss_10": 3.410134506225586, + "ce_loss_13": 3.352971625328064, + "ce_loss_2": 4.06082159280777, + "ce_loss_3": 3.866491210460663, + "ce_loss_7": 3.5091195464134217, + "epoch": 0.514, + "grad_norm": 644.0, + "kl_loss_10": 126.35911636352539, + "kl_loss_2": 1516.3789794921875, + "kl_loss_3": 1137.2391021728515, + "kl_loss_7": 355.16859741210936, + "learning_rate": 0.0004857219746031519, + "loss": 770.8493, + "step": 5140 + }, + { + "ce_loss_10": 3.584864628314972, + "ce_loss_13": 3.529874527454376, + "ce_loss_2": 4.1941790103912355, + "ce_loss_3": 4.00935822725296, + "ce_loss_7": 3.6780989289283754, + "epoch": 0.515, + "grad_norm": 780.0, + "kl_loss_10": 128.64718170166014, + "kl_loss_2": 1438.0651428222657, + "kl_loss_3": 1078.5996124267579, + "kl_loss_7": 349.5986953735352, + "learning_rate": 0.0004841360332509663, + "loss": 752.5851, + "step": 5150 + }, + { + "ce_loss_10": 3.535393476486206, + "ce_loss_13": 3.481099021434784, + "ce_loss_2": 4.140241587162018, + "ce_loss_3": 3.9574079871177674, + "ce_loss_7": 3.634384071826935, + "epoch": 0.516, + "grad_norm": 640.0, + "kl_loss_10": 122.83691864013672, + "kl_loss_2": 1426.9576843261718, + "kl_loss_3": 1065.5508270263672, + "kl_loss_7": 341.7165832519531, + "learning_rate": 0.0004825502516487497, + "loss": 727.2895, + "step": 5160 + }, + { + "ce_loss_10": 3.4977298855781553, + "ce_loss_13": 3.441702198982239, + "ce_loss_2": 4.130070972442627, + "ce_loss_3": 3.942211651802063, + "ce_loss_7": 3.5965057730674745, + "epoch": 0.517, + "grad_norm": 760.0, + "kl_loss_10": 127.02273902893066, + "kl_loss_2": 1477.1372314453124, + "kl_loss_3": 1105.9117126464844, + "kl_loss_7": 351.33150787353514, + "learning_rate": 0.00048096464576530507, + "loss": 761.6553, + "step": 5170 + }, + { + "ce_loss_10": 3.6062949657440186, + "ce_loss_13": 3.5500340938568113, + "ce_loss_2": 4.195667040348053, + "ce_loss_3": 4.018487000465393, + "ce_loss_7": 3.6985435009002687, + "epoch": 0.518, + "grad_norm": 624.0, + "kl_loss_10": 126.56138648986817, + "kl_loss_2": 1408.2973205566407, + "kl_loss_3": 1057.3329833984376, + "kl_loss_7": 341.4838348388672, + "learning_rate": 0.00047937923156766646, + "loss": 734.8762, + "step": 5180 + }, + { + "ce_loss_10": 3.6485862135887146, + "ce_loss_13": 3.5955163717269896, + "ce_loss_2": 4.232599627971649, + "ce_loss_3": 4.060397815704346, + "ce_loss_7": 3.738468253612518, + "epoch": 0.519, + "grad_norm": 620.0, + "kl_loss_10": 126.33506202697754, + "kl_loss_2": 1394.4201049804688, + "kl_loss_3": 1048.218344116211, + "kl_loss_7": 340.96579895019534, + "learning_rate": 0.00047779402502093696, + "loss": 736.9193, + "step": 5190 + }, + { + "ce_loss_10": 3.6157581567764283, + "ce_loss_13": 3.558749091625214, + "ce_loss_2": 4.207399034500122, + "ce_loss_3": 4.03502504825592, + "ce_loss_7": 3.7083640336990356, + "epoch": 0.52, + "grad_norm": 672.0, + "kl_loss_10": 126.87294883728028, + "kl_loss_2": 1406.3830688476562, + "kl_loss_3": 1064.0467315673827, + "kl_loss_7": 343.44087677001954, + "learning_rate": 0.0004762090420881289, + "loss": 745.0921, + "step": 5200 + }, + { + "ce_loss_10": 3.535387361049652, + "ce_loss_13": 3.4797078251838682, + "ce_loss_2": 4.134657156467438, + "ce_loss_3": 3.951842713356018, + "ce_loss_7": 3.625101017951965, + "epoch": 0.521, + "grad_norm": 568.0, + "kl_loss_10": 125.83340911865234, + "kl_loss_2": 1426.3083251953126, + "kl_loss_3": 1067.8335723876953, + "kl_loss_7": 343.47533721923827, + "learning_rate": 0.00047462429873000296, + "loss": 735.2042, + "step": 5210 + }, + { + "ce_loss_10": 3.616153085231781, + "ce_loss_13": 3.561689925193787, + "ce_loss_2": 4.214185404777527, + "ce_loss_3": 4.027488625049591, + "ce_loss_7": 3.7073824644088744, + "epoch": 0.522, + "grad_norm": 556.0, + "kl_loss_10": 127.8383171081543, + "kl_loss_2": 1424.687451171875, + "kl_loss_3": 1065.997296142578, + "kl_loss_7": 346.1371368408203, + "learning_rate": 0.0004730398109049071, + "loss": 741.2838, + "step": 5220 + }, + { + "ce_loss_10": 3.54775093793869, + "ce_loss_13": 3.4891308307647706, + "ce_loss_2": 4.180230689048767, + "ce_loss_3": 3.993181014060974, + "ce_loss_7": 3.6464683175086976, + "epoch": 0.523, + "grad_norm": 612.0, + "kl_loss_10": 129.67901077270508, + "kl_loss_2": 1474.5384399414063, + "kl_loss_3": 1110.3589935302734, + "kl_loss_7": 357.5801712036133, + "learning_rate": 0.000471455594568616, + "loss": 754.4792, + "step": 5230 + }, + { + "ce_loss_10": 3.6236977100372316, + "ce_loss_13": 3.567158377170563, + "ce_loss_2": 4.210835099220276, + "ce_loss_3": 4.030972874164581, + "ce_loss_7": 3.713850724697113, + "epoch": 0.524, + "grad_norm": 488.0, + "kl_loss_10": 127.79978103637696, + "kl_loss_2": 1409.9825317382813, + "kl_loss_3": 1052.4463439941405, + "kl_loss_7": 343.3757125854492, + "learning_rate": 0.00046987166567417086, + "loss": 746.4875, + "step": 5240 + }, + { + "ce_loss_10": 3.5389233589172364, + "ce_loss_13": 3.4855753421783446, + "ce_loss_2": 4.151636373996735, + "ce_loss_3": 3.963833916187286, + "ce_loss_7": 3.633453297615051, + "epoch": 0.525, + "grad_norm": 700.0, + "kl_loss_10": 124.83961982727051, + "kl_loss_2": 1432.0410766601562, + "kl_loss_3": 1068.8124633789062, + "kl_loss_7": 344.24831237792966, + "learning_rate": 0.00046828804017171776, + "loss": 730.1019, + "step": 5250 + }, + { + "ce_loss_10": 3.5789129376411437, + "ce_loss_13": 3.52143075466156, + "ce_loss_2": 4.213813376426697, + "ce_loss_3": 4.024446547031403, + "ce_loss_7": 3.6814927458763123, + "epoch": 0.526, + "grad_norm": 556.0, + "kl_loss_10": 128.2565517425537, + "kl_loss_2": 1458.2886047363281, + "kl_loss_3": 1090.4730590820313, + "kl_loss_7": 352.8787322998047, + "learning_rate": 0.00046670473400834805, + "loss": 759.958, + "step": 5260 + }, + { + "ce_loss_10": 3.5171929001808167, + "ce_loss_13": 3.4629207491874694, + "ce_loss_2": 4.1135843873023985, + "ce_loss_3": 3.9294076561927795, + "ce_loss_7": 3.6082523465156555, + "epoch": 0.527, + "grad_norm": 712.0, + "kl_loss_10": 123.60066146850586, + "kl_loss_2": 1415.8293395996093, + "kl_loss_3": 1053.1150390625, + "kl_loss_7": 339.89840545654295, + "learning_rate": 0.00046512176312793734, + "loss": 756.1812, + "step": 5270 + }, + { + "ce_loss_10": 3.5107405304908754, + "ce_loss_13": 3.453246533870697, + "ce_loss_2": 4.126374876499176, + "ce_loss_3": 3.936051630973816, + "ce_loss_7": 3.6049073815345762, + "epoch": 0.528, + "grad_norm": 612.0, + "kl_loss_10": 125.6848876953125, + "kl_loss_2": 1449.2553955078124, + "kl_loss_3": 1081.2026336669921, + "kl_loss_7": 345.4993423461914, + "learning_rate": 0.00046353914347098467, + "loss": 756.4846, + "step": 5280 + }, + { + "ce_loss_10": 3.604662263393402, + "ce_loss_13": 3.5488700747489927, + "ce_loss_2": 4.218853032588958, + "ce_loss_3": 4.034527897834778, + "ce_loss_7": 3.6985684871673583, + "epoch": 0.529, + "grad_norm": 908.0, + "kl_loss_10": 126.48544654846191, + "kl_loss_2": 1444.1267395019531, + "kl_loss_3": 1078.7888458251953, + "kl_loss_7": 346.7707778930664, + "learning_rate": 0.0004619568909744524, + "loss": 752.0158, + "step": 5290 + }, + { + "ce_loss_10": 3.612137234210968, + "ce_loss_13": 3.5561481952667235, + "ce_loss_2": 4.217017912864685, + "ce_loss_3": 4.034022784233093, + "ce_loss_7": 3.7084673762321474, + "epoch": 0.53, + "grad_norm": 704.0, + "kl_loss_10": 128.3290657043457, + "kl_loss_2": 1421.7739501953124, + "kl_loss_3": 1070.839599609375, + "kl_loss_7": 349.6264083862305, + "learning_rate": 0.00046037502157160573, + "loss": 754.9794, + "step": 5300 + }, + { + "ce_loss_10": 3.4819830536842344, + "ce_loss_13": 3.42584068775177, + "ce_loss_2": 4.099390125274658, + "ce_loss_3": 3.917300546169281, + "ce_loss_7": 3.580989933013916, + "epoch": 0.531, + "grad_norm": 628.0, + "kl_loss_10": 125.62411231994629, + "kl_loss_2": 1453.1245910644532, + "kl_loss_3": 1098.5252807617187, + "kl_loss_7": 353.93557891845705, + "learning_rate": 0.00045879355119185207, + "loss": 756.8294, + "step": 5310 + }, + { + "ce_loss_10": 3.5613385915756224, + "ce_loss_13": 3.5066535234451295, + "ce_loss_2": 4.183086156845093, + "ce_loss_3": 3.9961040735244753, + "ce_loss_7": 3.6569605588912966, + "epoch": 0.532, + "grad_norm": 696.0, + "kl_loss_10": 127.47045364379883, + "kl_loss_2": 1474.839111328125, + "kl_loss_3": 1106.3402404785156, + "kl_loss_7": 356.02258758544923, + "learning_rate": 0.0004572124957605803, + "loss": 763.853, + "step": 5320 + }, + { + "ce_loss_10": 3.580208718776703, + "ce_loss_13": 3.523788559436798, + "ce_loss_2": 4.186931335926056, + "ce_loss_3": 4.0057693243026735, + "ce_loss_7": 3.677249050140381, + "epoch": 0.533, + "grad_norm": 428.0, + "kl_loss_10": 125.33763389587402, + "kl_loss_2": 1450.7080810546875, + "kl_loss_3": 1089.869924926758, + "kl_loss_7": 352.4146255493164, + "learning_rate": 0.00045563187119900103, + "loss": 745.5743, + "step": 5330 + }, + { + "ce_loss_10": 3.424732136726379, + "ce_loss_13": 3.3700300335884092, + "ce_loss_2": 4.053750395774841, + "ce_loss_3": 3.866895389556885, + "ce_loss_7": 3.5216084599494932, + "epoch": 0.534, + "grad_norm": 1168.0, + "kl_loss_10": 125.33371696472167, + "kl_loss_2": 1466.25302734375, + "kl_loss_3": 1098.1051971435547, + "kl_loss_7": 349.4919769287109, + "learning_rate": 0.00045405169342398633, + "loss": 760.794, + "step": 5340 + }, + { + "ce_loss_10": 3.516588735580444, + "ce_loss_13": 3.456972897052765, + "ce_loss_2": 4.132767844200134, + "ce_loss_3": 3.951627218723297, + "ce_loss_7": 3.6111021041870117, + "epoch": 0.535, + "grad_norm": 720.0, + "kl_loss_10": 127.6759262084961, + "kl_loss_2": 1454.3429931640626, + "kl_loss_3": 1093.5691711425782, + "kl_loss_7": 349.7331176757813, + "learning_rate": 0.0004524719783479088, + "loss": 745.902, + "step": 5350 + }, + { + "ce_loss_10": 3.4667383193969727, + "ce_loss_13": 3.4120450973510743, + "ce_loss_2": 4.107578992843628, + "ce_loss_3": 3.907571184635162, + "ce_loss_7": 3.5639479041099547, + "epoch": 0.536, + "grad_norm": 620.0, + "kl_loss_10": 126.3938491821289, + "kl_loss_2": 1486.8213806152344, + "kl_loss_3": 1107.0321716308595, + "kl_loss_7": 352.4698181152344, + "learning_rate": 0.00045089274187848144, + "loss": 748.4564, + "step": 5360 + }, + { + "ce_loss_10": 3.5862554907798767, + "ce_loss_13": 3.5314642190933228, + "ce_loss_2": 4.182982349395752, + "ce_loss_3": 4.007098364830017, + "ce_loss_7": 3.6815426349639893, + "epoch": 0.537, + "grad_norm": 780.0, + "kl_loss_10": 125.15973091125488, + "kl_loss_2": 1417.1767822265624, + "kl_loss_3": 1063.112875366211, + "kl_loss_7": 344.52098083496094, + "learning_rate": 0.00044931399991859835, + "loss": 740.4288, + "step": 5370 + }, + { + "ce_loss_10": 3.4537264466285706, + "ce_loss_13": 3.397633194923401, + "ce_loss_2": 4.068418169021607, + "ce_loss_3": 3.877337634563446, + "ce_loss_7": 3.548009955883026, + "epoch": 0.538, + "grad_norm": 536.0, + "kl_loss_10": 125.85848808288574, + "kl_loss_2": 1456.3653503417968, + "kl_loss_3": 1089.4956604003905, + "kl_loss_7": 347.38047180175784, + "learning_rate": 0.00044773576836617336, + "loss": 740.3193, + "step": 5380 + }, + { + "ce_loss_10": 3.5413371920585632, + "ce_loss_13": 3.4863479495048524, + "ce_loss_2": 4.166609585285187, + "ce_loss_3": 3.9833678722381594, + "ce_loss_7": 3.641672730445862, + "epoch": 0.539, + "grad_norm": 548.0, + "kl_loss_10": 127.76419677734376, + "kl_loss_2": 1480.62939453125, + "kl_loss_3": 1109.371633911133, + "kl_loss_7": 357.3611831665039, + "learning_rate": 0.00044615806311398056, + "loss": 767.7122, + "step": 5390 + }, + { + "ce_loss_10": 3.6181453466415405, + "ce_loss_13": 3.567015016078949, + "ce_loss_2": 4.187848663330078, + "ce_loss_3": 4.017346155643463, + "ce_loss_7": 3.7096580266952515, + "epoch": 0.54, + "grad_norm": 652.0, + "kl_loss_10": 125.5275634765625, + "kl_loss_2": 1384.711865234375, + "kl_loss_3": 1042.3586151123047, + "kl_loss_7": 339.9911834716797, + "learning_rate": 0.00044458090004949454, + "loss": 745.8007, + "step": 5400 + }, + { + "ce_loss_10": 3.47789705991745, + "ce_loss_13": 3.418693256378174, + "ce_loss_2": 4.127793288230896, + "ce_loss_3": 3.9291198015213014, + "ce_loss_7": 3.57582768201828, + "epoch": 0.541, + "grad_norm": 628.0, + "kl_loss_10": 129.80282020568848, + "kl_loss_2": 1531.2485473632812, + "kl_loss_3": 1138.2959686279296, + "kl_loss_7": 360.45466766357424, + "learning_rate": 0.0004430042950547297, + "loss": 759.4717, + "step": 5410 + }, + { + "ce_loss_10": 3.573679769039154, + "ce_loss_13": 3.515029692649841, + "ce_loss_2": 4.1977743268013, + "ce_loss_3": 4.015309143066406, + "ce_loss_7": 3.6747510194778443, + "epoch": 0.542, + "grad_norm": 624.0, + "kl_loss_10": 130.55573616027831, + "kl_loss_2": 1466.3621948242187, + "kl_loss_3": 1100.1870544433593, + "kl_loss_7": 357.17127685546876, + "learning_rate": 0.0004414282640060809, + "loss": 755.4526, + "step": 5420 + }, + { + "ce_loss_10": 3.665824794769287, + "ce_loss_13": 3.607898008823395, + "ce_loss_2": 4.26334011554718, + "ce_loss_3": 4.0842081785202025, + "ce_loss_7": 3.765260875225067, + "epoch": 0.543, + "grad_norm": 680.0, + "kl_loss_10": 130.16803436279298, + "kl_loss_2": 1410.027520751953, + "kl_loss_3": 1066.1269775390624, + "kl_loss_7": 359.3704772949219, + "learning_rate": 0.0004398528227741633, + "loss": 755.4893, + "step": 5430 + }, + { + "ce_loss_10": 3.533736264705658, + "ce_loss_13": 3.4744524717330934, + "ce_loss_2": 4.148015642166138, + "ce_loss_3": 3.9667993783950806, + "ce_loss_7": 3.6348182201385497, + "epoch": 0.544, + "grad_norm": 676.0, + "kl_loss_10": 133.03388061523438, + "kl_loss_2": 1434.2911987304688, + "kl_loss_3": 1079.9505981445313, + "kl_loss_7": 359.5822357177734, + "learning_rate": 0.00043827798722365264, + "loss": 762.0716, + "step": 5440 + }, + { + "ce_loss_10": 3.6589300632476807, + "ce_loss_13": 3.6001421213150024, + "ce_loss_2": 4.238667392730713, + "ce_loss_3": 4.060091936588288, + "ce_loss_7": 3.750708055496216, + "epoch": 0.545, + "grad_norm": 494.0, + "kl_loss_10": 131.36463470458983, + "kl_loss_2": 1393.9716003417968, + "kl_loss_3": 1045.8557373046874, + "kl_loss_7": 350.9205383300781, + "learning_rate": 0.00043670377321312535, + "loss": 729.8014, + "step": 5450 + }, + { + "ce_loss_10": 3.656389832496643, + "ce_loss_13": 3.600922691822052, + "ce_loss_2": 4.24760650396347, + "ce_loss_3": 4.063620638847351, + "ce_loss_7": 3.7452564239501953, + "epoch": 0.546, + "grad_norm": 556.0, + "kl_loss_10": 128.25653953552245, + "kl_loss_2": 1405.696063232422, + "kl_loss_3": 1049.458319091797, + "kl_loss_7": 344.41868896484374, + "learning_rate": 0.0004351301965948991, + "loss": 746.7757, + "step": 5460 + }, + { + "ce_loss_10": 3.5661354064941406, + "ce_loss_13": 3.508555901050568, + "ce_loss_2": 4.156990563869476, + "ce_loss_3": 3.9813518643379213, + "ce_loss_7": 3.6595167994499205, + "epoch": 0.547, + "grad_norm": 600.0, + "kl_loss_10": 126.90561714172364, + "kl_loss_2": 1396.3604797363282, + "kl_loss_3": 1049.8896087646485, + "kl_loss_7": 342.8857620239258, + "learning_rate": 0.000433557273214873, + "loss": 740.9387, + "step": 5470 + }, + { + "ce_loss_10": 3.5527311325073243, + "ce_loss_13": 3.497834849357605, + "ce_loss_2": 4.150702881813049, + "ce_loss_3": 3.9698083996772766, + "ce_loss_7": 3.6472991704940796, + "epoch": 0.548, + "grad_norm": 644.0, + "kl_loss_10": 126.52342948913574, + "kl_loss_2": 1407.9827697753906, + "kl_loss_3": 1056.515103149414, + "kl_loss_7": 345.3242782592773, + "learning_rate": 0.000431985018912368, + "loss": 732.1462, + "step": 5480 + }, + { + "ce_loss_10": 3.5216315269470213, + "ce_loss_13": 3.4642876982688904, + "ce_loss_2": 4.148514151573181, + "ce_loss_3": 3.9579554200172424, + "ce_loss_7": 3.61717346906662, + "epoch": 0.549, + "grad_norm": 458.0, + "kl_loss_10": 129.01484336853028, + "kl_loss_2": 1464.8551086425782, + "kl_loss_3": 1100.7810821533203, + "kl_loss_7": 353.8451156616211, + "learning_rate": 0.0004304134495199674, + "loss": 742.2356, + "step": 5490 + }, + { + "ce_loss_10": 3.551531457901001, + "ce_loss_13": 3.4940086603164673, + "ce_loss_2": 4.170977103710174, + "ce_loss_3": 3.9918709278106688, + "ce_loss_7": 3.6487022042274475, + "epoch": 0.55, + "grad_norm": 696.0, + "kl_loss_10": 129.0471164703369, + "kl_loss_2": 1483.5442321777343, + "kl_loss_3": 1116.8151824951171, + "kl_loss_7": 357.02617797851565, + "learning_rate": 0.0004288425808633575, + "loss": 757.0911, + "step": 5500 + }, + { + "ce_loss_10": 3.524256336688995, + "ce_loss_13": 3.4696091651916503, + "ce_loss_2": 4.134656190872192, + "ce_loss_3": 3.948019301891327, + "ce_loss_7": 3.6168912172317507, + "epoch": 0.551, + "grad_norm": 764.0, + "kl_loss_10": 124.78674278259277, + "kl_loss_2": 1451.5693908691405, + "kl_loss_3": 1084.9963470458983, + "kl_loss_7": 345.0725372314453, + "learning_rate": 0.0004272724287611684, + "loss": 748.5045, + "step": 5510 + }, + { + "ce_loss_10": 3.5044045448303223, + "ce_loss_13": 3.4458850502967833, + "ce_loss_2": 4.11513135433197, + "ce_loss_3": 3.923069179058075, + "ce_loss_7": 3.598716700077057, + "epoch": 0.552, + "grad_norm": 564.0, + "kl_loss_10": 128.57932319641114, + "kl_loss_2": 1450.3947509765626, + "kl_loss_3": 1084.4042419433595, + "kl_loss_7": 349.6862258911133, + "learning_rate": 0.00042570300902481425, + "loss": 748.2176, + "step": 5520 + }, + { + "ce_loss_10": 3.5342564582824707, + "ce_loss_13": 3.4802250385284426, + "ce_loss_2": 4.127187561988831, + "ce_loss_3": 3.9438385248184202, + "ce_loss_7": 3.6231723546981813, + "epoch": 0.553, + "grad_norm": 684.0, + "kl_loss_10": 125.04237670898438, + "kl_loss_2": 1423.2647521972656, + "kl_loss_3": 1067.049887084961, + "kl_loss_7": 344.51769561767577, + "learning_rate": 0.00042413433745833423, + "loss": 740.4593, + "step": 5530 + }, + { + "ce_loss_10": 3.537394309043884, + "ce_loss_13": 3.4792711973190307, + "ce_loss_2": 4.149854254722595, + "ce_loss_3": 3.968058681488037, + "ce_loss_7": 3.6317521929740906, + "epoch": 0.554, + "grad_norm": 592.0, + "kl_loss_10": 126.85282897949219, + "kl_loss_2": 1431.3300415039062, + "kl_loss_3": 1076.83173828125, + "kl_loss_7": 346.4241668701172, + "learning_rate": 0.0004225664298582339, + "loss": 727.5832, + "step": 5540 + }, + { + "ce_loss_10": 3.61503586769104, + "ce_loss_13": 3.5593451619148255, + "ce_loss_2": 4.2083780169487, + "ce_loss_3": 4.028390157222748, + "ce_loss_7": 3.706908369064331, + "epoch": 0.555, + "grad_norm": 472.0, + "kl_loss_10": 124.98624839782715, + "kl_loss_2": 1399.5473022460938, + "kl_loss_3": 1052.0424438476562, + "kl_loss_7": 341.1046676635742, + "learning_rate": 0.000420999302013325, + "loss": 731.5678, + "step": 5550 + }, + { + "ce_loss_10": 3.5158777952194216, + "ce_loss_13": 3.4559564113616945, + "ce_loss_2": 4.1452751636505125, + "ce_loss_3": 3.9481736540794374, + "ce_loss_7": 3.6144383549690247, + "epoch": 0.556, + "grad_norm": 696.0, + "kl_loss_10": 130.04578018188477, + "kl_loss_2": 1463.335516357422, + "kl_loss_3": 1083.2877349853516, + "kl_loss_7": 355.2161560058594, + "learning_rate": 0.000419432969704568, + "loss": 741.9638, + "step": 5560 + }, + { + "ce_loss_10": 3.5586655020713804, + "ce_loss_13": 3.503348696231842, + "ce_loss_2": 4.1602645993232725, + "ce_loss_3": 3.976087248325348, + "ce_loss_7": 3.6495119094848634, + "epoch": 0.557, + "grad_norm": 478.0, + "kl_loss_10": 125.02511405944824, + "kl_loss_2": 1410.6633605957031, + "kl_loss_3": 1053.2783630371093, + "kl_loss_7": 341.9718505859375, + "learning_rate": 0.00041786744870491154, + "loss": 750.6796, + "step": 5570 + }, + { + "ce_loss_10": 3.49469313621521, + "ce_loss_13": 3.4374850153923036, + "ce_loss_2": 4.115650498867035, + "ce_loss_3": 3.9304126381874083, + "ce_loss_7": 3.589133381843567, + "epoch": 0.558, + "grad_norm": 636.0, + "kl_loss_10": 128.60980453491212, + "kl_loss_2": 1454.5861450195312, + "kl_loss_3": 1099.7731384277345, + "kl_loss_7": 352.64312438964845, + "learning_rate": 0.0004163027547791347, + "loss": 750.0387, + "step": 5580 + }, + { + "ce_loss_10": 3.4733791589736938, + "ce_loss_13": 3.417029893398285, + "ce_loss_2": 4.109368133544922, + "ce_loss_3": 3.916108226776123, + "ce_loss_7": 3.57048362493515, + "epoch": 0.559, + "grad_norm": 544.0, + "kl_loss_10": 126.46923561096192, + "kl_loss_2": 1481.6322875976562, + "kl_loss_3": 1110.3733978271484, + "kl_loss_7": 353.67320861816404, + "learning_rate": 0.0004147389036836881, + "loss": 753.4516, + "step": 5590 + }, + { + "ce_loss_10": 3.522750961780548, + "ce_loss_13": 3.4660881876945497, + "ce_loss_2": 4.150948774814606, + "ce_loss_3": 3.9613837599754333, + "ce_loss_7": 3.6164920568466186, + "epoch": 0.56, + "grad_norm": 1008.0, + "kl_loss_10": 127.6525722503662, + "kl_loss_2": 1456.663018798828, + "kl_loss_3": 1093.7019775390625, + "kl_loss_7": 348.2412384033203, + "learning_rate": 0.00041317591116653486, + "loss": 760.9835, + "step": 5600 + }, + { + "ce_loss_10": 3.5656349301338195, + "ce_loss_13": 3.506862556934357, + "ce_loss_2": 4.182165312767029, + "ce_loss_3": 3.996174454689026, + "ce_loss_7": 3.6606045246124266, + "epoch": 0.561, + "grad_norm": 544.0, + "kl_loss_10": 130.3198528289795, + "kl_loss_2": 1451.4231079101562, + "kl_loss_3": 1085.5545135498046, + "kl_loss_7": 355.61197052001955, + "learning_rate": 0.0004116137929669921, + "loss": 742.7544, + "step": 5610 + }, + { + "ce_loss_10": 3.550603926181793, + "ce_loss_13": 3.4951741695404053, + "ce_loss_2": 4.154799246788025, + "ce_loss_3": 3.9741489410400392, + "ce_loss_7": 3.642624282836914, + "epoch": 0.562, + "grad_norm": 612.0, + "kl_loss_10": 125.04610176086426, + "kl_loss_2": 1434.1698974609376, + "kl_loss_3": 1079.523110961914, + "kl_loss_7": 345.24672241210936, + "learning_rate": 0.00041005256481557305, + "loss": 736.6574, + "step": 5620 + }, + { + "ce_loss_10": 3.6528918623924254, + "ce_loss_13": 3.599192976951599, + "ce_loss_2": 4.227750253677368, + "ce_loss_3": 4.054849910736084, + "ce_loss_7": 3.7427762031555174, + "epoch": 0.563, + "grad_norm": 960.0, + "kl_loss_10": 123.06231956481933, + "kl_loss_2": 1355.2447143554687, + "kl_loss_3": 1017.4669219970704, + "kl_loss_7": 334.08680114746096, + "learning_rate": 0.00040849224243382767, + "loss": 721.732, + "step": 5630 + }, + { + "ce_loss_10": 3.5077744126319885, + "ce_loss_13": 3.4527820706367494, + "ce_loss_2": 4.120105528831482, + "ce_loss_3": 3.935258185863495, + "ce_loss_7": 3.6035592913627625, + "epoch": 0.564, + "grad_norm": 572.0, + "kl_loss_10": 124.89362907409668, + "kl_loss_2": 1440.2459045410155, + "kl_loss_3": 1077.581704711914, + "kl_loss_7": 345.66795349121094, + "learning_rate": 0.000406932841534185, + "loss": 734.8656, + "step": 5640 + }, + { + "ce_loss_10": 3.4617214798927307, + "ce_loss_13": 3.404375874996185, + "ce_loss_2": 4.0868846535682675, + "ce_loss_3": 3.9023816704750063, + "ce_loss_7": 3.5591673016548158, + "epoch": 0.565, + "grad_norm": 736.0, + "kl_loss_10": 126.24258880615234, + "kl_loss_2": 1461.9494995117188, + "kl_loss_3": 1098.3101470947265, + "kl_loss_7": 351.8350204467773, + "learning_rate": 0.0004053743778197951, + "loss": 763.8107, + "step": 5650 + }, + { + "ce_loss_10": 3.5757392168045046, + "ce_loss_13": 3.516366720199585, + "ce_loss_2": 4.176673400402069, + "ce_loss_3": 3.9969864964485167, + "ce_loss_7": 3.6699735283851624, + "epoch": 0.566, + "grad_norm": 556.0, + "kl_loss_10": 129.16880264282227, + "kl_loss_2": 1421.8304565429687, + "kl_loss_3": 1070.1267395019531, + "kl_loss_7": 347.0413360595703, + "learning_rate": 0.0004038168669843697, + "loss": 753.9809, + "step": 5660 + }, + { + "ce_loss_10": 3.5368810892105103, + "ce_loss_13": 3.480051815509796, + "ce_loss_2": 4.12183108329773, + "ce_loss_3": 3.947605645656586, + "ce_loss_7": 3.6282246232032778, + "epoch": 0.567, + "grad_norm": 700.0, + "kl_loss_10": 124.853763961792, + "kl_loss_2": 1403.9633483886719, + "kl_loss_3": 1055.4384826660157, + "kl_loss_7": 340.73865661621096, + "learning_rate": 0.000402260324712026, + "loss": 742.4323, + "step": 5670 + }, + { + "ce_loss_10": 3.5802783489227297, + "ce_loss_13": 3.5246822237968445, + "ce_loss_2": 4.191337883472443, + "ce_loss_3": 4.009469735622406, + "ce_loss_7": 3.674550974369049, + "epoch": 0.568, + "grad_norm": 704.0, + "kl_loss_10": 125.56253471374512, + "kl_loss_2": 1435.6087524414063, + "kl_loss_3": 1077.026220703125, + "kl_loss_7": 344.2860565185547, + "learning_rate": 0.00040070476667712743, + "loss": 736.5972, + "step": 5680 + }, + { + "ce_loss_10": 3.6075830340385435, + "ce_loss_13": 3.5494349122047426, + "ce_loss_2": 4.2079323649406435, + "ce_loss_3": 4.023375844955444, + "ce_loss_7": 3.7011712551116944, + "epoch": 0.569, + "grad_norm": 652.0, + "kl_loss_10": 127.4619140625, + "kl_loss_2": 1424.6428833007812, + "kl_loss_3": 1060.6977600097657, + "kl_loss_7": 344.1405715942383, + "learning_rate": 0.0003991502085441259, + "loss": 745.1114, + "step": 5690 + }, + { + "ce_loss_10": 3.6421966791152953, + "ce_loss_13": 3.5865265488624574, + "ce_loss_2": 4.215503621101379, + "ce_loss_3": 4.038523530960083, + "ce_loss_7": 3.7318795323371887, + "epoch": 0.57, + "grad_norm": 552.0, + "kl_loss_10": 124.02103271484376, + "kl_loss_2": 1365.1710144042968, + "kl_loss_3": 1022.5380310058594, + "kl_loss_7": 335.7863967895508, + "learning_rate": 0.0003975966659674047, + "loss": 734.2836, + "step": 5700 + }, + { + "ce_loss_10": 3.6048430800437927, + "ce_loss_13": 3.549668312072754, + "ce_loss_2": 4.207078647613526, + "ce_loss_3": 4.021631062030792, + "ce_loss_7": 3.695444619655609, + "epoch": 0.571, + "grad_norm": 680.0, + "kl_loss_10": 126.19835929870605, + "kl_loss_2": 1414.5069702148437, + "kl_loss_3": 1054.5915283203126, + "kl_loss_7": 343.2915603637695, + "learning_rate": 0.0003960441545911204, + "loss": 733.8609, + "step": 5710 + }, + { + "ce_loss_10": 3.6052708625793457, + "ce_loss_13": 3.5485535979270937, + "ce_loss_2": 4.193977308273316, + "ce_loss_3": 4.016455709934235, + "ce_loss_7": 3.696723520755768, + "epoch": 0.572, + "grad_norm": 652.0, + "kl_loss_10": 125.34993591308594, + "kl_loss_2": 1414.3463562011718, + "kl_loss_3": 1059.2802703857421, + "kl_loss_7": 344.37230987548827, + "learning_rate": 0.0003944926900490452, + "loss": 732.3417, + "step": 5720 + }, + { + "ce_loss_10": 3.519063436985016, + "ce_loss_13": 3.4620590448379516, + "ce_loss_2": 4.135345363616944, + "ce_loss_3": 3.9530657052993776, + "ce_loss_7": 3.614231622219086, + "epoch": 0.573, + "grad_norm": 588.0, + "kl_loss_10": 125.74856452941894, + "kl_loss_2": 1450.9848693847657, + "kl_loss_3": 1088.1497314453125, + "kl_loss_7": 348.66001892089844, + "learning_rate": 0.0003929422879644099, + "loss": 736.7084, + "step": 5730 + }, + { + "ce_loss_10": 3.5202754855155947, + "ce_loss_13": 3.466830384731293, + "ce_loss_2": 4.112395560741424, + "ce_loss_3": 3.926817226409912, + "ce_loss_7": 3.6115634441375732, + "epoch": 0.574, + "grad_norm": 780.0, + "kl_loss_10": 122.84286041259766, + "kl_loss_2": 1409.6117370605468, + "kl_loss_3": 1058.0203704833984, + "kl_loss_7": 337.2828994750977, + "learning_rate": 0.0003913929639497462, + "loss": 725.4092, + "step": 5740 + }, + { + "ce_loss_10": 3.4744406223297117, + "ce_loss_13": 3.417355275154114, + "ce_loss_2": 4.097714972496033, + "ce_loss_3": 3.902831184864044, + "ce_loss_7": 3.5695701360702516, + "epoch": 0.575, + "grad_norm": 524.0, + "kl_loss_10": 124.28501434326172, + "kl_loss_2": 1441.8303283691407, + "kl_loss_3": 1071.458563232422, + "kl_loss_7": 341.63841705322267, + "learning_rate": 0.00038984473360672965, + "loss": 732.2352, + "step": 5750 + }, + { + "ce_loss_10": 3.4835135340690613, + "ce_loss_13": 3.42867751121521, + "ce_loss_2": 4.105578863620758, + "ce_loss_3": 3.919316029548645, + "ce_loss_7": 3.576983118057251, + "epoch": 0.576, + "grad_norm": 604.0, + "kl_loss_10": 122.92881278991699, + "kl_loss_2": 1439.3748413085937, + "kl_loss_3": 1074.1585357666015, + "kl_loss_7": 339.31846008300784, + "learning_rate": 0.0003882976125260229, + "loss": 730.0831, + "step": 5760 + }, + { + "ce_loss_10": 3.5526882290840147, + "ce_loss_13": 3.4981236934661863, + "ce_loss_2": 4.157402575016022, + "ce_loss_3": 3.970548963546753, + "ce_loss_7": 3.6471086144447327, + "epoch": 0.577, + "grad_norm": 556.0, + "kl_loss_10": 125.73483810424804, + "kl_loss_2": 1407.3273315429688, + "kl_loss_3": 1052.3861572265625, + "kl_loss_7": 341.37246856689455, + "learning_rate": 0.00038675161628711776, + "loss": 735.7479, + "step": 5770 + }, + { + "ce_loss_10": 3.58783597946167, + "ce_loss_13": 3.533314561843872, + "ce_loss_2": 4.171427321434021, + "ce_loss_3": 3.994054675102234, + "ce_loss_7": 3.681211507320404, + "epoch": 0.578, + "grad_norm": 560.0, + "kl_loss_10": 125.05509986877442, + "kl_loss_2": 1389.438739013672, + "kl_loss_3": 1037.8104064941406, + "kl_loss_7": 339.4297866821289, + "learning_rate": 0.0003852067604581794, + "loss": 745.1567, + "step": 5780 + }, + { + "ce_loss_10": 3.533677875995636, + "ce_loss_13": 3.4786616802215575, + "ce_loss_2": 4.1458081841468815, + "ce_loss_3": 3.950434994697571, + "ce_loss_7": 3.6267109632492067, + "epoch": 0.579, + "grad_norm": 660.0, + "kl_loss_10": 123.52711334228516, + "kl_loss_2": 1443.4204162597657, + "kl_loss_3": 1072.433172607422, + "kl_loss_7": 341.32163391113284, + "learning_rate": 0.0003836630605958888, + "loss": 735.883, + "step": 5790 + }, + { + "ce_loss_10": 3.590009105205536, + "ce_loss_13": 3.5354955911636354, + "ce_loss_2": 4.183343994617462, + "ce_loss_3": 4.006175303459168, + "ce_loss_7": 3.6826868414878846, + "epoch": 0.58, + "grad_norm": 724.0, + "kl_loss_10": 125.50635833740235, + "kl_loss_2": 1422.5762878417968, + "kl_loss_3": 1067.5176635742187, + "kl_loss_7": 343.88026275634763, + "learning_rate": 0.0003821205322452863, + "loss": 758.9688, + "step": 5800 + }, + { + "ce_loss_10": 3.5736894965171815, + "ce_loss_13": 3.518394339084625, + "ce_loss_2": 4.166361927986145, + "ce_loss_3": 3.9780752897262572, + "ce_loss_7": 3.658141016960144, + "epoch": 0.581, + "grad_norm": 520.0, + "kl_loss_10": 124.10846366882325, + "kl_loss_2": 1410.343994140625, + "kl_loss_3": 1048.025698852539, + "kl_loss_7": 337.55668487548826, + "learning_rate": 0.0003805791909396155, + "loss": 735.6956, + "step": 5810 + }, + { + "ce_loss_10": 3.524351119995117, + "ce_loss_13": 3.471441614627838, + "ce_loss_2": 4.123242568969727, + "ce_loss_3": 3.9424999237060545, + "ce_loss_7": 3.6170790791511536, + "epoch": 0.582, + "grad_norm": 656.0, + "kl_loss_10": 123.72361793518067, + "kl_loss_2": 1420.7679870605468, + "kl_loss_3": 1060.7876434326172, + "kl_loss_7": 339.7288070678711, + "learning_rate": 0.0003790390522001662, + "loss": 741.0435, + "step": 5820 + }, + { + "ce_loss_10": 3.455526554584503, + "ce_loss_13": 3.4017743825912476, + "ce_loss_2": 4.064185953140258, + "ce_loss_3": 3.875770378112793, + "ce_loss_7": 3.5457238078117372, + "epoch": 0.583, + "grad_norm": 668.0, + "kl_loss_10": 122.62833824157715, + "kl_loss_2": 1442.9026000976562, + "kl_loss_3": 1073.4856384277343, + "kl_loss_7": 338.7090118408203, + "learning_rate": 0.0003775001315361183, + "loss": 731.9279, + "step": 5830 + }, + { + "ce_loss_10": 3.573053014278412, + "ce_loss_13": 3.5164970636367796, + "ce_loss_2": 4.179701626300812, + "ce_loss_3": 3.9943284034729003, + "ce_loss_7": 3.6686230897903442, + "epoch": 0.584, + "grad_norm": 532.0, + "kl_loss_10": 126.27806777954102, + "kl_loss_2": 1439.9238403320312, + "kl_loss_3": 1072.7950469970704, + "kl_loss_7": 343.28225860595705, + "learning_rate": 0.0003759624444443858, + "loss": 741.2989, + "step": 5840 + }, + { + "ce_loss_10": 3.6071199655532835, + "ce_loss_13": 3.5526736259460447, + "ce_loss_2": 4.190285313129425, + "ce_loss_3": 4.009751296043396, + "ce_loss_7": 3.695155990123749, + "epoch": 0.585, + "grad_norm": 628.0, + "kl_loss_10": 124.63733139038087, + "kl_loss_2": 1396.041973876953, + "kl_loss_3": 1037.517807006836, + "kl_loss_7": 335.564924621582, + "learning_rate": 0.00037442600640946044, + "loss": 725.2555, + "step": 5850 + }, + { + "ce_loss_10": 3.5607337236404417, + "ce_loss_13": 3.5095672369003297, + "ce_loss_2": 4.148695635795593, + "ce_loss_3": 3.9706854939460756, + "ce_loss_7": 3.6532173633575438, + "epoch": 0.586, + "grad_norm": 672.0, + "kl_loss_10": 123.19213104248047, + "kl_loss_2": 1393.3970886230468, + "kl_loss_3": 1047.943051147461, + "kl_loss_7": 339.57422332763673, + "learning_rate": 0.00037289083290325663, + "loss": 720.2672, + "step": 5860 + }, + { + "ce_loss_10": 3.5448028206825257, + "ce_loss_13": 3.488830196857452, + "ce_loss_2": 4.1354421257972716, + "ce_loss_3": 3.9604512453079224, + "ce_loss_7": 3.6347436904907227, + "epoch": 0.587, + "grad_norm": 572.0, + "kl_loss_10": 126.071875, + "kl_loss_2": 1390.8989135742188, + "kl_loss_3": 1037.6858978271484, + "kl_loss_7": 339.9308090209961, + "learning_rate": 0.0003713569393849543, + "loss": 722.7879, + "step": 5870 + }, + { + "ce_loss_10": 3.597252070903778, + "ce_loss_13": 3.5426252484321594, + "ce_loss_2": 4.190739142894745, + "ce_loss_3": 4.007934546470642, + "ce_loss_7": 3.68798006772995, + "epoch": 0.588, + "grad_norm": 616.0, + "kl_loss_10": 124.46675567626953, + "kl_loss_2": 1411.17705078125, + "kl_loss_3": 1055.4470306396483, + "kl_loss_7": 337.98778839111327, + "learning_rate": 0.00036982434130084397, + "loss": 734.5103, + "step": 5880 + }, + { + "ce_loss_10": 3.508778083324432, + "ce_loss_13": 3.4500787973403932, + "ce_loss_2": 4.1147748827934265, + "ce_loss_3": 3.9282889366149902, + "ce_loss_7": 3.6036253452301024, + "epoch": 0.589, + "grad_norm": 732.0, + "kl_loss_10": 127.70808792114258, + "kl_loss_2": 1428.9704956054688, + "kl_loss_3": 1069.9872589111328, + "kl_loss_7": 346.86595458984374, + "learning_rate": 0.00036829305408417166, + "loss": 744.0622, + "step": 5890 + }, + { + "ce_loss_10": 3.4944640517234804, + "ce_loss_13": 3.439568829536438, + "ce_loss_2": 4.117368769645691, + "ce_loss_3": 3.928926873207092, + "ce_loss_7": 3.5905642032623293, + "epoch": 0.59, + "grad_norm": 920.0, + "kl_loss_10": 127.46843223571777, + "kl_loss_2": 1459.3504028320312, + "kl_loss_3": 1088.396890258789, + "kl_loss_7": 350.43089599609374, + "learning_rate": 0.0003667630931549826, + "loss": 743.2958, + "step": 5900 + }, + { + "ce_loss_10": 3.4649301290512087, + "ce_loss_13": 3.407567024230957, + "ce_loss_2": 4.097899031639099, + "ce_loss_3": 3.909872758388519, + "ce_loss_7": 3.560973751544952, + "epoch": 0.591, + "grad_norm": 820.0, + "kl_loss_10": 125.37057266235351, + "kl_loss_2": 1485.4420349121094, + "kl_loss_3": 1107.6814758300782, + "kl_loss_7": 347.6374969482422, + "learning_rate": 0.00036523447391996613, + "loss": 752.9677, + "step": 5910 + }, + { + "ce_loss_10": 3.5579336881637573, + "ce_loss_13": 3.5037983298301696, + "ce_loss_2": 4.148412072658539, + "ce_loss_3": 3.9693949699401854, + "ce_loss_7": 3.6502652764320374, + "epoch": 0.592, + "grad_norm": 540.0, + "kl_loss_10": 122.69281196594238, + "kl_loss_2": 1394.9801696777345, + "kl_loss_3": 1040.2122680664063, + "kl_loss_7": 341.23377685546876, + "learning_rate": 0.00036370721177230114, + "loss": 725.8651, + "step": 5920 + }, + { + "ce_loss_10": 3.553625154495239, + "ce_loss_13": 3.4975630164146425, + "ce_loss_2": 4.162805891036987, + "ce_loss_3": 3.9766315698623655, + "ce_loss_7": 3.648140561580658, + "epoch": 0.593, + "grad_norm": 506.0, + "kl_loss_10": 125.43215103149414, + "kl_loss_2": 1439.8947875976562, + "kl_loss_3": 1073.3429077148437, + "kl_loss_7": 345.2577102661133, + "learning_rate": 0.00036218132209150044, + "loss": 740.7089, + "step": 5930 + }, + { + "ce_loss_10": 3.5076088428497316, + "ce_loss_13": 3.4480642318725585, + "ce_loss_2": 4.140249872207642, + "ce_loss_3": 3.9551444053649902, + "ce_loss_7": 3.609627366065979, + "epoch": 0.594, + "grad_norm": 512.0, + "kl_loss_10": 129.52795867919923, + "kl_loss_2": 1482.7498962402344, + "kl_loss_3": 1115.7573791503905, + "kl_loss_7": 355.26904449462893, + "learning_rate": 0.0003606568202432562, + "loss": 752.0646, + "step": 5940 + }, + { + "ce_loss_10": 3.57774213552475, + "ce_loss_13": 3.5228155970573427, + "ce_loss_2": 4.19405552148819, + "ce_loss_3": 4.006729650497436, + "ce_loss_7": 3.6705931544303896, + "epoch": 0.595, + "grad_norm": 796.0, + "kl_loss_10": 127.42050399780274, + "kl_loss_2": 1457.0114624023438, + "kl_loss_3": 1078.6867706298829, + "kl_loss_7": 347.5483764648437, + "learning_rate": 0.0003591337215792851, + "loss": 738.7038, + "step": 5950 + }, + { + "ce_loss_10": 3.6211448907852173, + "ce_loss_13": 3.567346215248108, + "ce_loss_2": 4.192822194099426, + "ce_loss_3": 4.018707859516144, + "ce_loss_7": 3.7064581990242003, + "epoch": 0.596, + "grad_norm": 504.0, + "kl_loss_10": 124.89567108154297, + "kl_loss_2": 1390.8341674804688, + "kl_loss_3": 1045.144497680664, + "kl_loss_7": 337.26280822753904, + "learning_rate": 0.00035761204143717383, + "loss": 735.5005, + "step": 5960 + }, + { + "ce_loss_10": 3.5706104040145874, + "ce_loss_13": 3.514435362815857, + "ce_loss_2": 4.1687785387039185, + "ce_loss_3": 3.9898008584976195, + "ce_loss_7": 3.6620924711227416, + "epoch": 0.597, + "grad_norm": 752.0, + "kl_loss_10": 125.53282928466797, + "kl_loss_2": 1417.2740112304687, + "kl_loss_3": 1066.1643615722655, + "kl_loss_7": 341.81075897216795, + "learning_rate": 0.0003560917951402245, + "loss": 752.2635, + "step": 5970 + }, + { + "ce_loss_10": 3.5460765242576597, + "ce_loss_13": 3.4938451290130614, + "ce_loss_2": 4.134422564506531, + "ce_loss_3": 3.9595829844474792, + "ce_loss_7": 3.6350401043891907, + "epoch": 0.598, + "grad_norm": 732.0, + "kl_loss_10": 123.55417900085449, + "kl_loss_2": 1403.7479370117187, + "kl_loss_3": 1060.9109008789062, + "kl_loss_7": 338.4989807128906, + "learning_rate": 0.00035457299799730046, + "loss": 729.4276, + "step": 5980 + }, + { + "ce_loss_10": 3.613729405403137, + "ce_loss_13": 3.560684585571289, + "ce_loss_2": 4.205018150806427, + "ce_loss_3": 4.024239468574524, + "ce_loss_7": 3.7021196484565735, + "epoch": 0.599, + "grad_norm": 506.0, + "kl_loss_10": 123.72282943725585, + "kl_loss_2": 1406.9813415527344, + "kl_loss_3": 1052.2019958496094, + "kl_loss_7": 339.51587066650393, + "learning_rate": 0.0003530556653026721, + "loss": 741.729, + "step": 5990 + }, + { + "ce_loss_10": 3.523304808139801, + "ce_loss_13": 3.4713462948799134, + "ce_loss_2": 4.125553596019745, + "ce_loss_3": 3.935919165611267, + "ce_loss_7": 3.6131145000457763, + "epoch": 0.6, + "grad_norm": 1792.0, + "kl_loss_10": 122.18126983642578, + "kl_loss_2": 1418.4766357421875, + "kl_loss_3": 1045.9818206787108, + "kl_loss_7": 333.5164306640625, + "learning_rate": 0.00035153981233586274, + "loss": 736.1904, + "step": 6000 + }, + { + "ce_loss_10": 3.503693675994873, + "ce_loss_13": 3.449203038215637, + "ce_loss_2": 4.1076519846916195, + "ce_loss_3": 3.9302319645881654, + "ce_loss_7": 3.59748318195343, + "epoch": 0.601, + "grad_norm": 624.0, + "kl_loss_10": 123.16584014892578, + "kl_loss_2": 1427.8430603027343, + "kl_loss_3": 1067.8656494140625, + "kl_loss_7": 338.2985580444336, + "learning_rate": 0.00035002545436149473, + "loss": 755.88, + "step": 6010 + }, + { + "ce_loss_10": 3.5142654895782472, + "ce_loss_13": 3.4579913139343263, + "ce_loss_2": 4.128756499290466, + "ce_loss_3": 3.9431054472923277, + "ce_loss_7": 3.6089595675468447, + "epoch": 0.602, + "grad_norm": 732.0, + "kl_loss_10": 128.0988456726074, + "kl_loss_2": 1457.7831787109376, + "kl_loss_3": 1094.435125732422, + "kl_loss_7": 346.98584442138673, + "learning_rate": 0.0003485126066291364, + "loss": 739.3451, + "step": 6020 + }, + { + "ce_loss_10": 3.560540997982025, + "ce_loss_13": 3.5067087769508363, + "ce_loss_2": 4.174036264419556, + "ce_loss_3": 3.9827836632728575, + "ce_loss_7": 3.651438629627228, + "epoch": 0.603, + "grad_norm": 540.0, + "kl_loss_10": 123.4724838256836, + "kl_loss_2": 1428.8510314941407, + "kl_loss_3": 1061.573553466797, + "kl_loss_7": 336.35887298583987, + "learning_rate": 0.0003470012843731476, + "loss": 740.6739, + "step": 6030 + }, + { + "ce_loss_10": 3.50463045835495, + "ce_loss_13": 3.4489762425422668, + "ce_loss_2": 4.112862968444825, + "ce_loss_3": 3.9258066773414613, + "ce_loss_7": 3.596073019504547, + "epoch": 0.604, + "grad_norm": 640.0, + "kl_loss_10": 122.90189971923829, + "kl_loss_2": 1436.7977661132813, + "kl_loss_3": 1067.8268920898438, + "kl_loss_7": 338.7447814941406, + "learning_rate": 0.00034549150281252633, + "loss": 752.0526, + "step": 6040 + }, + { + "ce_loss_10": 3.4799697160720826, + "ce_loss_13": 3.424366593360901, + "ce_loss_2": 4.0825390934944155, + "ce_loss_3": 3.8963089108467104, + "ce_loss_7": 3.5723769664764404, + "epoch": 0.605, + "grad_norm": 516.0, + "kl_loss_10": 123.8424732208252, + "kl_loss_2": 1400.4283142089844, + "kl_loss_3": 1041.9283935546875, + "kl_loss_7": 336.8849105834961, + "learning_rate": 0.0003439832771507565, + "loss": 727.6076, + "step": 6050 + }, + { + "ce_loss_10": 3.486681044101715, + "ce_loss_13": 3.432289206981659, + "ce_loss_2": 4.096179568767548, + "ce_loss_3": 3.9118806958198546, + "ce_loss_7": 3.5778406381607057, + "epoch": 0.606, + "grad_norm": 524.0, + "kl_loss_10": 123.29789390563965, + "kl_loss_2": 1440.8687683105468, + "kl_loss_3": 1077.3507263183594, + "kl_loss_7": 339.76096496582034, + "learning_rate": 0.0003424766225756537, + "loss": 735.1911, + "step": 6060 + }, + { + "ce_loss_10": 3.5489431023597717, + "ce_loss_13": 3.4944167613983153, + "ce_loss_2": 4.150770962238312, + "ce_loss_3": 3.9680647253990173, + "ce_loss_7": 3.642049860954285, + "epoch": 0.607, + "grad_norm": 544.0, + "kl_loss_10": 125.39007148742675, + "kl_loss_2": 1412.7550415039063, + "kl_loss_3": 1051.9062286376952, + "kl_loss_7": 340.649674987793, + "learning_rate": 0.00034097155425921255, + "loss": 725.0725, + "step": 6070 + }, + { + "ce_loss_10": 3.4372221708297728, + "ce_loss_13": 3.381313109397888, + "ce_loss_2": 4.052776217460632, + "ce_loss_3": 3.8650416135787964, + "ce_loss_7": 3.5328315615653993, + "epoch": 0.608, + "grad_norm": 1456.0, + "kl_loss_10": 124.18506164550782, + "kl_loss_2": 1445.7065368652343, + "kl_loss_3": 1072.234182739258, + "kl_loss_7": 342.8041061401367, + "learning_rate": 0.0003394680873574546, + "loss": 736.824, + "step": 6080 + }, + { + "ce_loss_10": 3.553036856651306, + "ce_loss_13": 3.4948946237564087, + "ce_loss_2": 4.163804018497467, + "ce_loss_3": 3.9761640787124635, + "ce_loss_7": 3.6420689702033995, + "epoch": 0.609, + "grad_norm": 640.0, + "kl_loss_10": 125.32937316894531, + "kl_loss_2": 1441.000421142578, + "kl_loss_3": 1071.1825592041016, + "kl_loss_7": 339.07115783691404, + "learning_rate": 0.0003379662370102747, + "loss": 733.7026, + "step": 6090 + }, + { + "ce_loss_10": 3.556735038757324, + "ce_loss_13": 3.5061814308166506, + "ce_loss_2": 4.14713134765625, + "ce_loss_3": 3.96578129529953, + "ce_loss_7": 3.649851453304291, + "epoch": 0.61, + "grad_norm": 708.0, + "kl_loss_10": 123.27509727478028, + "kl_loss_2": 1418.8950134277343, + "kl_loss_3": 1054.7578002929688, + "kl_loss_7": 338.9109802246094, + "learning_rate": 0.0003364660183412892, + "loss": 735.4294, + "step": 6100 + }, + { + "ce_loss_10": 3.538415002822876, + "ce_loss_13": 3.4844126224517824, + "ce_loss_2": 4.1343903064727785, + "ce_loss_3": 3.9533764481544496, + "ce_loss_7": 3.6281407356262205, + "epoch": 0.611, + "grad_norm": 568.0, + "kl_loss_10": 124.66803207397462, + "kl_loss_2": 1415.106756591797, + "kl_loss_3": 1056.7506927490235, + "kl_loss_7": 340.27066802978516, + "learning_rate": 0.0003349674464576834, + "loss": 741.5298, + "step": 6110 + }, + { + "ce_loss_10": 3.491645836830139, + "ce_loss_13": 3.43500040769577, + "ce_loss_2": 4.0973351955413815, + "ce_loss_3": 3.9120434165000915, + "ce_loss_7": 3.5810877084732056, + "epoch": 0.612, + "grad_norm": 676.0, + "kl_loss_10": 124.53080596923829, + "kl_loss_2": 1429.85146484375, + "kl_loss_3": 1068.896337890625, + "kl_loss_7": 338.6284439086914, + "learning_rate": 0.00033347053645005966, + "loss": 725.7744, + "step": 6120 + }, + { + "ce_loss_10": 3.6020973324775696, + "ce_loss_13": 3.54690443277359, + "ce_loss_2": 4.190545213222504, + "ce_loss_3": 4.01407161951065, + "ce_loss_7": 3.6918676257133485, + "epoch": 0.613, + "grad_norm": 1016.0, + "kl_loss_10": 123.26982383728027, + "kl_loss_2": 1385.2622436523438, + "kl_loss_3": 1041.1985900878906, + "kl_loss_7": 335.70687713623045, + "learning_rate": 0.00033197530339228485, + "loss": 735.7329, + "step": 6130 + }, + { + "ce_loss_10": 3.5564875841140746, + "ce_loss_13": 3.5000383257865906, + "ce_loss_2": 4.153977084159851, + "ce_loss_3": 3.976674497127533, + "ce_loss_7": 3.650752902030945, + "epoch": 0.614, + "grad_norm": 520.0, + "kl_loss_10": 125.21984558105468, + "kl_loss_2": 1410.2340087890625, + "kl_loss_3": 1060.819757080078, + "kl_loss_7": 342.1903564453125, + "learning_rate": 0.00033048176234133967, + "loss": 730.7129, + "step": 6140 + }, + { + "ce_loss_10": 3.5401206254959106, + "ce_loss_13": 3.48342844247818, + "ce_loss_2": 4.13830029964447, + "ce_loss_3": 3.9539244890213014, + "ce_loss_7": 3.632445764541626, + "epoch": 0.615, + "grad_norm": 600.0, + "kl_loss_10": 124.45763664245605, + "kl_loss_2": 1419.0204040527344, + "kl_loss_3": 1063.9102508544922, + "kl_loss_7": 341.1919967651367, + "learning_rate": 0.0003289899283371657, + "loss": 739.3029, + "step": 6150 + }, + { + "ce_loss_10": 3.5686771631240846, + "ce_loss_13": 3.5131590008735656, + "ce_loss_2": 4.169058787822723, + "ce_loss_3": 3.9850056529045106, + "ce_loss_7": 3.6635145783424377, + "epoch": 0.616, + "grad_norm": 828.0, + "kl_loss_10": 123.31291618347169, + "kl_loss_2": 1402.3692749023437, + "kl_loss_3": 1052.1495544433594, + "kl_loss_7": 335.0513031005859, + "learning_rate": 0.0003274998164025148, + "loss": 738.5246, + "step": 6160 + }, + { + "ce_loss_10": 3.5952057123184202, + "ce_loss_13": 3.538980412483215, + "ce_loss_2": 4.18984272480011, + "ce_loss_3": 4.006945097446442, + "ce_loss_7": 3.684834325313568, + "epoch": 0.617, + "grad_norm": 508.0, + "kl_loss_10": 126.31275444030761, + "kl_loss_2": 1409.8781188964845, + "kl_loss_3": 1055.6673553466796, + "kl_loss_7": 341.4982223510742, + "learning_rate": 0.0003260114415427975, + "loss": 748.9741, + "step": 6170 + }, + { + "ce_loss_10": 3.5180846571922304, + "ce_loss_13": 3.4631958842277526, + "ce_loss_2": 4.117404592037201, + "ce_loss_3": 3.942760634422302, + "ce_loss_7": 3.6095527529716493, + "epoch": 0.618, + "grad_norm": 544.0, + "kl_loss_10": 123.8319091796875, + "kl_loss_2": 1425.0959899902343, + "kl_loss_3": 1070.6471069335937, + "kl_loss_7": 336.5234832763672, + "learning_rate": 0.0003245248187459323, + "loss": 747.7899, + "step": 6180 + }, + { + "ce_loss_10": 3.503745806217194, + "ce_loss_13": 3.452336239814758, + "ce_loss_2": 4.087360787391662, + "ce_loss_3": 3.902265763282776, + "ce_loss_7": 3.589401423931122, + "epoch": 0.619, + "grad_norm": 812.0, + "kl_loss_10": 119.91349830627442, + "kl_loss_2": 1387.6965454101562, + "kl_loss_3": 1031.423031616211, + "kl_loss_7": 329.9644256591797, + "learning_rate": 0.00032303996298219416, + "loss": 722.3549, + "step": 6190 + }, + { + "ce_loss_10": 3.5912120580673217, + "ce_loss_13": 3.532735550403595, + "ce_loss_2": 4.175826632976532, + "ce_loss_3": 3.990143024921417, + "ce_loss_7": 3.678599214553833, + "epoch": 0.62, + "grad_norm": 572.0, + "kl_loss_10": 123.47973709106445, + "kl_loss_2": 1384.380682373047, + "kl_loss_3": 1031.3387481689454, + "kl_loss_7": 333.18008575439455, + "learning_rate": 0.00032155688920406414, + "loss": 721.7089, + "step": 6200 + }, + { + "ce_loss_10": 3.5028671979904176, + "ce_loss_13": 3.4443759083747865, + "ce_loss_2": 4.133521604537964, + "ce_loss_3": 3.941824531555176, + "ce_loss_7": 3.5987909913063048, + "epoch": 0.621, + "grad_norm": 628.0, + "kl_loss_10": 128.0587142944336, + "kl_loss_2": 1455.4984985351562, + "kl_loss_3": 1085.7472259521485, + "kl_loss_7": 344.9437423706055, + "learning_rate": 0.0003200756123460788, + "loss": 757.9494, + "step": 6210 + }, + { + "ce_loss_10": 3.53128182888031, + "ce_loss_13": 3.473754036426544, + "ce_loss_2": 4.1455615043640135, + "ce_loss_3": 3.960498309135437, + "ce_loss_7": 3.627154862880707, + "epoch": 0.622, + "grad_norm": 852.0, + "kl_loss_10": 126.65972442626953, + "kl_loss_2": 1452.3799194335938, + "kl_loss_3": 1080.0731689453125, + "kl_loss_7": 345.12744140625, + "learning_rate": 0.00031859614732467957, + "loss": 747.3707, + "step": 6220 + }, + { + "ce_loss_10": 3.580655241012573, + "ce_loss_13": 3.5249094009399413, + "ce_loss_2": 4.172132050991058, + "ce_loss_3": 3.9869516491889954, + "ce_loss_7": 3.672192335128784, + "epoch": 0.623, + "grad_norm": 704.0, + "kl_loss_10": 122.48365020751953, + "kl_loss_2": 1384.416357421875, + "kl_loss_3": 1029.3415832519531, + "kl_loss_7": 332.85533294677737, + "learning_rate": 0.00031711850903806275, + "loss": 722.4154, + "step": 6230 + }, + { + "ce_loss_10": 3.4871063590049745, + "ce_loss_13": 3.433118486404419, + "ce_loss_2": 4.09981507062912, + "ce_loss_3": 3.9184054851531984, + "ce_loss_7": 3.580829381942749, + "epoch": 0.624, + "grad_norm": 564.0, + "kl_loss_10": 127.4908618927002, + "kl_loss_2": 1445.4068542480468, + "kl_loss_3": 1080.8368865966797, + "kl_loss_7": 346.9587997436523, + "learning_rate": 0.0003156427123660297, + "loss": 733.6306, + "step": 6240 + }, + { + "ce_loss_10": 3.577600693702698, + "ce_loss_13": 3.520812726020813, + "ce_loss_2": 4.167683470249176, + "ce_loss_3": 3.9879754543304444, + "ce_loss_7": 3.6715123414993287, + "epoch": 0.625, + "grad_norm": 580.0, + "kl_loss_10": 123.75414009094239, + "kl_loss_2": 1393.0920532226562, + "kl_loss_3": 1043.1118560791015, + "kl_loss_7": 336.03589630126953, + "learning_rate": 0.0003141687721698363, + "loss": 735.9893, + "step": 6250 + }, + { + "ce_loss_10": 3.5385371804237367, + "ce_loss_13": 3.4868279933929442, + "ce_loss_2": 4.114449465274811, + "ce_loss_3": 3.9369269490242003, + "ce_loss_7": 3.626781237125397, + "epoch": 0.626, + "grad_norm": 474.0, + "kl_loss_10": 120.09895820617676, + "kl_loss_2": 1350.9141357421875, + "kl_loss_3": 1009.9356262207032, + "kl_loss_7": 325.1434753417969, + "learning_rate": 0.00031269670329204396, + "loss": 718.769, + "step": 6260 + }, + { + "ce_loss_10": 3.582829785346985, + "ce_loss_13": 3.528691279888153, + "ce_loss_2": 4.164977276325226, + "ce_loss_3": 3.9852482438087464, + "ce_loss_7": 3.668261468410492, + "epoch": 0.627, + "grad_norm": 648.0, + "kl_loss_10": 124.36363410949707, + "kl_loss_2": 1384.6681701660157, + "kl_loss_3": 1036.161444091797, + "kl_loss_7": 336.17731018066405, + "learning_rate": 0.00031122652055637015, + "loss": 724.7516, + "step": 6270 + }, + { + "ce_loss_10": 3.543643081188202, + "ce_loss_13": 3.4886670470237733, + "ce_loss_2": 4.15304182767868, + "ce_loss_3": 3.9659879207611084, + "ce_loss_7": 3.6350444078445436, + "epoch": 0.628, + "grad_norm": 744.0, + "kl_loss_10": 124.99579772949218, + "kl_loss_2": 1439.9531860351562, + "kl_loss_3": 1072.6888000488282, + "kl_loss_7": 339.7438629150391, + "learning_rate": 0.0003097582387675385, + "loss": 726.9059, + "step": 6280 + }, + { + "ce_loss_10": 3.5834500670433043, + "ce_loss_13": 3.5285070896148683, + "ce_loss_2": 4.176322209835052, + "ce_loss_3": 3.994720828533173, + "ce_loss_7": 3.670943582057953, + "epoch": 0.629, + "grad_norm": 720.0, + "kl_loss_10": 124.72322578430176, + "kl_loss_2": 1416.1133972167968, + "kl_loss_3": 1054.6082000732422, + "kl_loss_7": 338.46196899414065, + "learning_rate": 0.00030829187271113034, + "loss": 727.2747, + "step": 6290 + }, + { + "ce_loss_10": 3.5845503926277162, + "ce_loss_13": 3.5288984060287474, + "ce_loss_2": 4.167012679576874, + "ce_loss_3": 3.9864045858383177, + "ce_loss_7": 3.6704161047935484, + "epoch": 0.63, + "grad_norm": 520.0, + "kl_loss_10": 122.14001274108887, + "kl_loss_2": 1381.4854248046875, + "kl_loss_3": 1031.7789672851563, + "kl_loss_7": 329.18092193603513, + "learning_rate": 0.00030682743715343565, + "loss": 729.9382, + "step": 6300 + }, + { + "ce_loss_10": 3.530526852607727, + "ce_loss_13": 3.4727389931678774, + "ce_loss_2": 4.135389792919159, + "ce_loss_3": 3.9555342316627504, + "ce_loss_7": 3.6228522419929505, + "epoch": 0.631, + "grad_norm": 608.0, + "kl_loss_10": 126.8146873474121, + "kl_loss_2": 1414.6725708007812, + "kl_loss_3": 1060.110418701172, + "kl_loss_7": 343.28450164794924, + "learning_rate": 0.0003053649468413043, + "loss": 735.6304, + "step": 6310 + }, + { + "ce_loss_10": 3.6406059503555297, + "ce_loss_13": 3.584047770500183, + "ce_loss_2": 4.235766565799713, + "ce_loss_3": 4.052544438838959, + "ce_loss_7": 3.7318387985229493, + "epoch": 0.632, + "grad_norm": 872.0, + "kl_loss_10": 126.71429443359375, + "kl_loss_2": 1412.1578735351563, + "kl_loss_3": 1059.7192962646484, + "kl_loss_7": 343.02261505126955, + "learning_rate": 0.00030390441650199725, + "loss": 726.2171, + "step": 6320 + }, + { + "ce_loss_10": 3.53815176486969, + "ce_loss_13": 3.4841132283210756, + "ce_loss_2": 4.135061252117157, + "ce_loss_3": 3.953271949291229, + "ce_loss_7": 3.6305129885673524, + "epoch": 0.633, + "grad_norm": 580.0, + "kl_loss_10": 122.51690254211425, + "kl_loss_2": 1406.480645751953, + "kl_loss_3": 1045.3659637451171, + "kl_loss_7": 338.1310546875, + "learning_rate": 0.00030244586084303903, + "loss": 723.4697, + "step": 6330 + }, + { + "ce_loss_10": 3.5056158542633056, + "ce_loss_13": 3.45035959482193, + "ce_loss_2": 4.1218892455101015, + "ce_loss_3": 3.9383804321289064, + "ce_loss_7": 3.598428511619568, + "epoch": 0.634, + "grad_norm": 524.0, + "kl_loss_10": 125.22274055480958, + "kl_loss_2": 1454.4839721679687, + "kl_loss_3": 1091.4578918457032, + "kl_loss_7": 343.3879928588867, + "learning_rate": 0.00030098929455206903, + "loss": 732.6389, + "step": 6340 + }, + { + "ce_loss_10": 3.5128793120384216, + "ce_loss_13": 3.458734905719757, + "ce_loss_2": 4.109455585479736, + "ce_loss_3": 3.921981763839722, + "ce_loss_7": 3.603852927684784, + "epoch": 0.635, + "grad_norm": 660.0, + "kl_loss_10": 123.21167640686035, + "kl_loss_2": 1429.5969665527343, + "kl_loss_3": 1065.2174682617188, + "kl_loss_7": 343.878678894043, + "learning_rate": 0.00029953473229669324, + "loss": 754.7498, + "step": 6350 + }, + { + "ce_loss_10": 3.5429938197135926, + "ce_loss_13": 3.4890798926353455, + "ce_loss_2": 4.142077577114105, + "ce_loss_3": 3.9637051105499266, + "ce_loss_7": 3.633872401714325, + "epoch": 0.636, + "grad_norm": 532.0, + "kl_loss_10": 122.68165473937988, + "kl_loss_2": 1408.9635009765625, + "kl_loss_3": 1054.9124420166015, + "kl_loss_7": 341.2626647949219, + "learning_rate": 0.00029808218872433767, + "loss": 726.679, + "step": 6360 + }, + { + "ce_loss_10": 3.6060853004455566, + "ce_loss_13": 3.550333082675934, + "ce_loss_2": 4.186102759838104, + "ce_loss_3": 4.006532001495361, + "ce_loss_7": 3.6955028772354126, + "epoch": 0.637, + "grad_norm": 488.0, + "kl_loss_10": 123.89888572692871, + "kl_loss_2": 1393.8600280761718, + "kl_loss_3": 1041.2120056152344, + "kl_loss_7": 339.00569610595704, + "learning_rate": 0.0002966316784621, + "loss": 721.496, + "step": 6370 + }, + { + "ce_loss_10": 3.5151076436042787, + "ce_loss_13": 3.460108757019043, + "ce_loss_2": 4.134497022628784, + "ce_loss_3": 3.949837851524353, + "ce_loss_7": 3.610580575466156, + "epoch": 0.638, + "grad_norm": 556.0, + "kl_loss_10": 124.86854286193848, + "kl_loss_2": 1453.7235046386718, + "kl_loss_3": 1082.8961395263673, + "kl_loss_7": 346.0769012451172, + "learning_rate": 0.0002951832161166024, + "loss": 732.5327, + "step": 6380 + }, + { + "ce_loss_10": 3.59385507106781, + "ce_loss_13": 3.5390156865119935, + "ce_loss_2": 4.197193539142608, + "ce_loss_3": 4.0121661901474, + "ce_loss_7": 3.6833563446998596, + "epoch": 0.639, + "grad_norm": 780.0, + "kl_loss_10": 125.91835517883301, + "kl_loss_2": 1404.7720581054687, + "kl_loss_3": 1048.3076721191405, + "kl_loss_7": 340.255078125, + "learning_rate": 0.0002937368162738445, + "loss": 721.7094, + "step": 6390 + }, + { + "ce_loss_10": 3.531213629245758, + "ce_loss_13": 3.4822975039482116, + "ce_loss_2": 4.113115763664245, + "ce_loss_3": 3.9376835942268373, + "ce_loss_7": 3.6169563055038454, + "epoch": 0.64, + "grad_norm": 800.0, + "kl_loss_10": 120.01525421142578, + "kl_loss_2": 1385.8341735839845, + "kl_loss_3": 1040.7418151855468, + "kl_loss_7": 330.25692291259764, + "learning_rate": 0.0002922924934990568, + "loss": 729.1814, + "step": 6400 + }, + { + "ce_loss_10": 3.470765709877014, + "ce_loss_13": 3.4150401830673216, + "ce_loss_2": 4.09455554485321, + "ce_loss_3": 3.90150500535965, + "ce_loss_7": 3.5624879002571106, + "epoch": 0.641, + "grad_norm": 532.0, + "kl_loss_10": 123.4526496887207, + "kl_loss_2": 1456.2535888671875, + "kl_loss_3": 1080.2837280273438, + "kl_loss_7": 339.2045959472656, + "learning_rate": 0.0002908502623365536, + "loss": 736.9762, + "step": 6410 + }, + { + "ce_loss_10": 3.4067123413085936, + "ce_loss_13": 3.350449573993683, + "ce_loss_2": 4.032287085056305, + "ce_loss_3": 3.845944118499756, + "ce_loss_7": 3.5039458990097048, + "epoch": 0.642, + "grad_norm": 660.0, + "kl_loss_10": 122.2471866607666, + "kl_loss_2": 1452.7192321777343, + "kl_loss_3": 1079.5542907714844, + "kl_loss_7": 340.34538116455076, + "learning_rate": 0.0002894101373095867, + "loss": 734.8263, + "step": 6420 + }, + { + "ce_loss_10": 3.617260241508484, + "ce_loss_13": 3.561350774765015, + "ce_loss_2": 4.2049798488616945, + "ce_loss_3": 4.030479991436005, + "ce_loss_7": 3.7099608182907104, + "epoch": 0.643, + "grad_norm": 580.0, + "kl_loss_10": 126.8806884765625, + "kl_loss_2": 1398.0826477050782, + "kl_loss_3": 1052.9947052001953, + "kl_loss_7": 344.0747329711914, + "learning_rate": 0.00028797213292019926, + "loss": 731.6678, + "step": 6430 + }, + { + "ce_loss_10": 3.597985827922821, + "ce_loss_13": 3.5433950066566466, + "ce_loss_2": 4.183076286315918, + "ce_loss_3": 4.004205846786499, + "ce_loss_7": 3.6886964678764342, + "epoch": 0.644, + "grad_norm": 498.0, + "kl_loss_10": 124.60531654357911, + "kl_loss_2": 1393.4992248535157, + "kl_loss_3": 1047.5234649658203, + "kl_loss_7": 339.99706268310547, + "learning_rate": 0.0002865362636490791, + "loss": 738.58, + "step": 6440 + }, + { + "ce_loss_10": 3.6034607529640197, + "ce_loss_13": 3.551686775684357, + "ce_loss_2": 4.19627479314804, + "ce_loss_3": 4.010957944393158, + "ce_loss_7": 3.695608949661255, + "epoch": 0.645, + "grad_norm": 532.0, + "kl_loss_10": 122.10445442199708, + "kl_loss_2": 1399.2243957519531, + "kl_loss_3": 1038.563070678711, + "kl_loss_7": 334.98720092773436, + "learning_rate": 0.0002851025439554142, + "loss": 722.2279, + "step": 6450 + }, + { + "ce_loss_10": 3.604773259162903, + "ce_loss_13": 3.547994613647461, + "ce_loss_2": 4.190113079547882, + "ce_loss_3": 4.01669454574585, + "ce_loss_7": 3.6950499534606935, + "epoch": 0.646, + "grad_norm": 568.0, + "kl_loss_10": 124.84522972106933, + "kl_loss_2": 1393.757257080078, + "kl_loss_3": 1045.5576049804688, + "kl_loss_7": 340.6378997802734, + "learning_rate": 0.00028367098827674573, + "loss": 721.3065, + "step": 6460 + }, + { + "ce_loss_10": 3.5277416110038757, + "ce_loss_13": 3.4715929985046388, + "ce_loss_2": 4.1209129095077515, + "ce_loss_3": 3.936191809177399, + "ce_loss_7": 3.617918372154236, + "epoch": 0.647, + "grad_norm": 644.0, + "kl_loss_10": 121.93290557861329, + "kl_loss_2": 1392.9058166503905, + "kl_loss_3": 1035.490902709961, + "kl_loss_7": 330.90175476074216, + "learning_rate": 0.00028224161102882397, + "loss": 724.9437, + "step": 6470 + }, + { + "ce_loss_10": 3.506662678718567, + "ce_loss_13": 3.4533620953559874, + "ce_loss_2": 4.092417252063751, + "ce_loss_3": 3.9158310890197754, + "ce_loss_7": 3.596454584598541, + "epoch": 0.648, + "grad_norm": 700.0, + "kl_loss_10": 122.67344360351562, + "kl_loss_2": 1380.4693481445313, + "kl_loss_3": 1032.2998260498048, + "kl_loss_7": 331.9904846191406, + "learning_rate": 0.00028081442660546124, + "loss": 724.6898, + "step": 6480 + }, + { + "ce_loss_10": 3.561033821105957, + "ce_loss_13": 3.5084841728210447, + "ce_loss_2": 4.147166728973389, + "ce_loss_3": 3.9707638025283813, + "ce_loss_7": 3.6506178617477416, + "epoch": 0.649, + "grad_norm": 612.0, + "kl_loss_10": 125.22979164123535, + "kl_loss_2": 1398.154718017578, + "kl_loss_3": 1043.1712646484375, + "kl_loss_7": 336.69880828857424, + "learning_rate": 0.0002793894493783892, + "loss": 728.1853, + "step": 6490 + }, + { + "ce_loss_10": 3.5835307121276854, + "ce_loss_13": 3.5306628704071046, + "ce_loss_2": 4.164869272708893, + "ce_loss_3": 3.9835118532180784, + "ce_loss_7": 3.6689149260520937, + "epoch": 0.65, + "grad_norm": 648.0, + "kl_loss_10": 121.68083915710449, + "kl_loss_2": 1374.832666015625, + "kl_loss_3": 1024.9227661132813, + "kl_loss_7": 329.3937103271484, + "learning_rate": 0.0002779666936971129, + "loss": 715.8627, + "step": 6500 + }, + { + "ce_loss_10": 3.5850860357284544, + "ce_loss_13": 3.5323161482810974, + "ce_loss_2": 4.190707218647003, + "ce_loss_3": 4.01264888048172, + "ce_loss_7": 3.678597128391266, + "epoch": 0.651, + "grad_norm": 588.0, + "kl_loss_10": 123.99790267944336, + "kl_loss_2": 1415.28037109375, + "kl_loss_3": 1063.5095092773438, + "kl_loss_7": 339.4371505737305, + "learning_rate": 0.00027654617388876614, + "loss": 737.8925, + "step": 6510 + }, + { + "ce_loss_10": 3.618337428569794, + "ce_loss_13": 3.5666980028152464, + "ce_loss_2": 4.202663445472718, + "ce_loss_3": 4.029495453834533, + "ce_loss_7": 3.7100385665893554, + "epoch": 0.652, + "grad_norm": 672.0, + "kl_loss_10": 125.23239784240722, + "kl_loss_2": 1392.5545654296875, + "kl_loss_3": 1036.998681640625, + "kl_loss_7": 336.28130493164065, + "learning_rate": 0.0002751279042579672, + "loss": 726.2184, + "step": 6520 + }, + { + "ce_loss_10": 3.5651236891746523, + "ce_loss_13": 3.510008442401886, + "ce_loss_2": 4.147745668888092, + "ce_loss_3": 3.967809629440308, + "ce_loss_7": 3.655534052848816, + "epoch": 0.653, + "grad_norm": 596.0, + "kl_loss_10": 122.48806457519531, + "kl_loss_2": 1376.0337341308593, + "kl_loss_3": 1025.2164947509766, + "kl_loss_7": 331.06346282958987, + "learning_rate": 0.00027371189908667604, + "loss": 727.9293, + "step": 6530 + }, + { + "ce_loss_10": 3.610418975353241, + "ce_loss_13": 3.5547208905220034, + "ce_loss_2": 4.223685729503631, + "ce_loss_3": 4.037129259109497, + "ce_loss_7": 3.7057825326919556, + "epoch": 0.654, + "grad_norm": 664.0, + "kl_loss_10": 127.65673484802247, + "kl_loss_2": 1437.9926696777343, + "kl_loss_3": 1063.241262817383, + "kl_loss_7": 346.38170166015624, + "learning_rate": 0.00027229817263404863, + "loss": 746.9211, + "step": 6540 + }, + { + "ce_loss_10": 3.594118654727936, + "ce_loss_13": 3.5403075575828553, + "ce_loss_2": 4.165842056274414, + "ce_loss_3": 3.9902518033981322, + "ce_loss_7": 3.6817231297492983, + "epoch": 0.655, + "grad_norm": 612.0, + "kl_loss_10": 123.36601295471192, + "kl_loss_2": 1372.2304443359376, + "kl_loss_3": 1023.0949890136719, + "kl_loss_7": 330.6029968261719, + "learning_rate": 0.0002708867391362948, + "loss": 721.8018, + "step": 6550 + }, + { + "ce_loss_10": 3.5721522688865663, + "ce_loss_13": 3.5191043853759765, + "ce_loss_2": 4.140277779102325, + "ce_loss_3": 3.9575397610664367, + "ce_loss_7": 3.658042335510254, + "epoch": 0.656, + "grad_norm": 494.0, + "kl_loss_10": 121.24272346496582, + "kl_loss_2": 1340.1691162109375, + "kl_loss_3": 1000.573861694336, + "kl_loss_7": 325.7244140625, + "learning_rate": 0.0002694776128065345, + "loss": 714.533, + "step": 6560 + }, + { + "ce_loss_10": 3.503811073303223, + "ce_loss_13": 3.450370526313782, + "ce_loss_2": 4.107607281208038, + "ce_loss_3": 3.92315833568573, + "ce_loss_7": 3.597831392288208, + "epoch": 0.657, + "grad_norm": 450.0, + "kl_loss_10": 122.38468208312989, + "kl_loss_2": 1424.4636169433593, + "kl_loss_3": 1064.6302703857423, + "kl_loss_7": 339.14308166503906, + "learning_rate": 0.00026807080783465374, + "loss": 721.975, + "step": 6570 + }, + { + "ce_loss_10": 3.61981600522995, + "ce_loss_13": 3.5655640721321107, + "ce_loss_2": 4.2158555626869205, + "ce_loss_3": 4.038338911533356, + "ce_loss_7": 3.7099917411804197, + "epoch": 0.658, + "grad_norm": 696.0, + "kl_loss_10": 124.21554260253906, + "kl_loss_2": 1399.7592712402343, + "kl_loss_3": 1044.859848022461, + "kl_loss_7": 337.2662155151367, + "learning_rate": 0.00026666633838716316, + "loss": 734.2561, + "step": 6580 + }, + { + "ce_loss_10": 3.5148606061935426, + "ce_loss_13": 3.4572384357452393, + "ce_loss_2": 4.128946197032929, + "ce_loss_3": 3.9375492334365845, + "ce_loss_7": 3.6097939372062684, + "epoch": 0.659, + "grad_norm": 524.0, + "kl_loss_10": 127.11134567260743, + "kl_loss_2": 1438.689013671875, + "kl_loss_3": 1068.9808227539063, + "kl_loss_7": 344.35106506347654, + "learning_rate": 0.00026526421860705474, + "loss": 741.562, + "step": 6590 + }, + { + "ce_loss_10": 3.537730169296265, + "ce_loss_13": 3.4823685765266417, + "ce_loss_2": 4.134520995616913, + "ce_loss_3": 3.9507094264030456, + "ce_loss_7": 3.6302355527877808, + "epoch": 0.66, + "grad_norm": 780.0, + "kl_loss_10": 124.95889053344726, + "kl_loss_2": 1415.88623046875, + "kl_loss_3": 1054.4846405029298, + "kl_loss_7": 341.83483123779297, + "learning_rate": 0.0002638644626136587, + "loss": 726.3774, + "step": 6600 + }, + { + "ce_loss_10": 3.5502317190170287, + "ce_loss_13": 3.495832419395447, + "ce_loss_2": 4.1422311663627625, + "ce_loss_3": 3.9583224058151245, + "ce_loss_7": 3.6375673055648803, + "epoch": 0.661, + "grad_norm": 540.0, + "kl_loss_10": 122.10346221923828, + "kl_loss_2": 1403.828271484375, + "kl_loss_3": 1050.9346862792968, + "kl_loss_7": 334.5736999511719, + "learning_rate": 0.00026246708450250255, + "loss": 731.0771, + "step": 6610 + }, + { + "ce_loss_10": 3.5466328024864198, + "ce_loss_13": 3.4922005414962767, + "ce_loss_2": 4.128600871562957, + "ce_loss_3": 3.9507038831710815, + "ce_loss_7": 3.633044409751892, + "epoch": 0.662, + "grad_norm": 836.0, + "kl_loss_10": 122.05910720825196, + "kl_loss_2": 1380.4447082519532, + "kl_loss_3": 1030.7735382080077, + "kl_loss_7": 329.82720336914065, + "learning_rate": 0.00026107209834516854, + "loss": 719.4666, + "step": 6620 + }, + { + "ce_loss_10": 3.4943259835243223, + "ce_loss_13": 3.441339361667633, + "ce_loss_2": 4.105875480175018, + "ce_loss_3": 3.918928301334381, + "ce_loss_7": 3.5848689913749694, + "epoch": 0.663, + "grad_norm": 604.0, + "kl_loss_10": 123.39703712463378, + "kl_loss_2": 1449.5614074707032, + "kl_loss_3": 1074.3954467773438, + "kl_loss_7": 340.9826889038086, + "learning_rate": 0.0002596795181891514, + "loss": 745.5657, + "step": 6630 + }, + { + "ce_loss_10": 3.5065298676490784, + "ce_loss_13": 3.4482388257980348, + "ce_loss_2": 4.110616528987885, + "ce_loss_3": 3.921921765804291, + "ce_loss_7": 3.597921073436737, + "epoch": 0.664, + "grad_norm": 624.0, + "kl_loss_10": 126.37582130432129, + "kl_loss_2": 1431.846942138672, + "kl_loss_3": 1067.7791381835937, + "kl_loss_7": 343.4202651977539, + "learning_rate": 0.000258289358057718, + "loss": 756.1383, + "step": 6640 + }, + { + "ce_loss_10": 3.577939677238464, + "ce_loss_13": 3.5215064764022825, + "ce_loss_2": 4.174340093135834, + "ce_loss_3": 3.993846929073334, + "ce_loss_7": 3.6716681122779846, + "epoch": 0.665, + "grad_norm": 668.0, + "kl_loss_10": 125.94303131103516, + "kl_loss_2": 1418.1614074707031, + "kl_loss_3": 1063.662789916992, + "kl_loss_7": 345.09018859863284, + "learning_rate": 0.0002569016319497657, + "loss": 737.0033, + "step": 6650 + }, + { + "ce_loss_10": 3.558946442604065, + "ce_loss_13": 3.504122722148895, + "ce_loss_2": 4.157543361186981, + "ce_loss_3": 3.9731531977653503, + "ce_loss_7": 3.6553978085517884, + "epoch": 0.666, + "grad_norm": 456.0, + "kl_loss_10": 126.41628189086914, + "kl_loss_2": 1425.6318298339843, + "kl_loss_3": 1065.6677124023438, + "kl_loss_7": 344.7531311035156, + "learning_rate": 0.00025551635383968066, + "loss": 743.7271, + "step": 6660 + }, + { + "ce_loss_10": 3.475124168395996, + "ce_loss_13": 3.4190633296966553, + "ce_loss_2": 4.0756109118461605, + "ce_loss_3": 3.892716574668884, + "ce_loss_7": 3.5657395362854003, + "epoch": 0.667, + "grad_norm": 776.0, + "kl_loss_10": 125.15830154418946, + "kl_loss_2": 1434.4202819824218, + "kl_loss_3": 1067.0250885009766, + "kl_loss_7": 342.79358825683596, + "learning_rate": 0.00025413353767719804, + "loss": 737.7381, + "step": 6670 + }, + { + "ce_loss_10": 3.5303653359413145, + "ce_loss_13": 3.47825140953064, + "ce_loss_2": 4.123199880123138, + "ce_loss_3": 3.9427443981170653, + "ce_loss_7": 3.6180224299430845, + "epoch": 0.668, + "grad_norm": 624.0, + "kl_loss_10": 121.16463623046874, + "kl_loss_2": 1412.6630737304688, + "kl_loss_3": 1057.0863677978516, + "kl_loss_7": 333.19127655029297, + "learning_rate": 0.0002527531973872617, + "loss": 734.8999, + "step": 6680 + }, + { + "ce_loss_10": 3.547012138366699, + "ce_loss_13": 3.4931538343429565, + "ce_loss_2": 4.134020876884461, + "ce_loss_3": 3.961080086231232, + "ce_loss_7": 3.6357810616493227, + "epoch": 0.669, + "grad_norm": 470.0, + "kl_loss_10": 121.4739860534668, + "kl_loss_2": 1401.3072937011718, + "kl_loss_3": 1050.0212188720702, + "kl_loss_7": 335.34568786621094, + "learning_rate": 0.0002513753468698826, + "loss": 727.7887, + "step": 6690 + }, + { + "ce_loss_10": 3.5165226101875304, + "ce_loss_13": 3.460037863254547, + "ce_loss_2": 4.116587007045746, + "ce_loss_3": 3.9380706429481505, + "ce_loss_7": 3.6079202771186827, + "epoch": 0.67, + "grad_norm": 604.0, + "kl_loss_10": 124.7790916442871, + "kl_loss_2": 1438.8750366210938, + "kl_loss_3": 1075.9765594482421, + "kl_loss_7": 341.0475021362305, + "learning_rate": 0.0002500000000000001, + "loss": 739.6845, + "step": 6700 + }, + { + "ce_loss_10": 3.6302000522613525, + "ce_loss_13": 3.5780060410499575, + "ce_loss_2": 4.188638615608215, + "ce_loss_3": 4.019280982017517, + "ce_loss_7": 3.71583753824234, + "epoch": 0.671, + "grad_norm": 548.0, + "kl_loss_10": 120.78085708618164, + "kl_loss_2": 1356.7482360839845, + "kl_loss_3": 1016.2201110839844, + "kl_loss_7": 327.5862182617187, + "learning_rate": 0.0002486271706273421, + "loss": 736.1638, + "step": 6710 + }, + { + "ce_loss_10": 3.569162166118622, + "ce_loss_13": 3.5176132678985597, + "ce_loss_2": 4.128765881061554, + "ce_loss_3": 3.955933618545532, + "ce_loss_7": 3.6550265192985534, + "epoch": 0.672, + "grad_norm": 540.0, + "kl_loss_10": 120.67655563354492, + "kl_loss_2": 1342.1685119628905, + "kl_loss_3": 1003.4603668212891, + "kl_loss_7": 326.5387390136719, + "learning_rate": 0.0002472568725762853, + "loss": 721.6279, + "step": 6720 + }, + { + "ce_loss_10": 3.5551081538200378, + "ce_loss_13": 3.5027125597000124, + "ce_loss_2": 4.122924709320069, + "ce_loss_3": 3.947139251232147, + "ce_loss_7": 3.640329647064209, + "epoch": 0.673, + "grad_norm": 498.0, + "kl_loss_10": 120.39876937866211, + "kl_loss_2": 1360.6974243164063, + "kl_loss_3": 1011.3546844482422, + "kl_loss_7": 324.81715393066406, + "learning_rate": 0.00024588911964571554, + "loss": 714.1743, + "step": 6730 + }, + { + "ce_loss_10": 3.5764050483703613, + "ce_loss_13": 3.5180741786956786, + "ce_loss_2": 4.191099977493286, + "ce_loss_3": 4.004332780838013, + "ce_loss_7": 3.6720454335212707, + "epoch": 0.674, + "grad_norm": 580.0, + "kl_loss_10": 128.467537689209, + "kl_loss_2": 1438.534490966797, + "kl_loss_3": 1073.315625, + "kl_loss_7": 347.5059188842773, + "learning_rate": 0.00024452392560888974, + "loss": 732.8049, + "step": 6740 + }, + { + "ce_loss_10": 3.463340771198273, + "ce_loss_13": 3.4112897157669066, + "ce_loss_2": 4.061108565330505, + "ce_loss_3": 3.8762802481651306, + "ce_loss_7": 3.55279803276062, + "epoch": 0.675, + "grad_norm": 536.0, + "kl_loss_10": 121.23651008605957, + "kl_loss_2": 1415.8220092773438, + "kl_loss_3": 1050.3499908447266, + "kl_loss_7": 334.4798278808594, + "learning_rate": 0.00024316130421329695, + "loss": 724.3385, + "step": 6750 + }, + { + "ce_loss_10": 3.544707751274109, + "ce_loss_13": 3.4911407709121702, + "ce_loss_2": 4.13702780008316, + "ce_loss_3": 3.952805519104004, + "ce_loss_7": 3.6338755965232847, + "epoch": 0.676, + "grad_norm": 504.0, + "kl_loss_10": 122.65901947021484, + "kl_loss_2": 1379.1199523925782, + "kl_loss_3": 1026.6421783447265, + "kl_loss_7": 329.13787536621095, + "learning_rate": 0.00024180126918051909, + "loss": 720.2172, + "step": 6760 + }, + { + "ce_loss_10": 3.585362899303436, + "ce_loss_13": 3.533327639102936, + "ce_loss_2": 4.1589976906776425, + "ce_loss_3": 3.9885815143585206, + "ce_loss_7": 3.674212193489075, + "epoch": 0.677, + "grad_norm": 580.0, + "kl_loss_10": 122.34142112731934, + "kl_loss_2": 1375.4806579589845, + "kl_loss_3": 1026.1968933105468, + "kl_loss_7": 332.3818161010742, + "learning_rate": 0.00024044383420609406, + "loss": 716.5443, + "step": 6770 + }, + { + "ce_loss_10": 3.5967800617218018, + "ce_loss_13": 3.54542738199234, + "ce_loss_2": 4.162489807605743, + "ce_loss_3": 3.9877678751945496, + "ce_loss_7": 3.6823439836502074, + "epoch": 0.678, + "grad_norm": 684.0, + "kl_loss_10": 121.3688591003418, + "kl_loss_2": 1374.671405029297, + "kl_loss_3": 1025.4278350830077, + "kl_loss_7": 330.73584136962893, + "learning_rate": 0.00023908901295937712, + "loss": 726.931, + "step": 6780 + }, + { + "ce_loss_10": 3.5975115299224854, + "ce_loss_13": 3.541917252540588, + "ce_loss_2": 4.178954315185547, + "ce_loss_3": 3.9986441016197203, + "ce_loss_7": 3.685844695568085, + "epoch": 0.679, + "grad_norm": 796.0, + "kl_loss_10": 123.49897994995118, + "kl_loss_2": 1371.78173828125, + "kl_loss_3": 1025.5135620117187, + "kl_loss_7": 330.7898513793945, + "learning_rate": 0.00023773681908340283, + "loss": 734.601, + "step": 6790 + }, + { + "ce_loss_10": 3.570222854614258, + "ce_loss_13": 3.5119346141815186, + "ce_loss_2": 4.171153092384339, + "ce_loss_3": 3.988275647163391, + "ce_loss_7": 3.661699855327606, + "epoch": 0.68, + "grad_norm": 648.0, + "kl_loss_10": 128.40531120300292, + "kl_loss_2": 1437.0543151855468, + "kl_loss_3": 1074.290087890625, + "kl_loss_7": 346.53758697509767, + "learning_rate": 0.00023638726619474876, + "loss": 749.3265, + "step": 6800 + }, + { + "ce_loss_10": 3.555301105976105, + "ce_loss_13": 3.50036598443985, + "ce_loss_2": 4.171612620353699, + "ce_loss_3": 3.9867984652519226, + "ce_loss_7": 3.6493401288986207, + "epoch": 0.681, + "grad_norm": 628.0, + "kl_loss_10": 123.96326141357422, + "kl_loss_2": 1442.6337036132813, + "kl_loss_3": 1080.4893859863282, + "kl_loss_7": 341.9077606201172, + "learning_rate": 0.0002350403678833976, + "loss": 737.603, + "step": 6810 + }, + { + "ce_loss_10": 3.4832152485847474, + "ce_loss_13": 3.4290732622146605, + "ce_loss_2": 4.0865050792694095, + "ce_loss_3": 3.9012642741203307, + "ce_loss_7": 3.5707468390464783, + "epoch": 0.682, + "grad_norm": 388.0, + "kl_loss_10": 121.88200225830079, + "kl_loss_2": 1425.3134704589843, + "kl_loss_3": 1067.0139251708983, + "kl_loss_7": 333.2817916870117, + "learning_rate": 0.00023369613771260007, + "loss": 730.0828, + "step": 6820 + }, + { + "ce_loss_10": 3.602006256580353, + "ce_loss_13": 3.546574425697327, + "ce_loss_2": 4.19493260383606, + "ce_loss_3": 4.014666783809662, + "ce_loss_7": 3.694399046897888, + "epoch": 0.683, + "grad_norm": 896.0, + "kl_loss_10": 124.15231056213379, + "kl_loss_2": 1416.5041259765626, + "kl_loss_3": 1057.1513946533203, + "kl_loss_7": 338.50926971435547, + "learning_rate": 0.00023235458921873925, + "loss": 737.5669, + "step": 6830 + }, + { + "ce_loss_10": 3.5514315247535704, + "ce_loss_13": 3.4963643193244933, + "ce_loss_2": 4.175014972686768, + "ce_loss_3": 3.987934386730194, + "ce_loss_7": 3.649992787837982, + "epoch": 0.684, + "grad_norm": 780.0, + "kl_loss_10": 126.71570472717285, + "kl_loss_2": 1460.5936950683595, + "kl_loss_3": 1091.2405181884765, + "kl_loss_7": 351.1172622680664, + "learning_rate": 0.0002310157359111938, + "loss": 752.2999, + "step": 6840 + }, + { + "ce_loss_10": 3.4418938040733336, + "ce_loss_13": 3.3859288573265074, + "ce_loss_2": 4.088224470615387, + "ce_loss_3": 3.8917805433273314, + "ce_loss_7": 3.539370059967041, + "epoch": 0.685, + "grad_norm": 932.0, + "kl_loss_10": 125.29055137634278, + "kl_loss_2": 1493.244873046875, + "kl_loss_3": 1106.4089385986329, + "kl_loss_7": 346.971989440918, + "learning_rate": 0.0002296795912722014, + "loss": 747.3044, + "step": 6850 + }, + { + "ce_loss_10": 3.582219159603119, + "ce_loss_13": 3.5297583818435667, + "ce_loss_2": 4.163276970386505, + "ce_loss_3": 3.9838827729225157, + "ce_loss_7": 3.673186790943146, + "epoch": 0.686, + "grad_norm": 688.0, + "kl_loss_10": 122.75707702636718, + "kl_loss_2": 1381.4411376953126, + "kl_loss_3": 1023.4174530029297, + "kl_loss_7": 331.40368804931643, + "learning_rate": 0.0002283461687567236, + "loss": 713.3312, + "step": 6860 + }, + { + "ce_loss_10": 3.64503128528595, + "ce_loss_13": 3.5887781262397764, + "ce_loss_2": 4.219438052177429, + "ce_loss_3": 4.043782579898834, + "ce_loss_7": 3.7344509243965147, + "epoch": 0.687, + "grad_norm": 516.0, + "kl_loss_10": 123.2198699951172, + "kl_loss_2": 1361.601385498047, + "kl_loss_3": 1017.4406616210938, + "kl_loss_7": 328.3076904296875, + "learning_rate": 0.00022701548179231045, + "loss": 725.902, + "step": 6870 + }, + { + "ce_loss_10": 3.5936806321144106, + "ce_loss_13": 3.538178300857544, + "ce_loss_2": 4.177520775794983, + "ce_loss_3": 3.9925229072570803, + "ce_loss_7": 3.6811250925064085, + "epoch": 0.688, + "grad_norm": 812.0, + "kl_loss_10": 124.56343841552734, + "kl_loss_2": 1398.6163452148437, + "kl_loss_3": 1039.188638305664, + "kl_loss_7": 333.23457489013674, + "learning_rate": 0.00022568754377896516, + "loss": 717.3196, + "step": 6880 + }, + { + "ce_loss_10": 3.5798384070396425, + "ce_loss_13": 3.52534077167511, + "ce_loss_2": 4.165230524539948, + "ce_loss_3": 3.9855322360992433, + "ce_loss_7": 3.669637417793274, + "epoch": 0.689, + "grad_norm": 636.0, + "kl_loss_10": 122.47865982055664, + "kl_loss_2": 1400.101580810547, + "kl_loss_3": 1045.2517852783203, + "kl_loss_7": 337.63702697753905, + "learning_rate": 0.00022436236808900844, + "loss": 724.2279, + "step": 6890 + }, + { + "ce_loss_10": 3.4826380729675295, + "ce_loss_13": 3.4284149289131163, + "ce_loss_2": 4.081106758117675, + "ce_loss_3": 3.8968387603759767, + "ce_loss_7": 3.5732897400856016, + "epoch": 0.69, + "grad_norm": 720.0, + "kl_loss_10": 123.13472061157226, + "kl_loss_2": 1424.4590087890624, + "kl_loss_3": 1061.0906616210937, + "kl_loss_7": 336.4748291015625, + "learning_rate": 0.00022303996806694487, + "loss": 726.4395, + "step": 6900 + }, + { + "ce_loss_10": 3.5565951108932494, + "ce_loss_13": 3.5028126001358033, + "ce_loss_2": 4.153330898284912, + "ce_loss_3": 3.972140097618103, + "ce_loss_7": 3.6470712184906007, + "epoch": 0.691, + "grad_norm": 620.0, + "kl_loss_10": 121.5474639892578, + "kl_loss_2": 1406.192333984375, + "kl_loss_3": 1054.1312591552735, + "kl_loss_7": 333.32630310058596, + "learning_rate": 0.00022172035702932823, + "loss": 725.3298, + "step": 6910 + }, + { + "ce_loss_10": 3.596930432319641, + "ce_loss_13": 3.5424793004989623, + "ce_loss_2": 4.174435448646546, + "ce_loss_3": 4.0008144736289974, + "ce_loss_7": 3.6854524970054627, + "epoch": 0.692, + "grad_norm": 588.0, + "kl_loss_10": 122.45778541564941, + "kl_loss_2": 1364.4023864746093, + "kl_loss_3": 1023.2470947265625, + "kl_loss_7": 330.3191375732422, + "learning_rate": 0.00022040354826462666, + "loss": 715.1212, + "step": 6920 + }, + { + "ce_loss_10": 3.5372806906700136, + "ce_loss_13": 3.484300172328949, + "ce_loss_2": 4.122054016590118, + "ce_loss_3": 3.9410375475883486, + "ce_loss_7": 3.6267031908035277, + "epoch": 0.693, + "grad_norm": 592.0, + "kl_loss_10": 121.36230354309082, + "kl_loss_2": 1387.7280639648438, + "kl_loss_3": 1035.8701171875, + "kl_loss_7": 328.8681671142578, + "learning_rate": 0.0002190895550330899, + "loss": 723.1688, + "step": 6930 + }, + { + "ce_loss_10": 3.4684043288230897, + "ce_loss_13": 3.4110892057418822, + "ce_loss_2": 4.0784489750862125, + "ce_loss_3": 3.8970334768295287, + "ce_loss_7": 3.5636763691902162, + "epoch": 0.694, + "grad_norm": 708.0, + "kl_loss_10": 125.41013412475586, + "kl_loss_2": 1438.7724853515624, + "kl_loss_3": 1074.242709350586, + "kl_loss_7": 343.83753967285156, + "learning_rate": 0.00021777839056661552, + "loss": 726.9423, + "step": 6940 + }, + { + "ce_loss_10": 3.5493281960487364, + "ce_loss_13": 3.4966490268707275, + "ce_loss_2": 4.138473987579346, + "ce_loss_3": 3.9581478118896483, + "ce_loss_7": 3.6378383159637453, + "epoch": 0.695, + "grad_norm": 482.0, + "kl_loss_10": 121.87666625976563, + "kl_loss_2": 1388.4705749511718, + "kl_loss_3": 1035.8258056640625, + "kl_loss_7": 330.15142211914065, + "learning_rate": 0.0002164700680686147, + "loss": 714.1528, + "step": 6950 + }, + { + "ce_loss_10": 3.5943965315818787, + "ce_loss_13": 3.5415783882141114, + "ce_loss_2": 4.166309404373169, + "ce_loss_3": 3.9927005410194396, + "ce_loss_7": 3.6852437376976015, + "epoch": 0.696, + "grad_norm": 524.0, + "kl_loss_10": 123.10262947082519, + "kl_loss_2": 1358.2594055175782, + "kl_loss_3": 1022.4194183349609, + "kl_loss_7": 332.60595855712893, + "learning_rate": 0.0002151646007138806, + "loss": 711.1066, + "step": 6960 + }, + { + "ce_loss_10": 3.4674277782440184, + "ce_loss_13": 3.4147743582725525, + "ce_loss_2": 4.0764969229698185, + "ce_loss_3": 3.8895933270454406, + "ce_loss_7": 3.559154045581818, + "epoch": 0.697, + "grad_norm": 506.0, + "kl_loss_10": 124.91875114440919, + "kl_loss_2": 1433.1934448242187, + "kl_loss_3": 1072.8756225585937, + "kl_loss_7": 340.41393890380857, + "learning_rate": 0.00021386200164845526, + "loss": 732.8288, + "step": 6970 + }, + { + "ce_loss_10": 3.655595052242279, + "ce_loss_13": 3.600779819488525, + "ce_loss_2": 4.214467906951905, + "ce_loss_3": 4.045685410499573, + "ce_loss_7": 3.7411547183990477, + "epoch": 0.698, + "grad_norm": 492.0, + "kl_loss_10": 123.51595115661621, + "kl_loss_2": 1358.5181579589844, + "kl_loss_3": 1018.3159729003906, + "kl_loss_7": 331.73258514404296, + "learning_rate": 0.0002125622839894964, + "loss": 717.1124, + "step": 6980 + }, + { + "ce_loss_10": 3.5993648409843444, + "ce_loss_13": 3.5441166758537292, + "ce_loss_2": 4.181045114994049, + "ce_loss_3": 4.002468681335449, + "ce_loss_7": 3.6823740720748903, + "epoch": 0.699, + "grad_norm": 470.0, + "kl_loss_10": 122.90305519104004, + "kl_loss_2": 1383.0861511230469, + "kl_loss_3": 1029.9532836914063, + "kl_loss_7": 329.35023193359376, + "learning_rate": 0.00021126546082514663, + "loss": 715.5651, + "step": 6990 + }, + { + "ce_loss_10": 3.6185239911079408, + "ce_loss_13": 3.565263593196869, + "ce_loss_2": 4.191958248615265, + "ce_loss_3": 4.013105678558349, + "ce_loss_7": 3.705930233001709, + "epoch": 0.7, + "grad_norm": 600.0, + "kl_loss_10": 123.65934333801269, + "kl_loss_2": 1374.6581237792968, + "kl_loss_3": 1025.8725860595703, + "kl_loss_7": 331.70873565673827, + "learning_rate": 0.00020997154521440098, + "loss": 715.4568, + "step": 7000 + }, + { + "ce_loss_10": 3.557955777645111, + "ce_loss_13": 3.503411018848419, + "ce_loss_2": 4.137540674209594, + "ce_loss_3": 3.959644913673401, + "ce_loss_7": 3.6458826899528503, + "epoch": 0.701, + "grad_norm": 536.0, + "kl_loss_10": 120.48597831726075, + "kl_loss_2": 1374.6299133300781, + "kl_loss_3": 1030.6099670410156, + "kl_loss_7": 329.32201385498047, + "learning_rate": 0.0002086805501869749, + "loss": 710.937, + "step": 7010 + }, + { + "ce_loss_10": 3.526408576965332, + "ce_loss_13": 3.4718175530433655, + "ce_loss_2": 4.136154270172119, + "ce_loss_3": 3.9502137660980225, + "ce_loss_7": 3.618171179294586, + "epoch": 0.702, + "grad_norm": 676.0, + "kl_loss_10": 123.665576171875, + "kl_loss_2": 1445.9505981445313, + "kl_loss_3": 1081.7786437988282, + "kl_loss_7": 344.37613677978516, + "learning_rate": 0.0002073924887431744, + "loss": 737.9115, + "step": 7020 + }, + { + "ce_loss_10": 3.537488567829132, + "ce_loss_13": 3.4824650168418883, + "ce_loss_2": 4.120364391803742, + "ce_loss_3": 3.942833948135376, + "ce_loss_7": 3.628614103794098, + "epoch": 0.703, + "grad_norm": 580.0, + "kl_loss_10": 122.33941802978515, + "kl_loss_2": 1393.7160217285157, + "kl_loss_3": 1044.1484924316405, + "kl_loss_7": 335.0018844604492, + "learning_rate": 0.00020610737385376348, + "loss": 733.2367, + "step": 7030 + }, + { + "ce_loss_10": 3.6018667101860045, + "ce_loss_13": 3.547727274894714, + "ce_loss_2": 4.1699677348136905, + "ce_loss_3": 3.9950947046279905, + "ce_loss_7": 3.6891037821769714, + "epoch": 0.704, + "grad_norm": 664.0, + "kl_loss_10": 122.15450096130371, + "kl_loss_2": 1356.7330383300782, + "kl_loss_3": 1011.3340454101562, + "kl_loss_7": 326.781640625, + "learning_rate": 0.00020482521845983521, + "loss": 721.0978, + "step": 7040 + }, + { + "ce_loss_10": 3.59405198097229, + "ce_loss_13": 3.538354456424713, + "ce_loss_2": 4.1835708022117615, + "ce_loss_3": 3.999073255062103, + "ce_loss_7": 3.6849095940589907, + "epoch": 0.705, + "grad_norm": 688.0, + "kl_loss_10": 126.76047058105469, + "kl_loss_2": 1398.0797119140625, + "kl_loss_3": 1040.0777679443358, + "kl_loss_7": 339.4572814941406, + "learning_rate": 0.00020354603547267987, + "loss": 733.3687, + "step": 7050 + }, + { + "ce_loss_10": 3.579659843444824, + "ce_loss_13": 3.521292781829834, + "ce_loss_2": 4.17962476015091, + "ce_loss_3": 3.9999419927597044, + "ce_loss_7": 3.672910511493683, + "epoch": 0.706, + "grad_norm": 506.0, + "kl_loss_10": 125.07297286987304, + "kl_loss_2": 1407.0430908203125, + "kl_loss_3": 1054.5457977294923, + "kl_loss_7": 338.6408721923828, + "learning_rate": 0.00020226983777365604, + "loss": 743.0122, + "step": 7060 + }, + { + "ce_loss_10": 3.4797715306282044, + "ce_loss_13": 3.4280895590782166, + "ce_loss_2": 4.093261420726776, + "ce_loss_3": 3.9058175683021545, + "ce_loss_7": 3.5675304889678956, + "epoch": 0.707, + "grad_norm": 460.0, + "kl_loss_10": 118.65347862243652, + "kl_loss_2": 1437.4217590332032, + "kl_loss_3": 1060.7939331054688, + "kl_loss_7": 327.5643600463867, + "learning_rate": 0.00020099663821406056, + "loss": 725.6605, + "step": 7070 + }, + { + "ce_loss_10": 3.584267723560333, + "ce_loss_13": 3.5309558272361756, + "ce_loss_2": 4.158235001564026, + "ce_loss_3": 3.9820183753967284, + "ce_loss_7": 3.6715306878089904, + "epoch": 0.708, + "grad_norm": 736.0, + "kl_loss_10": 120.5715232849121, + "kl_loss_2": 1372.4082824707032, + "kl_loss_3": 1020.9255096435547, + "kl_loss_7": 326.9998489379883, + "learning_rate": 0.00019972644961499853, + "loss": 723.5401, + "step": 7080 + }, + { + "ce_loss_10": 3.5517083406448364, + "ce_loss_13": 3.498913753032684, + "ce_loss_2": 4.1585370898246765, + "ce_loss_3": 3.977364408969879, + "ce_loss_7": 3.6439939856529238, + "epoch": 0.709, + "grad_norm": 596.0, + "kl_loss_10": 123.60837135314941, + "kl_loss_2": 1431.0212951660155, + "kl_loss_3": 1069.8634002685546, + "kl_loss_7": 340.6724349975586, + "learning_rate": 0.00019845928476725522, + "loss": 732.3908, + "step": 7090 + }, + { + "ce_loss_10": 3.626398813724518, + "ce_loss_13": 3.572152090072632, + "ce_loss_2": 4.209421014785766, + "ce_loss_3": 4.036573505401611, + "ce_loss_7": 3.7170314311981203, + "epoch": 0.71, + "grad_norm": 596.0, + "kl_loss_10": 123.35403976440429, + "kl_loss_2": 1387.5173217773438, + "kl_loss_3": 1038.8995300292968, + "kl_loss_7": 335.42689056396483, + "learning_rate": 0.00019719515643116677, + "loss": 738.7718, + "step": 7100 + }, + { + "ce_loss_10": 3.572767961025238, + "ce_loss_13": 3.515077757835388, + "ce_loss_2": 4.155257606506348, + "ce_loss_3": 3.9734201788902284, + "ce_loss_7": 3.6621560573577883, + "epoch": 0.711, + "grad_norm": 580.0, + "kl_loss_10": 123.21557540893555, + "kl_loss_2": 1376.1987548828124, + "kl_loss_3": 1026.8055908203125, + "kl_loss_7": 331.16675262451173, + "learning_rate": 0.0001959340773364911, + "loss": 723.44, + "step": 7110 + }, + { + "ce_loss_10": 3.5861273288726805, + "ce_loss_13": 3.5300360202789305, + "ce_loss_2": 4.173725187778473, + "ce_loss_3": 3.996710407733917, + "ce_loss_7": 3.674662780761719, + "epoch": 0.712, + "grad_norm": 482.0, + "kl_loss_10": 123.5694019317627, + "kl_loss_2": 1397.5086608886718, + "kl_loss_3": 1041.5564971923827, + "kl_loss_7": 333.47118682861327, + "learning_rate": 0.0001946760601822809, + "loss": 715.5152, + "step": 7120 + }, + { + "ce_loss_10": 3.638036513328552, + "ce_loss_13": 3.585539197921753, + "ce_loss_2": 4.208897590637207, + "ce_loss_3": 4.033627045154572, + "ce_loss_7": 3.7303940296173095, + "epoch": 0.713, + "grad_norm": 588.0, + "kl_loss_10": 120.84820327758788, + "kl_loss_2": 1364.6588439941406, + "kl_loss_3": 1017.2987609863281, + "kl_loss_7": 331.6714385986328, + "learning_rate": 0.00019342111763675512, + "loss": 705.8723, + "step": 7130 + }, + { + "ce_loss_10": 3.6417887210845947, + "ce_loss_13": 3.5866674900054933, + "ce_loss_2": 4.206362402439117, + "ce_loss_3": 4.032591104507446, + "ce_loss_7": 3.726455843448639, + "epoch": 0.714, + "grad_norm": 488.0, + "kl_loss_10": 125.14588623046875, + "kl_loss_2": 1367.4994873046876, + "kl_loss_3": 1022.9796356201172, + "kl_loss_7": 332.9434066772461, + "learning_rate": 0.00019216926233717085, + "loss": 711.7196, + "step": 7140 + }, + { + "ce_loss_10": 3.5203476548194885, + "ce_loss_13": 3.4688742399215697, + "ce_loss_2": 4.134868347644806, + "ce_loss_3": 3.9454983115196227, + "ce_loss_7": 3.6101885557174684, + "epoch": 0.715, + "grad_norm": 536.0, + "kl_loss_10": 121.2591423034668, + "kl_loss_2": 1433.3612915039062, + "kl_loss_3": 1061.6283813476562, + "kl_loss_7": 328.62769012451173, + "learning_rate": 0.00019092050688969737, + "loss": 730.5997, + "step": 7150 + }, + { + "ce_loss_10": 3.5955933809280394, + "ce_loss_13": 3.544034016132355, + "ce_loss_2": 4.171285545825958, + "ce_loss_3": 3.988926124572754, + "ce_loss_7": 3.680844259262085, + "epoch": 0.716, + "grad_norm": 552.0, + "kl_loss_10": 121.48760719299317, + "kl_loss_2": 1380.6668640136718, + "kl_loss_3": 1031.5187957763671, + "kl_loss_7": 330.45118255615233, + "learning_rate": 0.00018967486386928817, + "loss": 714.2, + "step": 7160 + }, + { + "ce_loss_10": 3.469675064086914, + "ce_loss_13": 3.414101004600525, + "ce_loss_2": 4.0789219498634335, + "ce_loss_3": 3.894315481185913, + "ce_loss_7": 3.5637245774269104, + "epoch": 0.717, + "grad_norm": 656.0, + "kl_loss_10": 121.65066604614258, + "kl_loss_2": 1434.495782470703, + "kl_loss_3": 1071.64462890625, + "kl_loss_7": 339.27454376220703, + "learning_rate": 0.00018843234581955443, + "loss": 752.8244, + "step": 7170 + }, + { + "ce_loss_10": 3.485479485988617, + "ce_loss_13": 3.429234707355499, + "ce_loss_2": 4.093677091598511, + "ce_loss_3": 3.906477701663971, + "ce_loss_7": 3.5766185998916624, + "epoch": 0.718, + "grad_norm": 608.0, + "kl_loss_10": 124.83513031005859, + "kl_loss_2": 1436.6897827148437, + "kl_loss_3": 1067.1097259521484, + "kl_loss_7": 339.108544921875, + "learning_rate": 0.00018719296525263924, + "loss": 735.575, + "step": 7180 + }, + { + "ce_loss_10": 3.5846765637397766, + "ce_loss_13": 3.529562759399414, + "ce_loss_2": 4.150809407234192, + "ce_loss_3": 3.9718990206718443, + "ce_loss_7": 3.6702256917953493, + "epoch": 0.719, + "grad_norm": 442.0, + "kl_loss_10": 122.53091278076172, + "kl_loss_2": 1356.4698181152344, + "kl_loss_3": 1008.6680480957032, + "kl_loss_7": 327.7504058837891, + "learning_rate": 0.0001859567346490913, + "loss": 712.9806, + "step": 7190 + }, + { + "ce_loss_10": 3.554144012928009, + "ce_loss_13": 3.4977105259895325, + "ce_loss_2": 4.150786626338959, + "ce_loss_3": 3.9735541462898256, + "ce_loss_7": 3.6462294340133665, + "epoch": 0.72, + "grad_norm": 676.0, + "kl_loss_10": 123.88136787414551, + "kl_loss_2": 1414.2480834960938, + "kl_loss_3": 1060.4930938720704, + "kl_loss_7": 338.99412231445314, + "learning_rate": 0.0001847236664577389, + "loss": 719.9778, + "step": 7200 + }, + { + "ce_loss_10": 3.5858793020248414, + "ce_loss_13": 3.5322806358337404, + "ce_loss_2": 4.153151452541351, + "ce_loss_3": 3.978500175476074, + "ce_loss_7": 3.6706031560897827, + "epoch": 0.721, + "grad_norm": 430.0, + "kl_loss_10": 123.21329269409179, + "kl_loss_2": 1359.6646850585937, + "kl_loss_3": 1009.3902221679688, + "kl_loss_7": 329.09681243896483, + "learning_rate": 0.00018349377309556487, + "loss": 702.6776, + "step": 7210 + }, + { + "ce_loss_10": 3.525311827659607, + "ce_loss_13": 3.4706099390983582, + "ce_loss_2": 4.130126202106476, + "ce_loss_3": 3.9440832138061523, + "ce_loss_7": 3.615574586391449, + "epoch": 0.722, + "grad_norm": 684.0, + "kl_loss_10": 122.96382064819336, + "kl_loss_2": 1434.9597778320312, + "kl_loss_3": 1071.8526824951173, + "kl_loss_7": 338.4400665283203, + "learning_rate": 0.00018226706694758193, + "loss": 733.9826, + "step": 7220 + }, + { + "ce_loss_10": 3.6010716795921325, + "ce_loss_13": 3.5477463483810423, + "ce_loss_2": 4.1720555305480955, + "ce_loss_3": 3.999657225608826, + "ce_loss_7": 3.6874852776527405, + "epoch": 0.723, + "grad_norm": 596.0, + "kl_loss_10": 122.04865837097168, + "kl_loss_2": 1376.6821655273438, + "kl_loss_3": 1035.7532440185546, + "kl_loss_7": 331.5156555175781, + "learning_rate": 0.0001810435603667075, + "loss": 733.5101, + "step": 7230 + }, + { + "ce_loss_10": 3.448229801654816, + "ce_loss_13": 3.395207440853119, + "ce_loss_2": 4.045393764972687, + "ce_loss_3": 3.8576236844062803, + "ce_loss_7": 3.538518488407135, + "epoch": 0.724, + "grad_norm": 568.0, + "kl_loss_10": 118.53696784973144, + "kl_loss_2": 1409.9154846191407, + "kl_loss_3": 1048.6354064941406, + "kl_loss_7": 329.32737426757814, + "learning_rate": 0.0001798232656736389, + "loss": 731.7912, + "step": 7240 + }, + { + "ce_loss_10": 3.6248636484146117, + "ce_loss_13": 3.5711260557174684, + "ce_loss_2": 4.182618510723114, + "ce_loss_3": 4.011233007907867, + "ce_loss_7": 3.7107828497886657, + "epoch": 0.725, + "grad_norm": 648.0, + "kl_loss_10": 123.01177520751953, + "kl_loss_2": 1340.389141845703, + "kl_loss_3": 1003.9724822998047, + "kl_loss_7": 328.3069091796875, + "learning_rate": 0.0001786061951567303, + "loss": 717.0013, + "step": 7250 + }, + { + "ce_loss_10": 3.5421255350112917, + "ce_loss_13": 3.4872742772102354, + "ce_loss_2": 4.13176680803299, + "ce_loss_3": 3.9531715869903565, + "ce_loss_7": 3.6325071692466735, + "epoch": 0.726, + "grad_norm": 672.0, + "kl_loss_10": 124.13465118408203, + "kl_loss_2": 1389.2992919921876, + "kl_loss_3": 1037.6413665771483, + "kl_loss_7": 334.9740844726563, + "learning_rate": 0.00017739236107186857, + "loss": 725.9827, + "step": 7260 + }, + { + "ce_loss_10": 3.628635025024414, + "ce_loss_13": 3.5775145173072813, + "ce_loss_2": 4.183088374137879, + "ce_loss_3": 4.009625935554505, + "ce_loss_7": 3.7096946001052857, + "epoch": 0.727, + "grad_norm": 484.0, + "kl_loss_10": 119.83437309265136, + "kl_loss_2": 1335.5197631835938, + "kl_loss_3": 996.1497222900391, + "kl_loss_7": 322.9276321411133, + "learning_rate": 0.00017618177564234904, + "loss": 706.5093, + "step": 7270 + }, + { + "ce_loss_10": 3.606297266483307, + "ce_loss_13": 3.5551859974861144, + "ce_loss_2": 4.166692161560059, + "ce_loss_3": 3.9936567068099977, + "ce_loss_7": 3.6888177514076235, + "epoch": 0.728, + "grad_norm": 510.0, + "kl_loss_10": 120.30095024108887, + "kl_loss_2": 1330.818670654297, + "kl_loss_3": 1000.4637176513672, + "kl_loss_7": 321.7369613647461, + "learning_rate": 0.00017497445105875377, + "loss": 706.7148, + "step": 7280 + }, + { + "ce_loss_10": 3.513353967666626, + "ce_loss_13": 3.4601715326309206, + "ce_loss_2": 4.118406116962433, + "ce_loss_3": 3.934498977661133, + "ce_loss_7": 3.60476815700531, + "epoch": 0.729, + "grad_norm": 716.0, + "kl_loss_10": 122.56389846801758, + "kl_loss_2": 1424.7031311035157, + "kl_loss_3": 1066.4053436279296, + "kl_loss_7": 335.16686553955077, + "learning_rate": 0.000173770399478828, + "loss": 727.812, + "step": 7290 + }, + { + "ce_loss_10": 3.435167062282562, + "ce_loss_13": 3.3832804918289185, + "ce_loss_2": 4.021002113819122, + "ce_loss_3": 3.841395652294159, + "ce_loss_7": 3.519324839115143, + "epoch": 0.73, + "grad_norm": 636.0, + "kl_loss_10": 121.18411865234376, + "kl_loss_2": 1399.8541625976563, + "kl_loss_3": 1044.4448303222657, + "kl_loss_7": 331.79268341064454, + "learning_rate": 0.0001725696330273575, + "loss": 737.3452, + "step": 7300 + }, + { + "ce_loss_10": 3.6242716908454895, + "ce_loss_13": 3.5716004967689514, + "ce_loss_2": 4.191676688194275, + "ce_loss_3": 4.013358986377716, + "ce_loss_7": 3.712622547149658, + "epoch": 0.731, + "grad_norm": 652.0, + "kl_loss_10": 119.75424537658691, + "kl_loss_2": 1345.9469665527345, + "kl_loss_3": 1000.0272003173828, + "kl_loss_7": 325.01279907226564, + "learning_rate": 0.00017137216379604724, + "loss": 701.3725, + "step": 7310 + }, + { + "ce_loss_10": 3.506552290916443, + "ce_loss_13": 3.452127659320831, + "ce_loss_2": 4.097395420074463, + "ce_loss_3": 3.9160946011543274, + "ce_loss_7": 3.5951048493385316, + "epoch": 0.732, + "grad_norm": 884.0, + "kl_loss_10": 121.7713752746582, + "kl_loss_2": 1385.7196105957032, + "kl_loss_3": 1034.3794219970703, + "kl_loss_7": 329.5286666870117, + "learning_rate": 0.00017017800384339925, + "loss": 717.8389, + "step": 7320 + }, + { + "ce_loss_10": 3.4542035102844237, + "ce_loss_13": 3.398160481452942, + "ce_loss_2": 4.066942834854126, + "ce_loss_3": 3.8794593691825865, + "ce_loss_7": 3.5470592975616455, + "epoch": 0.733, + "grad_norm": 548.0, + "kl_loss_10": 122.54219818115234, + "kl_loss_2": 1439.6918151855468, + "kl_loss_3": 1065.2620758056642, + "kl_loss_7": 336.47518768310545, + "learning_rate": 0.00016898716519459073, + "loss": 717.2488, + "step": 7330 + }, + { + "ce_loss_10": 3.581343674659729, + "ce_loss_13": 3.526875352859497, + "ce_loss_2": 4.190316534042358, + "ce_loss_3": 4.004525983333588, + "ce_loss_7": 3.6749117851257322, + "epoch": 0.734, + "grad_norm": 624.0, + "kl_loss_10": 125.29887886047364, + "kl_loss_2": 1416.729052734375, + "kl_loss_3": 1057.2286193847656, + "kl_loss_7": 342.11007537841795, + "learning_rate": 0.00016779965984135375, + "loss": 727.1093, + "step": 7340 + }, + { + "ce_loss_10": 3.48441618680954, + "ce_loss_13": 3.431960880756378, + "ce_loss_2": 4.071764206886291, + "ce_loss_3": 3.887656939029694, + "ce_loss_7": 3.571474778652191, + "epoch": 0.735, + "grad_norm": 652.0, + "kl_loss_10": 118.69619903564453, + "kl_loss_2": 1381.495343017578, + "kl_loss_3": 1023.2910827636719, + "kl_loss_7": 324.31610412597655, + "learning_rate": 0.00016661549974185424, + "loss": 716.1001, + "step": 7350 + }, + { + "ce_loss_10": 3.5254514336586, + "ce_loss_13": 3.4723365902900696, + "ce_loss_2": 4.108010959625244, + "ce_loss_3": 3.933693265914917, + "ce_loss_7": 3.613771104812622, + "epoch": 0.736, + "grad_norm": 510.0, + "kl_loss_10": 123.67498970031738, + "kl_loss_2": 1388.5188293457031, + "kl_loss_3": 1037.3818817138672, + "kl_loss_7": 333.49166717529295, + "learning_rate": 0.00016543469682057105, + "loss": 711.0732, + "step": 7360 + }, + { + "ce_loss_10": 3.5577150702476503, + "ce_loss_13": 3.5016186833381653, + "ce_loss_2": 4.143563580513001, + "ce_loss_3": 3.9626933932304382, + "ce_loss_7": 3.6453551292419433, + "epoch": 0.737, + "grad_norm": 486.0, + "kl_loss_10": 124.5495719909668, + "kl_loss_2": 1398.1186157226562, + "kl_loss_3": 1044.5123626708985, + "kl_loss_7": 336.8351364135742, + "learning_rate": 0.00016425726296817632, + "loss": 723.8395, + "step": 7370 + }, + { + "ce_loss_10": 3.570967364311218, + "ce_loss_13": 3.519097864627838, + "ce_loss_2": 4.144868564605713, + "ce_loss_3": 3.9664488077163695, + "ce_loss_7": 3.6589640259742735, + "epoch": 0.738, + "grad_norm": 564.0, + "kl_loss_10": 120.96943435668945, + "kl_loss_2": 1359.615216064453, + "kl_loss_3": 1016.4945587158203, + "kl_loss_7": 326.19647216796875, + "learning_rate": 0.00016308321004141607, + "loss": 710.8186, + "step": 7380 + }, + { + "ce_loss_10": 3.5232744455337524, + "ce_loss_13": 3.467752158641815, + "ce_loss_2": 4.123220896720886, + "ce_loss_3": 3.9425734519958495, + "ce_loss_7": 3.615010941028595, + "epoch": 0.739, + "grad_norm": 528.0, + "kl_loss_10": 124.12866439819337, + "kl_loss_2": 1396.836865234375, + "kl_loss_3": 1039.4474639892578, + "kl_loss_7": 335.5626510620117, + "learning_rate": 0.00016191254986299043, + "loss": 719.3172, + "step": 7390 + }, + { + "ce_loss_10": 3.5758928298950194, + "ce_loss_13": 3.523856747150421, + "ce_loss_2": 4.139862871170044, + "ce_loss_3": 3.963184416294098, + "ce_loss_7": 3.658868145942688, + "epoch": 0.74, + "grad_norm": 656.0, + "kl_loss_10": 120.1721866607666, + "kl_loss_2": 1373.3153991699219, + "kl_loss_3": 1020.2515319824219, + "kl_loss_7": 326.2480270385742, + "learning_rate": 0.00016074529422143398, + "loss": 722.962, + "step": 7400 + }, + { + "ce_loss_10": 3.512345218658447, + "ce_loss_13": 3.460979771614075, + "ce_loss_2": 4.098102223873139, + "ce_loss_3": 3.9215418100357056, + "ce_loss_7": 3.603636908531189, + "epoch": 0.741, + "grad_norm": 736.0, + "kl_loss_10": 122.76387939453124, + "kl_loss_2": 1396.6786743164062, + "kl_loss_3": 1042.0954986572265, + "kl_loss_7": 331.67714538574216, + "learning_rate": 0.0001595814548709983, + "loss": 725.9221, + "step": 7410 + }, + { + "ce_loss_10": 3.5902742028236387, + "ce_loss_13": 3.537025344371796, + "ce_loss_2": 4.177208817005157, + "ce_loss_3": 3.997780406475067, + "ce_loss_7": 3.6821145176887513, + "epoch": 0.742, + "grad_norm": 564.0, + "kl_loss_10": 124.67025566101074, + "kl_loss_2": 1403.5627502441407, + "kl_loss_3": 1044.860205078125, + "kl_loss_7": 338.93174896240237, + "learning_rate": 0.00015842104353153285, + "loss": 727.9301, + "step": 7420 + }, + { + "ce_loss_10": 3.604291892051697, + "ce_loss_13": 3.548382747173309, + "ce_loss_2": 4.191494596004486, + "ce_loss_3": 4.010533785820007, + "ce_loss_7": 3.6923877358436585, + "epoch": 0.743, + "grad_norm": 472.0, + "kl_loss_10": 124.3033836364746, + "kl_loss_2": 1400.2394775390626, + "kl_loss_3": 1049.6304718017577, + "kl_loss_7": 337.5371841430664, + "learning_rate": 0.0001572640718883667, + "loss": 738.9812, + "step": 7430 + }, + { + "ce_loss_10": 3.538733124732971, + "ce_loss_13": 3.4872957110404967, + "ce_loss_2": 4.113576173782349, + "ce_loss_3": 3.934070587158203, + "ce_loss_7": 3.625633692741394, + "epoch": 0.744, + "grad_norm": 572.0, + "kl_loss_10": 119.72441368103027, + "kl_loss_2": 1360.4320068359375, + "kl_loss_3": 1016.749853515625, + "kl_loss_7": 324.85569152832034, + "learning_rate": 0.0001561105515921915, + "loss": 723.982, + "step": 7440 + }, + { + "ce_loss_10": 3.3811042308807373, + "ce_loss_13": 3.330593299865723, + "ce_loss_2": 3.999517023563385, + "ce_loss_3": 3.807965099811554, + "ce_loss_7": 3.472578394412994, + "epoch": 0.745, + "grad_norm": 536.0, + "kl_loss_10": 118.95663032531738, + "kl_loss_2": 1440.7939697265624, + "kl_loss_3": 1063.4994506835938, + "kl_loss_7": 330.3991363525391, + "learning_rate": 0.0001549604942589441, + "loss": 720.67, + "step": 7450 + }, + { + "ce_loss_10": 3.574368488788605, + "ce_loss_13": 3.523599100112915, + "ce_loss_2": 4.1251343250274655, + "ce_loss_3": 3.9507637143135073, + "ce_loss_7": 3.656884014606476, + "epoch": 0.746, + "grad_norm": 544.0, + "kl_loss_10": 118.31760025024414, + "kl_loss_2": 1313.1225219726562, + "kl_loss_3": 979.9062561035156, + "kl_loss_7": 317.6555862426758, + "learning_rate": 0.00015381391146968864, + "loss": 701.5394, + "step": 7460 + }, + { + "ce_loss_10": 3.5501048445701597, + "ce_loss_13": 3.498226988315582, + "ce_loss_2": 4.138570165634155, + "ce_loss_3": 3.95827853679657, + "ce_loss_7": 3.636888933181763, + "epoch": 0.747, + "grad_norm": 496.0, + "kl_loss_10": 119.63836708068848, + "kl_loss_2": 1383.4101440429688, + "kl_loss_3": 1029.2711975097657, + "kl_loss_7": 327.9986038208008, + "learning_rate": 0.00015267081477050133, + "loss": 722.7866, + "step": 7470 + }, + { + "ce_loss_10": 3.651386225223541, + "ce_loss_13": 3.595274817943573, + "ce_loss_2": 4.220568442344666, + "ce_loss_3": 4.041546940803528, + "ce_loss_7": 3.73706601858139, + "epoch": 0.748, + "grad_norm": 640.0, + "kl_loss_10": 124.5992389678955, + "kl_loss_2": 1365.5502685546876, + "kl_loss_3": 1015.7865600585938, + "kl_loss_7": 333.7725234985352, + "learning_rate": 0.00015153121567235335, + "loss": 706.2273, + "step": 7480 + }, + { + "ce_loss_10": 3.543039095401764, + "ce_loss_13": 3.4888150691986084, + "ce_loss_2": 4.1341440916061405, + "ce_loss_3": 3.9522986888885496, + "ce_loss_7": 3.628537285327911, + "epoch": 0.749, + "grad_norm": 506.0, + "kl_loss_10": 122.94704780578613, + "kl_loss_2": 1414.9927001953124, + "kl_loss_3": 1049.655908203125, + "kl_loss_7": 334.1303375244141, + "learning_rate": 0.00015039512565099468, + "loss": 708.8927, + "step": 7490 + }, + { + "ce_loss_10": 3.6088897585868835, + "ce_loss_13": 3.5548375248908997, + "ce_loss_2": 4.185053658485413, + "ce_loss_3": 4.00367146730423, + "ce_loss_7": 3.693446183204651, + "epoch": 0.75, + "grad_norm": 592.0, + "kl_loss_10": 123.0703842163086, + "kl_loss_2": 1378.3978271484375, + "kl_loss_3": 1023.3384765625, + "kl_loss_7": 330.60157012939453, + "learning_rate": 0.00014926255614683932, + "loss": 733.1395, + "step": 7500 + }, + { + "ce_loss_10": 3.5458021998405456, + "ce_loss_13": 3.4934080958366396, + "ce_loss_2": 4.122628927230835, + "ce_loss_3": 3.943239653110504, + "ce_loss_7": 3.6338084936141968, + "epoch": 0.751, + "grad_norm": 576.0, + "kl_loss_10": 121.31806373596191, + "kl_loss_2": 1371.3551818847657, + "kl_loss_3": 1020.9572631835938, + "kl_loss_7": 328.8893646240234, + "learning_rate": 0.0001481335185648498, + "loss": 724.9404, + "step": 7510 + }, + { + "ce_loss_10": 3.557645547389984, + "ce_loss_13": 3.5049761295318604, + "ce_loss_2": 4.132593739032745, + "ce_loss_3": 3.9585251092910765, + "ce_loss_7": 3.6444791197776794, + "epoch": 0.752, + "grad_norm": 716.0, + "kl_loss_10": 120.7141185760498, + "kl_loss_2": 1376.1685791015625, + "kl_loss_3": 1027.3751647949218, + "kl_loss_7": 331.6845397949219, + "learning_rate": 0.0001470080242744218, + "loss": 711.1242, + "step": 7520 + }, + { + "ce_loss_10": 3.5526819705963133, + "ce_loss_13": 3.5017164587974547, + "ce_loss_2": 4.13426855802536, + "ce_loss_3": 3.950691211223602, + "ce_loss_7": 3.637460446357727, + "epoch": 0.753, + "grad_norm": 752.0, + "kl_loss_10": 118.99393234252929, + "kl_loss_2": 1376.8853271484375, + "kl_loss_3": 1021.0036926269531, + "kl_loss_7": 324.0994644165039, + "learning_rate": 0.0001458860846092705, + "loss": 721.2631, + "step": 7530 + }, + { + "ce_loss_10": 3.587147796154022, + "ce_loss_13": 3.534837579727173, + "ce_loss_2": 4.153036820888519, + "ce_loss_3": 3.982726287841797, + "ce_loss_7": 3.6733571171760557, + "epoch": 0.754, + "grad_norm": 568.0, + "kl_loss_10": 120.59756584167481, + "kl_loss_2": 1358.3469970703125, + "kl_loss_3": 1018.7931213378906, + "kl_loss_7": 325.6051193237305, + "learning_rate": 0.00014476771086731566, + "loss": 702.7048, + "step": 7540 + }, + { + "ce_loss_10": 3.6986522316932677, + "ce_loss_13": 3.640228807926178, + "ce_loss_2": 4.267730498313904, + "ce_loss_3": 4.08877055644989, + "ce_loss_7": 3.7881681442260744, + "epoch": 0.755, + "grad_norm": 716.0, + "kl_loss_10": 125.66169509887695, + "kl_loss_2": 1361.5933959960937, + "kl_loss_3": 1006.495458984375, + "kl_loss_7": 334.8022232055664, + "learning_rate": 0.00014365291431056872, + "loss": 725.5867, + "step": 7550 + }, + { + "ce_loss_10": 3.526888978481293, + "ce_loss_13": 3.4713703632354735, + "ce_loss_2": 4.122870457172394, + "ce_loss_3": 3.9425498366355898, + "ce_loss_7": 3.622254657745361, + "epoch": 0.756, + "grad_norm": 804.0, + "kl_loss_10": 124.94528083801269, + "kl_loss_2": 1422.8938781738282, + "kl_loss_3": 1061.9186767578126, + "kl_loss_7": 342.73841705322263, + "learning_rate": 0.00014254170616501827, + "loss": 726.9622, + "step": 7560 + }, + { + "ce_loss_10": 3.4563034534454347, + "ce_loss_13": 3.399739706516266, + "ce_loss_2": 4.073044645786285, + "ce_loss_3": 3.8895306468009947, + "ce_loss_7": 3.552928638458252, + "epoch": 0.757, + "grad_norm": 756.0, + "kl_loss_10": 123.76986045837403, + "kl_loss_2": 1446.2292724609374, + "kl_loss_3": 1085.1556243896484, + "kl_loss_7": 342.7120895385742, + "learning_rate": 0.0001414340976205183, + "loss": 745.9122, + "step": 7570 + }, + { + "ce_loss_10": 3.4699150562286376, + "ce_loss_13": 3.4161347150802612, + "ce_loss_2": 4.070275318622589, + "ce_loss_3": 3.888191211223602, + "ce_loss_7": 3.563317656517029, + "epoch": 0.758, + "grad_norm": 502.0, + "kl_loss_10": 121.47369956970215, + "kl_loss_2": 1404.6524047851562, + "kl_loss_3": 1044.3208923339844, + "kl_loss_7": 333.030908203125, + "learning_rate": 0.00014033009983067452, + "loss": 723.4891, + "step": 7580 + }, + { + "ce_loss_10": 3.635603678226471, + "ce_loss_13": 3.5829265475273133, + "ce_loss_2": 4.192306113243103, + "ce_loss_3": 4.022082531452179, + "ce_loss_7": 3.7206236600875853, + "epoch": 0.759, + "grad_norm": 464.0, + "kl_loss_10": 119.94292831420898, + "kl_loss_2": 1340.960137939453, + "kl_loss_3": 1001.6939056396484, + "kl_loss_7": 322.9109375, + "learning_rate": 0.00013922972391273224, + "loss": 710.9632, + "step": 7590 + }, + { + "ce_loss_10": 3.635016143321991, + "ce_loss_13": 3.582967531681061, + "ce_loss_2": 4.220010781288147, + "ce_loss_3": 4.042008626461029, + "ce_loss_7": 3.722532641887665, + "epoch": 0.76, + "grad_norm": 476.0, + "kl_loss_10": 121.22842979431152, + "kl_loss_2": 1370.4319946289063, + "kl_loss_3": 1019.5878387451172, + "kl_loss_7": 326.00676879882815, + "learning_rate": 0.0001381329809474649, + "loss": 718.1754, + "step": 7600 + }, + { + "ce_loss_10": 3.5380415320396423, + "ce_loss_13": 3.4831011414527895, + "ce_loss_2": 4.144773721694946, + "ce_loss_3": 3.96180077791214, + "ce_loss_7": 3.6306992173194885, + "epoch": 0.761, + "grad_norm": 744.0, + "kl_loss_10": 122.81468658447265, + "kl_loss_2": 1420.2211853027343, + "kl_loss_3": 1060.393051147461, + "kl_loss_7": 335.4973602294922, + "learning_rate": 0.0001370398819790621, + "loss": 732.0773, + "step": 7610 + }, + { + "ce_loss_10": 3.678859758377075, + "ce_loss_13": 3.6265331745147704, + "ce_loss_2": 4.236479330062866, + "ce_loss_3": 4.066269385814667, + "ce_loss_7": 3.7672220945358275, + "epoch": 0.762, + "grad_norm": 4000.0, + "kl_loss_10": 122.85980682373047, + "kl_loss_2": 1334.2934326171876, + "kl_loss_3": 996.7666259765625, + "kl_loss_7": 327.2108581542969, + "learning_rate": 0.00013595043801501794, + "loss": 697.9223, + "step": 7620 + }, + { + "ce_loss_10": 3.472164535522461, + "ce_loss_13": 3.4196288228034972, + "ce_loss_2": 4.094062793254852, + "ce_loss_3": 3.90572075843811, + "ce_loss_7": 3.565885674953461, + "epoch": 0.763, + "grad_norm": 940.0, + "kl_loss_10": 120.75509605407714, + "kl_loss_2": 1461.8281555175781, + "kl_loss_3": 1080.7115509033204, + "kl_loss_7": 336.56712799072267, + "learning_rate": 0.00013486466002602133, + "loss": 732.8728, + "step": 7630 + }, + { + "ce_loss_10": 3.594163513183594, + "ce_loss_13": 3.538478171825409, + "ce_loss_2": 4.157584226131439, + "ce_loss_3": 3.9850178241729735, + "ce_loss_7": 3.678713285923004, + "epoch": 0.764, + "grad_norm": 520.0, + "kl_loss_10": 122.31045188903809, + "kl_loss_2": 1349.2873657226562, + "kl_loss_3": 1010.3630493164062, + "kl_loss_7": 327.035498046875, + "learning_rate": 0.00013378255894584462, + "loss": 726.2445, + "step": 7640 + }, + { + "ce_loss_10": 3.524076557159424, + "ce_loss_13": 3.467902934551239, + "ce_loss_2": 4.118615138530731, + "ce_loss_3": 3.9384886622428894, + "ce_loss_7": 3.6175344228744506, + "epoch": 0.765, + "grad_norm": 572.0, + "kl_loss_10": 123.03233604431152, + "kl_loss_2": 1402.026629638672, + "kl_loss_3": 1045.6179962158203, + "kl_loss_7": 336.69920959472654, + "learning_rate": 0.0001327041456712334, + "loss": 726.6062, + "step": 7650 + }, + { + "ce_loss_10": 3.564585876464844, + "ce_loss_13": 3.5087494015693665, + "ce_loss_2": 4.150593364238739, + "ce_loss_3": 3.973892557621002, + "ce_loss_7": 3.6529523134231567, + "epoch": 0.766, + "grad_norm": 572.0, + "kl_loss_10": 123.00491752624512, + "kl_loss_2": 1400.3231262207032, + "kl_loss_3": 1047.99375, + "kl_loss_7": 335.2826919555664, + "learning_rate": 0.00013162943106179747, + "loss": 728.8339, + "step": 7660 + }, + { + "ce_loss_10": 3.545767056941986, + "ce_loss_13": 3.491313898563385, + "ce_loss_2": 4.129991114139557, + "ce_loss_3": 3.9483101606369018, + "ce_loss_7": 3.6297065734863283, + "epoch": 0.767, + "grad_norm": 680.0, + "kl_loss_10": 123.4360237121582, + "kl_loss_2": 1383.0119750976562, + "kl_loss_3": 1028.4972290039063, + "kl_loss_7": 329.80051574707034, + "learning_rate": 0.00013055842593990132, + "loss": 717.3344, + "step": 7670 + }, + { + "ce_loss_10": 3.484251117706299, + "ce_loss_13": 3.4325303077697753, + "ce_loss_2": 4.067324638366699, + "ce_loss_3": 3.8889448761940004, + "ce_loss_7": 3.5747691988945007, + "epoch": 0.768, + "grad_norm": 520.0, + "kl_loss_10": 119.67072257995605, + "kl_loss_2": 1369.2867492675782, + "kl_loss_3": 1022.6093109130859, + "kl_loss_7": 327.1118728637695, + "learning_rate": 0.00012949114109055414, + "loss": 725.7104, + "step": 7680 + }, + { + "ce_loss_10": 3.531809902191162, + "ce_loss_13": 3.47885684967041, + "ce_loss_2": 4.124879586696625, + "ce_loss_3": 3.9439189553260805, + "ce_loss_7": 3.622943699359894, + "epoch": 0.769, + "grad_norm": 624.0, + "kl_loss_10": 122.69542655944824, + "kl_loss_2": 1398.2911865234375, + "kl_loss_3": 1038.9411651611329, + "kl_loss_7": 336.0517639160156, + "learning_rate": 0.00012842758726130281, + "loss": 730.0852, + "step": 7690 + }, + { + "ce_loss_10": 3.56878023147583, + "ce_loss_13": 3.5146379590034487, + "ce_loss_2": 4.161990666389466, + "ce_loss_3": 3.9848286867141725, + "ce_loss_7": 3.663095223903656, + "epoch": 0.77, + "grad_norm": 644.0, + "kl_loss_10": 123.06643791198731, + "kl_loss_2": 1403.577325439453, + "kl_loss_3": 1048.3497802734375, + "kl_loss_7": 337.41247100830077, + "learning_rate": 0.00012736777516212267, + "loss": 719.0299, + "step": 7700 + }, + { + "ce_loss_10": 3.564795804023743, + "ce_loss_13": 3.5100102066993712, + "ce_loss_2": 4.15794951915741, + "ce_loss_3": 3.9753509163856506, + "ce_loss_7": 3.656312108039856, + "epoch": 0.771, + "grad_norm": 540.0, + "kl_loss_10": 123.37502822875976, + "kl_loss_2": 1401.978204345703, + "kl_loss_3": 1048.225601196289, + "kl_loss_7": 339.4851440429687, + "learning_rate": 0.00012631171546530968, + "loss": 716.8339, + "step": 7710 + }, + { + "ce_loss_10": 3.5768836736679077, + "ce_loss_13": 3.521056818962097, + "ce_loss_2": 4.168219780921936, + "ce_loss_3": 3.9888468265533445, + "ce_loss_7": 3.6682852506637573, + "epoch": 0.772, + "grad_norm": 556.0, + "kl_loss_10": 124.41245651245117, + "kl_loss_2": 1394.1679321289062, + "kl_loss_3": 1046.6662353515626, + "kl_loss_7": 335.3405014038086, + "learning_rate": 0.00012525941880537307, + "loss": 729.6016, + "step": 7720 + }, + { + "ce_loss_10": 3.615005624294281, + "ce_loss_13": 3.5605417847633363, + "ce_loss_2": 4.187514328956604, + "ce_loss_3": 4.013498651981354, + "ce_loss_7": 3.7059258341789247, + "epoch": 0.773, + "grad_norm": 430.0, + "kl_loss_10": 121.39755172729492, + "kl_loss_2": 1357.1861572265625, + "kl_loss_3": 1015.121484375, + "kl_loss_7": 327.748600769043, + "learning_rate": 0.00012421089577892869, + "loss": 711.3485, + "step": 7730 + }, + { + "ce_loss_10": 3.56311240196228, + "ce_loss_13": 3.5084868669509888, + "ce_loss_2": 4.148178255558014, + "ce_loss_3": 3.9718670725822447, + "ce_loss_7": 3.6510336875915526, + "epoch": 0.774, + "grad_norm": 700.0, + "kl_loss_10": 123.63254661560059, + "kl_loss_2": 1398.957989501953, + "kl_loss_3": 1048.010546875, + "kl_loss_7": 337.9348663330078, + "learning_rate": 0.0001231661569445919, + "loss": 727.9016, + "step": 7740 + }, + { + "ce_loss_10": 3.4179163813591003, + "ce_loss_13": 3.3657287478446962, + "ce_loss_2": 4.01623170375824, + "ce_loss_3": 3.8275793313980104, + "ce_loss_7": 3.5108714938163756, + "epoch": 0.775, + "grad_norm": 512.0, + "kl_loss_10": 120.1868335723877, + "kl_loss_2": 1396.8933349609374, + "kl_loss_3": 1037.240249633789, + "kl_loss_7": 331.2000259399414, + "learning_rate": 0.00012212521282287093, + "loss": 730.0172, + "step": 7750 + }, + { + "ce_loss_10": 3.578013074398041, + "ce_loss_13": 3.521529698371887, + "ce_loss_2": 4.159704029560089, + "ce_loss_3": 3.979478430747986, + "ce_loss_7": 3.667074370384216, + "epoch": 0.776, + "grad_norm": 636.0, + "kl_loss_10": 125.35750999450684, + "kl_loss_2": 1375.0942077636719, + "kl_loss_3": 1026.904037475586, + "kl_loss_7": 334.176286315918, + "learning_rate": 0.00012108807389606158, + "loss": 729.955, + "step": 7760 + }, + { + "ce_loss_10": 3.5703899383544924, + "ce_loss_13": 3.5183567881584166, + "ce_loss_2": 4.152936434745788, + "ce_loss_3": 3.975999045372009, + "ce_loss_7": 3.6579775333404543, + "epoch": 0.777, + "grad_norm": 564.0, + "kl_loss_10": 119.99107208251954, + "kl_loss_2": 1371.4642456054687, + "kl_loss_3": 1021.7969329833984, + "kl_loss_7": 325.3572662353516, + "learning_rate": 0.00012005475060814159, + "loss": 713.174, + "step": 7770 + }, + { + "ce_loss_10": 3.5056790947914123, + "ce_loss_13": 3.452779543399811, + "ce_loss_2": 4.10383517742157, + "ce_loss_3": 3.917849028110504, + "ce_loss_7": 3.595883679389954, + "epoch": 0.778, + "grad_norm": 880.0, + "kl_loss_10": 123.71958351135254, + "kl_loss_2": 1422.6599975585937, + "kl_loss_3": 1058.5932189941407, + "kl_loss_7": 337.1080978393555, + "learning_rate": 0.00011902525336466464, + "loss": 729.9841, + "step": 7780 + }, + { + "ce_loss_10": 3.498458230495453, + "ce_loss_13": 3.4422589898109437, + "ce_loss_2": 4.102750754356384, + "ce_loss_3": 3.920209753513336, + "ce_loss_7": 3.590150833129883, + "epoch": 0.779, + "grad_norm": 616.0, + "kl_loss_10": 125.57383270263672, + "kl_loss_2": 1428.4908935546875, + "kl_loss_3": 1067.4621215820312, + "kl_loss_7": 342.1434066772461, + "learning_rate": 0.00011799959253265668, + "loss": 725.2893, + "step": 7790 + }, + { + "ce_loss_10": 3.5606491565704346, + "ce_loss_13": 3.504223346710205, + "ce_loss_2": 4.145915222167969, + "ce_loss_3": 3.9612861037254334, + "ce_loss_7": 3.6475781321525576, + "epoch": 0.78, + "grad_norm": 552.0, + "kl_loss_10": 124.7652572631836, + "kl_loss_2": 1399.7738220214844, + "kl_loss_3": 1040.1979217529297, + "kl_loss_7": 335.10644683837893, + "learning_rate": 0.00011697777844051105, + "loss": 725.9237, + "step": 7800 + }, + { + "ce_loss_10": 3.5388702630996702, + "ce_loss_13": 3.4825316429138184, + "ce_loss_2": 4.142555356025696, + "ce_loss_3": 3.955347108840942, + "ce_loss_7": 3.6329334139823914, + "epoch": 0.781, + "grad_norm": 664.0, + "kl_loss_10": 123.95801277160645, + "kl_loss_2": 1431.2548095703125, + "kl_loss_3": 1059.7420715332032, + "kl_loss_7": 337.7406295776367, + "learning_rate": 0.00011595982137788402, + "loss": 730.6986, + "step": 7810 + }, + { + "ce_loss_10": 3.5142834782600403, + "ce_loss_13": 3.462408125400543, + "ce_loss_2": 4.0907470941543576, + "ce_loss_3": 3.9156432271003725, + "ce_loss_7": 3.6036725878715514, + "epoch": 0.782, + "grad_norm": 572.0, + "kl_loss_10": 120.28751068115234, + "kl_loss_2": 1360.7383850097656, + "kl_loss_3": 1015.1698822021484, + "kl_loss_7": 327.2494552612305, + "learning_rate": 0.00011494573159559212, + "loss": 716.929, + "step": 7820 + }, + { + "ce_loss_10": 3.4999680519104004, + "ce_loss_13": 3.444619429111481, + "ce_loss_2": 4.088768780231476, + "ce_loss_3": 3.917073929309845, + "ce_loss_7": 3.5901222705841063, + "epoch": 0.783, + "grad_norm": 520.0, + "kl_loss_10": 121.69903526306152, + "kl_loss_2": 1381.9976928710937, + "kl_loss_3": 1044.1288269042968, + "kl_loss_7": 330.6407958984375, + "learning_rate": 0.00011393551930550828, + "loss": 732.0409, + "step": 7830 + }, + { + "ce_loss_10": 3.6485252261161802, + "ce_loss_13": 3.591708707809448, + "ce_loss_2": 4.217781341075897, + "ce_loss_3": 4.040521049499512, + "ce_loss_7": 3.7363662481307984, + "epoch": 0.784, + "grad_norm": 588.0, + "kl_loss_10": 124.61389122009277, + "kl_loss_2": 1368.7540710449218, + "kl_loss_3": 1018.7755676269531, + "kl_loss_7": 332.2383743286133, + "learning_rate": 0.00011292919468045875, + "loss": 715.6162, + "step": 7840 + }, + { + "ce_loss_10": 3.594165802001953, + "ce_loss_13": 3.5395636916160584, + "ce_loss_2": 4.171285450458527, + "ce_loss_3": 3.9935011982917787, + "ce_loss_7": 3.679953920841217, + "epoch": 0.785, + "grad_norm": 492.0, + "kl_loss_10": 122.30264816284179, + "kl_loss_2": 1382.2249145507812, + "kl_loss_3": 1033.9700012207031, + "kl_loss_7": 332.99535675048827, + "learning_rate": 0.00011192676785412154, + "loss": 713.0011, + "step": 7850 + }, + { + "ce_loss_10": 3.5345438003540037, + "ce_loss_13": 3.4770457625389097, + "ce_loss_2": 4.13993616104126, + "ce_loss_3": 3.951429307460785, + "ce_loss_7": 3.627079486846924, + "epoch": 0.786, + "grad_norm": 588.0, + "kl_loss_10": 123.3083724975586, + "kl_loss_2": 1404.34658203125, + "kl_loss_3": 1041.594400024414, + "kl_loss_7": 334.9306671142578, + "learning_rate": 0.00011092824892092374, + "loss": 727.4104, + "step": 7860 + }, + { + "ce_loss_10": 3.459770083427429, + "ce_loss_13": 3.408163917064667, + "ce_loss_2": 4.069538617134095, + "ce_loss_3": 3.8821940660476684, + "ce_loss_7": 3.5510602235794066, + "epoch": 0.787, + "grad_norm": 568.0, + "kl_loss_10": 120.58967247009278, + "kl_loss_2": 1421.5660217285156, + "kl_loss_3": 1058.3918731689453, + "kl_loss_7": 332.33583374023436, + "learning_rate": 0.0001099336479359398, + "loss": 722.3462, + "step": 7870 + }, + { + "ce_loss_10": 3.5867392897605894, + "ce_loss_13": 3.5377102971076964, + "ce_loss_2": 4.162244534492492, + "ce_loss_3": 3.9819667458534242, + "ce_loss_7": 3.6743969202041624, + "epoch": 0.788, + "grad_norm": 556.0, + "kl_loss_10": 120.32511672973632, + "kl_loss_2": 1368.990167236328, + "kl_loss_3": 1015.7321105957031, + "kl_loss_7": 326.9243728637695, + "learning_rate": 0.00010894297491479043, + "loss": 718.1454, + "step": 7880 + }, + { + "ce_loss_10": 3.587001371383667, + "ce_loss_13": 3.53232136964798, + "ce_loss_2": 4.164430761337281, + "ce_loss_3": 3.9850462794303896, + "ce_loss_7": 3.6721792697906492, + "epoch": 0.789, + "grad_norm": 484.0, + "kl_loss_10": 122.12056427001953, + "kl_loss_2": 1371.6335876464843, + "kl_loss_3": 1019.5728363037109, + "kl_loss_7": 329.8185546875, + "learning_rate": 0.00010795623983354214, + "loss": 710.1288, + "step": 7890 + }, + { + "ce_loss_10": 3.467512822151184, + "ce_loss_13": 3.415497064590454, + "ce_loss_2": 4.068468177318573, + "ce_loss_3": 3.88867791891098, + "ce_loss_7": 3.561111843585968, + "epoch": 0.79, + "grad_norm": 684.0, + "kl_loss_10": 123.31999626159669, + "kl_loss_2": 1425.7031616210938, + "kl_loss_3": 1062.748342895508, + "kl_loss_7": 341.4196151733398, + "learning_rate": 0.00010697345262860636, + "loss": 728.0182, + "step": 7900 + }, + { + "ce_loss_10": 3.6126829385757446, + "ce_loss_13": 3.5593294978141783, + "ce_loss_2": 4.182345855236053, + "ce_loss_3": 4.000946879386902, + "ce_loss_7": 3.7014246940612794, + "epoch": 0.791, + "grad_norm": 704.0, + "kl_loss_10": 123.03226280212402, + "kl_loss_2": 1358.3401000976562, + "kl_loss_3": 1010.9559631347656, + "kl_loss_7": 329.75187530517576, + "learning_rate": 0.00010599462319663906, + "loss": 708.3709, + "step": 7910 + }, + { + "ce_loss_10": 3.5866196155548096, + "ce_loss_13": 3.532234954833984, + "ce_loss_2": 4.136071729660034, + "ce_loss_3": 3.971488630771637, + "ce_loss_7": 3.672000765800476, + "epoch": 0.792, + "grad_norm": 472.0, + "kl_loss_10": 120.42074203491211, + "kl_loss_2": 1328.9012939453125, + "kl_loss_3": 996.4815002441406, + "kl_loss_7": 325.10135955810546, + "learning_rate": 0.00010501976139444191, + "loss": 702.0556, + "step": 7920 + }, + { + "ce_loss_10": 3.6193451285362244, + "ce_loss_13": 3.564842689037323, + "ce_loss_2": 4.190324020385742, + "ce_loss_3": 4.011660039424896, + "ce_loss_7": 3.700591838359833, + "epoch": 0.793, + "grad_norm": 640.0, + "kl_loss_10": 122.40432777404786, + "kl_loss_2": 1363.9416870117188, + "kl_loss_3": 1013.281216430664, + "kl_loss_7": 326.6585952758789, + "learning_rate": 0.0001040488770388625, + "loss": 717.8754, + "step": 7930 + }, + { + "ce_loss_10": 3.5575379133224487, + "ce_loss_13": 3.5070634841918946, + "ce_loss_2": 4.132549190521241, + "ce_loss_3": 3.9550219655036924, + "ce_loss_7": 3.642544448375702, + "epoch": 0.794, + "grad_norm": 640.0, + "kl_loss_10": 122.45821609497071, + "kl_loss_2": 1379.8712585449218, + "kl_loss_3": 1030.4774230957032, + "kl_loss_7": 332.6387283325195, + "learning_rate": 0.00010308197990669538, + "loss": 717.4487, + "step": 7940 + }, + { + "ce_loss_10": 3.67416570186615, + "ce_loss_13": 3.6173601388931274, + "ce_loss_2": 4.2498343110084535, + "ce_loss_3": 4.069960105419159, + "ce_loss_7": 3.7626309990882874, + "epoch": 0.795, + "grad_norm": 640.0, + "kl_loss_10": 126.44907455444336, + "kl_loss_2": 1373.5286987304687, + "kl_loss_3": 1024.227099609375, + "kl_loss_7": 334.73041381835935, + "learning_rate": 0.0001021190797345839, + "loss": 713.8917, + "step": 7950 + }, + { + "ce_loss_10": 3.3944215536117555, + "ce_loss_13": 3.3384058237075807, + "ce_loss_2": 4.020748329162598, + "ce_loss_3": 3.8264388918876646, + "ce_loss_7": 3.4902202129364013, + "epoch": 0.796, + "grad_norm": 628.0, + "kl_loss_10": 124.78897056579589, + "kl_loss_2": 1471.5206115722656, + "kl_loss_3": 1092.6566253662108, + "kl_loss_7": 346.82877960205076, + "learning_rate": 0.00010116018621892236, + "loss": 734.37, + "step": 7960 + }, + { + "ce_loss_10": 3.6051715135574343, + "ce_loss_13": 3.548674774169922, + "ce_loss_2": 4.196666061878204, + "ce_loss_3": 4.012264215946198, + "ce_loss_7": 3.6960227131843566, + "epoch": 0.797, + "grad_norm": 660.0, + "kl_loss_10": 127.20088500976563, + "kl_loss_2": 1406.4417114257812, + "kl_loss_3": 1049.11025390625, + "kl_loss_7": 343.34503784179685, + "learning_rate": 0.00010020530901575753, + "loss": 714.2687, + "step": 7970 + }, + { + "ce_loss_10": 3.640721297264099, + "ce_loss_13": 3.5854681611061094, + "ce_loss_2": 4.2142220616340635, + "ce_loss_3": 4.037451386451721, + "ce_loss_7": 3.727248787879944, + "epoch": 0.798, + "grad_norm": 536.0, + "kl_loss_10": 124.72403526306152, + "kl_loss_2": 1381.4070373535155, + "kl_loss_3": 1030.5914398193358, + "kl_loss_7": 332.8453369140625, + "learning_rate": 9.925445774069231e-05, + "loss": 705.9745, + "step": 7980 + }, + { + "ce_loss_10": 3.591674566268921, + "ce_loss_13": 3.5351518869400023, + "ce_loss_2": 4.173741042613983, + "ce_loss_3": 3.9968824982643127, + "ce_loss_7": 3.682232987880707, + "epoch": 0.799, + "grad_norm": 640.0, + "kl_loss_10": 123.57306632995605, + "kl_loss_2": 1377.6592651367187, + "kl_loss_3": 1026.3024841308593, + "kl_loss_7": 331.9126678466797, + "learning_rate": 9.830764196878872e-05, + "loss": 704.6161, + "step": 7990 + }, + { + "ce_loss_10": 3.523613679409027, + "ce_loss_13": 3.470061206817627, + "ce_loss_2": 4.108954405784607, + "ce_loss_3": 3.929824376106262, + "ce_loss_7": 3.61134192943573, + "epoch": 0.8, + "grad_norm": 470.0, + "kl_loss_10": 120.64988784790039, + "kl_loss_2": 1404.6622375488282, + "kl_loss_3": 1043.9930480957032, + "kl_loss_7": 329.1615982055664, + "learning_rate": 9.736487123447069e-05, + "loss": 721.8352, + "step": 8000 + }, + { + "ce_loss_10": 3.4729248166084288, + "ce_loss_13": 3.41879620552063, + "ce_loss_2": 4.0886385679245, + "ce_loss_3": 3.8962788820266723, + "ce_loss_7": 3.5611618995666503, + "epoch": 0.801, + "grad_norm": 560.0, + "kl_loss_10": 123.18882446289062, + "kl_loss_2": 1455.8741088867187, + "kl_loss_3": 1067.2413635253906, + "kl_loss_7": 335.87705688476564, + "learning_rate": 9.642615503142926e-05, + "loss": 732.712, + "step": 8010 + }, + { + "ce_loss_10": 3.5466567516326903, + "ce_loss_13": 3.4922700762748717, + "ce_loss_2": 4.133945047855377, + "ce_loss_3": 3.9467738389968874, + "ce_loss_7": 3.6311212301254274, + "epoch": 0.802, + "grad_norm": 544.0, + "kl_loss_10": 121.13310089111329, + "kl_loss_2": 1388.1513732910157, + "kl_loss_3": 1021.6431732177734, + "kl_loss_7": 326.09789276123047, + "learning_rate": 9.549150281252633e-05, + "loss": 714.0566, + "step": 8020 + }, + { + "ce_loss_10": 3.574864935874939, + "ce_loss_13": 3.5205947041511534, + "ce_loss_2": 4.153928911685943, + "ce_loss_3": 3.979640507698059, + "ce_loss_7": 3.66382520198822, + "epoch": 0.803, + "grad_norm": 438.0, + "kl_loss_10": 122.92463874816895, + "kl_loss_2": 1376.0149536132812, + "kl_loss_3": 1028.9556762695313, + "kl_loss_7": 330.7016662597656, + "learning_rate": 9.4560923989699e-05, + "loss": 724.1703, + "step": 8030 + }, + { + "ce_loss_10": 3.566141927242279, + "ce_loss_13": 3.5096715688705444, + "ce_loss_2": 4.149633848667145, + "ce_loss_3": 3.9718169212341308, + "ce_loss_7": 3.6562745332717896, + "epoch": 0.804, + "grad_norm": 548.0, + "kl_loss_10": 123.20092582702637, + "kl_loss_2": 1395.099835205078, + "kl_loss_3": 1034.7683349609374, + "kl_loss_7": 333.7354141235352, + "learning_rate": 9.363442793386607e-05, + "loss": 730.3557, + "step": 8040 + }, + { + "ce_loss_10": 3.5398176670074464, + "ce_loss_13": 3.483390522003174, + "ce_loss_2": 4.144168162345887, + "ce_loss_3": 3.963836133480072, + "ce_loss_7": 3.629956769943237, + "epoch": 0.805, + "grad_norm": 708.0, + "kl_loss_10": 123.99459381103516, + "kl_loss_2": 1423.4671752929687, + "kl_loss_3": 1066.5139556884765, + "kl_loss_7": 341.58594970703126, + "learning_rate": 9.271202397483213e-05, + "loss": 716.4859, + "step": 8050 + }, + { + "ce_loss_10": 3.559576690196991, + "ce_loss_13": 3.5078131318092347, + "ce_loss_2": 4.1300270557403564, + "ce_loss_3": 3.948986303806305, + "ce_loss_7": 3.6449739813804625, + "epoch": 0.806, + "grad_norm": 616.0, + "kl_loss_10": 120.9858283996582, + "kl_loss_2": 1365.0379943847656, + "kl_loss_3": 1009.7584777832031, + "kl_loss_7": 326.2353118896484, + "learning_rate": 9.179372140119524e-05, + "loss": 720.3497, + "step": 8060 + }, + { + "ce_loss_10": 3.507103145122528, + "ce_loss_13": 3.4538211464881896, + "ce_loss_2": 4.078395879268646, + "ce_loss_3": 3.9048713564872743, + "ce_loss_7": 3.595319855213165, + "epoch": 0.807, + "grad_norm": 564.0, + "kl_loss_10": 120.51027412414551, + "kl_loss_2": 1373.4655517578126, + "kl_loss_3": 1023.649853515625, + "kl_loss_7": 328.5488845825195, + "learning_rate": 9.087952946025175e-05, + "loss": 723.5377, + "step": 8070 + }, + { + "ce_loss_10": 3.620259165763855, + "ce_loss_13": 3.566833519935608, + "ce_loss_2": 4.171628093719482, + "ce_loss_3": 3.99437997341156, + "ce_loss_7": 3.7069010019302366, + "epoch": 0.808, + "grad_norm": 564.0, + "kl_loss_10": 121.02688407897949, + "kl_loss_2": 1331.2975463867188, + "kl_loss_3": 991.3489471435547, + "kl_loss_7": 323.82676849365237, + "learning_rate": 8.996945735790446e-05, + "loss": 710.1383, + "step": 8080 + }, + { + "ce_loss_10": 3.5165679335594175, + "ce_loss_13": 3.464854347705841, + "ce_loss_2": 4.098834156990051, + "ce_loss_3": 3.920664381980896, + "ce_loss_7": 3.6038545966148376, + "epoch": 0.809, + "grad_norm": 692.0, + "kl_loss_10": 120.93192329406739, + "kl_loss_2": 1395.3546203613282, + "kl_loss_3": 1041.6934967041016, + "kl_loss_7": 328.9375244140625, + "learning_rate": 8.906351425856951e-05, + "loss": 724.6123, + "step": 8090 + }, + { + "ce_loss_10": 3.500584590435028, + "ce_loss_13": 3.446643114089966, + "ce_loss_2": 4.105693817138672, + "ce_loss_3": 3.91541211605072, + "ce_loss_7": 3.5908884406089783, + "epoch": 0.81, + "grad_norm": 528.0, + "kl_loss_10": 122.57808074951171, + "kl_loss_2": 1425.2539184570312, + "kl_loss_3": 1056.913995361328, + "kl_loss_7": 333.52864227294924, + "learning_rate": 8.816170928508365e-05, + "loss": 733.849, + "step": 8100 + }, + { + "ce_loss_10": 3.4640366792678834, + "ce_loss_13": 3.408512365818024, + "ce_loss_2": 4.075878214836121, + "ce_loss_3": 3.885686254501343, + "ce_loss_7": 3.555466377735138, + "epoch": 0.811, + "grad_norm": 488.0, + "kl_loss_10": 122.87005310058593, + "kl_loss_2": 1440.5128112792968, + "kl_loss_3": 1072.4588012695312, + "kl_loss_7": 337.0792922973633, + "learning_rate": 8.7264051518613e-05, + "loss": 731.1898, + "step": 8110 + }, + { + "ce_loss_10": 3.555766189098358, + "ce_loss_13": 3.5061283111572266, + "ce_loss_2": 4.124446415901184, + "ce_loss_3": 3.9479679465293884, + "ce_loss_7": 3.643243062496185, + "epoch": 0.812, + "grad_norm": 636.0, + "kl_loss_10": 119.26363830566406, + "kl_loss_2": 1364.3051879882812, + "kl_loss_3": 1016.0982299804688, + "kl_loss_7": 325.98375396728517, + "learning_rate": 8.637054999856148e-05, + "loss": 716.5947, + "step": 8120 + }, + { + "ce_loss_10": 3.5404892563819885, + "ce_loss_13": 3.485300052165985, + "ce_loss_2": 4.1336122989654545, + "ce_loss_3": 3.949459767341614, + "ce_loss_7": 3.632100021839142, + "epoch": 0.813, + "grad_norm": 732.0, + "kl_loss_10": 122.73268966674804, + "kl_loss_2": 1402.0976989746093, + "kl_loss_3": 1043.5717376708985, + "kl_loss_7": 334.2214889526367, + "learning_rate": 8.548121372247918e-05, + "loss": 731.7564, + "step": 8130 + }, + { + "ce_loss_10": 3.6156569719314575, + "ce_loss_13": 3.5624505519866942, + "ce_loss_2": 4.1817370533943174, + "ce_loss_3": 4.006722009181976, + "ce_loss_7": 3.7026910543441773, + "epoch": 0.814, + "grad_norm": 604.0, + "kl_loss_10": 122.37059669494629, + "kl_loss_2": 1370.1346496582032, + "kl_loss_3": 1023.8347961425782, + "kl_loss_7": 328.72179260253904, + "learning_rate": 8.459605164597267e-05, + "loss": 713.0094, + "step": 8140 + }, + { + "ce_loss_10": 3.4985602378845213, + "ce_loss_13": 3.445442032814026, + "ce_loss_2": 4.088584578037262, + "ce_loss_3": 3.9099313855171203, + "ce_loss_7": 3.5871086597442625, + "epoch": 0.815, + "grad_norm": 540.0, + "kl_loss_10": 120.2810188293457, + "kl_loss_2": 1388.320379638672, + "kl_loss_3": 1032.2357604980468, + "kl_loss_7": 326.0157501220703, + "learning_rate": 8.371507268261436e-05, + "loss": 722.8974, + "step": 8150 + }, + { + "ce_loss_10": 3.575037980079651, + "ce_loss_13": 3.5237354397773744, + "ce_loss_2": 4.15966123342514, + "ce_loss_3": 3.9799288749694823, + "ce_loss_7": 3.662469041347504, + "epoch": 0.816, + "grad_norm": 468.0, + "kl_loss_10": 122.56170921325683, + "kl_loss_2": 1387.7402465820312, + "kl_loss_3": 1035.590023803711, + "kl_loss_7": 331.77565307617186, + "learning_rate": 8.283828570385238e-05, + "loss": 702.9698, + "step": 8160 + }, + { + "ce_loss_10": 3.575086772441864, + "ce_loss_13": 3.521743583679199, + "ce_loss_2": 4.162600886821747, + "ce_loss_3": 3.980288898944855, + "ce_loss_7": 3.6681403875350953, + "epoch": 0.817, + "grad_norm": 536.0, + "kl_loss_10": 121.89449920654297, + "kl_loss_2": 1363.099432373047, + "kl_loss_3": 1020.9603607177735, + "kl_loss_7": 330.69257202148435, + "learning_rate": 8.196569953892202e-05, + "loss": 714.9846, + "step": 8170 + }, + { + "ce_loss_10": 3.48666273355484, + "ce_loss_13": 3.4311928510665894, + "ce_loss_2": 4.078561079502106, + "ce_loss_3": 3.8965752482414246, + "ce_loss_7": 3.574566900730133, + "epoch": 0.818, + "grad_norm": 564.0, + "kl_loss_10": 121.38032493591308, + "kl_loss_2": 1383.4040405273438, + "kl_loss_3": 1031.3620330810547, + "kl_loss_7": 329.9718292236328, + "learning_rate": 8.109732297475635e-05, + "loss": 716.3832, + "step": 8180 + }, + { + "ce_loss_10": 3.4624683260917664, + "ce_loss_13": 3.4071059107780455, + "ce_loss_2": 4.093664598464966, + "ce_loss_3": 3.904293656349182, + "ce_loss_7": 3.5597102522850035, + "epoch": 0.819, + "grad_norm": 612.0, + "kl_loss_10": 124.19370193481446, + "kl_loss_2": 1449.606427001953, + "kl_loss_3": 1083.9475494384765, + "kl_loss_7": 344.92893371582034, + "learning_rate": 8.023316475589754e-05, + "loss": 736.8709, + "step": 8190 + }, + { + "ce_loss_10": 3.426418936252594, + "ce_loss_13": 3.369425415992737, + "ce_loss_2": 4.0705150127410885, + "ce_loss_3": 3.8759565591812133, + "ce_loss_7": 3.523312973976135, + "epoch": 0.82, + "grad_norm": 960.0, + "kl_loss_10": 127.30840263366699, + "kl_loss_2": 1489.4338317871093, + "kl_loss_3": 1101.4859008789062, + "kl_loss_7": 350.6812271118164, + "learning_rate": 7.937323358440934e-05, + "loss": 747.0156, + "step": 8200 + }, + { + "ce_loss_10": 3.5525304675102234, + "ce_loss_13": 3.501710522174835, + "ce_loss_2": 4.118105101585388, + "ce_loss_3": 3.940403604507446, + "ce_loss_7": 3.634892928600311, + "epoch": 0.821, + "grad_norm": 592.0, + "kl_loss_10": 120.62562103271485, + "kl_loss_2": 1362.5559814453125, + "kl_loss_3": 1013.3167175292969, + "kl_loss_7": 324.7458526611328, + "learning_rate": 7.851753811978923e-05, + "loss": 715.6536, + "step": 8210 + }, + { + "ce_loss_10": 3.572048854827881, + "ce_loss_13": 3.51797137260437, + "ce_loss_2": 4.168267369270325, + "ce_loss_3": 3.9798033118247984, + "ce_loss_7": 3.6613059639930725, + "epoch": 0.822, + "grad_norm": 584.0, + "kl_loss_10": 123.35920181274415, + "kl_loss_2": 1404.1062316894531, + "kl_loss_3": 1032.806460571289, + "kl_loss_7": 332.36991729736326, + "learning_rate": 7.766608697888095e-05, + "loss": 716.8437, + "step": 8220 + }, + { + "ce_loss_10": 3.5873825192451476, + "ce_loss_13": 3.531443381309509, + "ce_loss_2": 4.165439331531525, + "ce_loss_3": 3.9851741552352906, + "ce_loss_7": 3.677255082130432, + "epoch": 0.823, + "grad_norm": 596.0, + "kl_loss_10": 125.00053062438965, + "kl_loss_2": 1401.6642272949218, + "kl_loss_3": 1042.9925048828125, + "kl_loss_7": 337.75463714599607, + "learning_rate": 7.681888873578785e-05, + "loss": 727.6588, + "step": 8230 + }, + { + "ce_loss_10": 3.521757662296295, + "ce_loss_13": 3.4622273206710816, + "ce_loss_2": 4.119533801078797, + "ce_loss_3": 3.934826338291168, + "ce_loss_7": 3.613308382034302, + "epoch": 0.824, + "grad_norm": 528.0, + "kl_loss_10": 124.3594253540039, + "kl_loss_2": 1415.3692321777344, + "kl_loss_3": 1054.1624694824218, + "kl_loss_7": 337.27447814941405, + "learning_rate": 7.597595192178702e-05, + "loss": 720.58, + "step": 8240 + }, + { + "ce_loss_10": 3.510056436061859, + "ce_loss_13": 3.4549400925636293, + "ce_loss_2": 4.117804288864136, + "ce_loss_3": 3.931263828277588, + "ce_loss_7": 3.603116035461426, + "epoch": 0.825, + "grad_norm": 732.0, + "kl_loss_10": 125.15355644226074, + "kl_loss_2": 1448.2377319335938, + "kl_loss_3": 1071.90830078125, + "kl_loss_7": 342.34349822998047, + "learning_rate": 7.513728502524286e-05, + "loss": 737.9312, + "step": 8250 + }, + { + "ce_loss_10": 3.5135778069496153, + "ce_loss_13": 3.4605449557304384, + "ce_loss_2": 4.092071676254273, + "ce_loss_3": 3.911950242519379, + "ce_loss_7": 3.6011463046073913, + "epoch": 0.826, + "grad_norm": 724.0, + "kl_loss_10": 119.03209075927734, + "kl_loss_2": 1362.1486572265626, + "kl_loss_3": 1014.1013732910156, + "kl_loss_7": 323.3859298706055, + "learning_rate": 7.430289649152156e-05, + "loss": 719.2754, + "step": 8260 + }, + { + "ce_loss_10": 3.4184691548347472, + "ce_loss_13": 3.366116726398468, + "ce_loss_2": 4.024687492847443, + "ce_loss_3": 3.840518391132355, + "ce_loss_7": 3.5109126925468446, + "epoch": 0.827, + "grad_norm": 620.0, + "kl_loss_10": 121.1826301574707, + "kl_loss_2": 1442.715380859375, + "kl_loss_3": 1078.2603820800782, + "kl_loss_7": 337.94980773925784, + "learning_rate": 7.347279472290646e-05, + "loss": 725.4499, + "step": 8270 + }, + { + "ce_loss_10": 3.553591012954712, + "ce_loss_13": 3.5011276602745056, + "ce_loss_2": 4.143419885635376, + "ce_loss_3": 3.963691782951355, + "ce_loss_7": 3.6430840849876405, + "epoch": 0.828, + "grad_norm": 632.0, + "kl_loss_10": 122.11264228820801, + "kl_loss_2": 1390.1591186523438, + "kl_loss_3": 1031.9824768066405, + "kl_loss_7": 331.70887298583983, + "learning_rate": 7.264698807851328e-05, + "loss": 723.2428, + "step": 8280 + }, + { + "ce_loss_10": 3.5191181659698487, + "ce_loss_13": 3.469066548347473, + "ce_loss_2": 4.081725561618805, + "ce_loss_3": 3.910724198818207, + "ce_loss_7": 3.6018188834190368, + "epoch": 0.829, + "grad_norm": 588.0, + "kl_loss_10": 118.70943183898926, + "kl_loss_2": 1345.8255798339844, + "kl_loss_3": 1008.5260772705078, + "kl_loss_7": 322.22408599853514, + "learning_rate": 7.182548487420554e-05, + "loss": 711.3388, + "step": 8290 + }, + { + "ce_loss_10": 3.5789464712142944, + "ce_loss_13": 3.5243069410324095, + "ce_loss_2": 4.157199239730835, + "ce_loss_3": 3.980182957649231, + "ce_loss_7": 3.6695273160934447, + "epoch": 0.83, + "grad_norm": 532.0, + "kl_loss_10": 123.22674674987793, + "kl_loss_2": 1386.9601989746093, + "kl_loss_3": 1033.9500579833984, + "kl_loss_7": 333.202214050293, + "learning_rate": 7.100829338251146e-05, + "loss": 715.895, + "step": 8300 + }, + { + "ce_loss_10": 3.509700119495392, + "ce_loss_13": 3.454431247711182, + "ce_loss_2": 4.1180780053138735, + "ce_loss_3": 3.9303313374519346, + "ce_loss_7": 3.601513075828552, + "epoch": 0.831, + "grad_norm": 620.0, + "kl_loss_10": 123.90524368286133, + "kl_loss_2": 1420.9776611328125, + "kl_loss_3": 1058.4737640380858, + "kl_loss_7": 338.4593246459961, + "learning_rate": 7.019542183254046e-05, + "loss": 720.623, + "step": 8310 + }, + { + "ce_loss_10": 3.5525806307792664, + "ce_loss_13": 3.4959965467453005, + "ce_loss_2": 4.128799915313721, + "ce_loss_3": 3.9509485840797423, + "ce_loss_7": 3.6412771344184875, + "epoch": 0.832, + "grad_norm": 716.0, + "kl_loss_10": 127.12583503723144, + "kl_loss_2": 1391.4539794921875, + "kl_loss_3": 1033.12021484375, + "kl_loss_7": 336.7935516357422, + "learning_rate": 6.938687840989971e-05, + "loss": 719.1849, + "step": 8320 + }, + { + "ce_loss_10": 3.4853139400482176, + "ce_loss_13": 3.429826629161835, + "ce_loss_2": 4.072561550140381, + "ce_loss_3": 3.893218147754669, + "ce_loss_7": 3.576352059841156, + "epoch": 0.833, + "grad_norm": 652.0, + "kl_loss_10": 123.63401412963867, + "kl_loss_2": 1387.767022705078, + "kl_loss_3": 1034.514959716797, + "kl_loss_7": 333.90601654052733, + "learning_rate": 6.858267125661271e-05, + "loss": 724.1394, + "step": 8330 + }, + { + "ce_loss_10": 3.5483754873275757, + "ce_loss_13": 3.4943613171577455, + "ce_loss_2": 4.1425862312316895, + "ce_loss_3": 3.9642663478851317, + "ce_loss_7": 3.637398338317871, + "epoch": 0.834, + "grad_norm": 688.0, + "kl_loss_10": 120.76418533325196, + "kl_loss_2": 1390.6704711914062, + "kl_loss_3": 1037.854913330078, + "kl_loss_7": 329.1157623291016, + "learning_rate": 6.778280847103668e-05, + "loss": 734.3215, + "step": 8340 + }, + { + "ce_loss_10": 3.5596023082733153, + "ce_loss_13": 3.505242204666138, + "ce_loss_2": 4.147267746925354, + "ce_loss_3": 3.966709625720978, + "ce_loss_7": 3.6490352749824524, + "epoch": 0.835, + "grad_norm": 616.0, + "kl_loss_10": 124.8666488647461, + "kl_loss_2": 1410.322442626953, + "kl_loss_3": 1046.9622619628906, + "kl_loss_7": 337.84252166748047, + "learning_rate": 6.698729810778065e-05, + "loss": 721.4561, + "step": 8350 + }, + { + "ce_loss_10": 3.4651718974113463, + "ce_loss_13": 3.4114996194839478, + "ce_loss_2": 4.066473186016083, + "ce_loss_3": 3.881610298156738, + "ce_loss_7": 3.5564165115356445, + "epoch": 0.836, + "grad_norm": 744.0, + "kl_loss_10": 119.62793006896973, + "kl_loss_2": 1401.6785705566406, + "kl_loss_3": 1042.894009399414, + "kl_loss_7": 327.8875930786133, + "learning_rate": 6.619614817762538e-05, + "loss": 723.3062, + "step": 8360 + }, + { + "ce_loss_10": 3.4274404406547547, + "ce_loss_13": 3.374155807495117, + "ce_loss_2": 4.061884236335755, + "ce_loss_3": 3.8700376033782957, + "ce_loss_7": 3.5248605847358703, + "epoch": 0.837, + "grad_norm": 476.0, + "kl_loss_10": 121.19938850402832, + "kl_loss_2": 1469.314794921875, + "kl_loss_3": 1094.0616088867187, + "kl_loss_7": 343.3897903442383, + "learning_rate": 6.540936664744196e-05, + "loss": 736.0315, + "step": 8370 + }, + { + "ce_loss_10": 3.5783516645431517, + "ce_loss_13": 3.5220268607139587, + "ce_loss_2": 4.174437272548675, + "ce_loss_3": 3.990994656085968, + "ce_loss_7": 3.667269802093506, + "epoch": 0.838, + "grad_norm": 414.0, + "kl_loss_10": 122.77630615234375, + "kl_loss_2": 1392.2082885742188, + "kl_loss_3": 1041.2133331298828, + "kl_loss_7": 332.893977355957, + "learning_rate": 6.462696144011149e-05, + "loss": 716.4901, + "step": 8380 + }, + { + "ce_loss_10": 3.5278226017951964, + "ce_loss_13": 3.4739001631736754, + "ce_loss_2": 4.114171576499939, + "ce_loss_3": 3.937883758544922, + "ce_loss_7": 3.62037034034729, + "epoch": 0.839, + "grad_norm": 506.0, + "kl_loss_10": 125.26272201538086, + "kl_loss_2": 1392.125311279297, + "kl_loss_3": 1040.5689788818358, + "kl_loss_7": 336.6851501464844, + "learning_rate": 6.384894043444567e-05, + "loss": 715.348, + "step": 8390 + }, + { + "ce_loss_10": 3.5627179741859436, + "ce_loss_13": 3.510275673866272, + "ce_loss_2": 4.151006007194519, + "ce_loss_3": 3.9754858732223513, + "ce_loss_7": 3.650054228305817, + "epoch": 0.84, + "grad_norm": 840.0, + "kl_loss_10": 122.64037246704102, + "kl_loss_2": 1398.1174865722655, + "kl_loss_3": 1038.6087371826172, + "kl_loss_7": 331.8931655883789, + "learning_rate": 6.307531146510753e-05, + "loss": 716.8575, + "step": 8400 + }, + { + "ce_loss_10": 3.5370197057724, + "ce_loss_13": 3.481518292427063, + "ce_loss_2": 4.113574099540711, + "ce_loss_3": 3.934196209907532, + "ce_loss_7": 3.6256550788879394, + "epoch": 0.841, + "grad_norm": 564.0, + "kl_loss_10": 122.6060775756836, + "kl_loss_2": 1369.6577880859375, + "kl_loss_3": 1022.7808959960937, + "kl_loss_7": 330.5647766113281, + "learning_rate": 6.230608232253226e-05, + "loss": 710.1536, + "step": 8410 + }, + { + "ce_loss_10": 3.489914321899414, + "ce_loss_13": 3.4370184540748596, + "ce_loss_2": 4.1010636448860165, + "ce_loss_3": 3.9170850038528444, + "ce_loss_7": 3.5802398562431335, + "epoch": 0.842, + "grad_norm": 544.0, + "kl_loss_10": 121.60591506958008, + "kl_loss_2": 1432.9647521972656, + "kl_loss_3": 1068.4068176269532, + "kl_loss_7": 334.95867614746095, + "learning_rate": 6.154126075284855e-05, + "loss": 721.6619, + "step": 8420 + }, + { + "ce_loss_10": 3.583195209503174, + "ce_loss_13": 3.530484902858734, + "ce_loss_2": 4.156937325000763, + "ce_loss_3": 3.9834988713264465, + "ce_loss_7": 3.6713316440582275, + "epoch": 0.843, + "grad_norm": 548.0, + "kl_loss_10": 119.82128868103027, + "kl_loss_2": 1350.0745971679687, + "kl_loss_3": 1013.26455078125, + "kl_loss_7": 326.27203674316405, + "learning_rate": 6.078085445780129e-05, + "loss": 702.2121, + "step": 8430 + }, + { + "ce_loss_10": 3.5919320344924928, + "ce_loss_13": 3.5380735635757445, + "ce_loss_2": 4.185958683490753, + "ce_loss_3": 4.000372779369354, + "ce_loss_7": 3.680304741859436, + "epoch": 0.844, + "grad_norm": 624.0, + "kl_loss_10": 122.76680603027344, + "kl_loss_2": 1400.8627502441407, + "kl_loss_3": 1034.3849639892578, + "kl_loss_7": 332.1929397583008, + "learning_rate": 6.002487109467347e-05, + "loss": 712.4612, + "step": 8440 + }, + { + "ce_loss_10": 3.598021149635315, + "ce_loss_13": 3.544245195388794, + "ce_loss_2": 4.1716133713722225, + "ce_loss_3": 3.996164381504059, + "ce_loss_7": 3.684845209121704, + "epoch": 0.845, + "grad_norm": 624.0, + "kl_loss_10": 122.97368698120117, + "kl_loss_2": 1382.0722961425781, + "kl_loss_3": 1033.296566772461, + "kl_loss_7": 335.2268692016602, + "learning_rate": 5.927331827620902e-05, + "loss": 714.5446, + "step": 8450 + }, + { + "ce_loss_10": 3.583691966533661, + "ce_loss_13": 3.532571244239807, + "ce_loss_2": 4.150978291034699, + "ce_loss_3": 3.980244314670563, + "ce_loss_7": 3.668419122695923, + "epoch": 0.846, + "grad_norm": 442.0, + "kl_loss_10": 119.68389167785645, + "kl_loss_2": 1349.830810546875, + "kl_loss_3": 1009.5769073486329, + "kl_loss_7": 326.7773956298828, + "learning_rate": 5.852620357053651e-05, + "loss": 711.2381, + "step": 8460 + }, + { + "ce_loss_10": 3.621056246757507, + "ce_loss_13": 3.566351020336151, + "ce_loss_2": 4.186827552318573, + "ce_loss_3": 4.011941504478455, + "ce_loss_7": 3.707981014251709, + "epoch": 0.847, + "grad_norm": 604.0, + "kl_loss_10": 120.51869049072266, + "kl_loss_2": 1355.9027709960938, + "kl_loss_3": 1009.8785064697265, + "kl_loss_7": 326.8977752685547, + "learning_rate": 5.778353450109286e-05, + "loss": 709.6583, + "step": 8470 + }, + { + "ce_loss_10": 3.6638654470443726, + "ce_loss_13": 3.6074997663497923, + "ce_loss_2": 4.246403753757477, + "ce_loss_3": 4.066155457496643, + "ce_loss_7": 3.7529626727104186, + "epoch": 0.848, + "grad_norm": 496.0, + "kl_loss_10": 124.99574241638183, + "kl_loss_2": 1393.828564453125, + "kl_loss_3": 1038.2093139648437, + "kl_loss_7": 335.6842575073242, + "learning_rate": 5.7045318546547206e-05, + "loss": 718.2406, + "step": 8480 + }, + { + "ce_loss_10": 3.5572060704231263, + "ce_loss_13": 3.502591860294342, + "ce_loss_2": 4.141818737983703, + "ce_loss_3": 3.964042770862579, + "ce_loss_7": 3.6434488534927367, + "epoch": 0.849, + "grad_norm": 720.0, + "kl_loss_10": 122.56028099060059, + "kl_loss_2": 1399.0360961914062, + "kl_loss_3": 1046.2178619384765, + "kl_loss_7": 331.661442565918, + "learning_rate": 5.631156314072605e-05, + "loss": 715.0076, + "step": 8490 + }, + { + "ce_loss_10": 3.56727614402771, + "ce_loss_13": 3.5156813502311706, + "ce_loss_2": 4.136505460739135, + "ce_loss_3": 3.9590030670166017, + "ce_loss_7": 3.655721139907837, + "epoch": 0.85, + "grad_norm": 552.0, + "kl_loss_10": 120.79662399291992, + "kl_loss_2": 1366.066455078125, + "kl_loss_3": 1009.6999877929687, + "kl_loss_7": 325.6238540649414, + "learning_rate": 5.5582275672538315e-05, + "loss": 705.6439, + "step": 8500 + }, + { + "ce_loss_10": 3.489551877975464, + "ce_loss_13": 3.4322364211082457, + "ce_loss_2": 4.108343851566315, + "ce_loss_3": 3.922130513191223, + "ce_loss_7": 3.582332742214203, + "epoch": 0.851, + "grad_norm": 572.0, + "kl_loss_10": 125.23014411926269, + "kl_loss_2": 1452.216046142578, + "kl_loss_3": 1083.0195220947267, + "kl_loss_7": 340.2689270019531, + "learning_rate": 5.4857463485900484e-05, + "loss": 735.124, + "step": 8510 + }, + { + "ce_loss_10": 3.5416255474090574, + "ce_loss_13": 3.4896072506904603, + "ce_loss_2": 4.120888018608094, + "ce_loss_3": 3.9421190857887267, + "ce_loss_7": 3.631319212913513, + "epoch": 0.852, + "grad_norm": 492.0, + "kl_loss_10": 121.33798141479492, + "kl_loss_2": 1376.6540832519531, + "kl_loss_3": 1028.2473266601562, + "kl_loss_7": 331.88528594970705, + "learning_rate": 5.413713387966329e-05, + "loss": 712.989, + "step": 8520 + }, + { + "ce_loss_10": 3.56683589220047, + "ce_loss_13": 3.511161148548126, + "ce_loss_2": 4.154978930950165, + "ce_loss_3": 3.97281414270401, + "ce_loss_7": 3.656111001968384, + "epoch": 0.853, + "grad_norm": 908.0, + "kl_loss_10": 124.57011680603027, + "kl_loss_2": 1397.046124267578, + "kl_loss_3": 1044.6578674316406, + "kl_loss_7": 333.34014892578125, + "learning_rate": 5.34212941075381e-05, + "loss": 723.4795, + "step": 8530 + }, + { + "ce_loss_10": 3.571316683292389, + "ce_loss_13": 3.518352448940277, + "ce_loss_2": 4.139795422554016, + "ce_loss_3": 3.96577330827713, + "ce_loss_7": 3.6558568358421324, + "epoch": 0.854, + "grad_norm": 704.0, + "kl_loss_10": 119.71459197998047, + "kl_loss_2": 1355.7856689453124, + "kl_loss_3": 1011.7955169677734, + "kl_loss_7": 321.64742279052734, + "learning_rate": 5.270995137802315e-05, + "loss": 706.1101, + "step": 8540 + }, + { + "ce_loss_10": 3.502516198158264, + "ce_loss_13": 3.4539687633514404, + "ce_loss_2": 4.084729707241058, + "ce_loss_3": 3.9072174072265624, + "ce_loss_7": 3.5934065103530886, + "epoch": 0.855, + "grad_norm": 532.0, + "kl_loss_10": 119.76500358581544, + "kl_loss_2": 1392.7714599609376, + "kl_loss_3": 1033.4823822021485, + "kl_loss_7": 329.61717224121094, + "learning_rate": 5.2003112854332125e-05, + "loss": 718.4223, + "step": 8550 + }, + { + "ce_loss_10": 3.5103381156921385, + "ce_loss_13": 3.457986581325531, + "ce_loss_2": 4.087642765045166, + "ce_loss_3": 3.9084141731262205, + "ce_loss_7": 3.5956949472427366, + "epoch": 0.856, + "grad_norm": 724.0, + "kl_loss_10": 120.56714744567871, + "kl_loss_2": 1380.4932006835938, + "kl_loss_3": 1032.8779724121093, + "kl_loss_7": 327.5092468261719, + "learning_rate": 5.130078565432089e-05, + "loss": 704.4872, + "step": 8560 + }, + { + "ce_loss_10": 3.5799346208572387, + "ce_loss_13": 3.5277039766311646, + "ce_loss_2": 4.142898440361023, + "ce_loss_3": 3.9674919128417967, + "ce_loss_7": 3.662895882129669, + "epoch": 0.857, + "grad_norm": 492.0, + "kl_loss_10": 120.35042152404785, + "kl_loss_2": 1359.3675537109375, + "kl_loss_3": 1015.0716613769531, + "kl_loss_7": 324.0651107788086, + "learning_rate": 5.060297685041659e-05, + "loss": 701.0533, + "step": 8570 + }, + { + "ce_loss_10": 3.5102761030197143, + "ce_loss_13": 3.4554622650146483, + "ce_loss_2": 4.102907431125641, + "ce_loss_3": 3.920087468624115, + "ce_loss_7": 3.5963584423065185, + "epoch": 0.858, + "grad_norm": 560.0, + "kl_loss_10": 123.81989250183105, + "kl_loss_2": 1410.788916015625, + "kl_loss_3": 1048.8169738769532, + "kl_loss_7": 335.03162536621096, + "learning_rate": 4.99096934695461e-05, + "loss": 732.0039, + "step": 8580 + }, + { + "ce_loss_10": 3.571087384223938, + "ce_loss_13": 3.5143581509590147, + "ce_loss_2": 4.155477786064148, + "ce_loss_3": 3.9760597348213196, + "ce_loss_7": 3.659903717041016, + "epoch": 0.859, + "grad_norm": 524.0, + "kl_loss_10": 122.09205551147461, + "kl_loss_2": 1377.2748107910156, + "kl_loss_3": 1023.4202880859375, + "kl_loss_7": 329.93336029052733, + "learning_rate": 4.922094249306558e-05, + "loss": 708.5735, + "step": 8590 + }, + { + "ce_loss_10": 3.5948333024978636, + "ce_loss_13": 3.542141377925873, + "ce_loss_2": 4.168703019618988, + "ce_loss_3": 3.9915787816047668, + "ce_loss_7": 3.6865815997123716, + "epoch": 0.86, + "grad_norm": 512.0, + "kl_loss_10": 122.46472663879395, + "kl_loss_2": 1371.870458984375, + "kl_loss_3": 1024.2403106689453, + "kl_loss_7": 333.7798217773437, + "learning_rate": 4.853673085668947e-05, + "loss": 703.5446, + "step": 8600 + }, + { + "ce_loss_10": 3.621105194091797, + "ce_loss_13": 3.5630958437919618, + "ce_loss_2": 4.201367115974426, + "ce_loss_3": 4.027955961227417, + "ce_loss_7": 3.7082759618759153, + "epoch": 0.861, + "grad_norm": 560.0, + "kl_loss_10": 123.08092155456544, + "kl_loss_2": 1377.9781494140625, + "kl_loss_3": 1025.2794342041016, + "kl_loss_7": 328.5953765869141, + "learning_rate": 4.78570654504214e-05, + "loss": 717.6915, + "step": 8610 + }, + { + "ce_loss_10": 3.5619210958480836, + "ce_loss_13": 3.507647895812988, + "ce_loss_2": 4.149072754383087, + "ce_loss_3": 3.970789170265198, + "ce_loss_7": 3.648519480228424, + "epoch": 0.862, + "grad_norm": 516.0, + "kl_loss_10": 120.92977561950684, + "kl_loss_2": 1399.8869262695312, + "kl_loss_3": 1044.365057373047, + "kl_loss_7": 330.3472198486328, + "learning_rate": 4.7181953118484556e-05, + "loss": 723.0992, + "step": 8620 + }, + { + "ce_loss_10": 3.5900183796882628, + "ce_loss_13": 3.5374162673950194, + "ce_loss_2": 4.16554582118988, + "ce_loss_3": 3.994710421562195, + "ce_loss_7": 3.678675186634064, + "epoch": 0.863, + "grad_norm": 520.0, + "kl_loss_10": 120.54650344848633, + "kl_loss_2": 1357.929443359375, + "kl_loss_3": 1018.2571746826172, + "kl_loss_7": 328.08815002441406, + "learning_rate": 4.651140065925269e-05, + "loss": 721.2698, + "step": 8630 + }, + { + "ce_loss_10": 3.5209784507751465, + "ce_loss_13": 3.4652896523475647, + "ce_loss_2": 4.103803014755249, + "ce_loss_3": 3.9223596453666687, + "ce_loss_7": 3.611543357372284, + "epoch": 0.864, + "grad_norm": 604.0, + "kl_loss_10": 123.07759132385254, + "kl_loss_2": 1391.4273742675782, + "kl_loss_3": 1032.6504333496093, + "kl_loss_7": 332.56396484375, + "learning_rate": 4.58454148251814e-05, + "loss": 725.9105, + "step": 8640 + }, + { + "ce_loss_10": 3.5388185143470765, + "ce_loss_13": 3.4819789290428163, + "ce_loss_2": 4.147819340229034, + "ce_loss_3": 3.9573601484298706, + "ce_loss_7": 3.6289569973945617, + "epoch": 0.865, + "grad_norm": 600.0, + "kl_loss_10": 122.27800941467285, + "kl_loss_2": 1421.122607421875, + "kl_loss_3": 1049.9566833496094, + "kl_loss_7": 332.15319976806643, + "learning_rate": 4.518400232274078e-05, + "loss": 721.911, + "step": 8650 + }, + { + "ce_loss_10": 3.552417826652527, + "ce_loss_13": 3.4967941880226134, + "ce_loss_2": 4.138116705417633, + "ce_loss_3": 3.959863018989563, + "ce_loss_7": 3.642176163196564, + "epoch": 0.866, + "grad_norm": 536.0, + "kl_loss_10": 123.07808303833008, + "kl_loss_2": 1379.3301574707032, + "kl_loss_3": 1030.6435760498048, + "kl_loss_7": 333.7181594848633, + "learning_rate": 4.452716981234745e-05, + "loss": 702.8377, + "step": 8660 + }, + { + "ce_loss_10": 3.5320884346961976, + "ce_loss_13": 3.480946350097656, + "ce_loss_2": 4.1082984685897825, + "ce_loss_3": 3.930745470523834, + "ce_loss_7": 3.6193148374557493, + "epoch": 0.867, + "grad_norm": 486.0, + "kl_loss_10": 119.37153434753418, + "kl_loss_2": 1378.435809326172, + "kl_loss_3": 1026.5518157958984, + "kl_loss_7": 327.11744079589846, + "learning_rate": 4.3874923908297335e-05, + "loss": 706.7176, + "step": 8670 + }, + { + "ce_loss_10": 3.5820580959320067, + "ce_loss_13": 3.5283179759979246, + "ce_loss_2": 4.165662562847137, + "ce_loss_3": 3.9848700284957888, + "ce_loss_7": 3.668103575706482, + "epoch": 0.868, + "grad_norm": 576.0, + "kl_loss_10": 123.53395080566406, + "kl_loss_2": 1397.873388671875, + "kl_loss_3": 1040.8485137939454, + "kl_loss_7": 332.79543151855466, + "learning_rate": 4.322727117869951e-05, + "loss": 718.6055, + "step": 8680 + }, + { + "ce_loss_10": 3.5884267687797546, + "ce_loss_13": 3.5351755380630494, + "ce_loss_2": 4.178534460067749, + "ce_loss_3": 3.9966900825500487, + "ce_loss_7": 3.6785280823707582, + "epoch": 0.869, + "grad_norm": 628.0, + "kl_loss_10": 123.76097946166992, + "kl_loss_2": 1403.0090087890626, + "kl_loss_3": 1049.369467163086, + "kl_loss_7": 333.3337600708008, + "learning_rate": 4.2584218145409916e-05, + "loss": 715.6396, + "step": 8690 + }, + { + "ce_loss_10": 3.6362990498542787, + "ce_loss_13": 3.5836689591407778, + "ce_loss_2": 4.204011178016662, + "ce_loss_3": 4.026763451099396, + "ce_loss_7": 3.723234498500824, + "epoch": 0.87, + "grad_norm": 640.0, + "kl_loss_10": 121.82669830322266, + "kl_loss_2": 1356.0820190429688, + "kl_loss_3": 1011.6945983886719, + "kl_loss_7": 326.52610778808594, + "learning_rate": 4.194577128396521e-05, + "loss": 701.9309, + "step": 8700 + }, + { + "ce_loss_10": 3.509493625164032, + "ce_loss_13": 3.45839284658432, + "ce_loss_2": 4.095285260677338, + "ce_loss_3": 3.9108598709106444, + "ce_loss_7": 3.5953684210777284, + "epoch": 0.871, + "grad_norm": 466.0, + "kl_loss_10": 119.7718490600586, + "kl_loss_2": 1389.6183959960938, + "kl_loss_3": 1027.7238983154298, + "kl_loss_7": 325.0868804931641, + "learning_rate": 4.1311937023518264e-05, + "loss": 718.8018, + "step": 8710 + }, + { + "ce_loss_10": 3.529244434833527, + "ce_loss_13": 3.4789538025856017, + "ce_loss_2": 4.10749055147171, + "ce_loss_3": 3.9206608176231383, + "ce_loss_7": 3.613613450527191, + "epoch": 0.872, + "grad_norm": 552.0, + "kl_loss_10": 118.38729095458984, + "kl_loss_2": 1381.1460327148438, + "kl_loss_3": 1020.1363098144532, + "kl_loss_7": 319.3013519287109, + "learning_rate": 4.0682721746773344e-05, + "loss": 710.0534, + "step": 8720 + }, + { + "ce_loss_10": 3.3995256781578065, + "ce_loss_13": 3.3480949640274047, + "ce_loss_2": 4.007587265968323, + "ce_loss_3": 3.8226790904998778, + "ce_loss_7": 3.4936659812927244, + "epoch": 0.873, + "grad_norm": 680.0, + "kl_loss_10": 119.34899826049805, + "kl_loss_2": 1401.68115234375, + "kl_loss_3": 1045.86484375, + "kl_loss_7": 331.2798797607422, + "learning_rate": 4.0058131789920904e-05, + "loss": 709.8896, + "step": 8730 + }, + { + "ce_loss_10": 3.5508158564567567, + "ce_loss_13": 3.4964456081390383, + "ce_loss_2": 4.1352542519569395, + "ce_loss_3": 3.9534544229507445, + "ce_loss_7": 3.636772119998932, + "epoch": 0.874, + "grad_norm": 584.0, + "kl_loss_10": 120.94258270263671, + "kl_loss_2": 1390.5471740722655, + "kl_loss_3": 1033.9399536132812, + "kl_loss_7": 328.365657043457, + "learning_rate": 3.9438173442575e-05, + "loss": 736.2612, + "step": 8740 + }, + { + "ce_loss_10": 3.5804495930671694, + "ce_loss_13": 3.5271941781044007, + "ce_loss_2": 4.154019808769226, + "ce_loss_3": 3.978409993648529, + "ce_loss_7": 3.6670626044273376, + "epoch": 0.875, + "grad_norm": 660.0, + "kl_loss_10": 121.4291778564453, + "kl_loss_2": 1363.337957763672, + "kl_loss_3": 1017.4170349121093, + "kl_loss_7": 330.21262817382814, + "learning_rate": 3.882285294770937e-05, + "loss": 711.9646, + "step": 8750 + }, + { + "ce_loss_10": 3.548740530014038, + "ce_loss_13": 3.4961506009101866, + "ce_loss_2": 4.1233531594276425, + "ce_loss_3": 3.9456629276275637, + "ce_loss_7": 3.635119545459747, + "epoch": 0.876, + "grad_norm": 720.0, + "kl_loss_10": 122.49603576660157, + "kl_loss_2": 1370.9503845214845, + "kl_loss_3": 1018.0048278808594, + "kl_loss_7": 328.40975494384764, + "learning_rate": 3.821217650159453e-05, + "loss": 718.8667, + "step": 8760 + }, + { + "ce_loss_10": 3.418784809112549, + "ce_loss_13": 3.365262305736542, + "ce_loss_2": 4.043610215187073, + "ce_loss_3": 3.850952887535095, + "ce_loss_7": 3.512869107723236, + "epoch": 0.877, + "grad_norm": 864.0, + "kl_loss_10": 121.31179428100586, + "kl_loss_2": 1448.362078857422, + "kl_loss_3": 1071.3075866699219, + "kl_loss_7": 338.1376647949219, + "learning_rate": 3.760615025373543e-05, + "loss": 729.3138, + "step": 8770 + }, + { + "ce_loss_10": 3.600062382221222, + "ce_loss_13": 3.5458203673362734, + "ce_loss_2": 4.1950247406959535, + "ce_loss_3": 4.014675962924957, + "ce_loss_7": 3.6932862639427184, + "epoch": 0.878, + "grad_norm": 684.0, + "kl_loss_10": 125.84054679870606, + "kl_loss_2": 1408.6423461914062, + "kl_loss_3": 1048.4514282226562, + "kl_loss_7": 337.6252685546875, + "learning_rate": 3.700478030680987e-05, + "loss": 731.0535, + "step": 8780 + }, + { + "ce_loss_10": 3.588497185707092, + "ce_loss_13": 3.5357969880104063, + "ce_loss_2": 4.16150141954422, + "ce_loss_3": 3.9861255884170532, + "ce_loss_7": 3.6764004111289976, + "epoch": 0.879, + "grad_norm": 616.0, + "kl_loss_10": 121.01314735412598, + "kl_loss_2": 1369.610662841797, + "kl_loss_3": 1017.0912567138672, + "kl_loss_7": 326.87464447021483, + "learning_rate": 3.6408072716606344e-05, + "loss": 710.2985, + "step": 8790 + }, + { + "ce_loss_10": 3.511665999889374, + "ce_loss_13": 3.455206000804901, + "ce_loss_2": 4.111484944820404, + "ce_loss_3": 3.9289029002189637, + "ce_loss_7": 3.6025957107543944, + "epoch": 0.88, + "grad_norm": 780.0, + "kl_loss_10": 123.63747024536133, + "kl_loss_2": 1420.9652282714844, + "kl_loss_3": 1057.6597961425782, + "kl_loss_7": 336.1513076782227, + "learning_rate": 3.5816033491963716e-05, + "loss": 739.0035, + "step": 8800 + }, + { + "ce_loss_10": 3.371099352836609, + "ce_loss_13": 3.316172993183136, + "ce_loss_2": 3.9832777261734007, + "ce_loss_3": 3.789150130748749, + "ce_loss_7": 3.461447310447693, + "epoch": 0.881, + "grad_norm": 512.0, + "kl_loss_10": 119.77749671936036, + "kl_loss_2": 1419.2442565917968, + "kl_loss_3": 1050.3526092529296, + "kl_loss_7": 326.61090087890625, + "learning_rate": 3.522866859471047e-05, + "loss": 723.13, + "step": 8810 + }, + { + "ce_loss_10": 3.611319732666016, + "ce_loss_13": 3.561573588848114, + "ce_loss_2": 4.164506709575653, + "ce_loss_3": 3.9919765472412108, + "ce_loss_7": 3.6927991986274717, + "epoch": 0.882, + "grad_norm": 668.0, + "kl_loss_10": 117.92159461975098, + "kl_loss_2": 1322.9934326171874, + "kl_loss_3": 989.4436096191406, + "kl_loss_7": 318.43848724365233, + "learning_rate": 3.46459839396045e-05, + "loss": 702.0629, + "step": 8820 + }, + { + "ce_loss_10": 3.538902699947357, + "ce_loss_13": 3.4826046943664553, + "ce_loss_2": 4.136587750911713, + "ce_loss_3": 3.952931213378906, + "ce_loss_7": 3.6263251304626465, + "epoch": 0.883, + "grad_norm": 692.0, + "kl_loss_10": 123.36521034240722, + "kl_loss_2": 1392.0963928222657, + "kl_loss_3": 1039.635934448242, + "kl_loss_7": 335.2001617431641, + "learning_rate": 3.406798539427386e-05, + "loss": 735.3253, + "step": 8830 + }, + { + "ce_loss_10": 3.5910847425460815, + "ce_loss_13": 3.537630581855774, + "ce_loss_2": 4.170853447914124, + "ce_loss_3": 3.988684332370758, + "ce_loss_7": 3.6767850637435915, + "epoch": 0.884, + "grad_norm": 676.0, + "kl_loss_10": 121.19235725402832, + "kl_loss_2": 1382.8566284179688, + "kl_loss_3": 1032.8232696533203, + "kl_loss_7": 330.4065704345703, + "learning_rate": 3.349467877915746e-05, + "loss": 719.4985, + "step": 8840 + }, + { + "ce_loss_10": 3.549594223499298, + "ce_loss_13": 3.4964571714401247, + "ce_loss_2": 4.147552752494812, + "ce_loss_3": 3.9666833519935607, + "ce_loss_7": 3.6410465359687807, + "epoch": 0.885, + "grad_norm": 756.0, + "kl_loss_10": 121.92766189575195, + "kl_loss_2": 1412.680615234375, + "kl_loss_3": 1051.8612976074219, + "kl_loss_7": 334.56292724609375, + "learning_rate": 3.292606986744667e-05, + "loss": 738.1569, + "step": 8850 + }, + { + "ce_loss_10": 3.5065809965133665, + "ce_loss_13": 3.4557624578475954, + "ce_loss_2": 4.09570015668869, + "ce_loss_3": 3.9213839888572695, + "ce_loss_7": 3.5942333817481993, + "epoch": 0.886, + "grad_norm": 512.0, + "kl_loss_10": 120.2051342010498, + "kl_loss_2": 1395.1206481933593, + "kl_loss_3": 1046.7202606201172, + "kl_loss_7": 328.31476440429685, + "learning_rate": 3.23621643850267e-05, + "loss": 722.0618, + "step": 8860 + }, + { + "ce_loss_10": 3.5800418615341187, + "ce_loss_13": 3.528806471824646, + "ce_loss_2": 4.1604786038398744, + "ce_loss_3": 3.9754431366920473, + "ce_loss_7": 3.6690159320831297, + "epoch": 0.887, + "grad_norm": 668.0, + "kl_loss_10": 122.70932960510254, + "kl_loss_2": 1398.141259765625, + "kl_loss_3": 1041.2169250488282, + "kl_loss_7": 333.5382675170898, + "learning_rate": 3.180296801041971e-05, + "loss": 714.5479, + "step": 8870 + }, + { + "ce_loss_10": 3.605249297618866, + "ce_loss_13": 3.552952218055725, + "ce_loss_2": 4.17888309955597, + "ce_loss_3": 4.001760041713714, + "ce_loss_7": 3.6925482630729674, + "epoch": 0.888, + "grad_norm": 640.0, + "kl_loss_10": 121.45921478271484, + "kl_loss_2": 1367.8746765136718, + "kl_loss_3": 1020.4184844970703, + "kl_loss_7": 326.21177368164064, + "learning_rate": 3.124848637472688e-05, + "loss": 704.8254, + "step": 8880 + }, + { + "ce_loss_10": 3.425955653190613, + "ce_loss_13": 3.372738444805145, + "ce_loss_2": 4.030108571052551, + "ce_loss_3": 3.8461776614189147, + "ce_loss_7": 3.5158618688583374, + "epoch": 0.889, + "grad_norm": 836.0, + "kl_loss_10": 119.03256759643554, + "kl_loss_2": 1409.894512939453, + "kl_loss_3": 1046.528024291992, + "kl_loss_7": 325.74627990722655, + "learning_rate": 3.069872506157212e-05, + "loss": 719.6042, + "step": 8890 + }, + { + "ce_loss_10": 3.5282222867012023, + "ce_loss_13": 3.475008749961853, + "ce_loss_2": 4.108449482917786, + "ce_loss_3": 3.9331199288368226, + "ce_loss_7": 3.617098903656006, + "epoch": 0.89, + "grad_norm": 628.0, + "kl_loss_10": 120.31341552734375, + "kl_loss_2": 1379.0091857910156, + "kl_loss_3": 1029.306707763672, + "kl_loss_7": 330.0529190063477, + "learning_rate": 3.0153689607045842e-05, + "loss": 710.0303, + "step": 8900 + }, + { + "ce_loss_10": 3.4255795001983644, + "ce_loss_13": 3.371835172176361, + "ce_loss_2": 4.053711616992951, + "ce_loss_3": 3.8632387042045595, + "ce_loss_7": 3.523522126674652, + "epoch": 0.891, + "grad_norm": 580.0, + "kl_loss_10": 124.12492294311524, + "kl_loss_2": 1471.6921020507812, + "kl_loss_3": 1089.004214477539, + "kl_loss_7": 339.9063247680664, + "learning_rate": 2.9613385499648926e-05, + "loss": 724.8125, + "step": 8910 + }, + { + "ce_loss_10": 3.47619286775589, + "ce_loss_13": 3.424519944190979, + "ce_loss_2": 4.070773077011109, + "ce_loss_3": 3.896732985973358, + "ce_loss_7": 3.571604323387146, + "epoch": 0.892, + "grad_norm": 532.0, + "kl_loss_10": 120.07063331604004, + "kl_loss_2": 1390.2006958007812, + "kl_loss_3": 1039.9530181884766, + "kl_loss_7": 328.3143585205078, + "learning_rate": 2.9077818180237692e-05, + "loss": 718.9485, + "step": 8920 + }, + { + "ce_loss_10": 3.5255975484848023, + "ce_loss_13": 3.4710716724395754, + "ce_loss_2": 4.129261720180511, + "ce_loss_3": 3.9440025210380556, + "ce_loss_7": 3.6209316849708557, + "epoch": 0.893, + "grad_norm": 796.0, + "kl_loss_10": 121.16016159057617, + "kl_loss_2": 1394.0591796875, + "kl_loss_3": 1037.3413208007812, + "kl_loss_7": 329.095556640625, + "learning_rate": 2.8546993041969172e-05, + "loss": 716.5755, + "step": 8930 + }, + { + "ce_loss_10": 3.5639485001564024, + "ce_loss_13": 3.5113102793693542, + "ce_loss_2": 4.132089781761169, + "ce_loss_3": 3.958901858329773, + "ce_loss_7": 3.651428020000458, + "epoch": 0.894, + "grad_norm": 584.0, + "kl_loss_10": 119.52723693847656, + "kl_loss_2": 1364.812921142578, + "kl_loss_3": 1022.4147399902344, + "kl_loss_7": 326.37430267333986, + "learning_rate": 2.802091543024671e-05, + "loss": 717.1699, + "step": 8940 + }, + { + "ce_loss_10": 3.557297170162201, + "ce_loss_13": 3.5066465854644777, + "ce_loss_2": 4.156519591808319, + "ce_loss_3": 3.973790967464447, + "ce_loss_7": 3.6468861937522887, + "epoch": 0.895, + "grad_norm": 584.0, + "kl_loss_10": 122.12691040039063, + "kl_loss_2": 1419.7261474609375, + "kl_loss_3": 1054.6295593261718, + "kl_loss_7": 334.19554290771487, + "learning_rate": 2.7499590642665774e-05, + "loss": 738.7192, + "step": 8950 + }, + { + "ce_loss_10": 3.573615825176239, + "ce_loss_13": 3.519201862812042, + "ce_loss_2": 4.15412825345993, + "ce_loss_3": 3.969862473011017, + "ce_loss_7": 3.6663325428962708, + "epoch": 0.896, + "grad_norm": 494.0, + "kl_loss_10": 123.42191467285156, + "kl_loss_2": 1381.5768737792969, + "kl_loss_3": 1020.5846405029297, + "kl_loss_7": 340.43305053710935, + "learning_rate": 2.6983023928961405e-05, + "loss": 711.4102, + "step": 8960 + }, + { + "ce_loss_10": 3.5397876501083374, + "ce_loss_13": 3.488338351249695, + "ce_loss_2": 4.125940537452697, + "ce_loss_3": 3.9469223976135255, + "ce_loss_7": 3.6297765612602233, + "epoch": 0.897, + "grad_norm": 728.0, + "kl_loss_10": 122.37606697082519, + "kl_loss_2": 1384.7419189453126, + "kl_loss_3": 1034.7682739257812, + "kl_loss_7": 330.70338439941406, + "learning_rate": 2.6471220490954628e-05, + "loss": 723.2938, + "step": 8970 + }, + { + "ce_loss_10": 3.5269408941268923, + "ce_loss_13": 3.4752776265144347, + "ce_loss_2": 4.100160777568817, + "ce_loss_3": 3.9212745785713197, + "ce_loss_7": 3.610729730129242, + "epoch": 0.898, + "grad_norm": 628.0, + "kl_loss_10": 119.57647438049317, + "kl_loss_2": 1371.9164184570313, + "kl_loss_3": 1019.5086395263672, + "kl_loss_7": 324.32104339599607, + "learning_rate": 2.596418548250029e-05, + "loss": 718.0098, + "step": 8980 + }, + { + "ce_loss_10": 3.571890485286713, + "ce_loss_13": 3.515963816642761, + "ce_loss_2": 4.1442801594734195, + "ce_loss_3": 3.9694687128067017, + "ce_loss_7": 3.659953308105469, + "epoch": 0.899, + "grad_norm": 580.0, + "kl_loss_10": 123.79312667846679, + "kl_loss_2": 1387.2481201171875, + "kl_loss_3": 1035.6228240966798, + "kl_loss_7": 332.8598129272461, + "learning_rate": 2.5461924009435368e-05, + "loss": 711.2326, + "step": 8990 + }, + { + "ce_loss_10": 3.565682017803192, + "ce_loss_13": 3.5124054074287416, + "ce_loss_2": 4.145227205753327, + "ce_loss_3": 3.971540629863739, + "ce_loss_7": 3.652699661254883, + "epoch": 0.9, + "grad_norm": 700.0, + "kl_loss_10": 122.56795883178711, + "kl_loss_2": 1366.3777648925782, + "kl_loss_3": 1030.5671783447265, + "kl_loss_7": 328.8155014038086, + "learning_rate": 2.4964441129527336e-05, + "loss": 725.1452, + "step": 9000 + }, + { + "ce_loss_10": 3.5651490688323975, + "ce_loss_13": 3.51036376953125, + "ce_loss_2": 4.134415197372436, + "ce_loss_3": 3.954487180709839, + "ce_loss_7": 3.6518504142761232, + "epoch": 0.901, + "grad_norm": 700.0, + "kl_loss_10": 120.3508358001709, + "kl_loss_2": 1357.658221435547, + "kl_loss_3": 1008.9297454833984, + "kl_loss_7": 324.7009643554687, + "learning_rate": 2.4471741852423235e-05, + "loss": 704.2774, + "step": 9010 + }, + { + "ce_loss_10": 3.6071965098381042, + "ce_loss_13": 3.5544245719909666, + "ce_loss_2": 4.1868168115615845, + "ce_loss_3": 4.0108413934707645, + "ce_loss_7": 3.695762050151825, + "epoch": 0.902, + "grad_norm": 704.0, + "kl_loss_10": 122.07797813415527, + "kl_loss_2": 1359.7636352539062, + "kl_loss_3": 1012.6970703125, + "kl_loss_7": 327.2878921508789, + "learning_rate": 2.3983831139599287e-05, + "loss": 709.9809, + "step": 9020 + }, + { + "ce_loss_10": 3.5311748027801513, + "ce_loss_13": 3.47797931432724, + "ce_loss_2": 4.098857629299164, + "ce_loss_3": 3.923769438266754, + "ce_loss_7": 3.61720005273819, + "epoch": 0.903, + "grad_norm": 1040.0, + "kl_loss_10": 119.62764778137208, + "kl_loss_2": 1359.2027770996094, + "kl_loss_3": 1016.2866149902344, + "kl_loss_7": 322.86070251464844, + "learning_rate": 2.3500713904311022e-05, + "loss": 696.3648, + "step": 9030 + }, + { + "ce_loss_10": 3.572555947303772, + "ce_loss_13": 3.522153615951538, + "ce_loss_2": 4.132451498508454, + "ce_loss_3": 3.9519538998603823, + "ce_loss_7": 3.65519403219223, + "epoch": 0.904, + "grad_norm": 540.0, + "kl_loss_10": 119.08038482666015, + "kl_loss_2": 1330.948681640625, + "kl_loss_3": 984.9072479248047, + "kl_loss_7": 319.1572540283203, + "learning_rate": 2.3022395011543685e-05, + "loss": 697.6727, + "step": 9040 + }, + { + "ce_loss_10": 3.6023966908454894, + "ce_loss_13": 3.549352025985718, + "ce_loss_2": 4.193000841140747, + "ce_loss_3": 4.014123034477234, + "ce_loss_7": 3.6947824120521546, + "epoch": 0.905, + "grad_norm": 640.0, + "kl_loss_10": 123.01276931762695, + "kl_loss_2": 1397.0471923828125, + "kl_loss_3": 1045.0749206542969, + "kl_loss_7": 336.0963409423828, + "learning_rate": 2.2548879277963063e-05, + "loss": 729.8438, + "step": 9050 + }, + { + "ce_loss_10": 3.517413592338562, + "ce_loss_13": 3.4641988396644594, + "ce_loss_2": 4.097688150405884, + "ce_loss_3": 3.9156952857971192, + "ce_loss_7": 3.603403353691101, + "epoch": 0.906, + "grad_norm": 548.0, + "kl_loss_10": 120.84680671691895, + "kl_loss_2": 1380.5550354003906, + "kl_loss_3": 1028.3634216308594, + "kl_loss_7": 326.4955429077148, + "learning_rate": 2.208017147186736e-05, + "loss": 700.2203, + "step": 9060 + }, + { + "ce_loss_10": 3.5140005707740785, + "ce_loss_13": 3.461076188087463, + "ce_loss_2": 4.10326977968216, + "ce_loss_3": 3.9209877371788027, + "ce_loss_7": 3.6043805360794066, + "epoch": 0.907, + "grad_norm": 492.0, + "kl_loss_10": 121.33981552124024, + "kl_loss_2": 1388.803759765625, + "kl_loss_3": 1033.561016845703, + "kl_loss_7": 330.547900390625, + "learning_rate": 2.1616276313139227e-05, + "loss": 708.1628, + "step": 9070 + }, + { + "ce_loss_10": 3.55505918264389, + "ce_loss_13": 3.5002601861953737, + "ce_loss_2": 4.133837890625, + "ce_loss_3": 3.9574143290519714, + "ce_loss_7": 3.645774781703949, + "epoch": 0.908, + "grad_norm": 548.0, + "kl_loss_10": 121.9651065826416, + "kl_loss_2": 1372.4083923339845, + "kl_loss_3": 1024.9973175048829, + "kl_loss_7": 329.2260681152344, + "learning_rate": 2.1157198473197415e-05, + "loss": 716.0005, + "step": 9080 + }, + { + "ce_loss_10": 3.6146026134490965, + "ce_loss_13": 3.5627476572990417, + "ce_loss_2": 4.199038410186768, + "ce_loss_3": 4.024258577823639, + "ce_loss_7": 3.7075974106788636, + "epoch": 0.909, + "grad_norm": 660.0, + "kl_loss_10": 122.82046203613281, + "kl_loss_2": 1376.7192138671876, + "kl_loss_3": 1033.8393127441407, + "kl_loss_7": 333.55931091308594, + "learning_rate": 2.0702942574950812e-05, + "loss": 717.5656, + "step": 9090 + }, + { + "ce_loss_10": 3.536907970905304, + "ce_loss_13": 3.4830648064613343, + "ce_loss_2": 4.130435681343078, + "ce_loss_3": 3.9466469168663023, + "ce_loss_7": 3.630732071399689, + "epoch": 0.91, + "grad_norm": 430.0, + "kl_loss_10": 122.83297958374024, + "kl_loss_2": 1402.052911376953, + "kl_loss_3": 1039.2423034667968, + "kl_loss_7": 334.35711669921875, + "learning_rate": 2.025351319275137e-05, + "loss": 720.0911, + "step": 9100 + }, + { + "ce_loss_10": 3.6753941059112547, + "ce_loss_13": 3.6204377889633177, + "ce_loss_2": 4.253736305236816, + "ce_loss_3": 4.077165937423706, + "ce_loss_7": 3.7643893718719483, + "epoch": 0.911, + "grad_norm": 684.0, + "kl_loss_10": 127.13848838806152, + "kl_loss_2": 1410.5498107910157, + "kl_loss_3": 1052.9429016113281, + "kl_loss_7": 338.90263214111326, + "learning_rate": 1.9808914852347816e-05, + "loss": 739.4578, + "step": 9110 + }, + { + "ce_loss_10": 3.5164944171905517, + "ce_loss_13": 3.460708129405975, + "ce_loss_2": 4.116264307498932, + "ce_loss_3": 3.9366886615753174, + "ce_loss_7": 3.608424687385559, + "epoch": 0.912, + "grad_norm": 466.0, + "kl_loss_10": 122.47514915466309, + "kl_loss_2": 1399.5153747558593, + "kl_loss_3": 1043.8174133300781, + "kl_loss_7": 330.74664306640625, + "learning_rate": 1.9369152030840554e-05, + "loss": 717.7691, + "step": 9120 + }, + { + "ce_loss_10": 3.5966818809509276, + "ce_loss_13": 3.540822982788086, + "ce_loss_2": 4.167646491527558, + "ce_loss_3": 3.9919039726257326, + "ce_loss_7": 3.6817273020744326, + "epoch": 0.913, + "grad_norm": 836.0, + "kl_loss_10": 122.55923118591309, + "kl_loss_2": 1386.7923583984375, + "kl_loss_3": 1030.3927276611328, + "kl_loss_7": 327.60221710205076, + "learning_rate": 1.893422915663645e-05, + "loss": 717.9165, + "step": 9130 + }, + { + "ce_loss_10": 3.463078701496124, + "ce_loss_13": 3.4048054456710815, + "ce_loss_2": 4.085853815078735, + "ce_loss_3": 3.8942277312278746, + "ce_loss_7": 3.555465543270111, + "epoch": 0.914, + "grad_norm": 532.0, + "kl_loss_10": 123.98791694641113, + "kl_loss_2": 1448.6031616210937, + "kl_loss_3": 1079.3504547119142, + "kl_loss_7": 336.90855102539064, + "learning_rate": 1.850415060940386e-05, + "loss": 735.5819, + "step": 9140 + }, + { + "ce_loss_10": 3.5885175228118897, + "ce_loss_13": 3.5330852389335634, + "ce_loss_2": 4.165771210193634, + "ce_loss_3": 3.986107361316681, + "ce_loss_7": 3.673333930969238, + "epoch": 0.915, + "grad_norm": 568.0, + "kl_loss_10": 122.50748710632324, + "kl_loss_2": 1369.035382080078, + "kl_loss_3": 1022.5911560058594, + "kl_loss_7": 329.97228851318357, + "learning_rate": 1.8078920720028978e-05, + "loss": 713.4122, + "step": 9150 + }, + { + "ce_loss_10": 3.513621473312378, + "ce_loss_13": 3.461913502216339, + "ce_loss_2": 4.0891550302505495, + "ce_loss_3": 3.914201045036316, + "ce_loss_7": 3.5989736318588257, + "epoch": 0.916, + "grad_norm": 608.0, + "kl_loss_10": 119.31071891784669, + "kl_loss_2": 1368.6764831542969, + "kl_loss_3": 1018.941748046875, + "kl_loss_7": 323.95599975585935, + "learning_rate": 1.765854377057219e-05, + "loss": 724.9803, + "step": 9160 + }, + { + "ce_loss_10": 3.4946448802948, + "ce_loss_13": 3.444287621974945, + "ce_loss_2": 4.072016930580139, + "ce_loss_3": 3.891819429397583, + "ce_loss_7": 3.581692707538605, + "epoch": 0.917, + "grad_norm": 568.0, + "kl_loss_10": 118.33210716247558, + "kl_loss_2": 1371.4457458496095, + "kl_loss_3": 1019.0162200927734, + "kl_loss_7": 322.0634735107422, + "learning_rate": 1.724302399422456e-05, + "loss": 713.6212, + "step": 9170 + }, + { + "ce_loss_10": 3.443930947780609, + "ce_loss_13": 3.391281247138977, + "ce_loss_2": 4.039050686359405, + "ce_loss_3": 3.8598959922790526, + "ce_loss_7": 3.5371329545974732, + "epoch": 0.918, + "grad_norm": 552.0, + "kl_loss_10": 122.66972045898437, + "kl_loss_2": 1399.7767211914063, + "kl_loss_3": 1046.084487915039, + "kl_loss_7": 337.6794631958008, + "learning_rate": 1.683236557526574e-05, + "loss": 725.2953, + "step": 9180 + }, + { + "ce_loss_10": 3.5698832750320433, + "ce_loss_13": 3.517447316646576, + "ce_loss_2": 4.126532173156738, + "ce_loss_3": 3.9537552118301393, + "ce_loss_7": 3.65188353061676, + "epoch": 0.919, + "grad_norm": 484.0, + "kl_loss_10": 118.96306114196777, + "kl_loss_2": 1337.912353515625, + "kl_loss_3": 994.4553466796875, + "kl_loss_7": 319.3814407348633, + "learning_rate": 1.6426572649021475e-05, + "loss": 706.8777, + "step": 9190 + }, + { + "ce_loss_10": 3.60268212556839, + "ce_loss_13": 3.55240513086319, + "ce_loss_2": 4.154273760318756, + "ce_loss_3": 3.984792101383209, + "ce_loss_7": 3.685652470588684, + "epoch": 0.92, + "grad_norm": 652.0, + "kl_loss_10": 122.0446949005127, + "kl_loss_2": 1341.5455627441406, + "kl_loss_3": 996.6870147705079, + "kl_loss_7": 324.77650756835936, + "learning_rate": 1.6025649301821876e-05, + "loss": 705.4512, + "step": 9200 + }, + { + "ce_loss_10": 3.5914915084838865, + "ce_loss_13": 3.5390833497047423, + "ce_loss_2": 4.147667050361633, + "ce_loss_3": 3.9771674513816833, + "ce_loss_7": 3.677826452255249, + "epoch": 0.921, + "grad_norm": 672.0, + "kl_loss_10": 122.41096229553223, + "kl_loss_2": 1365.7819885253907, + "kl_loss_3": 1026.9352416992188, + "kl_loss_7": 331.3459762573242, + "learning_rate": 1.5629599570960716e-05, + "loss": 710.4124, + "step": 9210 + }, + { + "ce_loss_10": 3.4989588618278504, + "ce_loss_13": 3.443503034114838, + "ce_loss_2": 4.072719764709473, + "ce_loss_3": 3.895606553554535, + "ce_loss_7": 3.583188033103943, + "epoch": 0.922, + "grad_norm": 564.0, + "kl_loss_10": 121.58853492736816, + "kl_loss_2": 1383.309326171875, + "kl_loss_3": 1029.440249633789, + "kl_loss_7": 329.04571533203125, + "learning_rate": 1.5238427444654367e-05, + "loss": 715.6756, + "step": 9220 + }, + { + "ce_loss_10": 3.5566193222999574, + "ce_loss_13": 3.5052598237991335, + "ce_loss_2": 4.12589042186737, + "ce_loss_3": 3.9515591621398927, + "ce_loss_7": 3.6445566058158874, + "epoch": 0.923, + "grad_norm": 656.0, + "kl_loss_10": 120.25884666442872, + "kl_loss_2": 1353.9432434082032, + "kl_loss_3": 1004.0840515136719, + "kl_loss_7": 324.49686737060546, + "learning_rate": 1.4852136862001764e-05, + "loss": 707.9903, + "step": 9230 + }, + { + "ce_loss_10": 3.519503819942474, + "ce_loss_13": 3.4680001974105834, + "ce_loss_2": 4.090194070339203, + "ce_loss_3": 3.9192580938339234, + "ce_loss_7": 3.603424859046936, + "epoch": 0.924, + "grad_norm": 536.0, + "kl_loss_10": 116.57286224365234, + "kl_loss_2": 1356.3960021972657, + "kl_loss_3": 1013.3616424560547, + "kl_loss_7": 322.25772247314455, + "learning_rate": 1.4470731712944884e-05, + "loss": 712.4052, + "step": 9240 + }, + { + "ce_loss_10": 3.546650528907776, + "ce_loss_13": 3.4919941663742065, + "ce_loss_2": 4.125348937511444, + "ce_loss_3": 3.9468334317207336, + "ce_loss_7": 3.637234890460968, + "epoch": 0.925, + "grad_norm": 560.0, + "kl_loss_10": 121.30828056335449, + "kl_loss_2": 1376.0818115234374, + "kl_loss_3": 1026.3591033935547, + "kl_loss_7": 330.9017593383789, + "learning_rate": 1.4094215838229174e-05, + "loss": 725.0938, + "step": 9250 + }, + { + "ce_loss_10": 3.506890153884888, + "ce_loss_13": 3.4531724214553834, + "ce_loss_2": 4.09957047700882, + "ce_loss_3": 3.9183789253234864, + "ce_loss_7": 3.595683777332306, + "epoch": 0.926, + "grad_norm": 668.0, + "kl_loss_10": 120.98923149108887, + "kl_loss_2": 1405.4196044921875, + "kl_loss_3": 1046.142251586914, + "kl_loss_7": 330.8429779052734, + "learning_rate": 1.372259302936546e-05, + "loss": 741.1663, + "step": 9260 + }, + { + "ce_loss_10": 3.6240168809890747, + "ce_loss_13": 3.5658324003219604, + "ce_loss_2": 4.201930189132691, + "ce_loss_3": 4.022644340991974, + "ce_loss_7": 3.7125780820846557, + "epoch": 0.927, + "grad_norm": 458.0, + "kl_loss_10": 125.30646934509278, + "kl_loss_2": 1376.9204162597657, + "kl_loss_3": 1027.8259399414062, + "kl_loss_7": 335.74131164550784, + "learning_rate": 1.3355867028591206e-05, + "loss": 709.1274, + "step": 9270 + }, + { + "ce_loss_10": 3.5262920260429382, + "ce_loss_13": 3.4725812315940856, + "ce_loss_2": 4.089487946033477, + "ce_loss_3": 3.917177438735962, + "ce_loss_7": 3.6126119017601015, + "epoch": 0.928, + "grad_norm": 528.0, + "kl_loss_10": 120.4309455871582, + "kl_loss_2": 1373.5199279785156, + "kl_loss_3": 1023.1070373535156, + "kl_loss_7": 328.47514190673826, + "learning_rate": 1.2994041528833267e-05, + "loss": 708.8351, + "step": 9280 + }, + { + "ce_loss_10": 3.5236886620521544, + "ce_loss_13": 3.470105516910553, + "ce_loss_2": 4.10210679769516, + "ce_loss_3": 3.92024689912796, + "ce_loss_7": 3.6086822271347048, + "epoch": 0.929, + "grad_norm": 576.0, + "kl_loss_10": 120.04236526489258, + "kl_loss_2": 1388.3319274902344, + "kl_loss_3": 1022.3574737548828, + "kl_loss_7": 325.6973571777344, + "learning_rate": 1.2637120173670358e-05, + "loss": 713.4749, + "step": 9290 + }, + { + "ce_loss_10": 3.5528993606567383, + "ce_loss_13": 3.4979908227920533, + "ce_loss_2": 4.1398141264915465, + "ce_loss_3": 3.960055100917816, + "ce_loss_7": 3.6429410934448243, + "epoch": 0.93, + "grad_norm": 752.0, + "kl_loss_10": 121.99558181762696, + "kl_loss_2": 1397.67568359375, + "kl_loss_3": 1041.351336669922, + "kl_loss_7": 332.56233978271484, + "learning_rate": 1.2285106557296478e-05, + "loss": 717.6662, + "step": 9300 + }, + { + "ce_loss_10": 3.4253716588020326, + "ce_loss_13": 3.374079155921936, + "ce_loss_2": 4.050317943096161, + "ce_loss_3": 3.85725314617157, + "ce_loss_7": 3.5170278906822205, + "epoch": 0.931, + "grad_norm": 856.0, + "kl_loss_10": 120.44239501953125, + "kl_loss_2": 1437.18662109375, + "kl_loss_3": 1066.2714141845704, + "kl_loss_7": 331.22974700927733, + "learning_rate": 1.1938004224484989e-05, + "loss": 725.9393, + "step": 9310 + }, + { + "ce_loss_10": 3.6618238210678102, + "ce_loss_13": 3.606413471698761, + "ce_loss_2": 4.2395406603813175, + "ce_loss_3": 4.060720527172089, + "ce_loss_7": 3.7527721166610717, + "epoch": 0.932, + "grad_norm": 620.0, + "kl_loss_10": 124.90289039611817, + "kl_loss_2": 1384.36318359375, + "kl_loss_3": 1029.1616516113281, + "kl_loss_7": 333.3983581542969, + "learning_rate": 1.1595816670552429e-05, + "loss": 726.8374, + "step": 9320 + }, + { + "ce_loss_10": 3.595258188247681, + "ce_loss_13": 3.538734793663025, + "ce_loss_2": 4.16660704612732, + "ce_loss_3": 3.9852279901504515, + "ce_loss_7": 3.6792800307273863, + "epoch": 0.933, + "grad_norm": 620.0, + "kl_loss_10": 122.25293235778808, + "kl_loss_2": 1363.7237548828125, + "kl_loss_3": 1008.4813812255859, + "kl_loss_7": 325.02245025634767, + "learning_rate": 1.1258547341323699e-05, + "loss": 704.2615, + "step": 9330 + }, + { + "ce_loss_10": 3.619642269611359, + "ce_loss_13": 3.5654753684997558, + "ce_loss_2": 4.190784621238708, + "ce_loss_3": 4.012927258014679, + "ce_loss_7": 3.707143473625183, + "epoch": 0.934, + "grad_norm": 704.0, + "kl_loss_10": 122.84645042419433, + "kl_loss_2": 1390.2992065429687, + "kl_loss_3": 1031.7191131591796, + "kl_loss_7": 331.35271606445315, + "learning_rate": 1.0926199633097156e-05, + "loss": 714.1544, + "step": 9340 + }, + { + "ce_loss_10": 3.6207199335098266, + "ce_loss_13": 3.5691072225570677, + "ce_loss_2": 4.169072163105011, + "ce_loss_3": 3.9992053508758545, + "ce_loss_7": 3.7047256231307983, + "epoch": 0.935, + "grad_norm": 524.0, + "kl_loss_10": 119.55591087341308, + "kl_loss_2": 1335.967266845703, + "kl_loss_3": 1000.2812530517579, + "kl_loss_7": 321.08820190429685, + "learning_rate": 1.0598776892610684e-05, + "loss": 714.8448, + "step": 9350 + }, + { + "ce_loss_10": 3.431315243244171, + "ce_loss_13": 3.381504237651825, + "ce_loss_2": 4.028350937366485, + "ce_loss_3": 3.8452399015426635, + "ce_loss_7": 3.5204859137535096, + "epoch": 0.936, + "grad_norm": 492.0, + "kl_loss_10": 118.58314819335938, + "kl_loss_2": 1395.6468383789063, + "kl_loss_3": 1031.9574981689452, + "kl_loss_7": 323.2079360961914, + "learning_rate": 1.0276282417007399e-05, + "loss": 710.134, + "step": 9360 + }, + { + "ce_loss_10": 3.593991827964783, + "ce_loss_13": 3.543764090538025, + "ce_loss_2": 4.150844514369965, + "ce_loss_3": 3.976518452167511, + "ce_loss_7": 3.680371618270874, + "epoch": 0.937, + "grad_norm": 608.0, + "kl_loss_10": 119.22171020507812, + "kl_loss_2": 1340.3917053222656, + "kl_loss_3": 996.455322265625, + "kl_loss_7": 321.72610626220705, + "learning_rate": 9.958719453803277e-06, + "loss": 703.7472, + "step": 9370 + }, + { + "ce_loss_10": 3.5886499643325807, + "ce_loss_13": 3.535153257846832, + "ce_loss_2": 4.169439589977264, + "ce_loss_3": 3.9961138367652893, + "ce_loss_7": 3.681587278842926, + "epoch": 0.938, + "grad_norm": 632.0, + "kl_loss_10": 121.95464172363282, + "kl_loss_2": 1374.1223999023437, + "kl_loss_3": 1033.233740234375, + "kl_loss_7": 331.1907165527344, + "learning_rate": 9.646091200853802e-06, + "loss": 715.3208, + "step": 9380 + }, + { + "ce_loss_10": 3.5463305473327638, + "ce_loss_13": 3.4947377681732177, + "ce_loss_2": 4.119847238063812, + "ce_loss_3": 3.9456714272499083, + "ce_loss_7": 3.632569658756256, + "epoch": 0.939, + "grad_norm": 600.0, + "kl_loss_10": 118.46955299377441, + "kl_loss_2": 1357.3059997558594, + "kl_loss_3": 1010.5910614013671, + "kl_loss_7": 323.0857559204102, + "learning_rate": 9.338400806321978e-06, + "loss": 692.6078, + "step": 9390 + }, + { + "ce_loss_10": 3.58269464969635, + "ce_loss_13": 3.5270420789718626, + "ce_loss_2": 4.153971230983734, + "ce_loss_3": 3.977994203567505, + "ce_loss_7": 3.6744091510772705, + "epoch": 0.94, + "grad_norm": 576.0, + "kl_loss_10": 122.99457702636718, + "kl_loss_2": 1362.3225341796874, + "kl_loss_3": 1014.1672058105469, + "kl_loss_7": 329.80276641845705, + "learning_rate": 9.035651368646646e-06, + "loss": 707.3777, + "step": 9400 + }, + { + "ce_loss_10": 3.583444893360138, + "ce_loss_13": 3.5304938554763794, + "ce_loss_2": 4.144635462760926, + "ce_loss_3": 3.973416244983673, + "ce_loss_7": 3.669160747528076, + "epoch": 0.941, + "grad_norm": 612.0, + "kl_loss_10": 120.20222663879395, + "kl_loss_2": 1350.0137756347656, + "kl_loss_3": 1008.0319641113281, + "kl_loss_7": 325.5893859863281, + "learning_rate": 8.737845936511335e-06, + "loss": 711.1381, + "step": 9410 + }, + { + "ce_loss_10": 3.534907364845276, + "ce_loss_13": 3.4788982629776, + "ce_loss_2": 4.116820418834687, + "ce_loss_3": 3.93233003616333, + "ce_loss_7": 3.6241695284843445, + "epoch": 0.942, + "grad_norm": 552.0, + "kl_loss_10": 123.2990665435791, + "kl_loss_2": 1381.4971130371093, + "kl_loss_3": 1029.372280883789, + "kl_loss_7": 332.8606323242187, + "learning_rate": 8.444987508813451e-06, + "loss": 712.9717, + "step": 9420 + }, + { + "ce_loss_10": 3.486678731441498, + "ce_loss_13": 3.4316638946533202, + "ce_loss_2": 4.079938578605652, + "ce_loss_3": 3.897518050670624, + "ce_loss_7": 3.5741005659103395, + "epoch": 0.943, + "grad_norm": 592.0, + "kl_loss_10": 122.28645133972168, + "kl_loss_2": 1428.6765991210937, + "kl_loss_3": 1060.144955444336, + "kl_loss_7": 334.1707931518555, + "learning_rate": 8.157079034633974e-06, + "loss": 725.4578, + "step": 9430 + }, + { + "ce_loss_10": 3.487192440032959, + "ce_loss_13": 3.434910070896149, + "ce_loss_2": 4.076284384727478, + "ce_loss_3": 3.895754504203796, + "ce_loss_7": 3.5757576704025267, + "epoch": 0.944, + "grad_norm": 438.0, + "kl_loss_10": 120.67500839233398, + "kl_loss_2": 1416.8783447265625, + "kl_loss_3": 1052.7425445556642, + "kl_loss_7": 329.4114395141602, + "learning_rate": 7.874123413208145e-06, + "loss": 718.8903, + "step": 9440 + }, + { + "ce_loss_10": 3.4529663920402527, + "ce_loss_13": 3.4014953494071962, + "ce_loss_2": 4.053472077846527, + "ce_loss_3": 3.86897349357605, + "ce_loss_7": 3.5433228611946106, + "epoch": 0.945, + "grad_norm": 434.0, + "kl_loss_10": 119.67061233520508, + "kl_loss_2": 1402.618475341797, + "kl_loss_3": 1041.5067199707032, + "kl_loss_7": 330.63839569091795, + "learning_rate": 7.59612349389599e-06, + "loss": 719.7231, + "step": 9450 + }, + { + "ce_loss_10": 3.547183060646057, + "ce_loss_13": 3.4943931818008425, + "ce_loss_2": 4.110193264484406, + "ce_loss_3": 3.9380640506744387, + "ce_loss_7": 3.6319218397140505, + "epoch": 0.946, + "grad_norm": 752.0, + "kl_loss_10": 118.77492752075196, + "kl_loss_2": 1337.786553955078, + "kl_loss_3": 997.432504272461, + "kl_loss_7": 323.5429290771484, + "learning_rate": 7.323082076153509e-06, + "loss": 706.6428, + "step": 9460 + }, + { + "ce_loss_10": 3.5930521965026854, + "ce_loss_13": 3.538786733150482, + "ce_loss_2": 4.154218971729279, + "ce_loss_3": 3.979544770717621, + "ce_loss_7": 3.6779901146888734, + "epoch": 0.947, + "grad_norm": 548.0, + "kl_loss_10": 122.71770210266114, + "kl_loss_2": 1348.3024658203126, + "kl_loss_3": 1007.2475158691407, + "kl_loss_7": 330.79364318847655, + "learning_rate": 7.055001909504755e-06, + "loss": 715.8398, + "step": 9470 + }, + { + "ce_loss_10": 3.6217771649360655, + "ce_loss_13": 3.567501354217529, + "ce_loss_2": 4.193489670753479, + "ce_loss_3": 4.013223147392273, + "ce_loss_7": 3.7104034662246703, + "epoch": 0.948, + "grad_norm": 436.0, + "kl_loss_10": 123.1129165649414, + "kl_loss_2": 1368.2989318847656, + "kl_loss_3": 1020.2081787109375, + "kl_loss_7": 330.82146606445315, + "learning_rate": 6.791885693514133e-06, + "loss": 715.8734, + "step": 9480 + }, + { + "ce_loss_10": 3.52802118062973, + "ce_loss_13": 3.475515973567963, + "ce_loss_2": 4.114694261550904, + "ce_loss_3": 3.9377319693565367, + "ce_loss_7": 3.617742133140564, + "epoch": 0.949, + "grad_norm": 1104.0, + "kl_loss_10": 122.59672470092774, + "kl_loss_2": 1394.1083374023438, + "kl_loss_3": 1037.977471923828, + "kl_loss_7": 331.45179290771483, + "learning_rate": 6.533736077758867e-06, + "loss": 721.0864, + "step": 9490 + }, + { + "ce_loss_10": 3.489359438419342, + "ce_loss_13": 3.4337912678718565, + "ce_loss_2": 4.0956980109214784, + "ce_loss_3": 3.90721480846405, + "ce_loss_7": 3.581552469730377, + "epoch": 0.95, + "grad_norm": 840.0, + "kl_loss_10": 123.24414253234863, + "kl_loss_2": 1427.8860778808594, + "kl_loss_3": 1052.8961486816406, + "kl_loss_7": 336.8455841064453, + "learning_rate": 6.2805556618028556e-06, + "loss": 722.0676, + "step": 9500 + }, + { + "ce_loss_10": 3.5827051520347597, + "ce_loss_13": 3.5319561839103697, + "ce_loss_2": 4.139924156665802, + "ce_loss_3": 3.966154897212982, + "ce_loss_7": 3.6660770177841187, + "epoch": 0.951, + "grad_norm": 640.0, + "kl_loss_10": 119.18858680725097, + "kl_loss_2": 1329.3287841796875, + "kl_loss_3": 987.8519744873047, + "kl_loss_7": 318.80920562744143, + "learning_rate": 6.032346995169968e-06, + "loss": 686.1183, + "step": 9510 + }, + { + "ce_loss_10": 3.588587963581085, + "ce_loss_13": 3.5356736183166504, + "ce_loss_2": 4.163759100437164, + "ce_loss_3": 3.986702060699463, + "ce_loss_7": 3.6727798104286196, + "epoch": 0.952, + "grad_norm": 612.0, + "kl_loss_10": 121.3857364654541, + "kl_loss_2": 1371.2152526855468, + "kl_loss_3": 1021.5204376220703, + "kl_loss_7": 325.9972854614258, + "learning_rate": 5.789112577318789e-06, + "loss": 707.1283, + "step": 9520 + }, + { + "ce_loss_10": 3.566323149204254, + "ce_loss_13": 3.5122985243797302, + "ce_loss_2": 4.147273254394531, + "ce_loss_3": 3.9658782124519347, + "ce_loss_7": 3.650079107284546, + "epoch": 0.953, + "grad_norm": 624.0, + "kl_loss_10": 122.64759712219238, + "kl_loss_2": 1404.9125610351562, + "kl_loss_3": 1038.5805755615233, + "kl_loss_7": 331.40319061279297, + "learning_rate": 5.550854857617194e-06, + "loss": 709.3621, + "step": 9530 + }, + { + "ce_loss_10": 3.5508513927459715, + "ce_loss_13": 3.49852135181427, + "ce_loss_2": 4.146372210979462, + "ce_loss_3": 3.9630183458328245, + "ce_loss_7": 3.6422205328941346, + "epoch": 0.954, + "grad_norm": 808.0, + "kl_loss_10": 124.43129844665528, + "kl_loss_2": 1406.9052978515624, + "kl_loss_3": 1050.3708709716798, + "kl_loss_7": 337.19909210205077, + "learning_rate": 5.317576235317756e-06, + "loss": 722.1983, + "step": 9540 + }, + { + "ce_loss_10": 3.579137623310089, + "ce_loss_13": 3.525528633594513, + "ce_loss_2": 4.13328732252121, + "ce_loss_3": 3.9567643284797667, + "ce_loss_7": 3.6605886578559876, + "epoch": 0.955, + "grad_norm": 672.0, + "kl_loss_10": 120.62237091064453, + "kl_loss_2": 1320.1938232421876, + "kl_loss_3": 981.861474609375, + "kl_loss_7": 320.4243621826172, + "learning_rate": 5.089279059533658e-06, + "loss": 709.8652, + "step": 9550 + }, + { + "ce_loss_10": 3.6386606693267822, + "ce_loss_13": 3.5838342666625977, + "ce_loss_2": 4.210740327835083, + "ce_loss_3": 4.036587131023407, + "ce_loss_7": 3.7290936589241026, + "epoch": 0.956, + "grad_norm": 564.0, + "kl_loss_10": 126.81226425170898, + "kl_loss_2": 1377.3177856445313, + "kl_loss_3": 1028.8962188720702, + "kl_loss_7": 336.68345794677737, + "learning_rate": 4.865965629214819e-06, + "loss": 712.0689, + "step": 9560 + }, + { + "ce_loss_10": 3.5826368927955627, + "ce_loss_13": 3.5299017906188963, + "ce_loss_2": 4.156905472278595, + "ce_loss_3": 3.9820918679237365, + "ce_loss_7": 3.6713940501213074, + "epoch": 0.957, + "grad_norm": 540.0, + "kl_loss_10": 122.74875144958496, + "kl_loss_2": 1392.4383850097656, + "kl_loss_3": 1042.0474884033204, + "kl_loss_7": 333.6945205688477, + "learning_rate": 4.6476381931251366e-06, + "loss": 708.1864, + "step": 9570 + }, + { + "ce_loss_10": 3.5621810436248778, + "ce_loss_13": 3.509333276748657, + "ce_loss_2": 4.1382688164711, + "ce_loss_3": 3.9572407245635985, + "ce_loss_7": 3.6505929470062255, + "epoch": 0.958, + "grad_norm": 588.0, + "kl_loss_10": 120.34364318847656, + "kl_loss_2": 1372.2328552246095, + "kl_loss_3": 1022.0481781005859, + "kl_loss_7": 328.2518478393555, + "learning_rate": 4.434298949819449e-06, + "loss": 710.7535, + "step": 9580 + }, + { + "ce_loss_10": 3.517833399772644, + "ce_loss_13": 3.4627076268196104, + "ce_loss_2": 4.125231349468232, + "ce_loss_3": 3.937967097759247, + "ce_loss_7": 3.6109151244163513, + "epoch": 0.959, + "grad_norm": 496.0, + "kl_loss_10": 124.38531723022462, + "kl_loss_2": 1447.3932067871094, + "kl_loss_3": 1073.26435546875, + "kl_loss_7": 339.64784240722656, + "learning_rate": 4.2259500476214406e-06, + "loss": 729.0812, + "step": 9590 + }, + { + "ce_loss_10": 3.5029053330421447, + "ce_loss_13": 3.4484299540519716, + "ce_loss_2": 4.084228038787842, + "ce_loss_3": 3.901972544193268, + "ce_loss_7": 3.5915270805358888, + "epoch": 0.96, + "grad_norm": 482.0, + "kl_loss_10": 121.2360626220703, + "kl_loss_2": 1395.7050842285157, + "kl_loss_3": 1035.7142211914063, + "kl_loss_7": 330.5919525146484, + "learning_rate": 4.02259358460233e-06, + "loss": 712.7003, + "step": 9600 + }, + { + "ce_loss_10": 3.569297027587891, + "ce_loss_13": 3.5153488874435426, + "ce_loss_2": 4.138071930408477, + "ce_loss_3": 3.9621680974960327, + "ce_loss_7": 3.655005395412445, + "epoch": 0.961, + "grad_norm": 520.0, + "kl_loss_10": 122.30363540649414, + "kl_loss_2": 1365.015313720703, + "kl_loss_3": 1017.6215026855468, + "kl_loss_7": 330.8979751586914, + "learning_rate": 3.8242316085594916e-06, + "loss": 706.2896, + "step": 9610 + }, + { + "ce_loss_10": 3.453506600856781, + "ce_loss_13": 3.4016806364059446, + "ce_loss_2": 4.060185205936432, + "ce_loss_3": 3.874277877807617, + "ce_loss_7": 3.5428122758865355, + "epoch": 0.962, + "grad_norm": 548.0, + "kl_loss_10": 121.87925300598144, + "kl_loss_2": 1436.2672180175782, + "kl_loss_3": 1071.4315460205078, + "kl_loss_7": 334.0261703491211, + "learning_rate": 3.630866116995757e-06, + "loss": 735.1148, + "step": 9620 + }, + { + "ce_loss_10": 3.608195972442627, + "ce_loss_13": 3.555492627620697, + "ce_loss_2": 4.172922539710998, + "ce_loss_3": 3.999432122707367, + "ce_loss_7": 3.695437788963318, + "epoch": 0.963, + "grad_norm": 480.0, + "kl_loss_10": 120.99373207092285, + "kl_loss_2": 1338.744921875, + "kl_loss_3": 1001.2767883300781, + "kl_loss_7": 323.61705017089844, + "learning_rate": 3.4424990570994797e-06, + "loss": 713.4098, + "step": 9630 + }, + { + "ce_loss_10": 3.6004083514213563, + "ce_loss_13": 3.545426595211029, + "ce_loss_2": 4.163943660259247, + "ce_loss_3": 3.9917925119400026, + "ce_loss_7": 3.683776152133942, + "epoch": 0.964, + "grad_norm": 422.0, + "kl_loss_10": 120.70787086486817, + "kl_loss_2": 1359.9990783691405, + "kl_loss_3": 1013.7219421386719, + "kl_loss_7": 325.9754806518555, + "learning_rate": 3.2591323257248896e-06, + "loss": 708.6987, + "step": 9640 + }, + { + "ce_loss_10": 3.4461316108703612, + "ce_loss_13": 3.3950406193733214, + "ce_loss_2": 4.035059344768524, + "ce_loss_3": 3.8544202208518983, + "ce_loss_7": 3.5323742747306826, + "epoch": 0.965, + "grad_norm": 588.0, + "kl_loss_10": 119.44026069641113, + "kl_loss_2": 1390.1842041015625, + "kl_loss_3": 1035.5374267578125, + "kl_loss_7": 326.7879272460938, + "learning_rate": 3.0807677693729385e-06, + "loss": 718.7271, + "step": 9650 + }, + { + "ce_loss_10": 3.6339459300041197, + "ce_loss_13": 3.5821001648902895, + "ce_loss_2": 4.195254683494568, + "ce_loss_3": 4.029372644424439, + "ce_loss_7": 3.721749794483185, + "epoch": 0.966, + "grad_norm": 624.0, + "kl_loss_10": 120.16477317810059, + "kl_loss_2": 1345.0911743164063, + "kl_loss_3": 1009.0331939697265, + "kl_loss_7": 326.2617385864258, + "learning_rate": 2.9074071841727055e-06, + "loss": 700.3471, + "step": 9660 + }, + { + "ce_loss_10": 3.564396059513092, + "ce_loss_13": 3.5104949116706847, + "ce_loss_2": 4.14932644367218, + "ce_loss_3": 3.9688431143760683, + "ce_loss_7": 3.655857837200165, + "epoch": 0.967, + "grad_norm": 824.0, + "kl_loss_10": 121.51194686889649, + "kl_loss_2": 1391.90537109375, + "kl_loss_3": 1034.157763671875, + "kl_loss_7": 333.8142547607422, + "learning_rate": 2.739052315863355e-06, + "loss": 705.9752, + "step": 9670 + }, + { + "ce_loss_10": 3.547489809989929, + "ce_loss_13": 3.4908099293708803, + "ce_loss_2": 4.124406385421753, + "ce_loss_3": 3.952444839477539, + "ce_loss_7": 3.6360405683517456, + "epoch": 0.968, + "grad_norm": 708.0, + "kl_loss_10": 123.83374938964843, + "kl_loss_2": 1374.874462890625, + "kl_loss_3": 1031.4533416748047, + "kl_loss_7": 329.6816864013672, + "learning_rate": 2.5757048597765396e-06, + "loss": 709.6677, + "step": 9680 + }, + { + "ce_loss_10": 3.5581902265548706, + "ce_loss_13": 3.5045996785163878, + "ce_loss_2": 4.142367708683014, + "ce_loss_3": 3.960366427898407, + "ce_loss_7": 3.6433528900146483, + "epoch": 0.969, + "grad_norm": 716.0, + "kl_loss_10": 121.9698501586914, + "kl_loss_2": 1398.1140563964843, + "kl_loss_3": 1040.0749725341798, + "kl_loss_7": 330.96569061279297, + "learning_rate": 2.417366460819359e-06, + "loss": 716.8742, + "step": 9690 + }, + { + "ce_loss_10": 3.5652561664581297, + "ce_loss_13": 3.5113066673278808, + "ce_loss_2": 4.170096576213837, + "ce_loss_3": 3.9840278029441833, + "ce_loss_7": 3.657517433166504, + "epoch": 0.97, + "grad_norm": 648.0, + "kl_loss_10": 124.85320587158203, + "kl_loss_2": 1419.4852294921875, + "kl_loss_3": 1056.5964263916017, + "kl_loss_7": 336.44810638427737, + "learning_rate": 2.2640387134577057e-06, + "loss": 715.8505, + "step": 9700 + }, + { + "ce_loss_10": 3.4900317192077637, + "ce_loss_13": 3.438812232017517, + "ce_loss_2": 4.047373950481415, + "ce_loss_3": 3.874912989139557, + "ce_loss_7": 3.5746123671531675, + "epoch": 0.971, + "grad_norm": 552.0, + "kl_loss_10": 115.83142395019532, + "kl_loss_2": 1328.5251892089843, + "kl_loss_3": 984.848501586914, + "kl_loss_7": 316.37536163330077, + "learning_rate": 2.115723161700278e-06, + "loss": 696.8717, + "step": 9710 + }, + { + "ce_loss_10": 3.4717228293418883, + "ce_loss_13": 3.4165940165519713, + "ce_loss_2": 4.07243583202362, + "ce_loss_3": 3.8875492215156555, + "ce_loss_7": 3.5627353191375732, + "epoch": 0.972, + "grad_norm": 588.0, + "kl_loss_10": 124.52442169189453, + "kl_loss_2": 1418.6435485839843, + "kl_loss_3": 1056.855615234375, + "kl_loss_7": 337.8177230834961, + "learning_rate": 1.9724212990830937e-06, + "loss": 729.082, + "step": 9720 + }, + { + "ce_loss_10": 3.620276391506195, + "ce_loss_13": 3.5666480660438538, + "ce_loss_2": 4.208401417732238, + "ce_loss_3": 4.026053476333618, + "ce_loss_7": 3.710793709754944, + "epoch": 0.973, + "grad_norm": 464.0, + "kl_loss_10": 123.11025886535644, + "kl_loss_2": 1394.9627868652344, + "kl_loss_3": 1036.4825164794922, + "kl_loss_7": 333.15184173583987, + "learning_rate": 1.8341345686543331e-06, + "loss": 717.9429, + "step": 9730 + }, + { + "ce_loss_10": 3.6050823092460633, + "ce_loss_13": 3.5513722777366636, + "ce_loss_2": 4.157528936862946, + "ce_loss_3": 3.9856823086738586, + "ce_loss_7": 3.689386820793152, + "epoch": 0.974, + "grad_norm": 804.0, + "kl_loss_10": 120.63820838928223, + "kl_loss_2": 1337.2397399902343, + "kl_loss_3": 1002.3361602783203, + "kl_loss_7": 325.78345794677733, + "learning_rate": 1.7008643629596864e-06, + "loss": 711.823, + "step": 9740 + }, + { + "ce_loss_10": 3.586062693595886, + "ce_loss_13": 3.53137663602829, + "ce_loss_2": 4.1578493475914, + "ce_loss_3": 3.981482672691345, + "ce_loss_7": 3.676142621040344, + "epoch": 0.975, + "grad_norm": 604.0, + "kl_loss_10": 122.01227416992188, + "kl_loss_2": 1380.4110900878907, + "kl_loss_3": 1023.8995727539062, + "kl_loss_7": 329.3552581787109, + "learning_rate": 1.5726120240288633e-06, + "loss": 721.4617, + "step": 9750 + }, + { + "ce_loss_10": 3.4836260080337524, + "ce_loss_13": 3.431521987915039, + "ce_loss_2": 4.062640523910522, + "ce_loss_3": 3.8828797817230223, + "ce_loss_7": 3.5693193793296816, + "epoch": 0.976, + "grad_norm": 528.0, + "kl_loss_10": 119.90267982482911, + "kl_loss_2": 1373.1053588867187, + "kl_loss_3": 1020.3870666503906, + "kl_loss_7": 327.03973999023435, + "learning_rate": 1.4493788433612708e-06, + "loss": 706.3937, + "step": 9760 + }, + { + "ce_loss_10": 3.603860354423523, + "ce_loss_13": 3.5489115476608277, + "ce_loss_2": 4.185849332809449, + "ce_loss_3": 4.005133509635925, + "ce_loss_7": 3.6924179434776305, + "epoch": 0.977, + "grad_norm": 516.0, + "kl_loss_10": 121.7234733581543, + "kl_loss_2": 1390.346942138672, + "kl_loss_3": 1030.5024047851562, + "kl_loss_7": 329.0796890258789, + "learning_rate": 1.3311660619138578e-06, + "loss": 718.6364, + "step": 9770 + }, + { + "ce_loss_10": 3.598566448688507, + "ce_loss_13": 3.546519470214844, + "ce_loss_2": 4.149194121360779, + "ce_loss_3": 3.9783671855926515, + "ce_loss_7": 3.6831716775894163, + "epoch": 0.978, + "grad_norm": 548.0, + "kl_loss_10": 120.76995811462402, + "kl_loss_2": 1329.148126220703, + "kl_loss_3": 995.5872283935547, + "kl_loss_7": 324.982649230957, + "learning_rate": 1.2179748700879012e-06, + "loss": 702.9616, + "step": 9780 + }, + { + "ce_loss_10": 3.527452754974365, + "ce_loss_13": 3.4756274223327637, + "ce_loss_2": 4.103048396110535, + "ce_loss_3": 3.9268922090530394, + "ce_loss_7": 3.6169739961624146, + "epoch": 0.979, + "grad_norm": 576.0, + "kl_loss_10": 121.63832931518554, + "kl_loss_2": 1367.4532653808594, + "kl_loss_3": 1023.8312957763671, + "kl_loss_7": 327.45165252685547, + "learning_rate": 1.1098064077174619e-06, + "loss": 710.6072, + "step": 9790 + }, + { + "ce_loss_10": 3.560258626937866, + "ce_loss_13": 3.5036699771881104, + "ce_loss_2": 4.162828862667084, + "ce_loss_3": 3.9741065382957457, + "ce_loss_7": 3.651784634590149, + "epoch": 0.98, + "grad_norm": 732.0, + "kl_loss_10": 120.97579002380371, + "kl_loss_2": 1405.2699890136719, + "kl_loss_3": 1040.5084320068358, + "kl_loss_7": 329.4268035888672, + "learning_rate": 1.006661764057837e-06, + "loss": 717.7347, + "step": 9800 + }, + { + "ce_loss_10": 3.564870071411133, + "ce_loss_13": 3.5106621384620667, + "ce_loss_2": 4.146739649772644, + "ce_loss_3": 3.9645097851753235, + "ce_loss_7": 3.6526604771614073, + "epoch": 0.981, + "grad_norm": 624.0, + "kl_loss_10": 120.16670303344726, + "kl_loss_2": 1385.8602905273438, + "kl_loss_3": 1033.0024688720703, + "kl_loss_7": 325.0816711425781, + "learning_rate": 9.085419777743465e-07, + "loss": 712.9294, + "step": 9810 + }, + { + "ce_loss_10": 3.502484345436096, + "ce_loss_13": 3.4494829535484315, + "ce_loss_2": 4.077840411663056, + "ce_loss_3": 3.901638996601105, + "ce_loss_7": 3.5905481934547425, + "epoch": 0.982, + "grad_norm": 430.0, + "kl_loss_10": 117.31855773925781, + "kl_loss_2": 1365.9148986816406, + "kl_loss_3": 1018.6503601074219, + "kl_loss_7": 321.9003936767578, + "learning_rate": 8.15448036932176e-07, + "loss": 700.3924, + "step": 9820 + }, + { + "ce_loss_10": 3.553510880470276, + "ce_loss_13": 3.5014522314071654, + "ce_loss_2": 4.126162922382354, + "ce_loss_3": 3.9481998801231386, + "ce_loss_7": 3.6414800763130186, + "epoch": 0.983, + "grad_norm": 704.0, + "kl_loss_10": 120.78939666748047, + "kl_loss_2": 1375.8851440429687, + "kl_loss_3": 1031.1501098632812, + "kl_loss_7": 328.82582702636716, + "learning_rate": 7.273808789862724e-07, + "loss": 719.6496, + "step": 9830 + }, + { + "ce_loss_10": 3.6392180204391478, + "ce_loss_13": 3.5844750881195067, + "ce_loss_2": 4.2080818772315975, + "ce_loss_3": 4.02874116897583, + "ce_loss_7": 3.7243886232376098, + "epoch": 0.984, + "grad_norm": 588.0, + "kl_loss_10": 122.5202766418457, + "kl_loss_2": 1367.7145080566406, + "kl_loss_3": 1019.624478149414, + "kl_loss_7": 330.5028793334961, + "learning_rate": 6.443413907720186e-07, + "loss": 706.112, + "step": 9840 + }, + { + "ce_loss_10": 3.5670676469802856, + "ce_loss_13": 3.5155144810676573, + "ce_loss_2": 4.136582219600678, + "ce_loss_3": 3.958021545410156, + "ce_loss_7": 3.6541135787963865, + "epoch": 0.985, + "grad_norm": 572.0, + "kl_loss_10": 120.43879356384278, + "kl_loss_2": 1362.252947998047, + "kl_loss_3": 1014.570947265625, + "kl_loss_7": 327.1987045288086, + "learning_rate": 5.663304084960185e-07, + "loss": 705.8062, + "step": 9850 + }, + { + "ce_loss_10": 3.493795156478882, + "ce_loss_13": 3.439519703388214, + "ce_loss_2": 4.085382175445557, + "ce_loss_3": 3.902758014202118, + "ce_loss_7": 3.5809911727905273, + "epoch": 0.986, + "grad_norm": 664.0, + "kl_loss_10": 121.81441802978516, + "kl_loss_2": 1393.3001831054687, + "kl_loss_3": 1030.4414154052733, + "kl_loss_7": 327.58775482177737, + "learning_rate": 4.933487177280482e-07, + "loss": 703.9349, + "step": 9860 + }, + { + "ce_loss_10": 3.5870001196861265, + "ce_loss_13": 3.534350836277008, + "ce_loss_2": 4.159269857406616, + "ce_loss_3": 3.9821672320365904, + "ce_loss_7": 3.67519474029541, + "epoch": 0.987, + "grad_norm": 608.0, + "kl_loss_10": 119.21302146911621, + "kl_loss_2": 1369.2319030761719, + "kl_loss_3": 1017.3517211914062, + "kl_loss_7": 324.8195739746094, + "learning_rate": 4.2539705339295075e-07, + "loss": 704.4329, + "step": 9870 + }, + { + "ce_loss_10": 3.4422147393226625, + "ce_loss_13": 3.3882473349571227, + "ce_loss_2": 4.03389185667038, + "ce_loss_3": 3.855961525440216, + "ce_loss_7": 3.5283602476119995, + "epoch": 0.988, + "grad_norm": 616.0, + "kl_loss_10": 120.49687614440919, + "kl_loss_2": 1390.164501953125, + "kl_loss_3": 1040.5754180908202, + "kl_loss_7": 328.028791809082, + "learning_rate": 3.6247609976319816e-07, + "loss": 708.3993, + "step": 9880 + }, + { + "ce_loss_10": 3.5487411856651305, + "ce_loss_13": 3.4932268500328063, + "ce_loss_2": 4.1378997445106505, + "ce_loss_3": 3.956749749183655, + "ce_loss_7": 3.6394209384918215, + "epoch": 0.989, + "grad_norm": 684.0, + "kl_loss_10": 122.58170623779297, + "kl_loss_2": 1392.1090087890625, + "kl_loss_3": 1035.2212493896484, + "kl_loss_7": 332.9962554931641, + "learning_rate": 3.0458649045211895e-07, + "loss": 730.05, + "step": 9890 + }, + { + "ce_loss_10": 3.5127838611602784, + "ce_loss_13": 3.4589962124824525, + "ce_loss_2": 4.101286160945892, + "ce_loss_3": 3.9235021352767943, + "ce_loss_7": 3.6055259227752687, + "epoch": 0.99, + "grad_norm": 624.0, + "kl_loss_10": 123.06077766418457, + "kl_loss_2": 1395.3993530273438, + "kl_loss_3": 1043.9721893310548, + "kl_loss_7": 335.08056182861327, + "learning_rate": 2.517288084074587e-07, + "loss": 727.1166, + "step": 9900 + }, + { + "ce_loss_10": 3.5486058712005617, + "ce_loss_13": 3.494112956523895, + "ce_loss_2": 4.158055460453033, + "ce_loss_3": 3.9726677179336547, + "ce_loss_7": 3.642500126361847, + "epoch": 0.991, + "grad_norm": 604.0, + "kl_loss_10": 123.90233879089355, + "kl_loss_2": 1423.9731567382812, + "kl_loss_3": 1061.412161254883, + "kl_loss_7": 340.47805938720705, + "learning_rate": 2.0390358590538505e-07, + "loss": 725.7766, + "step": 9910 + }, + { + "ce_loss_10": 3.554875147342682, + "ce_loss_13": 3.5004802942276, + "ce_loss_2": 4.137669503688812, + "ce_loss_3": 3.9679285287857056, + "ce_loss_7": 3.645340549945831, + "epoch": 0.992, + "grad_norm": 516.0, + "kl_loss_10": 122.4406665802002, + "kl_loss_2": 1388.8013732910156, + "kl_loss_3": 1043.7246154785157, + "kl_loss_7": 334.8171920776367, + "learning_rate": 1.61111304545436e-07, + "loss": 713.2598, + "step": 9920 + }, + { + "ce_loss_10": 3.5292787551879883, + "ce_loss_13": 3.4751243948936463, + "ce_loss_2": 4.103587174415589, + "ce_loss_3": 3.930684947967529, + "ce_loss_7": 3.6189408540725707, + "epoch": 0.993, + "grad_norm": 524.0, + "kl_loss_10": 120.616743850708, + "kl_loss_2": 1377.230780029297, + "kl_loss_3": 1032.888330078125, + "kl_loss_7": 326.32662811279295, + "learning_rate": 1.2335239524541298e-07, + "loss": 705.4175, + "step": 9930 + }, + { + "ce_loss_10": 3.4969447016716004, + "ce_loss_13": 3.444041609764099, + "ce_loss_2": 4.070138645172119, + "ce_loss_3": 3.897006869316101, + "ce_loss_7": 3.5882808804512023, + "epoch": 0.994, + "grad_norm": 480.0, + "kl_loss_10": 119.86849899291992, + "kl_loss_2": 1356.5659057617188, + "kl_loss_3": 1016.5262145996094, + "kl_loss_7": 323.5000595092773, + "learning_rate": 9.06272382371065e-08, + "loss": 711.4588, + "step": 9940 + }, + { + "ce_loss_10": 3.562340235710144, + "ce_loss_13": 3.509819734096527, + "ce_loss_2": 4.1450182557106015, + "ce_loss_3": 3.96974858045578, + "ce_loss_7": 3.6521814823150636, + "epoch": 0.995, + "grad_norm": 572.0, + "kl_loss_10": 121.95822410583496, + "kl_loss_2": 1386.1470336914062, + "kl_loss_3": 1036.3678161621094, + "kl_loss_7": 331.8135269165039, + "learning_rate": 6.293616306246586e-08, + "loss": 711.5672, + "step": 9950 + }, + { + "ce_loss_10": 3.560911405086517, + "ce_loss_13": 3.5092976689338684, + "ce_loss_2": 4.1187317132949826, + "ce_loss_3": 3.945353388786316, + "ce_loss_7": 3.6452075004577638, + "epoch": 0.996, + "grad_norm": 532.0, + "kl_loss_10": 118.4669750213623, + "kl_loss_2": 1347.5411254882813, + "kl_loss_3": 1008.2919952392579, + "kl_loss_7": 321.9423858642578, + "learning_rate": 4.027944857032395e-08, + "loss": 694.5431, + "step": 9960 + }, + { + "ce_loss_10": 3.5510011553764342, + "ce_loss_13": 3.4982069730758667, + "ce_loss_2": 4.102385640144348, + "ce_loss_3": 3.9248210430145263, + "ce_loss_7": 3.6324383854866027, + "epoch": 0.997, + "grad_norm": 596.0, + "kl_loss_10": 118.21413650512696, + "kl_loss_2": 1309.8837768554688, + "kl_loss_3": 973.4349212646484, + "kl_loss_7": 314.49192199707034, + "learning_rate": 2.265732291356626e-08, + "loss": 688.6318, + "step": 9970 + }, + { + "ce_loss_10": 3.5984220147132873, + "ce_loss_13": 3.544767773151398, + "ce_loss_2": 4.165575993061066, + "ce_loss_3": 3.98673597574234, + "ce_loss_7": 3.684629225730896, + "epoch": 0.998, + "grad_norm": 492.0, + "kl_loss_10": 121.18408889770508, + "kl_loss_2": 1353.7459533691406, + "kl_loss_3": 1007.0253448486328, + "kl_loss_7": 326.501042175293, + "learning_rate": 1.0069963546743833e-08, + "loss": 717.6554, + "step": 9980 + }, + { + "ce_loss_10": 3.578536665439606, + "ce_loss_13": 3.5262981772422792, + "ce_loss_2": 4.151448047161102, + "ce_loss_3": 3.9757529973983763, + "ce_loss_7": 3.666281545162201, + "epoch": 0.999, + "grad_norm": 490.0, + "kl_loss_10": 121.82665138244629, + "kl_loss_2": 1373.672137451172, + "kl_loss_3": 1028.6984802246093, + "kl_loss_7": 330.99612579345705, + "learning_rate": 2.517497224463483e-09, + "loss": 710.4597, + "step": 9990 + }, + { + "ce_loss_10": 3.5323142886161802, + "ce_loss_13": 3.4774991631507874, + "ce_loss_2": 4.149592983722687, + "ce_loss_3": 3.9584404826164246, + "ce_loss_7": 3.6246480464935305, + "epoch": 1.0, + "grad_norm": 576.0, + "kl_loss_10": 122.81040077209472, + "kl_loss_2": 1444.6247863769531, + "kl_loss_3": 1063.938232421875, + "kl_loss_7": 337.08654937744143, + "learning_rate": 0.0, + "loss": 730.4086, + "step": 10000 + } + ], + "logging_steps": 10, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.177819035608023e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}