diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16049 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "ce_loss_12": 11.16768741607666, + "ce_loss_17": 10.69606876373291, + "ce_loss_23": 2.8781241178512573, + "ce_loss_3": 13.679932594299316, + "ce_loss_6": 12.782220363616943, + "epoch": 0.0001, + "grad_norm": 132096.0, + "kl_loss_12": 17880.517578125, + "kl_loss_17": 17056.2138671875, + "kl_loss_3": 22473.77734375, + "kl_loss_6": 20747.6396484375, + "learning_rate": 1e-05, + "loss": 20223.791, + "step": 1 + }, + { + "ce_loss_12": 8.242788076400757, + "ce_loss_17": 7.6790783405303955, + "ce_loss_23": 2.9531757831573486, + "ce_loss_3": 9.965806987550524, + "ce_loss_6": 9.590416616863674, + "epoch": 0.001, + "grad_norm": 31232.0, + "kl_loss_12": 11030.585747612848, + "kl_loss_17": 10147.679416232639, + "kl_loss_3": 14175.968532986111, + "kl_loss_6": 13466.597493489584, + "learning_rate": 0.0001, + "loss": 12302.9245, + "step": 10 + }, + { + "ce_loss_12": 5.3866219282150265, + "ce_loss_17": 4.39049506187439, + "ce_loss_23": 2.9577841758728027, + "ce_loss_3": 7.321023607254029, + "ce_loss_6": 6.767641139030457, + "epoch": 0.002, + "grad_norm": 12224.0, + "kl_loss_12": 4647.222497558594, + "kl_loss_17": 2857.6106811523437, + "kl_loss_3": 8220.148486328126, + "kl_loss_6": 7194.997045898437, + "learning_rate": 0.0002, + "loss": 5755.3641, + "step": 20 + }, + { + "ce_loss_12": 4.471867954730987, + "ce_loss_17": 3.596786892414093, + "ce_loss_23": 2.7707528471946716, + "ce_loss_3": 6.453728103637696, + "ce_loss_6": 5.812050318717956, + "epoch": 0.003, + "grad_norm": 5696.0, + "kl_loss_12": 3281.927099609375, + "kl_loss_17": 1651.611065673828, + "kl_loss_3": 7046.269970703125, + "kl_loss_6": 5852.9330078125, + "learning_rate": 0.0003, + "loss": 4405.1645, + "step": 30 + }, + { + "ce_loss_12": 4.35389233827591, + "ce_loss_17": 3.553706979751587, + "ce_loss_23": 2.934287405014038, + "ce_loss_3": 6.034887075424194, + "ce_loss_6": 5.435985016822815, + "epoch": 0.004, + "grad_norm": 9088.0, + "kl_loss_12": 2733.2104614257814, + "kl_loss_17": 1217.1323120117188, + "kl_loss_3": 5916.643041992187, + "kl_loss_6": 4787.089672851563, + "learning_rate": 0.0004, + "loss": 3687.0945, + "step": 40 + }, + { + "ce_loss_12": 4.149027359485626, + "ce_loss_17": 3.4113935589790345, + "ce_loss_23": 2.8995264291763307, + "ce_loss_3": 5.78397433757782, + "ce_loss_6": 5.1235936164855955, + "epoch": 0.005, + "grad_norm": 4704.0, + "kl_loss_12": 2461.4163208007812, + "kl_loss_17": 989.9328491210938, + "kl_loss_3": 5603.44287109375, + "kl_loss_6": 4341.613940429687, + "learning_rate": 0.0005, + "loss": 3334.0184, + "step": 50 + }, + { + "ce_loss_12": 4.0183366417884825, + "ce_loss_17": 3.324011433124542, + "ce_loss_23": 2.9169072985649107, + "ce_loss_3": 5.5873085260391235, + "ce_loss_6": 5.030270624160766, + "epoch": 0.006, + "grad_norm": 14016.0, + "kl_loss_12": 2195.4134033203127, + "kl_loss_17": 842.8528381347656, + "kl_loss_3": 5173.573413085937, + "kl_loss_6": 4158.858227539063, + "learning_rate": 0.0006, + "loss": 3110.6227, + "step": 60 + }, + { + "ce_loss_12": 3.900798332691193, + "ce_loss_17": 3.257086682319641, + "ce_loss_23": 2.8356701850891115, + "ce_loss_3": 5.478265619277954, + "ce_loss_6": 4.897664165496826, + "epoch": 0.007, + "grad_norm": 6752.0, + "kl_loss_12": 2123.1200561523438, + "kl_loss_17": 865.8842193603516, + "kl_loss_3": 5140.0406494140625, + "kl_loss_6": 4031.4986206054687, + "learning_rate": 0.0007, + "loss": 3039.6166, + "step": 70 + }, + { + "ce_loss_12": 3.9320133566856383, + "ce_loss_17": 3.300851571559906, + "ce_loss_23": 2.8350846648216246, + "ce_loss_3": 5.4221717596054075, + "ce_loss_6": 4.767316746711731, + "epoch": 0.008, + "grad_norm": 3232.0, + "kl_loss_12": 2182.824816894531, + "kl_loss_17": 1022.9220184326172, + "kl_loss_3": 5036.355078125, + "kl_loss_6": 3783.0829833984376, + "learning_rate": 0.0008, + "loss": 3021.7758, + "step": 80 + }, + { + "ce_loss_12": 3.963316762447357, + "ce_loss_17": 3.374627947807312, + "ce_loss_23": 2.8013595581054687, + "ce_loss_3": 5.3971950769424435, + "ce_loss_6": 4.755392622947693, + "epoch": 0.009, + "grad_norm": 5504.0, + "kl_loss_12": 2318.512774658203, + "kl_loss_17": 1111.7156463623046, + "kl_loss_3": 5047.43779296875, + "kl_loss_6": 3864.2319091796876, + "learning_rate": 0.0009000000000000001, + "loss": 3077.7311, + "step": 90 + }, + { + "ce_loss_12": 4.037562215328217, + "ce_loss_17": 3.3241353154182436, + "ce_loss_23": 2.906688666343689, + "ce_loss_3": 5.441467499732971, + "ce_loss_6": 4.868034100532531, + "epoch": 0.01, + "grad_norm": 17408.0, + "kl_loss_12": 2229.615863037109, + "kl_loss_17": 832.681689453125, + "kl_loss_3": 4962.237451171875, + "kl_loss_6": 3902.222119140625, + "learning_rate": 0.001, + "loss": 2979.5012, + "step": 100 + }, + { + "ce_loss_12": 3.8746483087539674, + "ce_loss_17": 3.2309106945991517, + "ce_loss_23": 2.868969190120697, + "ce_loss_3": 5.371089196205139, + "ce_loss_6": 4.770741057395935, + "epoch": 0.011, + "grad_norm": 5696.0, + "kl_loss_12": 2015.6501770019531, + "kl_loss_17": 733.734262084961, + "kl_loss_3": 4902.950366210937, + "kl_loss_6": 3788.4685180664064, + "learning_rate": 0.0009999974825027757, + "loss": 2855.4787, + "step": 110 + }, + { + "ce_loss_12": 3.8332433819770815, + "ce_loss_17": 3.2423539996147155, + "ce_loss_23": 2.930523693561554, + "ce_loss_3": 5.298088216781617, + "ce_loss_6": 4.743260788917541, + "epoch": 0.012, + "grad_norm": 7712.0, + "kl_loss_12": 1837.9232055664063, + "kl_loss_17": 639.3664642333985, + "kl_loss_3": 4677.4095458984375, + "kl_loss_6": 3642.7612060546876, + "learning_rate": 0.0009999899300364532, + "loss": 2674.099, + "step": 120 + }, + { + "ce_loss_12": 3.7736268401145936, + "ce_loss_17": 3.2274181962013246, + "ce_loss_23": 2.889433944225311, + "ce_loss_3": 5.3178743600845335, + "ce_loss_6": 4.753137707710266, + "epoch": 0.013, + "grad_norm": 4800.0, + "kl_loss_12": 1765.98681640625, + "kl_loss_17": 678.0200164794921, + "kl_loss_3": 4744.243969726563, + "kl_loss_6": 3663.6985595703127, + "learning_rate": 0.0009999773426770863, + "loss": 2724.3738, + "step": 130 + }, + { + "ce_loss_12": 3.757179248332977, + "ce_loss_17": 3.2548238396644593, + "ce_loss_23": 2.9310826420783997, + "ce_loss_3": 5.288840341567993, + "ce_loss_6": 4.680767583847046, + "epoch": 0.014, + "grad_norm": 3344.0, + "kl_loss_12": 1663.0203979492187, + "kl_loss_17": 667.9883514404297, + "kl_loss_3": 4617.917041015625, + "kl_loss_6": 3446.7722778320312, + "learning_rate": 0.0009999597205514296, + "loss": 2622.384, + "step": 140 + }, + { + "ce_loss_12": 3.7178321361541746, + "ce_loss_17": 3.2042754173278807, + "ce_loss_23": 2.88481262922287, + "ce_loss_3": 5.233242201805115, + "ce_loss_6": 4.54782543182373, + "epoch": 0.015, + "grad_norm": 2336.0, + "kl_loss_12": 1673.2152893066407, + "kl_loss_17": 669.5892211914063, + "kl_loss_3": 4588.619311523437, + "kl_loss_6": 3283.3631958007813, + "learning_rate": 0.0009999370638369377, + "loss": 2563.6086, + "step": 150 + }, + { + "ce_loss_12": 3.8975033283233644, + "ce_loss_17": 3.259253513813019, + "ce_loss_23": 2.925470495223999, + "ce_loss_3": 5.237982249259948, + "ce_loss_6": 4.5195070028305055, + "epoch": 0.016, + "grad_norm": 2752.0, + "kl_loss_12": 1999.6217224121094, + "kl_loss_17": 702.8198181152344, + "kl_loss_3": 4536.748388671875, + "kl_loss_6": 3186.8176391601564, + "learning_rate": 0.000999909372761763, + "loss": 2599.7559, + "step": 160 + }, + { + "ce_loss_12": 3.9207254886627196, + "ce_loss_17": 3.227236843109131, + "ce_loss_23": 2.866022825241089, + "ce_loss_3": 5.2069617986679075, + "ce_loss_6": 4.487017941474915, + "epoch": 0.017, + "grad_norm": 2240.0, + "kl_loss_12": 2135.180523681641, + "kl_loss_17": 742.9576934814453, + "kl_loss_3": 4628.918359375, + "kl_loss_6": 3242.9795776367187, + "learning_rate": 0.0009998766476047546, + "loss": 2709.5734, + "step": 170 + }, + { + "ce_loss_12": 3.880035960674286, + "ce_loss_17": 3.2814144134521483, + "ce_loss_23": 2.898086893558502, + "ce_loss_3": 5.193981313705445, + "ce_loss_6": 4.545022559165955, + "epoch": 0.018, + "grad_norm": 3520.0, + "kl_loss_12": 2003.6199279785155, + "kl_loss_17": 761.2051483154297, + "kl_loss_3": 4523.928857421875, + "kl_loss_6": 3293.190075683594, + "learning_rate": 0.0009998388886954545, + "loss": 2655.2025, + "step": 180 + }, + { + "ce_loss_12": 3.7998528599739076, + "ce_loss_17": 3.2430348992347717, + "ce_loss_23": 2.873298764228821, + "ce_loss_3": 5.10157356262207, + "ce_loss_6": 4.506469011306763, + "epoch": 0.019, + "grad_norm": 2080.0, + "kl_loss_12": 1879.7686462402344, + "kl_loss_17": 774.2787292480468, + "kl_loss_3": 4365.284875488282, + "kl_loss_6": 3259.860400390625, + "learning_rate": 0.0009997960964140947, + "loss": 2557.5475, + "step": 190 + }, + { + "ce_loss_12": 3.7017858624458313, + "ce_loss_17": 3.2199748039245604, + "ce_loss_23": 2.8715609908103943, + "ce_loss_3": 5.05151731967926, + "ce_loss_6": 4.46097469329834, + "epoch": 0.02, + "grad_norm": 2008.0, + "kl_loss_12": 1728.4301513671876, + "kl_loss_17": 724.9210571289062, + "kl_loss_3": 4324.548291015625, + "kl_loss_6": 3188.89443359375, + "learning_rate": 0.0009997482711915926, + "loss": 2478.465, + "step": 200 + }, + { + "ce_loss_12": 3.627460551261902, + "ce_loss_17": 3.153350031375885, + "ce_loss_23": 2.8465389728546144, + "ce_loss_3": 4.983549165725708, + "ce_loss_6": 4.3947282314300535, + "epoch": 0.021, + "grad_norm": 1440.0, + "kl_loss_12": 1616.2701904296875, + "kl_loss_17": 631.39169921875, + "kl_loss_3": 4262.137780761719, + "kl_loss_6": 3138.6790161132812, + "learning_rate": 0.0009996954135095479, + "loss": 2409.6078, + "step": 210 + }, + { + "ce_loss_12": 3.645814502239227, + "ce_loss_17": 3.1951419949531554, + "ce_loss_23": 2.9083182334899904, + "ce_loss_3": 5.008230352401734, + "ce_loss_6": 4.436058068275452, + "epoch": 0.022, + "grad_norm": 1752.0, + "kl_loss_12": 1510.1014770507813, + "kl_loss_17": 580.6792602539062, + "kl_loss_3": 4156.8021240234375, + "kl_loss_6": 3070.6255737304687, + "learning_rate": 0.0009996375239002368, + "loss": 2331.6045, + "step": 220 + }, + { + "ce_loss_12": 3.7145918250083922, + "ce_loss_17": 3.2435343861579895, + "ce_loss_23": 2.9827067852020264, + "ce_loss_3": 5.028963327407837, + "ce_loss_6": 4.5062199354171755, + "epoch": 0.023, + "grad_norm": 2272.0, + "kl_loss_12": 1495.8587890625, + "kl_loss_17": 542.103482055664, + "kl_loss_3": 4070.92353515625, + "kl_loss_6": 3063.4745239257813, + "learning_rate": 0.0009995746029466072, + "loss": 2299.4912, + "step": 230 + }, + { + "ce_loss_12": 3.5051693081855775, + "ce_loss_17": 3.046588110923767, + "ce_loss_23": 2.790693646669388, + "ce_loss_3": 4.909505295753479, + "ce_loss_6": 4.321071481704712, + "epoch": 0.024, + "grad_norm": 2256.0, + "kl_loss_12": 1484.3840026855469, + "kl_loss_17": 527.0030975341797, + "kl_loss_3": 4215.655187988281, + "kl_loss_6": 3102.826525878906, + "learning_rate": 0.0009995066512822719, + "loss": 2265.1627, + "step": 240 + }, + { + "ce_loss_12": 3.604202592372894, + "ce_loss_17": 3.1324368119239807, + "ce_loss_23": 2.885883128643036, + "ce_loss_3": 5.031421732902527, + "ce_loss_6": 4.435573077201843, + "epoch": 0.025, + "grad_norm": 1168.0, + "kl_loss_12": 1476.4596740722657, + "kl_loss_17": 506.74576568603516, + "kl_loss_3": 4252.793518066406, + "kl_loss_6": 3113.574108886719, + "learning_rate": 0.000999433669591504, + "loss": 2256.191, + "step": 250 + }, + { + "ce_loss_12": 3.524767017364502, + "ce_loss_17": 3.0347627997398376, + "ce_loss_23": 2.791684591770172, + "ce_loss_3": 4.904810905456543, + "ce_loss_6": 4.284141218662262, + "epoch": 0.026, + "grad_norm": 1584.0, + "kl_loss_12": 1510.495831298828, + "kl_loss_17": 516.5738159179688, + "kl_loss_3": 4220.025524902344, + "kl_loss_6": 3024.763671875, + "learning_rate": 0.000999355658609228, + "loss": 2270.6297, + "step": 260 + }, + { + "ce_loss_12": 3.5572774767875672, + "ce_loss_17": 3.0783823490142823, + "ce_loss_23": 2.8128905057907105, + "ce_loss_3": 5.009525322914124, + "ce_loss_6": 4.3709392786026005, + "epoch": 0.027, + "grad_norm": 2288.0, + "kl_loss_12": 1488.0630615234375, + "kl_loss_17": 542.858137512207, + "kl_loss_3": 4309.648498535156, + "kl_loss_6": 3095.8380126953125, + "learning_rate": 0.0009992726191210138, + "loss": 2323.3471, + "step": 270 + }, + { + "ce_loss_12": 3.5629722476005554, + "ce_loss_17": 3.111860430240631, + "ce_loss_23": 2.8538637518882752, + "ce_loss_3": 4.906073951721192, + "ce_loss_6": 4.32850182056427, + "epoch": 0.028, + "grad_norm": 1696.0, + "kl_loss_12": 1444.6719482421875, + "kl_loss_17": 527.0346221923828, + "kl_loss_3": 4061.671533203125, + "kl_loss_6": 2958.3097534179688, + "learning_rate": 0.0009991845519630679, + "loss": 2233.8912, + "step": 280 + }, + { + "ce_loss_12": 3.4362335085868834, + "ce_loss_17": 2.9920791387557983, + "ce_loss_23": 2.7459678411483766, + "ce_loss_3": 4.801335573196411, + "ce_loss_6": 4.208979916572571, + "epoch": 0.029, + "grad_norm": 2128.0, + "kl_loss_12": 1418.6752685546876, + "kl_loss_17": 512.7696136474609, + "kl_loss_3": 4087.991943359375, + "kl_loss_6": 2944.8553955078123, + "learning_rate": 0.0009990914580222257, + "loss": 2234.8063, + "step": 290 + }, + { + "ce_loss_12": 3.550733494758606, + "ce_loss_17": 3.1482574582099914, + "ce_loss_23": 2.8788382768630982, + "ce_loss_3": 4.804766297340393, + "ce_loss_6": 4.249274849891663, + "epoch": 0.03, + "grad_norm": 1392.0, + "kl_loss_12": 1404.602276611328, + "kl_loss_17": 557.4966827392578, + "kl_loss_3": 3861.957177734375, + "kl_loss_6": 2783.5663940429686, + "learning_rate": 0.0009989933382359422, + "loss": 2194.6727, + "step": 300 + }, + { + "ce_loss_12": 3.540028524398804, + "ce_loss_17": 3.1266565561294555, + "ce_loss_23": 2.891022598743439, + "ce_loss_3": 4.818720889091492, + "ce_loss_6": 4.2541723370552065, + "epoch": 0.031, + "grad_norm": 1056.0, + "kl_loss_12": 1339.680322265625, + "kl_loss_17": 517.6963348388672, + "kl_loss_3": 3839.874609375, + "kl_loss_6": 2763.8606811523437, + "learning_rate": 0.0009988901935922825, + "loss": 2138.709, + "step": 310 + }, + { + "ce_loss_12": 3.4157469153404234, + "ce_loss_17": 2.9800336003303527, + "ce_loss_23": 2.7450334310531614, + "ce_loss_3": 4.779390811920166, + "ce_loss_6": 4.189510977268219, + "epoch": 0.032, + "grad_norm": 1032.0, + "kl_loss_12": 1387.2072631835938, + "kl_loss_17": 491.8215576171875, + "kl_loss_3": 4056.8338256835937, + "kl_loss_6": 2929.2824951171874, + "learning_rate": 0.0009987820251299122, + "loss": 2160.0242, + "step": 320 + }, + { + "ce_loss_12": 3.5009677052497863, + "ce_loss_17": 3.096353495121002, + "ce_loss_23": 2.861190211772919, + "ce_loss_3": 4.8097329378128055, + "ce_loss_6": 4.232736647129059, + "epoch": 0.033, + "grad_norm": 1376.0, + "kl_loss_12": 1327.8585388183594, + "kl_loss_17": 477.89680938720704, + "kl_loss_3": 3902.2681884765625, + "kl_loss_6": 2785.81064453125, + "learning_rate": 0.0009986688339380862, + "loss": 2112.4908, + "step": 330 + }, + { + "ce_loss_12": 3.4142756819725038, + "ce_loss_17": 3.0398844361305235, + "ce_loss_23": 2.8264341592788695, + "ce_loss_3": 4.700353384017944, + "ce_loss_6": 4.1305066585540775, + "epoch": 0.034, + "grad_norm": 1512.0, + "kl_loss_12": 1242.5486267089843, + "kl_loss_17": 461.6464080810547, + "kl_loss_3": 3757.1355224609374, + "kl_loss_6": 2665.6082275390627, + "learning_rate": 0.0009985506211566387, + "loss": 2053.3332, + "step": 340 + }, + { + "ce_loss_12": 3.43664208650589, + "ce_loss_17": 3.072474813461304, + "ce_loss_23": 2.850420904159546, + "ce_loss_3": 4.6930335521697994, + "ce_loss_6": 4.137925624847412, + "epoch": 0.035, + "grad_norm": 1192.0, + "kl_loss_12": 1218.1204467773437, + "kl_loss_17": 466.7091690063477, + "kl_loss_3": 3688.2295532226562, + "kl_loss_6": 2630.9764404296875, + "learning_rate": 0.0009984273879759713, + "loss": 2012.377, + "step": 350 + }, + { + "ce_loss_12": 3.4873570442199706, + "ce_loss_17": 3.1122740864753724, + "ce_loss_23": 2.8793745040893555, + "ce_loss_3": 4.758946943283081, + "ce_loss_6": 4.204195618629456, + "epoch": 0.036, + "grad_norm": 1368.0, + "kl_loss_12": 1268.138409423828, + "kl_loss_17": 487.7879470825195, + "kl_loss_3": 3747.4706909179686, + "kl_loss_6": 2682.5483276367186, + "learning_rate": 0.0009982991356370402, + "loss": 2068.6064, + "step": 360 + }, + { + "ce_loss_12": 3.446204948425293, + "ce_loss_17": 3.082586574554443, + "ce_loss_23": 2.8578404664993284, + "ce_loss_3": 4.716637182235718, + "ce_loss_6": 4.157662272453308, + "epoch": 0.037, + "grad_norm": 1936.0, + "kl_loss_12": 1237.1415466308595, + "kl_loss_17": 465.0269577026367, + "kl_loss_3": 3733.6424560546875, + "kl_loss_6": 2662.3063354492188, + "learning_rate": 0.0009981658654313456, + "loss": 2041.6695, + "step": 370 + }, + { + "ce_loss_12": 3.5023183941841127, + "ce_loss_17": 3.140884268283844, + "ce_loss_23": 2.924031972885132, + "ce_loss_3": 4.732712912559509, + "ce_loss_6": 4.188665843009948, + "epoch": 0.038, + "grad_norm": 1096.0, + "kl_loss_12": 1206.7100524902344, + "kl_loss_17": 459.29900512695315, + "kl_loss_3": 3623.5970947265623, + "kl_loss_6": 2586.8260009765627, + "learning_rate": 0.000998027578700917, + "loss": 2001.7613, + "step": 380 + }, + { + "ce_loss_12": 3.4590607047080995, + "ce_loss_17": 3.0973698616027834, + "ce_loss_23": 2.8757705688476562, + "ce_loss_3": 4.702216649055481, + "ce_loss_6": 4.169513952732086, + "epoch": 0.039, + "grad_norm": 1984.0, + "kl_loss_12": 1201.043963623047, + "kl_loss_17": 452.04960479736326, + "kl_loss_3": 3640.4640258789063, + "kl_loss_6": 2617.0949829101564, + "learning_rate": 0.0009978842768382998, + "loss": 2006.4363, + "step": 390 + }, + { + "ce_loss_12": 3.4306026816368105, + "ce_loss_17": 3.1000054597854616, + "ce_loss_23": 2.88355758190155, + "ce_loss_3": 4.647925066947937, + "ce_loss_6": 4.152370321750641, + "epoch": 0.04, + "grad_norm": 2400.0, + "kl_loss_12": 1145.0473388671876, + "kl_loss_17": 445.755876159668, + "kl_loss_3": 3536.4222412109375, + "kl_loss_6": 2587.5209228515623, + "learning_rate": 0.0009977359612865424, + "loss": 1947.2914, + "step": 400 + }, + { + "ce_loss_12": 3.448613131046295, + "ce_loss_17": 3.099381995201111, + "ce_loss_23": 2.891097903251648, + "ce_loss_3": 4.676835989952087, + "ce_loss_6": 4.156297981739044, + "epoch": 0.041, + "grad_norm": 1320.0, + "kl_loss_12": 1175.8864990234374, + "kl_loss_17": 440.0459350585937, + "kl_loss_3": 3581.45166015625, + "kl_loss_6": 2609.4977783203126, + "learning_rate": 0.0009975826335391806, + "loss": 1938.3453, + "step": 410 + }, + { + "ce_loss_12": 3.4343478679656982, + "ce_loss_17": 3.125094103813171, + "ce_loss_23": 2.910094475746155, + "ce_loss_3": 4.653401112556457, + "ce_loss_6": 4.147323703765869, + "epoch": 0.042, + "grad_norm": 1872.0, + "kl_loss_12": 1100.34775390625, + "kl_loss_17": 455.45897521972654, + "kl_loss_3": 3498.450341796875, + "kl_loss_6": 2518.9093017578125, + "learning_rate": 0.0009974242951402235, + "loss": 1903.6037, + "step": 420 + }, + { + "ce_loss_12": 3.462802863121033, + "ce_loss_17": 3.127783679962158, + "ce_loss_23": 2.914916479587555, + "ce_loss_3": 4.686368560791015, + "ce_loss_6": 4.148653864860535, + "epoch": 0.043, + "grad_norm": 1168.0, + "kl_loss_12": 1135.933984375, + "kl_loss_17": 439.07324981689453, + "kl_loss_3": 3567.4921997070314, + "kl_loss_6": 2534.5516723632813, + "learning_rate": 0.0009972609476841367, + "loss": 1900.857, + "step": 430 + }, + { + "ce_loss_12": 3.3758052229881286, + "ce_loss_17": 3.0453759908676146, + "ce_loss_23": 2.8323325395584105, + "ce_loss_3": 4.628264284133911, + "ce_loss_6": 4.12095662355423, + "epoch": 0.044, + "grad_norm": 1280.0, + "kl_loss_12": 1123.0216278076173, + "kl_loss_17": 451.2273483276367, + "kl_loss_3": 3576.0677978515623, + "kl_loss_6": 2601.298620605469, + "learning_rate": 0.0009970925928158272, + "loss": 1941.1031, + "step": 440 + }, + { + "ce_loss_12": 3.3380956411361695, + "ce_loss_17": 2.9893947720527647, + "ce_loss_23": 2.7863958835601808, + "ce_loss_3": 4.597060704231263, + "ce_loss_6": 4.063511395454407, + "epoch": 0.045, + "grad_norm": 1696.0, + "kl_loss_12": 1161.7735687255858, + "kl_loss_17": 422.92466888427737, + "kl_loss_3": 3636.7440551757813, + "kl_loss_6": 2627.9586547851563, + "learning_rate": 0.000996919232230627, + "loss": 1953.0043, + "step": 450 + }, + { + "ce_loss_12": 3.3880254507064818, + "ce_loss_17": 3.0471160888671873, + "ce_loss_23": 2.86956307888031, + "ce_loss_3": 4.579530930519104, + "ce_loss_6": 4.073746514320374, + "epoch": 0.046, + "grad_norm": 1528.0, + "kl_loss_12": 1115.3556823730469, + "kl_loss_17": 379.5940216064453, + "kl_loss_3": 3471.3382080078127, + "kl_loss_6": 2500.7124267578124, + "learning_rate": 0.0009967408676742752, + "loss": 1833.6008, + "step": 460 + }, + { + "ce_loss_12": 3.53208132982254, + "ce_loss_17": 3.1919715881347654, + "ce_loss_23": 3.0071330189704897, + "ce_loss_3": 4.685499620437622, + "ce_loss_6": 4.167628622055053, + "epoch": 0.047, + "grad_norm": 1048.0, + "kl_loss_12": 1116.2639068603517, + "kl_loss_17": 399.31142272949216, + "kl_loss_3": 3404.2772705078123, + "kl_loss_6": 2421.417102050781, + "learning_rate": 0.0009965575009429006, + "loss": 1885.0047, + "step": 470 + }, + { + "ce_loss_12": 3.331265389919281, + "ce_loss_17": 3.0240687012672423, + "ce_loss_23": 2.79015998840332, + "ce_loss_3": 4.576504421234131, + "ce_loss_6": 4.021812427043915, + "epoch": 0.048, + "grad_norm": 1152.0, + "kl_loss_12": 1125.945867919922, + "kl_loss_17": 471.67090759277346, + "kl_loss_3": 3577.5029541015624, + "kl_loss_6": 2510.0554321289064, + "learning_rate": 0.0009963691338830043, + "loss": 1901.8957, + "step": 480 + }, + { + "ce_loss_12": 3.4071529150009154, + "ce_loss_17": 3.0926716566085815, + "ce_loss_23": 2.8846765518188477, + "ce_loss_3": 4.632588601112365, + "ce_loss_6": 4.073854601383209, + "epoch": 0.049, + "grad_norm": 1088.0, + "kl_loss_12": 1115.415625, + "kl_loss_17": 448.1295852661133, + "kl_loss_3": 3538.219140625, + "kl_loss_6": 2462.3754272460938, + "learning_rate": 0.0009961757683914405, + "loss": 1878.659, + "step": 490 + }, + { + "ce_loss_12": 3.4084617495536804, + "ce_loss_17": 3.082959520816803, + "ce_loss_23": 2.8706990122795104, + "ce_loss_3": 4.597338938713074, + "ce_loss_6": 4.037699508666992, + "epoch": 0.05, + "grad_norm": 1336.0, + "kl_loss_12": 1137.8160369873046, + "kl_loss_17": 443.1447296142578, + "kl_loss_3": 3480.0999145507812, + "kl_loss_6": 2428.29423828125, + "learning_rate": 0.0009959774064153978, + "loss": 1888.4242, + "step": 500 + }, + { + "ce_loss_12": 3.381693124771118, + "ce_loss_17": 3.0774909257888794, + "ce_loss_23": 2.8901365995407104, + "ce_loss_3": 4.549492239952087, + "ce_loss_6": 4.027258086204529, + "epoch": 0.051, + "grad_norm": 1264.0, + "kl_loss_12": 1058.0331939697267, + "kl_loss_17": 389.85987243652346, + "kl_loss_3": 3354.3579956054687, + "kl_loss_6": 2342.414410400391, + "learning_rate": 0.0009957740499523787, + "loss": 1824.4158, + "step": 510 + }, + { + "ce_loss_12": 3.4225743889808653, + "ce_loss_17": 3.0770078897476196, + "ce_loss_23": 2.898508107662201, + "ce_loss_3": 4.558957409858704, + "ce_loss_6": 4.046568238735199, + "epoch": 0.052, + "grad_norm": 1056.0, + "kl_loss_12": 1102.2143676757812, + "kl_loss_17": 373.25570526123045, + "kl_loss_3": 3354.677380371094, + "kl_loss_6": 2388.7016357421876, + "learning_rate": 0.0009955657010501807, + "loss": 1812.9781, + "step": 520 + }, + { + "ce_loss_12": 3.418683922290802, + "ce_loss_17": 3.033629858493805, + "ce_loss_23": 2.855076479911804, + "ce_loss_3": 4.570754933357239, + "ce_loss_6": 4.044934415817261, + "epoch": 0.053, + "grad_norm": 1248.0, + "kl_loss_12": 1205.6635467529297, + "kl_loss_17": 381.54945373535156, + "kl_loss_3": 3456.4181884765626, + "kl_loss_6": 2436.295849609375, + "learning_rate": 0.000995352361806875, + "loss": 1855.0133, + "step": 530 + }, + { + "ce_loss_12": 3.487868809700012, + "ce_loss_17": 3.09113187789917, + "ce_loss_23": 2.896335208415985, + "ce_loss_3": 4.577032613754272, + "ce_loss_6": 4.054221868515015, + "epoch": 0.054, + "grad_norm": 1152.0, + "kl_loss_12": 1227.9361145019532, + "kl_loss_17": 399.3049880981445, + "kl_loss_3": 3395.7439453125, + "kl_loss_6": 2399.8954833984376, + "learning_rate": 0.0009951340343707852, + "loss": 1867.9363, + "step": 540 + }, + { + "ce_loss_12": 3.473950946331024, + "ce_loss_17": 3.1219199657440186, + "ce_loss_23": 2.942656922340393, + "ce_loss_3": 4.631032228469849, + "ce_loss_6": 4.126621842384338, + "epoch": 0.055, + "grad_norm": 984.0, + "kl_loss_12": 1125.1899505615233, + "kl_loss_17": 393.1251739501953, + "kl_loss_3": 3395.15849609375, + "kl_loss_6": 2425.3002075195313, + "learning_rate": 0.0009949107209404665, + "loss": 1852.8969, + "step": 550 + }, + { + "ce_loss_12": 3.3833325028419496, + "ce_loss_17": 3.039779543876648, + "ce_loss_23": 2.8641862154006956, + "ce_loss_3": 4.5303771734237674, + "ce_loss_6": 4.015872514247894, + "epoch": 0.056, + "grad_norm": 1064.0, + "kl_loss_12": 1090.2752197265625, + "kl_loss_17": 361.8307144165039, + "kl_loss_3": 3359.927282714844, + "kl_loss_6": 2393.180126953125, + "learning_rate": 0.0009946824237646824, + "loss": 1807.0324, + "step": 560 + }, + { + "ce_loss_12": 3.3463765382766724, + "ce_loss_17": 2.9906514286994934, + "ce_loss_23": 2.82177197933197, + "ce_loss_3": 4.515055251121521, + "ce_loss_6": 3.9852001786231996, + "epoch": 0.057, + "grad_norm": 1328.0, + "kl_loss_12": 1129.2081176757813, + "kl_loss_17": 368.39979248046876, + "kl_loss_3": 3436.320520019531, + "kl_loss_6": 2419.8519775390623, + "learning_rate": 0.0009944491451423828, + "loss": 1861.3609, + "step": 570 + }, + { + "ce_loss_12": 3.3455269932746887, + "ce_loss_17": 3.0017722129821776, + "ce_loss_23": 2.8181710720062254, + "ce_loss_3": 4.555279207229614, + "ce_loss_6": 4.013754498958588, + "epoch": 0.058, + "grad_norm": 1048.0, + "kl_loss_12": 1123.0898864746093, + "kl_loss_17": 386.96007080078124, + "kl_loss_3": 3515.637939453125, + "kl_loss_6": 2467.850341796875, + "learning_rate": 0.0009942108874226813, + "loss": 1835.1637, + "step": 580 + }, + { + "ce_loss_12": 3.4131882905960085, + "ce_loss_17": 3.0872496962547302, + "ce_loss_23": 2.9123022317886353, + "ce_loss_3": 4.562892317771912, + "ce_loss_6": 4.037652707099914, + "epoch": 0.059, + "grad_norm": 1312.0, + "kl_loss_12": 1063.407748413086, + "kl_loss_17": 377.90936889648435, + "kl_loss_3": 3328.068469238281, + "kl_loss_6": 2309.729833984375, + "learning_rate": 0.00099396765300483, + "loss": 1766.2221, + "step": 590 + }, + { + "ce_loss_12": 3.4159348487854, + "ce_loss_17": 3.0849705934524536, + "ce_loss_23": 2.896781253814697, + "ce_loss_3": 4.561087894439697, + "ce_loss_6": 4.038153064250946, + "epoch": 0.06, + "grad_norm": 1776.0, + "kl_loss_12": 1077.741293334961, + "kl_loss_17": 390.4696075439453, + "kl_loss_3": 3359.8049560546874, + "kl_loss_6": 2370.4330078125, + "learning_rate": 0.0009937194443381972, + "loss": 1799.4371, + "step": 600 + }, + { + "ce_loss_12": 3.4187211632728576, + "ce_loss_17": 3.101148545742035, + "ce_loss_23": 2.930239200592041, + "ce_loss_3": 4.549526739120483, + "ce_loss_6": 4.049935567378998, + "epoch": 0.061, + "grad_norm": 1112.0, + "kl_loss_12": 1040.6774597167969, + "kl_loss_17": 370.95189514160154, + "kl_loss_3": 3283.81728515625, + "kl_loss_6": 2311.2066345214844, + "learning_rate": 0.0009934662639222412, + "loss": 1785.4037, + "step": 610 + }, + { + "ce_loss_12": 3.3739498376846315, + "ce_loss_17": 3.073314607143402, + "ce_loss_23": 2.8866294622421265, + "ce_loss_3": 4.554197382926941, + "ce_loss_6": 4.047503459453583, + "epoch": 0.062, + "grad_norm": 1792.0, + "kl_loss_12": 1042.084783935547, + "kl_loss_17": 388.0509094238281, + "kl_loss_3": 3388.6704345703124, + "kl_loss_6": 2418.96044921875, + "learning_rate": 0.000993208114306486, + "loss": 1793.3209, + "step": 620 + }, + { + "ce_loss_12": 3.2972922563552856, + "ce_loss_17": 3.011891508102417, + "ce_loss_23": 2.8142054677009583, + "ce_loss_3": 4.5025928020477295, + "ce_loss_6": 3.984578573703766, + "epoch": 0.063, + "grad_norm": 1504.0, + "kl_loss_12": 1031.5344207763671, + "kl_loss_17": 420.5843566894531, + "kl_loss_3": 3421.248095703125, + "kl_loss_6": 2428.8518798828127, + "learning_rate": 0.0009929449980904952, + "loss": 1779.1174, + "step": 630 + }, + { + "ce_loss_12": 3.3357118010520934, + "ce_loss_17": 3.0490943431854247, + "ce_loss_23": 2.877616310119629, + "ce_loss_3": 4.507691192626953, + "ce_loss_6": 4.000881290435791, + "epoch": 0.064, + "grad_norm": 1424.0, + "kl_loss_12": 998.4864044189453, + "kl_loss_17": 375.07586975097655, + "kl_loss_3": 3321.5576049804686, + "kl_loss_6": 2344.2793823242187, + "learning_rate": 0.0009926769179238466, + "loss": 1743.0203, + "step": 640 + }, + { + "ce_loss_12": 3.3966179728507995, + "ce_loss_17": 3.089047574996948, + "ce_loss_23": 2.906193125247955, + "ce_loss_3": 4.535722994804383, + "ce_loss_6": 4.043430185317993, + "epoch": 0.065, + "grad_norm": 1088.0, + "kl_loss_12": 1048.7820037841798, + "kl_loss_17": 387.50828552246094, + "kl_loss_3": 3332.1998657226563, + "kl_loss_6": 2368.378546142578, + "learning_rate": 0.000992403876506104, + "loss": 1766.1207, + "step": 650 + }, + { + "ce_loss_12": 3.3129051446914675, + "ce_loss_17": 3.0166344165802004, + "ce_loss_23": 2.8484310030937197, + "ce_loss_3": 4.497570371627807, + "ce_loss_6": 3.9834340095520018, + "epoch": 0.066, + "grad_norm": 1392.0, + "kl_loss_12": 998.1759429931641, + "kl_loss_17": 359.26244659423827, + "kl_loss_3": 3335.43896484375, + "kl_loss_6": 2332.2134399414062, + "learning_rate": 0.0009921258765867918, + "loss": 1757.334, + "step": 660 + }, + { + "ce_loss_12": 3.2913843750953675, + "ce_loss_17": 2.983316922187805, + "ce_loss_23": 2.825131380558014, + "ce_loss_3": 4.487063193321228, + "ce_loss_6": 3.963192939758301, + "epoch": 0.067, + "grad_norm": 1656.0, + "kl_loss_12": 993.4193237304687, + "kl_loss_17": 340.4703704833984, + "kl_loss_3": 3401.8429321289063, + "kl_loss_6": 2374.357080078125, + "learning_rate": 0.0009918429209653662, + "loss": 1749.6797, + "step": 670 + }, + { + "ce_loss_12": 3.3256665229797364, + "ce_loss_17": 3.0258800268173216, + "ce_loss_23": 2.871187961101532, + "ce_loss_3": 4.512535119056702, + "ce_loss_6": 3.982617509365082, + "epoch": 0.068, + "grad_norm": 1040.0, + "kl_loss_12": 985.1989440917969, + "kl_loss_17": 332.79962768554685, + "kl_loss_3": 3343.0188232421874, + "kl_loss_6": 2328.2467041015625, + "learning_rate": 0.0009915550124911866, + "loss": 1706.5441, + "step": 680 + }, + { + "ce_loss_12": 3.3320120930671693, + "ce_loss_17": 3.0299292683601378, + "ce_loss_23": 2.8721617698669433, + "ce_loss_3": 4.484584331512451, + "ce_loss_6": 3.9666941165924072, + "epoch": 0.069, + "grad_norm": 1008.0, + "kl_loss_12": 984.1631896972656, + "kl_loss_17": 330.89317321777344, + "kl_loss_3": 3254.253063964844, + "kl_loss_6": 2269.092498779297, + "learning_rate": 0.0009912621540634887, + "loss": 1711.3982, + "step": 690 + }, + { + "ce_loss_12": 3.3575613617897035, + "ce_loss_17": 3.060102331638336, + "ce_loss_23": 2.914083182811737, + "ce_loss_3": 4.487043619155884, + "ce_loss_6": 3.976016843318939, + "epoch": 0.07, + "grad_norm": 892.0, + "kl_loss_12": 962.9395385742188, + "kl_loss_17": 315.06245880126954, + "kl_loss_3": 3207.0468017578123, + "kl_loss_6": 2221.482141113281, + "learning_rate": 0.0009909643486313534, + "loss": 1695.9398, + "step": 700 + }, + { + "ce_loss_12": 3.2921235084533693, + "ce_loss_17": 2.9624681949615477, + "ce_loss_23": 2.8101747274398803, + "ce_loss_3": 4.465417528152466, + "ce_loss_6": 3.9272570967674256, + "epoch": 0.071, + "grad_norm": 1088.0, + "kl_loss_12": 1008.410922241211, + "kl_loss_17": 328.5423217773438, + "kl_loss_3": 3353.696728515625, + "kl_loss_6": 2306.0830810546877, + "learning_rate": 0.000990661599193678, + "loss": 1778.1223, + "step": 710 + }, + { + "ce_loss_12": 3.376284325122833, + "ce_loss_17": 3.074607563018799, + "ce_loss_23": 2.9188767075538635, + "ce_loss_3": 4.520675468444824, + "ce_loss_6": 4.008211147785187, + "epoch": 0.072, + "grad_norm": 1160.0, + "kl_loss_12": 992.3700897216797, + "kl_loss_17": 324.57889251708986, + "kl_loss_3": 3247.5807495117188, + "kl_loss_6": 2259.2288513183594, + "learning_rate": 0.0009903539087991462, + "loss": 1706.6916, + "step": 720 + }, + { + "ce_loss_12": 3.3545514583587646, + "ce_loss_17": 3.051961660385132, + "ce_loss_23": 2.9055708765983583, + "ce_loss_3": 4.496706533432007, + "ce_loss_6": 3.9772397875785828, + "epoch": 0.073, + "grad_norm": 980.0, + "kl_loss_12": 994.7562225341796, + "kl_loss_17": 325.16717834472655, + "kl_loss_3": 3252.458361816406, + "kl_loss_6": 2251.217822265625, + "learning_rate": 0.0009900412805461966, + "loss": 1713.7863, + "step": 730 + }, + { + "ce_loss_12": 3.437267518043518, + "ce_loss_17": 3.1153113722801207, + "ce_loss_23": 2.967765748500824, + "ce_loss_3": 4.51895227432251, + "ce_loss_6": 4.015814542770386, + "epoch": 0.074, + "grad_norm": 988.0, + "kl_loss_12": 1007.8217681884765, + "kl_loss_17": 318.0323486328125, + "kl_loss_3": 3190.121484375, + "kl_loss_6": 2194.7572387695313, + "learning_rate": 0.0009897237175829927, + "loss": 1704.4227, + "step": 740 + }, + { + "ce_loss_12": 3.349493718147278, + "ce_loss_17": 3.016627752780914, + "ce_loss_23": 2.858274281024933, + "ce_loss_3": 4.469674086570739, + "ce_loss_6": 3.947020959854126, + "epoch": 0.075, + "grad_norm": 992.0, + "kl_loss_12": 1053.4417694091796, + "kl_loss_17": 327.71359100341795, + "kl_loss_3": 3290.5677490234375, + "kl_loss_6": 2277.1982666015624, + "learning_rate": 0.0009894012231073895, + "loss": 1720.0527, + "step": 750 + }, + { + "ce_loss_12": 3.381664311885834, + "ce_loss_17": 3.0567665100097656, + "ce_loss_23": 2.9069377303123476, + "ce_loss_3": 4.487093877792359, + "ce_loss_6": 3.981345546245575, + "epoch": 0.076, + "grad_norm": 1312.0, + "kl_loss_12": 1018.7256530761719, + "kl_loss_17": 327.3124313354492, + "kl_loss_3": 3220.332568359375, + "kl_loss_6": 2239.033062744141, + "learning_rate": 0.0009890738003669028, + "loss": 1719.7855, + "step": 760 + }, + { + "ce_loss_12": 3.3535516023635865, + "ce_loss_17": 3.0329953789711, + "ce_loss_23": 2.875113677978516, + "ce_loss_3": 4.51433572769165, + "ce_loss_6": 3.9803815722465514, + "epoch": 0.077, + "grad_norm": 1216.0, + "kl_loss_12": 1021.7845306396484, + "kl_loss_17": 333.3127502441406, + "kl_loss_3": 3351.674694824219, + "kl_loss_6": 2306.272119140625, + "learning_rate": 0.0009887414526586764, + "loss": 1712.3004, + "step": 770 + }, + { + "ce_loss_12": 3.3716891407966614, + "ce_loss_17": 3.0681586623191834, + "ce_loss_23": 2.9219251036643983, + "ce_loss_3": 4.514308929443359, + "ce_loss_6": 3.989739179611206, + "epoch": 0.078, + "grad_norm": 1056.0, + "kl_loss_12": 963.1607696533204, + "kl_loss_17": 313.2260986328125, + "kl_loss_3": 3232.1091186523436, + "kl_loss_6": 2218.2968627929686, + "learning_rate": 0.0009884041833294476, + "loss": 1653.7039, + "step": 780 + }, + { + "ce_loss_12": 3.372546339035034, + "ce_loss_17": 3.0722476720809935, + "ce_loss_23": 2.930769956111908, + "ce_loss_3": 4.489286208152771, + "ce_loss_6": 3.9699451208114622, + "epoch": 0.079, + "grad_norm": 1000.0, + "kl_loss_12": 954.4112121582032, + "kl_loss_17": 307.53686676025393, + "kl_loss_3": 3183.226110839844, + "kl_loss_6": 2160.666619873047, + "learning_rate": 0.000988061995775515, + "loss": 1698.0086, + "step": 790 + }, + { + "ce_loss_12": 3.3053468823432923, + "ce_loss_17": 3.0160653710365297, + "ce_loss_23": 2.8685967564582824, + "ce_loss_3": 4.424778723716736, + "ce_loss_6": 3.9103200674057006, + "epoch": 0.08, + "grad_norm": 1048.0, + "kl_loss_12": 953.6102935791016, + "kl_loss_17": 322.9908157348633, + "kl_loss_3": 3191.081994628906, + "kl_loss_6": 2195.573114013672, + "learning_rate": 0.0009877148934427035, + "loss": 1659.377, + "step": 800 + }, + { + "ce_loss_12": 3.336946439743042, + "ce_loss_17": 3.047144114971161, + "ce_loss_23": 2.903570628166199, + "ce_loss_3": 4.465107464790345, + "ce_loss_6": 3.957136833667755, + "epoch": 0.081, + "grad_norm": 984.0, + "kl_loss_12": 940.542431640625, + "kl_loss_17": 314.41667022705076, + "kl_loss_3": 3199.7871215820314, + "kl_loss_6": 2209.312353515625, + "learning_rate": 0.0009873628798263297, + "loss": 1646.1479, + "step": 810 + }, + { + "ce_loss_12": 3.2978206872940063, + "ce_loss_17": 3.0110609769821166, + "ce_loss_23": 2.866278886795044, + "ce_loss_3": 4.406501245498657, + "ce_loss_6": 3.889325964450836, + "epoch": 0.082, + "grad_norm": 1000.0, + "kl_loss_12": 928.1305328369141, + "kl_loss_17": 310.1401123046875, + "kl_loss_3": 3130.672497558594, + "kl_loss_6": 2145.510949707031, + "learning_rate": 0.0009870059584711668, + "loss": 1672.1705, + "step": 820 + }, + { + "ce_loss_12": 3.3061461210250855, + "ce_loss_17": 3.0198216795921327, + "ce_loss_23": 2.8694707274436952, + "ce_loss_3": 4.416663718223572, + "ce_loss_6": 3.9168904542922975, + "epoch": 0.083, + "grad_norm": 908.0, + "kl_loss_12": 935.2707489013671, + "kl_loss_17": 329.5289474487305, + "kl_loss_3": 3150.615368652344, + "kl_loss_6": 2179.1330505371093, + "learning_rate": 0.000986644132971409, + "loss": 1649.4949, + "step": 830 + }, + { + "ce_loss_12": 3.3155024528503416, + "ce_loss_17": 3.0179206132888794, + "ce_loss_23": 2.858084261417389, + "ce_loss_3": 4.44396892786026, + "ce_loss_6": 3.9450360417366026, + "epoch": 0.084, + "grad_norm": 1040.0, + "kl_loss_12": 965.4269836425781, + "kl_loss_17": 333.64192352294924, + "kl_loss_3": 3197.687365722656, + "kl_loss_6": 2229.4203979492186, + "learning_rate": 0.0009862774069706345, + "loss": 1665.3748, + "step": 840 + }, + { + "ce_loss_12": 3.418204295635223, + "ce_loss_17": 3.1322174668312073, + "ce_loss_23": 2.986786162853241, + "ce_loss_3": 4.4888609647750854, + "ce_loss_6": 3.9892574667930605, + "epoch": 0.085, + "grad_norm": 992.0, + "kl_loss_12": 946.3281097412109, + "kl_loss_17": 313.324609375, + "kl_loss_3": 3086.1066284179688, + "kl_loss_6": 2126.8193298339843, + "learning_rate": 0.000985905784161771, + "loss": 1636.6967, + "step": 850 + }, + { + "ce_loss_12": 3.345979356765747, + "ce_loss_17": 3.052992057800293, + "ce_loss_23": 2.9130621552467346, + "ce_loss_3": 4.428451132774353, + "ce_loss_6": 3.9349741697311402, + "epoch": 0.086, + "grad_norm": 956.0, + "kl_loss_12": 954.7295227050781, + "kl_loss_17": 311.8019790649414, + "kl_loss_3": 3119.7036987304687, + "kl_loss_6": 2163.293731689453, + "learning_rate": 0.000985529268287055, + "loss": 1630.4893, + "step": 860 + }, + { + "ce_loss_12": 3.2938442468643188, + "ce_loss_17": 2.9937807083129884, + "ce_loss_23": 2.849967968463898, + "ce_loss_3": 4.411960792541504, + "ce_loss_6": 3.896759867668152, + "epoch": 0.087, + "grad_norm": 1112.0, + "kl_loss_12": 960.398648071289, + "kl_loss_17": 313.8326171875, + "kl_loss_3": 3182.4353515625, + "kl_loss_6": 2195.82099609375, + "learning_rate": 0.0009851478631379982, + "loss": 1667.1393, + "step": 870 + }, + { + "ce_loss_12": 3.352380836009979, + "ce_loss_17": 3.0529614686965942, + "ce_loss_23": 2.9062195897102354, + "ce_loss_3": 4.443659949302673, + "ce_loss_6": 3.934219491481781, + "epoch": 0.088, + "grad_norm": 1024.0, + "kl_loss_12": 953.0646362304688, + "kl_loss_17": 311.8681335449219, + "kl_loss_3": 3139.9689819335936, + "kl_loss_6": 2151.7400024414064, + "learning_rate": 0.0009847615725553456, + "loss": 1641.743, + "step": 880 + }, + { + "ce_loss_12": 3.369493007659912, + "ce_loss_17": 3.087172031402588, + "ce_loss_23": 2.955445909500122, + "ce_loss_3": 4.406086874008179, + "ce_loss_6": 3.927104687690735, + "epoch": 0.089, + "grad_norm": 1112.0, + "kl_loss_12": 909.7544189453125, + "kl_loss_17": 285.7761688232422, + "kl_loss_3": 2972.541784667969, + "kl_loss_6": 2051.965185546875, + "learning_rate": 0.0009843704004290394, + "loss": 1613.0483, + "step": 890 + }, + { + "ce_loss_12": 3.3007118344306945, + "ce_loss_17": 3.0091582536697388, + "ce_loss_23": 2.865347218513489, + "ce_loss_3": 4.389851522445679, + "ce_loss_6": 3.887819600105286, + "epoch": 0.09, + "grad_norm": 1184.0, + "kl_loss_12": 942.951953125, + "kl_loss_17": 302.5071563720703, + "kl_loss_3": 3139.5479125976562, + "kl_loss_6": 2154.337860107422, + "learning_rate": 0.0009839743506981783, + "loss": 1628.214, + "step": 900 + }, + { + "ce_loss_12": 3.244469094276428, + "ce_loss_17": 2.9393270134925844, + "ce_loss_23": 2.793620991706848, + "ce_loss_3": 4.385366058349609, + "ce_loss_6": 3.870071280002594, + "epoch": 0.091, + "grad_norm": 972.0, + "kl_loss_12": 964.6631439208984, + "kl_loss_17": 310.6958312988281, + "kl_loss_3": 3252.1250854492187, + "kl_loss_6": 2248.5163696289064, + "learning_rate": 0.0009835734273509786, + "loss": 1661.4332, + "step": 910 + }, + { + "ce_loss_12": 3.3276159048080443, + "ce_loss_17": 3.0265863656997682, + "ce_loss_23": 2.879580080509186, + "ce_loss_3": 4.428731679916382, + "ce_loss_6": 3.912135696411133, + "epoch": 0.092, + "grad_norm": 972.0, + "kl_loss_12": 934.3377227783203, + "kl_loss_17": 306.7847213745117, + "kl_loss_3": 3130.859033203125, + "kl_loss_6": 2128.042761230469, + "learning_rate": 0.0009831676344247342, + "loss": 1625.7849, + "step": 920 + }, + { + "ce_loss_12": 3.325820744037628, + "ce_loss_17": 3.038645386695862, + "ce_loss_23": 2.905313861370087, + "ce_loss_3": 4.380109333992005, + "ce_loss_6": 3.8928969264030457, + "epoch": 0.093, + "grad_norm": 880.0, + "kl_loss_12": 920.8738159179687, + "kl_loss_17": 293.66180419921875, + "kl_loss_3": 3038.493615722656, + "kl_loss_6": 2091.2300720214844, + "learning_rate": 0.0009827569760057755, + "loss": 1614.7129, + "step": 930 + }, + { + "ce_loss_12": 3.283223259449005, + "ce_loss_17": 2.974070966243744, + "ce_loss_23": 2.8231024861335756, + "ce_loss_3": 4.420139646530151, + "ce_loss_6": 3.896502125263214, + "epoch": 0.094, + "grad_norm": 1048.0, + "kl_loss_12": 967.0422973632812, + "kl_loss_17": 315.1331161499023, + "kl_loss_3": 3246.8406616210937, + "kl_loss_6": 2233.370959472656, + "learning_rate": 0.000982341456229428, + "loss": 1639.7586, + "step": 940 + }, + { + "ce_loss_12": 3.3469450950622557, + "ce_loss_17": 3.0623409390449523, + "ce_loss_23": 2.9173637866973876, + "ce_loss_3": 4.45803382396698, + "ce_loss_6": 3.952650713920593, + "epoch": 0.095, + "grad_norm": 1112.0, + "kl_loss_12": 949.4091888427735, + "kl_loss_17": 315.292790222168, + "kl_loss_3": 3171.674108886719, + "kl_loss_6": 2186.0895812988283, + "learning_rate": 0.000981921079279971, + "loss": 1616.1184, + "step": 950 + }, + { + "ce_loss_12": 3.320824933052063, + "ce_loss_17": 3.0578987121582033, + "ce_loss_23": 2.9246861934661865, + "ce_loss_3": 4.367957544326782, + "ce_loss_6": 3.8789355397224425, + "epoch": 0.096, + "grad_norm": 1264.0, + "kl_loss_12": 884.3493804931641, + "kl_loss_17": 306.3426567077637, + "kl_loss_3": 3007.1193725585936, + "kl_loss_6": 2051.980364990234, + "learning_rate": 0.0009814958493905962, + "loss": 1580.1495, + "step": 960 + }, + { + "ce_loss_12": 3.303164303302765, + "ce_loss_17": 3.0266597151756285, + "ce_loss_23": 2.884927237033844, + "ce_loss_3": 4.41832115650177, + "ce_loss_6": 3.909832787513733, + "epoch": 0.097, + "grad_norm": 1024.0, + "kl_loss_12": 904.8543029785156, + "kl_loss_17": 307.96812438964844, + "kl_loss_3": 3112.451696777344, + "kl_loss_6": 2136.4014526367187, + "learning_rate": 0.0009810657708433637, + "loss": 1641.5656, + "step": 970 + }, + { + "ce_loss_12": 3.357134532928467, + "ce_loss_17": 3.109633040428162, + "ce_loss_23": 2.957169032096863, + "ce_loss_3": 4.392407011985779, + "ce_loss_6": 3.9122292041778564, + "epoch": 0.098, + "grad_norm": 1112.0, + "kl_loss_12": 875.98427734375, + "kl_loss_17": 322.0378616333008, + "kl_loss_3": 2949.5711303710937, + "kl_loss_6": 2031.9008850097657, + "learning_rate": 0.0009806308479691594, + "loss": 1564.0409, + "step": 980 + }, + { + "ce_loss_12": 3.391128623485565, + "ce_loss_17": 3.1226599335670473, + "ce_loss_23": 2.95620356798172, + "ce_loss_3": 4.4521146535873415, + "ce_loss_6": 3.9637447357177735, + "epoch": 0.099, + "grad_norm": 1048.0, + "kl_loss_12": 929.8868896484375, + "kl_loss_17": 364.4446853637695, + "kl_loss_3": 3061.3019775390626, + "kl_loss_6": 2113.771942138672, + "learning_rate": 0.0009801910851476522, + "loss": 1610.7443, + "step": 990 + }, + { + "ce_loss_12": 3.3127102971076967, + "ce_loss_17": 3.0483265042304994, + "ce_loss_23": 2.8895352125167846, + "ce_loss_3": 4.4247009515762326, + "ce_loss_6": 3.9209498167037964, + "epoch": 0.1, + "grad_norm": 956.0, + "kl_loss_12": 926.8501525878906, + "kl_loss_17": 344.10108642578126, + "kl_loss_3": 3181.64150390625, + "kl_loss_6": 2189.818463134766, + "learning_rate": 0.0009797464868072487, + "loss": 1626.2685, + "step": 1000 + }, + { + "ce_loss_12": 3.2965635657310486, + "ce_loss_17": 3.0399394392967225, + "ce_loss_23": 2.8819270968437194, + "ce_loss_3": 4.386479806900025, + "ce_loss_6": 3.8870999932289125, + "epoch": 0.101, + "grad_norm": 1248.0, + "kl_loss_12": 923.0501098632812, + "kl_loss_17": 338.95404663085935, + "kl_loss_3": 3102.4775268554686, + "kl_loss_6": 2139.943634033203, + "learning_rate": 0.0009792970574250492, + "loss": 1619.6861, + "step": 1010 + }, + { + "ce_loss_12": 3.3044546008110047, + "ce_loss_17": 3.0502540946006773, + "ce_loss_23": 2.8980908036231994, + "ce_loss_3": 4.38866947889328, + "ce_loss_6": 3.888150417804718, + "epoch": 0.102, + "grad_norm": 1208.0, + "kl_loss_12": 893.213345336914, + "kl_loss_17": 324.3102798461914, + "kl_loss_3": 3070.095068359375, + "kl_loss_6": 2099.203436279297, + "learning_rate": 0.0009788428015268028, + "loss": 1573.6766, + "step": 1020 + }, + { + "ce_loss_12": 3.2988733649253845, + "ce_loss_17": 3.0434825658798217, + "ce_loss_23": 2.902178335189819, + "ce_loss_3": 4.363534331321716, + "ce_loss_6": 3.8749521732330323, + "epoch": 0.103, + "grad_norm": 1184.0, + "kl_loss_12": 876.8101196289062, + "kl_loss_17": 305.72301330566404, + "kl_loss_3": 3020.4602905273437, + "kl_loss_6": 2069.8065979003904, + "learning_rate": 0.0009783837236868609, + "loss": 1566.3812, + "step": 1030 + }, + { + "ce_loss_12": 3.2709617257118224, + "ce_loss_17": 3.008937418460846, + "ce_loss_23": 2.8611239790916443, + "ce_loss_3": 4.3386149406433105, + "ce_loss_6": 3.858043742179871, + "epoch": 0.104, + "grad_norm": 1224.0, + "kl_loss_12": 890.7757629394531, + "kl_loss_17": 319.31165924072263, + "kl_loss_3": 3030.126110839844, + "kl_loss_6": 2079.913439941406, + "learning_rate": 0.0009779198285281327, + "loss": 1570.5824, + "step": 1040 + }, + { + "ce_loss_12": 3.2658578515052796, + "ce_loss_17": 3.006276023387909, + "ce_loss_23": 2.8662564635276793, + "ce_loss_3": 4.363893914222717, + "ce_loss_6": 3.867250108718872, + "epoch": 0.105, + "grad_norm": 1168.0, + "kl_loss_12": 873.5543426513672, + "kl_loss_17": 301.8095642089844, + "kl_loss_3": 3073.273645019531, + "kl_loss_6": 2108.3940185546876, + "learning_rate": 0.0009774511207220368, + "loss": 1584.198, + "step": 1050 + }, + { + "ce_loss_12": 3.316801738739014, + "ce_loss_17": 3.045045328140259, + "ce_loss_23": 2.907749652862549, + "ce_loss_3": 4.407438325881958, + "ce_loss_6": 3.9095324158668516, + "epoch": 0.106, + "grad_norm": 864.0, + "kl_loss_12": 892.4073059082032, + "kl_loss_17": 299.16715240478516, + "kl_loss_3": 3074.142614746094, + "kl_loss_6": 2108.910284423828, + "learning_rate": 0.0009769776049884564, + "loss": 1581.9543, + "step": 1060 + }, + { + "ce_loss_12": 3.24352912902832, + "ce_loss_17": 2.9564419984817505, + "ce_loss_23": 2.8230312943458555, + "ce_loss_3": 4.339093565940857, + "ce_loss_6": 3.8406063437461855, + "epoch": 0.107, + "grad_norm": 1688.0, + "kl_loss_12": 919.3368377685547, + "kl_loss_17": 291.7588394165039, + "kl_loss_3": 3120.2006225585938, + "kl_loss_6": 2151.6788024902344, + "learning_rate": 0.0009764992860956889, + "loss": 1627.9674, + "step": 1070 + }, + { + "ce_loss_12": 3.3712757229804993, + "ce_loss_17": 3.0845303654670717, + "ce_loss_23": 2.9652393460273743, + "ce_loss_3": 4.350837230682373, + "ce_loss_6": 3.8884527921676635, + "epoch": 0.108, + "grad_norm": 1152.0, + "kl_loss_12": 932.8914611816406, + "kl_loss_17": 269.56608963012695, + "kl_loss_3": 2872.27275390625, + "kl_loss_6": 1973.182550048828, + "learning_rate": 0.0009760161688604008, + "loss": 1542.0248, + "step": 1080 + }, + { + "ce_loss_12": 3.4055436849594116, + "ce_loss_17": 3.088821458816528, + "ce_loss_23": 2.960082447528839, + "ce_loss_3": 4.399594736099243, + "ce_loss_6": 3.9329787254333497, + "epoch": 0.109, + "grad_norm": 1200.0, + "kl_loss_12": 965.5884552001953, + "kl_loss_17": 278.2935195922852, + "kl_loss_3": 2961.8168212890623, + "kl_loss_6": 2049.4507080078124, + "learning_rate": 0.0009755282581475768, + "loss": 1577.8215, + "step": 1090 + }, + { + "ce_loss_12": 3.453993630409241, + "ce_loss_17": 3.1275503635406494, + "ce_loss_23": 2.9921347975730894, + "ce_loss_3": 4.4342069864273075, + "ce_loss_6": 3.9476578712463377, + "epoch": 0.11, + "grad_norm": 960.0, + "kl_loss_12": 991.3485321044922, + "kl_loss_17": 288.25440673828126, + "kl_loss_3": 2963.675, + "kl_loss_6": 2015.0789428710937, + "learning_rate": 0.0009750355588704727, + "loss": 1552.8531, + "step": 1100 + }, + { + "ce_loss_12": 3.297532868385315, + "ce_loss_17": 2.987328219413757, + "ce_loss_23": 2.858099091053009, + "ce_loss_3": 4.321656811237335, + "ce_loss_6": 3.83161541223526, + "epoch": 0.111, + "grad_norm": 1040.0, + "kl_loss_12": 948.488314819336, + "kl_loss_17": 280.5028388977051, + "kl_loss_3": 3007.969384765625, + "kl_loss_6": 2057.8111572265625, + "learning_rate": 0.0009745380759905647, + "loss": 1595.1603, + "step": 1110 + }, + { + "ce_loss_12": 3.2402580618858337, + "ce_loss_17": 2.9434482932090758, + "ce_loss_23": 2.816891813278198, + "ce_loss_3": 4.288587641716004, + "ce_loss_6": 3.8150289416313172, + "epoch": 0.112, + "grad_norm": 1144.0, + "kl_loss_12": 923.8688537597657, + "kl_loss_17": 279.68654861450193, + "kl_loss_3": 3030.0865844726563, + "kl_loss_6": 2093.9303771972654, + "learning_rate": 0.0009740358145174998, + "loss": 1661.5004, + "step": 1120 + }, + { + "ce_loss_12": 3.3753363370895384, + "ce_loss_17": 3.0815247654914857, + "ce_loss_23": 2.9522975087165833, + "ce_loss_3": 4.34820659160614, + "ce_loss_6": 3.8800788521766663, + "epoch": 0.113, + "grad_norm": 1280.0, + "kl_loss_12": 927.2453460693359, + "kl_loss_17": 288.8904830932617, + "kl_loss_3": 2915.881066894531, + "kl_loss_6": 2001.3101745605468, + "learning_rate": 0.0009735287795090455, + "loss": 1554.5693, + "step": 1130 + }, + { + "ce_loss_12": 3.277189326286316, + "ce_loss_17": 2.984852302074432, + "ce_loss_23": 2.8566457867622375, + "ce_loss_3": 4.314880430698395, + "ce_loss_6": 3.826177978515625, + "epoch": 0.114, + "grad_norm": 1088.0, + "kl_loss_12": 923.1986511230468, + "kl_loss_17": 283.3469985961914, + "kl_loss_3": 2998.0419677734376, + "kl_loss_6": 2062.0610595703124, + "learning_rate": 0.0009730169760710386, + "loss": 1559.3103, + "step": 1140 + }, + { + "ce_loss_12": 3.3268452405929567, + "ce_loss_17": 3.044426202774048, + "ce_loss_23": 2.918807125091553, + "ce_loss_3": 4.364259016513825, + "ce_loss_6": 3.880498945713043, + "epoch": 0.115, + "grad_norm": 1552.0, + "kl_loss_12": 890.0449462890625, + "kl_loss_17": 276.7792999267578, + "kl_loss_3": 2952.4790893554687, + "kl_loss_6": 2009.8786682128907, + "learning_rate": 0.0009725004093573342, + "loss": 1554.0646, + "step": 1150 + }, + { + "ce_loss_12": 3.2966678977012633, + "ce_loss_17": 3.009952485561371, + "ce_loss_23": 2.878467881679535, + "ce_loss_3": 4.3358853459358215, + "ce_loss_6": 3.8439332962036135, + "epoch": 0.116, + "grad_norm": 872.0, + "kl_loss_12": 886.3811218261719, + "kl_loss_17": 278.020223236084, + "kl_loss_3": 2973.4177978515627, + "kl_loss_6": 2014.2760498046875, + "learning_rate": 0.0009719790845697534, + "loss": 1527.7352, + "step": 1160 + }, + { + "ce_loss_12": 3.2209246873855593, + "ce_loss_17": 2.964114212989807, + "ce_loss_23": 2.8400999188423155, + "ce_loss_3": 4.245721232891083, + "ce_loss_6": 3.7645427107810976, + "epoch": 0.117, + "grad_norm": 1304.0, + "kl_loss_12": 834.8282470703125, + "kl_loss_17": 265.4806785583496, + "kl_loss_3": 2893.5482788085938, + "kl_loss_6": 1949.34755859375, + "learning_rate": 0.0009714530069580309, + "loss": 1496.4959, + "step": 1170 + }, + { + "ce_loss_12": 3.3160659790039064, + "ce_loss_17": 3.0417101860046385, + "ce_loss_23": 2.9130223631858825, + "ce_loss_3": 4.369610404968261, + "ce_loss_6": 3.8820913434028625, + "epoch": 0.118, + "grad_norm": 1328.0, + "kl_loss_12": 882.3399963378906, + "kl_loss_17": 283.6298973083496, + "kl_loss_3": 2986.9559936523438, + "kl_loss_6": 2038.928778076172, + "learning_rate": 0.0009709221818197624, + "loss": 1533.3811, + "step": 1180 + }, + { + "ce_loss_12": 3.3617777824401855, + "ce_loss_17": 3.090414488315582, + "ce_loss_23": 2.96088707447052, + "ce_loss_3": 4.399712181091308, + "ce_loss_6": 3.9232612371444704, + "epoch": 0.119, + "grad_norm": 924.0, + "kl_loss_12": 879.8941070556641, + "kl_loss_17": 276.8961006164551, + "kl_loss_3": 2980.789501953125, + "kl_loss_6": 2052.8765197753905, + "learning_rate": 0.0009703866145003512, + "loss": 1544.3242, + "step": 1190 + }, + { + "ce_loss_12": 3.321975600719452, + "ce_loss_17": 3.054083788394928, + "ce_loss_23": 2.930968141555786, + "ce_loss_3": 4.354817938804627, + "ce_loss_6": 3.874580407142639, + "epoch": 0.12, + "grad_norm": 1104.0, + "kl_loss_12": 864.2863861083985, + "kl_loss_17": 270.091845703125, + "kl_loss_3": 2934.7477294921873, + "kl_loss_6": 2007.5105346679688, + "learning_rate": 0.0009698463103929542, + "loss": 1543.8357, + "step": 1200 + }, + { + "ce_loss_12": 3.2936753749847414, + "ce_loss_17": 3.019405686855316, + "ce_loss_23": 2.894251120090485, + "ce_loss_3": 4.344498944282532, + "ce_loss_6": 3.8676336646080016, + "epoch": 0.121, + "grad_norm": 1248.0, + "kl_loss_12": 886.2678466796875, + "kl_loss_17": 277.1609245300293, + "kl_loss_3": 2976.411096191406, + "kl_loss_6": 2053.3679931640627, + "learning_rate": 0.0009693012749384279, + "loss": 1554.7787, + "step": 1210 + }, + { + "ce_loss_12": 3.2914728999137877, + "ce_loss_17": 3.022671139240265, + "ce_loss_23": 2.898689293861389, + "ce_loss_3": 4.314063239097595, + "ce_loss_6": 3.8446247458457945, + "epoch": 0.122, + "grad_norm": 1088.0, + "kl_loss_12": 864.2732177734375, + "kl_loss_17": 271.4930358886719, + "kl_loss_3": 2932.395886230469, + "kl_loss_6": 2020.443914794922, + "learning_rate": 0.0009687515136252732, + "loss": 1516.3835, + "step": 1220 + }, + { + "ce_loss_12": 3.2560503005981447, + "ce_loss_17": 2.9858575463294983, + "ce_loss_23": 2.862755537033081, + "ce_loss_3": 4.357694816589356, + "ce_loss_6": 3.8565963745117187, + "epoch": 0.123, + "grad_norm": 980.0, + "kl_loss_12": 858.8930755615235, + "kl_loss_17": 265.92589340209963, + "kl_loss_3": 3066.5052001953127, + "kl_loss_6": 2076.885809326172, + "learning_rate": 0.0009681970319895803, + "loss": 1597.2988, + "step": 1230 + }, + { + "ce_loss_12": 3.3237783312797546, + "ce_loss_17": 3.0648343682289125, + "ce_loss_23": 2.946179783344269, + "ce_loss_3": 4.364671421051026, + "ce_loss_6": 3.8876635074615478, + "epoch": 0.124, + "grad_norm": 1032.0, + "kl_loss_12": 847.8623840332032, + "kl_loss_17": 261.37926483154297, + "kl_loss_3": 2940.520544433594, + "kl_loss_6": 1995.1516235351562, + "learning_rate": 0.0009676378356149733, + "loss": 1508.2539, + "step": 1240 + }, + { + "ce_loss_12": 3.2792726516723634, + "ce_loss_17": 3.026242733001709, + "ce_loss_23": 2.912305271625519, + "ce_loss_3": 4.320637655258179, + "ce_loss_6": 3.8262179017066957, + "epoch": 0.125, + "grad_norm": 1096.0, + "kl_loss_12": 820.703564453125, + "kl_loss_17": 256.4162796020508, + "kl_loss_3": 2895.8089965820313, + "kl_loss_6": 1947.9052673339843, + "learning_rate": 0.0009670739301325534, + "loss": 1494.4266, + "step": 1250 + }, + { + "ce_loss_12": 3.2621443033218385, + "ce_loss_17": 2.9985108733177186, + "ce_loss_23": 2.873834729194641, + "ce_loss_3": 4.284839725494384, + "ce_loss_6": 3.802198255062103, + "epoch": 0.126, + "grad_norm": 916.0, + "kl_loss_12": 842.8930450439453, + "kl_loss_17": 275.43683166503905, + "kl_loss_3": 2915.22626953125, + "kl_loss_6": 1975.2683044433593, + "learning_rate": 0.0009665053212208426, + "loss": 1521.6795, + "step": 1260 + }, + { + "ce_loss_12": 3.287890446186066, + "ce_loss_17": 3.0372009754180906, + "ce_loss_23": 2.9077538013458253, + "ce_loss_3": 4.342221176624298, + "ce_loss_6": 3.8497675180435182, + "epoch": 0.127, + "grad_norm": 1040.0, + "kl_loss_12": 853.9959564208984, + "kl_loss_17": 286.59808654785155, + "kl_loss_3": 2974.9799682617186, + "kl_loss_6": 2006.1366577148438, + "learning_rate": 0.0009659320146057262, + "loss": 1520.3227, + "step": 1270 + }, + { + "ce_loss_12": 3.3012346744537355, + "ce_loss_17": 3.0512335300445557, + "ce_loss_23": 2.9305968999862673, + "ce_loss_3": 4.324515700340271, + "ce_loss_6": 3.852405047416687, + "epoch": 0.128, + "grad_norm": 1064.0, + "kl_loss_12": 823.8801452636719, + "kl_loss_17": 269.6768173217773, + "kl_loss_3": 2893.6376831054686, + "kl_loss_6": 1971.940008544922, + "learning_rate": 0.0009653540160603955, + "loss": 1493.1244, + "step": 1280 + }, + { + "ce_loss_12": 3.2927725553512572, + "ce_loss_17": 3.045177149772644, + "ce_loss_23": 2.931023132801056, + "ce_loss_3": 4.314601492881775, + "ce_loss_6": 3.8426471829414366, + "epoch": 0.129, + "grad_norm": 1216.0, + "kl_loss_12": 820.3086090087891, + "kl_loss_17": 259.5847450256348, + "kl_loss_3": 2895.2479614257813, + "kl_loss_6": 1956.9156921386718, + "learning_rate": 0.0009647713314052896, + "loss": 1476.1674, + "step": 1290 + }, + { + "ce_loss_12": 3.2684458255767823, + "ce_loss_17": 2.9975777506828307, + "ce_loss_23": 2.8703335881233216, + "ce_loss_3": 4.3339741945266725, + "ce_loss_6": 3.859717035293579, + "epoch": 0.13, + "grad_norm": 972.0, + "kl_loss_12": 857.9473876953125, + "kl_loss_17": 273.35472869873047, + "kl_loss_3": 3001.6898559570313, + "kl_loss_6": 2063.1881591796873, + "learning_rate": 0.0009641839665080363, + "loss": 1537.6836, + "step": 1300 + }, + { + "ce_loss_12": 3.2277937054634096, + "ce_loss_17": 2.9757333517074587, + "ce_loss_23": 2.851700460910797, + "ce_loss_3": 4.281731259822846, + "ce_loss_6": 3.7879306077957153, + "epoch": 0.131, + "grad_norm": 2816.0, + "kl_loss_12": 826.8987396240234, + "kl_loss_17": 270.5403312683105, + "kl_loss_3": 2943.7653930664064, + "kl_loss_6": 1985.992041015625, + "learning_rate": 0.0009635919272833937, + "loss": 1488.6453, + "step": 1310 + }, + { + "ce_loss_12": 3.259249973297119, + "ce_loss_17": 2.9989022612571716, + "ce_loss_23": 2.8736031293869018, + "ce_loss_3": 4.318255996704101, + "ce_loss_6": 3.8237802028656005, + "epoch": 0.132, + "grad_norm": 1416.0, + "kl_loss_12": 834.8762725830078, + "kl_loss_17": 283.9470611572266, + "kl_loss_3": 2966.973986816406, + "kl_loss_6": 1988.0608520507812, + "learning_rate": 0.0009629952196931902, + "loss": 1490.9254, + "step": 1320 + }, + { + "ce_loss_12": 3.2333609223365785, + "ce_loss_17": 2.990918219089508, + "ce_loss_23": 2.8692086040973663, + "ce_loss_3": 4.287823891639709, + "ce_loss_6": 3.7886445045471193, + "epoch": 0.133, + "grad_norm": 1020.0, + "kl_loss_12": 802.2819915771485, + "kl_loss_17": 269.66481018066406, + "kl_loss_3": 2919.6731567382812, + "kl_loss_6": 1954.8788635253907, + "learning_rate": 0.0009623938497462645, + "loss": 1478.0739, + "step": 1330 + }, + { + "ce_loss_12": 3.229719305038452, + "ce_loss_17": 2.9842681527137755, + "ce_loss_23": 2.860576260089874, + "ce_loss_3": 4.277899718284607, + "ce_loss_6": 3.7947510719299316, + "epoch": 0.134, + "grad_norm": 932.0, + "kl_loss_12": 829.4476867675781, + "kl_loss_17": 271.8407012939453, + "kl_loss_3": 2928.6218872070312, + "kl_loss_6": 1985.8415222167969, + "learning_rate": 0.0009617878234984055, + "loss": 1507.8875, + "step": 1340 + }, + { + "ce_loss_12": 3.2999788522720337, + "ce_loss_17": 3.0766652107238768, + "ce_loss_23": 2.9456470131874086, + "ce_loss_3": 4.303646683692932, + "ce_loss_6": 3.8252553224563597, + "epoch": 0.135, + "grad_norm": 1088.0, + "kl_loss_12": 798.0401733398437, + "kl_loss_17": 287.97615509033204, + "kl_loss_3": 2827.784289550781, + "kl_loss_6": 1898.5206604003906, + "learning_rate": 0.0009611771470522907, + "loss": 1471.7371, + "step": 1350 + }, + { + "ce_loss_12": 3.2443023204803465, + "ce_loss_17": 3.008663058280945, + "ce_loss_23": 2.872717189788818, + "ce_loss_3": 4.279219973087311, + "ce_loss_6": 3.7957202434539794, + "epoch": 0.136, + "grad_norm": 1208.0, + "kl_loss_12": 799.3139007568359, + "kl_loss_17": 294.02868194580077, + "kl_loss_3": 2879.5612060546873, + "kl_loss_6": 1925.3865356445312, + "learning_rate": 0.0009605618265574251, + "loss": 1462.2784, + "step": 1360 + }, + { + "ce_loss_12": 3.2207038402557373, + "ce_loss_17": 2.9885846972465515, + "ce_loss_23": 2.853401231765747, + "ce_loss_3": 4.292985570430756, + "ce_loss_6": 3.8024420857429506, + "epoch": 0.137, + "grad_norm": 1144.0, + "kl_loss_12": 834.6501495361329, + "kl_loss_17": 299.46264266967773, + "kl_loss_3": 2988.1515380859373, + "kl_loss_6": 2041.3694946289063, + "learning_rate": 0.0009599418682100792, + "loss": 1512.3742, + "step": 1370 + }, + { + "ce_loss_12": 3.247525489330292, + "ce_loss_17": 3.0069747805595397, + "ce_loss_23": 2.884253454208374, + "ce_loss_3": 4.294894897937775, + "ce_loss_6": 3.809372806549072, + "epoch": 0.138, + "grad_norm": 964.0, + "kl_loss_12": 802.948861694336, + "kl_loss_17": 271.59482803344724, + "kl_loss_3": 2909.496472167969, + "kl_loss_6": 1976.5658874511719, + "learning_rate": 0.0009593172782532268, + "loss": 1491.6639, + "step": 1380 + }, + { + "ce_loss_12": 3.280215847492218, + "ce_loss_17": 3.0361931324005127, + "ce_loss_23": 2.914945065975189, + "ce_loss_3": 4.304294717311859, + "ce_loss_6": 3.819801914691925, + "epoch": 0.139, + "grad_norm": 1020.0, + "kl_loss_12": 807.8031372070312, + "kl_loss_17": 265.86458892822264, + "kl_loss_3": 2857.7116943359374, + "kl_loss_6": 1914.3999145507812, + "learning_rate": 0.0009586880629764817, + "loss": 1465.0936, + "step": 1390 + }, + { + "ce_loss_12": 3.2215174078941344, + "ce_loss_17": 2.9842705965042113, + "ce_loss_23": 2.8551847457885744, + "ce_loss_3": 4.27022614479065, + "ce_loss_6": 3.788445198535919, + "epoch": 0.14, + "grad_norm": 1040.0, + "kl_loss_12": 811.4446868896484, + "kl_loss_17": 288.3548393249512, + "kl_loss_3": 2900.03798828125, + "kl_loss_6": 1963.1980285644531, + "learning_rate": 0.0009580542287160348, + "loss": 1467.0395, + "step": 1400 + }, + { + "ce_loss_12": 3.188516676425934, + "ce_loss_17": 2.9619083285331724, + "ce_loss_23": 2.8218137621879578, + "ce_loss_3": 4.234646821022034, + "ce_loss_6": 3.739990162849426, + "epoch": 0.141, + "grad_norm": 1048.0, + "kl_loss_12": 809.8930877685547, + "kl_loss_17": 288.1702690124512, + "kl_loss_3": 2922.703991699219, + "kl_loss_6": 1956.603094482422, + "learning_rate": 0.0009574157818545901, + "loss": 1469.4032, + "step": 1410 + }, + { + "ce_loss_12": 3.2416644215583803, + "ce_loss_17": 3.006680631637573, + "ce_loss_23": 2.8900532841682436, + "ce_loss_3": 4.26418137550354, + "ce_loss_6": 3.788861167430878, + "epoch": 0.142, + "grad_norm": 876.0, + "kl_loss_12": 785.4881103515625, + "kl_loss_17": 259.1768203735352, + "kl_loss_3": 2836.4365356445314, + "kl_loss_6": 1906.6022399902345, + "learning_rate": 0.0009567727288213005, + "loss": 1482.4597, + "step": 1420 + }, + { + "ce_loss_12": 3.235337662696838, + "ce_loss_17": 2.98630108833313, + "ce_loss_23": 2.8627288341522217, + "ce_loss_3": 4.281697344779968, + "ce_loss_6": 3.796532988548279, + "epoch": 0.143, + "grad_norm": 1280.0, + "kl_loss_12": 819.8938232421875, + "kl_loss_17": 269.30180130004885, + "kl_loss_3": 2927.6844848632813, + "kl_loss_6": 1979.854022216797, + "learning_rate": 0.0009561250760917027, + "loss": 1478.084, + "step": 1430 + }, + { + "ce_loss_12": 3.2488888025283815, + "ce_loss_17": 2.9993746876716614, + "ce_loss_23": 2.876301276683807, + "ce_loss_3": 4.269674825668335, + "ce_loss_6": 3.792963969707489, + "epoch": 0.144, + "grad_norm": 1512.0, + "kl_loss_12": 822.9290191650391, + "kl_loss_17": 268.31039962768557, + "kl_loss_3": 2896.881481933594, + "kl_loss_6": 1968.8903930664062, + "learning_rate": 0.0009554728301876525, + "loss": 1456.0495, + "step": 1440 + }, + { + "ce_loss_12": 3.2897768020629883, + "ce_loss_17": 3.033561480045319, + "ce_loss_23": 2.916265618801117, + "ce_loss_3": 4.292733931541443, + "ce_loss_6": 3.8300752878189086, + "epoch": 0.145, + "grad_norm": 1392.0, + "kl_loss_12": 810.3653289794922, + "kl_loss_17": 257.99100799560546, + "kl_loss_3": 2828.956005859375, + "kl_loss_6": 1927.9954711914063, + "learning_rate": 0.0009548159976772592, + "loss": 1499.9711, + "step": 1450 + }, + { + "ce_loss_12": 3.24677711725235, + "ce_loss_17": 2.9964696407318114, + "ce_loss_23": 2.8798062562942506, + "ce_loss_3": 4.300918602943421, + "ce_loss_6": 3.796139717102051, + "epoch": 0.146, + "grad_norm": 1144.0, + "kl_loss_12": 811.711947631836, + "kl_loss_17": 257.1396308898926, + "kl_loss_3": 2927.9613647460938, + "kl_loss_6": 1955.6009887695313, + "learning_rate": 0.0009541545851748186, + "loss": 1472.3645, + "step": 1460 + }, + { + "ce_loss_12": 3.1249361634254456, + "ce_loss_17": 2.8709220051765443, + "ce_loss_23": 2.7551259279251097, + "ce_loss_3": 4.210307502746582, + "ce_loss_6": 3.6961392521858216, + "epoch": 0.147, + "grad_norm": 1216.0, + "kl_loss_12": 803.9768951416015, + "kl_loss_17": 253.5036376953125, + "kl_loss_3": 2965.500927734375, + "kl_loss_6": 1973.1413146972657, + "learning_rate": 0.0009534885993407473, + "loss": 1481.9013, + "step": 1470 + }, + { + "ce_loss_12": 3.2659184217453, + "ce_loss_17": 3.0223120212554933, + "ce_loss_23": 2.907353913784027, + "ce_loss_3": 4.305331182479859, + "ce_loss_6": 3.828053390979767, + "epoch": 0.148, + "grad_norm": 1080.0, + "kl_loss_12": 788.480908203125, + "kl_loss_17": 250.7262405395508, + "kl_loss_3": 2884.44912109375, + "kl_loss_6": 1953.9335510253907, + "learning_rate": 0.0009528180468815154, + "loss": 1472.3178, + "step": 1480 + }, + { + "ce_loss_12": 3.311021792888641, + "ce_loss_17": 3.071496820449829, + "ce_loss_23": 2.9612332940101624, + "ce_loss_3": 4.324043917655945, + "ce_loss_6": 3.845843195915222, + "epoch": 0.149, + "grad_norm": 908.0, + "kl_loss_12": 800.0986083984375, + "kl_loss_17": 250.172859954834, + "kl_loss_3": 2832.765869140625, + "kl_loss_6": 1897.6001281738281, + "learning_rate": 0.0009521429345495787, + "loss": 1447.2758, + "step": 1490 + }, + { + "ce_loss_12": 3.2847871661186216, + "ce_loss_17": 3.050262463092804, + "ce_loss_23": 2.939221966266632, + "ce_loss_3": 4.285373425483703, + "ce_loss_6": 3.809739577770233, + "epoch": 0.15, + "grad_norm": 972.0, + "kl_loss_12": 775.6115142822266, + "kl_loss_17": 245.11468811035155, + "kl_loss_3": 2820.178259277344, + "kl_loss_6": 1885.4727478027344, + "learning_rate": 0.0009514632691433108, + "loss": 1448.602, + "step": 1500 + }, + { + "ce_loss_12": 3.2675668716430666, + "ce_loss_17": 3.024088716506958, + "ce_loss_23": 2.903068017959595, + "ce_loss_3": 4.28171523809433, + "ce_loss_6": 3.8086917757987977, + "epoch": 0.151, + "grad_norm": 1400.0, + "kl_loss_12": 806.0697418212891, + "kl_loss_17": 269.3117622375488, + "kl_loss_3": 2851.461560058594, + "kl_loss_6": 1931.8026123046875, + "learning_rate": 0.0009507790575069346, + "loss": 1463.5699, + "step": 1510 + }, + { + "ce_loss_12": 3.24948410987854, + "ce_loss_17": 2.996292233467102, + "ce_loss_23": 2.8639941811561584, + "ce_loss_3": 4.288215208053589, + "ce_loss_6": 3.800259804725647, + "epoch": 0.152, + "grad_norm": 1008.0, + "kl_loss_12": 829.4756469726562, + "kl_loss_17": 274.5081420898438, + "kl_loss_3": 2920.2522094726564, + "kl_loss_6": 1967.4820678710937, + "learning_rate": 0.0009500903065304539, + "loss": 1501.5518, + "step": 1520 + }, + { + "ce_loss_12": 3.2591757655143736, + "ce_loss_17": 3.027329218387604, + "ce_loss_23": 2.911393141746521, + "ce_loss_3": 4.253254747390747, + "ce_loss_6": 3.7834643006324766, + "epoch": 0.153, + "grad_norm": 1004.0, + "kl_loss_12": 778.630160522461, + "kl_loss_17": 250.61957092285155, + "kl_loss_3": 2784.512976074219, + "kl_loss_6": 1862.2048217773438, + "learning_rate": 0.0009493970231495835, + "loss": 1446.7391, + "step": 1530 + }, + { + "ce_loss_12": 3.2096596479415895, + "ce_loss_17": 2.9711764454841614, + "ce_loss_23": 2.8672603607177733, + "ce_loss_3": 4.199491119384765, + "ce_loss_6": 3.7244733333587647, + "epoch": 0.154, + "grad_norm": 1280.0, + "kl_loss_12": 773.3964080810547, + "kl_loss_17": 244.00584335327147, + "kl_loss_3": 2788.61259765625, + "kl_loss_6": 1862.1662048339845, + "learning_rate": 0.0009486992143456792, + "loss": 1424.141, + "step": 1540 + }, + { + "ce_loss_12": 3.2563757061958314, + "ce_loss_17": 2.998210537433624, + "ce_loss_23": 2.874551975727081, + "ce_loss_3": 4.34161684513092, + "ce_loss_6": 3.837330734729767, + "epoch": 0.155, + "grad_norm": 1048.0, + "kl_loss_12": 833.3443481445313, + "kl_loss_17": 266.61425628662107, + "kl_loss_3": 3008.584338378906, + "kl_loss_6": 2019.5796325683593, + "learning_rate": 0.0009479968871456679, + "loss": 1489.1023, + "step": 1550 + }, + { + "ce_loss_12": 3.21219242811203, + "ce_loss_17": 2.9625760078430177, + "ce_loss_23": 2.8483705639839174, + "ce_loss_3": 4.265359997749329, + "ce_loss_6": 3.76342613697052, + "epoch": 0.156, + "grad_norm": 1104.0, + "kl_loss_12": 800.6179107666015, + "kl_loss_17": 251.26333312988282, + "kl_loss_3": 2913.7487548828126, + "kl_loss_6": 1930.050030517578, + "learning_rate": 0.0009472900486219768, + "loss": 1445.6348, + "step": 1560 + }, + { + "ce_loss_12": 3.2221407175064085, + "ce_loss_17": 2.961968779563904, + "ce_loss_23": 2.8494272470474242, + "ce_loss_3": 4.204352223873139, + "ce_loss_6": 3.7289497375488283, + "epoch": 0.157, + "grad_norm": 1040.0, + "kl_loss_12": 805.2766754150391, + "kl_loss_17": 246.36323776245118, + "kl_loss_3": 2819.4550659179686, + "kl_loss_6": 1890.1749145507813, + "learning_rate": 0.000946578705892462, + "loss": 1451.4247, + "step": 1570 + }, + { + "ce_loss_12": 3.259094977378845, + "ce_loss_17": 2.9870534896850587, + "ce_loss_23": 2.8754953742027283, + "ce_loss_3": 4.228675103187561, + "ce_loss_6": 3.748308801651001, + "epoch": 0.158, + "grad_norm": 1168.0, + "kl_loss_12": 843.6717803955078, + "kl_loss_17": 246.24478073120116, + "kl_loss_3": 2784.921044921875, + "kl_loss_6": 1842.1207641601563, + "learning_rate": 0.0009458628661203367, + "loss": 1458.2909, + "step": 1580 + }, + { + "ce_loss_12": 3.2754209637641907, + "ce_loss_17": 2.9969581842422484, + "ce_loss_23": 2.8823806643486023, + "ce_loss_3": 4.299981880187988, + "ce_loss_6": 3.8002068281173704, + "epoch": 0.159, + "grad_norm": 1216.0, + "kl_loss_12": 866.5703857421875, + "kl_loss_17": 252.2691665649414, + "kl_loss_3": 2910.86015625, + "kl_loss_6": 1942.7434814453125, + "learning_rate": 0.0009451425365140996, + "loss": 1447.4223, + "step": 1590 + }, + { + "ce_loss_12": 3.317850708961487, + "ce_loss_17": 3.06536523103714, + "ce_loss_23": 2.947838509082794, + "ce_loss_3": 4.273639845848083, + "ce_loss_6": 3.804075849056244, + "epoch": 0.16, + "grad_norm": 1320.0, + "kl_loss_12": 831.5212005615234, + "kl_loss_17": 251.46739044189454, + "kl_loss_3": 2753.1792846679687, + "kl_loss_6": 1837.5140747070313, + "learning_rate": 0.0009444177243274617, + "loss": 1415.2524, + "step": 1600 + }, + { + "ce_loss_12": 3.1936524152755736, + "ce_loss_17": 2.9349228024482725, + "ce_loss_23": 2.8170175671577455, + "ce_loss_3": 4.21053067445755, + "ce_loss_6": 3.7250030398368836, + "epoch": 0.161, + "grad_norm": 1192.0, + "kl_loss_12": 843.8219207763672, + "kl_loss_17": 257.3523231506348, + "kl_loss_3": 2868.45556640625, + "kl_loss_6": 1927.8065856933595, + "learning_rate": 0.0009436884368592739, + "loss": 1460.9502, + "step": 1610 + }, + { + "ce_loss_12": 3.239683485031128, + "ce_loss_17": 2.9786096811294556, + "ce_loss_23": 2.8677761435508726, + "ce_loss_3": 4.2137162446975704, + "ce_loss_6": 3.740345096588135, + "epoch": 0.162, + "grad_norm": 1296.0, + "kl_loss_12": 825.1115173339844, + "kl_loss_17": 247.33793029785156, + "kl_loss_3": 2789.4590698242187, + "kl_loss_6": 1862.7459655761718, + "learning_rate": 0.0009429546814534529, + "loss": 1460.2281, + "step": 1620 + }, + { + "ce_loss_12": 3.231956887245178, + "ce_loss_17": 2.9807236790657043, + "ce_loss_23": 2.876505267620087, + "ce_loss_3": 4.215959429740906, + "ce_loss_6": 3.7355253100395203, + "epoch": 0.163, + "grad_norm": 960.0, + "kl_loss_12": 808.4689880371094, + "kl_loss_17": 241.53212051391603, + "kl_loss_3": 2776.0137573242187, + "kl_loss_6": 1856.5346984863281, + "learning_rate": 0.0009422164654989072, + "loss": 1402.5779, + "step": 1630 + }, + { + "ce_loss_12": 3.3346951007843018, + "ce_loss_17": 3.0862001299858095, + "ce_loss_23": 2.9771710872650146, + "ce_loss_3": 4.302021241188049, + "ce_loss_6": 3.8365179657936097, + "epoch": 0.164, + "grad_norm": 1256.0, + "kl_loss_12": 802.1354614257813, + "kl_loss_17": 246.5807846069336, + "kl_loss_3": 2768.77490234375, + "kl_loss_6": 1852.6294067382812, + "learning_rate": 0.0009414737964294635, + "loss": 1425.5844, + "step": 1640 + }, + { + "ce_loss_12": 3.2539820194244387, + "ce_loss_17": 3.0245274782180784, + "ce_loss_23": 2.919032084941864, + "ce_loss_3": 4.217273092269897, + "ce_loss_6": 3.747937524318695, + "epoch": 0.165, + "grad_norm": 1144.0, + "kl_loss_12": 758.6764282226562, + "kl_loss_17": 240.4837532043457, + "kl_loss_3": 2683.5130615234375, + "kl_loss_6": 1773.5954284667969, + "learning_rate": 0.000940726681723791, + "loss": 1408.1107, + "step": 1650 + }, + { + "ce_loss_12": 3.1329286456108094, + "ce_loss_17": 2.895743155479431, + "ce_loss_23": 2.780760633945465, + "ce_loss_3": 4.180186772346497, + "ce_loss_6": 3.68881573677063, + "epoch": 0.166, + "grad_norm": 1248.0, + "kl_loss_12": 794.1787048339844, + "kl_loss_17": 253.62862396240234, + "kl_loss_3": 2912.988879394531, + "kl_loss_6": 1942.947412109375, + "learning_rate": 0.0009399751289053266, + "loss": 1414.577, + "step": 1660 + }, + { + "ce_loss_12": 3.300536847114563, + "ce_loss_17": 3.065656638145447, + "ce_loss_23": 2.955276608467102, + "ce_loss_3": 4.278782832622528, + "ce_loss_6": 3.8195556879043577, + "epoch": 0.167, + "grad_norm": 960.0, + "kl_loss_12": 761.587435913086, + "kl_loss_17": 248.2511199951172, + "kl_loss_3": 2737.0460693359373, + "kl_loss_6": 1834.2865234375, + "learning_rate": 0.0009392191455421988, + "loss": 1425.7268, + "step": 1670 + }, + { + "ce_loss_12": 3.2943241119384767, + "ce_loss_17": 3.058737647533417, + "ce_loss_23": 2.948542630672455, + "ce_loss_3": 4.2761582136154175, + "ce_loss_6": 3.802069568634033, + "epoch": 0.168, + "grad_norm": 1136.0, + "kl_loss_12": 786.0340240478515, + "kl_loss_17": 257.52221908569334, + "kl_loss_3": 2779.2551391601564, + "kl_loss_6": 1863.6268981933595, + "learning_rate": 0.0009384587392471515, + "loss": 1395.0223, + "step": 1680 + }, + { + "ce_loss_12": 3.2665735125541686, + "ce_loss_17": 3.041038954257965, + "ce_loss_23": 2.9342415809631346, + "ce_loss_3": 4.232107269763946, + "ce_loss_6": 3.786698818206787, + "epoch": 0.169, + "grad_norm": 1004.0, + "kl_loss_12": 754.4325256347656, + "kl_loss_17": 239.2375801086426, + "kl_loss_3": 2717.1393188476563, + "kl_loss_6": 1834.2838317871094, + "learning_rate": 0.0009376939176774678, + "loss": 1389.892, + "step": 1690 + }, + { + "ce_loss_12": 3.2542818665504454, + "ce_loss_17": 3.018766474723816, + "ce_loss_23": 2.9072797894477844, + "ce_loss_3": 4.2547527074813845, + "ce_loss_6": 3.778389811515808, + "epoch": 0.17, + "grad_norm": 1296.0, + "kl_loss_12": 763.4822723388672, + "kl_loss_17": 250.73448333740234, + "kl_loss_3": 2776.2912475585936, + "kl_loss_6": 1853.5381958007813, + "learning_rate": 0.0009369246885348925, + "loss": 1434.3868, + "step": 1700 + }, + { + "ce_loss_12": 3.2471718430519103, + "ce_loss_17": 3.020142710208893, + "ce_loss_23": 2.899143636226654, + "ce_loss_3": 4.27671126127243, + "ce_loss_6": 3.797596609592438, + "epoch": 0.171, + "grad_norm": 968.0, + "kl_loss_12": 775.6185882568359, + "kl_loss_17": 265.4308486938477, + "kl_loss_3": 2856.3064575195312, + "kl_loss_6": 1918.2055114746095, + "learning_rate": 0.0009361510595655545, + "loss": 1441.4605, + "step": 1710 + }, + { + "ce_loss_12": 3.215370297431946, + "ce_loss_17": 2.9849206686019896, + "ce_loss_23": 2.858234190940857, + "ce_loss_3": 4.21358277797699, + "ce_loss_6": 3.755553126335144, + "epoch": 0.172, + "grad_norm": 1072.0, + "kl_loss_12": 784.4585998535156, + "kl_loss_17": 264.6829551696777, + "kl_loss_3": 2817.5525634765627, + "kl_loss_6": 1899.7091918945312, + "learning_rate": 0.0009353730385598887, + "loss": 1433.4188, + "step": 1720 + }, + { + "ce_loss_12": 3.1493828177452086, + "ce_loss_17": 2.9145042300224304, + "ce_loss_23": 2.8017009973526, + "ce_loss_3": 4.190014028549195, + "ce_loss_6": 3.7082863092422484, + "epoch": 0.173, + "grad_norm": 1616.0, + "kl_loss_12": 781.0476715087891, + "kl_loss_17": 251.35236129760742, + "kl_loss_3": 2861.7456298828124, + "kl_loss_6": 1924.5235900878906, + "learning_rate": 0.0009345906333525581, + "loss": 1445.5838, + "step": 1730 + }, + { + "ce_loss_12": 3.1840237855911253, + "ce_loss_17": 2.9483115673065186, + "ce_loss_23": 2.831267786026001, + "ce_loss_3": 4.200755143165589, + "ce_loss_6": 3.7272995591163633, + "epoch": 0.174, + "grad_norm": 1200.0, + "kl_loss_12": 791.5306030273438, + "kl_loss_17": 264.3092872619629, + "kl_loss_3": 2830.4186279296873, + "kl_loss_6": 1902.0811584472656, + "learning_rate": 0.0009338038518223745, + "loss": 1424.1994, + "step": 1740 + }, + { + "ce_loss_12": 3.251028025150299, + "ce_loss_17": 3.0087919354438784, + "ce_loss_23": 2.88809187412262, + "ce_loss_3": 4.258924639225006, + "ce_loss_6": 3.7800361275672913, + "epoch": 0.175, + "grad_norm": 1176.0, + "kl_loss_12": 800.4794372558594, + "kl_loss_17": 260.26867828369143, + "kl_loss_3": 2862.462158203125, + "kl_loss_6": 1918.6351745605468, + "learning_rate": 0.0009330127018922195, + "loss": 1475.6443, + "step": 1750 + }, + { + "ce_loss_12": 3.2001158714294435, + "ce_loss_17": 2.9682640552520754, + "ce_loss_23": 2.8567294597625734, + "ce_loss_3": 4.22244223356247, + "ce_loss_6": 3.7290648221969604, + "epoch": 0.176, + "grad_norm": 1464.0, + "kl_loss_12": 762.525503540039, + "kl_loss_17": 243.7757797241211, + "kl_loss_3": 2817.8551391601563, + "kl_loss_6": 1868.523504638672, + "learning_rate": 0.0009322171915289634, + "loss": 1421.9354, + "step": 1760 + }, + { + "ce_loss_12": 3.223832643032074, + "ce_loss_17": 3.0016063809394837, + "ce_loss_23": 2.8990545868873596, + "ce_loss_3": 4.203401672840118, + "ce_loss_6": 3.7404787063598635, + "epoch": 0.177, + "grad_norm": 1176.0, + "kl_loss_12": 759.478857421875, + "kl_loss_17": 234.98663024902345, + "kl_loss_3": 2762.4353637695312, + "kl_loss_6": 1849.6406311035157, + "learning_rate": 0.0009314173287433873, + "loss": 1389.1017, + "step": 1770 + }, + { + "ce_loss_12": 3.214055931568146, + "ce_loss_17": 2.9797696232795716, + "ce_loss_23": 2.872143268585205, + "ce_loss_3": 4.21132276058197, + "ce_loss_6": 3.7429075956344606, + "epoch": 0.178, + "grad_norm": 1096.0, + "kl_loss_12": 780.1603851318359, + "kl_loss_17": 244.01894989013672, + "kl_loss_3": 2798.7838623046873, + "kl_loss_6": 1879.8642150878907, + "learning_rate": 0.0009306131215901003, + "loss": 1396.7419, + "step": 1780 + }, + { + "ce_loss_12": 3.238218307495117, + "ce_loss_17": 3.0090879082679747, + "ce_loss_23": 2.9029579758644104, + "ce_loss_3": 4.228364479541779, + "ce_loss_6": 3.757898378372192, + "epoch": 0.179, + "grad_norm": 1040.0, + "kl_loss_12": 763.7487243652344, + "kl_loss_17": 239.04694061279298, + "kl_loss_3": 2757.7945068359377, + "kl_loss_6": 1843.6190063476563, + "learning_rate": 0.0009298045781674596, + "loss": 1378.4027, + "step": 1790 + }, + { + "ce_loss_12": 3.2191762566566466, + "ce_loss_17": 2.98970650434494, + "ce_loss_23": 2.8842029094696047, + "ce_loss_3": 4.1989802598953245, + "ce_loss_6": 3.735355496406555, + "epoch": 0.18, + "grad_norm": 1496.0, + "kl_loss_12": 752.9535675048828, + "kl_loss_17": 236.7996368408203, + "kl_loss_3": 2740.193994140625, + "kl_loss_6": 1820.6103637695312, + "learning_rate": 0.0009289917066174886, + "loss": 1404.1791, + "step": 1800 + }, + { + "ce_loss_12": 3.2065064549446105, + "ce_loss_17": 2.9845200538635255, + "ce_loss_23": 2.885902488231659, + "ce_loss_3": 4.161723887920379, + "ce_loss_6": 3.7009355306625364, + "epoch": 0.181, + "grad_norm": 1032.0, + "kl_loss_12": 729.0150360107422, + "kl_loss_17": 226.00843200683593, + "kl_loss_3": 2676.867541503906, + "kl_loss_6": 1771.958477783203, + "learning_rate": 0.0009281745151257945, + "loss": 1359.5951, + "step": 1810 + }, + { + "ce_loss_12": 3.2467663407325746, + "ce_loss_17": 3.009868013858795, + "ce_loss_23": 2.9040247201919556, + "ce_loss_3": 4.229266285896301, + "ce_loss_6": 3.7639632701873778, + "epoch": 0.182, + "grad_norm": 956.0, + "kl_loss_12": 746.8623779296875, + "kl_loss_17": 231.6996307373047, + "kl_loss_3": 2731.1640380859376, + "kl_loss_6": 1821.4275512695312, + "learning_rate": 0.0009273530119214868, + "loss": 1400.1968, + "step": 1820 + }, + { + "ce_loss_12": 3.312739670276642, + "ce_loss_17": 3.093990111351013, + "ce_loss_23": 2.990409862995148, + "ce_loss_3": 4.29701189994812, + "ce_loss_6": 3.8379727005958557, + "epoch": 0.183, + "grad_norm": 976.0, + "kl_loss_12": 741.4776031494141, + "kl_loss_17": 232.1430564880371, + "kl_loss_3": 2739.31171875, + "kl_loss_6": 1829.6507995605468, + "learning_rate": 0.0009265272052770935, + "loss": 1371.0215, + "step": 1830 + }, + { + "ce_loss_12": 3.17973473072052, + "ce_loss_17": 2.9410093665122985, + "ce_loss_23": 2.8326215505599976, + "ce_loss_3": 4.19485604763031, + "ce_loss_6": 3.7067224383354187, + "epoch": 0.184, + "grad_norm": 1200.0, + "kl_loss_12": 755.7035552978516, + "kl_loss_17": 238.02146682739257, + "kl_loss_3": 2803.255017089844, + "kl_loss_6": 1848.4059265136718, + "learning_rate": 0.0009256971035084784, + "loss": 1404.2541, + "step": 1840 + }, + { + "ce_loss_12": 3.127467322349548, + "ce_loss_17": 2.8794915318489074, + "ce_loss_23": 2.7674395084381103, + "ce_loss_3": 4.163695001602173, + "ce_loss_6": 3.674273729324341, + "epoch": 0.185, + "grad_norm": 936.0, + "kl_loss_12": 794.7793029785156, + "kl_loss_17": 245.12851486206054, + "kl_loss_3": 2877.0427001953126, + "kl_loss_6": 1919.3818481445312, + "learning_rate": 0.0009248627149747573, + "loss": 1431.63, + "step": 1850 + }, + { + "ce_loss_12": 3.2907007336616516, + "ce_loss_17": 3.065432584285736, + "ce_loss_23": 2.9592000246047974, + "ce_loss_3": 4.26578665971756, + "ce_loss_6": 3.801167941093445, + "epoch": 0.186, + "grad_norm": 1272.0, + "kl_loss_12": 756.5147766113281, + "kl_loss_17": 237.35702362060547, + "kl_loss_3": 2748.3367919921875, + "kl_loss_6": 1831.7043518066407, + "learning_rate": 0.0009240240480782129, + "loss": 1394.3164, + "step": 1860 + }, + { + "ce_loss_12": 3.198230564594269, + "ce_loss_17": 2.9691875100135805, + "ce_loss_23": 2.8620351433753966, + "ce_loss_3": 4.207461893558502, + "ce_loss_6": 3.73105411529541, + "epoch": 0.187, + "grad_norm": 1232.0, + "kl_loss_12": 758.0779357910156, + "kl_loss_17": 236.43061218261718, + "kl_loss_3": 2785.8535034179686, + "kl_loss_6": 1860.8220092773438, + "learning_rate": 0.0009231811112642122, + "loss": 1393.7689, + "step": 1870 + }, + { + "ce_loss_12": 3.2416003346443176, + "ce_loss_17": 3.011486494541168, + "ce_loss_23": 2.908844864368439, + "ce_loss_3": 4.20769385099411, + "ce_loss_6": 3.746701693534851, + "epoch": 0.188, + "grad_norm": 1080.0, + "kl_loss_12": 754.1037841796875, + "kl_loss_17": 232.73772125244142, + "kl_loss_3": 2723.378137207031, + "kl_loss_6": 1814.4259948730469, + "learning_rate": 0.0009223339130211192, + "loss": 1383.0408, + "step": 1880 + }, + { + "ce_loss_12": 3.110693836212158, + "ce_loss_17": 2.8853095531463624, + "ce_loss_23": 2.7808427035808565, + "ce_loss_3": 4.13761682510376, + "ce_loss_6": 3.642694425582886, + "epoch": 0.189, + "grad_norm": 1040.0, + "kl_loss_12": 739.8239501953125, + "kl_loss_17": 229.3358184814453, + "kl_loss_3": 2823.7280517578124, + "kl_loss_6": 1849.0847961425782, + "learning_rate": 0.0009214824618802108, + "loss": 1402.2676, + "step": 1890 + }, + { + "ce_loss_12": 3.2733206272125246, + "ce_loss_17": 3.0429879903793333, + "ce_loss_23": 2.9388628959655763, + "ce_loss_3": 4.262214660644531, + "ce_loss_6": 3.79117773771286, + "epoch": 0.19, + "grad_norm": 1008.0, + "kl_loss_12": 749.5772369384765, + "kl_loss_17": 232.20852584838866, + "kl_loss_3": 2747.72998046875, + "kl_loss_6": 1821.8578186035156, + "learning_rate": 0.0009206267664155906, + "loss": 1417.5346, + "step": 1900 + }, + { + "ce_loss_12": 3.2125448346138, + "ce_loss_17": 2.979010045528412, + "ce_loss_23": 2.8730541586875917, + "ce_loss_3": 4.208014273643494, + "ce_loss_6": 3.7290257573127747, + "epoch": 0.191, + "grad_norm": 1096.0, + "kl_loss_12": 748.1908905029297, + "kl_loss_17": 234.87025756835936, + "kl_loss_3": 2755.835559082031, + "kl_loss_6": 1819.9845886230469, + "learning_rate": 0.0009197668352441024, + "loss": 1395.7588, + "step": 1910 + }, + { + "ce_loss_12": 3.239735448360443, + "ce_loss_17": 3.018569827079773, + "ce_loss_23": 2.917696511745453, + "ce_loss_3": 4.225832998752594, + "ce_loss_6": 3.7482815623283385, + "epoch": 0.192, + "grad_norm": 1080.0, + "kl_loss_12": 737.2051788330078, + "kl_loss_17": 232.76412811279297, + "kl_loss_3": 2725.888525390625, + "kl_loss_6": 1800.2931823730469, + "learning_rate": 0.0009189026770252437, + "loss": 1383.0574, + "step": 1920 + }, + { + "ce_loss_12": 3.272562229633331, + "ce_loss_17": 3.046920895576477, + "ce_loss_23": 2.9422855377197266, + "ce_loss_3": 4.243232655525207, + "ce_loss_6": 3.782981109619141, + "epoch": 0.193, + "grad_norm": 1008.0, + "kl_loss_12": 739.2452575683594, + "kl_loss_17": 230.3068389892578, + "kl_loss_3": 2704.004113769531, + "kl_loss_6": 1798.2643249511718, + "learning_rate": 0.000918034300461078, + "loss": 1411.7353, + "step": 1930 + }, + { + "ce_loss_12": 3.291454005241394, + "ce_loss_17": 3.0682451486587525, + "ce_loss_23": 2.9636853694915772, + "ce_loss_3": 4.24529767036438, + "ce_loss_6": 3.7860057950019836, + "epoch": 0.194, + "grad_norm": 1216.0, + "kl_loss_12": 739.2582458496094, + "kl_loss_17": 230.60228424072267, + "kl_loss_3": 2671.1538330078124, + "kl_loss_6": 1778.3694885253906, + "learning_rate": 0.0009171617142961477, + "loss": 1364.5955, + "step": 1940 + }, + { + "ce_loss_12": 3.2587348222732544, + "ce_loss_17": 3.027367722988129, + "ce_loss_23": 2.9262425184249876, + "ce_loss_3": 4.226789343357086, + "ce_loss_6": 3.7603554725646973, + "epoch": 0.195, + "grad_norm": 1168.0, + "kl_loss_12": 734.6230255126953, + "kl_loss_17": 226.58795089721679, + "kl_loss_3": 2717.46865234375, + "kl_loss_6": 1794.8947204589845, + "learning_rate": 0.0009162849273173857, + "loss": 1369.4313, + "step": 1950 + }, + { + "ce_loss_12": 3.2035290122032167, + "ce_loss_17": 2.976229190826416, + "ce_loss_23": 2.8770426392555235, + "ce_loss_3": 4.178189706802368, + "ce_loss_6": 3.7164478659629823, + "epoch": 0.196, + "grad_norm": 1160.0, + "kl_loss_12": 729.7126403808594, + "kl_loss_17": 222.76492233276366, + "kl_loss_3": 2702.063098144531, + "kl_loss_6": 1786.2204711914062, + "learning_rate": 0.0009154039483540273, + "loss": 1377.5442, + "step": 1960 + }, + { + "ce_loss_12": 3.1775651812553405, + "ce_loss_17": 2.961009216308594, + "ce_loss_23": 2.8616538405418397, + "ce_loss_3": 4.169165551662445, + "ce_loss_6": 3.6863211512565615, + "epoch": 0.197, + "grad_norm": 1104.0, + "kl_loss_12": 720.0626831054688, + "kl_loss_17": 224.1533004760742, + "kl_loss_3": 2727.455603027344, + "kl_loss_6": 1790.6717346191406, + "learning_rate": 0.0009145187862775209, + "loss": 1364.2689, + "step": 1970 + }, + { + "ce_loss_12": 3.2117436170578, + "ce_loss_17": 2.987203574180603, + "ce_loss_23": 2.8880042552948, + "ce_loss_3": 4.178555047512054, + "ce_loss_6": 3.7207028031349183, + "epoch": 0.198, + "grad_norm": 916.0, + "kl_loss_12": 729.9580291748047, + "kl_loss_17": 224.39583053588868, + "kl_loss_3": 2696.8760009765624, + "kl_loss_6": 1799.79140625, + "learning_rate": 0.0009136294500014386, + "loss": 1364.4973, + "step": 1980 + }, + { + "ce_loss_12": 3.1704697370529176, + "ce_loss_17": 2.9398475289344788, + "ce_loss_23": 2.8437638759613035, + "ce_loss_3": 4.21691871881485, + "ce_loss_6": 3.7204419493675234, + "epoch": 0.199, + "grad_norm": 1160.0, + "kl_loss_12": 744.5164001464843, + "kl_loss_17": 223.4737236022949, + "kl_loss_3": 2823.7519653320314, + "kl_loss_6": 1863.6061950683593, + "learning_rate": 0.000912735948481387, + "loss": 1410.1096, + "step": 1990 + }, + { + "ce_loss_12": 3.194620633125305, + "ce_loss_17": 2.970863437652588, + "ce_loss_23": 2.873484969139099, + "ce_loss_3": 4.176866769790649, + "ce_loss_6": 3.7112247705459596, + "epoch": 0.2, + "grad_norm": 1312.0, + "kl_loss_12": 743.6204559326172, + "kl_loss_17": 226.67948532104492, + "kl_loss_3": 2743.9926147460938, + "kl_loss_6": 1821.1581726074219, + "learning_rate": 0.0009118382907149164, + "loss": 1357.7129, + "step": 2000 + }, + { + "ce_loss_12": 3.2195873975753786, + "ce_loss_17": 2.9947704076766968, + "ce_loss_23": 2.8929362654685975, + "ce_loss_3": 4.189015960693359, + "ce_loss_6": 3.7328444957733153, + "epoch": 0.201, + "grad_norm": 1200.0, + "kl_loss_12": 735.7771606445312, + "kl_loss_17": 227.69709014892578, + "kl_loss_3": 2695.451416015625, + "kl_loss_6": 1806.4518920898438, + "learning_rate": 0.0009109364857414306, + "loss": 1357.8611, + "step": 2010 + }, + { + "ce_loss_12": 3.1835444688797, + "ce_loss_17": 2.960699367523193, + "ce_loss_23": 2.8582266688346865, + "ce_loss_3": 4.159953761100769, + "ce_loss_6": 3.6881731033325194, + "epoch": 0.202, + "grad_norm": 1392.0, + "kl_loss_12": 728.1203887939453, + "kl_loss_17": 225.80080337524413, + "kl_loss_3": 2706.2451171875, + "kl_loss_6": 1788.8057861328125, + "learning_rate": 0.0009100305426420956, + "loss": 1390.1856, + "step": 2020 + }, + { + "ce_loss_12": 3.158332920074463, + "ce_loss_17": 2.9300349235534666, + "ce_loss_23": 2.8350396871566774, + "ce_loss_3": 4.194484627246856, + "ce_loss_6": 3.69413343667984, + "epoch": 0.203, + "grad_norm": 1280.0, + "kl_loss_12": 742.0165588378907, + "kl_loss_17": 223.17797317504883, + "kl_loss_3": 2842.8969116210938, + "kl_loss_6": 1853.1241821289063, + "learning_rate": 0.0009091204705397484, + "loss": 1386.8559, + "step": 2030 + }, + { + "ce_loss_12": 3.1536441922187803, + "ce_loss_17": 2.9197498559951782, + "ce_loss_23": 2.8176995158195495, + "ce_loss_3": 4.17947895526886, + "ce_loss_6": 3.6965800404548643, + "epoch": 0.204, + "grad_norm": 1152.0, + "kl_loss_12": 742.1661315917969, + "kl_loss_17": 230.5888328552246, + "kl_loss_3": 2826.4287841796877, + "kl_loss_6": 1875.2062072753906, + "learning_rate": 0.0009082062785988049, + "loss": 1397.4822, + "step": 2040 + }, + { + "ce_loss_12": 3.269137918949127, + "ce_loss_17": 3.0672066807746887, + "ce_loss_23": 2.9520501255989076, + "ce_loss_3": 4.209022450447082, + "ce_loss_6": 3.7591748595237733, + "epoch": 0.205, + "grad_norm": 1208.0, + "kl_loss_12": 731.192514038086, + "kl_loss_17": 245.70539016723632, + "kl_loss_3": 2669.8846923828123, + "kl_loss_6": 1772.4919555664062, + "learning_rate": 0.0009072879760251679, + "loss": 1370.2641, + "step": 2050 + }, + { + "ce_loss_12": 3.2294013857841493, + "ce_loss_17": 3.009316086769104, + "ce_loss_23": 2.9020573258399964, + "ce_loss_3": 4.236497604846955, + "ce_loss_6": 3.7589595079422, + "epoch": 0.206, + "grad_norm": 1192.0, + "kl_loss_12": 745.1193756103515, + "kl_loss_17": 250.63618850708008, + "kl_loss_3": 2800.283349609375, + "kl_loss_6": 1860.894793701172, + "learning_rate": 0.0009063655720661341, + "loss": 1386.6717, + "step": 2060 + }, + { + "ce_loss_12": 3.260830020904541, + "ce_loss_17": 3.0461089611053467, + "ce_loss_23": 2.9399641871452333, + "ce_loss_3": 4.217822635173798, + "ce_loss_6": 3.758314859867096, + "epoch": 0.207, + "grad_norm": 1016.0, + "kl_loss_12": 732.0251220703125, + "kl_loss_17": 235.47347717285157, + "kl_loss_3": 2669.2941284179688, + "kl_loss_6": 1763.8640869140625, + "learning_rate": 0.000905439076010301, + "loss": 1357.1877, + "step": 2070 + }, + { + "ce_loss_12": 3.230174386501312, + "ce_loss_17": 3.0027816772460936, + "ce_loss_23": 2.89189817905426, + "ce_loss_3": 4.2117068529129025, + "ce_loss_6": 3.7440367579460143, + "epoch": 0.208, + "grad_norm": 1432.0, + "kl_loss_12": 743.4530548095703, + "kl_loss_17": 236.2601432800293, + "kl_loss_3": 2725.55673828125, + "kl_loss_6": 1810.9228759765624, + "learning_rate": 0.0009045084971874737, + "loss": 1349.2176, + "step": 2080 + }, + { + "ce_loss_12": 3.2096628665924074, + "ce_loss_17": 2.9940597772598267, + "ce_loss_23": 2.8850452184677122, + "ce_loss_3": 4.1816226720809935, + "ce_loss_6": 3.7097704291343687, + "epoch": 0.209, + "grad_norm": 1240.0, + "kl_loss_12": 738.1762908935547, + "kl_loss_17": 246.9880401611328, + "kl_loss_3": 2709.64326171875, + "kl_loss_6": 1782.71328125, + "learning_rate": 0.0009035738449685707, + "loss": 1388.232, + "step": 2090 + }, + { + "ce_loss_12": 3.1514390230178835, + "ce_loss_17": 2.92235689163208, + "ce_loss_23": 2.8135188102722166, + "ce_loss_3": 4.17981231212616, + "ce_loss_6": 3.6853424429893495, + "epoch": 0.21, + "grad_norm": 1176.0, + "kl_loss_12": 742.7584381103516, + "kl_loss_17": 233.57797088623047, + "kl_loss_3": 2805.8019775390626, + "kl_loss_6": 1845.4294494628907, + "learning_rate": 0.0009026351287655293, + "loss": 1370.0281, + "step": 2100 + }, + { + "ce_loss_12": 3.3175438404083253, + "ce_loss_17": 3.1058226585388184, + "ce_loss_23": 3.0096519112586977, + "ce_loss_3": 4.2248126745224, + "ce_loss_6": 3.779649579524994, + "epoch": 0.211, + "grad_norm": 1384.0, + "kl_loss_12": 705.2593170166016, + "kl_loss_17": 222.40945892333986, + "kl_loss_3": 2558.6331787109375, + "kl_loss_6": 1683.1639892578125, + "learning_rate": 0.0009016923580312113, + "loss": 1303.9625, + "step": 2110 + }, + { + "ce_loss_12": 3.1874075293540955, + "ce_loss_17": 2.975838828086853, + "ce_loss_23": 2.872942864894867, + "ce_loss_3": 4.15403380393982, + "ce_loss_6": 3.683121955394745, + "epoch": 0.212, + "grad_norm": 1432.0, + "kl_loss_12": 723.3132904052734, + "kl_loss_17": 226.7868797302246, + "kl_loss_3": 2680.508654785156, + "kl_loss_6": 1756.263201904297, + "learning_rate": 0.0009007455422593077, + "loss": 1372.0053, + "step": 2120 + }, + { + "ce_loss_12": 3.2143858790397646, + "ce_loss_17": 2.989579474925995, + "ce_loss_23": 2.8894683003425596, + "ce_loss_3": 4.213230812549591, + "ce_loss_6": 3.734225702285767, + "epoch": 0.213, + "grad_norm": 1104.0, + "kl_loss_12": 741.5362884521485, + "kl_loss_17": 224.52107315063478, + "kl_loss_3": 2771.612683105469, + "kl_loss_6": 1827.0911804199218, + "learning_rate": 0.0008997946909842425, + "loss": 1384.9793, + "step": 2130 + }, + { + "ce_loss_12": 3.2382789134979246, + "ce_loss_17": 2.9995150327682496, + "ce_loss_23": 2.8895116806030274, + "ce_loss_3": 4.2739152073860165, + "ce_loss_6": 3.7909855246543884, + "epoch": 0.214, + "grad_norm": 1012.0, + "kl_loss_12": 768.0854797363281, + "kl_loss_17": 242.95709915161132, + "kl_loss_3": 2860.61904296875, + "kl_loss_6": 1907.5541564941407, + "learning_rate": 0.0008988398137810777, + "loss": 1387.8596, + "step": 2140 + }, + { + "ce_loss_12": 3.251051902770996, + "ce_loss_17": 3.0360903263092043, + "ce_loss_23": 2.931905817985535, + "ce_loss_3": 4.211556780338287, + "ce_loss_6": 3.7544206738471986, + "epoch": 0.215, + "grad_norm": 1136.0, + "kl_loss_12": 717.1144714355469, + "kl_loss_17": 236.59547500610353, + "kl_loss_3": 2675.7889892578123, + "kl_loss_6": 1773.575018310547, + "learning_rate": 0.0008978809202654162, + "loss": 1340.9514, + "step": 2150 + }, + { + "ce_loss_12": 3.2336065649986265, + "ce_loss_17": 3.020431411266327, + "ce_loss_23": 2.912239933013916, + "ce_loss_3": 4.21031643152237, + "ce_loss_6": 3.7399205327033997, + "epoch": 0.216, + "grad_norm": 1280.0, + "kl_loss_12": 732.2896759033204, + "kl_loss_17": 241.09996490478517, + "kl_loss_3": 2681.3944580078123, + "kl_loss_6": 1765.8480407714844, + "learning_rate": 0.0008969180200933046, + "loss": 1367.7377, + "step": 2160 + }, + { + "ce_loss_12": 3.2038675785064696, + "ce_loss_17": 2.979322910308838, + "ce_loss_23": 2.8632314205169678, + "ce_loss_3": 4.210316956043243, + "ce_loss_6": 3.7373035192489623, + "epoch": 0.217, + "grad_norm": 1688.0, + "kl_loss_12": 746.2812072753907, + "kl_loss_17": 253.4534484863281, + "kl_loss_3": 2745.815869140625, + "kl_loss_6": 1829.8422790527343, + "learning_rate": 0.0008959511229611376, + "loss": 1391.7037, + "step": 2170 + }, + { + "ce_loss_12": 3.261945140361786, + "ce_loss_17": 3.054550528526306, + "ce_loss_23": 2.943201684951782, + "ce_loss_3": 4.24164912700653, + "ce_loss_6": 3.771306538581848, + "epoch": 0.218, + "grad_norm": 1128.0, + "kl_loss_12": 723.3908752441406, + "kl_loss_17": 245.53762588500976, + "kl_loss_3": 2722.761669921875, + "kl_loss_6": 1795.5326293945313, + "learning_rate": 0.0008949802386055581, + "loss": 1363.358, + "step": 2180 + }, + { + "ce_loss_12": 3.130527687072754, + "ce_loss_17": 2.924950349330902, + "ce_loss_23": 2.813413393497467, + "ce_loss_3": 4.109816062450409, + "ce_loss_6": 3.645668351650238, + "epoch": 0.219, + "grad_norm": 1424.0, + "kl_loss_12": 714.7257232666016, + "kl_loss_17": 246.9955825805664, + "kl_loss_3": 2663.333203125, + "kl_loss_6": 1757.4798095703125, + "learning_rate": 0.0008940053768033609, + "loss": 1379.0065, + "step": 2190 + }, + { + "ce_loss_12": 3.2098819971084596, + "ce_loss_17": 3.0158223152160644, + "ce_loss_23": 2.8984384655952455, + "ce_loss_3": 4.161436128616333, + "ce_loss_6": 3.708323049545288, + "epoch": 0.22, + "grad_norm": 1104.0, + "kl_loss_12": 710.2417388916016, + "kl_loss_17": 263.0302261352539, + "kl_loss_3": 2651.0561279296876, + "kl_loss_6": 1757.548602294922, + "learning_rate": 0.0008930265473713938, + "loss": 1348.1185, + "step": 2200 + }, + { + "ce_loss_12": 3.1847044706344603, + "ce_loss_17": 2.9818301796913147, + "ce_loss_23": 2.8639705538749696, + "ce_loss_3": 4.157197976112366, + "ce_loss_6": 3.6896390080451966, + "epoch": 0.221, + "grad_norm": 1080.0, + "kl_loss_12": 724.5081848144531, + "kl_loss_17": 249.5000907897949, + "kl_loss_3": 2682.2561767578127, + "kl_loss_6": 1762.918389892578, + "learning_rate": 0.0008920437601664579, + "loss": 1327.0632, + "step": 2210 + }, + { + "ce_loss_12": 3.182631862163544, + "ce_loss_17": 2.9684879183769226, + "ce_loss_23": 2.858442449569702, + "ce_loss_3": 4.159274959564209, + "ce_loss_6": 3.700261175632477, + "epoch": 0.222, + "grad_norm": 1168.0, + "kl_loss_12": 737.7363159179688, + "kl_loss_17": 243.66717834472655, + "kl_loss_3": 2716.596545410156, + "kl_loss_6": 1817.600164794922, + "learning_rate": 0.0008910570250852097, + "loss": 1348.7851, + "step": 2220 + }, + { + "ce_loss_12": 3.2627673745155334, + "ce_loss_17": 3.048254370689392, + "ce_loss_23": 2.9470550298690794, + "ce_loss_3": 4.182011067867279, + "ce_loss_6": 3.7384843826293945, + "epoch": 0.223, + "grad_norm": 1424.0, + "kl_loss_12": 702.40751953125, + "kl_loss_17": 227.97779693603516, + "kl_loss_3": 2588.418603515625, + "kl_loss_6": 1716.8679077148438, + "learning_rate": 0.0008900663520640604, + "loss": 1316.1682, + "step": 2230 + }, + { + "ce_loss_12": 3.22307368516922, + "ce_loss_17": 3.0084318041801454, + "ce_loss_23": 2.9019481539726257, + "ce_loss_3": 4.189678382873535, + "ce_loss_6": 3.7194597840309145, + "epoch": 0.224, + "grad_norm": 1144.0, + "kl_loss_12": 719.4921600341797, + "kl_loss_17": 233.06052169799804, + "kl_loss_3": 2691.5576293945314, + "kl_loss_6": 1774.1268249511718, + "learning_rate": 0.0008890717510790764, + "loss": 1353.0216, + "step": 2240 + }, + { + "ce_loss_12": 3.181252729892731, + "ce_loss_17": 2.967786800861359, + "ce_loss_23": 2.8688748478889465, + "ce_loss_3": 4.168042957782745, + "ce_loss_6": 3.693181240558624, + "epoch": 0.225, + "grad_norm": 1080.0, + "kl_loss_12": 713.7260528564453, + "kl_loss_17": 225.8282371520996, + "kl_loss_3": 2697.2067749023436, + "kl_loss_6": 1771.8312927246093, + "learning_rate": 0.0008880732321458784, + "loss": 1359.2141, + "step": 2250 + }, + { + "ce_loss_12": 3.2142521858215334, + "ce_loss_17": 2.9944754362106325, + "ce_loss_23": 2.8961597084999084, + "ce_loss_3": 4.1702636003494264, + "ce_loss_6": 3.70775785446167, + "epoch": 0.226, + "grad_norm": 1240.0, + "kl_loss_12": 718.1211059570312, + "kl_loss_17": 221.46337661743163, + "kl_loss_3": 2660.3265625, + "kl_loss_6": 1751.7075927734375, + "learning_rate": 0.0008870708053195413, + "loss": 1356.8067, + "step": 2260 + }, + { + "ce_loss_12": 3.2218335270881653, + "ce_loss_17": 3.0125537276268006, + "ce_loss_23": 2.9187202572822573, + "ce_loss_3": 4.162897741794586, + "ce_loss_6": 3.7049925565719604, + "epoch": 0.227, + "grad_norm": 1024.0, + "kl_loss_12": 693.9300354003906, + "kl_loss_17": 211.78112869262696, + "kl_loss_3": 2611.0219970703124, + "kl_loss_6": 1716.663330078125, + "learning_rate": 0.0008860644806944918, + "loss": 1325.6539, + "step": 2270 + }, + { + "ce_loss_12": 3.1866829633712768, + "ce_loss_17": 2.962891864776611, + "ce_loss_23": 2.8645379543304443, + "ce_loss_3": 4.165493011474609, + "ce_loss_6": 3.6995256423950194, + "epoch": 0.228, + "grad_norm": 996.0, + "kl_loss_12": 725.6792572021484, + "kl_loss_17": 223.19341049194335, + "kl_loss_3": 2702.9573364257812, + "kl_loss_6": 1784.9772827148438, + "learning_rate": 0.0008850542684044079, + "loss": 1325.885, + "step": 2280 + }, + { + "ce_loss_12": 3.162304496765137, + "ce_loss_17": 2.9285544872283937, + "ce_loss_23": 2.821481502056122, + "ce_loss_3": 4.180591094493866, + "ce_loss_6": 3.6861800074577333, + "epoch": 0.229, + "grad_norm": 1176.0, + "kl_loss_12": 749.9650268554688, + "kl_loss_17": 233.6118423461914, + "kl_loss_3": 2823.8099609375, + "kl_loss_6": 1851.4998291015625, + "learning_rate": 0.0008840401786221159, + "loss": 1370.4396, + "step": 2290 + }, + { + "ce_loss_12": 3.2695318937301634, + "ce_loss_17": 3.056403195858002, + "ce_loss_23": 2.965723288059235, + "ce_loss_3": 4.215631699562072, + "ce_loss_6": 3.76085741519928, + "epoch": 0.23, + "grad_norm": 1136.0, + "kl_loss_12": 693.4002868652344, + "kl_loss_17": 209.44346084594727, + "kl_loss_3": 2616.2891845703125, + "kl_loss_6": 1727.0918090820312, + "learning_rate": 0.000883022221559489, + "loss": 1308.9746, + "step": 2300 + }, + { + "ce_loss_12": 3.239469087123871, + "ce_loss_17": 3.0272177696228026, + "ce_loss_23": 2.9308255076408387, + "ce_loss_3": 4.211579275131226, + "ce_loss_6": 3.751608657836914, + "epoch": 0.231, + "grad_norm": 1360.0, + "kl_loss_12": 709.3472320556641, + "kl_loss_17": 216.6603546142578, + "kl_loss_3": 2677.8682495117187, + "kl_loss_6": 1775.0725708007812, + "learning_rate": 0.0008820004074673434, + "loss": 1373.6508, + "step": 2310 + }, + { + "ce_loss_12": 3.1577208757400514, + "ce_loss_17": 2.9385272026062013, + "ce_loss_23": 2.8453004121780396, + "ce_loss_3": 4.112058687210083, + "ce_loss_6": 3.656074655056, + "epoch": 0.232, + "grad_norm": 1360.0, + "kl_loss_12": 712.0259033203125, + "kl_loss_17": 213.32648620605468, + "kl_loss_3": 2673.76650390625, + "kl_loss_6": 1774.3185424804688, + "learning_rate": 0.0008809747466353355, + "loss": 1325.0434, + "step": 2320 + }, + { + "ce_loss_12": 3.157701861858368, + "ce_loss_17": 2.940626549720764, + "ce_loss_23": 2.842818570137024, + "ce_loss_3": 4.130900573730469, + "ce_loss_6": 3.6584607481956484, + "epoch": 0.233, + "grad_norm": 1088.0, + "kl_loss_12": 702.6765716552734, + "kl_loss_17": 215.8955406188965, + "kl_loss_3": 2674.025427246094, + "kl_loss_6": 1746.1587280273438, + "learning_rate": 0.0008799452493918585, + "loss": 1344.9795, + "step": 2330 + }, + { + "ce_loss_12": 3.224400007724762, + "ce_loss_17": 3.0065416574478148, + "ce_loss_23": 2.912652146816254, + "ce_loss_3": 4.178082239627838, + "ce_loss_6": 3.7230546116828918, + "epoch": 0.234, + "grad_norm": 1520.0, + "kl_loss_12": 704.7937683105469, + "kl_loss_17": 212.39177703857422, + "kl_loss_3": 2653.9332641601563, + "kl_loss_6": 1759.9414428710938, + "learning_rate": 0.0008789119261039385, + "loss": 1374.2182, + "step": 2340 + }, + { + "ce_loss_12": 3.1472853779792787, + "ce_loss_17": 2.9305580973625185, + "ce_loss_23": 2.8364876210689545, + "ce_loss_3": 4.11597170829773, + "ce_loss_6": 3.6585975170135496, + "epoch": 0.235, + "grad_norm": 1248.0, + "kl_loss_12": 706.184780883789, + "kl_loss_17": 212.0734100341797, + "kl_loss_3": 2675.7802612304686, + "kl_loss_6": 1771.2001037597656, + "learning_rate": 0.0008778747871771292, + "loss": 1316.8098, + "step": 2350 + }, + { + "ce_loss_12": 3.179029405117035, + "ce_loss_17": 2.975753378868103, + "ce_loss_23": 2.8873407006263734, + "ce_loss_3": 4.119187295436859, + "ce_loss_6": 3.669054090976715, + "epoch": 0.236, + "grad_norm": 1304.0, + "kl_loss_12": 676.4259826660157, + "kl_loss_17": 203.13166885375978, + "kl_loss_3": 2592.462512207031, + "kl_loss_6": 1700.6154052734375, + "learning_rate": 0.0008768338430554083, + "loss": 1293.2753, + "step": 2360 + }, + { + "ce_loss_12": 3.2074209094047545, + "ce_loss_17": 2.986566960811615, + "ce_loss_23": 2.8928439378738404, + "ce_loss_3": 4.155289900302887, + "ce_loss_6": 3.6990363121032717, + "epoch": 0.237, + "grad_norm": 1104.0, + "kl_loss_12": 705.8325775146484, + "kl_loss_17": 215.99607696533204, + "kl_loss_3": 2624.538525390625, + "kl_loss_6": 1726.0707092285156, + "learning_rate": 0.0008757891042210713, + "loss": 1332.907, + "step": 2370 + }, + { + "ce_loss_12": 3.213341999053955, + "ce_loss_17": 3.00328369140625, + "ce_loss_23": 2.908613753318787, + "ce_loss_3": 4.1675421595573425, + "ce_loss_6": 3.709250104427338, + "epoch": 0.238, + "grad_norm": 1112.0, + "kl_loss_12": 694.2518676757812, + "kl_loss_17": 214.54538650512694, + "kl_loss_3": 2620.0777099609377, + "kl_loss_6": 1721.7463134765626, + "learning_rate": 0.0008747405811946271, + "loss": 1322.9535, + "step": 2380 + }, + { + "ce_loss_12": 3.1364938855171203, + "ce_loss_17": 2.916593599319458, + "ce_loss_23": 2.818560254573822, + "ce_loss_3": 4.146471822261811, + "ce_loss_6": 3.6627113580703736, + "epoch": 0.239, + "grad_norm": 1232.0, + "kl_loss_12": 718.5253265380859, + "kl_loss_17": 218.13416290283203, + "kl_loss_3": 2761.2788330078124, + "kl_loss_6": 1813.5057373046875, + "learning_rate": 0.0008736882845346905, + "loss": 1332.236, + "step": 2390 + }, + { + "ce_loss_12": 3.21389524936676, + "ce_loss_17": 2.9948238492012025, + "ce_loss_23": 2.893634247779846, + "ce_loss_3": 4.187910413742065, + "ce_loss_6": 3.7110480427742005, + "epoch": 0.24, + "grad_norm": 1264.0, + "kl_loss_12": 719.9792236328125, + "kl_loss_17": 223.24748992919922, + "kl_loss_3": 2672.5799560546875, + "kl_loss_6": 1746.6874389648438, + "learning_rate": 0.0008726322248378774, + "loss": 1323.5588, + "step": 2400 + }, + { + "ce_loss_12": 3.211190974712372, + "ce_loss_17": 2.9982844233512878, + "ce_loss_23": 2.9063130021095276, + "ce_loss_3": 4.206434118747711, + "ce_loss_6": 3.72991144657135, + "epoch": 0.241, + "grad_norm": 1184.0, + "kl_loss_12": 704.4481994628907, + "kl_loss_17": 211.553133392334, + "kl_loss_3": 2721.0894775390625, + "kl_loss_6": 1794.480224609375, + "learning_rate": 0.0008715724127386971, + "loss": 1368.4092, + "step": 2410 + }, + { + "ce_loss_12": 3.2690393209457396, + "ce_loss_17": 3.0638194561004637, + "ce_loss_23": 2.967925226688385, + "ce_loss_3": 4.219721961021423, + "ce_loss_6": 3.7553182005882264, + "epoch": 0.242, + "grad_norm": 1328.0, + "kl_loss_12": 686.9312622070313, + "kl_loss_17": 218.79831314086914, + "kl_loss_3": 2634.0843994140623, + "kl_loss_6": 1712.5268676757812, + "learning_rate": 0.0008705088589094458, + "loss": 1325.8676, + "step": 2420 + }, + { + "ce_loss_12": 3.2798361659049986, + "ce_loss_17": 3.078590726852417, + "ce_loss_23": 2.981303107738495, + "ce_loss_3": 4.241901850700378, + "ce_loss_6": 3.780438446998596, + "epoch": 0.243, + "grad_norm": 1472.0, + "kl_loss_12": 698.8630310058594, + "kl_loss_17": 229.17339553833008, + "kl_loss_3": 2658.3953369140627, + "kl_loss_6": 1746.2057495117188, + "learning_rate": 0.0008694415740600988, + "loss": 1338.1035, + "step": 2430 + }, + { + "ce_loss_12": 3.158348333835602, + "ce_loss_17": 2.9655771493911742, + "ce_loss_23": 2.848409104347229, + "ce_loss_3": 4.141533851623535, + "ce_loss_6": 3.6722955226898195, + "epoch": 0.244, + "grad_norm": 1136.0, + "kl_loss_12": 708.9916778564453, + "kl_loss_17": 276.5789794921875, + "kl_loss_3": 2714.6683959960938, + "kl_loss_6": 1797.1813537597657, + "learning_rate": 0.0008683705689382025, + "loss": 1349.2308, + "step": 2440 + }, + { + "ce_loss_12": 3.2215414166450502, + "ce_loss_17": 3.026882493495941, + "ce_loss_23": 2.926299273967743, + "ce_loss_3": 4.159836375713349, + "ce_loss_6": 3.708118665218353, + "epoch": 0.245, + "grad_norm": 988.0, + "kl_loss_12": 686.4640747070313, + "kl_loss_17": 228.41824951171876, + "kl_loss_3": 2609.161340332031, + "kl_loss_6": 1714.3937561035157, + "learning_rate": 0.0008672958543287666, + "loss": 1340.6461, + "step": 2450 + }, + { + "ce_loss_12": 3.2369601130485535, + "ce_loss_17": 3.0324127793312075, + "ce_loss_23": 2.9349398016929626, + "ce_loss_3": 4.168296587467194, + "ce_loss_6": 3.71487580537796, + "epoch": 0.246, + "grad_norm": 1056.0, + "kl_loss_12": 702.6545867919922, + "kl_loss_17": 223.37064590454102, + "kl_loss_3": 2597.1249267578123, + "kl_loss_6": 1710.9740417480468, + "learning_rate": 0.0008662174410541554, + "loss": 1311.1107, + "step": 2460 + }, + { + "ce_loss_12": 3.197036552429199, + "ce_loss_17": 2.9945160865783693, + "ce_loss_23": 2.9003651618957518, + "ce_loss_3": 4.1306620121002195, + "ce_loss_6": 3.6728007078170775, + "epoch": 0.247, + "grad_norm": 1216.0, + "kl_loss_12": 688.5184326171875, + "kl_loss_17": 223.68284378051757, + "kl_loss_3": 2592.430944824219, + "kl_loss_6": 1692.0546813964843, + "learning_rate": 0.0008651353399739787, + "loss": 1337.2021, + "step": 2470 + }, + { + "ce_loss_12": 3.2234519362449645, + "ce_loss_17": 3.017361891269684, + "ce_loss_23": 2.9193612813949583, + "ce_loss_3": 4.171088445186615, + "ce_loss_6": 3.707779347896576, + "epoch": 0.248, + "grad_norm": 1080.0, + "kl_loss_12": 692.3505249023438, + "kl_loss_17": 220.68072967529298, + "kl_loss_3": 2605.4829345703124, + "kl_loss_6": 1707.4573181152343, + "learning_rate": 0.0008640495619849821, + "loss": 1312.7043, + "step": 2480 + }, + { + "ce_loss_12": 3.188669204711914, + "ce_loss_17": 2.9883776545524596, + "ce_loss_23": 2.8864614367485046, + "ce_loss_3": 4.126471364498139, + "ce_loss_6": 3.6681827306747437, + "epoch": 0.249, + "grad_norm": 1376.0, + "kl_loss_12": 692.8002624511719, + "kl_loss_17": 222.35086212158203, + "kl_loss_3": 2602.0055541992188, + "kl_loss_6": 1697.7270385742188, + "learning_rate": 0.0008629601180209381, + "loss": 1307.9302, + "step": 2490 + }, + { + "ce_loss_12": 3.1828148007392882, + "ce_loss_17": 2.9769126296043398, + "ce_loss_23": 2.876669943332672, + "ce_loss_3": 4.1202881097793576, + "ce_loss_6": 3.660766232013702, + "epoch": 0.25, + "grad_norm": 948.0, + "kl_loss_12": 684.8112030029297, + "kl_loss_17": 234.48795318603516, + "kl_loss_3": 2583.1467529296874, + "kl_loss_6": 1682.2519226074219, + "learning_rate": 0.000861867019052535, + "loss": 1320.1142, + "step": 2500 + }, + { + "ce_loss_12": 3.1194626927375793, + "ce_loss_17": 2.9112721920013427, + "ce_loss_23": 2.8053731083869935, + "ce_loss_3": 4.102438044548035, + "ce_loss_6": 3.6318350195884705, + "epoch": 0.251, + "grad_norm": 1072.0, + "kl_loss_12": 698.8854797363281, + "kl_loss_17": 235.78541717529296, + "kl_loss_3": 2680.2626220703123, + "kl_loss_6": 1758.730224609375, + "learning_rate": 0.0008607702760872678, + "loss": 1343.0601, + "step": 2510 + }, + { + "ce_loss_12": 3.204072630405426, + "ce_loss_17": 3.011195695400238, + "ce_loss_23": 2.90932697057724, + "ce_loss_3": 4.138043749332428, + "ce_loss_6": 3.686288094520569, + "epoch": 0.252, + "grad_norm": 1168.0, + "kl_loss_12": 674.006314086914, + "kl_loss_17": 224.07162475585938, + "kl_loss_3": 2564.412548828125, + "kl_loss_6": 1682.5275573730469, + "learning_rate": 0.0008596699001693256, + "loss": 1324.4848, + "step": 2520 + }, + { + "ce_loss_12": 3.2106514811515807, + "ce_loss_17": 3.017582905292511, + "ce_loss_23": 2.922886919975281, + "ce_loss_3": 4.140046346187591, + "ce_loss_6": 3.6838871836662292, + "epoch": 0.253, + "grad_norm": 1192.0, + "kl_loss_12": 676.8567138671875, + "kl_loss_17": 218.53152160644532, + "kl_loss_3": 2574.9288696289063, + "kl_loss_6": 1676.3718872070312, + "learning_rate": 0.0008585659023794818, + "loss": 1328.5582, + "step": 2530 + }, + { + "ce_loss_12": 3.1961737751960753, + "ce_loss_17": 2.985444128513336, + "ce_loss_23": 2.8865877389907837, + "ce_loss_3": 4.184960126876831, + "ce_loss_6": 3.715757930278778, + "epoch": 0.254, + "grad_norm": 1072.0, + "kl_loss_12": 711.092642211914, + "kl_loss_17": 226.49666290283204, + "kl_loss_3": 2703.228369140625, + "kl_loss_6": 1769.7383117675781, + "learning_rate": 0.0008574582938349817, + "loss": 1332.0293, + "step": 2540 + }, + { + "ce_loss_12": 3.184503936767578, + "ce_loss_17": 2.9647669196128845, + "ce_loss_23": 2.8564560532569887, + "ce_loss_3": 4.1622985124588014, + "ce_loss_6": 3.685743486881256, + "epoch": 0.255, + "grad_norm": 1088.0, + "kl_loss_12": 727.4106658935547, + "kl_loss_17": 232.05386352539062, + "kl_loss_3": 2709.0426513671873, + "kl_loss_6": 1767.201641845703, + "learning_rate": 0.0008563470856894315, + "loss": 1316.1617, + "step": 2550 + }, + { + "ce_loss_12": 3.1625547409057617, + "ce_loss_17": 2.9571083307266237, + "ce_loss_23": 2.8621199488639832, + "ce_loss_3": 4.132767677307129, + "ce_loss_6": 3.6766427636146544, + "epoch": 0.256, + "grad_norm": 1296.0, + "kl_loss_12": 692.0477294921875, + "kl_loss_17": 216.78019943237305, + "kl_loss_3": 2642.7876586914062, + "kl_loss_6": 1745.0903747558593, + "learning_rate": 0.0008552322891326845, + "loss": 1316.6967, + "step": 2560 + }, + { + "ce_loss_12": 3.1427706480026245, + "ce_loss_17": 2.935354781150818, + "ce_loss_23": 2.839388430118561, + "ce_loss_3": 4.1130335330963135, + "ce_loss_6": 3.6411257863044737, + "epoch": 0.257, + "grad_norm": 1608.0, + "kl_loss_12": 691.3601867675782, + "kl_loss_17": 217.39927825927734, + "kl_loss_3": 2656.824072265625, + "kl_loss_6": 1726.0087158203125, + "learning_rate": 0.0008541139153907296, + "loss": 1300.7164, + "step": 2570 + }, + { + "ce_loss_12": 3.097336781024933, + "ce_loss_17": 2.8923059940338134, + "ce_loss_23": 2.8015827894210816, + "ce_loss_3": 4.065582001209259, + "ce_loss_6": 3.594275784492493, + "epoch": 0.258, + "grad_norm": 1072.0, + "kl_loss_12": 673.7452972412109, + "kl_loss_17": 205.65441131591797, + "kl_loss_3": 2627.886279296875, + "kl_loss_6": 1704.77724609375, + "learning_rate": 0.0008529919757255782, + "loss": 1320.6559, + "step": 2580 + }, + { + "ce_loss_12": 3.121126043796539, + "ce_loss_17": 2.9228207588195803, + "ce_loss_23": 2.838605988025665, + "ce_loss_3": 4.037435448169708, + "ce_loss_6": 3.584921109676361, + "epoch": 0.259, + "grad_norm": 1152.0, + "kl_loss_12": 657.7913787841796, + "kl_loss_17": 202.52617797851562, + "kl_loss_3": 2537.204150390625, + "kl_loss_6": 1645.8016479492187, + "learning_rate": 0.0008518664814351503, + "loss": 1274.3268, + "step": 2590 + }, + { + "ce_loss_12": 3.104781413078308, + "ce_loss_17": 2.8901186347007752, + "ce_loss_23": 2.795970094203949, + "ce_loss_3": 4.082364869117737, + "ce_loss_6": 3.605860185623169, + "epoch": 0.26, + "grad_norm": 1344.0, + "kl_loss_12": 696.5720184326171, + "kl_loss_17": 214.5135696411133, + "kl_loss_3": 2675.3045654296875, + "kl_loss_6": 1741.6918823242188, + "learning_rate": 0.0008507374438531607, + "loss": 1363.2209, + "step": 2600 + }, + { + "ce_loss_12": 3.081671857833862, + "ce_loss_17": 2.876622939109802, + "ce_loss_23": 2.784547483921051, + "ce_loss_3": 4.034499597549439, + "ce_loss_6": 3.576632249355316, + "epoch": 0.261, + "grad_norm": 1096.0, + "kl_loss_12": 682.1584899902343, + "kl_loss_17": 206.09313354492187, + "kl_loss_3": 2613.8728271484374, + "kl_loss_6": 1720.1293151855468, + "learning_rate": 0.0008496048743490053, + "loss": 1305.6201, + "step": 2610 + }, + { + "ce_loss_12": 3.2290136218070984, + "ce_loss_17": 3.0230956435203553, + "ce_loss_23": 2.9306798219680785, + "ce_loss_3": 4.150061511993409, + "ce_loss_6": 3.707044267654419, + "epoch": 0.262, + "grad_norm": 1192.0, + "kl_loss_12": 678.9570159912109, + "kl_loss_17": 205.46227264404297, + "kl_loss_3": 2554.87841796875, + "kl_loss_6": 1674.7249084472655, + "learning_rate": 0.0008484687843276469, + "loss": 1290.4998, + "step": 2620 + }, + { + "ce_loss_12": 3.1576597094535828, + "ce_loss_17": 2.95311176776886, + "ce_loss_23": 2.8603856325149537, + "ce_loss_3": 4.113218057155609, + "ce_loss_6": 3.6455924272537232, + "epoch": 0.263, + "grad_norm": 1528.0, + "kl_loss_12": 689.7358795166016, + "kl_loss_17": 210.2408332824707, + "kl_loss_3": 2610.7355712890626, + "kl_loss_6": 1700.8219421386718, + "learning_rate": 0.0008473291852294987, + "loss": 1316.9947, + "step": 2630 + }, + { + "ce_loss_12": 3.172579658031464, + "ce_loss_17": 2.9600013256073, + "ce_loss_23": 2.86677725315094, + "ce_loss_3": 4.115126585960388, + "ce_loss_6": 3.6532453536987304, + "epoch": 0.264, + "grad_norm": 1168.0, + "kl_loss_12": 696.9774536132812, + "kl_loss_17": 212.4393394470215, + "kl_loss_3": 2622.4283813476563, + "kl_loss_6": 1705.7791381835937, + "learning_rate": 0.0008461860885303114, + "loss": 1300.3334, + "step": 2640 + }, + { + "ce_loss_12": 3.1920876264572144, + "ce_loss_17": 2.984456753730774, + "ce_loss_23": 2.896187424659729, + "ce_loss_3": 4.123092436790467, + "ce_loss_6": 3.662570369243622, + "epoch": 0.265, + "grad_norm": 1064.0, + "kl_loss_12": 677.250210571289, + "kl_loss_17": 203.22478103637695, + "kl_loss_3": 2562.327197265625, + "kl_loss_6": 1662.8192504882813, + "learning_rate": 0.000845039505741056, + "loss": 1299.5422, + "step": 2650 + }, + { + "ce_loss_12": 3.1844265699386596, + "ce_loss_17": 2.9672574758529664, + "ce_loss_23": 2.8763338327407837, + "ce_loss_3": 4.1377707600593565, + "ce_loss_6": 3.6718860268592834, + "epoch": 0.266, + "grad_norm": 1128.0, + "kl_loss_12": 707.8172332763672, + "kl_loss_17": 211.42220458984374, + "kl_loss_3": 2656.2222534179687, + "kl_loss_6": 1731.8209411621094, + "learning_rate": 0.0008438894484078086, + "loss": 1353.5854, + "step": 2660 + }, + { + "ce_loss_12": 3.186161124706268, + "ce_loss_17": 2.982400453090668, + "ce_loss_23": 2.8897322177886964, + "ce_loss_3": 4.114835977554321, + "ce_loss_6": 3.6600540041923524, + "epoch": 0.267, + "grad_norm": 1064.0, + "kl_loss_12": 677.1057373046875, + "kl_loss_17": 211.50477447509766, + "kl_loss_3": 2567.3158447265623, + "kl_loss_6": 1681.799871826172, + "learning_rate": 0.0008427359281116334, + "loss": 1296.6374, + "step": 2670 + }, + { + "ce_loss_12": 3.093821132183075, + "ce_loss_17": 2.882770228385925, + "ce_loss_23": 2.791520416736603, + "ce_loss_3": 4.071115756034851, + "ce_loss_6": 3.5991916179656984, + "epoch": 0.268, + "grad_norm": 1248.0, + "kl_loss_12": 687.5313018798828, + "kl_loss_17": 210.13628540039062, + "kl_loss_3": 2665.6412109375, + "kl_loss_6": 1737.7759338378905, + "learning_rate": 0.0008415789564684673, + "loss": 1315.315, + "step": 2680 + }, + { + "ce_loss_12": 3.319716191291809, + "ce_loss_17": 3.1131805419921874, + "ce_loss_23": 3.017329823970795, + "ce_loss_3": 4.241286253929138, + "ce_loss_6": 3.787472295761108, + "epoch": 0.269, + "grad_norm": 1496.0, + "kl_loss_12": 685.154019165039, + "kl_loss_17": 214.83649978637695, + "kl_loss_3": 2533.8969482421876, + "kl_loss_6": 1651.515673828125, + "learning_rate": 0.0008404185451290017, + "loss": 1275.1857, + "step": 2690 + }, + { + "ce_loss_12": 3.1939535617828367, + "ce_loss_17": 2.9861059308052065, + "ce_loss_23": 2.895950734615326, + "ce_loss_3": 4.137200510501861, + "ce_loss_6": 3.677232837677002, + "epoch": 0.27, + "grad_norm": 968.0, + "kl_loss_12": 676.4378295898438, + "kl_loss_17": 211.2507179260254, + "kl_loss_3": 2583.730187988281, + "kl_loss_6": 1676.1700622558594, + "learning_rate": 0.0008392547057785661, + "loss": 1288.5631, + "step": 2700 + }, + { + "ce_loss_12": 3.1444958209991456, + "ce_loss_17": 2.932486617565155, + "ce_loss_23": 2.839138376712799, + "ce_loss_3": 4.131505846977234, + "ce_loss_6": 3.650907111167908, + "epoch": 0.271, + "grad_norm": 980.0, + "kl_loss_12": 697.2017883300781, + "kl_loss_17": 217.85244369506836, + "kl_loss_3": 2717.758996582031, + "kl_loss_6": 1766.1455200195312, + "learning_rate": 0.0008380874501370098, + "loss": 1298.7047, + "step": 2710 + }, + { + "ce_loss_12": 3.1357044100761415, + "ce_loss_17": 2.924636518955231, + "ce_loss_23": 2.8253360390663147, + "ce_loss_3": 4.122107923030853, + "ce_loss_6": 3.6408496141433715, + "epoch": 0.272, + "grad_norm": 1192.0, + "kl_loss_12": 708.8865112304687, + "kl_loss_17": 233.48337783813477, + "kl_loss_3": 2702.6140747070312, + "kl_loss_6": 1754.3397705078125, + "learning_rate": 0.0008369167899585841, + "loss": 1331.4025, + "step": 2720 + }, + { + "ce_loss_12": 3.2249266386032103, + "ce_loss_17": 3.0370797514915466, + "ce_loss_23": 2.939964401721954, + "ce_loss_3": 4.134345138072968, + "ce_loss_6": 3.6882724165916443, + "epoch": 0.273, + "grad_norm": 1352.0, + "kl_loss_12": 670.7063415527343, + "kl_loss_17": 240.46832427978515, + "kl_loss_3": 2530.49345703125, + "kl_loss_6": 1649.3007019042968, + "learning_rate": 0.0008357427370318238, + "loss": 1316.18, + "step": 2730 + }, + { + "ce_loss_12": 3.189584970474243, + "ce_loss_17": 3.000252032279968, + "ce_loss_23": 2.890009045600891, + "ce_loss_3": 4.150511598587036, + "ce_loss_6": 3.6770068645477294, + "epoch": 0.274, + "grad_norm": 1400.0, + "kl_loss_12": 682.5615783691406, + "kl_loss_17": 236.95541915893554, + "kl_loss_3": 2626.866320800781, + "kl_loss_6": 1702.4308288574218, + "learning_rate": 0.0008345653031794292, + "loss": 1318.7662, + "step": 2740 + }, + { + "ce_loss_12": 3.1951624870300295, + "ce_loss_17": 2.9964037895202638, + "ce_loss_23": 2.8955544590950013, + "ce_loss_3": 4.138919460773468, + "ce_loss_6": 3.673405385017395, + "epoch": 0.275, + "grad_norm": 880.0, + "kl_loss_12": 689.3633850097656, + "kl_loss_17": 231.51595458984374, + "kl_loss_3": 2603.9886840820313, + "kl_loss_6": 1685.0747619628905, + "learning_rate": 0.0008333845002581458, + "loss": 1300.1859, + "step": 2750 + }, + { + "ce_loss_12": 3.1334605693817137, + "ce_loss_17": 2.9291109323501585, + "ce_loss_23": 2.830948543548584, + "ce_loss_3": 4.099729323387146, + "ce_loss_6": 3.637806749343872, + "epoch": 0.276, + "grad_norm": 952.0, + "kl_loss_12": 701.9874572753906, + "kl_loss_17": 226.16938247680665, + "kl_loss_3": 2669.161376953125, + "kl_loss_6": 1756.4841552734374, + "learning_rate": 0.0008322003401586462, + "loss": 1333.6066, + "step": 2760 + }, + { + "ce_loss_12": 3.150827920436859, + "ce_loss_17": 2.9546613574028013, + "ce_loss_23": 2.8605478763580323, + "ce_loss_3": 4.07707986831665, + "ce_loss_6": 3.6201881170272827, + "epoch": 0.277, + "grad_norm": 1176.0, + "kl_loss_12": 667.2383483886719, + "kl_loss_17": 213.83636016845702, + "kl_loss_3": 2540.439172363281, + "kl_loss_6": 1650.3801879882812, + "learning_rate": 0.0008310128348054094, + "loss": 1259.9731, + "step": 2770 + }, + { + "ce_loss_12": 3.1183719992637635, + "ce_loss_17": 2.924716579914093, + "ce_loss_23": 2.8316155433654786, + "ce_loss_3": 4.072813379764557, + "ce_loss_6": 3.605352246761322, + "epoch": 0.278, + "grad_norm": 1296.0, + "kl_loss_12": 671.1783020019532, + "kl_loss_17": 211.9533592224121, + "kl_loss_3": 2594.124963378906, + "kl_loss_6": 1683.5873413085938, + "learning_rate": 0.0008298219961566008, + "loss": 1293.9365, + "step": 2780 + }, + { + "ce_loss_12": 3.0993417143821715, + "ce_loss_17": 2.890523338317871, + "ce_loss_23": 2.7993259906768797, + "ce_loss_3": 4.072126877307892, + "ce_loss_6": 3.5997986793518066, + "epoch": 0.279, + "grad_norm": 1056.0, + "kl_loss_12": 693.4780517578125, + "kl_loss_17": 213.2262306213379, + "kl_loss_3": 2678.9249267578125, + "kl_loss_6": 1752.5269470214844, + "learning_rate": 0.0008286278362039527, + "loss": 1297.5109, + "step": 2790 + }, + { + "ce_loss_12": 3.121641290187836, + "ce_loss_17": 2.9188699603080748, + "ce_loss_23": 2.823632848262787, + "ce_loss_3": 4.120881426334381, + "ce_loss_6": 3.6421324014663696, + "epoch": 0.28, + "grad_norm": 1216.0, + "kl_loss_12": 687.7910034179688, + "kl_loss_17": 218.2253387451172, + "kl_loss_3": 2705.189245605469, + "kl_loss_6": 1758.8736450195313, + "learning_rate": 0.0008274303669726426, + "loss": 1302.2418, + "step": 2800 + }, + { + "ce_loss_12": 3.0395513653755186, + "ce_loss_17": 2.8333880066871644, + "ce_loss_23": 2.735865366458893, + "ce_loss_3": 4.0482837677001955, + "ce_loss_6": 3.556193935871124, + "epoch": 0.281, + "grad_norm": 1312.0, + "kl_loss_12": 685.361215209961, + "kl_loss_17": 227.50414581298827, + "kl_loss_3": 2726.5383911132812, + "kl_loss_6": 1754.9284606933593, + "learning_rate": 0.0008262296005211721, + "loss": 1304.683, + "step": 2810 + }, + { + "ce_loss_12": 3.154022693634033, + "ce_loss_17": 2.9483704686164858, + "ce_loss_23": 2.8555146217346192, + "ce_loss_3": 4.119756305217743, + "ce_loss_6": 3.65409369468689, + "epoch": 0.282, + "grad_norm": 996.0, + "kl_loss_12": 688.2166076660156, + "kl_loss_17": 218.64757843017577, + "kl_loss_3": 2633.830712890625, + "kl_loss_6": 1726.433770751953, + "learning_rate": 0.0008250255489412463, + "loss": 1300.1041, + "step": 2820 + }, + { + "ce_loss_12": 3.2485215187072756, + "ce_loss_17": 3.0447388768196104, + "ce_loss_23": 2.9463759064674377, + "ce_loss_3": 4.187284290790558, + "ce_loss_6": 3.7238049268722535, + "epoch": 0.283, + "grad_norm": 1432.0, + "kl_loss_12": 683.5297790527344, + "kl_loss_17": 215.8306427001953, + "kl_loss_3": 2594.514514160156, + "kl_loss_6": 1686.3562927246094, + "learning_rate": 0.0008238182243576511, + "loss": 1298.2584, + "step": 2830 + }, + { + "ce_loss_12": 3.1957290291786196, + "ce_loss_17": 3.0053368330001833, + "ce_loss_23": 2.9148476481437684, + "ce_loss_3": 4.07955631017685, + "ce_loss_6": 3.641888213157654, + "epoch": 0.284, + "grad_norm": 1088.0, + "kl_loss_12": 657.5304382324218, + "kl_loss_17": 208.30530853271483, + "kl_loss_3": 2456.5474853515625, + "kl_loss_6": 1602.383233642578, + "learning_rate": 0.0008226076389281315, + "loss": 1255.6082, + "step": 2840 + }, + { + "ce_loss_12": 3.2510815858840942, + "ce_loss_17": 3.048978865146637, + "ce_loss_23": 2.9613125801086424, + "ce_loss_3": 4.161168789863586, + "ce_loss_6": 3.7106049180030825, + "epoch": 0.285, + "grad_norm": 1616.0, + "kl_loss_12": 674.101708984375, + "kl_loss_17": 207.04217147827148, + "kl_loss_3": 2547.9843139648438, + "kl_loss_6": 1659.9259216308594, + "learning_rate": 0.0008213938048432696, + "loss": 1262.7362, + "step": 2850 + }, + { + "ce_loss_12": 3.1783262133598327, + "ce_loss_17": 2.9763714909553527, + "ce_loss_23": 2.8821098804473877, + "ce_loss_3": 4.104459810256958, + "ce_loss_6": 3.6436208963394163, + "epoch": 0.286, + "grad_norm": 1048.0, + "kl_loss_12": 683.6984161376953, + "kl_loss_17": 214.6061233520508, + "kl_loss_3": 2575.304443359375, + "kl_loss_6": 1658.6302917480468, + "learning_rate": 0.0008201767343263612, + "loss": 1292.0326, + "step": 2860 + }, + { + "ce_loss_12": 3.1275757551193237, + "ce_loss_17": 2.9192912459373472, + "ce_loss_23": 2.8295354723930357, + "ce_loss_3": 4.08244069814682, + "ce_loss_6": 3.621002423763275, + "epoch": 0.287, + "grad_norm": 1032.0, + "kl_loss_12": 680.7317443847656, + "kl_loss_17": 206.82701263427734, + "kl_loss_3": 2626.6409423828127, + "kl_loss_6": 1718.0297607421876, + "learning_rate": 0.0008189564396332927, + "loss": 1265.4093, + "step": 2870 + }, + { + "ce_loss_12": 3.1200747966766356, + "ce_loss_17": 2.913948881626129, + "ce_loss_23": 2.8240206956863405, + "ce_loss_3": 4.0817553997039795, + "ce_loss_6": 3.6026079297065734, + "epoch": 0.288, + "grad_norm": 1216.0, + "kl_loss_12": 671.2544067382812, + "kl_loss_17": 208.49227066040038, + "kl_loss_3": 2613.6818237304688, + "kl_loss_6": 1682.8380859375, + "learning_rate": 0.0008177329330524181, + "loss": 1291.9834, + "step": 2880 + }, + { + "ce_loss_12": 3.1660085558891295, + "ce_loss_17": 2.9622753262519836, + "ce_loss_23": 2.8681652665138246, + "ce_loss_3": 4.082670176029206, + "ce_loss_6": 3.6279420375823976, + "epoch": 0.289, + "grad_norm": 1248.0, + "kl_loss_12": 666.8172821044922, + "kl_loss_17": 214.4644989013672, + "kl_loss_3": 2517.624169921875, + "kl_loss_6": 1639.22294921875, + "learning_rate": 0.0008165062269044352, + "loss": 1273.3618, + "step": 2890 + }, + { + "ce_loss_12": 3.129373037815094, + "ce_loss_17": 2.9321088552474976, + "ce_loss_23": 2.8323047518730164, + "ce_loss_3": 4.082630407810211, + "ce_loss_6": 3.6082259893417357, + "epoch": 0.29, + "grad_norm": 1304.0, + "kl_loss_12": 677.7201995849609, + "kl_loss_17": 224.2819839477539, + "kl_loss_3": 2624.9942993164063, + "kl_loss_6": 1694.14033203125, + "learning_rate": 0.0008152763335422613, + "loss": 1307.293, + "step": 2900 + }, + { + "ce_loss_12": 3.110826826095581, + "ce_loss_17": 2.911401462554932, + "ce_loss_23": 2.8098244071006775, + "ce_loss_3": 4.058380711078644, + "ce_loss_6": 3.5944467544555665, + "epoch": 0.291, + "grad_norm": 1896.0, + "kl_loss_12": 683.6442687988281, + "kl_loss_17": 221.27504348754883, + "kl_loss_3": 2608.7736328125, + "kl_loss_6": 1696.7043334960938, + "learning_rate": 0.0008140432653509088, + "loss": 1283.9125, + "step": 2910 + }, + { + "ce_loss_12": 3.158522570133209, + "ce_loss_17": 2.955310845375061, + "ce_loss_23": 2.8611711621284486, + "ce_loss_3": 4.089209735393524, + "ce_loss_6": 3.635050928592682, + "epoch": 0.292, + "grad_norm": 1152.0, + "kl_loss_12": 681.7248046875, + "kl_loss_17": 215.7587547302246, + "kl_loss_3": 2588.3452392578124, + "kl_loss_6": 1680.8540649414062, + "learning_rate": 0.0008128070347473608, + "loss": 1283.4541, + "step": 2920 + }, + { + "ce_loss_12": 3.1713228702545164, + "ce_loss_17": 2.9683526039123533, + "ce_loss_23": 2.87442432641983, + "ce_loss_3": 4.147928857803345, + "ce_loss_6": 3.6574865341186524, + "epoch": 0.293, + "grad_norm": 1012.0, + "kl_loss_12": 688.9380676269532, + "kl_loss_17": 213.04415054321288, + "kl_loss_3": 2661.012451171875, + "kl_loss_6": 1698.3109985351562, + "learning_rate": 0.0008115676541804455, + "loss": 1291.3975, + "step": 2930 + }, + { + "ce_loss_12": 3.167314040660858, + "ce_loss_17": 2.9671000599861146, + "ce_loss_23": 2.8770614743232725, + "ce_loss_3": 4.097289597988128, + "ce_loss_6": 3.6370487570762635, + "epoch": 0.294, + "grad_norm": 1416.0, + "kl_loss_12": 663.3868530273437, + "kl_loss_17": 205.8155601501465, + "kl_loss_3": 2568.6847900390626, + "kl_loss_6": 1653.6200256347656, + "learning_rate": 0.0008103251361307119, + "loss": 1288.7492, + "step": 2940 + }, + { + "ce_loss_12": 3.2008013010025023, + "ce_loss_17": 2.993203580379486, + "ce_loss_23": 2.902731215953827, + "ce_loss_3": 4.136919891834259, + "ce_loss_6": 3.6736745595932008, + "epoch": 0.295, + "grad_norm": 1128.0, + "kl_loss_12": 678.2878875732422, + "kl_loss_17": 210.1094871520996, + "kl_loss_3": 2584.3587036132812, + "kl_loss_6": 1679.0871643066407, + "learning_rate": 0.0008090794931103026, + "loss": 1271.84, + "step": 2950 + }, + { + "ce_loss_12": 3.1641576647758485, + "ce_loss_17": 2.96648188829422, + "ce_loss_23": 2.879633092880249, + "ce_loss_3": 4.091558110713959, + "ce_loss_6": 3.6358474850654603, + "epoch": 0.296, + "grad_norm": 1176.0, + "kl_loss_12": 656.2914367675781, + "kl_loss_17": 201.0710319519043, + "kl_loss_3": 2538.4782958984374, + "kl_loss_6": 1640.0189331054687, + "learning_rate": 0.0008078307376628291, + "loss": 1267.6426, + "step": 2960 + }, + { + "ce_loss_12": 3.2289014220237733, + "ce_loss_17": 3.027685618400574, + "ce_loss_23": 2.9415606260299683, + "ce_loss_3": 4.119142377376557, + "ce_loss_6": 3.6732831835746764, + "epoch": 0.297, + "grad_norm": 1296.0, + "kl_loss_12": 649.4954864501954, + "kl_loss_17": 198.16017379760743, + "kl_loss_3": 2464.59814453125, + "kl_loss_6": 1586.0292541503907, + "learning_rate": 0.000806578882363245, + "loss": 1230.785, + "step": 2970 + }, + { + "ce_loss_12": 3.1504740715026855, + "ce_loss_17": 2.9535369873046875, + "ce_loss_23": 2.86621869802475, + "ce_loss_3": 4.059466123580933, + "ce_loss_6": 3.6136531949043276, + "epoch": 0.298, + "grad_norm": 1192.0, + "kl_loss_12": 659.7547485351563, + "kl_loss_17": 197.65312957763672, + "kl_loss_3": 2533.2875, + "kl_loss_6": 1648.2171325683594, + "learning_rate": 0.0008053239398177191, + "loss": 1283.3854, + "step": 2980 + }, + { + "ce_loss_12": 3.1471254944801332, + "ce_loss_17": 2.9429659843444824, + "ce_loss_23": 2.853777623176575, + "ce_loss_3": 4.086737155914307, + "ce_loss_6": 3.628751742839813, + "epoch": 0.299, + "grad_norm": 1232.0, + "kl_loss_12": 665.6853668212891, + "kl_loss_17": 200.3376953125, + "kl_loss_3": 2570.677282714844, + "kl_loss_6": 1669.5108703613282, + "learning_rate": 0.0008040659226635089, + "loss": 1302.3219, + "step": 2990 + }, + { + "ce_loss_12": 3.2610808968544007, + "ce_loss_17": 3.055904138088226, + "ce_loss_23": 2.962185859680176, + "ce_loss_3": 4.172962117195129, + "ce_loss_6": 3.7252434134483337, + "epoch": 0.3, + "grad_norm": 1392.0, + "kl_loss_12": 691.085482788086, + "kl_loss_17": 211.69791641235352, + "kl_loss_3": 2555.7884155273437, + "kl_loss_6": 1668.6239196777344, + "learning_rate": 0.0008028048435688333, + "loss": 1265.6971, + "step": 3000 + }, + { + "ce_loss_12": 3.1473644733428956, + "ce_loss_17": 2.9388877868652346, + "ce_loss_23": 2.850943756103516, + "ce_loss_3": 4.103834009170532, + "ce_loss_6": 3.6472261905670167, + "epoch": 0.301, + "grad_norm": 1456.0, + "kl_loss_12": 682.3835693359375, + "kl_loss_17": 203.41217880249025, + "kl_loss_3": 2632.33310546875, + "kl_loss_6": 1716.087384033203, + "learning_rate": 0.0008015407152327448, + "loss": 1286.5555, + "step": 3010 + }, + { + "ce_loss_12": 3.1876540660858153, + "ce_loss_17": 2.9817963600158692, + "ce_loss_23": 2.8919683694839478, + "ce_loss_3": 4.1209875583648685, + "ce_loss_6": 3.658332860469818, + "epoch": 0.302, + "grad_norm": 1080.0, + "kl_loss_12": 677.3872039794921, + "kl_loss_17": 207.2067398071289, + "kl_loss_3": 2602.8611083984374, + "kl_loss_6": 1685.0528625488282, + "learning_rate": 0.0008002735503850016, + "loss": 1286.4209, + "step": 3020 + }, + { + "ce_loss_12": 3.086091411113739, + "ce_loss_17": 2.8816797137260437, + "ce_loss_23": 2.7888283014297484, + "ce_loss_3": 4.055278909206391, + "ce_loss_6": 3.5791066169738768, + "epoch": 0.303, + "grad_norm": 1264.0, + "kl_loss_12": 677.491079711914, + "kl_loss_17": 208.38033905029297, + "kl_loss_3": 2636.7327880859375, + "kl_loss_6": 1703.7079956054688, + "learning_rate": 0.0007990033617859396, + "loss": 1304.0175, + "step": 3030 + }, + { + "ce_loss_12": 3.139596962928772, + "ce_loss_17": 2.9386150002479554, + "ce_loss_23": 2.847939932346344, + "ce_loss_3": 4.06524167060852, + "ce_loss_6": 3.6095362424850466, + "epoch": 0.304, + "grad_norm": 1264.0, + "kl_loss_12": 668.4441009521485, + "kl_loss_17": 204.69986114501953, + "kl_loss_3": 2550.4292724609377, + "kl_loss_6": 1657.856756591797, + "learning_rate": 0.000797730162226344, + "loss": 1239.6941, + "step": 3040 + }, + { + "ce_loss_12": 3.1538297176361083, + "ce_loss_17": 2.9472372770309447, + "ce_loss_23": 2.85866756439209, + "ce_loss_3": 4.089906406402588, + "ce_loss_6": 3.625072705745697, + "epoch": 0.305, + "grad_norm": 1064.0, + "kl_loss_12": 680.2299285888672, + "kl_loss_17": 203.93164520263673, + "kl_loss_3": 2562.0833740234375, + "kl_loss_6": 1657.4151428222656, + "learning_rate": 0.0007964539645273203, + "loss": 1268.8176, + "step": 3050 + }, + { + "ce_loss_12": 3.168477141857147, + "ce_loss_17": 2.9669825077056884, + "ce_loss_23": 2.8848891615867616, + "ce_loss_3": 4.0650266289711, + "ce_loss_6": 3.6266412258148195, + "epoch": 0.306, + "grad_norm": 1088.0, + "kl_loss_12": 652.9420104980469, + "kl_loss_17": 194.45923538208007, + "kl_loss_3": 2483.465441894531, + "kl_loss_6": 1617.5318542480468, + "learning_rate": 0.000795174781540165, + "loss": 1259.774, + "step": 3060 + }, + { + "ce_loss_12": 3.2268463492393495, + "ce_loss_17": 3.035295844078064, + "ce_loss_23": 2.9519911646842956, + "ce_loss_3": 4.11122350692749, + "ce_loss_6": 3.676068902015686, + "epoch": 0.307, + "grad_norm": 1160.0, + "kl_loss_12": 649.5947296142579, + "kl_loss_17": 199.54154739379882, + "kl_loss_3": 2452.772106933594, + "kl_loss_6": 1596.8073791503907, + "learning_rate": 0.0007938926261462366, + "loss": 1258.3678, + "step": 3070 + }, + { + "ce_loss_12": 3.186522734165192, + "ce_loss_17": 2.9892637372016906, + "ce_loss_23": 2.902893769741058, + "ce_loss_3": 4.0782225847244264, + "ce_loss_6": 3.632612419128418, + "epoch": 0.308, + "grad_norm": 1288.0, + "kl_loss_12": 659.3931182861328, + "kl_loss_17": 200.50450439453124, + "kl_loss_3": 2494.252551269531, + "kl_loss_6": 1618.629833984375, + "learning_rate": 0.0007926075112568258, + "loss": 1274.0955, + "step": 3080 + }, + { + "ce_loss_12": 3.1873653411865233, + "ce_loss_17": 2.9842730164527893, + "ce_loss_23": 2.8974654197692873, + "ce_loss_3": 4.102190101146698, + "ce_loss_6": 3.6495657086372377, + "epoch": 0.309, + "grad_norm": 1208.0, + "kl_loss_12": 667.6466735839844, + "kl_loss_17": 200.92897567749023, + "kl_loss_3": 2542.1956298828127, + "kl_loss_6": 1646.6694946289062, + "learning_rate": 0.0007913194498130252, + "loss": 1248.2428, + "step": 3090 + }, + { + "ce_loss_12": 3.1232769012451174, + "ce_loss_17": 2.9138604760169984, + "ce_loss_23": 2.8240367650985716, + "ce_loss_3": 4.057645988464356, + "ce_loss_6": 3.5948039293289185, + "epoch": 0.31, + "grad_norm": 1136.0, + "kl_loss_12": 669.67705078125, + "kl_loss_17": 202.97178421020507, + "kl_loss_3": 2555.02431640625, + "kl_loss_6": 1652.5606262207032, + "learning_rate": 0.0007900284547855992, + "loss": 1278.8604, + "step": 3100 + }, + { + "ce_loss_12": 3.126430797576904, + "ce_loss_17": 2.9288961410522463, + "ce_loss_23": 2.846028184890747, + "ce_loss_3": 4.029088962078094, + "ce_loss_6": 3.57817804813385, + "epoch": 0.311, + "grad_norm": 1248.0, + "kl_loss_12": 659.7984893798828, + "kl_loss_17": 196.29704513549805, + "kl_loss_3": 2514.2835327148437, + "kl_loss_6": 1621.8560668945313, + "learning_rate": 0.0007887345391748532, + "loss": 1274.4721, + "step": 3110 + }, + { + "ce_loss_12": 3.22427237033844, + "ce_loss_17": 3.030610752105713, + "ce_loss_23": 2.9471394181251527, + "ce_loss_3": 4.115463936328888, + "ce_loss_6": 3.6681665658950804, + "epoch": 0.312, + "grad_norm": 1020.0, + "kl_loss_12": 642.4313903808594, + "kl_loss_17": 196.25705642700194, + "kl_loss_3": 2458.0077270507813, + "kl_loss_6": 1587.213604736328, + "learning_rate": 0.0007874377160105036, + "loss": 1218.9914, + "step": 3120 + }, + { + "ce_loss_12": 3.1369936227798463, + "ce_loss_17": 2.9515060067176817, + "ce_loss_23": 2.859386706352234, + "ce_loss_3": 4.092330026626587, + "ce_loss_6": 3.6090521454811095, + "epoch": 0.313, + "grad_norm": 2008.0, + "kl_loss_12": 643.0880157470704, + "kl_loss_17": 202.33798446655274, + "kl_loss_3": 2590.1301147460936, + "kl_loss_6": 1639.5181640625, + "learning_rate": 0.0007861379983515449, + "loss": 1298.4299, + "step": 3130 + }, + { + "ce_loss_12": 3.2098602414131165, + "ce_loss_17": 3.01786322593689, + "ce_loss_23": 2.928981566429138, + "ce_loss_3": 4.134507477283478, + "ce_loss_6": 3.689944326877594, + "epoch": 0.314, + "grad_norm": 1368.0, + "kl_loss_12": 652.8991668701171, + "kl_loss_17": 207.90590591430663, + "kl_loss_3": 2531.0458984375, + "kl_loss_6": 1650.2009216308593, + "learning_rate": 0.0007848353992861195, + "loss": 1252.1455, + "step": 3140 + }, + { + "ce_loss_12": 3.2989187717437742, + "ce_loss_17": 3.0994922637939455, + "ce_loss_23": 2.9975876927375795, + "ce_loss_3": 4.220439052581787, + "ce_loss_6": 3.779904568195343, + "epoch": 0.315, + "grad_norm": 1168.0, + "kl_loss_12": 676.4814544677735, + "kl_loss_17": 228.22757949829102, + "kl_loss_3": 2544.5927001953123, + "kl_loss_6": 1678.0079345703125, + "learning_rate": 0.0007835299319313853, + "loss": 1287.3684, + "step": 3150 + }, + { + "ce_loss_12": 3.1737908601760862, + "ce_loss_17": 2.9999840021133424, + "ce_loss_23": 2.9011018872261047, + "ce_loss_3": 4.079487144947052, + "ce_loss_6": 3.620735836029053, + "epoch": 0.316, + "grad_norm": 1256.0, + "kl_loss_12": 645.4729095458985, + "kl_loss_17": 246.87976608276367, + "kl_loss_3": 2485.2554931640625, + "kl_loss_6": 1596.4259765625, + "learning_rate": 0.0007822216094333848, + "loss": 1288.1577, + "step": 3160 + }, + { + "ce_loss_12": 3.197959840297699, + "ce_loss_17": 3.026703345775604, + "ce_loss_23": 2.907000410556793, + "ce_loss_3": 4.129369294643402, + "ce_loss_6": 3.6692471146583556, + "epoch": 0.317, + "grad_norm": 1160.0, + "kl_loss_12": 661.97021484375, + "kl_loss_17": 263.0534294128418, + "kl_loss_3": 2556.1950561523436, + "kl_loss_6": 1647.0233154296875, + "learning_rate": 0.0007809104449669101, + "loss": 1267.7786, + "step": 3170 + }, + { + "ce_loss_12": 3.1298831701278687, + "ce_loss_17": 2.958413553237915, + "ce_loss_23": 2.856259453296661, + "ce_loss_3": 4.039877963066101, + "ce_loss_6": 3.5906583666801453, + "epoch": 0.318, + "grad_norm": 1072.0, + "kl_loss_12": 645.5213134765625, + "kl_loss_17": 245.95380325317382, + "kl_loss_3": 2496.2158325195314, + "kl_loss_6": 1610.8902282714844, + "learning_rate": 0.0007795964517353734, + "loss": 1254.5711, + "step": 3180 + }, + { + "ce_loss_12": 3.139362609386444, + "ce_loss_17": 2.961104655265808, + "ce_loss_23": 2.857077932357788, + "ce_loss_3": 4.078527820110321, + "ce_loss_6": 3.6135438680648804, + "epoch": 0.319, + "grad_norm": 1096.0, + "kl_loss_12": 657.72080078125, + "kl_loss_17": 248.24415435791016, + "kl_loss_3": 2568.1942504882813, + "kl_loss_6": 1646.7337951660156, + "learning_rate": 0.000778279642970672, + "loss": 1245.1319, + "step": 3190 + }, + { + "ce_loss_12": 3.1417649507522585, + "ce_loss_17": 2.971712279319763, + "ce_loss_23": 2.8669353246688845, + "ce_loss_3": 4.045985901355744, + "ce_loss_6": 3.600524604320526, + "epoch": 0.32, + "grad_norm": 1288.0, + "kl_loss_12": 652.0772766113281, + "kl_loss_17": 236.01042251586915, + "kl_loss_3": 2505.472998046875, + "kl_loss_6": 1627.3914123535155, + "learning_rate": 0.0007769600319330552, + "loss": 1239.5256, + "step": 3200 + }, + { + "ce_loss_12": 3.170133948326111, + "ce_loss_17": 2.9851341366767885, + "ce_loss_23": 2.8875450253486634, + "ce_loss_3": 4.1293561339378355, + "ce_loss_6": 3.6525676369667055, + "epoch": 0.321, + "grad_norm": 1376.0, + "kl_loss_12": 659.3075500488281, + "kl_loss_17": 220.2852684020996, + "kl_loss_3": 2596.4176513671873, + "kl_loss_6": 1663.1143737792968, + "learning_rate": 0.0007756376319109917, + "loss": 1267.159, + "step": 3210 + }, + { + "ce_loss_12": 3.208662784099579, + "ce_loss_17": 3.0174351811408995, + "ce_loss_23": 2.9262489438056947, + "ce_loss_3": 4.104580092430115, + "ce_loss_6": 3.66491756439209, + "epoch": 0.322, + "grad_norm": 1296.0, + "kl_loss_12": 647.5737518310547, + "kl_loss_17": 210.26569519042968, + "kl_loss_3": 2495.5554321289064, + "kl_loss_6": 1618.7034240722655, + "learning_rate": 0.0007743124562210351, + "loss": 1224.5053, + "step": 3220 + }, + { + "ce_loss_12": 3.222736394405365, + "ce_loss_17": 3.0304854989051817, + "ce_loss_23": 2.9412006974220275, + "ce_loss_3": 4.119387090206146, + "ce_loss_6": 3.675408494472504, + "epoch": 0.323, + "grad_norm": 1184.0, + "kl_loss_12": 646.6549346923828, + "kl_loss_17": 204.48597564697266, + "kl_loss_3": 2503.8929931640623, + "kl_loss_6": 1622.128240966797, + "learning_rate": 0.0007729845182076895, + "loss": 1249.3895, + "step": 3230 + }, + { + "ce_loss_12": 3.1534232616424562, + "ce_loss_17": 2.9649145007133484, + "ce_loss_23": 2.8808998346328734, + "ce_loss_3": 4.050489735603333, + "ce_loss_6": 3.608076286315918, + "epoch": 0.324, + "grad_norm": 1056.0, + "kl_loss_12": 642.7339508056641, + "kl_loss_17": 196.3539276123047, + "kl_loss_3": 2472.869177246094, + "kl_loss_6": 1592.1756774902344, + "learning_rate": 0.0007716538312432765, + "loss": 1258.6277, + "step": 3240 + }, + { + "ce_loss_12": 3.1309715151786803, + "ce_loss_17": 2.9249958992004395, + "ce_loss_23": 2.8356101751327514, + "ce_loss_3": 4.062940955162048, + "ce_loss_6": 3.607418692111969, + "epoch": 0.325, + "grad_norm": 1004.0, + "kl_loss_12": 667.4138031005859, + "kl_loss_17": 204.44576187133788, + "kl_loss_3": 2559.404052734375, + "kl_loss_6": 1661.5573486328126, + "learning_rate": 0.0007703204087277988, + "loss": 1266.9637, + "step": 3250 + }, + { + "ce_loss_12": 3.204416263103485, + "ce_loss_17": 3.015028953552246, + "ce_loss_23": 2.931392502784729, + "ce_loss_3": 4.094218730926514, + "ce_loss_6": 3.64478086233139, + "epoch": 0.326, + "grad_norm": 1480.0, + "kl_loss_12": 631.9965118408203, + "kl_loss_17": 193.47777786254883, + "kl_loss_3": 2457.03642578125, + "kl_loss_6": 1572.6686767578126, + "learning_rate": 0.0007689842640888063, + "loss": 1226.8134, + "step": 3260 + }, + { + "ce_loss_12": 3.2039347529411315, + "ce_loss_17": 3.0119959115982056, + "ce_loss_23": 2.9236732602119444, + "ce_loss_3": 4.096511793136597, + "ce_loss_6": 3.659037482738495, + "epoch": 0.327, + "grad_norm": 1144.0, + "kl_loss_12": 646.79599609375, + "kl_loss_17": 197.29918746948243, + "kl_loss_3": 2451.0695373535154, + "kl_loss_6": 1593.8397583007813, + "learning_rate": 0.0007676454107812607, + "loss": 1236.0236, + "step": 3270 + }, + { + "ce_loss_12": 3.152002787590027, + "ce_loss_17": 2.957594645023346, + "ce_loss_23": 2.8716594338417054, + "ce_loss_3": 4.086058926582337, + "ce_loss_6": 3.619371509552002, + "epoch": 0.328, + "grad_norm": 1248.0, + "kl_loss_12": 657.9291381835938, + "kl_loss_17": 198.33377532958986, + "kl_loss_3": 2551.7009765625, + "kl_loss_6": 1636.1120056152345, + "learning_rate": 0.0007663038622873999, + "loss": 1238.8723, + "step": 3280 + }, + { + "ce_loss_12": 3.1929536700248717, + "ce_loss_17": 3.001619851589203, + "ce_loss_23": 2.917175257205963, + "ce_loss_3": 4.111210346221924, + "ce_loss_6": 3.6579272747039795, + "epoch": 0.329, + "grad_norm": 1544.0, + "kl_loss_12": 651.0749725341797, + "kl_loss_17": 198.15295867919923, + "kl_loss_3": 2525.5502685546876, + "kl_loss_6": 1625.9446044921874, + "learning_rate": 0.0007649596321166025, + "loss": 1231.7627, + "step": 3290 + }, + { + "ce_loss_12": 3.095591151714325, + "ce_loss_17": 2.9051140904426576, + "ce_loss_23": 2.8213531136512757, + "ce_loss_3": 3.982525849342346, + "ce_loss_6": 3.5510496616363527, + "epoch": 0.33, + "grad_norm": 1080.0, + "kl_loss_12": 636.9413482666016, + "kl_loss_17": 189.79553909301757, + "kl_loss_3": 2442.1396423339843, + "kl_loss_6": 1593.9491882324219, + "learning_rate": 0.0007636127338052513, + "loss": 1232.5529, + "step": 3300 + }, + { + "ce_loss_12": 3.1994780898094177, + "ce_loss_17": 3.0023029327392576, + "ce_loss_23": 2.912818741798401, + "ce_loss_3": 4.138954031467438, + "ce_loss_6": 3.662935256958008, + "epoch": 0.331, + "grad_norm": 976.0, + "kl_loss_12": 654.0823486328125, + "kl_loss_17": 198.2132797241211, + "kl_loss_3": 2574.5783813476564, + "kl_loss_6": 1645.6572509765624, + "learning_rate": 0.0007622631809165971, + "loss": 1242.5524, + "step": 3310 + }, + { + "ce_loss_12": 3.1742364287376406, + "ce_loss_17": 2.9843477964401246, + "ce_loss_23": 2.904312765598297, + "ce_loss_3": 4.043950057029724, + "ce_loss_6": 3.614525556564331, + "epoch": 0.332, + "grad_norm": 1640.0, + "kl_loss_12": 616.8112365722657, + "kl_loss_17": 183.72462158203126, + "kl_loss_3": 2375.9060180664064, + "kl_loss_6": 1535.5855590820313, + "learning_rate": 0.000760910987040623, + "loss": 1205.1776, + "step": 3320 + }, + { + "ce_loss_12": 3.189255142211914, + "ce_loss_17": 2.981306564807892, + "ce_loss_23": 2.893477368354797, + "ce_loss_3": 4.122015202045441, + "ce_loss_6": 3.660171020030975, + "epoch": 0.333, + "grad_norm": 1144.0, + "kl_loss_12": 669.9992126464844, + "kl_loss_17": 198.67136993408204, + "kl_loss_3": 2580.4757690429688, + "kl_loss_6": 1662.0666076660157, + "learning_rate": 0.000759556165793906, + "loss": 1240.4461, + "step": 3330 + }, + { + "ce_loss_12": 3.1871766686439513, + "ce_loss_17": 2.9891620874404907, + "ce_loss_23": 2.902255916595459, + "ce_loss_3": 4.102960324287414, + "ce_loss_6": 3.6463622212409974, + "epoch": 0.334, + "grad_norm": 968.0, + "kl_loss_12": 653.475473022461, + "kl_loss_17": 199.1413902282715, + "kl_loss_3": 2513.588134765625, + "kl_loss_6": 1624.1831848144532, + "learning_rate": 0.000758198730819481, + "loss": 1255.6303, + "step": 3340 + }, + { + "ce_loss_12": 3.1485753536224363, + "ce_loss_17": 2.957745444774628, + "ce_loss_23": 2.8779073357582092, + "ce_loss_3": 4.060862839221954, + "ce_loss_6": 3.6163305282592773, + "epoch": 0.335, + "grad_norm": 1096.0, + "kl_loss_12": 635.0015411376953, + "kl_loss_17": 190.78094253540038, + "kl_loss_3": 2502.9186279296873, + "kl_loss_6": 1622.7606018066406, + "learning_rate": 0.0007568386957867032, + "loss": 1236.9645, + "step": 3350 + }, + { + "ce_loss_12": 3.197234070301056, + "ce_loss_17": 3.002056097984314, + "ce_loss_23": 2.9157747507095335, + "ce_loss_3": 4.096824419498444, + "ce_loss_6": 3.654950773715973, + "epoch": 0.336, + "grad_norm": 1352.0, + "kl_loss_12": 649.6791900634765, + "kl_loss_17": 196.0836959838867, + "kl_loss_3": 2488.7575561523436, + "kl_loss_6": 1614.0893615722657, + "learning_rate": 0.0007554760743911103, + "loss": 1249.6344, + "step": 3360 + }, + { + "ce_loss_12": 3.122460901737213, + "ce_loss_17": 2.934140515327454, + "ce_loss_23": 2.8511914134025576, + "ce_loss_3": 4.024395322799682, + "ce_loss_6": 3.5776939511299135, + "epoch": 0.337, + "grad_norm": 1200.0, + "kl_loss_12": 629.5604217529296, + "kl_loss_17": 187.96243438720703, + "kl_loss_3": 2487.4703979492188, + "kl_loss_6": 1598.3975830078125, + "learning_rate": 0.0007541108803542846, + "loss": 1259.7262, + "step": 3370 + }, + { + "ce_loss_12": 3.15894296169281, + "ce_loss_17": 2.9651761412620545, + "ce_loss_23": 2.8833929538726806, + "ce_loss_3": 4.063071310520172, + "ce_loss_6": 3.607359540462494, + "epoch": 0.338, + "grad_norm": 1376.0, + "kl_loss_12": 639.8570343017578, + "kl_loss_17": 191.9022071838379, + "kl_loss_3": 2516.658801269531, + "kl_loss_6": 1606.4229248046875, + "learning_rate": 0.0007527431274237149, + "loss": 1293.7488, + "step": 3380 + }, + { + "ce_loss_12": 3.1345289826393126, + "ce_loss_17": 2.944065737724304, + "ce_loss_23": 2.8607085824012755, + "ce_loss_3": 4.037069797515869, + "ce_loss_6": 3.57998104095459, + "epoch": 0.339, + "grad_norm": 1208.0, + "kl_loss_12": 636.0152709960937, + "kl_loss_17": 192.22691955566407, + "kl_loss_3": 2498.66904296875, + "kl_loss_6": 1586.3996887207031, + "learning_rate": 0.0007513728293726579, + "loss": 1233.265, + "step": 3390 + }, + { + "ce_loss_12": 3.2337993502616884, + "ce_loss_17": 3.0387322425842287, + "ce_loss_23": 2.9555094718933104, + "ce_loss_3": 4.117582046985627, + "ce_loss_6": 3.6774802207946777, + "epoch": 0.34, + "grad_norm": 1208.0, + "kl_loss_12": 648.2277984619141, + "kl_loss_17": 193.34667205810547, + "kl_loss_3": 2459.2372436523438, + "kl_loss_6": 1592.1968139648438, + "learning_rate": 0.00075, + "loss": 1219.958, + "step": 3400 + }, + { + "ce_loss_12": 3.234040653705597, + "ce_loss_17": 3.0330355405807494, + "ce_loss_23": 2.946638286113739, + "ce_loss_3": 4.1481698751449585, + "ce_loss_6": 3.6888544082641603, + "epoch": 0.341, + "grad_norm": 1616.0, + "kl_loss_12": 657.6758117675781, + "kl_loss_17": 193.70989608764648, + "kl_loss_3": 2527.2737548828127, + "kl_loss_6": 1612.798162841797, + "learning_rate": 0.0007486246531301177, + "loss": 1237.0359, + "step": 3410 + }, + { + "ce_loss_12": 3.0608113527297975, + "ce_loss_17": 2.8603794097900392, + "ce_loss_23": 2.779085946083069, + "ce_loss_3": 3.965061593055725, + "ce_loss_6": 3.515889024734497, + "epoch": 0.342, + "grad_norm": 1512.0, + "kl_loss_12": 640.9623992919921, + "kl_loss_17": 189.16546020507812, + "kl_loss_3": 2486.7409790039064, + "kl_loss_6": 1605.0156677246093, + "learning_rate": 0.0007472468026127384, + "loss": 1218.0469, + "step": 3420 + }, + { + "ce_loss_12": 3.204521358013153, + "ce_loss_17": 3.001431107521057, + "ce_loss_23": 2.9101683497428894, + "ce_loss_3": 4.138897204399109, + "ce_loss_6": 3.673828053474426, + "epoch": 0.343, + "grad_norm": 1328.0, + "kl_loss_12": 675.5233673095703, + "kl_loss_17": 204.8723388671875, + "kl_loss_3": 2599.304162597656, + "kl_loss_6": 1671.3851257324218, + "learning_rate": 0.000745866462322802, + "loss": 1268.8045, + "step": 3430 + }, + { + "ce_loss_12": 3.1597685694694517, + "ce_loss_17": 2.96885906457901, + "ce_loss_23": 2.891816234588623, + "ce_loss_3": 4.049743747711181, + "ce_loss_6": 3.6046136617660522, + "epoch": 0.344, + "grad_norm": 1144.0, + "kl_loss_12": 628.1821197509765, + "kl_loss_17": 187.54821166992187, + "kl_loss_3": 2445.0335205078127, + "kl_loss_6": 1559.4938110351563, + "learning_rate": 0.0007444836461603195, + "loss": 1220.1586, + "step": 3440 + }, + { + "ce_loss_12": 3.2360959887504577, + "ce_loss_17": 3.0426719188690186, + "ce_loss_23": 2.953330385684967, + "ce_loss_3": 4.143719017505646, + "ce_loss_6": 3.699867343902588, + "epoch": 0.345, + "grad_norm": 1064.0, + "kl_loss_12": 669.5976684570312, + "kl_loss_17": 204.9705047607422, + "kl_loss_3": 2542.0022705078127, + "kl_loss_6": 1652.5964782714843, + "learning_rate": 0.0007430983680502344, + "loss": 1268.3637, + "step": 3450 + }, + { + "ce_loss_12": 3.0876049160957337, + "ce_loss_17": 2.8904442191123962, + "ce_loss_23": 2.8051801323890686, + "ce_loss_3": 4.022367370128632, + "ce_loss_6": 3.5601198196411135, + "epoch": 0.346, + "grad_norm": 1248.0, + "kl_loss_12": 655.0568389892578, + "kl_loss_17": 195.21175689697264, + "kl_loss_3": 2541.0522338867186, + "kl_loss_6": 1636.6640930175781, + "learning_rate": 0.0007417106419422819, + "loss": 1252.4146, + "step": 3460 + }, + { + "ce_loss_12": 3.1681289196014406, + "ce_loss_17": 2.9741994857788088, + "ce_loss_23": 2.8874059557914733, + "ce_loss_3": 4.07452574968338, + "ce_loss_6": 3.618625295162201, + "epoch": 0.347, + "grad_norm": 1248.0, + "kl_loss_12": 644.6038970947266, + "kl_loss_17": 193.07538070678712, + "kl_loss_3": 2476.3251525878904, + "kl_loss_6": 1590.3403564453124, + "learning_rate": 0.0007403204818108486, + "loss": 1244.1215, + "step": 3470 + }, + { + "ce_loss_12": 3.162106120586395, + "ce_loss_17": 2.966542291641235, + "ce_loss_23": 2.8839560627937315, + "ce_loss_3": 4.071386611461639, + "ce_loss_6": 3.6170035123825075, + "epoch": 0.348, + "grad_norm": 1272.0, + "kl_loss_12": 644.7951568603515, + "kl_loss_17": 191.4629104614258, + "kl_loss_3": 2531.1408813476564, + "kl_loss_6": 1623.6707824707032, + "learning_rate": 0.0007389279016548316, + "loss": 1211.3204, + "step": 3480 + }, + { + "ce_loss_12": 3.1672859668731688, + "ce_loss_17": 2.96072518825531, + "ce_loss_23": 2.874778151512146, + "ce_loss_3": 4.11575837135315, + "ce_loss_6": 3.6296263933181763, + "epoch": 0.349, + "grad_norm": 1168.0, + "kl_loss_12": 662.0878631591797, + "kl_loss_17": 196.92775039672853, + "kl_loss_3": 2594.9796630859373, + "kl_loss_6": 1644.7096069335937, + "learning_rate": 0.0007375329154974975, + "loss": 1260.271, + "step": 3490 + }, + { + "ce_loss_12": 3.1099432587623594, + "ce_loss_17": 2.9236859321594237, + "ce_loss_23": 2.8439751505851745, + "ce_loss_3": 4.011048936843872, + "ce_loss_6": 3.5630403637886046, + "epoch": 0.35, + "grad_norm": 1096.0, + "kl_loss_12": 626.3424011230469, + "kl_loss_17": 188.74627304077148, + "kl_loss_3": 2442.5823974609375, + "kl_loss_6": 1574.2830627441406, + "learning_rate": 0.0007361355373863414, + "loss": 1240.516, + "step": 3500 + }, + { + "ce_loss_12": 3.1638558864593507, + "ce_loss_17": 2.9696279048919676, + "ce_loss_23": 2.888825249671936, + "ce_loss_3": 4.058097195625305, + "ce_loss_6": 3.603681969642639, + "epoch": 0.351, + "grad_norm": 1016.0, + "kl_loss_12": 629.8948608398438, + "kl_loss_17": 187.6762222290039, + "kl_loss_3": 2460.040576171875, + "kl_loss_6": 1563.9760498046876, + "learning_rate": 0.0007347357813929454, + "loss": 1243.6459, + "step": 3510 + }, + { + "ce_loss_12": 3.1169628858566285, + "ce_loss_17": 2.930294370651245, + "ce_loss_23": 2.846389579772949, + "ce_loss_3": 4.012052595615387, + "ce_loss_6": 3.5693087339401246, + "epoch": 0.352, + "grad_norm": 1200.0, + "kl_loss_12": 626.136929321289, + "kl_loss_17": 188.69427566528321, + "kl_loss_3": 2443.0934204101563, + "kl_loss_6": 1567.1674072265625, + "learning_rate": 0.0007333336616128369, + "loss": 1238.4143, + "step": 3520 + }, + { + "ce_loss_12": 3.0994298815727235, + "ce_loss_17": 2.9009223699569704, + "ce_loss_23": 2.813994586467743, + "ce_loss_3": 4.030776941776276, + "ce_loss_6": 3.563981807231903, + "epoch": 0.353, + "grad_norm": 1512.0, + "kl_loss_12": 649.8764343261719, + "kl_loss_17": 193.6501266479492, + "kl_loss_3": 2539.990856933594, + "kl_loss_6": 1624.6120483398438, + "learning_rate": 0.0007319291921653463, + "loss": 1246.1929, + "step": 3530 + }, + { + "ce_loss_12": 3.1819730043411254, + "ce_loss_17": 2.9840220332145693, + "ce_loss_23": 2.895319640636444, + "ce_loss_3": 4.11333841085434, + "ce_loss_6": 3.6454499125480653, + "epoch": 0.354, + "grad_norm": 1560.0, + "kl_loss_12": 652.9826873779297, + "kl_loss_17": 198.4254135131836, + "kl_loss_3": 2530.1242431640626, + "kl_loss_6": 1630.5962463378905, + "learning_rate": 0.0007305223871934656, + "loss": 1233.6375, + "step": 3540 + }, + { + "ce_loss_12": 3.1405220866203307, + "ce_loss_17": 2.947129189968109, + "ce_loss_23": 2.864759373664856, + "ce_loss_3": 4.039001405239105, + "ce_loss_6": 3.592834734916687, + "epoch": 0.355, + "grad_norm": 1392.0, + "kl_loss_12": 637.2668518066406, + "kl_loss_17": 192.42751998901366, + "kl_loss_3": 2472.7722412109374, + "kl_loss_6": 1586.1993774414063, + "learning_rate": 0.0007291132608637052, + "loss": 1231.5611, + "step": 3550 + }, + { + "ce_loss_12": 3.1172609090805055, + "ce_loss_17": 2.9296460151672363, + "ce_loss_23": 2.846535086631775, + "ce_loss_3": 4.094112932682037, + "ce_loss_6": 3.610998845100403, + "epoch": 0.356, + "grad_norm": 1360.0, + "kl_loss_12": 629.188510131836, + "kl_loss_17": 188.0505683898926, + "kl_loss_3": 2621.151025390625, + "kl_loss_6": 1676.23330078125, + "learning_rate": 0.0007277018273659516, + "loss": 1269.5545, + "step": 3560 + }, + { + "ce_loss_12": 3.242421197891235, + "ce_loss_17": 3.036579656600952, + "ce_loss_23": 2.9507407784461974, + "ce_loss_3": 4.137357699871063, + "ce_loss_6": 3.6962520837783814, + "epoch": 0.357, + "grad_norm": 1584.0, + "kl_loss_12": 668.0106536865235, + "kl_loss_17": 199.97094345092773, + "kl_loss_3": 2518.397570800781, + "kl_loss_6": 1646.419659423828, + "learning_rate": 0.0007262881009133242, + "loss": 1244.0977, + "step": 3570 + }, + { + "ce_loss_12": 3.1462175965309145, + "ce_loss_17": 2.954234480857849, + "ce_loss_23": 2.874486243724823, + "ce_loss_3": 4.042923450469971, + "ce_loss_6": 3.594035029411316, + "epoch": 0.358, + "grad_norm": 1104.0, + "kl_loss_12": 633.8466430664063, + "kl_loss_17": 187.2772117614746, + "kl_loss_3": 2466.479931640625, + "kl_loss_6": 1578.679052734375, + "learning_rate": 0.0007248720957420329, + "loss": 1208.5965, + "step": 3580 + }, + { + "ce_loss_12": 3.138073432445526, + "ce_loss_17": 2.9530703902244566, + "ce_loss_23": 2.877877390384674, + "ce_loss_3": 4.028606414794922, + "ce_loss_6": 3.579486346244812, + "epoch": 0.359, + "grad_norm": 1104.0, + "kl_loss_12": 623.7652374267578, + "kl_loss_17": 185.6518424987793, + "kl_loss_3": 2446.584088134766, + "kl_loss_6": 1559.5703002929688, + "learning_rate": 0.0007234538261112341, + "loss": 1238.4975, + "step": 3590 + }, + { + "ce_loss_12": 3.1939882755279543, + "ce_loss_17": 2.997502303123474, + "ce_loss_23": 2.9144866824150086, + "ce_loss_3": 4.114092516899109, + "ce_loss_6": 3.6445836186408997, + "epoch": 0.36, + "grad_norm": 1048.0, + "kl_loss_12": 642.1127014160156, + "kl_loss_17": 191.9202751159668, + "kl_loss_3": 2509.18154296875, + "kl_loss_6": 1591.3108459472655, + "learning_rate": 0.0007220333063028871, + "loss": 1222.5482, + "step": 3600 + }, + { + "ce_loss_12": 3.2365827560424805, + "ce_loss_17": 3.0361854791641236, + "ce_loss_23": 2.949097955226898, + "ce_loss_3": 4.2037324666976925, + "ce_loss_6": 3.728068196773529, + "epoch": 0.361, + "grad_norm": 1664.0, + "kl_loss_12": 661.2010620117187, + "kl_loss_17": 200.61611557006836, + "kl_loss_3": 2653.8142211914064, + "kl_loss_6": 1708.1386962890624, + "learning_rate": 0.0007206105506216106, + "loss": 1280.1693, + "step": 3610 + }, + { + "ce_loss_12": 3.10485817193985, + "ce_loss_17": 2.913054096698761, + "ce_loss_23": 2.8345122218132017, + "ce_loss_3": 3.989728772640228, + "ce_loss_6": 3.5509069919586183, + "epoch": 0.362, + "grad_norm": 1376.0, + "kl_loss_12": 628.1443359375, + "kl_loss_17": 184.31702575683593, + "kl_loss_3": 2426.3939697265623, + "kl_loss_6": 1563.0535583496094, + "learning_rate": 0.0007191855733945387, + "loss": 1195.5931, + "step": 3620 + }, + { + "ce_loss_12": 3.190879261493683, + "ce_loss_17": 2.999620962142944, + "ce_loss_23": 2.9165725708007812, + "ce_loss_3": 4.087678861618042, + "ce_loss_6": 3.6386709809303284, + "epoch": 0.363, + "grad_norm": 1128.0, + "kl_loss_12": 630.6439605712891, + "kl_loss_17": 186.4598747253418, + "kl_loss_3": 2463.1990234375, + "kl_loss_6": 1577.126171875, + "learning_rate": 0.0007177583889711762, + "loss": 1209.9547, + "step": 3630 + }, + { + "ce_loss_12": 3.104693794250488, + "ce_loss_17": 2.9128946185112, + "ce_loss_23": 2.8351215600967405, + "ce_loss_3": 4.020575773715973, + "ce_loss_6": 3.567472243309021, + "epoch": 0.364, + "grad_norm": 1192.0, + "kl_loss_12": 634.3725769042969, + "kl_loss_17": 186.35895080566405, + "kl_loss_3": 2499.30517578125, + "kl_loss_6": 1601.517596435547, + "learning_rate": 0.0007163290117232541, + "loss": 1229.4205, + "step": 3640 + }, + { + "ce_loss_12": 3.206928777694702, + "ce_loss_17": 3.022316098213196, + "ce_loss_23": 2.9427536964416503, + "ce_loss_3": 4.0679107189178465, + "ce_loss_6": 3.6384734749794005, + "epoch": 0.365, + "grad_norm": 1440.0, + "kl_loss_12": 619.4795654296875, + "kl_loss_17": 183.97281036376953, + "kl_loss_3": 2402.00595703125, + "kl_loss_6": 1549.7066040039062, + "learning_rate": 0.0007148974560445859, + "loss": 1206.8211, + "step": 3650 + }, + { + "ce_loss_12": 3.143563616275787, + "ce_loss_17": 2.953302776813507, + "ce_loss_23": 2.87525874376297, + "ce_loss_3": 4.018783235549927, + "ce_loss_6": 3.582585895061493, + "epoch": 0.366, + "grad_norm": 1024.0, + "kl_loss_12": 623.0079528808594, + "kl_loss_17": 184.78619308471679, + "kl_loss_3": 2404.4192016601564, + "kl_loss_6": 1555.8073486328126, + "learning_rate": 0.0007134637363509209, + "loss": 1192.1729, + "step": 3660 + }, + { + "ce_loss_12": 3.2462796330451966, + "ce_loss_17": 3.0629473090171815, + "ce_loss_23": 2.9862843632698057, + "ce_loss_3": 4.116179370880127, + "ce_loss_6": 3.684998261928558, + "epoch": 0.367, + "grad_norm": 1200.0, + "kl_loss_12": 620.0697937011719, + "kl_loss_17": 182.9595085144043, + "kl_loss_3": 2382.9265014648436, + "kl_loss_6": 1543.4059326171875, + "learning_rate": 0.0007120278670798009, + "loss": 1206.8045, + "step": 3670 + }, + { + "ce_loss_12": 3.0817275762557985, + "ce_loss_17": 2.881822347640991, + "ce_loss_23": 2.7966799259185793, + "ce_loss_3": 4.039555454254151, + "ce_loss_6": 3.572439932823181, + "epoch": 0.368, + "grad_norm": 1528.0, + "kl_loss_12": 652.9435607910157, + "kl_loss_17": 192.2785629272461, + "kl_loss_3": 2608.0561767578124, + "kl_loss_6": 1677.3426330566406, + "learning_rate": 0.0007105898626904133, + "loss": 1276.8469, + "step": 3680 + }, + { + "ce_loss_12": 3.1575241327285766, + "ce_loss_17": 2.966373598575592, + "ce_loss_23": 2.885143148899078, + "ce_loss_3": 4.070808088779449, + "ce_loss_6": 3.609587752819061, + "epoch": 0.369, + "grad_norm": 1128.0, + "kl_loss_12": 630.6559204101562, + "kl_loss_17": 188.17326431274415, + "kl_loss_3": 2485.571112060547, + "kl_loss_6": 1571.32275390625, + "learning_rate": 0.0007091497376634463, + "loss": 1208.1986, + "step": 3690 + }, + { + "ce_loss_12": 3.1057782888412477, + "ce_loss_17": 2.916164147853851, + "ce_loss_23": 2.8346520900726317, + "ce_loss_3": 3.9995585441589356, + "ce_loss_6": 3.5516976594924925, + "epoch": 0.37, + "grad_norm": 1096.0, + "kl_loss_12": 628.3031372070312, + "kl_loss_17": 189.42245864868164, + "kl_loss_3": 2441.6884887695314, + "kl_loss_6": 1562.001318359375, + "learning_rate": 0.0007077075065009433, + "loss": 1237.0109, + "step": 3700 + }, + { + "ce_loss_12": 3.2027547001838683, + "ce_loss_17": 3.0074793815612795, + "ce_loss_23": 2.923413860797882, + "ce_loss_3": 4.125626313686371, + "ce_loss_6": 3.6585047006607057, + "epoch": 0.371, + "grad_norm": 1136.0, + "kl_loss_12": 645.6251739501953, + "kl_loss_17": 199.54634628295898, + "kl_loss_3": 2519.0525024414064, + "kl_loss_6": 1606.2898742675782, + "learning_rate": 0.0007062631837261557, + "loss": 1234.8146, + "step": 3710 + }, + { + "ce_loss_12": 3.0837065100669863, + "ce_loss_17": 2.8989955306053163, + "ce_loss_23": 2.8208890914916993, + "ce_loss_3": 3.9908340334892274, + "ce_loss_6": 3.527920973300934, + "epoch": 0.372, + "grad_norm": 1240.0, + "kl_loss_12": 620.295361328125, + "kl_loss_17": 184.6085075378418, + "kl_loss_3": 2473.2692260742188, + "kl_loss_6": 1559.3187622070313, + "learning_rate": 0.0007048167838833977, + "loss": 1242.3812, + "step": 3720 + }, + { + "ce_loss_12": 3.162278175354004, + "ce_loss_17": 2.974462831020355, + "ce_loss_23": 2.8935346484184263, + "ce_loss_3": 4.03741956949234, + "ce_loss_6": 3.6007279634475706, + "epoch": 0.373, + "grad_norm": 1480.0, + "kl_loss_12": 626.9373077392578, + "kl_loss_17": 190.09740447998047, + "kl_loss_3": 2436.891955566406, + "kl_loss_6": 1547.806591796875, + "learning_rate": 0.0007033683215379002, + "loss": 1211.3609, + "step": 3730 + }, + { + "ce_loss_12": 3.1470221281051636, + "ce_loss_17": 2.959082067012787, + "ce_loss_23": 2.880895709991455, + "ce_loss_3": 4.051479244232178, + "ce_loss_6": 3.590286874771118, + "epoch": 0.374, + "grad_norm": 1280.0, + "kl_loss_12": 619.1704406738281, + "kl_loss_17": 183.4695182800293, + "kl_loss_3": 2461.903515625, + "kl_loss_6": 1554.585955810547, + "learning_rate": 0.0007019178112756625, + "loss": 1224.6299, + "step": 3740 + }, + { + "ce_loss_12": 3.127616310119629, + "ce_loss_17": 2.939793038368225, + "ce_loss_23": 2.8609092473983764, + "ce_loss_3": 4.025426268577576, + "ce_loss_6": 3.571797215938568, + "epoch": 0.375, + "grad_norm": 1384.0, + "kl_loss_12": 619.8853515625, + "kl_loss_17": 186.0279067993164, + "kl_loss_3": 2440.3298583984374, + "kl_loss_6": 1559.0566345214843, + "learning_rate": 0.0007004652677033068, + "loss": 1215.057, + "step": 3750 + }, + { + "ce_loss_12": 3.1839040398597716, + "ce_loss_17": 3.008525323867798, + "ce_loss_23": 2.935053050518036, + "ce_loss_3": 4.0585259079933165, + "ce_loss_6": 3.6220818042755125, + "epoch": 0.376, + "grad_norm": 1504.0, + "kl_loss_12": 603.8833801269532, + "kl_loss_17": 181.97922439575194, + "kl_loss_3": 2395.0477905273438, + "kl_loss_6": 1523.921240234375, + "learning_rate": 0.0006990107054479312, + "loss": 1196.0367, + "step": 3760 + }, + { + "ce_loss_12": 3.1728707432746885, + "ce_loss_17": 2.9855801582336428, + "ce_loss_23": 2.905232536792755, + "ce_loss_3": 4.063840591907502, + "ce_loss_6": 3.6246375679969787, + "epoch": 0.377, + "grad_norm": 1400.0, + "kl_loss_12": 625.2057556152344, + "kl_loss_17": 190.76472930908204, + "kl_loss_3": 2445.6776489257813, + "kl_loss_6": 1578.4298400878906, + "learning_rate": 0.000697554139156961, + "loss": 1217.5118, + "step": 3770 + }, + { + "ce_loss_12": 3.1740211248397827, + "ce_loss_17": 2.9835832476615907, + "ce_loss_23": 2.9033362746238707, + "ce_loss_3": 4.079412627220154, + "ce_loss_6": 3.6197427034378054, + "epoch": 0.378, + "grad_norm": 964.0, + "kl_loss_12": 639.1348205566406, + "kl_loss_17": 192.42096710205078, + "kl_loss_3": 2505.44453125, + "kl_loss_6": 1596.6610412597656, + "learning_rate": 0.0006960955834980027, + "loss": 1203.2656, + "step": 3780 + }, + { + "ce_loss_12": 3.138484704494476, + "ce_loss_17": 2.953498876094818, + "ce_loss_23": 2.8716805815696715, + "ce_loss_3": 4.0312147498130795, + "ce_loss_6": 3.585508608818054, + "epoch": 0.379, + "grad_norm": 1464.0, + "kl_loss_12": 618.7588714599609, + "kl_loss_17": 188.11121139526367, + "kl_loss_3": 2427.6190795898438, + "kl_loss_6": 1553.103253173828, + "learning_rate": 0.0006946350531586958, + "loss": 1207.2389, + "step": 3790 + }, + { + "ce_loss_12": 3.167103588581085, + "ce_loss_17": 2.980492651462555, + "ce_loss_23": 2.899238634109497, + "ce_loss_3": 4.059364914894104, + "ce_loss_6": 3.6126387119293213, + "epoch": 0.38, + "grad_norm": 1272.0, + "kl_loss_12": 621.873583984375, + "kl_loss_17": 183.54898376464843, + "kl_loss_3": 2451.3309692382813, + "kl_loss_6": 1567.5563354492188, + "learning_rate": 0.0006931725628465643, + "loss": 1230.7523, + "step": 3800 + }, + { + "ce_loss_12": 3.169570064544678, + "ce_loss_17": 2.978095519542694, + "ce_loss_23": 2.8957305431365965, + "ce_loss_3": 4.0779964327812195, + "ce_loss_6": 3.613682174682617, + "epoch": 0.381, + "grad_norm": 1912.0, + "kl_loss_12": 631.6369598388671, + "kl_loss_17": 187.99442596435546, + "kl_loss_3": 2470.4658142089843, + "kl_loss_6": 1564.417218017578, + "learning_rate": 0.0006917081272888696, + "loss": 1217.5242, + "step": 3810 + }, + { + "ce_loss_12": 3.0985295176506042, + "ce_loss_17": 2.9078894376754763, + "ce_loss_23": 2.826605522632599, + "ce_loss_3": 4.035587120056152, + "ce_loss_6": 3.56057288646698, + "epoch": 0.382, + "grad_norm": 1272.0, + "kl_loss_12": 634.5787231445313, + "kl_loss_17": 190.97016448974608, + "kl_loss_3": 2558.696630859375, + "kl_loss_6": 1607.7907287597657, + "learning_rate": 0.0006902417612324615, + "loss": 1221.291, + "step": 3820 + }, + { + "ce_loss_12": 3.224716365337372, + "ce_loss_17": 3.0207711338996885, + "ce_loss_23": 2.934830629825592, + "ce_loss_3": 4.1563934803009035, + "ce_loss_6": 3.690185070037842, + "epoch": 0.383, + "grad_norm": 1192.0, + "kl_loss_12": 660.1558837890625, + "kl_loss_17": 197.20094680786133, + "kl_loss_3": 2556.5868530273438, + "kl_loss_6": 1635.0429321289062, + "learning_rate": 0.00068877347944363, + "loss": 1245.9041, + "step": 3830 + }, + { + "ce_loss_12": 3.2066277265548706, + "ce_loss_17": 3.01594934463501, + "ce_loss_23": 2.9391089200973513, + "ce_loss_3": 4.07437949180603, + "ce_loss_6": 3.6364644885063173, + "epoch": 0.384, + "grad_norm": 1400.0, + "kl_loss_12": 622.8762481689453, + "kl_loss_17": 187.0550849914551, + "kl_loss_3": 2403.3736328125, + "kl_loss_6": 1543.0215576171875, + "learning_rate": 0.0006873032967079561, + "loss": 1211.8893, + "step": 3840 + }, + { + "ce_loss_12": 3.1839053630828857, + "ce_loss_17": 3.008801805973053, + "ce_loss_23": 2.9267475485801695, + "ce_loss_3": 4.052586698532105, + "ce_loss_6": 3.611920189857483, + "epoch": 0.385, + "grad_norm": 972.0, + "kl_loss_12": 618.1469970703125, + "kl_loss_17": 184.93197021484374, + "kl_loss_3": 2401.352734375, + "kl_loss_6": 1532.117254638672, + "learning_rate": 0.0006858312278301637, + "loss": 1188.9611, + "step": 3850 + }, + { + "ce_loss_12": 3.2220707893371583, + "ce_loss_17": 3.0395336985588073, + "ce_loss_23": 2.9616737246513365, + "ce_loss_3": 4.066495501995087, + "ce_loss_6": 3.6387622594833373, + "epoch": 0.386, + "grad_norm": 1336.0, + "kl_loss_12": 621.8028442382813, + "kl_loss_17": 186.69632263183593, + "kl_loss_3": 2369.3518310546874, + "kl_loss_6": 1525.177227783203, + "learning_rate": 0.0006843572876339704, + "loss": 1187.79, + "step": 3860 + }, + { + "ce_loss_12": 3.1393592596054076, + "ce_loss_17": 2.9591917514801027, + "ce_loss_23": 2.883060610294342, + "ce_loss_3": 3.990993869304657, + "ce_loss_6": 3.564140295982361, + "epoch": 0.387, + "grad_norm": 1384.0, + "kl_loss_12": 611.3696350097656, + "kl_loss_17": 179.5668182373047, + "kl_loss_3": 2362.6711669921874, + "kl_loss_6": 1517.403125, + "learning_rate": 0.0006828814909619373, + "loss": 1224.166, + "step": 3870 + }, + { + "ce_loss_12": 3.266438162326813, + "ce_loss_17": 3.078672707080841, + "ce_loss_23": 2.997675633430481, + "ce_loss_3": 4.1469242691993715, + "ce_loss_6": 3.694972002506256, + "epoch": 0.388, + "grad_norm": 1032.0, + "kl_loss_12": 625.2180084228515, + "kl_loss_17": 189.66805419921874, + "kl_loss_3": 2417.526257324219, + "kl_loss_6": 1538.9678100585938, + "learning_rate": 0.0006814038526753205, + "loss": 1181.2365, + "step": 3880 + }, + { + "ce_loss_12": 3.1726443648338316, + "ce_loss_17": 2.9824947476387025, + "ce_loss_23": 2.9020296216011046, + "ce_loss_3": 4.046648359298706, + "ce_loss_6": 3.605487620830536, + "epoch": 0.389, + "grad_norm": 1256.0, + "kl_loss_12": 618.3945541381836, + "kl_loss_17": 184.8795082092285, + "kl_loss_3": 2406.640905761719, + "kl_loss_6": 1535.630078125, + "learning_rate": 0.0006799243876539213, + "loss": 1194.9497, + "step": 3890 + }, + { + "ce_loss_12": 3.097509813308716, + "ce_loss_17": 2.9127988815307617, + "ce_loss_23": 2.8334673643112183, + "ce_loss_3": 4.022769427299499, + "ce_loss_6": 3.5444315314292907, + "epoch": 0.39, + "grad_norm": 1216.0, + "kl_loss_12": 619.7410095214843, + "kl_loss_17": 183.83202362060547, + "kl_loss_3": 2497.6127685546876, + "kl_loss_6": 1557.9707397460938, + "learning_rate": 0.0006784431107959359, + "loss": 1221.2479, + "step": 3900 + }, + { + "ce_loss_12": 3.159643363952637, + "ce_loss_17": 2.9656416535377503, + "ce_loss_23": 2.8849711060523986, + "ce_loss_3": 4.089113199710846, + "ce_loss_6": 3.6197752594947814, + "epoch": 0.391, + "grad_norm": 2512.0, + "kl_loss_12": 639.0259399414062, + "kl_loss_17": 189.73209686279296, + "kl_loss_3": 2532.8427001953123, + "kl_loss_6": 1597.2655395507813, + "learning_rate": 0.0006769600370178059, + "loss": 1216.9616, + "step": 3910 + }, + { + "ce_loss_12": 3.119121015071869, + "ce_loss_17": 2.9354542970657347, + "ce_loss_23": 2.8540964841842653, + "ce_loss_3": 4.018436765670776, + "ce_loss_6": 3.5637188076972963, + "epoch": 0.392, + "grad_norm": 984.0, + "kl_loss_12": 626.1691375732422, + "kl_loss_17": 185.33537521362305, + "kl_loss_3": 2448.906945800781, + "kl_loss_6": 1563.4917846679687, + "learning_rate": 0.0006754751812540679, + "loss": 1190.9987, + "step": 3920 + }, + { + "ce_loss_12": 3.1707355618476867, + "ce_loss_17": 2.9823012948036194, + "ce_loss_23": 2.8973543882369994, + "ce_loss_3": 4.073235404491425, + "ce_loss_6": 3.612573671340942, + "epoch": 0.393, + "grad_norm": 1416.0, + "kl_loss_12": 631.8544799804688, + "kl_loss_17": 195.9771957397461, + "kl_loss_3": 2500.5748901367188, + "kl_loss_6": 1586.8189819335937, + "learning_rate": 0.0006739885584572025, + "loss": 1230.1657, + "step": 3930 + }, + { + "ce_loss_12": 3.185942196846008, + "ce_loss_17": 3.0003496408462524, + "ce_loss_23": 2.9172792553901674, + "ce_loss_3": 4.115244591236115, + "ce_loss_6": 3.6516054034233094, + "epoch": 0.394, + "grad_norm": 1272.0, + "kl_loss_12": 634.0373046875, + "kl_loss_17": 193.56379318237305, + "kl_loss_3": 2552.929479980469, + "kl_loss_6": 1623.9166687011718, + "learning_rate": 0.0006725001835974853, + "loss": 1219.0386, + "step": 3940 + }, + { + "ce_loss_12": 3.1856348872184754, + "ce_loss_17": 2.9992774367332458, + "ce_loss_23": 2.9162330746650698, + "ce_loss_3": 4.097616982460022, + "ce_loss_6": 3.641265344619751, + "epoch": 0.395, + "grad_norm": 1120.0, + "kl_loss_12": 633.9897552490235, + "kl_loss_17": 193.42904891967774, + "kl_loss_3": 2500.8137084960936, + "kl_loss_6": 1591.2876098632812, + "learning_rate": 0.0006710100716628344, + "loss": 1198.0403, + "step": 3950 + }, + { + "ce_loss_12": 3.1679529190063476, + "ce_loss_17": 2.9761669516563414, + "ce_loss_23": 2.8977315187454225, + "ce_loss_3": 4.07511625289917, + "ce_loss_6": 3.6221744894981383, + "epoch": 0.396, + "grad_norm": 1208.0, + "kl_loss_12": 621.3185638427734, + "kl_loss_17": 185.20935287475587, + "kl_loss_3": 2475.409619140625, + "kl_loss_6": 1576.9202758789063, + "learning_rate": 0.0006695182376586602, + "loss": 1223.8219, + "step": 3960 + }, + { + "ce_loss_12": 3.1788403749465943, + "ce_loss_17": 3.0021064639091493, + "ce_loss_23": 2.9263312816619873, + "ce_loss_3": 4.03387680053711, + "ce_loss_6": 3.5939263820648195, + "epoch": 0.397, + "grad_norm": 1384.0, + "kl_loss_12": 590.8750671386719, + "kl_loss_17": 177.05664672851563, + "kl_loss_3": 2330.8658203125, + "kl_loss_6": 1466.253466796875, + "learning_rate": 0.000668024696607715, + "loss": 1206.6646, + "step": 3970 + }, + { + "ce_loss_12": 3.154967558383942, + "ce_loss_17": 2.9758860230445863, + "ce_loss_23": 2.896761977672577, + "ce_loss_3": 4.04023425579071, + "ce_loss_6": 3.598306691646576, + "epoch": 0.398, + "grad_norm": 1136.0, + "kl_loss_12": 610.6574310302734, + "kl_loss_17": 182.25591812133788, + "kl_loss_3": 2431.3807861328123, + "kl_loss_6": 1550.7834106445312, + "learning_rate": 0.0006665294635499404, + "loss": 1196.3627, + "step": 3980 + }, + { + "ce_loss_12": 3.180202877521515, + "ce_loss_17": 2.985775589942932, + "ce_loss_23": 2.9002400159835817, + "ce_loss_3": 4.108720588684082, + "ce_loss_6": 3.642514145374298, + "epoch": 0.399, + "grad_norm": 1288.0, + "kl_loss_12": 647.539013671875, + "kl_loss_17": 194.1472381591797, + "kl_loss_3": 2564.7157470703123, + "kl_loss_6": 1636.1509643554687, + "learning_rate": 0.0006650325535423167, + "loss": 1229.0541, + "step": 3990 + }, + { + "ce_loss_12": 3.1735582828521727, + "ce_loss_17": 2.9931819081306457, + "ce_loss_23": 2.9180276870727537, + "ce_loss_3": 4.016478252410889, + "ce_loss_6": 3.5862005710601808, + "epoch": 0.4, + "grad_norm": 1216.0, + "kl_loss_12": 596.537808227539, + "kl_loss_17": 179.41532363891602, + "kl_loss_3": 2333.13134765625, + "kl_loss_6": 1479.8255737304687, + "learning_rate": 0.0006635339816587109, + "loss": 1185.9506, + "step": 4000 + }, + { + "ce_loss_12": 3.1252514243125917, + "ce_loss_17": 2.9375029802322388, + "ce_loss_23": 2.8593607783317565, + "ce_loss_3": 4.044191861152649, + "ce_loss_6": 3.57818717956543, + "epoch": 0.401, + "grad_norm": 932.0, + "kl_loss_12": 624.591438293457, + "kl_loss_17": 188.9966079711914, + "kl_loss_3": 2504.6765014648436, + "kl_loss_6": 1584.8181884765625, + "learning_rate": 0.0006620337629897252, + "loss": 1203.3638, + "step": 4010 + }, + { + "ce_loss_12": 3.1388319969177245, + "ce_loss_17": 2.948263680934906, + "ce_loss_23": 2.8623416900634764, + "ce_loss_3": 4.0201987504959105, + "ce_loss_6": 3.5734472393989565, + "epoch": 0.402, + "grad_norm": 1344.0, + "kl_loss_12": 621.6943450927735, + "kl_loss_17": 188.1446647644043, + "kl_loss_3": 2436.329235839844, + "kl_loss_6": 1549.45478515625, + "learning_rate": 0.0006605319126425454, + "loss": 1219.1758, + "step": 4020 + }, + { + "ce_loss_12": 3.0478885412216186, + "ce_loss_17": 2.8635390639305114, + "ce_loss_23": 2.7847749173641203, + "ce_loss_3": 3.9780253887176515, + "ce_loss_6": 3.509523332118988, + "epoch": 0.403, + "grad_norm": 1088.0, + "kl_loss_12": 623.6564025878906, + "kl_loss_17": 187.83245315551758, + "kl_loss_3": 2524.8685791015623, + "kl_loss_6": 1599.0273132324219, + "learning_rate": 0.0006590284457407876, + "loss": 1219.572, + "step": 4030 + }, + { + "ce_loss_12": 3.1345214009284974, + "ce_loss_17": 2.947396790981293, + "ce_loss_23": 2.8696147322654726, + "ce_loss_3": 4.032974493503571, + "ce_loss_6": 3.581662690639496, + "epoch": 0.404, + "grad_norm": 1296.0, + "kl_loss_12": 622.2457672119141, + "kl_loss_17": 187.03895874023436, + "kl_loss_3": 2448.82939453125, + "kl_loss_6": 1555.9044067382813, + "learning_rate": 0.0006575233774243465, + "loss": 1200.8846, + "step": 4040 + }, + { + "ce_loss_12": 3.136705255508423, + "ce_loss_17": 2.9500337719917296, + "ce_loss_23": 2.8683730244636534, + "ce_loss_3": 4.042927157878876, + "ce_loss_6": 3.5823343634605407, + "epoch": 0.405, + "grad_norm": 1152.0, + "kl_loss_12": 633.6688507080078, + "kl_loss_17": 191.35879440307616, + "kl_loss_3": 2483.2155151367188, + "kl_loss_6": 1563.1441589355468, + "learning_rate": 0.0006560167228492435, + "loss": 1213.0156, + "step": 4050 + }, + { + "ce_loss_12": 3.1671351194381714, + "ce_loss_17": 2.9819561004638673, + "ce_loss_23": 2.906921589374542, + "ce_loss_3": 4.0319117426872255, + "ce_loss_6": 3.59480482339859, + "epoch": 0.406, + "grad_norm": 1400.0, + "kl_loss_12": 603.7755615234375, + "kl_loss_17": 181.3179901123047, + "kl_loss_3": 2384.0647888183594, + "kl_loss_6": 1515.6968994140625, + "learning_rate": 0.0006545084971874737, + "loss": 1202.403, + "step": 4060 + }, + { + "ce_loss_12": 3.1415469646453857, + "ce_loss_17": 2.942392110824585, + "ce_loss_23": 2.859453630447388, + "ce_loss_3": 4.05655928850174, + "ce_loss_6": 3.5980806589126586, + "epoch": 0.407, + "grad_norm": 1488.0, + "kl_loss_12": 641.6716430664062, + "kl_loss_17": 192.19773025512694, + "kl_loss_3": 2512.15908203125, + "kl_loss_6": 1599.2367370605468, + "learning_rate": 0.0006529987156268526, + "loss": 1204.36, + "step": 4070 + }, + { + "ce_loss_12": 3.066053903102875, + "ce_loss_17": 2.8747988820075987, + "ce_loss_23": 2.7897343158721926, + "ce_loss_3": 3.970530641078949, + "ce_loss_6": 3.5074979782104494, + "epoch": 0.408, + "grad_norm": 1120.0, + "kl_loss_12": 621.1017639160157, + "kl_loss_17": 185.82689056396484, + "kl_loss_3": 2472.4393676757813, + "kl_loss_6": 1557.4956115722657, + "learning_rate": 0.0006514873933708637, + "loss": 1230.9574, + "step": 4080 + }, + { + "ce_loss_12": 3.164853739738464, + "ce_loss_17": 2.9797329902648926, + "ce_loss_23": 2.9014252185821534, + "ce_loss_3": 4.050824499130249, + "ce_loss_6": 3.603599953651428, + "epoch": 0.409, + "grad_norm": 1272.0, + "kl_loss_12": 608.2298370361328, + "kl_loss_17": 180.71694564819336, + "kl_loss_3": 2420.9696533203123, + "kl_loss_6": 1539.03232421875, + "learning_rate": 0.0006499745456385053, + "loss": 1189.1555, + "step": 4090 + }, + { + "ce_loss_12": 3.1311935782432556, + "ce_loss_17": 2.9477484822273254, + "ce_loss_23": 2.8674038887023925, + "ce_loss_3": 4.031777393817902, + "ce_loss_6": 3.5701063513755797, + "epoch": 0.41, + "grad_norm": 1376.0, + "kl_loss_12": 621.2535614013672, + "kl_loss_17": 185.45141220092773, + "kl_loss_3": 2455.026086425781, + "kl_loss_6": 1553.7825561523437, + "learning_rate": 0.0006484601876641375, + "loss": 1211.484, + "step": 4100 + }, + { + "ce_loss_12": 3.115593373775482, + "ce_loss_17": 2.9372752904891968, + "ce_loss_23": 2.8610817909240724, + "ce_loss_3": 3.974239504337311, + "ce_loss_6": 3.5369308471679686, + "epoch": 0.411, + "grad_norm": 1488.0, + "kl_loss_12": 603.7558715820312, + "kl_loss_17": 180.66451187133788, + "kl_loss_3": 2351.790808105469, + "kl_loss_6": 1493.5529357910157, + "learning_rate": 0.000646944334697328, + "loss": 1175.2756, + "step": 4110 + }, + { + "ce_loss_12": 3.219191825389862, + "ce_loss_17": 3.0355695724487304, + "ce_loss_23": 2.9580381751060485, + "ce_loss_3": 4.060825407505035, + "ce_loss_6": 3.6392863154411317, + "epoch": 0.412, + "grad_norm": 1296.0, + "kl_loss_12": 605.027815246582, + "kl_loss_17": 181.11124267578126, + "kl_loss_3": 2321.31376953125, + "kl_loss_6": 1490.2986083984374, + "learning_rate": 0.0006454270020026995, + "loss": 1157.5808, + "step": 4120 + }, + { + "ce_loss_12": 3.190293347835541, + "ce_loss_17": 3.0151625514030456, + "ce_loss_23": 2.938095223903656, + "ce_loss_3": 4.0323722004890445, + "ce_loss_6": 3.604380559921265, + "epoch": 0.413, + "grad_norm": 1112.0, + "kl_loss_12": 593.6603637695313, + "kl_loss_17": 176.89496612548828, + "kl_loss_3": 2321.645892333984, + "kl_loss_6": 1483.1813537597657, + "learning_rate": 0.0006439082048597755, + "loss": 1153.7059, + "step": 4130 + }, + { + "ce_loss_12": 3.1885109305381776, + "ce_loss_17": 3.0001250505447388, + "ce_loss_23": 2.9225224018096925, + "ce_loss_3": 4.0738829374313354, + "ce_loss_6": 3.6269838571548463, + "epoch": 0.414, + "grad_norm": 1112.0, + "kl_loss_12": 621.9543090820313, + "kl_loss_17": 183.03130035400392, + "kl_loss_3": 2435.958605957031, + "kl_loss_6": 1556.7580078125, + "learning_rate": 0.0006423879585628261, + "loss": 1204.4623, + "step": 4140 + }, + { + "ce_loss_12": 3.146387219429016, + "ce_loss_17": 2.955020797252655, + "ce_loss_23": 2.876465117931366, + "ce_loss_3": 4.061364984512329, + "ce_loss_6": 3.600324022769928, + "epoch": 0.415, + "grad_norm": 1048.0, + "kl_loss_12": 634.1819671630859, + "kl_loss_17": 187.38957138061522, + "kl_loss_3": 2494.6953979492187, + "kl_loss_6": 1589.542950439453, + "learning_rate": 0.0006408662784207149, + "loss": 1219.8756, + "step": 4150 + }, + { + "ce_loss_12": 3.1147804141044615, + "ce_loss_17": 2.933422553539276, + "ce_loss_23": 2.856143128871918, + "ce_loss_3": 4.003896868228912, + "ce_loss_6": 3.5593339085578917, + "epoch": 0.416, + "grad_norm": 1384.0, + "kl_loss_12": 614.3667526245117, + "kl_loss_17": 179.37629470825195, + "kl_loss_3": 2439.1699462890624, + "kl_loss_6": 1557.9551391601562, + "learning_rate": 0.0006393431797567439, + "loss": 1198.1912, + "step": 4160 + }, + { + "ce_loss_12": 3.1870901346206666, + "ce_loss_17": 3.0070515036582948, + "ce_loss_23": 2.9304370403289797, + "ce_loss_3": 4.027340471744537, + "ce_loss_6": 3.5912632942199707, + "epoch": 0.417, + "grad_norm": 1088.0, + "kl_loss_12": 604.4052337646484, + "kl_loss_17": 181.3666244506836, + "kl_loss_3": 2345.2923583984375, + "kl_loss_6": 1478.760107421875, + "learning_rate": 0.0006378186779084996, + "loss": 1144.3483, + "step": 4170 + }, + { + "ce_loss_12": 3.036437451839447, + "ce_loss_17": 2.847336399555206, + "ce_loss_23": 2.768596684932709, + "ce_loss_3": 3.942387247085571, + "ce_loss_6": 3.491739511489868, + "epoch": 0.418, + "grad_norm": 1648.0, + "kl_loss_12": 625.4832733154296, + "kl_loss_17": 184.05609970092775, + "kl_loss_3": 2447.228466796875, + "kl_loss_6": 1568.6175109863282, + "learning_rate": 0.0006362927882276989, + "loss": 1213.3297, + "step": 4180 + }, + { + "ce_loss_12": 3.1968475103378298, + "ce_loss_17": 3.0216124773025514, + "ce_loss_23": 2.943992519378662, + "ce_loss_3": 4.047649502754211, + "ce_loss_6": 3.618110203742981, + "epoch": 0.419, + "grad_norm": 1872.0, + "kl_loss_12": 593.755502319336, + "kl_loss_17": 177.35777282714844, + "kl_loss_3": 2344.962713623047, + "kl_loss_6": 1489.8335815429687, + "learning_rate": 0.000634765526080034, + "loss": 1148.7749, + "step": 4190 + }, + { + "ce_loss_12": 3.2133609533309935, + "ce_loss_17": 3.02979691028595, + "ce_loss_23": 2.94939581155777, + "ce_loss_3": 4.076133000850677, + "ce_loss_6": 3.640803503990173, + "epoch": 0.42, + "grad_norm": 1328.0, + "kl_loss_12": 612.8868621826172, + "kl_loss_17": 185.13345108032226, + "kl_loss_3": 2380.922399902344, + "kl_loss_6": 1522.4027038574218, + "learning_rate": 0.0006332369068450174, + "loss": 1170.4217, + "step": 4200 + }, + { + "ce_loss_12": 3.1545344591140747, + "ce_loss_17": 2.970043087005615, + "ce_loss_23": 2.8947736382484437, + "ce_loss_3": 4.039136958122254, + "ce_loss_6": 3.5878245711326597, + "epoch": 0.421, + "grad_norm": 1240.0, + "kl_loss_12": 610.0253753662109, + "kl_loss_17": 180.47613830566405, + "kl_loss_3": 2412.1201416015624, + "kl_loss_6": 1535.6862182617188, + "learning_rate": 0.0006317069459158283, + "loss": 1178.7123, + "step": 4210 + }, + { + "ce_loss_12": 3.2384790778160095, + "ce_loss_17": 3.064324343204498, + "ce_loss_23": 2.9877140641212465, + "ce_loss_3": 4.089816331863403, + "ce_loss_6": 3.6487473249435425, + "epoch": 0.422, + "grad_norm": 1112.0, + "kl_loss_12": 602.6154113769531, + "kl_loss_17": 181.77598495483397, + "kl_loss_3": 2361.879040527344, + "kl_loss_6": 1490.4651489257812, + "learning_rate": 0.0006301756586991561, + "loss": 1164.6935, + "step": 4220 + }, + { + "ce_loss_12": 3.0499517917633057, + "ce_loss_17": 2.8621727347373964, + "ce_loss_23": 2.7846267104148863, + "ce_loss_3": 3.9611988425254823, + "ce_loss_6": 3.5068413972854615, + "epoch": 0.423, + "grad_norm": 1128.0, + "kl_loss_12": 619.654345703125, + "kl_loss_17": 183.03083038330078, + "kl_loss_3": 2495.934606933594, + "kl_loss_6": 1583.8559020996095, + "learning_rate": 0.0006286430606150459, + "loss": 1204.7939, + "step": 4230 + }, + { + "ce_loss_12": 3.239712488651276, + "ce_loss_17": 3.0582516312599184, + "ce_loss_23": 2.980309987068176, + "ce_loss_3": 4.110534727573395, + "ce_loss_6": 3.6744491338729857, + "epoch": 0.424, + "grad_norm": 1064.0, + "kl_loss_12": 613.473355102539, + "kl_loss_17": 183.81760330200194, + "kl_loss_3": 2406.3619384765625, + "kl_loss_6": 1523.7447509765625, + "learning_rate": 0.0006271091670967436, + "loss": 1179.3695, + "step": 4240 + }, + { + "ce_loss_12": 3.171178638935089, + "ce_loss_17": 2.977051842212677, + "ce_loss_23": 2.8960009932518007, + "ce_loss_3": 4.089389193058014, + "ce_loss_6": 3.6281680345535277, + "epoch": 0.425, + "grad_norm": 1360.0, + "kl_loss_12": 650.4041778564454, + "kl_loss_17": 190.87026596069336, + "kl_loss_3": 2532.585070800781, + "kl_loss_6": 1618.9200622558594, + "learning_rate": 0.0006255739935905395, + "loss": 1212.168, + "step": 4250 + }, + { + "ce_loss_12": 3.192930221557617, + "ce_loss_17": 3.007972240447998, + "ce_loss_23": 2.931060564517975, + "ce_loss_3": 4.054401051998139, + "ce_loss_6": 3.6081828713417052, + "epoch": 0.426, + "grad_norm": 1216.0, + "kl_loss_12": 611.473422241211, + "kl_loss_17": 182.98992462158202, + "kl_loss_3": 2384.975927734375, + "kl_loss_6": 1505.831365966797, + "learning_rate": 0.0006240375555556145, + "loss": 1215.0896, + "step": 4260 + }, + { + "ce_loss_12": 3.1992557883262633, + "ce_loss_17": 3.009497511386871, + "ce_loss_23": 2.930485475063324, + "ce_loss_3": 4.107662343978882, + "ce_loss_6": 3.652487301826477, + "epoch": 0.427, + "grad_norm": 1192.0, + "kl_loss_12": 621.4902252197265, + "kl_loss_17": 183.88353729248047, + "kl_loss_3": 2474.84443359375, + "kl_loss_6": 1575.9041320800782, + "learning_rate": 0.000622499868463882, + "loss": 1204.1057, + "step": 4270 + }, + { + "ce_loss_12": 3.1563706755638123, + "ce_loss_17": 2.9828593254089357, + "ce_loss_23": 2.907644438743591, + "ce_loss_3": 4.003303325176239, + "ce_loss_6": 3.5699565291404722, + "epoch": 0.428, + "grad_norm": 1112.0, + "kl_loss_12": 595.4530853271484, + "kl_loss_17": 179.88403701782227, + "kl_loss_3": 2351.163684082031, + "kl_loss_6": 1484.134100341797, + "learning_rate": 0.0006209609477998338, + "loss": 1169.7238, + "step": 4280 + }, + { + "ce_loss_12": 3.22188595533371, + "ce_loss_17": 3.0394359707832335, + "ce_loss_23": 2.957653522491455, + "ce_loss_3": 4.087126052379608, + "ce_loss_6": 3.6486384749412535, + "epoch": 0.429, + "grad_norm": 1480.0, + "kl_loss_12": 613.1925567626953, + "kl_loss_17": 185.11133728027343, + "kl_loss_3": 2390.9447509765623, + "kl_loss_6": 1525.0828063964843, + "learning_rate": 0.0006194208090603844, + "loss": 1196.0419, + "step": 4290 + }, + { + "ce_loss_12": 3.1381272315979003, + "ce_loss_17": 2.96423180103302, + "ce_loss_23": 2.8869675517082216, + "ce_loss_3": 4.011167562007904, + "ce_loss_6": 3.5743077039718627, + "epoch": 0.43, + "grad_norm": 916.0, + "kl_loss_12": 594.9730010986328, + "kl_loss_17": 176.15359191894532, + "kl_loss_3": 2377.043664550781, + "kl_loss_6": 1502.032763671875, + "learning_rate": 0.0006178794677547138, + "loss": 1156.8104, + "step": 4300 + }, + { + "ce_loss_12": 3.173213839530945, + "ce_loss_17": 2.9872260212898256, + "ce_loss_23": 2.908361482620239, + "ce_loss_3": 4.054434442520142, + "ce_loss_6": 3.614891541004181, + "epoch": 0.431, + "grad_norm": 1296.0, + "kl_loss_12": 621.83984375, + "kl_loss_17": 182.7801429748535, + "kl_loss_3": 2430.5281005859374, + "kl_loss_6": 1561.0057495117187, + "learning_rate": 0.0006163369394041111, + "loss": 1190.3025, + "step": 4310 + }, + { + "ce_loss_12": 3.102406346797943, + "ce_loss_17": 2.9151142835617065, + "ce_loss_23": 2.837439149618149, + "ce_loss_3": 4.009701907634735, + "ce_loss_6": 3.553163170814514, + "epoch": 0.432, + "grad_norm": 1384.0, + "kl_loss_12": 613.7666687011719, + "kl_loss_17": 180.23058700561523, + "kl_loss_3": 2456.4883544921877, + "kl_loss_6": 1556.5386901855468, + "learning_rate": 0.0006147932395418205, + "loss": 1222.4783, + "step": 4320 + }, + { + "ce_loss_12": 3.141595554351807, + "ce_loss_17": 2.9644735455513, + "ce_loss_23": 2.88581862449646, + "ce_loss_3": 3.9972390055656435, + "ce_loss_6": 3.5638728499412538, + "epoch": 0.433, + "grad_norm": 1016.0, + "kl_loss_12": 605.8929626464844, + "kl_loss_17": 179.01886596679688, + "kl_loss_3": 2384.8349670410157, + "kl_loss_6": 1514.4691040039063, + "learning_rate": 0.0006132483837128823, + "loss": 1165.2039, + "step": 4330 + }, + { + "ce_loss_12": 3.1127786874771117, + "ce_loss_17": 2.9356310844421385, + "ce_loss_23": 2.8605749011039734, + "ce_loss_3": 4.008680784702301, + "ce_loss_6": 3.5570395469665526, + "epoch": 0.434, + "grad_norm": 1272.0, + "kl_loss_12": 605.3210144042969, + "kl_loss_17": 179.10021896362304, + "kl_loss_3": 2443.418591308594, + "kl_loss_6": 1535.6750915527343, + "learning_rate": 0.0006117023874739772, + "loss": 1188.4191, + "step": 4340 + }, + { + "ce_loss_12": 3.122081291675568, + "ce_loss_17": 2.932915461063385, + "ce_loss_23": 2.8596187949180605, + "ce_loss_3": 4.009017038345337, + "ce_loss_6": 3.5538516879081725, + "epoch": 0.435, + "grad_norm": 1344.0, + "kl_loss_12": 616.5628082275391, + "kl_loss_17": 181.03787994384766, + "kl_loss_3": 2449.987927246094, + "kl_loss_6": 1549.4891540527344, + "learning_rate": 0.0006101552663932703, + "loss": 1206.0716, + "step": 4350 + }, + { + "ce_loss_12": 3.150739920139313, + "ce_loss_17": 2.968548262119293, + "ce_loss_23": 2.888369154930115, + "ce_loss_3": 4.025252389907837, + "ce_loss_6": 3.5858799815177917, + "epoch": 0.436, + "grad_norm": 952.0, + "kl_loss_12": 612.4619598388672, + "kl_loss_17": 184.53734970092773, + "kl_loss_3": 2408.1568603515625, + "kl_loss_6": 1534.4716857910157, + "learning_rate": 0.0006086070360502539, + "loss": 1181.0723, + "step": 4360 + }, + { + "ce_loss_12": 3.15641371011734, + "ce_loss_17": 2.9694243907928466, + "ce_loss_23": 2.8904993653297426, + "ce_loss_3": 4.025759434700012, + "ce_loss_6": 3.581454849243164, + "epoch": 0.437, + "grad_norm": 1000.0, + "kl_loss_12": 611.8678588867188, + "kl_loss_17": 180.19417724609374, + "kl_loss_3": 2418.9445251464845, + "kl_loss_6": 1541.1427673339845, + "learning_rate": 0.0006070577120355903, + "loss": 1191.9658, + "step": 4370 + }, + { + "ce_loss_12": 3.151238977909088, + "ce_loss_17": 2.9647199869155885, + "ce_loss_23": 2.888589072227478, + "ce_loss_3": 4.012544178962708, + "ce_loss_6": 3.5770498633384706, + "epoch": 0.438, + "grad_norm": 1072.0, + "kl_loss_12": 599.1971862792968, + "kl_loss_17": 178.08247756958008, + "kl_loss_3": 2343.4568359375, + "kl_loss_6": 1496.306219482422, + "learning_rate": 0.0006055073099509549, + "loss": 1172.6889, + "step": 4380 + }, + { + "ce_loss_12": 3.205288279056549, + "ce_loss_17": 3.02845641374588, + "ce_loss_23": 2.952479887008667, + "ce_loss_3": 4.062442588806152, + "ce_loss_6": 3.6250621557235716, + "epoch": 0.439, + "grad_norm": 1304.0, + "kl_loss_12": 604.9193298339844, + "kl_loss_17": 180.97334976196288, + "kl_loss_3": 2363.5929565429688, + "kl_loss_6": 1502.811962890625, + "learning_rate": 0.0006039558454088796, + "loss": 1185.3265, + "step": 4390 + }, + { + "ce_loss_12": 3.177826189994812, + "ce_loss_17": 2.992183196544647, + "ce_loss_23": 2.9130085825920107, + "ce_loss_3": 4.054619085788727, + "ce_loss_6": 3.6174099802970887, + "epoch": 0.44, + "grad_norm": 940.0, + "kl_loss_12": 616.7141555786133, + "kl_loss_17": 183.2505676269531, + "kl_loss_3": 2406.0845947265625, + "kl_loss_6": 1539.0552612304687, + "learning_rate": 0.0006024033340325954, + "loss": 1165.9447, + "step": 4400 + }, + { + "ce_loss_12": 3.235459733009338, + "ce_loss_17": 3.057520937919617, + "ce_loss_23": 2.982460641860962, + "ce_loss_3": 4.066870248317718, + "ce_loss_6": 3.643921208381653, + "epoch": 0.441, + "grad_norm": 896.0, + "kl_loss_12": 586.6485046386719, + "kl_loss_17": 173.9886131286621, + "kl_loss_3": 2291.1926147460936, + "kl_loss_6": 1456.7806884765625, + "learning_rate": 0.0006008497914558743, + "loss": 1149.7752, + "step": 4410 + }, + { + "ce_loss_12": 3.1969673156738283, + "ce_loss_17": 3.012665021419525, + "ce_loss_23": 2.93061842918396, + "ce_loss_3": 4.075615668296814, + "ce_loss_6": 3.6308741211891173, + "epoch": 0.442, + "grad_norm": 1184.0, + "kl_loss_12": 626.980126953125, + "kl_loss_17": 189.89006118774415, + "kl_loss_3": 2427.6785400390627, + "kl_loss_6": 1548.2725952148437, + "learning_rate": 0.0005992952333228728, + "loss": 1194.7676, + "step": 4420 + }, + { + "ce_loss_12": 3.131419849395752, + "ce_loss_17": 2.9515241384506226, + "ce_loss_23": 2.877249336242676, + "ce_loss_3": 4.010429859161377, + "ce_loss_6": 3.5680622458457947, + "epoch": 0.443, + "grad_norm": 1032.0, + "kl_loss_12": 603.8198883056641, + "kl_loss_17": 177.6677215576172, + "kl_loss_3": 2420.6113891601562, + "kl_loss_6": 1541.025, + "learning_rate": 0.0005977396752879741, + "loss": 1178.0658, + "step": 4430 + }, + { + "ce_loss_12": 3.0689117550849914, + "ce_loss_17": 2.8813390254974367, + "ce_loss_23": 2.8048648595809937, + "ce_loss_3": 3.9471965193748475, + "ce_loss_6": 3.5006853580474853, + "epoch": 0.444, + "grad_norm": 1008.0, + "kl_loss_12": 615.7683135986329, + "kl_loss_17": 178.2631462097168, + "kl_loss_3": 2437.4211181640626, + "kl_loss_6": 1548.4304260253907, + "learning_rate": 0.0005961831330156305, + "loss": 1178.2352, + "step": 4440 + }, + { + "ce_loss_12": 3.2002723097801207, + "ce_loss_17": 3.017963671684265, + "ce_loss_23": 2.9416601061820984, + "ce_loss_3": 4.094040215015411, + "ce_loss_6": 3.641245484352112, + "epoch": 0.445, + "grad_norm": 1144.0, + "kl_loss_12": 606.8301818847656, + "kl_loss_17": 178.37159881591796, + "kl_loss_3": 2438.9416381835936, + "kl_loss_6": 1540.8159240722657, + "learning_rate": 0.0005946256221802051, + "loss": 1207.8047, + "step": 4450 + }, + { + "ce_loss_12": 3.1546120762825014, + "ce_loss_17": 2.9822561979293822, + "ce_loss_23": 2.913159799575806, + "ce_loss_3": 3.9937642455101012, + "ce_loss_6": 3.5635142087936402, + "epoch": 0.446, + "grad_norm": 1512.0, + "kl_loss_12": 582.0013885498047, + "kl_loss_17": 173.38505935668945, + "kl_loss_3": 2304.2907653808593, + "kl_loss_6": 1465.9250854492188, + "learning_rate": 0.0005930671584658151, + "loss": 1199.5111, + "step": 4460 + }, + { + "ce_loss_12": 3.1791846990585326, + "ce_loss_17": 2.995867121219635, + "ce_loss_23": 2.921609342098236, + "ce_loss_3": 4.046207320690155, + "ce_loss_6": 3.6068936467170714, + "epoch": 0.447, + "grad_norm": 1240.0, + "kl_loss_12": 606.0908996582032, + "kl_loss_17": 179.20726013183594, + "kl_loss_3": 2392.2299682617186, + "kl_loss_6": 1517.7416748046876, + "learning_rate": 0.0005915077575661722, + "loss": 1189.5426, + "step": 4470 + }, + { + "ce_loss_12": 3.19470419883728, + "ce_loss_17": 3.0134175539016725, + "ce_loss_23": 2.936840128898621, + "ce_loss_3": 4.070160377025604, + "ce_loss_6": 3.620807874202728, + "epoch": 0.448, + "grad_norm": 2128.0, + "kl_loss_12": 618.5509094238281, + "kl_loss_17": 184.0712677001953, + "kl_loss_3": 2423.9630615234373, + "kl_loss_6": 1533.7049865722656, + "learning_rate": 0.000589947435184427, + "loss": 1172.1064, + "step": 4480 + }, + { + "ce_loss_12": 3.245382022857666, + "ce_loss_17": 3.069892108440399, + "ce_loss_23": 2.993940007686615, + "ce_loss_3": 4.063095688819885, + "ce_loss_6": 3.645653986930847, + "epoch": 0.449, + "grad_norm": 1088.0, + "kl_loss_12": 602.9532592773437, + "kl_loss_17": 179.3310974121094, + "kl_loss_3": 2319.9205810546873, + "kl_loss_6": 1483.8597900390625, + "learning_rate": 0.0005883862070330078, + "loss": 1164.0664, + "step": 4490 + }, + { + "ce_loss_12": 3.1951568722724915, + "ce_loss_17": 3.0084994196891786, + "ce_loss_23": 2.933244800567627, + "ce_loss_3": 4.06550110578537, + "ce_loss_6": 3.6232324600219727, + "epoch": 0.45, + "grad_norm": 1120.0, + "kl_loss_12": 613.0902038574219, + "kl_loss_17": 180.93068466186523, + "kl_loss_3": 2407.7741455078126, + "kl_loss_6": 1531.2658630371093, + "learning_rate": 0.0005868240888334653, + "loss": 1175.3828, + "step": 4500 + }, + { + "ce_loss_12": 3.0894825577735903, + "ce_loss_17": 2.90518798828125, + "ce_loss_23": 2.8252994894981383, + "ce_loss_3": 3.989874315261841, + "ce_loss_6": 3.528348672389984, + "epoch": 0.451, + "grad_norm": 1004.0, + "kl_loss_12": 614.2226287841797, + "kl_loss_17": 181.90268173217774, + "kl_loss_3": 2449.4162780761717, + "kl_loss_6": 1549.0628845214844, + "learning_rate": 0.0005852610963163119, + "loss": 1194.0572, + "step": 4510 + }, + { + "ce_loss_12": 3.1016993284225465, + "ce_loss_17": 2.9186312317848206, + "ce_loss_23": 2.845429253578186, + "ce_loss_3": 3.9625542402267455, + "ce_loss_6": 3.5307849884033202, + "epoch": 0.452, + "grad_norm": 1080.0, + "kl_loss_12": 602.321940612793, + "kl_loss_17": 176.20557403564453, + "kl_loss_3": 2377.343701171875, + "kl_loss_6": 1510.2603820800782, + "learning_rate": 0.0005836972452208654, + "loss": 1159.2561, + "step": 4520 + }, + { + "ce_loss_12": 3.1053978562355042, + "ce_loss_17": 2.926951897144318, + "ce_loss_23": 2.8549540877342223, + "ce_loss_3": 3.992169404029846, + "ce_loss_6": 3.5429995179176332, + "epoch": 0.453, + "grad_norm": 1184.0, + "kl_loss_12": 605.2806594848632, + "kl_loss_17": 180.5658836364746, + "kl_loss_3": 2423.1808471679688, + "kl_loss_6": 1526.9974060058594, + "learning_rate": 0.0005821325512950885, + "loss": 1183.519, + "step": 4530 + }, + { + "ce_loss_12": 3.1314645528793337, + "ce_loss_17": 2.948435127735138, + "ce_loss_23": 2.8767545342445375, + "ce_loss_3": 3.9899188756942747, + "ce_loss_6": 3.5530348300933836, + "epoch": 0.454, + "grad_norm": 1128.0, + "kl_loss_12": 586.1095260620117, + "kl_loss_17": 175.72238693237304, + "kl_loss_3": 2341.41015625, + "kl_loss_6": 1480.1607421875, + "learning_rate": 0.0005805670302954321, + "loss": 1169.1936, + "step": 4540 + }, + { + "ce_loss_12": 3.1308477878570558, + "ce_loss_17": 2.9560258388519287, + "ce_loss_23": 2.8843746423721313, + "ce_loss_3": 3.9922935366630554, + "ce_loss_6": 3.5547144174575807, + "epoch": 0.455, + "grad_norm": 1368.0, + "kl_loss_12": 589.7868408203125, + "kl_loss_17": 175.0533874511719, + "kl_loss_3": 2359.759100341797, + "kl_loss_6": 1493.0820190429688, + "learning_rate": 0.000579000697986675, + "loss": 1154.3197, + "step": 4550 + }, + { + "ce_loss_12": 3.106800544261932, + "ce_loss_17": 2.9211928248405457, + "ce_loss_23": 2.8407024145126343, + "ce_loss_3": 4.007343494892121, + "ce_loss_6": 3.560846769809723, + "epoch": 0.456, + "grad_norm": 992.0, + "kl_loss_12": 625.1224670410156, + "kl_loss_17": 182.62897796630858, + "kl_loss_3": 2458.1799377441407, + "kl_loss_6": 1573.9145263671876, + "learning_rate": 0.0005774335701417662, + "loss": 1183.7318, + "step": 4560 + }, + { + "ce_loss_12": 3.092662584781647, + "ce_loss_17": 2.9104947686195373, + "ce_loss_23": 2.8373430490493776, + "ce_loss_3": 4.005108213424682, + "ce_loss_6": 3.5392897725105286, + "epoch": 0.457, + "grad_norm": 1216.0, + "kl_loss_12": 602.2989013671875, + "kl_loss_17": 175.17914123535155, + "kl_loss_3": 2467.407958984375, + "kl_loss_6": 1549.7640441894532, + "learning_rate": 0.0005758656625416658, + "loss": 1184.0045, + "step": 4570 + }, + { + "ce_loss_12": 3.147735857963562, + "ce_loss_17": 2.9694060325622558, + "ce_loss_23": 2.8919848680496214, + "ce_loss_3": 4.014414978027344, + "ce_loss_6": 3.577460062503815, + "epoch": 0.458, + "grad_norm": 1296.0, + "kl_loss_12": 607.9190521240234, + "kl_loss_17": 181.15816345214844, + "kl_loss_3": 2386.3705200195313, + "kl_loss_6": 1522.2303894042968, + "learning_rate": 0.0005742969909751859, + "loss": 1162.5008, + "step": 4580 + }, + { + "ce_loss_12": 3.1607869029045106, + "ce_loss_17": 2.977490282058716, + "ce_loss_23": 2.902131140232086, + "ce_loss_3": 4.032933211326599, + "ce_loss_6": 3.5879942059516905, + "epoch": 0.459, + "grad_norm": 980.0, + "kl_loss_12": 605.3396240234375, + "kl_loss_17": 177.57530059814454, + "kl_loss_3": 2404.7664184570312, + "kl_loss_6": 1516.094061279297, + "learning_rate": 0.0005727275712388318, + "loss": 1182.758, + "step": 4590 + }, + { + "ce_loss_12": 3.164100635051727, + "ce_loss_17": 2.9944262981414793, + "ce_loss_23": 2.9201043009757996, + "ce_loss_3": 4.0042870998382565, + "ce_loss_6": 3.5822572112083435, + "epoch": 0.46, + "grad_norm": 1136.0, + "kl_loss_12": 582.4186553955078, + "kl_loss_17": 172.64603500366212, + "kl_loss_3": 2319.947625732422, + "kl_loss_6": 1475.0054870605468, + "learning_rate": 0.0005711574191366427, + "loss": 1154.0926, + "step": 4600 + }, + { + "ce_loss_12": 3.123515796661377, + "ce_loss_17": 2.946636915206909, + "ce_loss_23": 2.8758048892021177, + "ce_loss_3": 3.9856918692588805, + "ce_loss_6": 3.548724818229675, + "epoch": 0.461, + "grad_norm": 1208.0, + "kl_loss_12": 592.2702346801758, + "kl_loss_17": 173.98607940673827, + "kl_loss_3": 2361.8337829589846, + "kl_loss_6": 1486.2902221679688, + "learning_rate": 0.0005695865504800327, + "loss": 1150.3572, + "step": 4610 + }, + { + "ce_loss_12": 3.086802434921265, + "ce_loss_17": 2.893968403339386, + "ce_loss_23": 2.812371277809143, + "ce_loss_3": 4.0360452890396115, + "ce_loss_6": 3.561065137386322, + "epoch": 0.462, + "grad_norm": 1168.0, + "kl_loss_12": 637.6817230224609, + "kl_loss_17": 187.44516372680664, + "kl_loss_3": 2576.360314941406, + "kl_loss_6": 1633.4139465332032, + "learning_rate": 0.0005680149810876322, + "loss": 1208.0934, + "step": 4620 + }, + { + "ce_loss_12": 3.1244890332221984, + "ce_loss_17": 2.949481213092804, + "ce_loss_23": 2.8752119183540343, + "ce_loss_3": 4.005128943920136, + "ce_loss_6": 3.5534419894218443, + "epoch": 0.463, + "grad_norm": 1136.0, + "kl_loss_12": 597.1886444091797, + "kl_loss_17": 177.4503646850586, + "kl_loss_3": 2404.171875, + "kl_loss_6": 1515.5715576171874, + "learning_rate": 0.0005664427267851271, + "loss": 1167.7977, + "step": 4630 + }, + { + "ce_loss_12": 3.046623146533966, + "ce_loss_17": 2.8676019072532655, + "ce_loss_23": 2.7937828421592714, + "ce_loss_3": 3.9279929876327513, + "ce_loss_6": 3.4800993323326113, + "epoch": 0.464, + "grad_norm": 1144.0, + "kl_loss_12": 589.9766616821289, + "kl_loss_17": 173.2282585144043, + "kl_loss_3": 2390.469445800781, + "kl_loss_6": 1500.6643432617188, + "learning_rate": 0.0005648698034051009, + "loss": 1160.5701, + "step": 4640 + }, + { + "ce_loss_12": 3.152076518535614, + "ce_loss_17": 2.969795060157776, + "ce_loss_23": 2.892859363555908, + "ce_loss_3": 4.063055515289307, + "ce_loss_6": 3.595481789112091, + "epoch": 0.465, + "grad_norm": 1112.0, + "kl_loss_12": 599.1829010009766, + "kl_loss_17": 176.86735610961915, + "kl_loss_3": 2457.741827392578, + "kl_loss_6": 1536.1931945800782, + "learning_rate": 0.0005632962267868747, + "loss": 1166.2303, + "step": 4650 + }, + { + "ce_loss_12": 3.094632935523987, + "ce_loss_17": 2.9150583505630494, + "ce_loss_23": 2.844318723678589, + "ce_loss_3": 3.9594098567962646, + "ce_loss_6": 3.519077444076538, + "epoch": 0.466, + "grad_norm": 1024.0, + "kl_loss_12": 585.1817733764649, + "kl_loss_17": 171.51151428222656, + "kl_loss_3": 2366.838446044922, + "kl_loss_6": 1495.7054382324218, + "learning_rate": 0.0005617220127763474, + "loss": 1164.3539, + "step": 4660 + }, + { + "ce_loss_12": 3.169011449813843, + "ce_loss_17": 2.991918349266052, + "ce_loss_23": 2.9186757445335387, + "ce_loss_3": 4.024391627311706, + "ce_loss_6": 3.585629200935364, + "epoch": 0.467, + "grad_norm": 1272.0, + "kl_loss_12": 592.2187469482421, + "kl_loss_17": 175.8935432434082, + "kl_loss_3": 2357.176135253906, + "kl_loss_6": 1491.6616333007812, + "learning_rate": 0.0005601471772258368, + "loss": 1167.6615, + "step": 4670 + }, + { + "ce_loss_12": 3.1551905155181883, + "ce_loss_17": 2.9784803986549377, + "ce_loss_23": 2.9041765928268433, + "ce_loss_3": 4.006201231479645, + "ce_loss_6": 3.5734246969223022, + "epoch": 0.468, + "grad_norm": 1544.0, + "kl_loss_12": 590.1953628540039, + "kl_loss_17": 175.58686141967775, + "kl_loss_3": 2324.540203857422, + "kl_loss_6": 1472.1936279296874, + "learning_rate": 0.0005585717359939192, + "loss": 1169.4467, + "step": 4680 + }, + { + "ce_loss_12": 3.0665130019187927, + "ce_loss_17": 2.8923226475715635, + "ce_loss_23": 2.8199669241905214, + "ce_loss_3": 3.920028805732727, + "ce_loss_6": 3.4936899185180663, + "epoch": 0.469, + "grad_norm": 976.0, + "kl_loss_12": 588.025910949707, + "kl_loss_17": 172.29647674560547, + "kl_loss_3": 2333.3386657714846, + "kl_loss_6": 1484.1640563964843, + "learning_rate": 0.0005569957049452703, + "loss": 1175.3557, + "step": 4690 + }, + { + "ce_loss_12": 3.1306982755661013, + "ce_loss_17": 2.9464347124099732, + "ce_loss_23": 2.8718415260314942, + "ce_loss_3": 4.01355744600296, + "ce_loss_6": 3.5599818110466, + "epoch": 0.47, + "grad_norm": 1496.0, + "kl_loss_12": 603.9425079345704, + "kl_loss_17": 177.87748336791992, + "kl_loss_3": 2412.801232910156, + "kl_loss_6": 1522.6386840820312, + "learning_rate": 0.0005554190999505056, + "loss": 1183.6113, + "step": 4700 + }, + { + "ce_loss_12": 3.250467562675476, + "ce_loss_17": 3.063236713409424, + "ce_loss_23": 2.987007200717926, + "ce_loss_3": 4.112039339542389, + "ce_loss_6": 3.6746443629264833, + "epoch": 0.471, + "grad_norm": 1240.0, + "kl_loss_12": 617.5975311279296, + "kl_loss_17": 181.46721878051758, + "kl_loss_3": 2398.5910400390626, + "kl_loss_6": 1523.2029846191406, + "learning_rate": 0.0005538419368860196, + "loss": 1141.2441, + "step": 4710 + }, + { + "ce_loss_12": 3.16205176115036, + "ce_loss_17": 2.9881635546684264, + "ce_loss_23": 2.912904453277588, + "ce_loss_3": 4.027502238750458, + "ce_loss_6": 3.5856967091560366, + "epoch": 0.472, + "grad_norm": 988.0, + "kl_loss_12": 595.6430084228516, + "kl_loss_17": 177.5866683959961, + "kl_loss_3": 2361.086083984375, + "kl_loss_6": 1491.6454711914062, + "learning_rate": 0.0005522642316338268, + "loss": 1183.4592, + "step": 4720 + }, + { + "ce_loss_12": 3.1781874775886534, + "ce_loss_17": 3.002910315990448, + "ce_loss_23": 2.9332908749580384, + "ce_loss_3": 4.03815621137619, + "ce_loss_6": 3.591979217529297, + "epoch": 0.473, + "grad_norm": 1608.0, + "kl_loss_12": 600.0981414794921, + "kl_loss_17": 175.05057067871093, + "kl_loss_3": 2363.9915405273437, + "kl_loss_6": 1487.9787231445312, + "learning_rate": 0.0005506860000814017, + "loss": 1187.1951, + "step": 4730 + }, + { + "ce_loss_12": 3.192247247695923, + "ce_loss_17": 3.020113730430603, + "ce_loss_23": 2.951002764701843, + "ce_loss_3": 4.026438915729523, + "ce_loss_6": 3.601203989982605, + "epoch": 0.474, + "grad_norm": 1208.0, + "kl_loss_12": 585.6193969726562, + "kl_loss_17": 169.32979736328124, + "kl_loss_3": 2318.0686279296874, + "kl_loss_6": 1473.0704528808594, + "learning_rate": 0.0005491072581215186, + "loss": 1158.4854, + "step": 4740 + }, + { + "ce_loss_12": 3.195116567611694, + "ce_loss_17": 3.014424669742584, + "ce_loss_23": 2.9354434967041017, + "ce_loss_3": 4.054661548137664, + "ce_loss_6": 3.6271575689315796, + "epoch": 0.475, + "grad_norm": 1360.0, + "kl_loss_12": 608.3226257324219, + "kl_loss_17": 181.31658325195312, + "kl_loss_3": 2394.527795410156, + "kl_loss_6": 1526.428887939453, + "learning_rate": 0.0005475280216520913, + "loss": 1152.4799, + "step": 4750 + }, + { + "ce_loss_12": 3.118545651435852, + "ce_loss_17": 2.9431566119194033, + "ce_loss_23": 2.870669364929199, + "ce_loss_3": 3.9644595265388487, + "ce_loss_6": 3.5394273519515993, + "epoch": 0.476, + "grad_norm": 1184.0, + "kl_loss_12": 583.4711624145508, + "kl_loss_17": 170.9568717956543, + "kl_loss_3": 2322.106579589844, + "kl_loss_6": 1471.2399169921875, + "learning_rate": 0.0005459483065760138, + "loss": 1175.8216, + "step": 4760 + }, + { + "ce_loss_12": 3.0740485072135924, + "ce_loss_17": 2.8895705699920655, + "ce_loss_23": 2.817665231227875, + "ce_loss_3": 3.995547378063202, + "ce_loss_6": 3.5293294548988343, + "epoch": 0.477, + "grad_norm": 1056.0, + "kl_loss_12": 604.4830047607422, + "kl_loss_17": 173.51560821533204, + "kl_loss_3": 2491.076806640625, + "kl_loss_6": 1575.8518920898437, + "learning_rate": 0.0005443681288009991, + "loss": 1182.1915, + "step": 4770 + }, + { + "ce_loss_12": 3.1086962938308718, + "ce_loss_17": 2.9318193912506105, + "ce_loss_23": 2.8591856479644777, + "ce_loss_3": 3.9815015316009523, + "ce_loss_6": 3.542095589637756, + "epoch": 0.478, + "grad_norm": 1072.0, + "kl_loss_12": 587.2374114990234, + "kl_loss_17": 174.34748458862305, + "kl_loss_3": 2389.9313842773436, + "kl_loss_6": 1511.5686096191407, + "learning_rate": 0.0005427875042394199, + "loss": 1168.9535, + "step": 4780 + }, + { + "ce_loss_12": 3.155102550983429, + "ce_loss_17": 2.9787638783454895, + "ce_loss_23": 2.9013282537460325, + "ce_loss_3": 4.002556395530701, + "ce_loss_6": 3.5739779591560366, + "epoch": 0.479, + "grad_norm": 1004.0, + "kl_loss_12": 596.6852615356445, + "kl_loss_17": 181.65953826904297, + "kl_loss_3": 2330.943408203125, + "kl_loss_6": 1494.3890380859375, + "learning_rate": 0.0005412064488081482, + "loss": 1173.9174, + "step": 4790 + }, + { + "ce_loss_12": 3.137504005432129, + "ce_loss_17": 2.96456116437912, + "ce_loss_23": 2.890325403213501, + "ce_loss_3": 3.9840017795562743, + "ce_loss_6": 3.5466713428497316, + "epoch": 0.48, + "grad_norm": 1024.0, + "kl_loss_12": 574.2095184326172, + "kl_loss_17": 170.73358688354492, + "kl_loss_3": 2306.77890625, + "kl_loss_6": 1446.0272033691406, + "learning_rate": 0.0005396249784283942, + "loss": 1137.2696, + "step": 4800 + }, + { + "ce_loss_12": 3.163385605812073, + "ce_loss_17": 2.9815895676612856, + "ce_loss_23": 2.905221951007843, + "ce_loss_3": 4.061469888687133, + "ce_loss_6": 3.610790729522705, + "epoch": 0.481, + "grad_norm": 1160.0, + "kl_loss_12": 609.4963623046875, + "kl_loss_17": 179.51625061035156, + "kl_loss_3": 2454.91064453125, + "kl_loss_6": 1549.9529907226563, + "learning_rate": 0.0005380431090255476, + "loss": 1187.0682, + "step": 4810 + }, + { + "ce_loss_12": 3.153098165988922, + "ce_loss_17": 2.9833488702774047, + "ce_loss_23": 2.914642608165741, + "ce_loss_3": 3.994583249092102, + "ce_loss_6": 3.5685108065605164, + "epoch": 0.482, + "grad_norm": 1144.0, + "kl_loss_12": 570.3982986450195, + "kl_loss_17": 167.1259292602539, + "kl_loss_3": 2317.8790771484373, + "kl_loss_6": 1466.2108764648438, + "learning_rate": 0.0005364608565290155, + "loss": 1139.4896, + "step": 4820 + }, + { + "ce_loss_12": 3.1717878460884092, + "ce_loss_17": 2.993881106376648, + "ce_loss_23": 2.92106374502182, + "ce_loss_3": 4.0339394330978395, + "ce_loss_6": 3.5958771109580994, + "epoch": 0.483, + "grad_norm": 1096.0, + "kl_loss_12": 596.1869064331055, + "kl_loss_17": 176.21066665649414, + "kl_loss_3": 2377.3220947265627, + "kl_loss_6": 1508.0138366699218, + "learning_rate": 0.0005348782368720626, + "loss": 1163.1882, + "step": 4830 + }, + { + "ce_loss_12": 3.1062735438346865, + "ce_loss_17": 2.9300807118415833, + "ce_loss_23": 2.859765887260437, + "ce_loss_3": 3.9669827103614805, + "ce_loss_6": 3.522811996936798, + "epoch": 0.484, + "grad_norm": 1352.0, + "kl_loss_12": 577.6051483154297, + "kl_loss_17": 169.41027145385743, + "kl_loss_3": 2328.596221923828, + "kl_loss_6": 1461.5553283691406, + "learning_rate": 0.000533295265991652, + "loss": 1155.1955, + "step": 4840 + }, + { + "ce_loss_12": 3.1678555846214294, + "ce_loss_17": 2.9899966955184936, + "ce_loss_23": 2.9158806920051576, + "ce_loss_3": 4.013157033920288, + "ce_loss_6": 3.5860086560249327, + "epoch": 0.485, + "grad_norm": 1056.0, + "kl_loss_12": 585.3820983886719, + "kl_loss_17": 172.69294281005858, + "kl_loss_3": 2317.670733642578, + "kl_loss_6": 1474.7983093261719, + "learning_rate": 0.0005317119598282822, + "loss": 1139.6301, + "step": 4850 + }, + { + "ce_loss_12": 3.1802164673805238, + "ce_loss_17": 2.9998749136924743, + "ce_loss_23": 2.92397803068161, + "ce_loss_3": 4.0372141122817995, + "ce_loss_6": 3.6036622881889344, + "epoch": 0.486, + "grad_norm": 1328.0, + "kl_loss_12": 596.8292175292969, + "kl_loss_17": 175.77464294433594, + "kl_loss_3": 2348.7905883789062, + "kl_loss_6": 1489.0216064453125, + "learning_rate": 0.0005301283343258293, + "loss": 1153.5375, + "step": 4860 + }, + { + "ce_loss_12": 3.222676360607147, + "ce_loss_17": 3.051359176635742, + "ce_loss_23": 2.9793429374694824, + "ce_loss_3": 4.057096111774444, + "ce_loss_6": 3.638009774684906, + "epoch": 0.487, + "grad_norm": 1392.0, + "kl_loss_12": 589.1835266113281, + "kl_loss_17": 175.26502990722656, + "kl_loss_3": 2319.251104736328, + "kl_loss_6": 1483.1206115722657, + "learning_rate": 0.000528544405431384, + "loss": 1137.7982, + "step": 4870 + }, + { + "ce_loss_12": 3.1201239466667174, + "ce_loss_17": 2.937250566482544, + "ce_loss_23": 2.859792160987854, + "ce_loss_3": 3.985008120536804, + "ce_loss_6": 3.552453351020813, + "epoch": 0.488, + "grad_norm": 1040.0, + "kl_loss_12": 609.219938659668, + "kl_loss_17": 179.44284210205078, + "kl_loss_3": 2386.5765075683594, + "kl_loss_6": 1525.4093200683594, + "learning_rate": 0.000526960189095093, + "loss": 1172.7885, + "step": 4880 + }, + { + "ce_loss_12": 3.1049619555473327, + "ce_loss_17": 2.9306660652160645, + "ce_loss_23": 2.856823241710663, + "ce_loss_3": 3.955999433994293, + "ce_loss_6": 3.5207136631011964, + "epoch": 0.489, + "grad_norm": 1224.0, + "kl_loss_12": 582.8979614257812, + "kl_loss_17": 172.3718475341797, + "kl_loss_3": 2330.7440551757813, + "kl_loss_6": 1474.2697814941407, + "learning_rate": 0.0005253757012699972, + "loss": 1146.8992, + "step": 4890 + }, + { + "ce_loss_12": 3.1677612900733947, + "ce_loss_17": 2.996410632133484, + "ce_loss_23": 2.924814796447754, + "ce_loss_3": 4.016883933544159, + "ce_loss_6": 3.586358439922333, + "epoch": 0.49, + "grad_norm": 1112.0, + "kl_loss_12": 588.945590209961, + "kl_loss_17": 173.74278259277344, + "kl_loss_3": 2331.7134399414062, + "kl_loss_6": 1481.8559936523438, + "learning_rate": 0.0005237909579118712, + "loss": 1164.1603, + "step": 4900 + }, + { + "ce_loss_12": 3.14421226978302, + "ce_loss_17": 2.9585715532302856, + "ce_loss_23": 2.8813907504081726, + "ce_loss_3": 4.029925990104675, + "ce_loss_6": 3.5790133357048033, + "epoch": 0.491, + "grad_norm": 1144.0, + "kl_loss_12": 604.090365600586, + "kl_loss_17": 180.36501693725586, + "kl_loss_3": 2419.7515502929687, + "kl_loss_6": 1519.3355224609375, + "learning_rate": 0.0005222059749790631, + "loss": 1171.4746, + "step": 4910 + }, + { + "ce_loss_12": 3.193829929828644, + "ce_loss_17": 3.0222328066825868, + "ce_loss_23": 2.949049413204193, + "ce_loss_3": 4.016946339607239, + "ce_loss_6": 3.588582158088684, + "epoch": 0.492, + "grad_norm": 1020.0, + "kl_loss_12": 575.816943359375, + "kl_loss_17": 169.9915428161621, + "kl_loss_3": 2285.2912475585936, + "kl_loss_6": 1437.871759033203, + "learning_rate": 0.0005206207684323337, + "loss": 1119.0843, + "step": 4920 + }, + { + "ce_loss_12": 3.176484978199005, + "ce_loss_17": 3.0019878506660462, + "ce_loss_23": 2.930724596977234, + "ce_loss_3": 4.027780544757843, + "ce_loss_6": 3.5989270567893983, + "epoch": 0.493, + "grad_norm": 1360.0, + "kl_loss_12": 596.4255554199219, + "kl_loss_17": 177.78146667480468, + "kl_loss_3": 2352.6630004882813, + "kl_loss_6": 1504.6399047851562, + "learning_rate": 0.000519035354234695, + "loss": 1173.89, + "step": 4930 + }, + { + "ce_loss_12": 3.1620043873786927, + "ce_loss_17": 2.9795578479766847, + "ce_loss_23": 2.8983842492103578, + "ce_loss_3": 4.012789058685303, + "ce_loss_6": 3.590153527259827, + "epoch": 0.494, + "grad_norm": 1528.0, + "kl_loss_12": 599.5278976440429, + "kl_loss_17": 179.3766830444336, + "kl_loss_3": 2332.3432373046876, + "kl_loss_6": 1490.5878845214843, + "learning_rate": 0.0005174497483512506, + "loss": 1136.2885, + "step": 4940 + }, + { + "ce_loss_12": 3.1927493691444395, + "ce_loss_17": 3.023730146884918, + "ce_loss_23": 2.9540415048599242, + "ce_loss_3": 4.028357303142547, + "ce_loss_6": 3.6044982194900514, + "epoch": 0.495, + "grad_norm": 1168.0, + "kl_loss_12": 581.2431701660156, + "kl_loss_17": 170.51922912597655, + "kl_loss_3": 2321.1480102539062, + "kl_loss_6": 1473.8497741699218, + "learning_rate": 0.0005158639667490339, + "loss": 1163.9057, + "step": 4950 + }, + { + "ce_loss_12": 3.117106306552887, + "ce_loss_17": 2.938692343235016, + "ce_loss_23": 2.864122462272644, + "ce_loss_3": 3.9715336084365847, + "ce_loss_6": 3.5310477137565615, + "epoch": 0.496, + "grad_norm": 900.0, + "kl_loss_12": 589.8403610229492, + "kl_loss_17": 173.67026596069337, + "kl_loss_3": 2349.2982543945313, + "kl_loss_6": 1489.48037109375, + "learning_rate": 0.0005142780253968481, + "loss": 1152.0074, + "step": 4960 + }, + { + "ce_loss_12": 3.0620609402656553, + "ce_loss_17": 2.8893876552581785, + "ce_loss_23": 2.821596598625183, + "ce_loss_3": 3.9072123169898987, + "ce_loss_6": 3.47282874584198, + "epoch": 0.497, + "grad_norm": 1112.0, + "kl_loss_12": 571.6155212402343, + "kl_loss_17": 168.01318740844727, + "kl_loss_3": 2300.569201660156, + "kl_loss_6": 1443.906103515625, + "learning_rate": 0.0005126919402651053, + "loss": 1116.5506, + "step": 4970 + }, + { + "ce_loss_12": 3.1318010568618773, + "ce_loss_17": 2.952656900882721, + "ce_loss_23": 2.8750259399414064, + "ce_loss_3": 4.002028000354767, + "ce_loss_6": 3.563919460773468, + "epoch": 0.498, + "grad_norm": 1048.0, + "kl_loss_12": 600.1055236816406, + "kl_loss_17": 179.3028305053711, + "kl_loss_3": 2363.5748291015625, + "kl_loss_6": 1505.253466796875, + "learning_rate": 0.0005111057273256647, + "loss": 1161.7385, + "step": 4980 + }, + { + "ce_loss_12": 3.2104085087776184, + "ce_loss_17": 3.0453226447105406, + "ce_loss_23": 2.9771073341369627, + "ce_loss_3": 4.010699796676636, + "ce_loss_6": 3.5954882144927978, + "epoch": 0.499, + "grad_norm": 1024.0, + "kl_loss_12": 558.9208541870117, + "kl_loss_17": 164.41656036376952, + "kl_loss_3": 2212.5747131347657, + "kl_loss_6": 1405.6594848632812, + "learning_rate": 0.0005095194025516733, + "loss": 1106.8965, + "step": 4990 + }, + { + "ce_loss_12": 3.147287356853485, + "ce_loss_17": 2.978268301486969, + "ce_loss_23": 2.912423849105835, + "ce_loss_3": 3.9957066416740417, + "ce_loss_6": 3.555610489845276, + "epoch": 0.5, + "grad_norm": 1216.0, + "kl_loss_12": 573.2591278076172, + "kl_loss_17": 166.6346778869629, + "kl_loss_3": 2320.488146972656, + "kl_loss_6": 1453.042413330078, + "learning_rate": 0.000507932981917404, + "loss": 1167.6072, + "step": 5000 + }, + { + "ce_loss_12": 3.1100293159484864, + "ce_loss_17": 2.9278237581253053, + "ce_loss_23": 2.8520235896110533, + "ce_loss_3": 4.011444246768951, + "ce_loss_6": 3.5609185338020324, + "epoch": 0.501, + "grad_norm": 1012.0, + "kl_loss_12": 614.2513397216796, + "kl_loss_17": 179.44054489135743, + "kl_loss_3": 2452.9594482421876, + "kl_loss_6": 1557.2860961914062, + "learning_rate": 0.0005063464813980949, + "loss": 1193.0545, + "step": 5010 + }, + { + "ce_loss_12": 3.085975396633148, + "ce_loss_17": 2.9166126370429994, + "ce_loss_23": 2.843967413902283, + "ce_loss_3": 3.957671511173248, + "ce_loss_6": 3.5102660059928894, + "epoch": 0.502, + "grad_norm": 1192.0, + "kl_loss_12": 585.8398132324219, + "kl_loss_17": 171.25574417114257, + "kl_loss_3": 2375.1840698242186, + "kl_loss_6": 1501.98916015625, + "learning_rate": 0.0005047599169697884, + "loss": 1148.2912, + "step": 5020 + }, + { + "ce_loss_12": 3.046398901939392, + "ce_loss_17": 2.8624019742012026, + "ce_loss_23": 2.786366331577301, + "ce_loss_3": 3.915739905834198, + "ce_loss_6": 3.469613456726074, + "epoch": 0.503, + "grad_norm": 1360.0, + "kl_loss_12": 593.1336456298828, + "kl_loss_17": 174.98776779174804, + "kl_loss_3": 2362.1496337890626, + "kl_loss_6": 1487.0949829101562, + "learning_rate": 0.000503173304609171, + "loss": 1130.8209, + "step": 5030 + }, + { + "ce_loss_12": 3.147076654434204, + "ce_loss_17": 2.973100447654724, + "ce_loss_23": 2.8968443393707277, + "ce_loss_3": 4.002816176414489, + "ce_loss_6": 3.5767434000968934, + "epoch": 0.504, + "grad_norm": 1616.0, + "kl_loss_12": 581.4584167480468, + "kl_loss_17": 172.5049819946289, + "kl_loss_3": 2333.448352050781, + "kl_loss_6": 1488.5466552734374, + "learning_rate": 0.0005015866602934111, + "loss": 1128.6855, + "step": 5040 + }, + { + "ce_loss_12": 3.1321513056755066, + "ce_loss_17": 2.9529102206230164, + "ce_loss_23": 2.874933648109436, + "ce_loss_3": 4.0189503788948056, + "ce_loss_6": 3.5767377614974976, + "epoch": 0.505, + "grad_norm": 1048.0, + "kl_loss_12": 617.4561584472656, + "kl_loss_17": 184.7288917541504, + "kl_loss_3": 2410.4678466796877, + "kl_loss_6": 1548.379412841797, + "learning_rate": 0.0005, + "loss": 1167.1738, + "step": 5050 + }, + { + "ce_loss_12": 3.1210977911949156, + "ce_loss_17": 2.9430198550224302, + "ce_loss_23": 2.868890976905823, + "ce_loss_3": 3.9791624903678895, + "ce_loss_6": 3.5414319157600405, + "epoch": 0.506, + "grad_norm": 1464.0, + "kl_loss_12": 598.0599334716796, + "kl_loss_17": 179.60112915039062, + "kl_loss_3": 2336.926403808594, + "kl_loss_6": 1482.9856384277343, + "learning_rate": 0.0004984133397065889, + "loss": 1135.6876, + "step": 5060 + }, + { + "ce_loss_12": 3.1238099575042724, + "ce_loss_17": 2.947644829750061, + "ce_loss_23": 2.8705275416374207, + "ce_loss_3": 4.00325483083725, + "ce_loss_6": 3.5608957290649412, + "epoch": 0.507, + "grad_norm": 1648.0, + "kl_loss_12": 594.76513671875, + "kl_loss_17": 175.90363845825195, + "kl_loss_3": 2370.6013061523436, + "kl_loss_6": 1506.6740661621093, + "learning_rate": 0.0004968266953908291, + "loss": 1140.9098, + "step": 5070 + }, + { + "ce_loss_12": 3.1559859991073607, + "ce_loss_17": 2.9826855659484863, + "ce_loss_23": 2.911471796035767, + "ce_loss_3": 4.028853631019592, + "ce_loss_6": 3.580644130706787, + "epoch": 0.508, + "grad_norm": 1224.0, + "kl_loss_12": 580.6352752685547, + "kl_loss_17": 168.34320297241212, + "kl_loss_3": 2376.8561279296873, + "kl_loss_6": 1492.9738037109375, + "learning_rate": 0.0004952400830302117, + "loss": 1150.0828, + "step": 5080 + }, + { + "ce_loss_12": 3.0969050884246827, + "ce_loss_17": 2.9111073136329653, + "ce_loss_23": 2.83800984621048, + "ce_loss_3": 3.978143048286438, + "ce_loss_6": 3.5241355776786802, + "epoch": 0.509, + "grad_norm": 1448.0, + "kl_loss_12": 597.989111328125, + "kl_loss_17": 174.9261489868164, + "kl_loss_3": 2389.203204345703, + "kl_loss_6": 1511.5739440917969, + "learning_rate": 0.0004936535186019053, + "loss": 1151.776, + "step": 5090 + }, + { + "ce_loss_12": 3.176733374595642, + "ce_loss_17": 3.0086188673973084, + "ce_loss_23": 2.9422256112098695, + "ce_loss_3": 4.0128894448280334, + "ce_loss_6": 3.5810292601585387, + "epoch": 0.51, + "grad_norm": 980.0, + "kl_loss_12": 567.3318069458007, + "kl_loss_17": 166.57700958251954, + "kl_loss_3": 2283.7829772949217, + "kl_loss_6": 1431.895245361328, + "learning_rate": 0.000492067018082596, + "loss": 1129.6188, + "step": 5100 + }, + { + "ce_loss_12": 3.1334537506103515, + "ce_loss_17": 2.9527031540870667, + "ce_loss_23": 2.875639271736145, + "ce_loss_3": 4.036728489398956, + "ce_loss_6": 3.573836934566498, + "epoch": 0.511, + "grad_norm": 1312.0, + "kl_loss_12": 607.42119140625, + "kl_loss_17": 177.14556655883788, + "kl_loss_3": 2447.327471923828, + "kl_loss_6": 1535.374688720703, + "learning_rate": 0.0004904805974483267, + "loss": 1192.1066, + "step": 5110 + }, + { + "ce_loss_12": 3.2413255333900453, + "ce_loss_17": 3.0535327911376955, + "ce_loss_23": 2.9726045727729797, + "ce_loss_3": 4.114557325839996, + "ce_loss_6": 3.683498430252075, + "epoch": 0.512, + "grad_norm": 1080.0, + "kl_loss_12": 630.6290008544922, + "kl_loss_17": 189.75421066284179, + "kl_loss_3": 2422.121484375, + "kl_loss_6": 1566.5547302246093, + "learning_rate": 0.0004888942726743353, + "loss": 1213.6861, + "step": 5120 + }, + { + "ce_loss_12": 3.110649573802948, + "ce_loss_17": 2.9337003946304323, + "ce_loss_23": 2.859462559223175, + "ce_loss_3": 3.9760597348213196, + "ce_loss_6": 3.540888249874115, + "epoch": 0.513, + "grad_norm": 1096.0, + "kl_loss_12": 596.3653182983398, + "kl_loss_17": 175.1172088623047, + "kl_loss_3": 2394.8271545410157, + "kl_loss_6": 1516.221905517578, + "learning_rate": 0.0004873080597348947, + "loss": 1171.3652, + "step": 5130 + }, + { + "ce_loss_12": 3.0032384753227235, + "ce_loss_17": 2.8270904660224914, + "ce_loss_23": 2.7542188584804537, + "ce_loss_3": 3.9286187887191772, + "ce_loss_6": 3.4638774037361144, + "epoch": 0.514, + "grad_norm": 940.0, + "kl_loss_12": 594.5371871948242, + "kl_loss_17": 171.3603889465332, + "kl_loss_3": 2475.858020019531, + "kl_loss_6": 1556.1022399902345, + "learning_rate": 0.0004857219746031519, + "loss": 1174.0678, + "step": 5140 + }, + { + "ce_loss_12": 3.165748357772827, + "ce_loss_17": 2.99380042552948, + "ce_loss_23": 2.9208311080932616, + "ce_loss_3": 4.014376854896545, + "ce_loss_6": 3.5815898060798643, + "epoch": 0.515, + "grad_norm": 1168.0, + "kl_loss_12": 583.4323303222657, + "kl_loss_17": 172.83356170654298, + "kl_loss_3": 2324.2773193359376, + "kl_loss_6": 1464.8506713867187, + "learning_rate": 0.0004841360332509663, + "loss": 1146.9793, + "step": 5150 + }, + { + "ce_loss_12": 3.117990791797638, + "ce_loss_17": 2.949844980239868, + "ce_loss_23": 2.8793536186218263, + "ce_loss_3": 3.9670523524284365, + "ce_loss_6": 3.5323256254196167, + "epoch": 0.516, + "grad_norm": 1120.0, + "kl_loss_12": 573.2609649658203, + "kl_loss_17": 166.92510604858398, + "kl_loss_3": 2310.5695861816407, + "kl_loss_6": 1455.3383422851562, + "learning_rate": 0.0004825502516487497, + "loss": 1099.3861, + "step": 5160 + }, + { + "ce_loss_12": 3.0859371185302735, + "ce_loss_17": 2.912708246707916, + "ce_loss_23": 2.844784665107727, + "ce_loss_3": 3.958569324016571, + "ce_loss_6": 3.5233945488929748, + "epoch": 0.517, + "grad_norm": 1328.0, + "kl_loss_12": 588.0229721069336, + "kl_loss_17": 169.20474548339843, + "kl_loss_3": 2382.887353515625, + "kl_loss_6": 1513.6566467285156, + "learning_rate": 0.00048096464576530507, + "loss": 1167.3519, + "step": 5170 + }, + { + "ce_loss_12": 3.184209370613098, + "ce_loss_17": 3.0182522535324097, + "ce_loss_23": 2.9456828832626343, + "ce_loss_3": 3.9989057421684264, + "ce_loss_6": 3.581149864196777, + "epoch": 0.518, + "grad_norm": 896.0, + "kl_loss_12": 577.0042404174804, + "kl_loss_17": 171.46255950927736, + "kl_loss_3": 2268.005517578125, + "kl_loss_6": 1434.8917175292968, + "learning_rate": 0.00047937923156766646, + "loss": 1119.1788, + "step": 5180 + }, + { + "ce_loss_12": 3.2192662239074705, + "ce_loss_17": 3.0561550140380858, + "ce_loss_23": 2.985294485092163, + "ce_loss_3": 4.027104759216309, + "ce_loss_6": 3.6101157307624816, + "epoch": 0.519, + "grad_norm": 1096.0, + "kl_loss_12": 577.4416137695313, + "kl_loss_17": 172.38313217163085, + "kl_loss_3": 2269.985217285156, + "kl_loss_6": 1437.1518981933593, + "learning_rate": 0.00047779402502093696, + "loss": 1125.842, + "step": 5190 + }, + { + "ce_loss_12": 3.1905157446861265, + "ce_loss_17": 3.0203575253486634, + "ce_loss_23": 2.9477104902267457, + "ce_loss_3": 4.036243295669555, + "ce_loss_6": 3.6043622374534605, + "epoch": 0.52, + "grad_norm": 1328.0, + "kl_loss_12": 580.0230865478516, + "kl_loss_17": 171.161181640625, + "kl_loss_3": 2317.1341552734375, + "kl_loss_6": 1455.8950622558593, + "learning_rate": 0.0004762090420881289, + "loss": 1141.4177, + "step": 5200 + }, + { + "ce_loss_12": 3.108722817897797, + "ce_loss_17": 2.93821382522583, + "ce_loss_23": 2.8691662311553956, + "ce_loss_3": 3.9434558391571044, + "ce_loss_6": 3.522480773925781, + "epoch": 0.521, + "grad_norm": 988.0, + "kl_loss_12": 578.2749633789062, + "kl_loss_17": 170.9490005493164, + "kl_loss_3": 2295.4980346679686, + "kl_loss_6": 1458.482110595703, + "learning_rate": 0.00047462429873000296, + "loss": 1115.8973, + "step": 5210 + }, + { + "ce_loss_12": 3.1941396236419677, + "ce_loss_17": 3.0241977334022523, + "ce_loss_23": 2.950813615322113, + "ce_loss_3": 4.025123035907745, + "ce_loss_6": 3.5928640723228455, + "epoch": 0.522, + "grad_norm": 1160.0, + "kl_loss_12": 580.0771759033203, + "kl_loss_17": 175.7025573730469, + "kl_loss_3": 2306.2788818359377, + "kl_loss_6": 1451.9521911621093, + "learning_rate": 0.0004730398109049071, + "loss": 1124.7129, + "step": 5220 + }, + { + "ce_loss_12": 3.1333670258522033, + "ce_loss_17": 2.9486419200897216, + "ce_loss_23": 2.873242676258087, + "ce_loss_3": 4.016930389404297, + "ce_loss_6": 3.578941988945007, + "epoch": 0.523, + "grad_norm": 1224.0, + "kl_loss_12": 604.9253570556641, + "kl_loss_17": 177.25698318481446, + "kl_loss_3": 2420.9248046875, + "kl_loss_6": 1552.7471984863282, + "learning_rate": 0.000471455594568616, + "loss": 1160.7181, + "step": 5230 + }, + { + "ce_loss_12": 3.1828583478927612, + "ce_loss_17": 3.0149208068847657, + "ce_loss_23": 2.9433709263801573, + "ce_loss_3": 3.998889207839966, + "ce_loss_6": 3.584208059310913, + "epoch": 0.524, + "grad_norm": 1552.0, + "kl_loss_12": 578.4490600585938, + "kl_loss_17": 172.18267364501952, + "kl_loss_3": 2265.9357666015626, + "kl_loss_6": 1439.397607421875, + "learning_rate": 0.00046987166567417086, + "loss": 1137.8832, + "step": 5240 + }, + { + "ce_loss_12": 3.116793179512024, + "ce_loss_17": 2.94590482711792, + "ce_loss_23": 2.8722461819648744, + "ce_loss_3": 3.978662097454071, + "ce_loss_6": 3.530458962917328, + "epoch": 0.525, + "grad_norm": 1424.0, + "kl_loss_12": 577.2575607299805, + "kl_loss_17": 168.89076232910156, + "kl_loss_3": 2332.1568359375, + "kl_loss_6": 1461.170635986328, + "learning_rate": 0.00046828804017171776, + "loss": 1109.5477, + "step": 5250 + }, + { + "ce_loss_12": 3.1586636900901794, + "ce_loss_17": 2.9801255226135255, + "ce_loss_23": 2.9030737400054933, + "ce_loss_3": 4.046093094348907, + "ce_loss_6": 3.5907180190086363, + "epoch": 0.526, + "grad_norm": 1024.0, + "kl_loss_12": 591.604052734375, + "kl_loss_17": 174.3595977783203, + "kl_loss_3": 2382.9521240234376, + "kl_loss_6": 1490.333349609375, + "learning_rate": 0.00046670473400834805, + "loss": 1162.6271, + "step": 5260 + }, + { + "ce_loss_12": 3.0928870677947997, + "ce_loss_17": 2.925082004070282, + "ce_loss_23": 2.854987645149231, + "ce_loss_3": 3.9424633979797363, + "ce_loss_6": 3.502853310108185, + "epoch": 0.527, + "grad_norm": 1216.0, + "kl_loss_12": 568.4853057861328, + "kl_loss_17": 167.60090255737305, + "kl_loss_3": 2314.8370666503906, + "kl_loss_6": 1446.1728820800781, + "learning_rate": 0.00046512176312793734, + "loss": 1163.3844, + "step": 5270 + }, + { + "ce_loss_12": 3.0925601005554197, + "ce_loss_17": 2.9151213645935057, + "ce_loss_23": 2.8395652532577516, + "ce_loss_3": 3.941300642490387, + "ce_loss_6": 3.505583441257477, + "epoch": 0.528, + "grad_norm": 1456.0, + "kl_loss_12": 583.0032562255859, + "kl_loss_17": 172.08705215454103, + "kl_loss_3": 2332.751837158203, + "kl_loss_6": 1471.8856689453125, + "learning_rate": 0.00046353914347098467, + "loss": 1150.2659, + "step": 5280 + }, + { + "ce_loss_12": 3.184078550338745, + "ce_loss_17": 3.0123846530914307, + "ce_loss_23": 2.9371328949928284, + "ce_loss_3": 4.0311295747756954, + "ce_loss_6": 3.599526607990265, + "epoch": 0.529, + "grad_norm": 1368.0, + "kl_loss_12": 571.3340240478516, + "kl_loss_17": 170.81220932006835, + "kl_loss_3": 2306.427099609375, + "kl_loss_6": 1454.4433288574219, + "learning_rate": 0.0004619568909744524, + "loss": 1149.8136, + "step": 5290 + }, + { + "ce_loss_12": 3.1852566242218017, + "ce_loss_17": 3.01423898935318, + "ce_loss_23": 2.941882300376892, + "ce_loss_3": 4.019960188865662, + "ce_loss_6": 3.5935036301612855, + "epoch": 0.53, + "grad_norm": 1344.0, + "kl_loss_12": 579.0866683959961, + "kl_loss_17": 172.10766830444337, + "kl_loss_3": 2303.6955017089845, + "kl_loss_6": 1460.816339111328, + "learning_rate": 0.00046037502157160573, + "loss": 1143.9945, + "step": 5300 + }, + { + "ce_loss_12": 3.0746464490890504, + "ce_loss_17": 2.8974472761154173, + "ce_loss_23": 2.8233888030052183, + "ce_loss_3": 3.9274016857147216, + "ce_loss_6": 3.4863208651542665, + "epoch": 0.531, + "grad_norm": 1080.0, + "kl_loss_12": 584.6296173095703, + "kl_loss_17": 173.56878509521485, + "kl_loss_3": 2337.2600219726564, + "kl_loss_6": 1467.9676696777344, + "learning_rate": 0.00045879355119185207, + "loss": 1147.8285, + "step": 5310 + }, + { + "ce_loss_12": 3.156270945072174, + "ce_loss_17": 2.97952960729599, + "ce_loss_23": 2.908063507080078, + "ce_loss_3": 4.009477806091309, + "ce_loss_6": 3.5802698493003846, + "epoch": 0.532, + "grad_norm": 1004.0, + "kl_loss_12": 599.6850769042969, + "kl_loss_17": 175.448974609375, + "kl_loss_3": 2376.53125, + "kl_loss_6": 1506.9073852539063, + "learning_rate": 0.0004572124957605803, + "loss": 1167.7115, + "step": 5320 + }, + { + "ce_loss_12": 3.164023780822754, + "ce_loss_17": 2.985978841781616, + "ce_loss_23": 2.910122108459473, + "ce_loss_3": 4.010115242004394, + "ce_loss_6": 3.579763102531433, + "epoch": 0.533, + "grad_norm": 1088.0, + "kl_loss_12": 591.7021179199219, + "kl_loss_17": 174.22025375366212, + "kl_loss_3": 2341.58857421875, + "kl_loss_6": 1477.200946044922, + "learning_rate": 0.00045563187119900103, + "loss": 1132.1896, + "step": 5330 + }, + { + "ce_loss_12": 3.0149919986724854, + "ce_loss_17": 2.8391488671302794, + "ce_loss_23": 2.7716471910476685, + "ce_loss_3": 3.897192454338074, + "ce_loss_6": 3.450315809249878, + "epoch": 0.534, + "grad_norm": 1736.0, + "kl_loss_12": 587.5868286132812, + "kl_loss_17": 170.40618972778321, + "kl_loss_3": 2397.2179321289063, + "kl_loss_6": 1504.325732421875, + "learning_rate": 0.00045405169342398633, + "loss": 1159.4494, + "step": 5340 + }, + { + "ce_loss_12": 3.109479343891144, + "ce_loss_17": 2.931243908405304, + "ce_loss_23": 2.8559011697769163, + "ce_loss_3": 3.977936267852783, + "ce_loss_6": 3.5353504419326782, + "epoch": 0.535, + "grad_norm": 956.0, + "kl_loss_12": 595.8931518554688, + "kl_loss_17": 176.46972579956054, + "kl_loss_3": 2383.1471923828126, + "kl_loss_6": 1504.8654907226562, + "learning_rate": 0.0004524719783479088, + "loss": 1136.5393, + "step": 5350 + }, + { + "ce_loss_12": 3.063349151611328, + "ce_loss_17": 2.883927345275879, + "ce_loss_23": 2.808227801322937, + "ce_loss_3": 3.9513380885124207, + "ce_loss_6": 3.503880572319031, + "epoch": 0.536, + "grad_norm": 908.0, + "kl_loss_12": 596.9293487548828, + "kl_loss_17": 176.37847824096679, + "kl_loss_3": 2422.632647705078, + "kl_loss_6": 1518.7975463867188, + "learning_rate": 0.00045089274187848144, + "loss": 1142.9058, + "step": 5360 + }, + { + "ce_loss_12": 3.1649566173553465, + "ce_loss_17": 2.9962420225143434, + "ce_loss_23": 2.925772321224213, + "ce_loss_3": 4.012749242782593, + "ce_loss_6": 3.575718033313751, + "epoch": 0.537, + "grad_norm": 1328.0, + "kl_loss_12": 578.8294387817383, + "kl_loss_17": 170.3518814086914, + "kl_loss_3": 2328.4602294921874, + "kl_loss_6": 1458.3095703125, + "learning_rate": 0.00044931399991859835, + "loss": 1125.4836, + "step": 5370 + }, + { + "ce_loss_12": 3.0323414325714113, + "ce_loss_17": 2.859511160850525, + "ce_loss_23": 2.7885570168495177, + "ce_loss_3": 3.887762427330017, + "ce_loss_6": 3.447388708591461, + "epoch": 0.538, + "grad_norm": 1120.0, + "kl_loss_12": 576.0092010498047, + "kl_loss_17": 168.75234375, + "kl_loss_3": 2343.1920837402345, + "kl_loss_6": 1467.7802734375, + "learning_rate": 0.00044773576836617336, + "loss": 1126.0914, + "step": 5380 + }, + { + "ce_loss_12": 3.132502889633179, + "ce_loss_17": 2.9554927587509154, + "ce_loss_23": 2.879656362533569, + "ce_loss_3": 3.9968435287475588, + "ce_loss_6": 3.5616152048110963, + "epoch": 0.539, + "grad_norm": 1240.0, + "kl_loss_12": 592.6437683105469, + "kl_loss_17": 173.54042358398436, + "kl_loss_3": 2369.021044921875, + "kl_loss_6": 1508.1837890625, + "learning_rate": 0.00044615806311398056, + "loss": 1172.1301, + "step": 5390 + }, + { + "ce_loss_12": 3.1875888228416445, + "ce_loss_17": 3.021982192993164, + "ce_loss_23": 2.953292524814606, + "ce_loss_3": 3.975070667266846, + "ce_loss_6": 3.5754401087760925, + "epoch": 0.54, + "grad_norm": 1056.0, + "kl_loss_12": 564.4381713867188, + "kl_loss_17": 165.4364440917969, + "kl_loss_3": 2212.189373779297, + "kl_loss_6": 1405.8132629394531, + "learning_rate": 0.00044458090004949454, + "loss": 1126.4178, + "step": 5400 + }, + { + "ce_loss_12": 3.0834463357925417, + "ce_loss_17": 2.896428346633911, + "ce_loss_23": 2.8172748923301696, + "ce_loss_3": 3.983506464958191, + "ce_loss_6": 3.5301677942276, + "epoch": 0.541, + "grad_norm": 1216.0, + "kl_loss_12": 609.4205612182617, + "kl_loss_17": 180.6081115722656, + "kl_loss_3": 2462.3694763183594, + "kl_loss_6": 1565.2915832519532, + "learning_rate": 0.0004430042950547297, + "loss": 1158.1969, + "step": 5410 + }, + { + "ce_loss_12": 3.1545743107795716, + "ce_loss_17": 2.9766462206840516, + "ce_loss_23": 2.9003338694572447, + "ce_loss_3": 4.019918143749237, + "ce_loss_6": 3.5826845288276674, + "epoch": 0.542, + "grad_norm": 900.0, + "kl_loss_12": 599.8469680786133, + "kl_loss_17": 179.4541229248047, + "kl_loss_3": 2381.429577636719, + "kl_loss_6": 1498.4056457519532, + "learning_rate": 0.0004414282640060809, + "loss": 1150.7848, + "step": 5420 + }, + { + "ce_loss_12": 3.228876233100891, + "ce_loss_17": 3.0594538927078245, + "ce_loss_23": 2.984146475791931, + "ce_loss_3": 4.057979154586792, + "ce_loss_6": 3.6407159090042116, + "epoch": 0.543, + "grad_norm": 1176.0, + "kl_loss_12": 575.4662200927735, + "kl_loss_17": 173.09854583740236, + "kl_loss_3": 2263.767956542969, + "kl_loss_6": 1437.279278564453, + "learning_rate": 0.0004398528227741633, + "loss": 1116.2201, + "step": 5430 + }, + { + "ce_loss_12": 3.1138192772865296, + "ce_loss_17": 2.933781898021698, + "ce_loss_23": 2.8625401735305784, + "ce_loss_3": 3.978166365623474, + "ce_loss_6": 3.544347071647644, + "epoch": 0.544, + "grad_norm": 1368.0, + "kl_loss_12": 581.6029159545899, + "kl_loss_17": 172.10035705566406, + "kl_loss_3": 2334.6691040039063, + "kl_loss_6": 1475.057598876953, + "learning_rate": 0.00043827798722365264, + "loss": 1150.7207, + "step": 5440 + }, + { + "ce_loss_12": 3.21498464345932, + "ce_loss_17": 3.048158276081085, + "ce_loss_23": 2.978046691417694, + "ce_loss_3": 4.036542665958405, + "ce_loss_6": 3.6121699094772337, + "epoch": 0.545, + "grad_norm": 1336.0, + "kl_loss_12": 577.015998840332, + "kl_loss_17": 171.95670700073242, + "kl_loss_3": 2278.222900390625, + "kl_loss_6": 1433.7412963867187, + "learning_rate": 0.00043670377321312535, + "loss": 1115.5743, + "step": 5450 + }, + { + "ce_loss_12": 3.216487944126129, + "ce_loss_17": 3.051944351196289, + "ce_loss_23": 2.9818212270736693, + "ce_loss_3": 4.0296752691268924, + "ce_loss_6": 3.622943937778473, + "epoch": 0.546, + "grad_norm": 1632.0, + "kl_loss_12": 568.4437469482422, + "kl_loss_17": 168.3789520263672, + "kl_loss_3": 2252.7482482910154, + "kl_loss_6": 1444.654229736328, + "learning_rate": 0.0004351301965948991, + "loss": 1136.658, + "step": 5460 + }, + { + "ce_loss_12": 3.1321449518203734, + "ce_loss_17": 2.9665717482566833, + "ce_loss_23": 2.897308039665222, + "ce_loss_3": 3.9533140301704406, + "ce_loss_6": 3.539426839351654, + "epoch": 0.547, + "grad_norm": 1536.0, + "kl_loss_12": 559.8824737548828, + "kl_loss_17": 164.7873306274414, + "kl_loss_3": 2255.5875610351563, + "kl_loss_6": 1426.985546875, + "learning_rate": 0.000433557273214873, + "loss": 1118.2308, + "step": 5470 + }, + { + "ce_loss_12": 3.124492418766022, + "ce_loss_17": 2.9505738973617555, + "ce_loss_23": 2.8773013591766357, + "ce_loss_3": 3.9642167687416077, + "ce_loss_6": 3.5437798500061035, + "epoch": 0.548, + "grad_norm": 1200.0, + "kl_loss_12": 573.4230926513671, + "kl_loss_17": 170.81655502319336, + "kl_loss_3": 2286.347790527344, + "kl_loss_6": 1452.3485534667968, + "learning_rate": 0.000431985018912368, + "loss": 1113.127, + "step": 5480 + }, + { + "ce_loss_12": 3.103459191322327, + "ce_loss_17": 2.9267914652824403, + "ce_loss_23": 2.8531687021255494, + "ce_loss_3": 3.980012333393097, + "ce_loss_6": 3.5421520709991454, + "epoch": 0.549, + "grad_norm": 1168.0, + "kl_loss_12": 585.4976928710937, + "kl_loss_17": 172.1935546875, + "kl_loss_3": 2377.3790893554688, + "kl_loss_6": 1508.5367065429687, + "learning_rate": 0.0004304134495199674, + "loss": 1125.7893, + "step": 5490 + }, + { + "ce_loss_12": 3.13008736371994, + "ce_loss_17": 2.95293470621109, + "ce_loss_23": 2.8801159620285035, + "ce_loss_3": 3.988025999069214, + "ce_loss_6": 3.5581201314926147, + "epoch": 0.55, + "grad_norm": 1048.0, + "kl_loss_12": 593.742057800293, + "kl_loss_17": 172.41408920288086, + "kl_loss_3": 2364.9688110351562, + "kl_loss_6": 1508.8888854980469, + "learning_rate": 0.0004288425808633575, + "loss": 1139.3568, + "step": 5500 + }, + { + "ce_loss_12": 3.113992178440094, + "ce_loss_17": 2.941122317314148, + "ce_loss_23": 2.8700260519981384, + "ce_loss_3": 3.965270459651947, + "ce_loss_6": 3.5282594561576843, + "epoch": 0.551, + "grad_norm": 1576.0, + "kl_loss_12": 573.6035720825196, + "kl_loss_17": 167.52117843627929, + "kl_loss_3": 2331.3819274902344, + "kl_loss_6": 1467.6904663085938, + "learning_rate": 0.0004272724287611684, + "loss": 1139.9652, + "step": 5510 + }, + { + "ce_loss_12": 3.0911285281181335, + "ce_loss_17": 2.919494020938873, + "ce_loss_23": 2.8469159841537475, + "ce_loss_3": 3.9676486968994142, + "ce_loss_6": 3.511377143859863, + "epoch": 0.552, + "grad_norm": 1096.0, + "kl_loss_12": 585.2908172607422, + "kl_loss_17": 173.3448402404785, + "kl_loss_3": 2385.7844421386717, + "kl_loss_6": 1485.8661010742187, + "learning_rate": 0.00042570300902481425, + "loss": 1148.9248, + "step": 5520 + }, + { + "ce_loss_12": 3.1088298916816712, + "ce_loss_17": 2.9421172738075256, + "ce_loss_23": 2.8727689743041993, + "ce_loss_3": 3.947227430343628, + "ce_loss_6": 3.5222736239433288, + "epoch": 0.553, + "grad_norm": 1120.0, + "kl_loss_12": 569.1251846313477, + "kl_loss_17": 166.2673828125, + "kl_loss_3": 2305.686419677734, + "kl_loss_6": 1455.715643310547, + "learning_rate": 0.00042413433745833423, + "loss": 1124.1992, + "step": 5530 + }, + { + "ce_loss_12": 3.110207366943359, + "ce_loss_17": 2.935355913639069, + "ce_loss_23": 2.8650208592414854, + "ce_loss_3": 3.9728068828582765, + "ce_loss_6": 3.531010937690735, + "epoch": 0.554, + "grad_norm": 1536.0, + "kl_loss_12": 580.8512084960937, + "kl_loss_17": 169.45902633666992, + "kl_loss_3": 2346.9794067382813, + "kl_loss_6": 1474.6579711914062, + "learning_rate": 0.0004225664298582339, + "loss": 1111.4911, + "step": 5540 + }, + { + "ce_loss_12": 3.18549702167511, + "ce_loss_17": 3.0146400213241575, + "ce_loss_23": 2.9432016134262087, + "ce_loss_3": 4.013700640201568, + "ce_loss_6": 3.5912591457366942, + "epoch": 0.555, + "grad_norm": 1184.0, + "kl_loss_12": 570.0095581054687, + "kl_loss_17": 168.6876480102539, + "kl_loss_3": 2263.831896972656, + "kl_loss_6": 1434.12294921875, + "learning_rate": 0.000420999302013325, + "loss": 1112.2068, + "step": 5550 + }, + { + "ce_loss_12": 3.1032863736152647, + "ce_loss_17": 2.9221776127815247, + "ce_loss_23": 2.8449564576148987, + "ce_loss_3": 4.007036745548248, + "ce_loss_6": 3.5387084245681764, + "epoch": 0.556, + "grad_norm": 1480.0, + "kl_loss_12": 599.1151763916016, + "kl_loss_17": 178.27646865844727, + "kl_loss_3": 2437.7327209472655, + "kl_loss_6": 1509.6392456054687, + "learning_rate": 0.000419432969704568, + "loss": 1141.0631, + "step": 5560 + }, + { + "ce_loss_12": 3.1315610647201537, + "ce_loss_17": 2.958857810497284, + "ce_loss_23": 2.887349987030029, + "ce_loss_3": 3.962913715839386, + "ce_loss_6": 3.5348150610923765, + "epoch": 0.557, + "grad_norm": 1176.0, + "kl_loss_12": 578.5272201538086, + "kl_loss_17": 170.51123962402343, + "kl_loss_3": 2292.166613769531, + "kl_loss_6": 1440.1355895996094, + "learning_rate": 0.00041786744870491154, + "loss": 1150.712, + "step": 5570 + }, + { + "ce_loss_12": 3.0789474129676817, + "ce_loss_17": 2.8997687101364136, + "ce_loss_23": 2.827404201030731, + "ce_loss_3": 3.9318880319595335, + "ce_loss_6": 3.5036757349967957, + "epoch": 0.558, + "grad_norm": 1152.0, + "kl_loss_12": 585.6135848999023, + "kl_loss_17": 173.6684211730957, + "kl_loss_3": 2334.7713134765627, + "kl_loss_6": 1485.8395751953126, + "learning_rate": 0.0004163027547791347, + "loss": 1133.438, + "step": 5580 + }, + { + "ce_loss_12": 3.066434121131897, + "ce_loss_17": 2.8909239292144777, + "ce_loss_23": 2.8182274460792542, + "ce_loss_3": 3.960865688323975, + "ce_loss_6": 3.501590812206268, + "epoch": 0.559, + "grad_norm": 1168.0, + "kl_loss_12": 586.6632537841797, + "kl_loss_17": 172.16040267944337, + "kl_loss_3": 2419.0933044433596, + "kl_loss_6": 1495.1991760253907, + "learning_rate": 0.0004147389036836881, + "loss": 1146.2016, + "step": 5590 + }, + { + "ce_loss_12": 3.101619017124176, + "ce_loss_17": 2.9293001532554626, + "ce_loss_23": 2.858116888999939, + "ce_loss_3": 3.9630075216293337, + "ce_loss_6": 3.5294421792030333, + "epoch": 0.56, + "grad_norm": 1224.0, + "kl_loss_12": 583.156396484375, + "kl_loss_17": 171.53984298706055, + "kl_loss_3": 2341.152209472656, + "kl_loss_6": 1483.7776916503906, + "learning_rate": 0.00041317591116653486, + "loss": 1157.8344, + "step": 5600 + }, + { + "ce_loss_12": 3.140073227882385, + "ce_loss_17": 2.9632779479026796, + "ce_loss_23": 2.892660915851593, + "ce_loss_3": 4.002801692485809, + "ce_loss_6": 3.559722900390625, + "epoch": 0.561, + "grad_norm": 984.0, + "kl_loss_12": 592.0898803710937, + "kl_loss_17": 175.4168487548828, + "kl_loss_3": 2368.6901245117188, + "kl_loss_6": 1483.5403137207031, + "learning_rate": 0.0004116137929669921, + "loss": 1132.2861, + "step": 5610 + }, + { + "ce_loss_12": 3.1353583097457887, + "ce_loss_17": 2.96168429851532, + "ce_loss_23": 2.891963481903076, + "ce_loss_3": 3.980002760887146, + "ce_loss_6": 3.5474584102630615, + "epoch": 0.562, + "grad_norm": 1496.0, + "kl_loss_12": 573.8780532836914, + "kl_loss_17": 167.09454879760742, + "kl_loss_3": 2308.930065917969, + "kl_loss_6": 1459.3727600097657, + "learning_rate": 0.00041005256481557305, + "loss": 1114.6254, + "step": 5620 + }, + { + "ce_loss_12": 3.216406464576721, + "ce_loss_17": 3.0492212176322937, + "ce_loss_23": 2.980920660495758, + "ce_loss_3": 4.013675105571747, + "ce_loss_6": 3.6090791821479797, + "epoch": 0.563, + "grad_norm": 1360.0, + "kl_loss_12": 561.8955993652344, + "kl_loss_17": 165.47934112548828, + "kl_loss_3": 2215.916259765625, + "kl_loss_6": 1405.9870849609374, + "learning_rate": 0.00040849224243382767, + "loss": 1105.3673, + "step": 5630 + }, + { + "ce_loss_12": 3.0871979117393495, + "ce_loss_17": 2.9172948598861694, + "ce_loss_23": 2.844453179836273, + "ce_loss_3": 3.946743881702423, + "ce_loss_6": 3.509100043773651, + "epoch": 0.564, + "grad_norm": 1088.0, + "kl_loss_12": 579.8146392822266, + "kl_loss_17": 169.4729118347168, + "kl_loss_3": 2336.1141845703123, + "kl_loss_6": 1479.3551330566406, + "learning_rate": 0.000406932841534185, + "loss": 1116.5229, + "step": 5640 + }, + { + "ce_loss_12": 3.0505393624305723, + "ce_loss_17": 2.8815552830696105, + "ce_loss_23": 2.8072192072868347, + "ce_loss_3": 3.9187739729881286, + "ce_loss_6": 3.47923538684845, + "epoch": 0.565, + "grad_norm": 1376.0, + "kl_loss_12": 582.4164916992188, + "kl_loss_17": 172.37065200805665, + "kl_loss_3": 2370.2238159179688, + "kl_loss_6": 1488.5986877441405, + "learning_rate": 0.0004053743778197951, + "loss": 1166.7918, + "step": 5650 + }, + { + "ce_loss_12": 3.159424090385437, + "ce_loss_17": 2.983396351337433, + "ce_loss_23": 2.910083532333374, + "ce_loss_3": 4.00204507112503, + "ce_loss_6": 3.5764374852180483, + "epoch": 0.566, + "grad_norm": 1256.0, + "kl_loss_12": 588.7669723510742, + "kl_loss_17": 175.26432876586915, + "kl_loss_3": 2314.8089294433594, + "kl_loss_6": 1473.1597045898438, + "learning_rate": 0.0004038168669843697, + "loss": 1149.6404, + "step": 5660 + }, + { + "ce_loss_12": 3.1078927993774412, + "ce_loss_17": 2.9341798067092895, + "ce_loss_23": 2.864762580394745, + "ce_loss_3": 3.921034610271454, + "ce_loss_6": 3.510339045524597, + "epoch": 0.567, + "grad_norm": 972.0, + "kl_loss_12": 571.4771148681641, + "kl_loss_17": 168.56038589477538, + "kl_loss_3": 2248.109423828125, + "kl_loss_6": 1433.6847351074218, + "learning_rate": 0.000402260324712026, + "loss": 1135.3118, + "step": 5670 + }, + { + "ce_loss_12": 3.1541042566299438, + "ce_loss_17": 2.9780243635177612, + "ce_loss_23": 2.907868194580078, + "ce_loss_3": 4.017433679103851, + "ce_loss_6": 3.5807367086410524, + "epoch": 0.568, + "grad_norm": 1456.0, + "kl_loss_12": 579.9191589355469, + "kl_loss_17": 167.7764877319336, + "kl_loss_3": 2349.619104003906, + "kl_loss_6": 1484.2405151367188, + "learning_rate": 0.00040070476667712743, + "loss": 1123.8162, + "step": 5680 + }, + { + "ce_loss_12": 3.176102542877197, + "ce_loss_17": 3.0069189071655273, + "ce_loss_23": 2.933028447628021, + "ce_loss_3": 4.015802943706513, + "ce_loss_6": 3.5848238110542296, + "epoch": 0.569, + "grad_norm": 1144.0, + "kl_loss_12": 577.5879211425781, + "kl_loss_17": 170.20566177368164, + "kl_loss_3": 2305.2219482421874, + "kl_loss_6": 1445.681903076172, + "learning_rate": 0.0003991502085441259, + "loss": 1134.1186, + "step": 5690 + }, + { + "ce_loss_12": 3.200211489200592, + "ce_loss_17": 3.0393505692481995, + "ce_loss_23": 2.970473277568817, + "ce_loss_3": 4.003493142127991, + "ce_loss_6": 3.593748462200165, + "epoch": 0.57, + "grad_norm": 1264.0, + "kl_loss_12": 563.2113464355468, + "kl_loss_17": 165.5726791381836, + "kl_loss_3": 2215.8523132324217, + "kl_loss_6": 1401.576104736328, + "learning_rate": 0.0003975966659674047, + "loss": 1120.5086, + "step": 5700 + }, + { + "ce_loss_12": 3.186952292919159, + "ce_loss_17": 3.0141292452812194, + "ce_loss_23": 2.942728817462921, + "ce_loss_3": 4.032547473907471, + "ce_loss_6": 3.596217918395996, + "epoch": 0.571, + "grad_norm": 1376.0, + "kl_loss_12": 575.968505859375, + "kl_loss_17": 170.199072265625, + "kl_loss_3": 2308.314581298828, + "kl_loss_6": 1443.2017517089844, + "learning_rate": 0.0003960441545911204, + "loss": 1116.0092, + "step": 5710 + }, + { + "ce_loss_12": 3.1654128432273865, + "ce_loss_17": 2.997965371608734, + "ce_loss_23": 2.9288565039634706, + "ce_loss_3": 3.9943326115608215, + "ce_loss_6": 3.567497718334198, + "epoch": 0.572, + "grad_norm": 1120.0, + "kl_loss_12": 574.8713333129883, + "kl_loss_17": 167.62567596435548, + "kl_loss_3": 2297.2396240234375, + "kl_loss_6": 1450.417529296875, + "learning_rate": 0.0003944926900490452, + "loss": 1119.0949, + "step": 5720 + }, + { + "ce_loss_12": 3.0970802426338198, + "ce_loss_17": 2.920128679275513, + "ce_loss_23": 2.846101438999176, + "ce_loss_3": 3.969974410533905, + "ce_loss_6": 3.526023507118225, + "epoch": 0.573, + "grad_norm": 1432.0, + "kl_loss_12": 589.8715835571289, + "kl_loss_17": 172.74836502075195, + "kl_loss_3": 2365.9664916992188, + "kl_loss_6": 1494.449969482422, + "learning_rate": 0.0003929422879644099, + "loss": 1130.7844, + "step": 5730 + }, + { + "ce_loss_12": 3.097192919254303, + "ce_loss_17": 2.9327561974525453, + "ce_loss_23": 2.86499559879303, + "ce_loss_3": 3.9204100370407104, + "ce_loss_6": 3.494847071170807, + "epoch": 0.574, + "grad_norm": 1360.0, + "kl_loss_12": 560.7226760864257, + "kl_loss_17": 165.36538467407226, + "kl_loss_3": 2273.9782287597654, + "kl_loss_6": 1431.7180114746093, + "learning_rate": 0.0003913929639497462, + "loss": 1092.815, + "step": 5740 + }, + { + "ce_loss_12": 3.059863972663879, + "ce_loss_17": 2.8799789428710936, + "ce_loss_23": 2.8116377234458922, + "ce_loss_3": 3.932484757900238, + "ce_loss_6": 3.486096715927124, + "epoch": 0.575, + "grad_norm": 1240.0, + "kl_loss_12": 569.6117431640625, + "kl_loss_17": 165.87915725708007, + "kl_loss_3": 2351.2241394042967, + "kl_loss_6": 1469.8656066894532, + "learning_rate": 0.00038984473360672965, + "loss": 1115.7053, + "step": 5750 + }, + { + "ce_loss_12": 3.0659531235694883, + "ce_loss_17": 2.8936392068862915, + "ce_loss_23": 2.8194744348526, + "ce_loss_3": 3.9361996173858644, + "ce_loss_6": 3.487998294830322, + "epoch": 0.576, + "grad_norm": 1176.0, + "kl_loss_12": 571.9079559326171, + "kl_loss_17": 166.82796478271484, + "kl_loss_3": 2346.0788513183593, + "kl_loss_6": 1469.1494201660157, + "learning_rate": 0.0003882976125260229, + "loss": 1114.1246, + "step": 5760 + }, + { + "ce_loss_12": 3.1290350794792174, + "ce_loss_17": 2.955051040649414, + "ce_loss_23": 2.8846306800842285, + "ce_loss_3": 3.981457543373108, + "ce_loss_6": 3.5426934719085694, + "epoch": 0.577, + "grad_norm": 1536.0, + "kl_loss_12": 572.9007583618164, + "kl_loss_17": 169.10189514160157, + "kl_loss_3": 2315.0250244140625, + "kl_loss_6": 1454.3143920898438, + "learning_rate": 0.00038675161628711776, + "loss": 1128.9508, + "step": 5770 + }, + { + "ce_loss_12": 3.1652075171470644, + "ce_loss_17": 2.996035838127136, + "ce_loss_23": 2.923553502559662, + "ce_loss_3": 3.988526499271393, + "ce_loss_6": 3.569002163410187, + "epoch": 0.578, + "grad_norm": 1184.0, + "kl_loss_12": 573.2200241088867, + "kl_loss_17": 170.21975860595703, + "kl_loss_3": 2274.1938537597657, + "kl_loss_6": 1439.8227233886719, + "learning_rate": 0.0003852067604581794, + "loss": 1143.6246, + "step": 5780 + }, + { + "ce_loss_12": 3.1154022932052614, + "ce_loss_17": 2.9427364468574524, + "ce_loss_23": 2.8725454330444338, + "ce_loss_3": 3.9664053320884705, + "ce_loss_6": 3.5301053404808043, + "epoch": 0.579, + "grad_norm": 1584.0, + "kl_loss_12": 574.4286529541016, + "kl_loss_17": 166.3837776184082, + "kl_loss_3": 2331.8002807617186, + "kl_loss_6": 1466.2192443847657, + "learning_rate": 0.0003836630605958888, + "loss": 1122.2252, + "step": 5790 + }, + { + "ce_loss_12": 3.1643086671829224, + "ce_loss_17": 2.996836471557617, + "ce_loss_23": 2.9282254457473753, + "ce_loss_3": 4.006624364852906, + "ce_loss_6": 3.5805129528045656, + "epoch": 0.58, + "grad_norm": 1520.0, + "kl_loss_12": 571.5751373291016, + "kl_loss_17": 168.04933166503906, + "kl_loss_3": 2315.909503173828, + "kl_loss_6": 1460.9131408691405, + "learning_rate": 0.0003821205322452863, + "loss": 1165.0018, + "step": 5800 + }, + { + "ce_loss_12": 3.1474431276321413, + "ce_loss_17": 2.978452205657959, + "ce_loss_23": 2.9102728366851807, + "ce_loss_3": 3.977898383140564, + "ce_loss_6": 3.5485727429389953, + "epoch": 0.581, + "grad_norm": 1424.0, + "kl_loss_12": 565.4918029785156, + "kl_loss_17": 165.82336349487304, + "kl_loss_3": 2292.4250366210936, + "kl_loss_6": 1442.7161743164063, + "learning_rate": 0.0003805791909396155, + "loss": 1121.6133, + "step": 5810 + }, + { + "ce_loss_12": 3.096940839290619, + "ce_loss_17": 2.9302207708358763, + "ce_loss_23": 2.8616117596626283, + "ce_loss_3": 3.94871609210968, + "ce_loss_6": 3.513180065155029, + "epoch": 0.582, + "grad_norm": 1328.0, + "kl_loss_12": 567.1476913452149, + "kl_loss_17": 165.21699295043945, + "kl_loss_3": 2315.2934020996095, + "kl_loss_6": 1448.5183471679688, + "learning_rate": 0.0003790390522001662, + "loss": 1129.2562, + "step": 5820 + }, + { + "ce_loss_12": 3.0444005727767944, + "ce_loss_17": 2.87704142332077, + "ce_loss_23": 2.8103655338287354, + "ce_loss_3": 3.90531131029129, + "ce_loss_6": 3.4619930505752565, + "epoch": 0.583, + "grad_norm": 1304.0, + "kl_loss_12": 560.7367935180664, + "kl_loss_17": 163.4321258544922, + "kl_loss_3": 2334.534295654297, + "kl_loss_6": 1455.2254272460937, + "learning_rate": 0.0003775001315361183, + "loss": 1113.3598, + "step": 5830 + }, + { + "ce_loss_12": 3.139377462863922, + "ce_loss_17": 2.9656705498695373, + "ce_loss_23": 2.8942306637763977, + "ce_loss_3": 3.9994457244873045, + "ce_loss_6": 3.561956214904785, + "epoch": 0.584, + "grad_norm": 1104.0, + "kl_loss_12": 576.4854705810546, + "kl_loss_17": 170.18653945922853, + "kl_loss_3": 2338.360107421875, + "kl_loss_6": 1465.3584045410157, + "learning_rate": 0.0003759624444443858, + "loss": 1134.8766, + "step": 5840 + }, + { + "ce_loss_12": 3.168642854690552, + "ce_loss_17": 2.999200773239136, + "ce_loss_23": 2.9328733086586, + "ce_loss_3": 3.996432375907898, + "ce_loss_6": 3.560738229751587, + "epoch": 0.585, + "grad_norm": 1496.0, + "kl_loss_12": 563.6090438842773, + "kl_loss_17": 165.6907615661621, + "kl_loss_3": 2297.1994018554688, + "kl_loss_6": 1435.7785522460938, + "learning_rate": 0.00037442600640946044, + "loss": 1108.7355, + "step": 5850 + }, + { + "ce_loss_12": 3.134856653213501, + "ce_loss_17": 2.9687893748283387, + "ce_loss_23": 2.9005924224853517, + "ce_loss_3": 3.9626365423202516, + "ce_loss_6": 3.5443278789520263, + "epoch": 0.586, + "grad_norm": 896.0, + "kl_loss_12": 571.4762405395508, + "kl_loss_17": 166.68805770874025, + "kl_loss_3": 2276.775286865234, + "kl_loss_6": 1439.7515869140625, + "learning_rate": 0.00037289083290325663, + "loss": 1099.8597, + "step": 5860 + }, + { + "ce_loss_12": 3.111377000808716, + "ce_loss_17": 2.9451043605804443, + "ce_loss_23": 2.874148762226105, + "ce_loss_3": 3.943045997619629, + "ce_loss_6": 3.510099673271179, + "epoch": 0.587, + "grad_norm": 1288.0, + "kl_loss_12": 559.2628280639649, + "kl_loss_17": 168.02631225585938, + "kl_loss_3": 2270.736376953125, + "kl_loss_6": 1418.6463623046875, + "learning_rate": 0.0003713569393849543, + "loss": 1106.6417, + "step": 5870 + }, + { + "ce_loss_12": 3.169297516345978, + "ce_loss_17": 3.0004114866256715, + "ce_loss_23": 2.9307323932647704, + "ce_loss_3": 4.0029584765434265, + "ce_loss_6": 3.577454316616058, + "epoch": 0.588, + "grad_norm": 1472.0, + "kl_loss_12": 572.8251327514648, + "kl_loss_17": 168.60687713623048, + "kl_loss_3": 2296.0158630371093, + "kl_loss_6": 1451.5937255859376, + "learning_rate": 0.00036982434130084397, + "loss": 1121.2643, + "step": 5880 + }, + { + "ce_loss_12": 3.0862842798233032, + "ce_loss_17": 2.918619728088379, + "ce_loss_23": 2.843446373939514, + "ce_loss_3": 3.9210837841033936, + "ce_loss_6": 3.491597616672516, + "epoch": 0.589, + "grad_norm": 1832.0, + "kl_loss_12": 573.2405166625977, + "kl_loss_17": 171.37699279785156, + "kl_loss_3": 2288.494384765625, + "kl_loss_6": 1440.4010803222657, + "learning_rate": 0.00036829305408417166, + "loss": 1131.5167, + "step": 5890 + }, + { + "ce_loss_12": 3.082815647125244, + "ce_loss_17": 2.905299973487854, + "ce_loss_23": 2.834301030635834, + "ce_loss_3": 3.9388489723205566, + "ce_loss_6": 3.499078333377838, + "epoch": 0.59, + "grad_norm": 1112.0, + "kl_loss_12": 583.4097137451172, + "kl_loss_17": 171.28600463867187, + "kl_loss_3": 2353.1889038085938, + "kl_loss_6": 1484.3849609375, + "learning_rate": 0.0003667630931549826, + "loss": 1130.6133, + "step": 5900 + }, + { + "ce_loss_12": 3.050534749031067, + "ce_loss_17": 2.8739961624145507, + "ce_loss_23": 2.8024583578109743, + "ce_loss_3": 3.954657232761383, + "ce_loss_6": 3.48987854719162, + "epoch": 0.591, + "grad_norm": 1080.0, + "kl_loss_12": 585.1016815185546, + "kl_loss_17": 170.39111938476563, + "kl_loss_3": 2433.939892578125, + "kl_loss_6": 1514.365985107422, + "learning_rate": 0.00036523447391996613, + "loss": 1151.823, + "step": 5910 + }, + { + "ce_loss_12": 3.124648427963257, + "ce_loss_17": 2.9544724464416503, + "ce_loss_23": 2.889039993286133, + "ce_loss_3": 3.9625410079956054, + "ce_loss_6": 3.5329968094825746, + "epoch": 0.592, + "grad_norm": 1344.0, + "kl_loss_12": 563.2493103027343, + "kl_loss_17": 164.04151000976563, + "kl_loss_3": 2275.4251037597655, + "kl_loss_6": 1427.4124694824218, + "learning_rate": 0.00036370721177230114, + "loss": 1107.2832, + "step": 5920 + }, + { + "ce_loss_12": 3.130690836906433, + "ce_loss_17": 2.958809518814087, + "ce_loss_23": 2.8872852325439453, + "ce_loss_3": 3.9897098183631896, + "ce_loss_6": 3.5514885783195496, + "epoch": 0.593, + "grad_norm": 1152.0, + "kl_loss_12": 581.3672821044922, + "kl_loss_17": 172.810555267334, + "kl_loss_3": 2338.8616943359375, + "kl_loss_6": 1471.761163330078, + "learning_rate": 0.00036218132209150044, + "loss": 1130.6513, + "step": 5930 + }, + { + "ce_loss_12": 3.0971512675285338, + "ce_loss_17": 2.9094135046005247, + "ce_loss_23": 2.831551361083984, + "ce_loss_3": 3.9879598736763002, + "ce_loss_6": 3.530065882205963, + "epoch": 0.594, + "grad_norm": 3088.0, + "kl_loss_12": 603.7480621337891, + "kl_loss_17": 177.9858055114746, + "kl_loss_3": 2438.20546875, + "kl_loss_6": 1523.2928405761718, + "learning_rate": 0.0003606568202432562, + "loss": 1157.2672, + "step": 5940 + }, + { + "ce_loss_12": 3.1510114312171935, + "ce_loss_17": 2.984342861175537, + "ce_loss_23": 2.9136085629463198, + "ce_loss_3": 4.028652763366699, + "ce_loss_6": 3.5766788005828856, + "epoch": 0.595, + "grad_norm": 1200.0, + "kl_loss_12": 579.799168395996, + "kl_loss_17": 171.63598861694337, + "kl_loss_3": 2380.4233215332033, + "kl_loss_6": 1491.7609802246093, + "learning_rate": 0.0003591337215792851, + "loss": 1120.3156, + "step": 5950 + }, + { + "ce_loss_12": 3.1785937905311585, + "ce_loss_17": 3.013545370101929, + "ce_loss_23": 2.946664047241211, + "ce_loss_3": 3.988059067726135, + "ce_loss_6": 3.581684875488281, + "epoch": 0.596, + "grad_norm": 1280.0, + "kl_loss_12": 558.0707260131836, + "kl_loss_17": 161.99999389648437, + "kl_loss_3": 2246.3234680175783, + "kl_loss_6": 1423.4499633789062, + "learning_rate": 0.00035761204143717383, + "loss": 1116.9531, + "step": 5960 + }, + { + "ce_loss_12": 3.1389800190925596, + "ce_loss_17": 2.9702129483222963, + "ce_loss_23": 2.9017290592193605, + "ce_loss_3": 3.985602331161499, + "ce_loss_6": 3.551557552814484, + "epoch": 0.597, + "grad_norm": 1112.0, + "kl_loss_12": 570.8744079589844, + "kl_loss_17": 167.93110122680665, + "kl_loss_3": 2322.8365112304687, + "kl_loss_6": 1458.6414733886718, + "learning_rate": 0.0003560917951402245, + "loss": 1149.7195, + "step": 5970 + }, + { + "ce_loss_12": 3.1232689023017883, + "ce_loss_17": 2.956403398513794, + "ce_loss_23": 2.8862552642822266, + "ce_loss_3": 3.9653080701828003, + "ce_loss_6": 3.54248868227005, + "epoch": 0.598, + "grad_norm": 1232.0, + "kl_loss_12": 564.7475952148437, + "kl_loss_17": 166.05315551757812, + "kl_loss_3": 2298.1189575195312, + "kl_loss_6": 1452.5067077636718, + "learning_rate": 0.00035457299799730046, + "loss": 1116.1566, + "step": 5980 + }, + { + "ce_loss_12": 3.1833902478218077, + "ce_loss_17": 3.0113158345222475, + "ce_loss_23": 2.9413999676704408, + "ce_loss_3": 4.006715643405914, + "ce_loss_6": 3.5888715624809264, + "epoch": 0.599, + "grad_norm": 1288.0, + "kl_loss_12": 568.6937225341796, + "kl_loss_17": 166.144620513916, + "kl_loss_3": 2271.1790405273437, + "kl_loss_6": 1434.0756225585938, + "learning_rate": 0.0003530556653026721, + "loss": 1119.4857, + "step": 5990 + }, + { + "ce_loss_12": 3.1078683972358703, + "ce_loss_17": 2.9388864517211912, + "ce_loss_23": 2.8658820390701294, + "ce_loss_3": 3.962469220161438, + "ce_loss_6": 3.5266475319862365, + "epoch": 0.6, + "grad_norm": 2448.0, + "kl_loss_12": 558.4937591552734, + "kl_loss_17": 165.0791160583496, + "kl_loss_3": 2321.1028381347655, + "kl_loss_6": 1452.5112854003905, + "learning_rate": 0.00035153981233586274, + "loss": 1130.5031, + "step": 6000 + }, + { + "ce_loss_12": 3.077103114128113, + "ce_loss_17": 2.903602635860443, + "ce_loss_23": 2.8354063630104065, + "ce_loss_3": 3.925050365924835, + "ce_loss_6": 3.49609614610672, + "epoch": 0.601, + "grad_norm": 1304.0, + "kl_loss_12": 564.5199554443359, + "kl_loss_17": 161.67181320190429, + "kl_loss_3": 2316.9648498535157, + "kl_loss_6": 1453.2272521972657, + "learning_rate": 0.00035002545436149473, + "loss": 1158.5654, + "step": 6010 + }, + { + "ce_loss_12": 3.094723129272461, + "ce_loss_17": 2.924634051322937, + "ce_loss_23": 2.85205854177475, + "ce_loss_3": 3.953912055492401, + "ce_loss_6": 3.5232208371162415, + "epoch": 0.602, + "grad_norm": 1576.0, + "kl_loss_12": 582.2882827758789, + "kl_loss_17": 172.62551193237306, + "kl_loss_3": 2357.1133544921877, + "kl_loss_6": 1496.3097351074218, + "learning_rate": 0.0003485126066291364, + "loss": 1120.1215, + "step": 6020 + }, + { + "ce_loss_12": 3.1260809898376465, + "ce_loss_17": 2.955556845664978, + "ce_loss_23": 2.88744113445282, + "ce_loss_3": 3.980349564552307, + "ce_loss_6": 3.550356423854828, + "epoch": 0.603, + "grad_norm": 1320.0, + "kl_loss_12": 561.931997680664, + "kl_loss_17": 165.3617431640625, + "kl_loss_3": 2313.730261230469, + "kl_loss_6": 1468.6224792480468, + "learning_rate": 0.0003470012843731476, + "loss": 1126.7109, + "step": 6030 + }, + { + "ce_loss_12": 3.074591028690338, + "ce_loss_17": 2.906606638431549, + "ce_loss_23": 2.8381201028823853, + "ce_loss_3": 3.94170058965683, + "ce_loss_6": 3.502815544605255, + "epoch": 0.604, + "grad_norm": 1344.0, + "kl_loss_12": 567.1752471923828, + "kl_loss_17": 164.98533630371094, + "kl_loss_3": 2336.706671142578, + "kl_loss_6": 1481.8884155273438, + "learning_rate": 0.00034549150281252633, + "loss": 1153.334, + "step": 6040 + }, + { + "ce_loss_12": 3.063284933567047, + "ce_loss_17": 2.8925641417503356, + "ce_loss_23": 2.8198460817337034, + "ce_loss_3": 3.8918349027633665, + "ce_loss_6": 3.4689099192619324, + "epoch": 0.605, + "grad_norm": 1304.0, + "kl_loss_12": 564.6323394775391, + "kl_loss_17": 167.75587310791016, + "kl_loss_3": 2264.2901000976562, + "kl_loss_6": 1431.2904907226562, + "learning_rate": 0.0003439832771507565, + "loss": 1107.605, + "step": 6050 + }, + { + "ce_loss_12": 3.065265107154846, + "ce_loss_17": 2.895182228088379, + "ce_loss_23": 2.8257480025291444, + "ce_loss_3": 3.9281867504119874, + "ce_loss_6": 3.493267834186554, + "epoch": 0.606, + "grad_norm": 1264.0, + "kl_loss_12": 567.1343521118164, + "kl_loss_17": 167.62005386352538, + "kl_loss_3": 2348.1340209960936, + "kl_loss_6": 1472.3601806640625, + "learning_rate": 0.0003424766225756537, + "loss": 1117.4418, + "step": 6060 + }, + { + "ce_loss_12": 3.122907614707947, + "ce_loss_17": 2.953090226650238, + "ce_loss_23": 2.8842870354652406, + "ce_loss_3": 3.969139504432678, + "ce_loss_6": 3.528507113456726, + "epoch": 0.607, + "grad_norm": 940.0, + "kl_loss_12": 569.6723724365235, + "kl_loss_17": 166.05274810791016, + "kl_loss_3": 2314.9645568847654, + "kl_loss_6": 1443.6838623046874, + "learning_rate": 0.00034097155425921255, + "loss": 1106.7039, + "step": 6070 + }, + { + "ce_loss_12": 3.033176898956299, + "ce_loss_17": 2.8631957292556764, + "ce_loss_23": 2.7933878183364866, + "ce_loss_3": 3.893681752681732, + "ce_loss_6": 3.4490676045417787, + "epoch": 0.608, + "grad_norm": 1256.0, + "kl_loss_12": 569.930812072754, + "kl_loss_17": 166.2875068664551, + "kl_loss_3": 2354.344384765625, + "kl_loss_6": 1470.4756164550781, + "learning_rate": 0.0003394680873574546, + "loss": 1124.7793, + "step": 6080 + }, + { + "ce_loss_12": 3.1261494040489195, + "ce_loss_17": 2.9549895524978638, + "ce_loss_23": 2.8835931181907655, + "ce_loss_3": 3.990994596481323, + "ce_loss_6": 3.551687812805176, + "epoch": 0.609, + "grad_norm": 1272.0, + "kl_loss_12": 580.3812896728516, + "kl_loss_17": 168.92733764648438, + "kl_loss_3": 2362.6959594726563, + "kl_loss_6": 1480.4715942382813, + "learning_rate": 0.0003379662370102747, + "loss": 1123.8231, + "step": 6090 + }, + { + "ce_loss_12": 3.1346469283103944, + "ce_loss_17": 2.9690099239349363, + "ce_loss_23": 2.9022717595100405, + "ce_loss_3": 3.9612194657325746, + "ce_loss_6": 3.529716396331787, + "epoch": 0.61, + "grad_norm": 1120.0, + "kl_loss_12": 564.0862533569336, + "kl_loss_17": 164.72021408081054, + "kl_loss_3": 2301.772625732422, + "kl_loss_6": 1433.7045471191407, + "learning_rate": 0.0003364660183412892, + "loss": 1122.1545, + "step": 6100 + }, + { + "ce_loss_12": 3.1167293906211855, + "ce_loss_17": 2.948679792881012, + "ce_loss_23": 2.878220272064209, + "ce_loss_3": 3.956706476211548, + "ce_loss_6": 3.525891661643982, + "epoch": 0.611, + "grad_norm": 1016.0, + "kl_loss_12": 575.0978164672852, + "kl_loss_17": 167.98723831176758, + "kl_loss_3": 2310.820440673828, + "kl_loss_6": 1452.1403869628907, + "learning_rate": 0.0003349674464576834, + "loss": 1135.3953, + "step": 6110 + }, + { + "ce_loss_12": 3.0683866262435915, + "ce_loss_17": 2.900368940830231, + "ce_loss_23": 2.8294920206069945, + "ce_loss_3": 3.9227017879486086, + "ce_loss_6": 3.4895668745040895, + "epoch": 0.612, + "grad_norm": 1400.0, + "kl_loss_12": 570.9404495239257, + "kl_loss_17": 169.12254333496094, + "kl_loss_3": 2328.972705078125, + "kl_loss_6": 1467.1710876464845, + "learning_rate": 0.00033347053645005966, + "loss": 1101.3781, + "step": 6120 + }, + { + "ce_loss_12": 3.153418040275574, + "ce_loss_17": 2.9871235251426698, + "ce_loss_23": 2.9184412360191345, + "ce_loss_3": 3.9793171763420103, + "ce_loss_6": 3.5582070112228394, + "epoch": 0.613, + "grad_norm": 1576.0, + "kl_loss_12": 560.519694519043, + "kl_loss_17": 163.97461700439453, + "kl_loss_3": 2253.6986877441404, + "kl_loss_6": 1425.6687072753907, + "learning_rate": 0.00033197530339228485, + "loss": 1116.5682, + "step": 6130 + }, + { + "ce_loss_12": 3.1269586086273193, + "ce_loss_17": 2.953156077861786, + "ce_loss_23": 2.880386304855347, + "ce_loss_3": 3.965451443195343, + "ce_loss_6": 3.5458551406860352, + "epoch": 0.614, + "grad_norm": 1384.0, + "kl_loss_12": 576.1701049804688, + "kl_loss_17": 171.7872230529785, + "kl_loss_3": 2296.2677612304688, + "kl_loss_6": 1455.6143737792968, + "learning_rate": 0.00033048176234133967, + "loss": 1117.7045, + "step": 6140 + }, + { + "ce_loss_12": 3.1166322231292725, + "ce_loss_17": 2.947054147720337, + "ce_loss_23": 2.878944230079651, + "ce_loss_3": 3.952300786972046, + "ce_loss_6": 3.5267172336578367, + "epoch": 0.615, + "grad_norm": 1200.0, + "kl_loss_12": 572.8429504394531, + "kl_loss_17": 168.27835922241212, + "kl_loss_3": 2304.2374877929688, + "kl_loss_6": 1455.4294311523438, + "learning_rate": 0.0003289899283371657, + "loss": 1129.0639, + "step": 6150 + }, + { + "ce_loss_12": 3.1242191553115846, + "ce_loss_17": 2.954644775390625, + "ce_loss_23": 2.8852687239646913, + "ce_loss_3": 3.978927218914032, + "ce_loss_6": 3.5400946140289307, + "epoch": 0.616, + "grad_norm": 1608.0, + "kl_loss_12": 561.4634826660156, + "kl_loss_17": 165.0264518737793, + "kl_loss_3": 2316.6402893066406, + "kl_loss_6": 1444.9783081054688, + "learning_rate": 0.0003274998164025148, + "loss": 1135.0775, + "step": 6160 + }, + { + "ce_loss_12": 3.158303916454315, + "ce_loss_17": 2.9911983489990233, + "ce_loss_23": 2.9205039978027343, + "ce_loss_3": 4.000229585170746, + "ce_loss_6": 3.5639469504356383, + "epoch": 0.617, + "grad_norm": 1512.0, + "kl_loss_12": 570.6586273193359, + "kl_loss_17": 167.9722900390625, + "kl_loss_3": 2283.6752319335938, + "kl_loss_6": 1439.0603820800782, + "learning_rate": 0.0003260114415427975, + "loss": 1144.1389, + "step": 6170 + }, + { + "ce_loss_12": 3.093304145336151, + "ce_loss_17": 2.9208192229270935, + "ce_loss_23": 2.8519298434257507, + "ce_loss_3": 3.958202075958252, + "ce_loss_6": 3.5155731081962585, + "epoch": 0.618, + "grad_norm": 1424.0, + "kl_loss_12": 566.3221221923828, + "kl_loss_17": 166.28012084960938, + "kl_loss_3": 2347.4573181152346, + "kl_loss_6": 1468.7472106933594, + "learning_rate": 0.0003245248187459323, + "loss": 1143.1082, + "step": 6180 + }, + { + "ce_loss_12": 3.0748647093772887, + "ce_loss_17": 2.9128819346427917, + "ce_loss_23": 2.846064102649689, + "ce_loss_3": 3.8938180446624755, + "ce_loss_6": 3.4712459087371825, + "epoch": 0.619, + "grad_norm": 1456.0, + "kl_loss_12": 548.8818344116211, + "kl_loss_17": 161.40147171020507, + "kl_loss_3": 2253.5403076171874, + "kl_loss_6": 1401.3277099609375, + "learning_rate": 0.00032303996298219416, + "loss": 1093.641, + "step": 6190 + }, + { + "ce_loss_12": 3.1497726678848266, + "ce_loss_17": 2.986233186721802, + "ce_loss_23": 2.9164626836776733, + "ce_loss_3": 3.9703675627708437, + "ce_loss_6": 3.5505619406700135, + "epoch": 0.62, + "grad_norm": 1104.0, + "kl_loss_12": 553.2283340454102, + "kl_loss_17": 162.91330795288087, + "kl_loss_3": 2224.2183227539062, + "kl_loss_6": 1399.8785522460937, + "learning_rate": 0.00032155688920406414, + "loss": 1091.8606, + "step": 6200 + }, + { + "ce_loss_12": 3.0700011014938355, + "ce_loss_17": 2.896014726161957, + "ce_loss_23": 2.8269984841346742, + "ce_loss_3": 3.951104760169983, + "ce_loss_6": 3.4962696075439452, + "epoch": 0.621, + "grad_norm": 1480.0, + "kl_loss_12": 572.2689865112304, + "kl_loss_17": 168.72110900878906, + "kl_loss_3": 2365.320104980469, + "kl_loss_6": 1464.9130126953125, + "learning_rate": 0.0003200756123460788, + "loss": 1147.8372, + "step": 6210 + }, + { + "ce_loss_12": 3.1097887635231016, + "ce_loss_17": 2.9406883835792543, + "ce_loss_23": 2.8669352293014527, + "ce_loss_3": 3.9747615337371824, + "ce_loss_6": 3.535456907749176, + "epoch": 0.622, + "grad_norm": 2176.0, + "kl_loss_12": 581.556867980957, + "kl_loss_17": 170.02774200439453, + "kl_loss_3": 2369.5360412597656, + "kl_loss_6": 1490.153985595703, + "learning_rate": 0.00031859614732467957, + "loss": 1143.2827, + "step": 6220 + }, + { + "ce_loss_12": 3.1528987884521484, + "ce_loss_17": 2.9876800775527954, + "ce_loss_23": 2.9186222672462465, + "ce_loss_3": 3.9738120079040526, + "ce_loss_6": 3.556571829319, + "epoch": 0.623, + "grad_norm": 1224.0, + "kl_loss_12": 557.099560546875, + "kl_loss_17": 163.19732131958008, + "kl_loss_3": 2246.7878601074217, + "kl_loss_6": 1417.0084716796875, + "learning_rate": 0.00031711850903806275, + "loss": 1099.6331, + "step": 6230 + }, + { + "ce_loss_12": 3.069310748577118, + "ce_loss_17": 2.895755708217621, + "ce_loss_23": 2.825921130180359, + "ce_loss_3": 3.932187294960022, + "ce_loss_6": 3.4879361152648927, + "epoch": 0.624, + "grad_norm": 1104.0, + "kl_loss_12": 577.8297821044922, + "kl_loss_17": 169.72356338500975, + "kl_loss_3": 2350.6295837402345, + "kl_loss_6": 1469.5405395507812, + "learning_rate": 0.0003156427123660297, + "loss": 1117.4363, + "step": 6240 + }, + { + "ce_loss_12": 3.1469658613204956, + "ce_loss_17": 2.9793386697769164, + "ce_loss_23": 2.9091200470924377, + "ce_loss_3": 3.9587064504623415, + "ce_loss_6": 3.5512452244758608, + "epoch": 0.625, + "grad_norm": 1216.0, + "kl_loss_12": 564.7679275512695, + "kl_loss_17": 164.33160171508788, + "kl_loss_3": 2253.2488037109374, + "kl_loss_6": 1427.362451171875, + "learning_rate": 0.0003141687721698363, + "loss": 1117.8021, + "step": 6250 + }, + { + "ce_loss_12": 3.1139580726623537, + "ce_loss_17": 2.951532244682312, + "ce_loss_23": 2.8865836381912233, + "ce_loss_3": 3.9141123294830322, + "ce_loss_6": 3.499807631969452, + "epoch": 0.626, + "grad_norm": 1280.0, + "kl_loss_12": 534.3491195678711, + "kl_loss_17": 157.5703453063965, + "kl_loss_3": 2184.005993652344, + "kl_loss_6": 1367.9380004882812, + "learning_rate": 0.00031269670329204396, + "loss": 1089.5651, + "step": 6260 + }, + { + "ce_loss_12": 3.157158946990967, + "ce_loss_17": 2.9909069776535033, + "ce_loss_23": 2.9246951818466185, + "ce_loss_3": 3.962106227874756, + "ce_loss_6": 3.552718937397003, + "epoch": 0.627, + "grad_norm": 1096.0, + "kl_loss_12": 562.5212127685547, + "kl_loss_17": 164.10081176757814, + "kl_loss_3": 2239.545928955078, + "kl_loss_6": 1418.3596130371093, + "learning_rate": 0.00031122652055637015, + "loss": 1111.2492, + "step": 6270 + }, + { + "ce_loss_12": 3.1191351294517515, + "ce_loss_17": 2.951986086368561, + "ce_loss_23": 2.8855087041854857, + "ce_loss_3": 3.975840079784393, + "ce_loss_6": 3.533181536197662, + "epoch": 0.628, + "grad_norm": 1112.0, + "kl_loss_12": 570.5546478271484, + "kl_loss_17": 165.54044723510742, + "kl_loss_3": 2337.914758300781, + "kl_loss_6": 1459.2587463378907, + "learning_rate": 0.0003097582387675385, + "loss": 1106.6899, + "step": 6280 + }, + { + "ce_loss_12": 3.159350299835205, + "ce_loss_17": 2.991086208820343, + "ce_loss_23": 2.9224894285202025, + "ce_loss_3": 3.9898188948631286, + "ce_loss_6": 3.5668737053871156, + "epoch": 0.629, + "grad_norm": 1464.0, + "kl_loss_12": 566.9471389770508, + "kl_loss_17": 165.4483184814453, + "kl_loss_3": 2297.144598388672, + "kl_loss_6": 1448.1842407226563, + "learning_rate": 0.00030829187271113034, + "loss": 1110.6332, + "step": 6290 + }, + { + "ce_loss_12": 3.1395588636398317, + "ce_loss_17": 2.978374016284943, + "ce_loss_23": 2.9104403972625734, + "ce_loss_3": 3.964379060268402, + "ce_loss_6": 3.5459526419639587, + "epoch": 0.63, + "grad_norm": 1376.0, + "kl_loss_12": 549.1984100341797, + "kl_loss_17": 162.1900848388672, + "kl_loss_3": 2229.6377380371096, + "kl_loss_6": 1406.4794860839843, + "learning_rate": 0.00030682743715343565, + "loss": 1112.6799, + "step": 6300 + }, + { + "ce_loss_12": 3.1101244688034058, + "ce_loss_17": 2.9345804691314696, + "ce_loss_23": 2.8628518342971803, + "ce_loss_3": 3.949520134925842, + "ce_loss_6": 3.5296986937522887, + "epoch": 0.631, + "grad_norm": 1576.0, + "kl_loss_12": 574.5238006591796, + "kl_loss_17": 169.87524490356446, + "kl_loss_3": 2297.4243286132814, + "kl_loss_6": 1459.8432678222657, + "learning_rate": 0.0003053649468413043, + "loss": 1135.6068, + "step": 6310 + }, + { + "ce_loss_12": 3.204981434345245, + "ce_loss_17": 3.0351101756095886, + "ce_loss_23": 2.9643646597862245, + "ce_loss_3": 4.030413317680359, + "ce_loss_6": 3.6091914057731627, + "epoch": 0.632, + "grad_norm": 1496.0, + "kl_loss_12": 565.3837051391602, + "kl_loss_17": 167.2088722229004, + "kl_loss_3": 2273.190069580078, + "kl_loss_6": 1428.8405456542969, + "learning_rate": 0.00030390441650199725, + "loss": 1106.1108, + "step": 6320 + }, + { + "ce_loss_12": 3.1109281897544863, + "ce_loss_17": 2.9481189966201784, + "ce_loss_23": 2.8790109753608704, + "ce_loss_3": 3.9495753407478333, + "ce_loss_6": 3.517538511753082, + "epoch": 0.633, + "grad_norm": 1296.0, + "kl_loss_12": 561.5178146362305, + "kl_loss_17": 164.26847229003906, + "kl_loss_3": 2275.2756469726564, + "kl_loss_6": 1427.8334716796876, + "learning_rate": 0.00030244586084303903, + "loss": 1102.1451, + "step": 6330 + }, + { + "ce_loss_12": 3.0955166697502134, + "ce_loss_17": 2.9205790638923643, + "ce_loss_23": 2.853133475780487, + "ce_loss_3": 3.949768900871277, + "ce_loss_6": 3.5212396383285522, + "epoch": 0.634, + "grad_norm": 1040.0, + "kl_loss_12": 571.7488021850586, + "kl_loss_17": 168.09423217773437, + "kl_loss_3": 2339.385119628906, + "kl_loss_6": 1478.2344299316405, + "learning_rate": 0.00030098929455206903, + "loss": 1111.3738, + "step": 6340 + }, + { + "ce_loss_12": 3.074948859214783, + "ce_loss_17": 2.9132041335105896, + "ce_loss_23": 2.8481234550476073, + "ce_loss_3": 3.927300810813904, + "ce_loss_6": 3.4901010274887083, + "epoch": 0.635, + "grad_norm": 1552.0, + "kl_loss_12": 557.5138580322266, + "kl_loss_17": 161.4115867614746, + "kl_loss_3": 2296.2754333496096, + "kl_loss_6": 1436.0618408203125, + "learning_rate": 0.00029953473229669324, + "loss": 1133.8223, + "step": 6350 + }, + { + "ce_loss_12": 3.110889804363251, + "ce_loss_17": 2.9405208230018616, + "ce_loss_23": 2.8736177682876587, + "ce_loss_3": 3.95476838350296, + "ce_loss_6": 3.5289437413215636, + "epoch": 0.636, + "grad_norm": 1384.0, + "kl_loss_12": 566.539794921875, + "kl_loss_17": 164.1241554260254, + "kl_loss_3": 2304.1476318359373, + "kl_loss_6": 1450.2427001953124, + "learning_rate": 0.00029808218872433767, + "loss": 1106.9477, + "step": 6360 + }, + { + "ce_loss_12": 3.169117248058319, + "ce_loss_17": 3.0026118874549867, + "ce_loss_23": 2.933871877193451, + "ce_loss_3": 3.9984364748001098, + "ce_loss_6": 3.571031415462494, + "epoch": 0.637, + "grad_norm": 1272.0, + "kl_loss_12": 555.6065002441406, + "kl_loss_17": 162.51567840576172, + "kl_loss_3": 2263.9653381347657, + "kl_loss_6": 1415.1207458496094, + "learning_rate": 0.0002966316784621, + "loss": 1092.0043, + "step": 6370 + }, + { + "ce_loss_12": 3.0926617622375487, + "ce_loss_17": 2.92412850856781, + "ce_loss_23": 2.850287711620331, + "ce_loss_3": 3.94654198884964, + "ce_loss_6": 3.5095828056335447, + "epoch": 0.638, + "grad_norm": 1256.0, + "kl_loss_12": 572.4656372070312, + "kl_loss_17": 167.10297546386718, + "kl_loss_3": 2322.6778930664063, + "kl_loss_6": 1458.0764892578125, + "learning_rate": 0.0002951832161166024, + "loss": 1105.7497, + "step": 6380 + }, + { + "ce_loss_12": 3.15998957157135, + "ce_loss_17": 2.9884552478790285, + "ce_loss_23": 2.9132028341293337, + "ce_loss_3": 3.992968261241913, + "ce_loss_6": 3.5755365610122682, + "epoch": 0.639, + "grad_norm": 1048.0, + "kl_loss_12": 571.2973022460938, + "kl_loss_17": 168.8902145385742, + "kl_loss_3": 2286.949426269531, + "kl_loss_6": 1454.9971801757813, + "learning_rate": 0.0002937368162738445, + "loss": 1099.0266, + "step": 6390 + }, + { + "ce_loss_12": 3.1059131979942323, + "ce_loss_17": 2.944585359096527, + "ce_loss_23": 2.8812364816665648, + "ce_loss_3": 3.9365057229995726, + "ce_loss_6": 3.5142826199531556, + "epoch": 0.64, + "grad_norm": 1248.0, + "kl_loss_12": 548.1466659545898, + "kl_loss_17": 158.61914291381837, + "kl_loss_3": 2269.588977050781, + "kl_loss_6": 1421.9814819335938, + "learning_rate": 0.0002922924934990568, + "loss": 1116.3316, + "step": 6400 + }, + { + "ce_loss_12": 3.052070152759552, + "ce_loss_17": 2.8816617727279663, + "ce_loss_23": 2.809593605995178, + "ce_loss_3": 3.9293985247611998, + "ce_loss_6": 3.4850574493408204, + "epoch": 0.641, + "grad_norm": 1016.0, + "kl_loss_12": 567.8729965209961, + "kl_loss_17": 165.7066307067871, + "kl_loss_3": 2372.4416381835936, + "kl_loss_6": 1488.8204223632813, + "learning_rate": 0.0002908502623365536, + "loss": 1130.1512, + "step": 6410 + }, + { + "ce_loss_12": 2.9853854417800902, + "ce_loss_17": 2.819088900089264, + "ce_loss_23": 2.748886638879776, + "ce_loss_3": 3.868946361541748, + "ce_loss_6": 3.4116463661193848, + "epoch": 0.642, + "grad_norm": 1312.0, + "kl_loss_12": 563.417399597168, + "kl_loss_17": 162.88093948364258, + "kl_loss_3": 2373.9099060058593, + "kl_loss_6": 1474.7904296875, + "learning_rate": 0.0002894101373095867, + "loss": 1123.2893, + "step": 6420 + }, + { + "ce_loss_12": 3.1881203293800353, + "ce_loss_17": 3.0227572679519654, + "ce_loss_23": 2.9544938921928408, + "ce_loss_3": 4.006763243675232, + "ce_loss_6": 3.582263541221619, + "epoch": 0.643, + "grad_norm": 1192.0, + "kl_loss_12": 568.1661087036133, + "kl_loss_17": 168.8744743347168, + "kl_loss_3": 2267.1685180664062, + "kl_loss_6": 1420.8440551757812, + "learning_rate": 0.00028797213292019926, + "loss": 1108.8061, + "step": 6430 + }, + { + "ce_loss_12": 3.1680617213249205, + "ce_loss_17": 3.002046263217926, + "ce_loss_23": 2.9295936226844788, + "ce_loss_3": 3.9971763372421263, + "ce_loss_6": 3.5705169558525087, + "epoch": 0.644, + "grad_norm": 1224.0, + "kl_loss_12": 571.6078521728516, + "kl_loss_17": 168.19557723999023, + "kl_loss_3": 2274.578790283203, + "kl_loss_6": 1438.4652709960938, + "learning_rate": 0.0002865362636490791, + "loss": 1133.7952, + "step": 6440 + }, + { + "ce_loss_12": 3.173088526725769, + "ce_loss_17": 3.0073896169662477, + "ce_loss_23": 2.942509913444519, + "ce_loss_3": 4.002819502353669, + "ce_loss_6": 3.5829379558563232, + "epoch": 0.645, + "grad_norm": 1448.0, + "kl_loss_12": 557.0371398925781, + "kl_loss_17": 162.7965171813965, + "kl_loss_3": 2260.254278564453, + "kl_loss_6": 1429.329766845703, + "learning_rate": 0.0002851025439554142, + "loss": 1102.529, + "step": 6450 + }, + { + "ce_loss_12": 3.159893012046814, + "ce_loss_17": 2.99376357793808, + "ce_loss_23": 2.92137326002121, + "ce_loss_3": 3.976053535938263, + "ce_loss_6": 3.569490969181061, + "epoch": 0.646, + "grad_norm": 1568.0, + "kl_loss_12": 562.2715911865234, + "kl_loss_17": 164.8102569580078, + "kl_loss_3": 2224.415234375, + "kl_loss_6": 1420.1577270507812, + "learning_rate": 0.00028367098827674573, + "loss": 1098.1518, + "step": 6460 + }, + { + "ce_loss_12": 3.094073462486267, + "ce_loss_17": 2.926894783973694, + "ce_loss_23": 2.8606561183929444, + "ce_loss_3": 3.9346626162528993, + "ce_loss_6": 3.4975233554840086, + "epoch": 0.647, + "grad_norm": 1056.0, + "kl_loss_12": 557.2730117797852, + "kl_loss_17": 161.74693222045897, + "kl_loss_3": 2265.713836669922, + "kl_loss_6": 1410.1818908691407, + "learning_rate": 0.00028224161102882397, + "loss": 1112.1836, + "step": 6470 + }, + { + "ce_loss_12": 3.0719471096992494, + "ce_loss_17": 2.9085827350616453, + "ce_loss_23": 2.8445157527923586, + "ce_loss_3": 3.892908537387848, + "ce_loss_6": 3.4768115043640138, + "epoch": 0.648, + "grad_norm": 1504.0, + "kl_loss_12": 548.413461303711, + "kl_loss_17": 158.70454483032228, + "kl_loss_3": 2234.7188049316405, + "kl_loss_6": 1414.9549194335937, + "learning_rate": 0.00028081442660546124, + "loss": 1106.9424, + "step": 6480 + }, + { + "ce_loss_12": 3.1347915887832642, + "ce_loss_17": 2.9687870383262633, + "ce_loss_23": 2.901146113872528, + "ce_loss_3": 3.9523842573165893, + "ce_loss_6": 3.5290323853492738, + "epoch": 0.649, + "grad_norm": 1296.0, + "kl_loss_12": 560.3147171020507, + "kl_loss_17": 166.38631439208984, + "kl_loss_3": 2250.884729003906, + "kl_loss_6": 1404.9350158691407, + "learning_rate": 0.0002793894493783892, + "loss": 1106.6742, + "step": 6490 + }, + { + "ce_loss_12": 3.1467607021331787, + "ce_loss_17": 2.985257124900818, + "ce_loss_23": 2.9195136189460755, + "ce_loss_3": 3.9745638847351072, + "ce_loss_6": 3.5555363655090333, + "epoch": 0.65, + "grad_norm": 1256.0, + "kl_loss_12": 548.7041427612305, + "kl_loss_17": 160.34083786010743, + "kl_loss_3": 2242.4936950683596, + "kl_loss_6": 1420.1112548828125, + "learning_rate": 0.0002779666936971129, + "loss": 1096.2486, + "step": 6500 + }, + { + "ce_loss_12": 3.1697442054748537, + "ce_loss_17": 3.0020530343055727, + "ce_loss_23": 2.9306628704071045, + "ce_loss_3": 4.006459021568299, + "ce_loss_6": 3.578002154827118, + "epoch": 0.651, + "grad_norm": 1032.0, + "kl_loss_12": 568.6251037597656, + "kl_loss_17": 165.5076919555664, + "kl_loss_3": 2297.5794372558594, + "kl_loss_6": 1440.566912841797, + "learning_rate": 0.00027654617388876614, + "loss": 1119.0514, + "step": 6510 + }, + { + "ce_loss_12": 3.1790334582328796, + "ce_loss_17": 3.0164214253425596, + "ce_loss_23": 2.947012257575989, + "ce_loss_3": 4.01498510837555, + "ce_loss_6": 3.580212116241455, + "epoch": 0.652, + "grad_norm": 1280.0, + "kl_loss_12": 561.0163177490234, + "kl_loss_17": 165.67923889160156, + "kl_loss_3": 2284.324603271484, + "kl_loss_6": 1423.8077758789063, + "learning_rate": 0.0002751279042579672, + "loss": 1113.2713, + "step": 6520 + }, + { + "ce_loss_12": 3.1225333333015444, + "ce_loss_17": 2.961228346824646, + "ce_loss_23": 2.8934088587760924, + "ce_loss_3": 3.9473448634147643, + "ce_loss_6": 3.522226560115814, + "epoch": 0.653, + "grad_norm": 1104.0, + "kl_loss_12": 546.7998992919922, + "kl_loss_17": 160.3195655822754, + "kl_loss_3": 2248.5444274902343, + "kl_loss_6": 1402.5869018554688, + "learning_rate": 0.00027371189908667604, + "loss": 1116.6242, + "step": 6530 + }, + { + "ce_loss_12": 3.185983431339264, + "ce_loss_17": 3.0128308176994323, + "ce_loss_23": 2.939183759689331, + "ce_loss_3": 4.045200252532959, + "ce_loss_6": 3.6046812176704406, + "epoch": 0.654, + "grad_norm": 1000.0, + "kl_loss_12": 578.8798248291016, + "kl_loss_17": 173.44672164916992, + "kl_loss_3": 2344.2353332519533, + "kl_loss_6": 1459.0255065917968, + "learning_rate": 0.00027229817263404863, + "loss": 1142.0896, + "step": 6540 + }, + { + "ce_loss_12": 3.1544518947601317, + "ce_loss_17": 2.9977415561676026, + "ce_loss_23": 2.9329543232917787, + "ce_loss_3": 3.944475269317627, + "ce_loss_6": 3.5443835496902465, + "epoch": 0.655, + "grad_norm": 988.0, + "kl_loss_12": 547.4029541015625, + "kl_loss_17": 161.00300140380858, + "kl_loss_3": 2180.589074707031, + "kl_loss_6": 1385.0390197753907, + "learning_rate": 0.0002708867391362948, + "loss": 1091.1217, + "step": 6550 + }, + { + "ce_loss_12": 3.1348859310150146, + "ce_loss_17": 2.971785545349121, + "ce_loss_23": 2.909306788444519, + "ce_loss_3": 3.9420260787010193, + "ce_loss_6": 3.5194541931152346, + "epoch": 0.656, + "grad_norm": 1064.0, + "kl_loss_12": 536.2102874755859, + "kl_loss_17": 159.3724105834961, + "kl_loss_3": 2189.0327758789062, + "kl_loss_6": 1359.9533996582031, + "learning_rate": 0.0002694776128065345, + "loss": 1090.7284, + "step": 6560 + }, + { + "ce_loss_12": 3.0816171884536745, + "ce_loss_17": 2.9123438715934755, + "ce_loss_23": 2.8430118560791016, + "ce_loss_3": 3.9183963537216187, + "ce_loss_6": 3.4916830539703367, + "epoch": 0.657, + "grad_norm": 1360.0, + "kl_loss_12": 566.3829650878906, + "kl_loss_17": 165.3619255065918, + "kl_loss_3": 2294.381787109375, + "kl_loss_6": 1452.9326782226562, + "learning_rate": 0.00026807080783465374, + "loss": 1100.9056, + "step": 6570 + }, + { + "ce_loss_12": 3.181067681312561, + "ce_loss_17": 3.01416095495224, + "ce_loss_23": 2.9467032313346864, + "ce_loss_3": 4.022855424880982, + "ce_loss_6": 3.5956613063812255, + "epoch": 0.658, + "grad_norm": 1488.0, + "kl_loss_12": 568.3876220703125, + "kl_loss_17": 166.68803634643555, + "kl_loss_3": 2302.2370361328126, + "kl_loss_6": 1450.1819763183594, + "learning_rate": 0.00026666633838716316, + "loss": 1123.9771, + "step": 6580 + }, + { + "ce_loss_12": 3.0953935623168944, + "ce_loss_17": 2.923772132396698, + "ce_loss_23": 2.851178967952728, + "ce_loss_3": 3.9333749771118165, + "ce_loss_6": 3.5038415670394896, + "epoch": 0.659, + "grad_norm": 1480.0, + "kl_loss_12": 572.9993301391602, + "kl_loss_17": 168.8495315551758, + "kl_loss_3": 2303.81123046875, + "kl_loss_6": 1450.9668212890624, + "learning_rate": 0.00026526421860705474, + "loss": 1129.8791, + "step": 6590 + }, + { + "ce_loss_12": 3.116642653942108, + "ce_loss_17": 2.9426411151885987, + "ce_loss_23": 2.874138903617859, + "ce_loss_3": 3.9514434814453123, + "ce_loss_6": 3.5283915877342222, + "epoch": 0.66, + "grad_norm": 1024.0, + "kl_loss_12": 569.0870574951172, + "kl_loss_17": 166.90686416625977, + "kl_loss_3": 2292.765283203125, + "kl_loss_6": 1443.3717956542969, + "learning_rate": 0.0002638644626136587, + "loss": 1103.042, + "step": 6600 + }, + { + "ce_loss_12": 3.1274876236915587, + "ce_loss_17": 2.9615706145763396, + "ce_loss_23": 2.8964427053928374, + "ce_loss_3": 3.956467306613922, + "ce_loss_6": 3.5342164874076842, + "epoch": 0.661, + "grad_norm": 1376.0, + "kl_loss_12": 556.1025802612305, + "kl_loss_17": 160.9067581176758, + "kl_loss_3": 2265.7707702636717, + "kl_loss_6": 1428.458428955078, + "learning_rate": 0.00026246708450250255, + "loss": 1106.211, + "step": 6610 + }, + { + "ce_loss_12": 3.106378674507141, + "ce_loss_17": 2.94292356967926, + "ce_loss_23": 2.875683069229126, + "ce_loss_3": 3.926833248138428, + "ce_loss_6": 3.5029583096504213, + "epoch": 0.662, + "grad_norm": 1464.0, + "kl_loss_12": 551.8497680664062, + "kl_loss_17": 162.0682846069336, + "kl_loss_3": 2245.946044921875, + "kl_loss_6": 1406.854931640625, + "learning_rate": 0.00026107209834516854, + "loss": 1096.7816, + "step": 6620 + }, + { + "ce_loss_12": 3.081444466114044, + "ce_loss_17": 2.91006817817688, + "ce_loss_23": 2.841499149799347, + "ce_loss_3": 3.9396434903144835, + "ce_loss_6": 3.503396451473236, + "epoch": 0.663, + "grad_norm": 1128.0, + "kl_loss_12": 563.1735321044922, + "kl_loss_17": 164.47392730712892, + "kl_loss_3": 2342.509033203125, + "kl_loss_6": 1470.8843383789062, + "learning_rate": 0.0002596795181891514, + "loss": 1132.7114, + "step": 6630 + }, + { + "ce_loss_12": 3.083864748477936, + "ce_loss_17": 2.9127668380737304, + "ce_loss_23": 2.8387949109077453, + "ce_loss_3": 3.930868887901306, + "ce_loss_6": 3.5026464819908143, + "epoch": 0.664, + "grad_norm": 1096.0, + "kl_loss_12": 577.5287628173828, + "kl_loss_17": 170.67861251831056, + "kl_loss_3": 2308.6623352050783, + "kl_loss_6": 1462.7905578613281, + "learning_rate": 0.000258289358057718, + "loss": 1158.7979, + "step": 6640 + }, + { + "ce_loss_12": 3.1487738728523254, + "ce_loss_17": 2.9731197714805604, + "ce_loss_23": 2.900687944889069, + "ce_loss_3": 3.9934016704559325, + "ce_loss_6": 3.5643016934394836, + "epoch": 0.665, + "grad_norm": 1208.0, + "kl_loss_12": 578.7869308471679, + "kl_loss_17": 172.91688766479493, + "kl_loss_3": 2325.2394836425783, + "kl_loss_6": 1463.2403442382813, + "learning_rate": 0.0002569016319497657, + "loss": 1134.2854, + "step": 6650 + }, + { + "ce_loss_12": 3.138523483276367, + "ce_loss_17": 2.9648420572280885, + "ce_loss_23": 2.891550886631012, + "ce_loss_3": 3.9867674231529238, + "ce_loss_6": 3.5513219594955445, + "epoch": 0.666, + "grad_norm": 964.0, + "kl_loss_12": 581.3247100830079, + "kl_loss_17": 171.40716629028321, + "kl_loss_3": 2334.145227050781, + "kl_loss_6": 1467.3094970703125, + "learning_rate": 0.00025551635383968066, + "loss": 1144.0461, + "step": 6660 + }, + { + "ce_loss_12": 3.0494396567344664, + "ce_loss_17": 2.8817154288291933, + "ce_loss_23": 2.8116140246391295, + "ce_loss_3": 3.897812283039093, + "ce_loss_6": 3.463429093360901, + "epoch": 0.667, + "grad_norm": 1096.0, + "kl_loss_12": 571.2893417358398, + "kl_loss_17": 166.89762115478516, + "kl_loss_3": 2320.7787536621095, + "kl_loss_6": 1455.82001953125, + "learning_rate": 0.00025413353767719804, + "loss": 1128.7535, + "step": 6670 + }, + { + "ce_loss_12": 3.104068899154663, + "ce_loss_17": 2.9416080713272095, + "ce_loss_23": 2.8767688870429993, + "ce_loss_3": 3.9425044536590574, + "ce_loss_6": 3.5188334822654723, + "epoch": 0.668, + "grad_norm": 1376.0, + "kl_loss_12": 561.1372955322265, + "kl_loss_17": 161.93158035278321, + "kl_loss_3": 2303.177655029297, + "kl_loss_6": 1447.1806274414062, + "learning_rate": 0.0002527531973872617, + "loss": 1119.5465, + "step": 6680 + }, + { + "ce_loss_12": 3.114408755302429, + "ce_loss_17": 2.9568856835365294, + "ce_loss_23": 2.8878267884254454, + "ce_loss_3": 3.931940507888794, + "ce_loss_6": 3.505814230442047, + "epoch": 0.669, + "grad_norm": 1128.0, + "kl_loss_12": 554.5485305786133, + "kl_loss_17": 163.08516311645508, + "kl_loss_3": 2254.183935546875, + "kl_loss_6": 1409.7445495605468, + "learning_rate": 0.0002513753468698826, + "loss": 1101.3923, + "step": 6690 + }, + { + "ce_loss_12": 3.087624263763428, + "ce_loss_17": 2.9223830699920654, + "ce_loss_23": 2.8527083516120912, + "ce_loss_3": 3.9336583375930787, + "ce_loss_6": 3.494721603393555, + "epoch": 0.67, + "grad_norm": 1048.0, + "kl_loss_12": 567.5121780395508, + "kl_loss_17": 166.63730545043944, + "kl_loss_3": 2318.508673095703, + "kl_loss_6": 1444.2513488769532, + "learning_rate": 0.0002500000000000001, + "loss": 1119.0619, + "step": 6700 + }, + { + "ce_loss_12": 3.1866146683692933, + "ce_loss_17": 3.0320523500442507, + "ce_loss_23": 2.9677372574806213, + "ce_loss_3": 3.9759655594825745, + "ce_loss_6": 3.56893048286438, + "epoch": 0.671, + "grad_norm": 1048.0, + "kl_loss_12": 545.7672103881836, + "kl_loss_17": 159.36872329711915, + "kl_loss_3": 2177.7454345703127, + "kl_loss_6": 1374.6342407226562, + "learning_rate": 0.0002486271706273421, + "loss": 1122.1479, + "step": 6710 + }, + { + "ce_loss_12": 3.126544237136841, + "ce_loss_17": 2.9684812307357786, + "ce_loss_23": 2.9063286781311035, + "ce_loss_3": 3.9199762105941773, + "ce_loss_6": 3.514430069923401, + "epoch": 0.672, + "grad_norm": 1208.0, + "kl_loss_12": 543.9798583984375, + "kl_loss_17": 159.2358268737793, + "kl_loss_3": 2198.2925354003905, + "kl_loss_6": 1377.329443359375, + "learning_rate": 0.0002472568725762853, + "loss": 1100.6169, + "step": 6720 + }, + { + "ce_loss_12": 3.1231537103652953, + "ce_loss_17": 2.9678090453147887, + "ce_loss_23": 2.9013753533363342, + "ce_loss_3": 3.911601424217224, + "ce_loss_6": 3.505997157096863, + "epoch": 0.673, + "grad_norm": 1512.0, + "kl_loss_12": 534.9500045776367, + "kl_loss_17": 156.77999572753907, + "kl_loss_3": 2186.6644958496095, + "kl_loss_6": 1369.1830505371095, + "learning_rate": 0.00024588911964571554, + "loss": 1079.9992, + "step": 6730 + }, + { + "ce_loss_12": 3.1442111015319822, + "ce_loss_17": 2.968558657169342, + "ce_loss_23": 2.894292151927948, + "ce_loss_3": 4.002666354179382, + "ce_loss_6": 3.57113493680954, + "epoch": 0.674, + "grad_norm": 1032.0, + "kl_loss_12": 584.2673843383789, + "kl_loss_17": 173.74913482666017, + "kl_loss_3": 2329.846893310547, + "kl_loss_6": 1475.91787109375, + "learning_rate": 0.00024452392560888974, + "loss": 1119.3503, + "step": 6740 + }, + { + "ce_loss_12": 3.035334324836731, + "ce_loss_17": 2.8686137318611147, + "ce_loss_23": 2.80149667263031, + "ce_loss_3": 3.856445550918579, + "ce_loss_6": 3.4397828340530396, + "epoch": 0.675, + "grad_norm": 1160.0, + "kl_loss_12": 548.2299728393555, + "kl_loss_17": 158.76625595092773, + "kl_loss_3": 2263.6882507324217, + "kl_loss_6": 1426.7576843261718, + "learning_rate": 0.00024316130421329695, + "loss": 1090.746, + "step": 6750 + }, + { + "ce_loss_12": 3.1035542130470275, + "ce_loss_17": 2.943461501598358, + "ce_loss_23": 2.8759094715118407, + "ce_loss_3": 3.9234148144721983, + "ce_loss_6": 3.5102449536323546, + "epoch": 0.676, + "grad_norm": 1004.0, + "kl_loss_12": 552.3924407958984, + "kl_loss_17": 160.46843795776368, + "kl_loss_3": 2254.888250732422, + "kl_loss_6": 1419.8763732910156, + "learning_rate": 0.00024180126918051909, + "loss": 1106.4746, + "step": 6760 + }, + { + "ce_loss_12": 3.1554140567779543, + "ce_loss_17": 2.989892101287842, + "ce_loss_23": 2.9217707753181457, + "ce_loss_3": 3.968790566921234, + "ce_loss_6": 3.5452707767486573, + "epoch": 0.677, + "grad_norm": 1304.0, + "kl_loss_12": 558.4460464477539, + "kl_loss_17": 162.82315979003906, + "kl_loss_3": 2252.1734924316406, + "kl_loss_6": 1406.660821533203, + "learning_rate": 0.00024044383420609406, + "loss": 1088.5975, + "step": 6770 + }, + { + "ce_loss_12": 3.1568056464195253, + "ce_loss_17": 3.0021663069725038, + "ce_loss_23": 2.9343551278114317, + "ce_loss_3": 3.9495391130447386, + "ce_loss_6": 3.5399969100952147, + "epoch": 0.678, + "grad_norm": 1400.0, + "kl_loss_12": 546.1890930175781, + "kl_loss_17": 159.20334014892578, + "kl_loss_3": 2213.2076782226563, + "kl_loss_6": 1394.3114929199219, + "learning_rate": 0.00023908901295937712, + "loss": 1107.4713, + "step": 6780 + }, + { + "ce_loss_12": 3.1455968618392944, + "ce_loss_17": 2.9837255835533143, + "ce_loss_23": 2.9146998643875124, + "ce_loss_3": 3.958331596851349, + "ce_loss_6": 3.5401602506637575, + "epoch": 0.679, + "grad_norm": 1128.0, + "kl_loss_12": 547.9637756347656, + "kl_loss_17": 161.52648010253907, + "kl_loss_3": 2221.9041259765627, + "kl_loss_6": 1388.3693420410157, + "learning_rate": 0.00023773681908340283, + "loss": 1113.2122, + "step": 6790 + }, + { + "ce_loss_12": 3.1385439157485964, + "ce_loss_17": 2.9627222657203673, + "ce_loss_23": 2.8902212142944337, + "ce_loss_3": 3.979812204837799, + "ce_loss_6": 3.550594687461853, + "epoch": 0.68, + "grad_norm": 1072.0, + "kl_loss_12": 587.1845413208008, + "kl_loss_17": 172.72392654418945, + "kl_loss_3": 2343.5950256347655, + "kl_loss_6": 1487.5733581542968, + "learning_rate": 0.00023638726619474876, + "loss": 1148.0467, + "step": 6800 + }, + { + "ce_loss_12": 3.13368022441864, + "ce_loss_17": 2.9561671733856203, + "ce_loss_23": 2.8830493450164796, + "ce_loss_3": 3.999674940109253, + "ce_loss_6": 3.562345004081726, + "epoch": 0.681, + "grad_norm": 1312.0, + "kl_loss_12": 576.6458923339844, + "kl_loss_17": 169.26600341796876, + "kl_loss_3": 2344.7470458984376, + "kl_loss_6": 1486.2510864257813, + "learning_rate": 0.0002350403678833976, + "loss": 1126.2372, + "step": 6810 + }, + { + "ce_loss_12": 3.0531776189804076, + "ce_loss_17": 2.88701913356781, + "ce_loss_23": 2.8160634517669676, + "ce_loss_3": 3.8899761915206907, + "ce_loss_6": 3.467984676361084, + "epoch": 0.682, + "grad_norm": 1456.0, + "kl_loss_12": 562.7510818481445, + "kl_loss_17": 162.64818725585937, + "kl_loss_3": 2300.0100158691407, + "kl_loss_6": 1451.0055908203126, + "learning_rate": 0.00023369613771260007, + "loss": 1109.835, + "step": 6820 + }, + { + "ce_loss_12": 3.1653555393218995, + "ce_loss_17": 2.9973755836486817, + "ce_loss_23": 2.9283758759498597, + "ce_loss_3": 4.007970345020294, + "ce_loss_6": 3.5784966111183167, + "epoch": 0.683, + "grad_norm": 1232.0, + "kl_loss_12": 567.620637512207, + "kl_loss_17": 165.86778106689454, + "kl_loss_3": 2315.0612060546873, + "kl_loss_6": 1455.6276611328126, + "learning_rate": 0.00023235458921873925, + "loss": 1128.1863, + "step": 6830 + }, + { + "ce_loss_12": 3.14102156162262, + "ce_loss_17": 2.95844361782074, + "ce_loss_23": 2.882523739337921, + "ce_loss_3": 4.013988649845123, + "ce_loss_6": 3.567373180389404, + "epoch": 0.684, + "grad_norm": 1432.0, + "kl_loss_12": 595.2338577270508, + "kl_loss_17": 173.366202545166, + "kl_loss_3": 2408.025115966797, + "kl_loss_6": 1510.5706726074218, + "learning_rate": 0.0002310157359111938, + "loss": 1161.5986, + "step": 6840 + }, + { + "ce_loss_12": 3.031296193599701, + "ce_loss_17": 2.849304759502411, + "ce_loss_23": 2.7756431221961977, + "ce_loss_3": 3.953793489933014, + "ce_loss_6": 3.479044473171234, + "epoch": 0.685, + "grad_norm": 1376.0, + "kl_loss_12": 578.3578979492188, + "kl_loss_17": 168.22599029541016, + "kl_loss_3": 2469.172509765625, + "kl_loss_6": 1525.2212280273438, + "learning_rate": 0.0002296795912722014, + "loss": 1157.8029, + "step": 6850 + }, + { + "ce_loss_12": 3.150490713119507, + "ce_loss_17": 2.9875052690505983, + "ce_loss_23": 2.9206341743469237, + "ce_loss_3": 3.9536224365234376, + "ce_loss_6": 3.5421762228012086, + "epoch": 0.686, + "grad_norm": 920.0, + "kl_loss_12": 555.7742065429687, + "kl_loss_17": 161.9793846130371, + "kl_loss_3": 2232.923474121094, + "kl_loss_6": 1409.4739135742188, + "learning_rate": 0.0002283461687567236, + "loss": 1086.4309, + "step": 6860 + }, + { + "ce_loss_12": 3.195644724369049, + "ce_loss_17": 3.0399562239646913, + "ce_loss_23": 2.9735812902450562, + "ce_loss_3": 3.988665819168091, + "ce_loss_6": 3.5849629998207093, + "epoch": 0.687, + "grad_norm": 1072.0, + "kl_loss_12": 542.861050415039, + "kl_loss_17": 160.58557662963867, + "kl_loss_3": 2180.1105834960936, + "kl_loss_6": 1376.572607421875, + "learning_rate": 0.00022701548179231045, + "loss": 1096.79, + "step": 6870 + }, + { + "ce_loss_12": 3.1640716314315798, + "ce_loss_17": 2.9970357418060303, + "ce_loss_23": 2.9259220004081725, + "ce_loss_3": 4.014358699321747, + "ce_loss_6": 3.573070788383484, + "epoch": 0.688, + "grad_norm": 1112.0, + "kl_loss_12": 565.2588302612305, + "kl_loss_17": 167.01204833984374, + "kl_loss_3": 2316.832342529297, + "kl_loss_6": 1446.0325866699218, + "learning_rate": 0.00022568754377896516, + "loss": 1103.7375, + "step": 6880 + }, + { + "ce_loss_12": 3.1513329982757567, + "ce_loss_17": 2.989849019050598, + "ce_loss_23": 2.9210432052612303, + "ce_loss_3": 3.9666898012161256, + "ce_loss_6": 3.544580614566803, + "epoch": 0.689, + "grad_norm": 1056.0, + "kl_loss_12": 563.2337951660156, + "kl_loss_17": 164.98118209838867, + "kl_loss_3": 2254.496124267578, + "kl_loss_6": 1415.3326232910156, + "learning_rate": 0.00022436236808900844, + "loss": 1098.4635, + "step": 6890 + }, + { + "ce_loss_12": 3.056147313117981, + "ce_loss_17": 2.883034348487854, + "ce_loss_23": 2.813547372817993, + "ce_loss_3": 3.8956146478652953, + "ce_loss_6": 3.4598315596580504, + "epoch": 0.69, + "grad_norm": 1256.0, + "kl_loss_12": 563.8613327026367, + "kl_loss_17": 165.419034576416, + "kl_loss_3": 2311.5637939453127, + "kl_loss_6": 1434.1539184570313, + "learning_rate": 0.00022303996806694487, + "loss": 1109.1252, + "step": 6900 + }, + { + "ce_loss_12": 3.122467875480652, + "ce_loss_17": 2.9579473733901978, + "ce_loss_23": 2.8914283990859984, + "ce_loss_3": 3.960798966884613, + "ce_loss_6": 3.5380321860313417, + "epoch": 0.691, + "grad_norm": 1288.0, + "kl_loss_12": 554.0889297485352, + "kl_loss_17": 160.19549865722655, + "kl_loss_3": 2290.123254394531, + "kl_loss_6": 1434.2134704589844, + "learning_rate": 0.00022172035702932823, + "loss": 1105.6811, + "step": 6910 + }, + { + "ce_loss_12": 3.172382354736328, + "ce_loss_17": 3.0069132089614867, + "ce_loss_23": 2.9380002498626707, + "ce_loss_3": 3.9722339153289794, + "ce_loss_6": 3.5636775851249696, + "epoch": 0.692, + "grad_norm": 940.0, + "kl_loss_12": 558.7679122924804, + "kl_loss_17": 164.84261932373047, + "kl_loss_3": 2207.495928955078, + "kl_loss_6": 1403.4933715820312, + "learning_rate": 0.00022040354826462666, + "loss": 1089.1483, + "step": 6920 + }, + { + "ce_loss_12": 3.097479057312012, + "ce_loss_17": 2.933117616176605, + "ce_loss_23": 2.8654277324676514, + "ce_loss_3": 3.9250967383384703, + "ce_loss_6": 3.5026277899742126, + "epoch": 0.693, + "grad_norm": 1320.0, + "kl_loss_12": 548.0819427490235, + "kl_loss_17": 160.38210525512696, + "kl_loss_3": 2262.3633361816405, + "kl_loss_6": 1417.277899169922, + "learning_rate": 0.0002190895550330899, + "loss": 1110.6444, + "step": 6930 + }, + { + "ce_loss_12": 3.041971814632416, + "ce_loss_17": 2.8696054458618163, + "ce_loss_23": 2.798582601547241, + "ce_loss_3": 3.8966514229774476, + "ce_loss_6": 3.468160080909729, + "epoch": 0.694, + "grad_norm": 1352.0, + "kl_loss_12": 568.6457580566406, + "kl_loss_17": 166.68883590698243, + "kl_loss_3": 2317.4898193359377, + "kl_loss_6": 1467.453192138672, + "learning_rate": 0.00021777839056661552, + "loss": 1104.5019, + "step": 6940 + }, + { + "ce_loss_12": 3.118424892425537, + "ce_loss_17": 2.9517529606819153, + "ce_loss_23": 2.887066733837128, + "ce_loss_3": 3.933793568611145, + "ce_loss_6": 3.5133557438850405, + "epoch": 0.695, + "grad_norm": 1352.0, + "kl_loss_12": 555.2664596557618, + "kl_loss_17": 162.95882873535157, + "kl_loss_3": 2243.5386657714844, + "kl_loss_6": 1410.0547485351562, + "learning_rate": 0.0002164700680686147, + "loss": 1084.6388, + "step": 6950 + }, + { + "ce_loss_12": 3.16032634973526, + "ce_loss_17": 2.9953511714935304, + "ce_loss_23": 2.9277878522872927, + "ce_loss_3": 3.9653631448745728, + "ce_loss_6": 3.5490016460418703, + "epoch": 0.696, + "grad_norm": 1368.0, + "kl_loss_12": 553.6630996704101, + "kl_loss_17": 164.8134864807129, + "kl_loss_3": 2208.6744750976563, + "kl_loss_6": 1389.9367309570312, + "learning_rate": 0.0002151646007138806, + "loss": 1087.7991, + "step": 6960 + }, + { + "ce_loss_12": 3.050205111503601, + "ce_loss_17": 2.884288513660431, + "ce_loss_23": 2.813274657726288, + "ce_loss_3": 3.8992061018943787, + "ce_loss_6": 3.463496470451355, + "epoch": 0.697, + "grad_norm": 1048.0, + "kl_loss_12": 564.9178665161132, + "kl_loss_17": 165.92743377685548, + "kl_loss_3": 2330.1527099609375, + "kl_loss_6": 1455.7044250488282, + "learning_rate": 0.00021386200164845526, + "loss": 1112.7447, + "step": 6970 + }, + { + "ce_loss_12": 3.203552484512329, + "ce_loss_17": 3.045564079284668, + "ce_loss_23": 2.978299582004547, + "ce_loss_3": 3.9887001872062684, + "ce_loss_6": 3.5860018014907835, + "epoch": 0.698, + "grad_norm": 1784.0, + "kl_loss_12": 550.210075378418, + "kl_loss_17": 161.59491348266602, + "kl_loss_3": 2188.434454345703, + "kl_loss_6": 1386.6224975585938, + "learning_rate": 0.0002125622839894964, + "loss": 1079.0363, + "step": 6980 + }, + { + "ce_loss_12": 3.156203365325928, + "ce_loss_17": 2.9958416342735292, + "ce_loss_23": 2.931114614009857, + "ce_loss_3": 3.956476068496704, + "ce_loss_6": 3.5511318325996397, + "epoch": 0.699, + "grad_norm": 1200.0, + "kl_loss_12": 542.128092956543, + "kl_loss_17": 159.17681274414062, + "kl_loss_3": 2200.6436096191405, + "kl_loss_6": 1383.7528442382813, + "learning_rate": 0.00021126546082514663, + "loss": 1082.092, + "step": 6990 + }, + { + "ce_loss_12": 3.179194617271423, + "ce_loss_17": 3.0163188338279725, + "ce_loss_23": 2.951323699951172, + "ce_loss_3": 3.9722177147865296, + "ce_loss_6": 3.564598274230957, + "epoch": 0.7, + "grad_norm": 988.0, + "kl_loss_12": 550.1864288330078, + "kl_loss_17": 161.19149017333984, + "kl_loss_3": 2196.5428955078123, + "kl_loss_6": 1392.7397705078124, + "learning_rate": 0.00020997154521440098, + "loss": 1078.2177, + "step": 7000 + }, + { + "ce_loss_12": 3.1233227133750914, + "ce_loss_17": 2.9633098602294923, + "ce_loss_23": 2.898985981941223, + "ce_loss_3": 3.9442873358726502, + "ce_loss_6": 3.5172589302062987, + "epoch": 0.701, + "grad_norm": 1232.0, + "kl_loss_12": 550.1744873046875, + "kl_loss_17": 160.37801361083984, + "kl_loss_3": 2242.1542419433595, + "kl_loss_6": 1405.3580383300782, + "learning_rate": 0.0002086805501869749, + "loss": 1081.2021, + "step": 7010 + }, + { + "ce_loss_12": 3.1101000189781187, + "ce_loss_17": 2.9367455363273622, + "ce_loss_23": 2.864802801609039, + "ce_loss_3": 3.962554705142975, + "ce_loss_6": 3.5312764286994933, + "epoch": 0.702, + "grad_norm": 1160.0, + "kl_loss_12": 575.6279846191406, + "kl_loss_17": 167.17742538452148, + "kl_loss_3": 2333.5331298828123, + "kl_loss_6": 1465.0747314453124, + "learning_rate": 0.0002073924887431744, + "loss": 1114.5426, + "step": 7020 + }, + { + "ce_loss_12": 3.1102948069572447, + "ce_loss_17": 2.9448712706565856, + "ce_loss_23": 2.8787378907203673, + "ce_loss_3": 3.943867230415344, + "ce_loss_6": 3.516659843921661, + "epoch": 0.703, + "grad_norm": 1080.0, + "kl_loss_12": 561.2340530395508, + "kl_loss_17": 162.04900512695312, + "kl_loss_3": 2285.8710083007813, + "kl_loss_6": 1439.7157775878907, + "learning_rate": 0.00020610737385376348, + "loss": 1134.2076, + "step": 7030 + }, + { + "ce_loss_12": 3.153998517990112, + "ce_loss_17": 2.9913541316986083, + "ce_loss_23": 2.9260885953903197, + "ce_loss_3": 3.9440117359161375, + "ce_loss_6": 3.5408020853996276, + "epoch": 0.704, + "grad_norm": 1256.0, + "kl_loss_12": 549.944775390625, + "kl_loss_17": 162.47129287719727, + "kl_loss_3": 2198.950054931641, + "kl_loss_6": 1381.6360717773437, + "learning_rate": 0.00020482521845983521, + "loss": 1104.1162, + "step": 7040 + }, + { + "ce_loss_12": 3.1619212150573732, + "ce_loss_17": 2.996929383277893, + "ce_loss_23": 2.9238937973976133, + "ce_loss_3": 3.9839595556259155, + "ce_loss_6": 3.5633341431617738, + "epoch": 0.705, + "grad_norm": 1456.0, + "kl_loss_12": 566.3393341064453, + "kl_loss_17": 169.2521484375, + "kl_loss_3": 2277.2437255859377, + "kl_loss_6": 1436.12548828125, + "learning_rate": 0.00020354603547267987, + "loss": 1121.7861, + "step": 7050 + }, + { + "ce_loss_12": 3.1527396559715273, + "ce_loss_17": 2.9844316840171814, + "ce_loss_23": 2.914238953590393, + "ce_loss_3": 3.994079887866974, + "ce_loss_6": 3.5688058733940125, + "epoch": 0.706, + "grad_norm": 1088.0, + "kl_loss_12": 568.8097229003906, + "kl_loss_17": 166.45427703857422, + "kl_loss_3": 2288.9527587890625, + "kl_loss_6": 1444.6506103515626, + "learning_rate": 0.00020226983777365604, + "loss": 1139.0396, + "step": 7060 + }, + { + "ce_loss_12": 3.054299366474152, + "ce_loss_17": 2.8940477848052977, + "ce_loss_23": 2.82825745344162, + "ce_loss_3": 3.93111172914505, + "ce_loss_6": 3.485726547241211, + "epoch": 0.707, + "grad_norm": 1200.0, + "kl_loss_12": 547.2898818969727, + "kl_loss_17": 160.03102264404296, + "kl_loss_3": 2351.719171142578, + "kl_loss_6": 1463.3100341796876, + "learning_rate": 0.00020099663821406056, + "loss": 1112.2619, + "step": 7070 + }, + { + "ce_loss_12": 3.146637439727783, + "ce_loss_17": 2.9824100852012636, + "ce_loss_23": 2.9174304246902465, + "ce_loss_3": 3.953105664253235, + "ce_loss_6": 3.5402570843696592, + "epoch": 0.708, + "grad_norm": 1536.0, + "kl_loss_12": 544.9981689453125, + "kl_loss_17": 160.23975524902343, + "kl_loss_3": 2209.292413330078, + "kl_loss_6": 1389.835498046875, + "learning_rate": 0.00019972644961499853, + "loss": 1102.0812, + "step": 7080 + }, + { + "ce_loss_12": 3.133434867858887, + "ce_loss_17": 2.961903750896454, + "ce_loss_23": 2.8904358386993407, + "ce_loss_3": 3.981268012523651, + "ce_loss_6": 3.5474346041679383, + "epoch": 0.709, + "grad_norm": 1336.0, + "kl_loss_12": 570.3906784057617, + "kl_loss_17": 167.14042587280272, + "kl_loss_3": 2317.08115234375, + "kl_loss_6": 1458.70458984375, + "learning_rate": 0.00019845928476725522, + "loss": 1116.1218, + "step": 7090 + }, + { + "ce_loss_12": 3.202507257461548, + "ce_loss_17": 3.0336686730384828, + "ce_loss_23": 2.961744248867035, + "ce_loss_3": 4.013906419277191, + "ce_loss_6": 3.603237068653107, + "epoch": 0.71, + "grad_norm": 1208.0, + "kl_loss_12": 562.5401397705078, + "kl_loss_17": 165.22715530395507, + "kl_loss_3": 2251.490167236328, + "kl_loss_6": 1427.9730285644532, + "learning_rate": 0.00019719515643116677, + "loss": 1133.9623, + "step": 7100 + }, + { + "ce_loss_12": 3.1314493298530577, + "ce_loss_17": 2.965890979766846, + "ce_loss_23": 2.8989282608032227, + "ce_loss_3": 3.943610680103302, + "ce_loss_6": 3.5178272485733033, + "epoch": 0.711, + "grad_norm": 1168.0, + "kl_loss_12": 547.7857849121094, + "kl_loss_17": 160.55599670410157, + "kl_loss_3": 2237.0823181152346, + "kl_loss_6": 1387.6133056640624, + "learning_rate": 0.0001959340773364911, + "loss": 1104.2832, + "step": 7110 + }, + { + "ce_loss_12": 3.149474573135376, + "ce_loss_17": 2.982474982738495, + "ce_loss_23": 2.9145326018333435, + "ce_loss_3": 3.979136312007904, + "ce_loss_6": 3.5494866251945494, + "epoch": 0.712, + "grad_norm": 1152.0, + "kl_loss_12": 562.058088684082, + "kl_loss_17": 163.32530517578124, + "kl_loss_3": 2275.673974609375, + "kl_loss_6": 1424.4098510742188, + "learning_rate": 0.0001946760601822809, + "loss": 1086.5138, + "step": 7120 + }, + { + "ce_loss_12": 3.2002389788627625, + "ce_loss_17": 3.031752586364746, + "ce_loss_23": 2.9661432981491087, + "ce_loss_3": 4.001896345615387, + "ce_loss_6": 3.58693687915802, + "epoch": 0.713, + "grad_norm": 1424.0, + "kl_loss_12": 552.4800445556641, + "kl_loss_17": 161.70651321411134, + "kl_loss_3": 2226.4401245117188, + "kl_loss_6": 1396.9112182617187, + "learning_rate": 0.00019342111763675512, + "loss": 1073.6355, + "step": 7130 + }, + { + "ce_loss_12": 3.1916829347610474, + "ce_loss_17": 3.034743547439575, + "ce_loss_23": 2.9650686502456667, + "ce_loss_3": 3.9788838386535645, + "ce_loss_6": 3.5754756927490234, + "epoch": 0.714, + "grad_norm": 1384.0, + "kl_loss_12": 551.3403244018555, + "kl_loss_17": 163.68495712280273, + "kl_loss_3": 2187.1236572265625, + "kl_loss_6": 1389.4500671386718, + "learning_rate": 0.00019216926233717085, + "loss": 1076.3952, + "step": 7140 + }, + { + "ce_loss_12": 3.0939472317695618, + "ce_loss_17": 2.9311779618263243, + "ce_loss_23": 2.86406432390213, + "ce_loss_3": 3.990004599094391, + "ce_loss_6": 3.5400795698165894, + "epoch": 0.715, + "grad_norm": 1168.0, + "kl_loss_12": 546.9031967163086, + "kl_loss_17": 160.15991592407227, + "kl_loss_3": 2377.830218505859, + "kl_loss_6": 1489.6150390625, + "learning_rate": 0.00019092050688969737, + "loss": 1126.9752, + "step": 7150 + }, + { + "ce_loss_12": 3.155309784412384, + "ce_loss_17": 2.9939750909805296, + "ce_loss_23": 2.9262243509292603, + "ce_loss_3": 3.9573385953903197, + "ce_loss_6": 3.5473846793174744, + "epoch": 0.716, + "grad_norm": 1104.0, + "kl_loss_12": 548.476106262207, + "kl_loss_17": 160.36773605346679, + "kl_loss_3": 2237.303985595703, + "kl_loss_6": 1409.4257446289062, + "learning_rate": 0.00018967486386928817, + "loss": 1087.5232, + "step": 7160 + }, + { + "ce_loss_12": 3.047571229934692, + "ce_loss_17": 2.877414608001709, + "ce_loss_23": 2.808344340324402, + "ce_loss_3": 3.892746686935425, + "ce_loss_6": 3.459039735794067, + "epoch": 0.717, + "grad_norm": 1264.0, + "kl_loss_12": 563.6357772827148, + "kl_loss_17": 162.5858512878418, + "kl_loss_3": 2310.7905822753905, + "kl_loss_6": 1447.4100341796875, + "learning_rate": 0.00018843234581955443, + "loss": 1148.2967, + "step": 7170 + }, + { + "ce_loss_12": 3.053649604320526, + "ce_loss_17": 2.883082091808319, + "ce_loss_23": 2.810990631580353, + "ce_loss_3": 3.8953574657440186, + "ce_loss_6": 3.4785933017730715, + "epoch": 0.718, + "grad_norm": 1200.0, + "kl_loss_12": 565.9681213378906, + "kl_loss_17": 164.66096267700195, + "kl_loss_3": 2300.275506591797, + "kl_loss_6": 1458.3315551757812, + "learning_rate": 0.00018719296525263924, + "loss": 1114.9986, + "step": 7180 + }, + { + "ce_loss_12": 3.140706789493561, + "ce_loss_17": 2.981066620349884, + "ce_loss_23": 2.9154026985168455, + "ce_loss_3": 3.924810791015625, + "ce_loss_6": 3.5228620290756227, + "epoch": 0.719, + "grad_norm": 1016.0, + "kl_loss_12": 544.8866394042968, + "kl_loss_17": 161.65244522094727, + "kl_loss_3": 2172.4519226074217, + "kl_loss_6": 1369.1830444335938, + "learning_rate": 0.0001859567346490913, + "loss": 1072.4305, + "step": 7190 + }, + { + "ce_loss_12": 3.131831610202789, + "ce_loss_17": 2.961239516735077, + "ce_loss_23": 2.890417754650116, + "ce_loss_3": 3.959796333312988, + "ce_loss_6": 3.537416911125183, + "epoch": 0.72, + "grad_norm": 1272.0, + "kl_loss_12": 566.2015747070312, + "kl_loss_17": 167.16772689819337, + "kl_loss_3": 2285.071160888672, + "kl_loss_6": 1436.5745483398437, + "learning_rate": 0.0001847236664577389, + "loss": 1096.1967, + "step": 7200 + }, + { + "ce_loss_12": 3.139620101451874, + "ce_loss_17": 2.9793400049209593, + "ce_loss_23": 2.913111627101898, + "ce_loss_3": 3.926980221271515, + "ce_loss_6": 3.5231330752372743, + "epoch": 0.721, + "grad_norm": 1096.0, + "kl_loss_12": 545.3568664550781, + "kl_loss_17": 160.95148239135742, + "kl_loss_3": 2178.54990234375, + "kl_loss_6": 1376.5175537109376, + "learning_rate": 0.00018349377309556487, + "loss": 1070.0953, + "step": 7210 + }, + { + "ce_loss_12": 3.1002493500709534, + "ce_loss_17": 2.934877908229828, + "ce_loss_23": 2.8660946249961854, + "ce_loss_3": 3.966576647758484, + "ce_loss_6": 3.5206631064414977, + "epoch": 0.722, + "grad_norm": 1360.0, + "kl_loss_12": 573.0969818115234, + "kl_loss_17": 166.3352149963379, + "kl_loss_3": 2372.4399658203124, + "kl_loss_6": 1479.197607421875, + "learning_rate": 0.00018226706694758193, + "loss": 1128.7332, + "step": 7220 + }, + { + "ce_loss_12": 3.164998912811279, + "ce_loss_17": 3.004845643043518, + "ce_loss_23": 2.939321994781494, + "ce_loss_3": 3.984208607673645, + "ce_loss_6": 3.5630035042762755, + "epoch": 0.723, + "grad_norm": 1152.0, + "kl_loss_12": 557.9833618164063, + "kl_loss_17": 161.2952865600586, + "kl_loss_3": 2261.3576477050783, + "kl_loss_6": 1421.3094360351563, + "learning_rate": 0.0001810435603667075, + "loss": 1127.0174, + "step": 7230 + }, + { + "ce_loss_12": 3.0262806892395018, + "ce_loss_17": 2.862540531158447, + "ce_loss_23": 2.7961209177970887, + "ce_loss_3": 3.8637616872787475, + "ce_loss_6": 3.4397748947143554, + "epoch": 0.724, + "grad_norm": 1024.0, + "kl_loss_12": 546.5524963378906, + "kl_loss_17": 159.55378036499025, + "kl_loss_3": 2263.3500122070313, + "kl_loss_6": 1415.6317260742187, + "learning_rate": 0.0001798232656736389, + "loss": 1119.4006, + "step": 7240 + }, + { + "ce_loss_12": 3.18226363658905, + "ce_loss_17": 3.0180405259132383, + "ce_loss_23": 2.9479485750198364, + "ce_loss_3": 3.9728939294815064, + "ce_loss_6": 3.562802815437317, + "epoch": 0.725, + "grad_norm": 900.0, + "kl_loss_12": 546.6362609863281, + "kl_loss_17": 161.94093399047853, + "kl_loss_3": 2180.735302734375, + "kl_loss_6": 1363.2637145996093, + "learning_rate": 0.0001786061951567303, + "loss": 1085.3965, + "step": 7250 + }, + { + "ce_loss_12": 3.105582582950592, + "ce_loss_17": 2.9399035573005676, + "ce_loss_23": 2.8691635012626646, + "ce_loss_3": 3.9410958766937254, + "ce_loss_6": 3.5125160813331604, + "epoch": 0.726, + "grad_norm": 1432.0, + "kl_loss_12": 559.9968872070312, + "kl_loss_17": 165.33421478271484, + "kl_loss_3": 2276.2733947753904, + "kl_loss_6": 1432.7234924316406, + "learning_rate": 0.00017739236107186857, + "loss": 1119.3277, + "step": 7260 + }, + { + "ce_loss_12": 3.1843878746032717, + "ce_loss_17": 3.030998408794403, + "ce_loss_23": 2.9644922494888304, + "ce_loss_3": 3.9629374146461487, + "ce_loss_6": 3.563722383975983, + "epoch": 0.727, + "grad_norm": 1048.0, + "kl_loss_12": 538.6017959594726, + "kl_loss_17": 158.84636154174805, + "kl_loss_3": 2159.2968811035157, + "kl_loss_6": 1362.6561462402344, + "learning_rate": 0.00017618177564234904, + "loss": 1077.8926, + "step": 7270 + }, + { + "ce_loss_12": 3.1545698285102843, + "ce_loss_17": 3.000528430938721, + "ce_loss_23": 2.9358838319778444, + "ce_loss_3": 3.935468780994415, + "ce_loss_6": 3.5329660058021544, + "epoch": 0.728, + "grad_norm": 1344.0, + "kl_loss_12": 531.7809387207031, + "kl_loss_17": 156.99573287963867, + "kl_loss_3": 2140.3086486816405, + "kl_loss_6": 1345.6030700683593, + "learning_rate": 0.00017497445105875377, + "loss": 1067.2183, + "step": 7280 + }, + { + "ce_loss_12": 3.0874632835388183, + "ce_loss_17": 2.9163344144821166, + "ce_loss_23": 2.8498679399490356, + "ce_loss_3": 3.932736027240753, + "ce_loss_6": 3.4955358624458315, + "epoch": 0.729, + "grad_norm": 952.0, + "kl_loss_12": 567.4337753295898, + "kl_loss_17": 164.65280532836914, + "kl_loss_3": 2311.5119812011717, + "kl_loss_6": 1440.8906311035157, + "learning_rate": 0.000173770399478828, + "loss": 1112.781, + "step": 7290 + }, + { + "ce_loss_12": 3.0088712096214296, + "ce_loss_17": 2.8473345756530763, + "ce_loss_23": 2.784889876842499, + "ce_loss_3": 3.834717857837677, + "ce_loss_6": 3.403339159488678, + "epoch": 0.73, + "grad_norm": 1224.0, + "kl_loss_12": 539.0748489379882, + "kl_loss_17": 158.0688606262207, + "kl_loss_3": 2244.918664550781, + "kl_loss_6": 1394.7807189941407, + "learning_rate": 0.0001725696330273575, + "loss": 1116.7592, + "step": 7300 + }, + { + "ce_loss_12": 3.1739781856536866, + "ce_loss_17": 3.014979827404022, + "ce_loss_23": 2.9461803793907166, + "ce_loss_3": 3.9663755893707275, + "ce_loss_6": 3.565347063541412, + "epoch": 0.731, + "grad_norm": 984.0, + "kl_loss_12": 537.3137969970703, + "kl_loss_17": 158.49719619750977, + "kl_loss_3": 2170.088458251953, + "kl_loss_6": 1369.4702209472657, + "learning_rate": 0.00017137216379604724, + "loss": 1068.5676, + "step": 7310 + }, + { + "ce_loss_12": 3.0623173117637634, + "ce_loss_17": 2.8997745752334594, + "ce_loss_23": 2.8324743151664733, + "ce_loss_3": 3.900362193584442, + "ce_loss_6": 3.470179033279419, + "epoch": 0.732, + "grad_norm": 1008.0, + "kl_loss_12": 547.4601623535157, + "kl_loss_17": 161.21364669799806, + "kl_loss_3": 2262.5873413085938, + "kl_loss_6": 1410.8793884277343, + "learning_rate": 0.00017017800384339925, + "loss": 1099.5278, + "step": 7320 + }, + { + "ce_loss_12": 3.025323486328125, + "ce_loss_17": 2.856322979927063, + "ce_loss_23": 2.7867146492004395, + "ce_loss_3": 3.8915157079696656, + "ce_loss_6": 3.4566821694374084, + "epoch": 0.733, + "grad_norm": 1056.0, + "kl_loss_12": 561.3699798583984, + "kl_loss_17": 162.175146484375, + "kl_loss_3": 2332.4853088378904, + "kl_loss_6": 1466.8850708007812, + "learning_rate": 0.00016898716519459073, + "loss": 1096.8963, + "step": 7330 + }, + { + "ce_loss_12": 3.151548945903778, + "ce_loss_17": 2.9805155396461487, + "ce_loss_23": 2.9074472308158876, + "ce_loss_3": 4.005491101741791, + "ce_loss_6": 3.57403963804245, + "epoch": 0.734, + "grad_norm": 908.0, + "kl_loss_12": 578.352165222168, + "kl_loss_17": 169.664949798584, + "kl_loss_3": 2314.9711486816404, + "kl_loss_6": 1457.3987060546874, + "learning_rate": 0.00016779965984135375, + "loss": 1110.758, + "step": 7340 + }, + { + "ce_loss_12": 3.063838768005371, + "ce_loss_17": 2.901591444015503, + "ce_loss_23": 2.834994339942932, + "ce_loss_3": 3.9005173802375794, + "ce_loss_6": 3.467866039276123, + "epoch": 0.735, + "grad_norm": 1096.0, + "kl_loss_12": 540.851530456543, + "kl_loss_17": 157.2512535095215, + "kl_loss_3": 2253.485968017578, + "kl_loss_6": 1398.4514953613282, + "learning_rate": 0.00016661549974185424, + "loss": 1091.6617, + "step": 7350 + }, + { + "ce_loss_12": 3.094240939617157, + "ce_loss_17": 2.932550811767578, + "ce_loss_23": 2.864250934123993, + "ce_loss_3": 3.917224442958832, + "ce_loss_6": 3.493868517875671, + "epoch": 0.736, + "grad_norm": 1232.0, + "kl_loss_12": 553.4183349609375, + "kl_loss_17": 162.8744571685791, + "kl_loss_3": 2251.1023193359374, + "kl_loss_6": 1406.9132019042968, + "learning_rate": 0.00016543469682057105, + "loss": 1086.0771, + "step": 7360 + }, + { + "ce_loss_12": 3.125151574611664, + "ce_loss_17": 2.957120954990387, + "ce_loss_23": 2.886409246921539, + "ce_loss_3": 3.9414002776145933, + "ce_loss_6": 3.522070753574371, + "epoch": 0.737, + "grad_norm": 936.0, + "kl_loss_12": 561.4340881347656, + "kl_loss_17": 164.44181289672852, + "kl_loss_3": 2250.564172363281, + "kl_loss_6": 1418.036572265625, + "learning_rate": 0.00016425726296817632, + "loss": 1091.9379, + "step": 7370 + }, + { + "ce_loss_12": 3.1298630475997924, + "ce_loss_17": 2.9687084794044494, + "ce_loss_23": 2.9038389205932615, + "ce_loss_3": 3.9451918244361877, + "ce_loss_6": 3.5293861985206605, + "epoch": 0.738, + "grad_norm": 1024.0, + "kl_loss_12": 541.5226104736328, + "kl_loss_17": 159.93819351196288, + "kl_loss_3": 2214.3490661621095, + "kl_loss_6": 1394.9463256835938, + "learning_rate": 0.00016308321004141607, + "loss": 1087.2498, + "step": 7380 + }, + { + "ce_loss_12": 3.087207305431366, + "ce_loss_17": 2.9217296123504637, + "ce_loss_23": 2.851014792919159, + "ce_loss_3": 3.9276385068893434, + "ce_loss_6": 3.4967284917831423, + "epoch": 0.739, + "grad_norm": 1312.0, + "kl_loss_12": 567.2153533935547, + "kl_loss_17": 166.8703598022461, + "kl_loss_3": 2282.3760681152344, + "kl_loss_6": 1435.2972351074218, + "learning_rate": 0.00016191254986299043, + "loss": 1091.6779, + "step": 7390 + }, + { + "ce_loss_12": 3.112338662147522, + "ce_loss_17": 2.9599267840385437, + "ce_loss_23": 2.8951269507408144, + "ce_loss_3": 3.920237624645233, + "ce_loss_6": 3.514072132110596, + "epoch": 0.74, + "grad_norm": 1296.0, + "kl_loss_12": 535.1316421508789, + "kl_loss_17": 157.7860466003418, + "kl_loss_3": 2220.921942138672, + "kl_loss_6": 1406.8481384277343, + "learning_rate": 0.00016074529422143398, + "loss": 1102.7136, + "step": 7400 + }, + { + "ce_loss_12": 3.0936760783195494, + "ce_loss_17": 2.9284700989723205, + "ce_loss_23": 2.861575019359589, + "ce_loss_3": 3.94160852432251, + "ce_loss_6": 3.501969301700592, + "epoch": 0.741, + "grad_norm": 1200.0, + "kl_loss_12": 559.6839050292969, + "kl_loss_17": 166.13983001708985, + "kl_loss_3": 2296.2011291503904, + "kl_loss_6": 1432.079364013672, + "learning_rate": 0.0001595814548709983, + "loss": 1118.8881, + "step": 7410 + }, + { + "ce_loss_12": 3.159260606765747, + "ce_loss_17": 2.990760338306427, + "ce_loss_23": 2.9189414262771605, + "ce_loss_3": 3.9913843154907225, + "ce_loss_6": 3.56757196187973, + "epoch": 0.742, + "grad_norm": 1160.0, + "kl_loss_12": 572.7192138671875, + "kl_loss_17": 167.77596206665038, + "kl_loss_3": 2308.6673828125, + "kl_loss_6": 1447.264874267578, + "learning_rate": 0.00015842104353153285, + "loss": 1114.7347, + "step": 7420 + }, + { + "ce_loss_12": 3.1647661447525026, + "ce_loss_17": 2.9976929903030394, + "ce_loss_23": 2.9297439217567445, + "ce_loss_3": 3.9861793637275698, + "ce_loss_6": 3.568111205101013, + "epoch": 0.743, + "grad_norm": 1256.0, + "kl_loss_12": 556.8150070190429, + "kl_loss_17": 163.89298629760742, + "kl_loss_3": 2258.01337890625, + "kl_loss_6": 1422.5369995117187, + "learning_rate": 0.0001572640718883667, + "loss": 1122.6908, + "step": 7430 + }, + { + "ce_loss_12": 3.0959529757499693, + "ce_loss_17": 2.9399875164031983, + "ce_loss_23": 2.877901887893677, + "ce_loss_3": 3.9096753716468813, + "ce_loss_6": 3.4912124156951903, + "epoch": 0.744, + "grad_norm": 1208.0, + "kl_loss_12": 542.3641479492187, + "kl_loss_17": 157.65117416381835, + "kl_loss_3": 2208.268170166016, + "kl_loss_6": 1384.6812377929687, + "learning_rate": 0.0001561105515921915, + "loss": 1108.7784, + "step": 7440 + }, + { + "ce_loss_12": 2.977490186691284, + "ce_loss_17": 2.814324605464935, + "ce_loss_23": 2.749322760105133, + "ce_loss_3": 3.8388984203338623, + "ce_loss_6": 3.398288404941559, + "epoch": 0.745, + "grad_norm": 1048.0, + "kl_loss_12": 548.0307357788085, + "kl_loss_17": 156.30992317199707, + "kl_loss_3": 2329.5126525878904, + "kl_loss_6": 1452.494940185547, + "learning_rate": 0.0001549604942589441, + "loss": 1101.1797, + "step": 7450 + }, + { + "ce_loss_12": 3.135593664646149, + "ce_loss_17": 2.978959119319916, + "ce_loss_23": 2.9141037464141846, + "ce_loss_3": 3.9119282364845276, + "ce_loss_6": 3.510556936264038, + "epoch": 0.746, + "grad_norm": 1600.0, + "kl_loss_12": 532.6268692016602, + "kl_loss_17": 156.17607040405272, + "kl_loss_3": 2157.824334716797, + "kl_loss_6": 1351.6411987304687, + "learning_rate": 0.00015381391146968864, + "loss": 1070.9182, + "step": 7460 + }, + { + "ce_loss_12": 3.107375943660736, + "ce_loss_17": 2.9511581659317017, + "ce_loss_23": 2.8842229008674622, + "ce_loss_3": 3.941971480846405, + "ce_loss_6": 3.51314240694046, + "epoch": 0.747, + "grad_norm": 1232.0, + "kl_loss_12": 536.7534057617188, + "kl_loss_17": 156.5211166381836, + "kl_loss_3": 2236.07861328125, + "kl_loss_6": 1399.7406127929687, + "learning_rate": 0.00015267081477050133, + "loss": 1095.7694, + "step": 7470 + }, + { + "ce_loss_12": 3.204038417339325, + "ce_loss_17": 3.0420343041419984, + "ce_loss_23": 2.9723053574562073, + "ce_loss_3": 4.002698719501495, + "ce_loss_6": 3.5953757524490357, + "epoch": 0.748, + "grad_norm": 1040.0, + "kl_loss_12": 558.8490585327148, + "kl_loss_17": 166.1141471862793, + "kl_loss_3": 2206.0192810058593, + "kl_loss_6": 1402.625665283203, + "learning_rate": 0.00015153121567235335, + "loss": 1076.1527, + "step": 7480 + }, + { + "ce_loss_12": 3.105157423019409, + "ce_loss_17": 2.947098362445831, + "ce_loss_23": 2.8795030236244203, + "ce_loss_3": 3.933885872364044, + "ce_loss_6": 3.5141167044639587, + "epoch": 0.749, + "grad_norm": 908.0, + "kl_loss_12": 552.778515625, + "kl_loss_17": 161.83009643554686, + "kl_loss_3": 2275.603137207031, + "kl_loss_6": 1430.296209716797, + "learning_rate": 0.00015039512565099468, + "loss": 1075.2557, + "step": 7490 + }, + { + "ce_loss_12": 3.1631266593933107, + "ce_loss_17": 3.0015437960624696, + "ce_loss_23": 2.9345196962356566, + "ce_loss_3": 3.9690038084983827, + "ce_loss_6": 3.5539440274238587, + "epoch": 0.75, + "grad_norm": 984.0, + "kl_loss_12": 547.2122283935547, + "kl_loss_17": 160.6443084716797, + "kl_loss_3": 2220.7647399902344, + "kl_loss_6": 1392.8933410644531, + "learning_rate": 0.00014926255614683932, + "loss": 1124.4953, + "step": 7500 + }, + { + "ce_loss_12": 3.101532554626465, + "ce_loss_17": 2.9439298510551453, + "ce_loss_23": 2.8763969421386717, + "ce_loss_3": 3.9131872415542603, + "ce_loss_6": 3.491703712940216, + "epoch": 0.751, + "grad_norm": 1168.0, + "kl_loss_12": 549.1255447387696, + "kl_loss_17": 160.57257156372071, + "kl_loss_3": 2240.627813720703, + "kl_loss_6": 1390.5830993652344, + "learning_rate": 0.0001481335185648498, + "loss": 1095.8574, + "step": 7510 + }, + { + "ce_loss_12": 3.126468801498413, + "ce_loss_17": 2.9646638870239257, + "ce_loss_23": 2.899153769016266, + "ce_loss_3": 3.939616787433624, + "ce_loss_6": 3.5179593563079834, + "epoch": 0.752, + "grad_norm": 1144.0, + "kl_loss_12": 547.5724853515625, + "kl_loss_17": 160.2132423400879, + "kl_loss_3": 2241.28125, + "kl_loss_6": 1402.0799621582032, + "learning_rate": 0.0001470080242744218, + "loss": 1083.3291, + "step": 7520 + }, + { + "ce_loss_12": 3.1182099103927614, + "ce_loss_17": 2.959747242927551, + "ce_loss_23": 2.8968014121055603, + "ce_loss_3": 3.9456854939460753, + "ce_loss_6": 3.5264230728149415, + "epoch": 0.753, + "grad_norm": 1440.0, + "kl_loss_12": 538.8024368286133, + "kl_loss_17": 156.9289436340332, + "kl_loss_3": 2241.8080932617186, + "kl_loss_6": 1411.8066284179688, + "learning_rate": 0.0001458860846092705, + "loss": 1099.8725, + "step": 7530 + }, + { + "ce_loss_12": 3.163596737384796, + "ce_loss_17": 3.004104268550873, + "ce_loss_23": 2.9375285863876344, + "ce_loss_3": 3.957572305202484, + "ce_loss_6": 3.5540796637535097, + "epoch": 0.754, + "grad_norm": 1216.0, + "kl_loss_12": 541.4927703857422, + "kl_loss_17": 160.5062156677246, + "kl_loss_3": 2173.8907104492187, + "kl_loss_6": 1377.824725341797, + "learning_rate": 0.00014476771086731566, + "loss": 1061.0291, + "step": 7540 + }, + { + "ce_loss_12": 3.2464739799499513, + "ce_loss_17": 3.0824361085891723, + "ce_loss_23": 3.01233891248703, + "ce_loss_3": 4.050685405731201, + "ce_loss_6": 3.637848448753357, + "epoch": 0.755, + "grad_norm": 1176.0, + "kl_loss_12": 562.2777740478516, + "kl_loss_17": 169.30307312011718, + "kl_loss_3": 2217.041650390625, + "kl_loss_6": 1403.5890319824218, + "learning_rate": 0.00014365291431056872, + "loss": 1113.4173, + "step": 7550 + }, + { + "ce_loss_12": 3.0967350363731385, + "ce_loss_17": 2.926898777484894, + "ce_loss_23": 2.855245494842529, + "ce_loss_3": 3.9306687831878664, + "ce_loss_6": 3.501248574256897, + "epoch": 0.756, + "grad_norm": 1360.0, + "kl_loss_12": 567.5073440551757, + "kl_loss_17": 167.36128311157228, + "kl_loss_3": 2294.8422302246095, + "kl_loss_6": 1441.266864013672, + "learning_rate": 0.00014254170616501827, + "loss": 1102.9068, + "step": 7560 + }, + { + "ce_loss_12": 3.0469727873802186, + "ce_loss_17": 2.8708555340766906, + "ce_loss_23": 2.7983035683631896, + "ce_loss_3": 3.9179059267044067, + "ce_loss_6": 3.485325014591217, + "epoch": 0.757, + "grad_norm": 1120.0, + "kl_loss_12": 577.7532028198242, + "kl_loss_17": 166.80727844238282, + "kl_loss_3": 2361.802197265625, + "kl_loss_6": 1497.723193359375, + "learning_rate": 0.0001414340976205183, + "loss": 1142.0348, + "step": 7570 + }, + { + "ce_loss_12": 3.058771586418152, + "ce_loss_17": 2.882201647758484, + "ce_loss_23": 2.815154159069061, + "ce_loss_3": 3.9053560972213743, + "ce_loss_6": 3.4667197585105898, + "epoch": 0.758, + "grad_norm": 1328.0, + "kl_loss_12": 570.4278915405273, + "kl_loss_17": 160.15200500488282, + "kl_loss_3": 2310.2031005859376, + "kl_loss_6": 1439.8280883789062, + "learning_rate": 0.00014033009983067452, + "loss": 1101.2764, + "step": 7580 + }, + { + "ce_loss_12": 3.1888783097267153, + "ce_loss_17": 3.031953418254852, + "ce_loss_23": 2.966986298561096, + "ce_loss_3": 3.9651535511016847, + "ce_loss_6": 3.5674421310424806, + "epoch": 0.759, + "grad_norm": 1024.0, + "kl_loss_12": 533.8454498291015, + "kl_loss_17": 157.25309829711915, + "kl_loss_3": 2158.3534240722656, + "kl_loss_6": 1357.4085571289063, + "learning_rate": 0.00013922972391273224, + "loss": 1076.676, + "step": 7590 + }, + { + "ce_loss_12": 3.18862464427948, + "ce_loss_17": 3.0305684566497804, + "ce_loss_23": 2.9652734279632567, + "ce_loss_3": 4.02052389383316, + "ce_loss_6": 3.5788715481758118, + "epoch": 0.76, + "grad_norm": 1464.0, + "kl_loss_12": 543.7809707641602, + "kl_loss_17": 161.5416374206543, + "kl_loss_3": 2240.5658569335938, + "kl_loss_6": 1375.7309265136719, + "learning_rate": 0.0001381329809474649, + "loss": 1091.1582, + "step": 7600 + }, + { + "ce_loss_12": 3.1171194434165956, + "ce_loss_17": 2.9485602140426637, + "ce_loss_23": 2.8763870716094972, + "ce_loss_3": 3.9727436780929564, + "ce_loss_6": 3.5333943486213686, + "epoch": 0.761, + "grad_norm": 1240.0, + "kl_loss_12": 568.4049987792969, + "kl_loss_17": 165.82229461669922, + "kl_loss_3": 2333.8710205078123, + "kl_loss_6": 1460.953515625, + "learning_rate": 0.0001370398819790621, + "loss": 1118.3304, + "step": 7610 + }, + { + "ce_loss_12": 3.230324959754944, + "ce_loss_17": 3.0689058542251586, + "ce_loss_23": 3.0017220854759215, + "ce_loss_3": 4.028148972988129, + "ce_loss_6": 3.6192330360412597, + "epoch": 0.762, + "grad_norm": 1232.0, + "kl_loss_12": 544.9417282104492, + "kl_loss_17": 161.8925666809082, + "kl_loss_3": 2187.680139160156, + "kl_loss_6": 1370.5897888183595, + "learning_rate": 0.00013595043801501794, + "loss": 1066.1822, + "step": 7620 + }, + { + "ce_loss_12": 3.055271232128143, + "ce_loss_17": 2.884974014759064, + "ce_loss_23": 2.8141839265823365, + "ce_loss_3": 3.9491064667701723, + "ce_loss_6": 3.4900349378585815, + "epoch": 0.763, + "grad_norm": 1496.0, + "kl_loss_12": 568.0841201782226, + "kl_loss_17": 165.38999710083007, + "kl_loss_3": 2394.4039001464844, + "kl_loss_6": 1488.0246459960938, + "learning_rate": 0.00013486466002602133, + "loss": 1123.8639, + "step": 7630 + }, + { + "ce_loss_12": 3.1421401262283326, + "ce_loss_17": 2.981762206554413, + "ce_loss_23": 2.9162477374076845, + "ce_loss_3": 3.9335285305976866, + "ce_loss_6": 3.533025884628296, + "epoch": 0.764, + "grad_norm": 1416.0, + "kl_loss_12": 541.4627029418946, + "kl_loss_17": 160.2584083557129, + "kl_loss_3": 2189.476123046875, + "kl_loss_6": 1387.05546875, + "learning_rate": 0.00013378255894584462, + "loss": 1107.4135, + "step": 7640 + }, + { + "ce_loss_12": 3.0940568923950194, + "ce_loss_17": 2.925947606563568, + "ce_loss_23": 2.856828761100769, + "ce_loss_3": 3.9432212829589846, + "ce_loss_6": 3.5037839889526365, + "epoch": 0.765, + "grad_norm": 996.0, + "kl_loss_12": 563.4930908203125, + "kl_loss_17": 165.41349716186522, + "kl_loss_3": 2311.5087646484376, + "kl_loss_6": 1438.8681091308595, + "learning_rate": 0.0001327041456712334, + "loss": 1113.0895, + "step": 7650 + }, + { + "ce_loss_12": 3.1337642550468443, + "ce_loss_17": 2.9663718938827515, + "ce_loss_23": 2.8984671950340273, + "ce_loss_3": 3.954953646659851, + "ce_loss_6": 3.534931719303131, + "epoch": 0.766, + "grad_norm": 1624.0, + "kl_loss_12": 560.6311721801758, + "kl_loss_17": 164.26156311035157, + "kl_loss_3": 2265.8011779785156, + "kl_loss_6": 1423.2526489257812, + "learning_rate": 0.00013162943106179747, + "loss": 1112.404, + "step": 7660 + }, + { + "ce_loss_12": 3.1063711881637572, + "ce_loss_17": 2.944747340679169, + "ce_loss_23": 2.877778971195221, + "ce_loss_3": 3.9044309973716738, + "ce_loss_6": 3.5003751158714294, + "epoch": 0.767, + "grad_norm": 1056.0, + "kl_loss_12": 542.2193283081054, + "kl_loss_17": 160.44931716918944, + "kl_loss_3": 2210.9869079589844, + "kl_loss_6": 1404.0770751953125, + "learning_rate": 0.00013055842593990132, + "loss": 1088.844, + "step": 7670 + }, + { + "ce_loss_12": 3.0573420405387877, + "ce_loss_17": 2.893602359294891, + "ce_loss_23": 2.829056429862976, + "ce_loss_3": 3.869810235500336, + "ce_loss_6": 3.454623758792877, + "epoch": 0.768, + "grad_norm": 980.0, + "kl_loss_12": 544.5171676635742, + "kl_loss_17": 158.20527267456055, + "kl_loss_3": 2200.6612182617187, + "kl_loss_6": 1381.3787719726563, + "learning_rate": 0.00012949114109055414, + "loss": 1103.4986, + "step": 7680 + }, + { + "ce_loss_12": 3.0988786816596985, + "ce_loss_17": 2.9312047243118284, + "ce_loss_23": 2.862640619277954, + "ce_loss_3": 3.929021441936493, + "ce_loss_6": 3.5049775719642637, + "epoch": 0.769, + "grad_norm": 1176.0, + "kl_loss_12": 558.3166275024414, + "kl_loss_17": 163.206209564209, + "kl_loss_3": 2267.092840576172, + "kl_loss_6": 1426.1697814941406, + "learning_rate": 0.00012842758726130281, + "loss": 1111.3761, + "step": 7690 + }, + { + "ce_loss_12": 3.149468147754669, + "ce_loss_17": 2.97717444896698, + "ce_loss_23": 2.9082438707351685, + "ce_loss_3": 3.9973672151565554, + "ce_loss_6": 3.558988904953003, + "epoch": 0.77, + "grad_norm": 1040.0, + "kl_loss_12": 565.3210571289062, + "kl_loss_17": 164.83067245483397, + "kl_loss_3": 2299.1438171386717, + "kl_loss_6": 1442.9823791503907, + "learning_rate": 0.00012736777516212267, + "loss": 1096.093, + "step": 7700 + }, + { + "ce_loss_12": 3.142407476902008, + "ce_loss_17": 2.9731947422027587, + "ce_loss_23": 2.9043068170547484, + "ce_loss_3": 3.960499668121338, + "ce_loss_6": 3.5475622177124024, + "epoch": 0.771, + "grad_norm": 1128.0, + "kl_loss_12": 563.460971069336, + "kl_loss_17": 164.93396606445313, + "kl_loss_3": 2251.1046752929688, + "kl_loss_6": 1418.974591064453, + "learning_rate": 0.00012631171546530968, + "loss": 1086.3447, + "step": 7710 + }, + { + "ce_loss_12": 3.153670871257782, + "ce_loss_17": 2.9829601645469666, + "ce_loss_23": 2.9120034694671633, + "ce_loss_3": 3.9621705174446107, + "ce_loss_6": 3.5580246329307554, + "epoch": 0.772, + "grad_norm": 1144.0, + "kl_loss_12": 566.0092697143555, + "kl_loss_17": 166.2492805480957, + "kl_loss_3": 2251.34677734375, + "kl_loss_6": 1438.2859375, + "learning_rate": 0.00012525941880537307, + "loss": 1112.6428, + "step": 7720 + }, + { + "ce_loss_12": 3.168710947036743, + "ce_loss_17": 3.01217383146286, + "ce_loss_23": 2.9430142283439635, + "ce_loss_3": 3.9859882950782777, + "ce_loss_6": 3.570967698097229, + "epoch": 0.773, + "grad_norm": 1104.0, + "kl_loss_12": 544.9650009155273, + "kl_loss_17": 159.64609603881837, + "kl_loss_3": 2227.202178955078, + "kl_loss_6": 1404.3751037597656, + "learning_rate": 0.00012421089577892869, + "loss": 1088.829, + "step": 7730 + }, + { + "ce_loss_12": 3.133381700515747, + "ce_loss_17": 2.96497061252594, + "ce_loss_23": 2.8961349368095397, + "ce_loss_3": 3.9726121544837953, + "ce_loss_6": 3.5419203877449035, + "epoch": 0.774, + "grad_norm": 1256.0, + "kl_loss_12": 557.8470489501954, + "kl_loss_17": 161.55676193237304, + "kl_loss_3": 2287.979541015625, + "kl_loss_6": 1432.6632080078125, + "learning_rate": 0.0001231661569445919, + "loss": 1107.4707, + "step": 7740 + }, + { + "ce_loss_12": 3.0016807436943056, + "ce_loss_17": 2.8387063264846804, + "ce_loss_23": 2.7722793340682985, + "ce_loss_3": 3.849887728691101, + "ce_loss_6": 3.411376619338989, + "epoch": 0.775, + "grad_norm": 1200.0, + "kl_loss_12": 551.7725036621093, + "kl_loss_17": 161.13144760131837, + "kl_loss_3": 2293.1958068847657, + "kl_loss_6": 1418.0409729003907, + "learning_rate": 0.00012212521282287093, + "loss": 1123.733, + "step": 7750 + }, + { + "ce_loss_12": 3.136315310001373, + "ce_loss_17": 2.9704002857208254, + "ce_loss_23": 2.903051769733429, + "ce_loss_3": 3.9540756225585936, + "ce_loss_6": 3.5338873744010924, + "epoch": 0.776, + "grad_norm": 1048.0, + "kl_loss_12": 560.9357513427734, + "kl_loss_17": 164.57203521728516, + "kl_loss_3": 2237.190466308594, + "kl_loss_6": 1406.5361328125, + "learning_rate": 0.00012108807389606158, + "loss": 1113.689, + "step": 7760 + }, + { + "ce_loss_12": 3.133539354801178, + "ce_loss_17": 2.972551167011261, + "ce_loss_23": 2.9099935054779054, + "ce_loss_3": 3.956028401851654, + "ce_loss_6": 3.5379050612449645, + "epoch": 0.777, + "grad_norm": 1144.0, + "kl_loss_12": 542.0610504150391, + "kl_loss_17": 159.10600662231445, + "kl_loss_3": 2235.199530029297, + "kl_loss_6": 1404.911279296875, + "learning_rate": 0.00012005475060814159, + "loss": 1088.8854, + "step": 7770 + }, + { + "ce_loss_12": 3.079297161102295, + "ce_loss_17": 2.9101614475250246, + "ce_loss_23": 2.8443787336349486, + "ce_loss_3": 3.9278798580169676, + "ce_loss_6": 3.495157504081726, + "epoch": 0.778, + "grad_norm": 1128.0, + "kl_loss_12": 555.6036270141601, + "kl_loss_17": 161.68514709472657, + "kl_loss_3": 2309.4862731933595, + "kl_loss_6": 1449.1174621582031, + "learning_rate": 0.00011902525336466464, + "loss": 1110.0096, + "step": 7780 + }, + { + "ce_loss_12": 3.071139085292816, + "ce_loss_17": 2.8988611459732057, + "ce_loss_23": 2.828660762310028, + "ce_loss_3": 3.9334118127822877, + "ce_loss_6": 3.4934719800949097, + "epoch": 0.779, + "grad_norm": 1264.0, + "kl_loss_12": 570.3909606933594, + "kl_loss_17": 166.66974716186525, + "kl_loss_3": 2341.7922607421874, + "kl_loss_6": 1471.1030578613281, + "learning_rate": 0.00011799959253265668, + "loss": 1112.8619, + "step": 7790 + }, + { + "ce_loss_12": 3.116266095638275, + "ce_loss_17": 2.9551839351654055, + "ce_loss_23": 2.883231484889984, + "ce_loss_3": 3.95099333524704, + "ce_loss_6": 3.518081533908844, + "epoch": 0.78, + "grad_norm": 1208.0, + "kl_loss_12": 559.7863510131835, + "kl_loss_17": 166.4375129699707, + "kl_loss_3": 2293.211083984375, + "kl_loss_6": 1431.106298828125, + "learning_rate": 0.00011697777844051105, + "loss": 1107.758, + "step": 7800 + }, + { + "ce_loss_12": 3.1162440299987795, + "ce_loss_17": 2.946736991405487, + "ce_loss_23": 2.8777117013931273, + "ce_loss_3": 3.9824501991271974, + "ce_loss_6": 3.544774925708771, + "epoch": 0.781, + "grad_norm": 1176.0, + "kl_loss_12": 561.5905197143554, + "kl_loss_17": 166.06884841918946, + "kl_loss_3": 2337.6393127441406, + "kl_loss_6": 1477.4393798828125, + "learning_rate": 0.00011595982137788402, + "loss": 1119.0977, + "step": 7810 + }, + { + "ce_loss_12": 3.0908493518829347, + "ce_loss_17": 2.93182715177536, + "ce_loss_23": 2.865774428844452, + "ce_loss_3": 3.8857927322387695, + "ce_loss_6": 3.469491732120514, + "epoch": 0.782, + "grad_norm": 1136.0, + "kl_loss_12": 542.1579315185547, + "kl_loss_17": 160.3938201904297, + "kl_loss_3": 2192.916149902344, + "kl_loss_6": 1376.347833251953, + "learning_rate": 0.00011494573159559212, + "loss": 1088.1531, + "step": 7820 + }, + { + "ce_loss_12": 3.0768490195274354, + "ce_loss_17": 2.9117467045783996, + "ce_loss_23": 2.8412336468696595, + "ce_loss_3": 3.9094900727272033, + "ce_loss_6": 3.4854151964187623, + "epoch": 0.783, + "grad_norm": 1064.0, + "kl_loss_12": 554.2974975585937, + "kl_loss_17": 164.9409309387207, + "kl_loss_3": 2269.3820617675783, + "kl_loss_6": 1426.0858764648438, + "learning_rate": 0.00011393551930550828, + "loss": 1121.7828, + "step": 7830 + }, + { + "ce_loss_12": 3.192759084701538, + "ce_loss_17": 3.032848227024078, + "ce_loss_23": 2.9663447737693787, + "ce_loss_3": 3.99887330532074, + "ce_loss_6": 3.580846738815308, + "epoch": 0.784, + "grad_norm": 1072.0, + "kl_loss_12": 547.1056564331054, + "kl_loss_17": 163.2308448791504, + "kl_loss_3": 2215.794158935547, + "kl_loss_6": 1386.7675842285157, + "learning_rate": 0.00011292919468045875, + "loss": 1087.1904, + "step": 7840 + }, + { + "ce_loss_12": 3.1551998615264893, + "ce_loss_17": 2.99315550327301, + "ce_loss_23": 2.923645091056824, + "ce_loss_3": 3.9820490002632143, + "ce_loss_6": 3.5517470717430113, + "epoch": 0.785, + "grad_norm": 1048.0, + "kl_loss_12": 554.8253494262696, + "kl_loss_17": 162.82821502685547, + "kl_loss_3": 2261.1311279296874, + "kl_loss_6": 1406.467626953125, + "learning_rate": 0.00011192676785412154, + "loss": 1087.5309, + "step": 7850 + }, + { + "ce_loss_12": 3.1031730651855467, + "ce_loss_17": 2.9350796341896057, + "ce_loss_23": 2.864694333076477, + "ce_loss_3": 3.9630802512168883, + "ce_loss_6": 3.5200929641723633, + "epoch": 0.786, + "grad_norm": 1216.0, + "kl_loss_12": 558.0501159667969, + "kl_loss_17": 165.17536315917968, + "kl_loss_3": 2308.5718994140625, + "kl_loss_6": 1442.5363403320312, + "learning_rate": 0.00011092824892092374, + "loss": 1111.1633, + "step": 7860 + }, + { + "ce_loss_12": 3.0436164021492003, + "ce_loss_17": 2.8756727457046507, + "ce_loss_23": 2.808738589286804, + "ce_loss_3": 3.9049333810806273, + "ce_loss_6": 3.4604093074798583, + "epoch": 0.787, + "grad_norm": 1432.0, + "kl_loss_12": 555.9726867675781, + "kl_loss_17": 161.20776443481446, + "kl_loss_3": 2315.7523681640623, + "kl_loss_6": 1448.0448669433595, + "learning_rate": 0.0001099336479359398, + "loss": 1100.9196, + "step": 7870 + }, + { + "ce_loss_12": 3.1471460461616516, + "ce_loss_17": 2.991259491443634, + "ce_loss_23": 2.9230222702026367, + "ce_loss_3": 3.9491287231445313, + "ce_loss_6": 3.538780689239502, + "epoch": 0.788, + "grad_norm": 1144.0, + "kl_loss_12": 547.1228820800782, + "kl_loss_17": 161.35846481323242, + "kl_loss_3": 2208.5154541015627, + "kl_loss_6": 1388.1799682617188, + "learning_rate": 0.00010894297491479043, + "loss": 1092.2739, + "step": 7880 + }, + { + "ce_loss_12": 3.134075367450714, + "ce_loss_17": 2.971082592010498, + "ce_loss_23": 2.9070354461669923, + "ce_loss_3": 3.9635905861854552, + "ce_loss_6": 3.534369695186615, + "epoch": 0.789, + "grad_norm": 1112.0, + "kl_loss_12": 550.5070495605469, + "kl_loss_17": 160.23197479248046, + "kl_loss_3": 2256.6437072753906, + "kl_loss_6": 1408.9224609375, + "learning_rate": 0.00010795623983354214, + "loss": 1090.2475, + "step": 7890 + }, + { + "ce_loss_12": 3.0555803418159484, + "ce_loss_17": 2.881819653511047, + "ce_loss_23": 2.8134613275527953, + "ce_loss_3": 3.89397656917572, + "ce_loss_6": 3.4688626885414124, + "epoch": 0.79, + "grad_norm": 1096.0, + "kl_loss_12": 569.0260833740234, + "kl_loss_17": 167.3630630493164, + "kl_loss_3": 2300.7442138671877, + "kl_loss_6": 1453.1748962402344, + "learning_rate": 0.00010697345262860636, + "loss": 1106.0195, + "step": 7900 + }, + { + "ce_loss_12": 3.168570268154144, + "ce_loss_17": 3.0090152263641357, + "ce_loss_23": 2.942872929573059, + "ce_loss_3": 3.9651273488998413, + "ce_loss_6": 3.547044610977173, + "epoch": 0.791, + "grad_norm": 1192.0, + "kl_loss_12": 543.2456344604492, + "kl_loss_17": 160.62493667602538, + "kl_loss_3": 2202.420587158203, + "kl_loss_6": 1377.7202026367188, + "learning_rate": 0.00010599462319663906, + "loss": 1072.1201, + "step": 7910 + }, + { + "ce_loss_12": 3.1340227365493774, + "ce_loss_17": 2.9779136538505555, + "ce_loss_23": 2.9129930257797243, + "ce_loss_3": 3.926445186138153, + "ce_loss_6": 3.5237072110176086, + "epoch": 0.792, + "grad_norm": 1288.0, + "kl_loss_12": 538.3043960571289, + "kl_loss_17": 159.2765205383301, + "kl_loss_3": 2171.213702392578, + "kl_loss_6": 1366.4069458007812, + "learning_rate": 0.00010501976139444191, + "loss": 1068.7393, + "step": 7920 + }, + { + "ce_loss_12": 3.159173882007599, + "ce_loss_17": 3.002285659313202, + "ce_loss_23": 2.9352866649627685, + "ce_loss_3": 3.970040965080261, + "ce_loss_6": 3.5568660140037536, + "epoch": 0.793, + "grad_norm": 1512.0, + "kl_loss_12": 539.3857559204101, + "kl_loss_17": 160.57502517700195, + "kl_loss_3": 2214.8924255371094, + "kl_loss_6": 1396.2409423828126, + "learning_rate": 0.0001040488770388625, + "loss": 1101.3361, + "step": 7930 + }, + { + "ce_loss_12": 3.121935760974884, + "ce_loss_17": 2.9588202238082886, + "ce_loss_23": 2.895924472808838, + "ce_loss_3": 3.947384810447693, + "ce_loss_6": 3.52029892206192, + "epoch": 0.794, + "grad_norm": 1440.0, + "kl_loss_12": 555.7849990844727, + "kl_loss_17": 161.34661026000975, + "kl_loss_3": 2273.8947875976564, + "kl_loss_6": 1424.0836669921875, + "learning_rate": 0.00010308197990669538, + "loss": 1096.0457, + "step": 7940 + }, + { + "ce_loss_12": 3.2306849360466003, + "ce_loss_17": 3.0659042596817017, + "ce_loss_23": 2.999270474910736, + "ce_loss_3": 4.04144002199173, + "ce_loss_6": 3.6200562357902526, + "epoch": 0.795, + "grad_norm": 1176.0, + "kl_loss_12": 560.016635131836, + "kl_loss_17": 166.26138000488282, + "kl_loss_3": 2237.8337158203126, + "kl_loss_6": 1405.325018310547, + "learning_rate": 0.0001021190797345839, + "loss": 1087.4437, + "step": 7950 + }, + { + "ce_loss_12": 2.9893540263175966, + "ce_loss_17": 2.8147446155548095, + "ce_loss_23": 2.741576051712036, + "ce_loss_3": 3.8425682902336122, + "ce_loss_6": 3.410431373119354, + "epoch": 0.796, + "grad_norm": 1552.0, + "kl_loss_12": 574.1566009521484, + "kl_loss_17": 167.23827934265137, + "kl_loss_3": 2333.274517822266, + "kl_loss_6": 1471.6829833984375, + "learning_rate": 0.00010116018621892236, + "loss": 1115.1414, + "step": 7960 + }, + { + "ce_loss_12": 3.1764989376068113, + "ce_loss_17": 3.010814297199249, + "ce_loss_23": 2.943979728221893, + "ce_loss_3": 4.0100136518478395, + "ce_loss_6": 3.5845438480377196, + "epoch": 0.797, + "grad_norm": 1144.0, + "kl_loss_12": 580.7473648071289, + "kl_loss_17": 172.146883392334, + "kl_loss_3": 2311.033654785156, + "kl_loss_6": 1458.1046936035157, + "learning_rate": 0.00010020530901575753, + "loss": 1093.3162, + "step": 7970 + }, + { + "ce_loss_12": 3.19004088640213, + "ce_loss_17": 3.028277337551117, + "ce_loss_23": 2.959457278251648, + "ce_loss_3": 4.00085334777832, + "ce_loss_6": 3.5891884207725524, + "epoch": 0.798, + "grad_norm": 1000.0, + "kl_loss_12": 559.4841430664062, + "kl_loss_17": 165.01912384033204, + "kl_loss_3": 2237.9213806152343, + "kl_loss_6": 1415.7213012695313, + "learning_rate": 9.925445774069231e-05, + "loss": 1079.9097, + "step": 7980 + }, + { + "ce_loss_12": 3.147440028190613, + "ce_loss_17": 2.9840749144554137, + "ce_loss_23": 2.9128615975379946, + "ce_loss_3": 3.959659469127655, + "ce_loss_6": 3.5440555930137636, + "epoch": 0.799, + "grad_norm": 1168.0, + "kl_loss_12": 550.6807968139649, + "kl_loss_17": 163.16437606811525, + "kl_loss_3": 2218.516809082031, + "kl_loss_6": 1395.5287109375, + "learning_rate": 9.830764196878872e-05, + "loss": 1068.5918, + "step": 7990 + }, + { + "ce_loss_12": 3.095009219646454, + "ce_loss_17": 2.934465217590332, + "ce_loss_23": 2.8681960105895996, + "ce_loss_3": 3.9243249773979185, + "ce_loss_6": 3.4954513907432556, + "epoch": 0.8, + "grad_norm": 1416.0, + "kl_loss_12": 549.8090255737304, + "kl_loss_17": 159.90571517944335, + "kl_loss_3": 2278.8769104003904, + "kl_loss_6": 1422.961962890625, + "learning_rate": 9.736487123447069e-05, + "loss": 1096.4008, + "step": 8000 + }, + { + "ce_loss_12": 3.0546049952507017, + "ce_loss_17": 2.8909072041511537, + "ce_loss_23": 2.8225434184074403, + "ce_loss_3": 3.934860420227051, + "ce_loss_6": 3.4992011904716493, + "epoch": 0.801, + "grad_norm": 1240.0, + "kl_loss_12": 562.626187133789, + "kl_loss_17": 166.51637954711913, + "kl_loss_3": 2389.3442626953124, + "kl_loss_6": 1513.7143859863281, + "learning_rate": 9.642615503142926e-05, + "loss": 1133.6734, + "step": 8010 + }, + { + "ce_loss_12": 3.1169554352760316, + "ce_loss_17": 2.948462611436844, + "ce_loss_23": 2.880279469490051, + "ce_loss_3": 3.9491363406181335, + "ce_loss_6": 3.524028706550598, + "epoch": 0.802, + "grad_norm": 1080.0, + "kl_loss_12": 552.5627502441406, + "kl_loss_17": 161.54310836791993, + "kl_loss_3": 2288.007940673828, + "kl_loss_6": 1419.6550842285155, + "learning_rate": 9.549150281252633e-05, + "loss": 1089.2569, + "step": 8020 + }, + { + "ce_loss_12": 3.1377015233039858, + "ce_loss_17": 2.972144603729248, + "ce_loss_23": 2.9036526679992676, + "ce_loss_3": 3.9606377482414246, + "ce_loss_6": 3.535222041606903, + "epoch": 0.803, + "grad_norm": 1208.0, + "kl_loss_12": 559.8434661865234, + "kl_loss_17": 164.10131759643554, + "kl_loss_3": 2280.4359741210938, + "kl_loss_6": 1422.1460754394532, + "learning_rate": 9.4560923989699e-05, + "loss": 1113.476, + "step": 8030 + }, + { + "ce_loss_12": 3.1276845932006836, + "ce_loss_17": 2.966171216964722, + "ce_loss_23": 2.8971128463745117, + "ce_loss_3": 3.9457432627677917, + "ce_loss_6": 3.526905131340027, + "epoch": 0.804, + "grad_norm": 1328.0, + "kl_loss_12": 556.661784362793, + "kl_loss_17": 164.48508224487304, + "kl_loss_3": 2246.531115722656, + "kl_loss_6": 1412.6000244140625, + "learning_rate": 9.363442793386607e-05, + "loss": 1116.8084, + "step": 8040 + }, + { + "ce_loss_12": 3.1118528485298156, + "ce_loss_17": 2.9384084939956665, + "ce_loss_23": 2.865858054161072, + "ce_loss_3": 3.961544167995453, + "ce_loss_6": 3.5322256207466127, + "epoch": 0.805, + "grad_norm": 1624.0, + "kl_loss_12": 571.4155303955079, + "kl_loss_17": 166.18219985961915, + "kl_loss_3": 2314.427600097656, + "kl_loss_6": 1460.6573547363282, + "learning_rate": 9.271202397483213e-05, + "loss": 1092.9216, + "step": 8050 + }, + { + "ce_loss_12": 3.124697983264923, + "ce_loss_17": 2.9692813038825987, + "ce_loss_23": 2.903523635864258, + "ce_loss_3": 3.9298166632652283, + "ce_loss_6": 3.5110038042068483, + "epoch": 0.806, + "grad_norm": 1200.0, + "kl_loss_12": 539.9032760620117, + "kl_loss_17": 159.1617202758789, + "kl_loss_3": 2204.6119384765625, + "kl_loss_6": 1373.1665283203124, + "learning_rate": 9.179372140119524e-05, + "loss": 1096.2607, + "step": 8060 + }, + { + "ce_loss_12": 3.0772274017333983, + "ce_loss_17": 2.9202582120895384, + "ce_loss_23": 2.8505948543548585, + "ce_loss_3": 3.9003460884094237, + "ce_loss_6": 3.4769996047019958, + "epoch": 0.807, + "grad_norm": 1096.0, + "kl_loss_12": 547.1208557128906, + "kl_loss_17": 160.2839126586914, + "kl_loss_3": 2238.119464111328, + "kl_loss_6": 1401.8871643066407, + "learning_rate": 9.087952946025175e-05, + "loss": 1107.101, + "step": 8070 + }, + { + "ce_loss_12": 3.172690415382385, + "ce_loss_17": 3.0231125354766846, + "ce_loss_23": 2.9546570420265197, + "ce_loss_3": 3.9569469451904298, + "ce_loss_6": 3.548564386367798, + "epoch": 0.808, + "grad_norm": 1064.0, + "kl_loss_12": 530.4381561279297, + "kl_loss_17": 158.45982284545897, + "kl_loss_3": 2170.6416198730467, + "kl_loss_6": 1350.5114440917969, + "learning_rate": 8.996945735790446e-05, + "loss": 1087.7603, + "step": 8080 + }, + { + "ce_loss_12": 3.078972852230072, + "ce_loss_17": 2.9214311003685, + "ce_loss_23": 2.856155252456665, + "ce_loss_3": 3.8913928270339966, + "ce_loss_6": 3.485938382148743, + "epoch": 0.809, + "grad_norm": 1464.0, + "kl_loss_12": 548.5154266357422, + "kl_loss_17": 161.0421516418457, + "kl_loss_3": 2236.966937255859, + "kl_loss_6": 1416.3121643066406, + "learning_rate": 8.906351425856951e-05, + "loss": 1105.2345, + "step": 8090 + }, + { + "ce_loss_12": 3.0690324544906615, + "ce_loss_17": 2.904653477668762, + "ce_loss_23": 2.8365507245063784, + "ce_loss_3": 3.9070201873779298, + "ce_loss_6": 3.4807681679725646, + "epoch": 0.81, + "grad_norm": 1384.0, + "kl_loss_12": 557.0683990478516, + "kl_loss_17": 162.01241226196288, + "kl_loss_3": 2309.0856811523436, + "kl_loss_6": 1445.5119323730469, + "learning_rate": 8.816170928508365e-05, + "loss": 1119.2719, + "step": 8100 + }, + { + "ce_loss_12": 3.042520451545715, + "ce_loss_17": 2.873482954502106, + "ce_loss_23": 2.805701220035553, + "ce_loss_3": 3.9062719345092773, + "ce_loss_6": 3.460271942615509, + "epoch": 0.811, + "grad_norm": 1064.0, + "kl_loss_12": 562.3566223144531, + "kl_loss_17": 162.64302597045898, + "kl_loss_3": 2333.633984375, + "kl_loss_6": 1452.9960693359376, + "learning_rate": 8.7264051518613e-05, + "loss": 1107.0025, + "step": 8110 + }, + { + "ce_loss_12": 3.11094753742218, + "ce_loss_17": 2.9495893716812134, + "ce_loss_23": 2.886719620227814, + "ce_loss_3": 3.9172590613365172, + "ce_loss_6": 3.5028425812721253, + "epoch": 0.812, + "grad_norm": 1112.0, + "kl_loss_12": 540.7414581298829, + "kl_loss_17": 157.58410263061523, + "kl_loss_3": 2204.1671936035154, + "kl_loss_6": 1378.162091064453, + "learning_rate": 8.637054999856148e-05, + "loss": 1087.2692, + "step": 8120 + }, + { + "ce_loss_12": 3.1143086314201356, + "ce_loss_17": 2.9487848043441773, + "ce_loss_23": 2.879635775089264, + "ce_loss_3": 3.938987469673157, + "ce_loss_6": 3.5184776306152346, + "epoch": 0.813, + "grad_norm": 884.0, + "kl_loss_12": 557.1480926513672, + "kl_loss_17": 163.79063415527344, + "kl_loss_3": 2253.558184814453, + "kl_loss_6": 1420.3731384277344, + "learning_rate": 8.548121372247918e-05, + "loss": 1111.1787, + "step": 8130 + }, + { + "ce_loss_12": 3.1705002427101134, + "ce_loss_17": 3.0160479664802553, + "ce_loss_23": 2.9523680925369264, + "ce_loss_3": 3.969861078262329, + "ce_loss_6": 3.5573942184448244, + "epoch": 0.814, + "grad_norm": 1216.0, + "kl_loss_12": 543.2226989746093, + "kl_loss_17": 160.13306884765626, + "kl_loss_3": 2231.438720703125, + "kl_loss_6": 1395.8012634277343, + "learning_rate": 8.459605164597267e-05, + "loss": 1084.0575, + "step": 8140 + }, + { + "ce_loss_12": 3.066420042514801, + "ce_loss_17": 2.9058117270469666, + "ce_loss_23": 2.842375338077545, + "ce_loss_3": 3.897522139549255, + "ce_loss_6": 3.476026487350464, + "epoch": 0.815, + "grad_norm": 1224.0, + "kl_loss_12": 548.8289077758789, + "kl_loss_17": 159.9200454711914, + "kl_loss_3": 2262.31728515625, + "kl_loss_6": 1419.397479248047, + "learning_rate": 8.371507268261436e-05, + "loss": 1109.2594, + "step": 8150 + }, + { + "ce_loss_12": 3.1309832453727724, + "ce_loss_17": 2.9698503971099854, + "ce_loss_23": 2.900208055973053, + "ce_loss_3": 3.9546516060829164, + "ce_loss_6": 3.5335537910461428, + "epoch": 0.816, + "grad_norm": 848.0, + "kl_loss_12": 545.8640365600586, + "kl_loss_17": 161.9612693786621, + "kl_loss_3": 2239.576623535156, + "kl_loss_6": 1402.9355529785157, + "learning_rate": 8.283828570385238e-05, + "loss": 1072.7236, + "step": 8160 + }, + { + "ce_loss_12": 3.1363924384117126, + "ce_loss_17": 2.9693047285079954, + "ce_loss_23": 2.9015498042106627, + "ce_loss_3": 3.959295666217804, + "ce_loss_6": 3.535193419456482, + "epoch": 0.817, + "grad_norm": 1120.0, + "kl_loss_12": 547.0559875488282, + "kl_loss_17": 160.7385581970215, + "kl_loss_3": 2233.6774658203126, + "kl_loss_6": 1393.712664794922, + "learning_rate": 8.196569953892202e-05, + "loss": 1089.5018, + "step": 8170 + }, + { + "ce_loss_12": 3.0679932117462156, + "ce_loss_17": 2.8993886709213257, + "ce_loss_23": 2.8328171491622927, + "ce_loss_3": 3.894995379447937, + "ce_loss_6": 3.4704811215400695, + "epoch": 0.818, + "grad_norm": 1232.0, + "kl_loss_12": 558.6704147338867, + "kl_loss_17": 162.2000717163086, + "kl_loss_3": 2243.635137939453, + "kl_loss_6": 1417.9678466796875, + "learning_rate": 8.109732297475635e-05, + "loss": 1089.0442, + "step": 8180 + }, + { + "ce_loss_12": 3.056806230545044, + "ce_loss_17": 2.8779824495315554, + "ce_loss_23": 2.807134783267975, + "ce_loss_3": 3.936583161354065, + "ce_loss_6": 3.4898857831954957, + "epoch": 0.819, + "grad_norm": 1896.0, + "kl_loss_12": 578.1514053344727, + "kl_loss_17": 166.74496994018554, + "kl_loss_3": 2355.654931640625, + "kl_loss_6": 1488.9981994628906, + "learning_rate": 8.023316475589754e-05, + "loss": 1128.7449, + "step": 8190 + }, + { + "ce_loss_12": 3.02653945684433, + "ce_loss_17": 2.8479262471199034, + "ce_loss_23": 2.7727146625518797, + "ce_loss_3": 3.9359703540802, + "ce_loss_6": 3.468432629108429, + "epoch": 0.82, + "grad_norm": 1560.0, + "kl_loss_12": 595.1294815063477, + "kl_loss_17": 174.096484375, + "kl_loss_3": 2443.1336364746094, + "kl_loss_6": 1527.0234619140624, + "learning_rate": 7.937323358440934e-05, + "loss": 1148.4727, + "step": 8200 + }, + { + "ce_loss_12": 3.1162544965744017, + "ce_loss_17": 2.9630125164985657, + "ce_loss_23": 2.8993138432502747, + "ce_loss_3": 3.911135995388031, + "ce_loss_6": 3.496608221530914, + "epoch": 0.821, + "grad_norm": 1152.0, + "kl_loss_12": 544.4355880737305, + "kl_loss_17": 159.98213882446288, + "kl_loss_3": 2191.1896850585936, + "kl_loss_6": 1374.475323486328, + "learning_rate": 7.851753811978923e-05, + "loss": 1085.2391, + "step": 8210 + }, + { + "ce_loss_12": 3.1299094915390016, + "ce_loss_17": 2.9677687168121336, + "ce_loss_23": 2.899359703063965, + "ce_loss_3": 3.9690559029579164, + "ce_loss_6": 3.5370683073997498, + "epoch": 0.822, + "grad_norm": 1192.0, + "kl_loss_12": 551.8927993774414, + "kl_loss_17": 164.45097732543945, + "kl_loss_3": 2285.034063720703, + "kl_loss_6": 1431.9157836914062, + "learning_rate": 7.766608697888095e-05, + "loss": 1093.2594, + "step": 8220 + }, + { + "ce_loss_12": 3.147523856163025, + "ce_loss_17": 2.983336317539215, + "ce_loss_23": 2.9168429255485533, + "ce_loss_3": 3.981073188781738, + "ce_loss_6": 3.5518638849258424, + "epoch": 0.823, + "grad_norm": 840.0, + "kl_loss_12": 556.6430786132812, + "kl_loss_17": 164.469490814209, + "kl_loss_3": 2287.3015747070312, + "kl_loss_6": 1432.2851135253907, + "learning_rate": 7.681888873578785e-05, + "loss": 1117.138, + "step": 8230 + }, + { + "ce_loss_12": 3.081748139858246, + "ce_loss_17": 2.909856748580933, + "ce_loss_23": 2.8364683270454405, + "ce_loss_3": 3.924388611316681, + "ce_loss_6": 3.488308036327362, + "epoch": 0.824, + "grad_norm": 1272.0, + "kl_loss_12": 572.6440856933593, + "kl_loss_17": 169.69739227294923, + "kl_loss_3": 2308.4847412109375, + "kl_loss_6": 1448.6083068847656, + "learning_rate": 7.597595192178702e-05, + "loss": 1103.9938, + "step": 8240 + }, + { + "ce_loss_12": 3.0852334976196287, + "ce_loss_17": 2.913503646850586, + "ce_loss_23": 2.841702175140381, + "ce_loss_3": 3.9479007720947266, + "ce_loss_6": 3.508209836483002, + "epoch": 0.825, + "grad_norm": 1488.0, + "kl_loss_12": 563.913801574707, + "kl_loss_17": 165.11235427856445, + "kl_loss_3": 2359.2099426269533, + "kl_loss_6": 1477.1159729003907, + "learning_rate": 7.513728502524286e-05, + "loss": 1129.6759, + "step": 8250 + }, + { + "ce_loss_12": 3.065532147884369, + "ce_loss_17": 2.9113979578018188, + "ce_loss_23": 2.8476172029972076, + "ce_loss_3": 3.8924211502075194, + "ce_loss_6": 3.4671952962875365, + "epoch": 0.826, + "grad_norm": 1104.0, + "kl_loss_12": 533.2513366699219, + "kl_loss_17": 155.2083396911621, + "kl_loss_3": 2228.1499267578124, + "kl_loss_6": 1379.5601379394532, + "learning_rate": 7.430289649152156e-05, + "loss": 1097.4211, + "step": 8260 + }, + { + "ce_loss_12": 2.9995575308799745, + "ce_loss_17": 2.828830623626709, + "ce_loss_23": 2.7601626992225645, + "ce_loss_3": 3.8649712085723875, + "ce_loss_6": 3.4307517409324646, + "epoch": 0.827, + "grad_norm": 1384.0, + "kl_loss_12": 565.3962280273438, + "kl_loss_17": 162.2247215270996, + "kl_loss_3": 2362.8103149414064, + "kl_loss_6": 1491.5337951660156, + "learning_rate": 7.347279472290646e-05, + "loss": 1109.859, + "step": 8270 + }, + { + "ce_loss_12": 3.122376787662506, + "ce_loss_17": 2.9612316250801087, + "ce_loss_23": 2.8959510564804076, + "ce_loss_3": 3.9570825338363647, + "ce_loss_6": 3.531812274456024, + "epoch": 0.828, + "grad_norm": 1064.0, + "kl_loss_12": 554.3373107910156, + "kl_loss_17": 162.3391372680664, + "kl_loss_3": 2278.9635681152345, + "kl_loss_6": 1429.607843017578, + "learning_rate": 7.264698807851328e-05, + "loss": 1109.9259, + "step": 8280 + }, + { + "ce_loss_12": 3.0933347225189207, + "ce_loss_17": 2.9319000482559203, + "ce_loss_23": 2.8684099674224854, + "ce_loss_3": 3.899102711677551, + "ce_loss_6": 3.483800983428955, + "epoch": 0.829, + "grad_norm": 1048.0, + "kl_loss_12": 544.1003021240234, + "kl_loss_17": 159.55711669921874, + "kl_loss_3": 2213.3914123535155, + "kl_loss_6": 1387.0603576660155, + "learning_rate": 7.182548487420554e-05, + "loss": 1087.8375, + "step": 8290 + }, + { + "ce_loss_12": 3.141780412197113, + "ce_loss_17": 2.9783580780029295, + "ce_loss_23": 2.909892702102661, + "ce_loss_3": 3.9592021346092223, + "ce_loss_6": 3.5420392751693726, + "epoch": 0.83, + "grad_norm": 1072.0, + "kl_loss_12": 560.5811340332032, + "kl_loss_17": 165.19401168823242, + "kl_loss_3": 2255.7765991210936, + "kl_loss_6": 1421.8934265136718, + "learning_rate": 7.100829338251146e-05, + "loss": 1093.7117, + "step": 8300 + }, + { + "ce_loss_12": 3.08468918800354, + "ce_loss_17": 2.9131301283836364, + "ce_loss_23": 2.840294587612152, + "ce_loss_3": 3.9290374636650087, + "ce_loss_6": 3.5016937017440797, + "epoch": 0.831, + "grad_norm": 1352.0, + "kl_loss_12": 566.9378234863282, + "kl_loss_17": 166.56872940063477, + "kl_loss_3": 2305.8603698730467, + "kl_loss_6": 1455.0379943847656, + "learning_rate": 7.019542183254046e-05, + "loss": 1097.4264, + "step": 8310 + }, + { + "ce_loss_12": 3.1170966267585754, + "ce_loss_17": 2.953399920463562, + "ce_loss_23": 2.879462385177612, + "ce_loss_3": 3.928554356098175, + "ce_loss_6": 3.514261078834534, + "epoch": 0.832, + "grad_norm": 1384.0, + "kl_loss_12": 566.4154647827148, + "kl_loss_17": 171.7959846496582, + "kl_loss_3": 2248.6452697753907, + "kl_loss_6": 1419.4371704101563, + "learning_rate": 6.938687840989971e-05, + "loss": 1096.3621, + "step": 8320 + }, + { + "ce_loss_12": 3.0669469594955445, + "ce_loss_17": 2.8998965978622437, + "ce_loss_23": 2.829956316947937, + "ce_loss_3": 3.893748676776886, + "ce_loss_6": 3.4692044615745545, + "epoch": 0.833, + "grad_norm": 1184.0, + "kl_loss_12": 560.9413757324219, + "kl_loss_17": 168.15377655029297, + "kl_loss_3": 2256.4309631347655, + "kl_loss_6": 1422.266094970703, + "learning_rate": 6.858267125661271e-05, + "loss": 1113.0926, + "step": 8330 + }, + { + "ce_loss_12": 3.1150636672973633, + "ce_loss_17": 2.9481117963790893, + "ce_loss_23": 2.8806458711624146, + "ce_loss_3": 3.9465166568756103, + "ce_loss_6": 3.518279528617859, + "epoch": 0.834, + "grad_norm": 1504.0, + "kl_loss_12": 552.7582366943359, + "kl_loss_17": 160.30242538452148, + "kl_loss_3": 2264.86318359375, + "kl_loss_6": 1412.4846130371093, + "learning_rate": 6.778280847103668e-05, + "loss": 1118.2813, + "step": 8340 + }, + { + "ce_loss_12": 3.1263239979743958, + "ce_loss_17": 2.958054792881012, + "ce_loss_23": 2.8883840918540953, + "ce_loss_3": 3.933966672420502, + "ce_loss_6": 3.5245227932929994, + "epoch": 0.835, + "grad_norm": 992.0, + "kl_loss_12": 561.3988143920899, + "kl_loss_17": 165.34767684936523, + "kl_loss_3": 2257.2742126464846, + "kl_loss_6": 1430.91298828125, + "learning_rate": 6.698729810778065e-05, + "loss": 1096.0061, + "step": 8350 + }, + { + "ce_loss_12": 3.0373926401138305, + "ce_loss_17": 2.872546947002411, + "ce_loss_23": 2.806998634338379, + "ce_loss_3": 3.8821595907211304, + "ce_loss_6": 3.442913126945496, + "epoch": 0.836, + "grad_norm": 1848.0, + "kl_loss_12": 548.8032333374024, + "kl_loss_17": 158.58746490478515, + "kl_loss_3": 2281.4795837402344, + "kl_loss_6": 1414.729718017578, + "learning_rate": 6.619614817762538e-05, + "loss": 1104.8056, + "step": 8360 + }, + { + "ce_loss_12": 3.0213045954704283, + "ce_loss_17": 2.8510085225105284, + "ce_loss_23": 2.781916630268097, + "ce_loss_3": 3.9031874537467957, + "ce_loss_6": 3.4542742133140565, + "epoch": 0.837, + "grad_norm": 1168.0, + "kl_loss_12": 568.2625747680664, + "kl_loss_17": 162.45321197509764, + "kl_loss_3": 2375.975720214844, + "kl_loss_6": 1487.9389587402343, + "learning_rate": 6.540936664744196e-05, + "loss": 1126.8619, + "step": 8370 + }, + { + "ce_loss_12": 3.144019269943237, + "ce_loss_17": 2.978832817077637, + "ce_loss_23": 2.911532390117645, + "ce_loss_3": 3.98069965839386, + "ce_loss_6": 3.55452219247818, + "epoch": 0.838, + "grad_norm": 1008.0, + "kl_loss_12": 555.9855712890625, + "kl_loss_17": 163.2048194885254, + "kl_loss_3": 2276.542120361328, + "kl_loss_6": 1436.4474548339845, + "learning_rate": 6.462696144011149e-05, + "loss": 1090.6496, + "step": 8380 + }, + { + "ce_loss_12": 3.1107449650764467, + "ce_loss_17": 2.942846190929413, + "ce_loss_23": 2.8735639452934265, + "ce_loss_3": 3.915558421611786, + "ce_loss_6": 3.513276982307434, + "epoch": 0.839, + "grad_norm": 1272.0, + "kl_loss_12": 563.3623809814453, + "kl_loss_17": 164.73463973999023, + "kl_loss_3": 2243.006005859375, + "kl_loss_6": 1425.6291137695312, + "learning_rate": 6.384894043444567e-05, + "loss": 1083.7996, + "step": 8390 + }, + { + "ce_loss_12": 3.122048246860504, + "ce_loss_17": 2.9576640486717225, + "ce_loss_23": 2.8880361676216126, + "ce_loss_3": 3.957304573059082, + "ce_loss_6": 3.540115237236023, + "epoch": 0.84, + "grad_norm": 1040.0, + "kl_loss_12": 560.0843856811523, + "kl_loss_17": 163.13331756591796, + "kl_loss_3": 2267.2568115234376, + "kl_loss_6": 1435.8717346191406, + "learning_rate": 6.307531146510753e-05, + "loss": 1098.984, + "step": 8400 + }, + { + "ce_loss_12": 3.102569282054901, + "ce_loss_17": 2.9388603925704957, + "ce_loss_23": 2.8691525936126707, + "ce_loss_3": 3.9057474017143248, + "ce_loss_6": 3.5014272809028624, + "epoch": 0.841, + "grad_norm": 988.0, + "kl_loss_12": 547.4675140380859, + "kl_loss_17": 163.2883728027344, + "kl_loss_3": 2202.335076904297, + "kl_loss_6": 1394.86650390625, + "learning_rate": 6.230608232253226e-05, + "loss": 1072.458, + "step": 8410 + }, + { + "ce_loss_12": 3.0702186107635496, + "ce_loss_17": 2.896703338623047, + "ce_loss_23": 2.8263700127601625, + "ce_loss_3": 3.9404627799987795, + "ce_loss_6": 3.501567506790161, + "epoch": 0.842, + "grad_norm": 1096.0, + "kl_loss_12": 567.8404357910156, + "kl_loss_17": 165.14807815551757, + "kl_loss_3": 2338.825054931641, + "kl_loss_6": 1470.2890380859376, + "learning_rate": 6.154126075284855e-05, + "loss": 1104.2164, + "step": 8420 + }, + { + "ce_loss_12": 3.1461366176605225, + "ce_loss_17": 2.9891268968582154, + "ce_loss_23": 2.92186838388443, + "ce_loss_3": 3.945695734024048, + "ce_loss_6": 3.5478767275810243, + "epoch": 0.843, + "grad_norm": 1832.0, + "kl_loss_12": 541.3987503051758, + "kl_loss_17": 159.74902267456054, + "kl_loss_3": 2196.7286682128906, + "kl_loss_6": 1398.2157592773438, + "learning_rate": 6.078085445780129e-05, + "loss": 1071.0024, + "step": 8430 + }, + { + "ce_loss_12": 3.1509676933288575, + "ce_loss_17": 2.986407232284546, + "ce_loss_23": 2.92077054977417, + "ce_loss_3": 3.9918720960617065, + "ce_loss_6": 3.5571364164352417, + "epoch": 0.844, + "grad_norm": 1080.0, + "kl_loss_12": 554.0682266235351, + "kl_loss_17": 162.6667366027832, + "kl_loss_3": 2300.378106689453, + "kl_loss_6": 1428.4713317871094, + "learning_rate": 6.002487109467347e-05, + "loss": 1087.3066, + "step": 8440 + }, + { + "ce_loss_12": 3.161572754383087, + "ce_loss_17": 2.9950670480728148, + "ce_loss_23": 2.926289701461792, + "ce_loss_3": 3.97816481590271, + "ce_loss_6": 3.559042716026306, + "epoch": 0.845, + "grad_norm": 1608.0, + "kl_loss_12": 566.6126342773438, + "kl_loss_17": 167.85883026123048, + "kl_loss_3": 2266.07783203125, + "kl_loss_6": 1422.8419982910157, + "learning_rate": 5.927331827620902e-05, + "loss": 1092.1502, + "step": 8450 + }, + { + "ce_loss_12": 3.143201971054077, + "ce_loss_17": 2.979695534706116, + "ce_loss_23": 2.9125229477882386, + "ce_loss_3": 3.921801245212555, + "ce_loss_6": 3.5282947778701783, + "epoch": 0.846, + "grad_norm": 976.0, + "kl_loss_12": 546.6759765625, + "kl_loss_17": 160.7653564453125, + "kl_loss_3": 2170.8586669921874, + "kl_loss_6": 1373.1689147949219, + "learning_rate": 5.852620357053651e-05, + "loss": 1081.7949, + "step": 8460 + }, + { + "ce_loss_12": 3.1741357922554014, + "ce_loss_17": 3.0152151346206666, + "ce_loss_23": 2.9533477783203126, + "ce_loss_3": 3.977063298225403, + "ce_loss_6": 3.564035105705261, + "epoch": 0.847, + "grad_norm": 864.0, + "kl_loss_12": 546.3927154541016, + "kl_loss_17": 158.32703094482423, + "kl_loss_3": 2209.043560791016, + "kl_loss_6": 1389.9683044433593, + "learning_rate": 5.778353450109286e-05, + "loss": 1086.1645, + "step": 8470 + }, + { + "ce_loss_12": 3.215981650352478, + "ce_loss_17": 3.047750270366669, + "ce_loss_23": 2.978085231781006, + "ce_loss_3": 4.040736985206604, + "ce_loss_6": 3.6170117974281313, + "epoch": 0.848, + "grad_norm": 1320.0, + "kl_loss_12": 560.3518157958985, + "kl_loss_17": 165.00484771728514, + "kl_loss_3": 2262.4346435546877, + "kl_loss_6": 1421.1817565917968, + "learning_rate": 5.7045318546547206e-05, + "loss": 1089.9925, + "step": 8480 + }, + { + "ce_loss_12": 3.11188143491745, + "ce_loss_17": 2.9473851919174194, + "ce_loss_23": 2.879114067554474, + "ce_loss_3": 3.9515628933906557, + "ce_loss_6": 3.5185172319412232, + "epoch": 0.849, + "grad_norm": 1352.0, + "kl_loss_12": 550.3204238891601, + "kl_loss_17": 161.58224029541014, + "kl_loss_3": 2280.7270629882814, + "kl_loss_6": 1424.6280517578125, + "learning_rate": 5.631156314072605e-05, + "loss": 1091.9666, + "step": 8490 + }, + { + "ce_loss_12": 3.1408272266387938, + "ce_loss_17": 2.9797823429107666, + "ce_loss_23": 2.912586140632629, + "ce_loss_3": 3.930195379257202, + "ce_loss_6": 3.519640862941742, + "epoch": 0.85, + "grad_norm": 1440.0, + "kl_loss_12": 546.5631805419922, + "kl_loss_17": 161.9244026184082, + "kl_loss_3": 2195.800213623047, + "kl_loss_6": 1377.4348999023437, + "learning_rate": 5.5582275672538315e-05, + "loss": 1073.6294, + "step": 8500 + }, + { + "ce_loss_12": 3.0622190117835997, + "ce_loss_17": 2.8910266757011414, + "ce_loss_23": 2.8219680070877073, + "ce_loss_3": 3.942839562892914, + "ce_loss_6": 3.5015220046043396, + "epoch": 0.851, + "grad_norm": 1160.0, + "kl_loss_12": 571.0053894042969, + "kl_loss_17": 166.4903663635254, + "kl_loss_3": 2369.3764770507814, + "kl_loss_6": 1494.2642456054687, + "learning_rate": 5.4857463485900484e-05, + "loss": 1121.9272, + "step": 8510 + }, + { + "ce_loss_12": 3.111485755443573, + "ce_loss_17": 2.946943771839142, + "ce_loss_23": 2.880238139629364, + "ce_loss_3": 3.9151260018348695, + "ce_loss_6": 3.5119995951652525, + "epoch": 0.852, + "grad_norm": 1352.0, + "kl_loss_12": 552.024201965332, + "kl_loss_17": 161.01301116943358, + "kl_loss_3": 2224.1720764160154, + "kl_loss_6": 1411.4604736328124, + "learning_rate": 5.413713387966329e-05, + "loss": 1087.1721, + "step": 8520 + }, + { + "ce_loss_12": 3.1266626834869387, + "ce_loss_17": 2.9612136125564574, + "ce_loss_23": 2.894944739341736, + "ce_loss_3": 3.9596535682678224, + "ce_loss_6": 3.5332515597343446, + "epoch": 0.853, + "grad_norm": 1272.0, + "kl_loss_12": 554.592903137207, + "kl_loss_17": 161.4539581298828, + "kl_loss_3": 2281.1273559570313, + "kl_loss_6": 1426.2618041992187, + "learning_rate": 5.34212941075381e-05, + "loss": 1097.3725, + "step": 8530 + }, + { + "ce_loss_12": 3.1214884757995605, + "ce_loss_17": 2.969010281562805, + "ce_loss_23": 2.906864821910858, + "ce_loss_3": 3.92522656917572, + "ce_loss_6": 3.504314923286438, + "epoch": 0.854, + "grad_norm": 1536.0, + "kl_loss_12": 526.8193008422852, + "kl_loss_17": 155.95495262145997, + "kl_loss_3": 2192.919073486328, + "kl_loss_6": 1359.222833251953, + "learning_rate": 5.270995137802315e-05, + "loss": 1074.2971, + "step": 8540 + }, + { + "ce_loss_12": 3.0763959646224976, + "ce_loss_17": 2.916470468044281, + "ce_loss_23": 2.852141809463501, + "ce_loss_3": 3.8961158752441407, + "ce_loss_6": 3.4745135068893434, + "epoch": 0.855, + "grad_norm": 1320.0, + "kl_loss_12": 546.9484146118164, + "kl_loss_17": 158.56359329223633, + "kl_loss_3": 2248.6607177734377, + "kl_loss_6": 1404.589617919922, + "learning_rate": 5.2003112854332125e-05, + "loss": 1098.8508, + "step": 8550 + }, + { + "ce_loss_12": 3.063506841659546, + "ce_loss_17": 2.909021496772766, + "ce_loss_23": 2.846612310409546, + "ce_loss_3": 3.874693489074707, + "ce_loss_6": 3.464722013473511, + "epoch": 0.856, + "grad_norm": 1168.0, + "kl_loss_12": 540.1203262329102, + "kl_loss_17": 155.96851348876953, + "kl_loss_3": 2231.803356933594, + "kl_loss_6": 1405.0531921386719, + "learning_rate": 5.130078565432089e-05, + "loss": 1069.7745, + "step": 8560 + }, + { + "ce_loss_12": 3.121432435512543, + "ce_loss_17": 2.970027732849121, + "ce_loss_23": 2.906119704246521, + "ce_loss_3": 3.9241920351982116, + "ce_loss_6": 3.513047194480896, + "epoch": 0.857, + "grad_norm": 1136.0, + "kl_loss_12": 533.2334243774415, + "kl_loss_17": 155.81883239746094, + "kl_loss_3": 2196.0322692871096, + "kl_loss_6": 1381.9104248046874, + "learning_rate": 5.060297685041659e-05, + "loss": 1062.4382, + "step": 8570 + }, + { + "ce_loss_12": 3.0725590705871584, + "ce_loss_17": 2.905340886116028, + "ce_loss_23": 2.8341203689575196, + "ce_loss_3": 3.914118731021881, + "ce_loss_6": 3.4800498366355894, + "epoch": 0.858, + "grad_norm": 1168.0, + "kl_loss_12": 562.4660766601562, + "kl_loss_17": 166.77214736938475, + "kl_loss_3": 2297.951123046875, + "kl_loss_6": 1436.9736083984376, + "learning_rate": 4.99096934695461e-05, + "loss": 1113.8425, + "step": 8580 + }, + { + "ce_loss_12": 3.129444992542267, + "ce_loss_17": 2.963559329509735, + "ce_loss_23": 2.8960996150970457, + "ce_loss_3": 3.952687406539917, + "ce_loss_6": 3.5332820296287535, + "epoch": 0.859, + "grad_norm": 1120.0, + "kl_loss_12": 542.9299270629883, + "kl_loss_17": 158.55432205200196, + "kl_loss_3": 2231.0780334472656, + "kl_loss_6": 1403.0755798339844, + "learning_rate": 4.922094249306558e-05, + "loss": 1077.5844, + "step": 8590 + }, + { + "ce_loss_12": 3.1639073491096497, + "ce_loss_17": 2.996286356449127, + "ce_loss_23": 2.928344798088074, + "ce_loss_3": 3.9905553221702577, + "ce_loss_6": 3.559040880203247, + "epoch": 0.86, + "grad_norm": 988.0, + "kl_loss_12": 557.205712890625, + "kl_loss_17": 164.4212745666504, + "kl_loss_3": 2263.422235107422, + "kl_loss_6": 1409.6866027832032, + "learning_rate": 4.853673085668947e-05, + "loss": 1076.0809, + "step": 8600 + }, + { + "ce_loss_12": 3.1688145637512206, + "ce_loss_17": 3.006110095977783, + "ce_loss_23": 2.9394213199615478, + "ce_loss_3": 3.99489278793335, + "ce_loss_6": 3.5717530608177186, + "epoch": 0.861, + "grad_norm": 1152.0, + "kl_loss_12": 547.7848281860352, + "kl_loss_17": 159.71785202026368, + "kl_loss_3": 2251.6465454101562, + "kl_loss_6": 1418.1878173828125, + "learning_rate": 4.78570654504214e-05, + "loss": 1093.2465, + "step": 8610 + }, + { + "ce_loss_12": 3.119960296154022, + "ce_loss_17": 2.9601327061653135, + "ce_loss_23": 2.8931104063987734, + "ce_loss_3": 3.9442036390304565, + "ce_loss_6": 3.524333989620209, + "epoch": 0.862, + "grad_norm": 1168.0, + "kl_loss_12": 543.3126388549805, + "kl_loss_17": 159.60537643432616, + "kl_loss_3": 2246.9543395996093, + "kl_loss_6": 1413.2603393554687, + "learning_rate": 4.7181953118484556e-05, + "loss": 1093.0124, + "step": 8620 + }, + { + "ce_loss_12": 3.1447961807250975, + "ce_loss_17": 2.981061267852783, + "ce_loss_23": 2.919272768497467, + "ce_loss_3": 3.950004208087921, + "ce_loss_6": 3.536801242828369, + "epoch": 0.863, + "grad_norm": 1520.0, + "kl_loss_12": 546.9749267578125, + "kl_loss_17": 158.86413192749023, + "kl_loss_3": 2193.546746826172, + "kl_loss_6": 1393.0187927246093, + "learning_rate": 4.651140065925269e-05, + "loss": 1102.2883, + "step": 8630 + }, + { + "ce_loss_12": 3.0848291754722594, + "ce_loss_17": 2.919208121299744, + "ce_loss_23": 2.852880430221558, + "ce_loss_3": 3.9040238738059996, + "ce_loss_6": 3.4828606605529786, + "epoch": 0.864, + "grad_norm": 1004.0, + "kl_loss_12": 550.5586502075196, + "kl_loss_17": 161.43046188354492, + "kl_loss_3": 2262.796240234375, + "kl_loss_6": 1411.1244506835938, + "learning_rate": 4.58454148251814e-05, + "loss": 1104.4396, + "step": 8640 + }, + { + "ce_loss_12": 3.0967994570732116, + "ce_loss_17": 2.9280046820640564, + "ce_loss_23": 2.8609217524528505, + "ce_loss_3": 3.950021493434906, + "ce_loss_6": 3.5200977087020875, + "epoch": 0.865, + "grad_norm": 1128.0, + "kl_loss_12": 551.6484832763672, + "kl_loss_17": 160.92717361450195, + "kl_loss_3": 2294.815301513672, + "kl_loss_6": 1441.8151733398438, + "learning_rate": 4.518400232274078e-05, + "loss": 1098.8965, + "step": 8650 + }, + { + "ce_loss_12": 3.1297909736633303, + "ce_loss_17": 2.9608248710632323, + "ce_loss_23": 2.8919439792633055, + "ce_loss_3": 3.9411178827285767, + "ce_loss_6": 3.523774802684784, + "epoch": 0.866, + "grad_norm": 1288.0, + "kl_loss_12": 555.529444885254, + "kl_loss_17": 163.9142303466797, + "kl_loss_3": 2240.0380859375, + "kl_loss_6": 1405.0944458007812, + "learning_rate": 4.452716981234745e-05, + "loss": 1066.0305, + "step": 8660 + }, + { + "ce_loss_12": 3.0957505226135256, + "ce_loss_17": 2.9325799107551576, + "ce_loss_23": 2.868885409832001, + "ce_loss_3": 3.90879647731781, + "ce_loss_6": 3.490975868701935, + "epoch": 0.867, + "grad_norm": 1496.0, + "kl_loss_12": 539.2698043823242, + "kl_loss_17": 156.92785415649413, + "kl_loss_3": 2228.797253417969, + "kl_loss_6": 1396.8384155273438, + "learning_rate": 4.3874923908297335e-05, + "loss": 1067.8451, + "step": 8670 + }, + { + "ce_loss_12": 3.145753872394562, + "ce_loss_17": 2.983182764053345, + "ce_loss_23": 2.9180320858955384, + "ce_loss_3": 3.9800577998161315, + "ce_loss_6": 3.5526066422462463, + "epoch": 0.868, + "grad_norm": 1152.0, + "kl_loss_12": 555.1043502807618, + "kl_loss_17": 162.09677505493164, + "kl_loss_3": 2282.174005126953, + "kl_loss_6": 1435.6018798828125, + "learning_rate": 4.322727117869951e-05, + "loss": 1089.9818, + "step": 8680 + }, + { + "ce_loss_12": 3.153953719139099, + "ce_loss_17": 2.9918410778045654, + "ce_loss_23": 2.9258020162582397, + "ce_loss_3": 3.9849618911743163, + "ce_loss_6": 3.5556876063346863, + "epoch": 0.869, + "grad_norm": 1352.0, + "kl_loss_12": 553.2969848632813, + "kl_loss_17": 161.93497543334962, + "kl_loss_3": 2279.2611877441404, + "kl_loss_6": 1421.8425659179688, + "learning_rate": 4.2584218145409916e-05, + "loss": 1091.4147, + "step": 8690 + }, + { + "ce_loss_12": 3.1842427015304566, + "ce_loss_17": 3.0284349799156187, + "ce_loss_23": 2.965789806842804, + "ce_loss_3": 3.970459282398224, + "ce_loss_6": 3.56220383644104, + "epoch": 0.87, + "grad_norm": 1632.0, + "kl_loss_12": 535.7023223876953, + "kl_loss_17": 156.79924621582032, + "kl_loss_3": 2181.0572875976563, + "kl_loss_6": 1367.7674987792968, + "learning_rate": 4.194577128396521e-05, + "loss": 1061.4873, + "step": 8700 + }, + { + "ce_loss_12": 3.082056391239166, + "ce_loss_17": 2.9180865406990053, + "ce_loss_23": 2.8544756293296816, + "ce_loss_3": 3.906729853153229, + "ce_loss_6": 3.489392650127411, + "epoch": 0.871, + "grad_norm": 1048.0, + "kl_loss_12": 541.2103164672851, + "kl_loss_17": 157.3700843811035, + "kl_loss_3": 2242.8687072753905, + "kl_loss_6": 1410.3545532226562, + "learning_rate": 4.1311937023518264e-05, + "loss": 1099.8197, + "step": 8710 + }, + { + "ce_loss_12": 3.0946294546127318, + "ce_loss_17": 2.940945291519165, + "ce_loss_23": 2.877529966831207, + "ce_loss_3": 3.9572771549224854, + "ce_loss_6": 3.518484950065613, + "epoch": 0.872, + "grad_norm": 1456.0, + "kl_loss_12": 531.6827041625977, + "kl_loss_17": 154.827400970459, + "kl_loss_3": 2319.347772216797, + "kl_loss_6": 1448.127685546875, + "learning_rate": 4.0682721746773344e-05, + "loss": 1091.5121, + "step": 8720 + }, + { + "ce_loss_12": 2.9802655577659607, + "ce_loss_17": 2.812234103679657, + "ce_loss_23": 2.7481626510620116, + "ce_loss_3": 3.840736758708954, + "ce_loss_6": 3.4024895906448362, + "epoch": 0.873, + "grad_norm": 1120.0, + "kl_loss_12": 549.6573318481445, + "kl_loss_17": 158.25880508422853, + "kl_loss_3": 2307.917236328125, + "kl_loss_6": 1443.5062072753906, + "learning_rate": 4.0058131789920904e-05, + "loss": 1085.316, + "step": 8730 + }, + { + "ce_loss_12": 3.1177361249923705, + "ce_loss_17": 2.956912469863892, + "ce_loss_23": 2.89218031167984, + "ce_loss_3": 3.924745261669159, + "ce_loss_6": 3.5143105387687683, + "epoch": 0.874, + "grad_norm": 1208.0, + "kl_loss_12": 543.285775756836, + "kl_loss_17": 156.44263153076173, + "kl_loss_3": 2231.4059814453126, + "kl_loss_6": 1405.4261901855468, + "learning_rate": 3.9438173442575e-05, + "loss": 1115.9526, + "step": 8740 + }, + { + "ce_loss_12": 3.147689700126648, + "ce_loss_17": 2.980556881427765, + "ce_loss_23": 2.911785435676575, + "ce_loss_3": 3.9486439347267153, + "ce_loss_6": 3.533434844017029, + "epoch": 0.875, + "grad_norm": 980.0, + "kl_loss_12": 547.4045776367187, + "kl_loss_17": 159.73441162109376, + "kl_loss_3": 2201.325646972656, + "kl_loss_6": 1382.7564514160156, + "learning_rate": 3.882285294770937e-05, + "loss": 1078.3527, + "step": 8750 + }, + { + "ce_loss_12": 3.101701581478119, + "ce_loss_17": 2.9411001324653627, + "ce_loss_23": 2.8762433409690855, + "ce_loss_3": 3.8907504320144652, + "ce_loss_6": 3.489359402656555, + "epoch": 0.876, + "grad_norm": 1056.0, + "kl_loss_12": 537.6340866088867, + "kl_loss_17": 158.29504013061523, + "kl_loss_3": 2183.561358642578, + "kl_loss_6": 1377.058038330078, + "learning_rate": 3.821217650159453e-05, + "loss": 1088.5118, + "step": 8760 + }, + { + "ce_loss_12": 3.000348138809204, + "ce_loss_17": 2.830567252635956, + "ce_loss_23": 2.764021897315979, + "ce_loss_3": 3.8653979897499084, + "ce_loss_6": 3.4257630705833435, + "epoch": 0.877, + "grad_norm": 1464.0, + "kl_loss_12": 559.0526672363281, + "kl_loss_17": 160.97410430908204, + "kl_loss_3": 2322.0770874023438, + "kl_loss_6": 1458.2523071289063, + "learning_rate": 3.760615025373543e-05, + "loss": 1102.473, + "step": 8770 + }, + { + "ce_loss_12": 3.1634358167648315, + "ce_loss_17": 2.9957612633705137, + "ce_loss_23": 2.9249557852745056, + "ce_loss_3": 3.998697113990784, + "ce_loss_6": 3.571672594547272, + "epoch": 0.878, + "grad_norm": 1288.0, + "kl_loss_12": 563.2552429199219, + "kl_loss_17": 167.07828216552736, + "kl_loss_3": 2286.887042236328, + "kl_loss_6": 1428.4285522460937, + "learning_rate": 3.700478030680987e-05, + "loss": 1114.3246, + "step": 8780 + }, + { + "ce_loss_12": 3.1507850289344788, + "ce_loss_17": 2.9879728078842165, + "ce_loss_23": 2.92491819858551, + "ce_loss_3": 3.975934851169586, + "ce_loss_6": 3.5524497389793397, + "epoch": 0.879, + "grad_norm": 980.0, + "kl_loss_12": 542.5836776733398, + "kl_loss_17": 157.81104049682617, + "kl_loss_3": 2246.1762939453124, + "kl_loss_6": 1403.3133178710937, + "learning_rate": 3.6408072716606344e-05, + "loss": 1084.585, + "step": 8790 + }, + { + "ce_loss_12": 3.0926483392715456, + "ce_loss_17": 2.92871607542038, + "ce_loss_23": 2.8600263357162476, + "ce_loss_3": 3.9413236498832704, + "ce_loss_6": 3.5104360699653627, + "epoch": 0.88, + "grad_norm": 1648.0, + "kl_loss_12": 558.1578491210937, + "kl_loss_17": 163.44912414550782, + "kl_loss_3": 2309.015557861328, + "kl_loss_6": 1451.2872802734375, + "learning_rate": 3.5816033491963716e-05, + "loss": 1128.1988, + "step": 8800 + }, + { + "ce_loss_12": 2.9630858421325685, + "ce_loss_17": 2.7998567819595337, + "ce_loss_23": 2.731334185600281, + "ce_loss_3": 3.838448178768158, + "ce_loss_6": 3.3801329851150514, + "epoch": 0.881, + "grad_norm": 1272.0, + "kl_loss_12": 546.5424713134765, + "kl_loss_17": 158.14128494262695, + "kl_loss_3": 2349.586346435547, + "kl_loss_6": 1442.6415893554688, + "learning_rate": 3.522866859471047e-05, + "loss": 1105.7555, + "step": 8810 + }, + { + "ce_loss_12": 3.1578266859054565, + "ce_loss_17": 3.0061030626296996, + "ce_loss_23": 2.943174755573273, + "ce_loss_3": 3.9486071348190306, + "ce_loss_6": 3.5392861485481264, + "epoch": 0.882, + "grad_norm": 1160.0, + "kl_loss_12": 527.8236755371094, + "kl_loss_17": 154.08211441040038, + "kl_loss_3": 2152.692315673828, + "kl_loss_6": 1344.2339660644532, + "learning_rate": 3.46459839396045e-05, + "loss": 1067.4445, + "step": 8820 + }, + { + "ce_loss_12": 3.089976954460144, + "ce_loss_17": 2.9213294863700865, + "ce_loss_23": 2.8545565843582152, + "ce_loss_3": 3.920038306713104, + "ce_loss_6": 3.50158896446228, + "epoch": 0.883, + "grad_norm": 1488.0, + "kl_loss_12": 553.8795608520508, + "kl_loss_17": 161.10737075805665, + "kl_loss_3": 2249.959197998047, + "kl_loss_6": 1425.133349609375, + "learning_rate": 3.406798539427386e-05, + "loss": 1115.9717, + "step": 8830 + }, + { + "ce_loss_12": 3.1482776522636415, + "ce_loss_17": 2.9887410402297974, + "ce_loss_23": 2.924669921398163, + "ce_loss_3": 3.9743199706077577, + "ce_loss_6": 3.548464608192444, + "epoch": 0.884, + "grad_norm": 1640.0, + "kl_loss_12": 546.1885131835937, + "kl_loss_17": 158.3856117248535, + "kl_loss_3": 2269.4408935546876, + "kl_loss_6": 1423.7886596679687, + "learning_rate": 3.349467877915746e-05, + "loss": 1094.6814, + "step": 8840 + }, + { + "ce_loss_12": 3.1199790716171263, + "ce_loss_17": 2.9570226430892945, + "ce_loss_23": 2.889908528327942, + "ce_loss_3": 3.955884504318237, + "ce_loss_6": 3.5255047082901, + "epoch": 0.885, + "grad_norm": 1320.0, + "kl_loss_12": 554.7209442138671, + "kl_loss_17": 160.69096908569335, + "kl_loss_3": 2303.524206542969, + "kl_loss_6": 1438.8336608886718, + "learning_rate": 3.292606986744667e-05, + "loss": 1124.6703, + "step": 8850 + }, + { + "ce_loss_12": 3.0656699657440187, + "ce_loss_17": 2.9097426533699036, + "ce_loss_23": 2.8486350178718567, + "ce_loss_3": 3.90006947517395, + "ce_loss_6": 3.4645033359527586, + "epoch": 0.886, + "grad_norm": 1296.0, + "kl_loss_12": 539.5896713256836, + "kl_loss_17": 156.75498657226564, + "kl_loss_3": 2260.9781860351563, + "kl_loss_6": 1407.9178161621094, + "learning_rate": 3.23621643850267e-05, + "loss": 1089.2037, + "step": 8860 + }, + { + "ce_loss_12": 3.1390577077865602, + "ce_loss_17": 2.9780965566635134, + "ce_loss_23": 2.9126534819602967, + "ce_loss_3": 3.9636197805404665, + "ce_loss_6": 3.541094720363617, + "epoch": 0.887, + "grad_norm": 1232.0, + "kl_loss_12": 557.10390625, + "kl_loss_17": 164.1832588195801, + "kl_loss_3": 2257.755401611328, + "kl_loss_6": 1420.0990112304687, + "learning_rate": 3.180296801041971e-05, + "loss": 1082.1096, + "step": 8870 + }, + { + "ce_loss_12": 3.1589428782463074, + "ce_loss_17": 3.0010623216629027, + "ce_loss_23": 2.9385420203208925, + "ce_loss_3": 3.9908467173576354, + "ce_loss_6": 3.5579570651054384, + "epoch": 0.888, + "grad_norm": 1512.0, + "kl_loss_12": 542.31962890625, + "kl_loss_17": 157.5387435913086, + "kl_loss_3": 2263.5656188964845, + "kl_loss_6": 1398.8133972167968, + "learning_rate": 3.124848637472688e-05, + "loss": 1069.7454, + "step": 8880 + }, + { + "ce_loss_12": 2.9980852365493775, + "ce_loss_17": 2.834604871273041, + "ce_loss_23": 2.770075595378876, + "ce_loss_3": 3.837626278400421, + "ce_loss_6": 3.4114407777786253, + "epoch": 0.889, + "grad_norm": 1184.0, + "kl_loss_12": 536.0059249877929, + "kl_loss_17": 155.7068664550781, + "kl_loss_3": 2272.6115905761717, + "kl_loss_6": 1420.6156921386719, + "learning_rate": 3.069872506157212e-05, + "loss": 1085.2605, + "step": 8890 + }, + { + "ce_loss_12": 3.0897985339164733, + "ce_loss_17": 2.9317867755889893, + "ce_loss_23": 2.868427813053131, + "ce_loss_3": 3.910146701335907, + "ce_loss_6": 3.487504768371582, + "epoch": 0.89, + "grad_norm": 1032.0, + "kl_loss_12": 540.8937484741211, + "kl_loss_17": 157.58897399902344, + "kl_loss_3": 2250.160290527344, + "kl_loss_6": 1397.5354064941407, + "learning_rate": 3.0153689607045842e-05, + "loss": 1081.3935, + "step": 8900 + }, + { + "ce_loss_12": 3.014556038379669, + "ce_loss_17": 2.840059781074524, + "ce_loss_23": 2.771026074886322, + "ce_loss_3": 3.8994391798973083, + "ce_loss_6": 3.4500266551971435, + "epoch": 0.891, + "grad_norm": 1256.0, + "kl_loss_12": 568.2254974365235, + "kl_loss_17": 162.95605545043946, + "kl_loss_3": 2407.6415100097656, + "kl_loss_6": 1503.1968627929687, + "learning_rate": 2.9613385499648926e-05, + "loss": 1109.4233, + "step": 8910 + }, + { + "ce_loss_12": 3.048786735534668, + "ce_loss_17": 2.887788689136505, + "ce_loss_23": 2.8222782373428346, + "ce_loss_3": 3.860581302642822, + "ce_loss_6": 3.4523285746574404, + "epoch": 0.892, + "grad_norm": 980.0, + "kl_loss_12": 540.1698181152344, + "kl_loss_17": 158.8286033630371, + "kl_loss_3": 2215.156982421875, + "kl_loss_6": 1401.9125, + "learning_rate": 2.9077818180237692e-05, + "loss": 1090.1676, + "step": 8920 + }, + { + "ce_loss_12": 3.092083919048309, + "ce_loss_17": 2.9248949527740478, + "ce_loss_23": 2.856962502002716, + "ce_loss_3": 3.9272842407226562, + "ce_loss_6": 3.5033080220222472, + "epoch": 0.893, + "grad_norm": 1160.0, + "kl_loss_12": 546.7997436523438, + "kl_loss_17": 160.99884414672852, + "kl_loss_3": 2255.507373046875, + "kl_loss_6": 1412.3314086914063, + "learning_rate": 2.8546993041969172e-05, + "loss": 1091.2465, + "step": 8930 + }, + { + "ce_loss_12": 3.120562505722046, + "ce_loss_17": 2.96583776473999, + "ce_loss_23": 2.8990086793899534, + "ce_loss_3": 3.9110327482223513, + "ce_loss_6": 3.5060871124267576, + "epoch": 0.894, + "grad_norm": 1600.0, + "kl_loss_12": 537.0748641967773, + "kl_loss_17": 156.7382682800293, + "kl_loss_3": 2194.614501953125, + "kl_loss_6": 1376.7156066894531, + "learning_rate": 2.802091543024671e-05, + "loss": 1082.3741, + "step": 8940 + }, + { + "ce_loss_12": 3.1228439927101137, + "ce_loss_17": 2.962463939189911, + "ce_loss_23": 2.8934847354888915, + "ce_loss_3": 3.9641275286674498, + "ce_loss_6": 3.5355375170707704, + "epoch": 0.895, + "grad_norm": 1336.0, + "kl_loss_12": 544.1097839355468, + "kl_loss_17": 160.5014747619629, + "kl_loss_3": 2294.6555725097655, + "kl_loss_6": 1433.5463562011719, + "learning_rate": 2.7499590642665774e-05, + "loss": 1113.1617, + "step": 8950 + }, + { + "ce_loss_12": 3.145519268512726, + "ce_loss_17": 2.9768693923950194, + "ce_loss_23": 2.9099491834640503, + "ce_loss_3": 3.954761433601379, + "ce_loss_6": 3.5245487093925476, + "epoch": 0.896, + "grad_norm": 1112.0, + "kl_loss_12": 568.062287902832, + "kl_loss_17": 163.7352035522461, + "kl_loss_3": 2241.2856689453124, + "kl_loss_6": 1388.786572265625, + "learning_rate": 2.6983023928961405e-05, + "loss": 1083.4912, + "step": 8960 + }, + { + "ce_loss_12": 3.109204924106598, + "ce_loss_17": 2.944039022922516, + "ce_loss_23": 2.8751957297325133, + "ce_loss_3": 3.9350946068763735, + "ce_loss_6": 3.508461356163025, + "epoch": 0.897, + "grad_norm": 1128.0, + "kl_loss_12": 544.3224639892578, + "kl_loss_17": 159.73422317504884, + "kl_loss_3": 2243.127850341797, + "kl_loss_6": 1400.6375061035155, + "learning_rate": 2.6471220490954628e-05, + "loss": 1097.565, + "step": 8970 + }, + { + "ce_loss_12": 3.0935628056526183, + "ce_loss_17": 2.93547545671463, + "ce_loss_23": 2.8789809584617614, + "ce_loss_3": 3.9196462988853455, + "ce_loss_6": 3.488067018985748, + "epoch": 0.898, + "grad_norm": 1032.0, + "kl_loss_12": 533.343212890625, + "kl_loss_17": 155.5143020629883, + "kl_loss_3": 2239.4082153320314, + "kl_loss_6": 1385.2438659667969, + "learning_rate": 2.596418548250029e-05, + "loss": 1087.2333, + "step": 8980 + }, + { + "ce_loss_12": 3.1320006370544435, + "ce_loss_17": 2.971823489665985, + "ce_loss_23": 2.9040478348731993, + "ce_loss_3": 3.9445548057556152, + "ce_loss_6": 3.528187572956085, + "epoch": 0.899, + "grad_norm": 1232.0, + "kl_loss_12": 551.7921142578125, + "kl_loss_17": 161.45555572509767, + "kl_loss_3": 2263.838671875, + "kl_loss_6": 1418.526055908203, + "learning_rate": 2.5461924009435368e-05, + "loss": 1080.5319, + "step": 8990 + }, + { + "ce_loss_12": 3.1223100066184997, + "ce_loss_17": 2.9642346501350403, + "ce_loss_23": 2.89671528339386, + "ce_loss_3": 3.9317273020744326, + "ce_loss_6": 3.522926139831543, + "epoch": 0.9, + "grad_norm": 1032.0, + "kl_loss_12": 548.0636291503906, + "kl_loss_17": 161.5977439880371, + "kl_loss_3": 2225.2694702148438, + "kl_loss_6": 1406.5701416015625, + "learning_rate": 2.4964441129527336e-05, + "loss": 1105.2758, + "step": 9000 + }, + { + "ce_loss_12": 3.115997779369354, + "ce_loss_17": 2.9631489157676696, + "ce_loss_23": 2.9011554479599, + "ce_loss_3": 3.916248857975006, + "ce_loss_6": 3.5043585896492004, + "epoch": 0.901, + "grad_norm": 1152.0, + "kl_loss_12": 529.6404830932618, + "kl_loss_17": 154.54727096557616, + "kl_loss_3": 2182.716204833984, + "kl_loss_6": 1371.240216064453, + "learning_rate": 2.4471741852423235e-05, + "loss": 1069.2625, + "step": 9010 + }, + { + "ce_loss_12": 3.178261375427246, + "ce_loss_17": 3.016998851299286, + "ce_loss_23": 2.948937976360321, + "ce_loss_3": 3.9921558260917664, + "ce_loss_6": 3.5734020829200746, + "epoch": 0.902, + "grad_norm": 1184.0, + "kl_loss_12": 542.1947662353516, + "kl_loss_17": 160.79561767578124, + "kl_loss_3": 2211.367510986328, + "kl_loss_6": 1386.8376098632812, + "learning_rate": 2.3983831139599287e-05, + "loss": 1080.5448, + "step": 9020 + }, + { + "ce_loss_12": 3.0915554642677305, + "ce_loss_17": 2.937301242351532, + "ce_loss_23": 2.8709113359451295, + "ce_loss_3": 3.907834804058075, + "ce_loss_6": 3.4860360860824584, + "epoch": 0.903, + "grad_norm": 1232.0, + "kl_loss_12": 530.9263809204101, + "kl_loss_17": 156.1497802734375, + "kl_loss_3": 2213.0542053222657, + "kl_loss_6": 1376.4850708007812, + "learning_rate": 2.3500713904311022e-05, + "loss": 1058.6828, + "step": 9030 + }, + { + "ce_loss_12": 3.124735951423645, + "ce_loss_17": 2.969620370864868, + "ce_loss_23": 2.908503293991089, + "ce_loss_3": 3.9082423567771913, + "ce_loss_6": 3.50426607131958, + "epoch": 0.904, + "grad_norm": 1296.0, + "kl_loss_12": 523.6847671508789, + "kl_loss_17": 153.13335037231445, + "kl_loss_3": 2150.9460876464846, + "kl_loss_6": 1351.1428833007812, + "learning_rate": 2.3022395011543685e-05, + "loss": 1050.9455, + "step": 9040 + }, + { + "ce_loss_12": 3.161675202846527, + "ce_loss_17": 2.998300814628601, + "ce_loss_23": 2.927753913402557, + "ce_loss_3": 3.975581741333008, + "ce_loss_6": 3.5567883014678956, + "epoch": 0.905, + "grad_norm": 1160.0, + "kl_loss_12": 555.8097091674805, + "kl_loss_17": 163.1023277282715, + "kl_loss_3": 2245.241558837891, + "kl_loss_6": 1410.1447509765626, + "learning_rate": 2.2548879277963063e-05, + "loss": 1108.258, + "step": 9050 + }, + { + "ce_loss_12": 3.0751007795333862, + "ce_loss_17": 2.9190019726753236, + "ce_loss_23": 2.854919970035553, + "ce_loss_3": 3.877568233013153, + "ce_loss_6": 3.467659282684326, + "epoch": 0.906, + "grad_norm": 1440.0, + "kl_loss_12": 531.2436706542969, + "kl_loss_17": 155.4201519012451, + "kl_loss_3": 2194.947058105469, + "kl_loss_6": 1375.2668762207031, + "learning_rate": 2.208017147186736e-05, + "loss": 1050.3398, + "step": 9060 + }, + { + "ce_loss_12": 3.072239875793457, + "ce_loss_17": 2.91306734085083, + "ce_loss_23": 2.8462382197380065, + "ce_loss_3": 3.884073185920715, + "ce_loss_6": 3.4699034214019777, + "epoch": 0.907, + "grad_norm": 1000.0, + "kl_loss_12": 539.5051681518555, + "kl_loss_17": 157.800789642334, + "kl_loss_3": 2230.831646728516, + "kl_loss_6": 1393.0045166015625, + "learning_rate": 2.1616276313139227e-05, + "loss": 1072.0311, + "step": 9070 + }, + { + "ce_loss_12": 3.1148462891578674, + "ce_loss_17": 2.9539944529533386, + "ce_loss_23": 2.8859964847564696, + "ce_loss_3": 3.9291485071182253, + "ce_loss_6": 3.5146759152412415, + "epoch": 0.908, + "grad_norm": 1208.0, + "kl_loss_12": 538.7437683105469, + "kl_loss_17": 158.58870620727538, + "kl_loss_3": 2215.8249328613283, + "kl_loss_6": 1395.8588623046876, + "learning_rate": 2.1157198473197415e-05, + "loss": 1091.5688, + "step": 9080 + }, + { + "ce_loss_12": 3.167890965938568, + "ce_loss_17": 3.005591297149658, + "ce_loss_23": 2.938670742511749, + "ce_loss_3": 3.9942872762680053, + "ce_loss_6": 3.569392716884613, + "epoch": 0.909, + "grad_norm": 1240.0, + "kl_loss_12": 553.3726150512696, + "kl_loss_17": 162.06743621826172, + "kl_loss_3": 2253.4153259277346, + "kl_loss_6": 1412.7976196289062, + "learning_rate": 2.0702942574950812e-05, + "loss": 1087.3652, + "step": 9090 + }, + { + "ce_loss_12": 3.1123293161392214, + "ce_loss_17": 2.9462677955627443, + "ce_loss_23": 2.8751128554344176, + "ce_loss_3": 3.938741445541382, + "ce_loss_6": 3.5134841084480284, + "epoch": 0.91, + "grad_norm": 1136.0, + "kl_loss_12": 557.57490234375, + "kl_loss_17": 164.1454849243164, + "kl_loss_3": 2263.563861083984, + "kl_loss_6": 1420.0111206054687, + "learning_rate": 2.025351319275137e-05, + "loss": 1090.4695, + "step": 9100 + }, + { + "ce_loss_12": 3.2197620272636414, + "ce_loss_17": 3.0535457253456117, + "ce_loss_23": 2.9846746921539307, + "ce_loss_3": 4.033296668529511, + "ce_loss_6": 3.6214271068572996, + "epoch": 0.911, + "grad_norm": 1296.0, + "kl_loss_12": 560.6335205078125, + "kl_loss_17": 162.30851058959962, + "kl_loss_3": 2263.798486328125, + "kl_loss_6": 1431.2504577636719, + "learning_rate": 1.9808914852347816e-05, + "loss": 1115.8676, + "step": 9110 + }, + { + "ce_loss_12": 3.0728222727775574, + "ce_loss_17": 2.9096383094787597, + "ce_loss_23": 2.8406273484230042, + "ce_loss_3": 3.889565372467041, + "ce_loss_6": 3.481040823459625, + "epoch": 0.912, + "grad_norm": 1624.0, + "kl_loss_12": 545.7222076416016, + "kl_loss_17": 160.1074073791504, + "kl_loss_3": 2211.3495666503904, + "kl_loss_6": 1403.9913269042968, + "learning_rate": 1.9369152030840554e-05, + "loss": 1077.0305, + "step": 9120 + }, + { + "ce_loss_12": 3.1469371199607847, + "ce_loss_17": 2.99254857301712, + "ce_loss_23": 2.927776575088501, + "ce_loss_3": 3.9770922541618345, + "ce_loss_6": 3.553990662097931, + "epoch": 0.913, + "grad_norm": 1224.0, + "kl_loss_12": 545.7375885009766, + "kl_loss_17": 159.17574234008788, + "kl_loss_3": 2275.7082397460936, + "kl_loss_6": 1424.6509582519532, + "learning_rate": 1.893422915663645e-05, + "loss": 1096.3979, + "step": 9130 + }, + { + "ce_loss_12": 3.037546753883362, + "ce_loss_17": 2.871594476699829, + "ce_loss_23": 2.806107497215271, + "ce_loss_3": 3.8876906752586367, + "ce_loss_6": 3.459704267978668, + "epoch": 0.914, + "grad_norm": 1008.0, + "kl_loss_12": 554.3168228149414, + "kl_loss_17": 161.52917022705077, + "kl_loss_3": 2310.563195800781, + "kl_loss_6": 1451.2398193359375, + "learning_rate": 1.850415060940386e-05, + "loss": 1108.2693, + "step": 9140 + }, + { + "ce_loss_12": 3.1431529998779295, + "ce_loss_17": 2.986000108718872, + "ce_loss_23": 2.9234211921691893, + "ce_loss_3": 3.9328189492225647, + "ce_loss_6": 3.529419946670532, + "epoch": 0.915, + "grad_norm": 1192.0, + "kl_loss_12": 541.6305496215821, + "kl_loss_17": 158.0989532470703, + "kl_loss_3": 2186.0196838378906, + "kl_loss_6": 1377.559063720703, + "learning_rate": 1.8078920720028978e-05, + "loss": 1079.0549, + "step": 9150 + }, + { + "ce_loss_12": 3.0662479639053344, + "ce_loss_17": 2.9114412546157835, + "ce_loss_23": 2.8517210602760317, + "ce_loss_3": 3.8619790196418764, + "ce_loss_6": 3.4534464478492737, + "epoch": 0.916, + "grad_norm": 868.0, + "kl_loss_12": 527.064845275879, + "kl_loss_17": 153.3120216369629, + "kl_loss_3": 2172.184851074219, + "kl_loss_6": 1368.7500366210938, + "learning_rate": 1.765854377057219e-05, + "loss": 1083.5215, + "step": 9160 + }, + { + "ce_loss_12": 3.0449477195739747, + "ce_loss_17": 2.8911916851997375, + "ce_loss_23": 2.8287646174430847, + "ce_loss_3": 3.8581594944000246, + "ce_loss_6": 3.4393247723579408, + "epoch": 0.917, + "grad_norm": 1208.0, + "kl_loss_12": 525.6621978759765, + "kl_loss_17": 152.03661041259767, + "kl_loss_3": 2207.767950439453, + "kl_loss_6": 1378.0291320800782, + "learning_rate": 1.724302399422456e-05, + "loss": 1077.2742, + "step": 9170 + }, + { + "ce_loss_12": 3.033001518249512, + "ce_loss_17": 2.8675798892974855, + "ce_loss_23": 2.798583674430847, + "ce_loss_3": 3.8532918214797975, + "ce_loss_6": 3.4274834752082826, + "epoch": 0.918, + "grad_norm": 852.0, + "kl_loss_12": 556.0043411254883, + "kl_loss_17": 162.40515594482423, + "kl_loss_3": 2270.423095703125, + "kl_loss_6": 1417.0343627929688, + "learning_rate": 1.683236557526574e-05, + "loss": 1094.157, + "step": 9180 + }, + { + "ce_loss_12": 3.1149054527282716, + "ce_loss_17": 2.9648550748825073, + "ce_loss_23": 2.9029189944267273, + "ce_loss_3": 3.893848168849945, + "ce_loss_6": 3.4919156551361086, + "epoch": 0.919, + "grad_norm": 1328.0, + "kl_loss_12": 520.6415710449219, + "kl_loss_17": 153.77631530761718, + "kl_loss_3": 2137.8181762695312, + "kl_loss_6": 1343.8169311523438, + "learning_rate": 1.6426572649021475e-05, + "loss": 1067.9865, + "step": 9190 + }, + { + "ce_loss_12": 3.148048484325409, + "ce_loss_17": 2.994646632671356, + "ce_loss_23": 2.931679570674896, + "ce_loss_3": 3.920458984375, + "ce_loss_6": 3.52545348405838, + "epoch": 0.92, + "grad_norm": 948.0, + "kl_loss_12": 531.7145721435547, + "kl_loss_17": 157.4194091796875, + "kl_loss_3": 2146.69736328125, + "kl_loss_6": 1351.5185180664062, + "learning_rate": 1.6025649301821876e-05, + "loss": 1065.0173, + "step": 9200 + }, + { + "ce_loss_12": 3.1481565117835997, + "ce_loss_17": 2.9912326455116274, + "ce_loss_23": 2.9256301879882813, + "ce_loss_3": 3.932176637649536, + "ce_loss_6": 3.5256184458732607, + "epoch": 0.921, + "grad_norm": 1232.0, + "kl_loss_12": 541.1270858764649, + "kl_loss_17": 158.6568473815918, + "kl_loss_3": 2181.775390625, + "kl_loss_6": 1374.807177734375, + "learning_rate": 1.5629599570960716e-05, + "loss": 1063.4439, + "step": 9210 + }, + { + "ce_loss_12": 3.0623461604118347, + "ce_loss_17": 2.908112442493439, + "ce_loss_23": 2.841799771785736, + "ce_loss_3": 3.892938697338104, + "ce_loss_6": 3.4622840762138365, + "epoch": 0.922, + "grad_norm": 1320.0, + "kl_loss_12": 541.5634201049804, + "kl_loss_17": 156.80740127563476, + "kl_loss_3": 2252.3447509765624, + "kl_loss_6": 1405.0825500488281, + "learning_rate": 1.5238427444654367e-05, + "loss": 1079.0959, + "step": 9220 + }, + { + "ce_loss_12": 3.1123980045318604, + "ce_loss_17": 2.9523245811462404, + "ce_loss_23": 2.8886606454849244, + "ce_loss_3": 3.9145833969116213, + "ce_loss_6": 3.494296705722809, + "epoch": 0.923, + "grad_norm": 1048.0, + "kl_loss_12": 531.3698974609375, + "kl_loss_17": 157.11926879882813, + "kl_loss_3": 2190.626611328125, + "kl_loss_6": 1359.580322265625, + "learning_rate": 1.4852136862001764e-05, + "loss": 1066.6766, + "step": 9230 + }, + { + "ce_loss_12": 3.0788945317268372, + "ce_loss_17": 2.9183433413505555, + "ce_loss_23": 2.8562278389930724, + "ce_loss_3": 3.8753127574920656, + "ce_loss_6": 3.468129289150238, + "epoch": 0.924, + "grad_norm": 1064.0, + "kl_loss_12": 531.5305938720703, + "kl_loss_17": 154.79893341064454, + "kl_loss_3": 2182.990582275391, + "kl_loss_6": 1373.8393920898438, + "learning_rate": 1.4470731712944884e-05, + "loss": 1078.1009, + "step": 9240 + }, + { + "ce_loss_12": 3.105417084693909, + "ce_loss_17": 2.9438974380493166, + "ce_loss_23": 2.8757981777191164, + "ce_loss_3": 3.9196877241134644, + "ce_loss_6": 3.5000792860984804, + "epoch": 0.925, + "grad_norm": 1248.0, + "kl_loss_12": 544.6679031372071, + "kl_loss_17": 159.6856834411621, + "kl_loss_3": 2213.4506958007814, + "kl_loss_6": 1380.3028869628906, + "learning_rate": 1.4094215838229174e-05, + "loss": 1094.7473, + "step": 9250 + }, + { + "ce_loss_12": 3.0856382489204406, + "ce_loss_17": 2.922611892223358, + "ce_loss_23": 2.857351744174957, + "ce_loss_3": 3.9181522965431212, + "ce_loss_6": 3.4862570762634277, + "epoch": 0.926, + "grad_norm": 1408.0, + "kl_loss_12": 543.434619140625, + "kl_loss_17": 158.3315170288086, + "kl_loss_3": 2270.9244995117188, + "kl_loss_6": 1409.6431457519532, + "learning_rate": 1.372259302936546e-05, + "loss": 1124.6401, + "step": 9260 + }, + { + "ce_loss_12": 3.1793969869613647, + "ce_loss_17": 3.0157102584838866, + "ce_loss_23": 2.946769106388092, + "ce_loss_3": 3.983129787445068, + "ce_loss_6": 3.567841613292694, + "epoch": 0.927, + "grad_norm": 1696.0, + "kl_loss_12": 555.6799179077149, + "kl_loss_17": 166.32063751220704, + "kl_loss_3": 2228.221728515625, + "kl_loss_6": 1397.9705871582032, + "learning_rate": 1.3355867028591206e-05, + "loss": 1074.1199, + "step": 9270 + }, + { + "ce_loss_12": 3.0804280757904055, + "ce_loss_17": 2.9220717191696166, + "ce_loss_23": 2.8591931581497194, + "ce_loss_3": 3.8621233344078063, + "ce_loss_6": 3.455961990356445, + "epoch": 0.928, + "grad_norm": 1120.0, + "kl_loss_12": 530.960791015625, + "kl_loss_17": 155.62761688232422, + "kl_loss_3": 2170.498614501953, + "kl_loss_6": 1355.5591064453124, + "learning_rate": 1.2994041528833267e-05, + "loss": 1062.3898, + "step": 9280 + }, + { + "ce_loss_12": 3.083864176273346, + "ce_loss_17": 2.928153955936432, + "ce_loss_23": 2.8618181586265563, + "ce_loss_3": 3.8916095972061155, + "ce_loss_6": 3.4765172123909, + "epoch": 0.929, + "grad_norm": 1264.0, + "kl_loss_12": 539.578709411621, + "kl_loss_17": 155.33257980346679, + "kl_loss_3": 2217.529052734375, + "kl_loss_6": 1386.2726989746093, + "learning_rate": 1.2637120173670358e-05, + "loss": 1065.5698, + "step": 9290 + }, + { + "ce_loss_12": 3.107132685184479, + "ce_loss_17": 2.942716729640961, + "ce_loss_23": 2.873940134048462, + "ce_loss_3": 3.932835280895233, + "ce_loss_6": 3.510121464729309, + "epoch": 0.93, + "grad_norm": 1240.0, + "kl_loss_12": 544.464077758789, + "kl_loss_17": 160.88489379882813, + "kl_loss_3": 2237.293408203125, + "kl_loss_6": 1402.9177734375, + "learning_rate": 1.2285106557296478e-05, + "loss": 1075.1647, + "step": 9300 + }, + { + "ce_loss_12": 3.000529372692108, + "ce_loss_17": 2.840439808368683, + "ce_loss_23": 2.775368392467499, + "ce_loss_3": 3.877704751491547, + "ce_loss_6": 3.429161012172699, + "epoch": 0.931, + "grad_norm": 1368.0, + "kl_loss_12": 544.9817962646484, + "kl_loss_17": 157.51940689086913, + "kl_loss_3": 2336.6595947265623, + "kl_loss_6": 1450.5424194335938, + "learning_rate": 1.1938004224484989e-05, + "loss": 1098.9723, + "step": 9310 + }, + { + "ce_loss_12": 3.2173678398132326, + "ce_loss_17": 3.056504476070404, + "ce_loss_23": 2.987658143043518, + "ce_loss_3": 4.015758633613586, + "ce_loss_6": 3.6054705142974854, + "epoch": 0.932, + "grad_norm": 1224.0, + "kl_loss_12": 544.5718933105469, + "kl_loss_17": 160.28335723876953, + "kl_loss_3": 2210.1267944335937, + "kl_loss_6": 1393.004949951172, + "learning_rate": 1.1595816670552429e-05, + "loss": 1094.6549, + "step": 9320 + }, + { + "ce_loss_12": 3.1313786029815676, + "ce_loss_17": 2.982147419452667, + "ce_loss_23": 2.9147197008132935, + "ce_loss_3": 3.934300494194031, + "ce_loss_6": 3.5208180427551268, + "epoch": 0.933, + "grad_norm": 1256.0, + "kl_loss_12": 534.8270599365235, + "kl_loss_17": 159.38514709472656, + "kl_loss_3": 2192.3671936035157, + "kl_loss_6": 1370.7300659179687, + "learning_rate": 1.1258547341323699e-05, + "loss": 1060.6582, + "step": 9330 + }, + { + "ce_loss_12": 3.1733838319778442, + "ce_loss_17": 3.014378774166107, + "ce_loss_23": 2.9463321208953857, + "ce_loss_3": 3.9683137059211733, + "ce_loss_6": 3.563319516181946, + "epoch": 0.934, + "grad_norm": 1784.0, + "kl_loss_12": 543.0530883789063, + "kl_loss_17": 159.08293609619142, + "kl_loss_3": 2215.9241577148437, + "kl_loss_6": 1391.2158264160157, + "learning_rate": 1.0926199633097156e-05, + "loss": 1074.9855, + "step": 9340 + }, + { + "ce_loss_12": 3.170001041889191, + "ce_loss_17": 3.014968490600586, + "ce_loss_23": 2.952860116958618, + "ce_loss_3": 3.944246733188629, + "ce_loss_6": 3.542513108253479, + "epoch": 0.935, + "grad_norm": 1280.0, + "kl_loss_12": 527.8997283935547, + "kl_loss_17": 154.46102066040038, + "kl_loss_3": 2160.9652099609375, + "kl_loss_6": 1352.6298950195312, + "learning_rate": 1.0598776892610684e-05, + "loss": 1085.8208, + "step": 9350 + }, + { + "ce_loss_12": 3.0026628494262697, + "ce_loss_17": 2.8449546456336976, + "ce_loss_23": 2.7814199447631838, + "ce_loss_3": 3.8261303305625916, + "ce_loss_6": 3.403845179080963, + "epoch": 0.936, + "grad_norm": 1304.0, + "kl_loss_12": 533.1611785888672, + "kl_loss_17": 153.6280891418457, + "kl_loss_3": 2235.2831970214843, + "kl_loss_6": 1403.28251953125, + "learning_rate": 1.0276282417007399e-05, + "loss": 1067.509, + "step": 9360 + }, + { + "ce_loss_12": 3.138059711456299, + "ce_loss_17": 2.985086512565613, + "ce_loss_23": 2.9205947160720824, + "ce_loss_3": 3.9207167744636537, + "ce_loss_6": 3.5188659548759462, + "epoch": 0.937, + "grad_norm": 1048.0, + "kl_loss_12": 528.6805099487304, + "kl_loss_17": 154.96491088867188, + "kl_loss_3": 2149.21474609375, + "kl_loss_6": 1351.6066040039063, + "learning_rate": 9.958719453803277e-06, + "loss": 1062.6318, + "step": 9370 + }, + { + "ce_loss_12": 3.1419655203819277, + "ce_loss_17": 2.9851008653640747, + "ce_loss_23": 2.917102587223053, + "ce_loss_3": 3.9556142926216125, + "ce_loss_6": 3.5410670757293703, + "epoch": 0.938, + "grad_norm": 1280.0, + "kl_loss_12": 544.192350769043, + "kl_loss_17": 158.0346694946289, + "kl_loss_3": 2221.0482177734375, + "kl_loss_6": 1402.3416381835937, + "learning_rate": 9.646091200853802e-06, + "loss": 1072.5707, + "step": 9380 + }, + { + "ce_loss_12": 3.0962947249412536, + "ce_loss_17": 2.942214035987854, + "ce_loss_23": 2.878101277351379, + "ce_loss_3": 3.8967387557029722, + "ce_loss_6": 3.4878384351730345, + "epoch": 0.939, + "grad_norm": 1400.0, + "kl_loss_12": 530.0220367431641, + "kl_loss_17": 154.4480094909668, + "kl_loss_3": 2168.3790893554688, + "kl_loss_6": 1363.4065490722655, + "learning_rate": 9.338400806321978e-06, + "loss": 1041.948, + "step": 9390 + }, + { + "ce_loss_12": 3.1360417127609255, + "ce_loss_17": 2.9746978521347045, + "ce_loss_23": 2.906171131134033, + "ce_loss_3": 3.936599922180176, + "ce_loss_6": 3.520904242992401, + "epoch": 0.94, + "grad_norm": 1128.0, + "kl_loss_12": 548.192594909668, + "kl_loss_17": 162.06064224243164, + "kl_loss_3": 2202.0309509277345, + "kl_loss_6": 1383.4674560546875, + "learning_rate": 9.035651368646646e-06, + "loss": 1065.3178, + "step": 9400 + }, + { + "ce_loss_12": 3.13859965801239, + "ce_loss_17": 2.985644745826721, + "ce_loss_23": 2.922950530052185, + "ce_loss_3": 3.925608992576599, + "ce_loss_6": 3.5271278023719788, + "epoch": 0.941, + "grad_norm": 1464.0, + "kl_loss_12": 526.8603591918945, + "kl_loss_17": 154.67271118164064, + "kl_loss_3": 2176.2880004882813, + "kl_loss_6": 1372.896905517578, + "learning_rate": 8.737845936511335e-06, + "loss": 1068.3764, + "step": 9410 + }, + { + "ce_loss_12": 3.095977246761322, + "ce_loss_17": 2.931922101974487, + "ce_loss_23": 2.8640459775924683, + "ce_loss_3": 3.918685781955719, + "ce_loss_6": 3.489489185810089, + "epoch": 0.942, + "grad_norm": 936.0, + "kl_loss_12": 545.6118621826172, + "kl_loss_17": 159.8506301879883, + "kl_loss_3": 2244.4978088378907, + "kl_loss_6": 1398.2353454589843, + "learning_rate": 8.444987508813451e-06, + "loss": 1073.6937, + "step": 9420 + }, + { + "ce_loss_12": 3.0587186932563784, + "ce_loss_17": 2.894214355945587, + "ce_loss_23": 2.8275156140327455, + "ce_loss_3": 3.9129406094551085, + "ce_loss_6": 3.47637939453125, + "epoch": 0.943, + "grad_norm": 1216.0, + "kl_loss_12": 555.0885116577149, + "kl_loss_17": 161.69069900512696, + "kl_loss_3": 2330.9462463378904, + "kl_loss_6": 1455.9245971679688, + "learning_rate": 8.157079034633974e-06, + "loss": 1101.4342, + "step": 9430 + }, + { + "ce_loss_12": 3.042380619049072, + "ce_loss_17": 2.884304630756378, + "ce_loss_23": 2.8195446968078612, + "ce_loss_3": 3.8600812911987306, + "ce_loss_6": 3.4441370368003845, + "epoch": 0.944, + "grad_norm": 984.0, + "kl_loss_12": 535.3955963134765, + "kl_loss_17": 156.51687469482422, + "kl_loss_3": 2244.7854187011717, + "kl_loss_6": 1411.4883728027344, + "learning_rate": 7.874123413208145e-06, + "loss": 1077.6412, + "step": 9440 + }, + { + "ce_loss_12": 3.0281569480896, + "ce_loss_17": 2.865230941772461, + "ce_loss_23": 2.798931360244751, + "ce_loss_3": 3.8621835589408873, + "ce_loss_6": 3.436237359046936, + "epoch": 0.945, + "grad_norm": 1048.0, + "kl_loss_12": 536.7286972045898, + "kl_loss_17": 156.12984771728514, + "kl_loss_3": 2247.120428466797, + "kl_loss_6": 1404.0705200195312, + "learning_rate": 7.59612349389599e-06, + "loss": 1086.8323, + "step": 9450 + }, + { + "ce_loss_12": 3.102251076698303, + "ce_loss_17": 2.9473668694496156, + "ce_loss_23": 2.8842572450637816, + "ce_loss_3": 3.891569495201111, + "ce_loss_6": 3.4863331437110903, + "epoch": 0.946, + "grad_norm": 1256.0, + "kl_loss_12": 525.8963073730469, + "kl_loss_17": 152.55707778930665, + "kl_loss_3": 2146.53525390625, + "kl_loss_6": 1347.6020385742188, + "learning_rate": 7.323082076153509e-06, + "loss": 1061.4568, + "step": 9460 + }, + { + "ce_loss_12": 3.1477771043777465, + "ce_loss_17": 2.9896829605102537, + "ce_loss_23": 2.924260699748993, + "ce_loss_3": 3.934639847278595, + "ce_loss_6": 3.526467728614807, + "epoch": 0.947, + "grad_norm": 1232.0, + "kl_loss_12": 541.1049713134765, + "kl_loss_17": 160.94679489135743, + "kl_loss_3": 2172.9966735839844, + "kl_loss_6": 1368.2758056640625, + "learning_rate": 7.055001909504755e-06, + "loss": 1083.6273, + "step": 9470 + }, + { + "ce_loss_12": 3.183323097229004, + "ce_loss_17": 3.023005247116089, + "ce_loss_23": 2.9559802651405334, + "ce_loss_3": 3.9784157991409304, + "ce_loss_6": 3.5707173943519592, + "epoch": 0.948, + "grad_norm": 1176.0, + "kl_loss_12": 539.4944686889648, + "kl_loss_17": 157.5242950439453, + "kl_loss_3": 2207.8321228027344, + "kl_loss_6": 1381.5572387695313, + "learning_rate": 6.791885693514133e-06, + "loss": 1079.0922, + "step": 9480 + }, + { + "ce_loss_12": 3.094246971607208, + "ce_loss_17": 2.935408890247345, + "ce_loss_23": 2.8691577553749084, + "ce_loss_3": 3.920883822441101, + "ce_loss_6": 3.4952668070793154, + "epoch": 0.949, + "grad_norm": 1040.0, + "kl_loss_12": 540.617756652832, + "kl_loss_17": 158.96320114135742, + "kl_loss_3": 2272.402795410156, + "kl_loss_6": 1413.7683044433593, + "learning_rate": 6.533736077758867e-06, + "loss": 1092.891, + "step": 9490 + }, + { + "ce_loss_12": 3.0623729705810545, + "ce_loss_17": 2.8987768054008485, + "ce_loss_23": 2.832401430606842, + "ce_loss_3": 3.911205840110779, + "ce_loss_6": 3.4744672179222107, + "epoch": 0.95, + "grad_norm": 1808.0, + "kl_loss_12": 556.887094116211, + "kl_loss_17": 162.5521026611328, + "kl_loss_3": 2316.6902587890627, + "kl_loss_6": 1439.454327392578, + "learning_rate": 6.2805556618028556e-06, + "loss": 1090.7896, + "step": 9500 + }, + { + "ce_loss_12": 3.1281984210014344, + "ce_loss_17": 2.981964576244354, + "ce_loss_23": 2.9159383177757263, + "ce_loss_3": 3.9170543432235716, + "ce_loss_6": 3.508361804485321, + "epoch": 0.951, + "grad_norm": 1208.0, + "kl_loss_12": 515.7074813842773, + "kl_loss_17": 154.32823028564454, + "kl_loss_3": 2141.530340576172, + "kl_loss_6": 1326.3069274902343, + "learning_rate": 6.032346995169968e-06, + "loss": 1027.9488, + "step": 9510 + }, + { + "ce_loss_12": 3.136077415943146, + "ce_loss_17": 2.9806724548339845, + "ce_loss_23": 2.917427134513855, + "ce_loss_3": 3.9442824363708495, + "ce_loss_6": 3.526597273349762, + "epoch": 0.952, + "grad_norm": 1168.0, + "kl_loss_12": 537.6825424194336, + "kl_loss_17": 157.90337600708008, + "kl_loss_3": 2211.2411254882813, + "kl_loss_6": 1379.8905090332032, + "learning_rate": 5.789112577318789e-06, + "loss": 1067.4238, + "step": 9520 + }, + { + "ce_loss_12": 3.12904806137085, + "ce_loss_17": 2.9698351502418516, + "ce_loss_23": 2.9052838325500487, + "ce_loss_3": 3.9457252740859987, + "ce_loss_6": 3.5223633885383605, + "epoch": 0.953, + "grad_norm": 1064.0, + "kl_loss_12": 548.40458984375, + "kl_loss_17": 159.1815216064453, + "kl_loss_3": 2241.716436767578, + "kl_loss_6": 1404.752862548828, + "learning_rate": 5.550854857617194e-06, + "loss": 1068.0402, + "step": 9530 + }, + { + "ce_loss_12": 3.114034593105316, + "ce_loss_17": 2.94967303276062, + "ce_loss_23": 2.8814757823944093, + "ce_loss_3": 3.9540896892547606, + "ce_loss_6": 3.5182945370674132, + "epoch": 0.954, + "grad_norm": 1264.0, + "kl_loss_12": 553.2667083740234, + "kl_loss_17": 162.21889266967773, + "kl_loss_3": 2289.8888427734373, + "kl_loss_6": 1432.19365234375, + "learning_rate": 5.317576235317756e-06, + "loss": 1101.0947, + "step": 9540 + }, + { + "ce_loss_12": 3.1283044457435607, + "ce_loss_17": 2.9742880702018737, + "ce_loss_23": 2.9097349286079406, + "ce_loss_3": 3.9063483357429503, + "ce_loss_6": 3.509521949291229, + "epoch": 0.955, + "grad_norm": 1000.0, + "kl_loss_12": 523.9976669311524, + "kl_loss_17": 155.9721908569336, + "kl_loss_3": 2127.7600524902346, + "kl_loss_6": 1335.1602172851562, + "learning_rate": 5.089279059533658e-06, + "loss": 1067.269, + "step": 9550 + }, + { + "ce_loss_12": 3.1875174760818483, + "ce_loss_17": 3.025624454021454, + "ce_loss_23": 2.9579347252845762, + "ce_loss_3": 3.9809356689453126, + "ce_loss_6": 3.5737068176269533, + "epoch": 0.956, + "grad_norm": 1840.0, + "kl_loss_12": 554.1400634765625, + "kl_loss_17": 163.7421745300293, + "kl_loss_3": 2195.819970703125, + "kl_loss_6": 1390.9726867675781, + "learning_rate": 4.865965629214819e-06, + "loss": 1069.3086, + "step": 9560 + }, + { + "ce_loss_12": 3.1353691101074217, + "ce_loss_17": 2.9781248807907104, + "ce_loss_23": 2.9155802726745605, + "ce_loss_3": 3.9476658582687376, + "ce_loss_6": 3.531456804275513, + "epoch": 0.957, + "grad_norm": 1232.0, + "kl_loss_12": 546.0765274047851, + "kl_loss_17": 158.91124877929687, + "kl_loss_3": 2245.0339904785155, + "kl_loss_6": 1413.6621520996093, + "learning_rate": 4.6476381931251366e-06, + "loss": 1067.9433, + "step": 9570 + }, + { + "ce_loss_12": 3.1197481036186216, + "ce_loss_17": 2.9656429409980776, + "ce_loss_23": 2.8970610022544863, + "ce_loss_3": 3.913744592666626, + "ce_loss_6": 3.511826777458191, + "epoch": 0.958, + "grad_norm": 1152.0, + "kl_loss_12": 531.675032043457, + "kl_loss_17": 156.55153198242186, + "kl_loss_3": 2180.854107666016, + "kl_loss_6": 1372.5286376953125, + "learning_rate": 4.434298949819449e-06, + "loss": 1069.0238, + "step": 9580 + }, + { + "ce_loss_12": 3.0952879071235655, + "ce_loss_17": 2.932103991508484, + "ce_loss_23": 2.862602686882019, + "ce_loss_3": 3.940154552459717, + "ce_loss_6": 3.506578290462494, + "epoch": 0.959, + "grad_norm": 1160.0, + "kl_loss_12": 562.480191040039, + "kl_loss_17": 166.8973876953125, + "kl_loss_3": 2328.669140625, + "kl_loss_6": 1465.3488891601562, + "learning_rate": 4.2259500476214406e-06, + "loss": 1099.8499, + "step": 9590 + }, + { + "ce_loss_12": 3.06618047952652, + "ce_loss_17": 2.911667048931122, + "ce_loss_23": 2.842763102054596, + "ce_loss_3": 3.8899773359298706, + "ce_loss_6": 3.4630375146865844, + "epoch": 0.96, + "grad_norm": 952.0, + "kl_loss_12": 542.4524353027343, + "kl_loss_17": 157.97199249267578, + "kl_loss_3": 2253.8352966308594, + "kl_loss_6": 1405.9619384765624, + "learning_rate": 4.02259358460233e-06, + "loss": 1073.7738, + "step": 9600 + }, + { + "ce_loss_12": 3.1313651323318483, + "ce_loss_17": 2.97236407995224, + "ce_loss_23": 2.902596282958984, + "ce_loss_3": 3.9350406765937804, + "ce_loss_6": 3.51432626247406, + "epoch": 0.961, + "grad_norm": 1224.0, + "kl_loss_12": 539.8430587768555, + "kl_loss_17": 160.37659225463867, + "kl_loss_3": 2178.180029296875, + "kl_loss_6": 1357.514794921875, + "learning_rate": 3.8242316085594916e-06, + "loss": 1062.6227, + "step": 9610 + }, + { + "ce_loss_12": 3.0283801317214967, + "ce_loss_17": 2.8597097992897034, + "ce_loss_23": 2.791723334789276, + "ce_loss_3": 3.8920037150382996, + "ce_loss_6": 3.4422079205513, + "epoch": 0.962, + "grad_norm": 912.0, + "kl_loss_12": 557.7178024291992, + "kl_loss_17": 162.21144485473633, + "kl_loss_3": 2349.5974975585937, + "kl_loss_6": 1454.2951721191407, + "learning_rate": 3.630866116995757e-06, + "loss": 1113.8287, + "step": 9620 + }, + { + "ce_loss_12": 3.154905390739441, + "ce_loss_17": 3.0037710547447203, + "ce_loss_23": 2.94018212556839, + "ce_loss_3": 3.9456016659736632, + "ce_loss_6": 3.539944219589233, + "epoch": 0.963, + "grad_norm": 1064.0, + "kl_loss_12": 531.0302154541016, + "kl_loss_17": 155.95521697998046, + "kl_loss_3": 2167.8955261230467, + "kl_loss_6": 1353.688787841797, + "learning_rate": 3.4424990570994797e-06, + "loss": 1080.9525, + "step": 9630 + }, + { + "ce_loss_12": 3.1488232731819155, + "ce_loss_17": 2.9942864537239076, + "ce_loss_23": 2.9292378664016723, + "ce_loss_3": 3.945158064365387, + "ce_loss_6": 3.535008955001831, + "epoch": 0.964, + "grad_norm": 1232.0, + "kl_loss_12": 535.8747940063477, + "kl_loss_17": 156.66651306152343, + "kl_loss_3": 2199.7471130371096, + "kl_loss_6": 1382.6786376953125, + "learning_rate": 3.2591323257248896e-06, + "loss": 1068.9333, + "step": 9640 + }, + { + "ce_loss_12": 3.01090430021286, + "ce_loss_17": 2.854299783706665, + "ce_loss_23": 2.7875581622123717, + "ce_loss_3": 3.8374653458595276, + "ce_loss_6": 3.4172415375709533, + "epoch": 0.965, + "grad_norm": 1104.0, + "kl_loss_12": 534.776318359375, + "kl_loss_17": 155.20736236572264, + "kl_loss_3": 2241.1626525878905, + "kl_loss_6": 1406.0585571289062, + "learning_rate": 3.0807677693729385e-06, + "loss": 1091.4187, + "step": 9650 + }, + { + "ce_loss_12": 3.1849361419677735, + "ce_loss_17": 3.0277846574783327, + "ce_loss_23": 2.9649621844291687, + "ce_loss_3": 3.972131061553955, + "ce_loss_6": 3.5725770950317384, + "epoch": 0.966, + "grad_norm": 1176.0, + "kl_loss_12": 536.5375106811523, + "kl_loss_17": 157.157373046875, + "kl_loss_3": 2174.757568359375, + "kl_loss_6": 1369.9640502929688, + "learning_rate": 2.9074071841727055e-06, + "loss": 1053.3885, + "step": 9660 + }, + { + "ce_loss_12": 3.1240636467933656, + "ce_loss_17": 2.9632591128349306, + "ce_loss_23": 2.898830235004425, + "ce_loss_3": 3.920987141132355, + "ce_loss_6": 3.511180579662323, + "epoch": 0.967, + "grad_norm": 1160.0, + "kl_loss_12": 536.1274505615235, + "kl_loss_17": 157.79016342163087, + "kl_loss_3": 2201.714074707031, + "kl_loss_6": 1379.7575561523438, + "learning_rate": 2.739052315863355e-06, + "loss": 1049.1912, + "step": 9670 + }, + { + "ce_loss_12": 3.092950427532196, + "ce_loss_17": 2.93706796169281, + "ce_loss_23": 2.8726242065429686, + "ce_loss_3": 3.9104288458824157, + "ce_loss_6": 3.4850559711456297, + "epoch": 0.968, + "grad_norm": 1056.0, + "kl_loss_12": 534.6220748901367, + "kl_loss_17": 156.35627136230468, + "kl_loss_3": 2232.6286499023436, + "kl_loss_6": 1385.5797973632812, + "learning_rate": 2.5757048597765396e-06, + "loss": 1064.2279, + "step": 9680 + }, + { + "ce_loss_12": 3.1135802984237673, + "ce_loss_17": 2.955181133747101, + "ce_loss_23": 2.8891377568244936, + "ce_loss_3": 3.9307027220726014, + "ce_loss_6": 3.5102320075035096, + "epoch": 0.969, + "grad_norm": 980.0, + "kl_loss_12": 539.390592956543, + "kl_loss_17": 158.1667694091797, + "kl_loss_3": 2229.200085449219, + "kl_loss_6": 1389.3835510253907, + "learning_rate": 2.417366460819359e-06, + "loss": 1076.9048, + "step": 9690 + }, + { + "ce_loss_12": 3.1266333937644957, + "ce_loss_17": 2.9659619688987733, + "ce_loss_23": 2.8987301349639893, + "ce_loss_3": 3.9571635603904722, + "ce_loss_6": 3.5298823595046995, + "epoch": 0.97, + "grad_norm": 1352.0, + "kl_loss_12": 549.3267044067383, + "kl_loss_17": 161.9480407714844, + "kl_loss_3": 2268.3638305664062, + "kl_loss_6": 1410.324609375, + "learning_rate": 2.2640387134577057e-06, + "loss": 1073.3981, + "step": 9700 + }, + { + "ce_loss_12": 3.0461677074432374, + "ce_loss_17": 2.897366940975189, + "ce_loss_23": 2.833691942691803, + "ce_loss_3": 3.8217989802360535, + "ce_loss_6": 3.418661153316498, + "epoch": 0.971, + "grad_norm": 1184.0, + "kl_loss_12": 510.2892547607422, + "kl_loss_17": 149.87894020080566, + "kl_loss_3": 2106.054510498047, + "kl_loss_6": 1317.9690856933594, + "learning_rate": 2.115723161700278e-06, + "loss": 1044.9428, + "step": 9710 + }, + { + "ce_loss_12": 3.0436718821525575, + "ce_loss_17": 2.8807451009750364, + "ce_loss_23": 2.81461740732193, + "ce_loss_3": 3.8790770292282106, + "ce_loss_6": 3.4477824091911318, + "epoch": 0.972, + "grad_norm": 1104.0, + "kl_loss_12": 550.6471160888672, + "kl_loss_17": 160.43698272705078, + "kl_loss_3": 2283.2156616210937, + "kl_loss_6": 1422.974462890625, + "learning_rate": 1.9724212990830937e-06, + "loss": 1093.6719, + "step": 9720 + }, + { + "ce_loss_12": 3.1684569835662844, + "ce_loss_17": 3.006157600879669, + "ce_loss_23": 2.942043721675873, + "ce_loss_3": 3.9932775855064393, + "ce_loss_6": 3.5714781284332275, + "epoch": 0.973, + "grad_norm": 1080.0, + "kl_loss_12": 547.3026596069336, + "kl_loss_17": 160.0761505126953, + "kl_loss_3": 2256.4254943847654, + "kl_loss_6": 1413.3182495117187, + "learning_rate": 1.8341345686543331e-06, + "loss": 1081.5486, + "step": 9730 + }, + { + "ce_loss_12": 3.1531498193740846, + "ce_loss_17": 2.995375192165375, + "ce_loss_23": 2.928957664966583, + "ce_loss_3": 3.9263521432876587, + "ce_loss_6": 3.5328819274902346, + "epoch": 0.974, + "grad_norm": 1136.0, + "kl_loss_12": 528.3388610839844, + "kl_loss_17": 155.42781448364258, + "kl_loss_3": 2144.5085571289064, + "kl_loss_6": 1356.7732421875, + "learning_rate": 1.7008643629596864e-06, + "loss": 1078.8777, + "step": 9740 + }, + { + "ce_loss_12": 3.131344759464264, + "ce_loss_17": 2.9758414387702943, + "ce_loss_23": 2.9093008756637575, + "ce_loss_3": 3.940278100967407, + "ce_loss_6": 3.523132836818695, + "epoch": 0.975, + "grad_norm": 1040.0, + "kl_loss_12": 541.175668334961, + "kl_loss_17": 158.34674148559571, + "kl_loss_3": 2238.6670532226562, + "kl_loss_6": 1384.8669006347657, + "learning_rate": 1.5726120240288633e-06, + "loss": 1088.9391, + "step": 9750 + }, + { + "ce_loss_12": 3.0473599314689634, + "ce_loss_17": 2.8926878452301024, + "ce_loss_23": 2.829075610637665, + "ce_loss_3": 3.850501024723053, + "ce_loss_6": 3.436177659034729, + "epoch": 0.976, + "grad_norm": 1488.0, + "kl_loss_12": 535.9018264770508, + "kl_loss_17": 155.93840789794922, + "kl_loss_3": 2202.1723388671876, + "kl_loss_6": 1390.6271606445312, + "learning_rate": 1.4493788433612708e-06, + "loss": 1067.7689, + "step": 9760 + }, + { + "ce_loss_12": 3.1586664438247682, + "ce_loss_17": 2.99827094078064, + "ce_loss_23": 2.9324021100997926, + "ce_loss_3": 3.969194209575653, + "ce_loss_6": 3.5587011575698853, + "epoch": 0.977, + "grad_norm": 932.0, + "kl_loss_12": 545.4004196166992, + "kl_loss_17": 158.38016357421876, + "kl_loss_3": 2244.3901489257814, + "kl_loss_6": 1409.5289794921875, + "learning_rate": 1.3311660619138578e-06, + "loss": 1086.8182, + "step": 9770 + }, + { + "ce_loss_12": 3.147229480743408, + "ce_loss_17": 2.9906322836875914, + "ce_loss_23": 2.92654265165329, + "ce_loss_3": 3.9147791504859923, + "ce_loss_6": 3.5209817767143248, + "epoch": 0.978, + "grad_norm": 1248.0, + "kl_loss_12": 535.7907821655274, + "kl_loss_17": 158.41741638183595, + "kl_loss_3": 2124.4832458496094, + "kl_loss_6": 1348.4041625976563, + "learning_rate": 1.2179748700879012e-06, + "loss": 1068.7557, + "step": 9780 + }, + { + "ce_loss_12": 3.087145209312439, + "ce_loss_17": 2.9269677639007567, + "ce_loss_23": 2.861637556552887, + "ce_loss_3": 3.889015567302704, + "ce_loss_6": 3.478603518009186, + "epoch": 0.979, + "grad_norm": 1320.0, + "kl_loss_12": 530.5468521118164, + "kl_loss_17": 155.6657455444336, + "kl_loss_3": 2190.927166748047, + "kl_loss_6": 1372.2948608398438, + "learning_rate": 1.1098064077174619e-06, + "loss": 1069.6716, + "step": 9790 + }, + { + "ce_loss_12": 3.119102585315704, + "ce_loss_17": 2.9587631821632385, + "ce_loss_23": 2.8912365794181825, + "ce_loss_3": 3.9466206073760985, + "ce_loss_6": 3.51912544965744, + "epoch": 0.98, + "grad_norm": 1224.0, + "kl_loss_12": 538.2732711791992, + "kl_loss_17": 155.36832962036132, + "kl_loss_3": 2256.1044494628904, + "kl_loss_6": 1405.48935546875, + "learning_rate": 1.006661764057837e-06, + "loss": 1078.2967, + "step": 9800 + }, + { + "ce_loss_12": 3.120966613292694, + "ce_loss_17": 2.9633092999458315, + "ce_loss_23": 2.8999213218688964, + "ce_loss_3": 3.9308133125305176, + "ce_loss_6": 3.51082307100296, + "epoch": 0.981, + "grad_norm": 1136.0, + "kl_loss_12": 538.3776977539062, + "kl_loss_17": 155.93407440185547, + "kl_loss_3": 2217.1360717773437, + "kl_loss_6": 1385.0979919433594, + "learning_rate": 9.085419777743465e-07, + "loss": 1065.499, + "step": 9810 + }, + { + "ce_loss_12": 3.0673842072486877, + "ce_loss_17": 2.9180146932601927, + "ce_loss_23": 2.8566697955131533, + "ce_loss_3": 3.884550166130066, + "ce_loss_6": 3.4691938281059267, + "epoch": 0.982, + "grad_norm": 988.0, + "kl_loss_12": 524.5106842041016, + "kl_loss_17": 151.49212799072265, + "kl_loss_3": 2208.735736083984, + "kl_loss_6": 1387.4951110839843, + "learning_rate": 8.15448036932176e-07, + "loss": 1052.5303, + "step": 9820 + }, + { + "ce_loss_12": 3.1133029580116274, + "ce_loss_17": 2.9559295177459717, + "ce_loss_23": 2.892114531993866, + "ce_loss_3": 3.925271010398865, + "ce_loss_6": 3.5094059109687805, + "epoch": 0.983, + "grad_norm": 1208.0, + "kl_loss_12": 541.3424270629882, + "kl_loss_17": 157.8437515258789, + "kl_loss_3": 2225.1154357910154, + "kl_loss_6": 1397.5705688476562, + "learning_rate": 7.273808789862724e-07, + "loss": 1085.3074, + "step": 9830 + }, + { + "ce_loss_12": 3.1807573556900026, + "ce_loss_17": 3.0257806301116945, + "ce_loss_23": 2.959278440475464, + "ce_loss_3": 3.97727655172348, + "ce_loss_6": 3.5681079745292665, + "epoch": 0.984, + "grad_norm": 1120.0, + "kl_loss_12": 542.2464050292969, + "kl_loss_17": 158.70405960083008, + "kl_loss_3": 2211.959191894531, + "kl_loss_6": 1392.662744140625, + "learning_rate": 6.443413907720186e-07, + "loss": 1067.6535, + "step": 9840 + }, + { + "ce_loss_12": 3.1271595120429994, + "ce_loss_17": 2.9681738018989563, + "ce_loss_23": 2.903019917011261, + "ce_loss_3": 3.937088334560394, + "ce_loss_6": 3.5122796654701234, + "epoch": 0.985, + "grad_norm": 1120.0, + "kl_loss_12": 536.1871688842773, + "kl_loss_17": 157.89398803710938, + "kl_loss_3": 2195.043994140625, + "kl_loss_6": 1368.721240234375, + "learning_rate": 5.663304084960185e-07, + "loss": 1064.3484, + "step": 9850 + }, + { + "ce_loss_12": 3.0556672334671022, + "ce_loss_17": 2.896334195137024, + "ce_loss_23": 2.8314119935035706, + "ce_loss_3": 3.8760223388671875, + "ce_loss_6": 3.450761950016022, + "epoch": 0.986, + "grad_norm": 1416.0, + "kl_loss_12": 539.7659469604492, + "kl_loss_17": 159.50339889526367, + "kl_loss_3": 2237.8478149414063, + "kl_loss_6": 1400.7106201171875, + "learning_rate": 4.933487177280482e-07, + "loss": 1060.8133, + "step": 9860 + }, + { + "ce_loss_12": 3.14933443069458, + "ce_loss_17": 2.995462489128113, + "ce_loss_23": 2.9319130182266235, + "ce_loss_3": 3.9437816858291628, + "ce_loss_6": 3.5285532593727114, + "epoch": 0.987, + "grad_norm": 1136.0, + "kl_loss_12": 525.5044784545898, + "kl_loss_17": 153.37696380615233, + "kl_loss_3": 2184.7522216796874, + "kl_loss_6": 1353.022198486328, + "learning_rate": 4.2539705339295075e-07, + "loss": 1053.7746, + "step": 9870 + }, + { + "ce_loss_12": 3.0208406925201414, + "ce_loss_17": 2.8577373743057253, + "ce_loss_23": 2.7932136178016664, + "ce_loss_3": 3.8365853309631346, + "ce_loss_6": 3.416055428981781, + "epoch": 0.988, + "grad_norm": 1640.0, + "kl_loss_12": 548.2541458129883, + "kl_loss_17": 155.56036453247071, + "kl_loss_3": 2236.24755859375, + "kl_loss_6": 1402.8287963867188, + "learning_rate": 3.6247609976319816e-07, + "loss": 1067.5872, + "step": 9880 + }, + { + "ce_loss_12": 3.103432631492615, + "ce_loss_17": 2.939455437660217, + "ce_loss_23": 2.873777425289154, + "ce_loss_3": 3.9283102631568907, + "ce_loss_6": 3.5072454452514648, + "epoch": 0.989, + "grad_norm": 1120.0, + "kl_loss_12": 544.3662368774415, + "kl_loss_17": 159.0839111328125, + "kl_loss_3": 2247.000842285156, + "kl_loss_6": 1411.2964294433593, + "learning_rate": 3.0458649045211895e-07, + "loss": 1099.9736, + "step": 9890 + }, + { + "ce_loss_12": 3.0800060272216796, + "ce_loss_17": 2.91620032787323, + "ce_loss_23": 2.8455692291259767, + "ce_loss_3": 3.902763283252716, + "ce_loss_6": 3.486640763282776, + "epoch": 0.99, + "grad_norm": 1200.0, + "kl_loss_12": 552.108773803711, + "kl_loss_17": 162.36163406372071, + "kl_loss_3": 2241.4049865722654, + "kl_loss_6": 1414.161883544922, + "learning_rate": 2.517288084074587e-07, + "loss": 1095.7029, + "step": 9900 + }, + { + "ce_loss_12": 3.1283355593681335, + "ce_loss_17": 2.9556203722953795, + "ce_loss_23": 2.884258818626404, + "ce_loss_3": 3.9650839805603026, + "ce_loss_6": 3.543080711364746, + "epoch": 0.991, + "grad_norm": 1112.0, + "kl_loss_12": 562.2683700561523, + "kl_loss_17": 162.61367416381836, + "kl_loss_3": 2298.439349365234, + "kl_loss_6": 1445.863427734375, + "learning_rate": 2.0390358590538505e-07, + "loss": 1095.3518, + "step": 9910 + }, + { + "ce_loss_12": 3.113189232349396, + "ce_loss_17": 2.954020929336548, + "ce_loss_23": 2.887207198143005, + "ce_loss_3": 3.932934558391571, + "ce_loss_6": 3.513229751586914, + "epoch": 0.992, + "grad_norm": 964.0, + "kl_loss_12": 544.9443725585937, + "kl_loss_17": 158.9228401184082, + "kl_loss_3": 2239.096240234375, + "kl_loss_6": 1402.6988220214844, + "learning_rate": 1.61111304545436e-07, + "loss": 1071.2314, + "step": 9920 + }, + { + "ce_loss_12": 3.090544879436493, + "ce_loss_17": 2.932784688472748, + "ce_loss_23": 2.8656368136405943, + "ce_loss_3": 3.892158830165863, + "ce_loss_6": 3.480930304527283, + "epoch": 0.993, + "grad_norm": 1336.0, + "kl_loss_12": 541.1668838500976, + "kl_loss_17": 157.00824203491212, + "kl_loss_3": 2216.0669860839844, + "kl_loss_6": 1397.4162292480469, + "learning_rate": 1.2335239524541298e-07, + "loss": 1061.3529, + "step": 9930 + }, + { + "ce_loss_12": 3.056631350517273, + "ce_loss_17": 2.8970078825950623, + "ce_loss_23": 2.8318159580230713, + "ce_loss_3": 3.8693312406539917, + "ce_loss_6": 3.4506518840789795, + "epoch": 0.994, + "grad_norm": 780.0, + "kl_loss_12": 536.3515884399415, + "kl_loss_17": 156.6513999938965, + "kl_loss_3": 2208.554571533203, + "kl_loss_6": 1377.04248046875, + "learning_rate": 9.06272382371065e-08, + "loss": 1073.0936, + "step": 9940 + }, + { + "ce_loss_12": 3.1232089519500734, + "ce_loss_17": 2.9615687847137453, + "ce_loss_23": 2.899142873287201, + "ce_loss_3": 3.947288119792938, + "ce_loss_6": 3.5209755420684816, + "epoch": 0.995, + "grad_norm": 1448.0, + "kl_loss_12": 547.1004272460938, + "kl_loss_17": 156.98374252319337, + "kl_loss_3": 2258.907989501953, + "kl_loss_6": 1410.8921264648438, + "learning_rate": 6.293616306246586e-08, + "loss": 1080.6102, + "step": 9950 + }, + { + "ce_loss_12": 3.1039719104766847, + "ce_loss_17": 2.952614200115204, + "ce_loss_23": 2.8885851383209227, + "ce_loss_3": 3.891808068752289, + "ce_loss_6": 3.4867576241493223, + "epoch": 0.996, + "grad_norm": 1312.0, + "kl_loss_12": 524.452732849121, + "kl_loss_17": 153.04303207397462, + "kl_loss_3": 2154.74208984375, + "kl_loss_6": 1352.1610107421875, + "learning_rate": 4.027944857032395e-08, + "loss": 1039.3016, + "step": 9960 + }, + { + "ce_loss_12": 3.100421166419983, + "ce_loss_17": 2.954938817024231, + "ce_loss_23": 2.895737874507904, + "ce_loss_3": 3.8738303184509277, + "ce_loss_6": 3.469689059257507, + "epoch": 0.997, + "grad_norm": 960.0, + "kl_loss_12": 509.41295318603517, + "kl_loss_17": 149.07791976928712, + "kl_loss_3": 2100.171911621094, + "kl_loss_6": 1308.5161010742188, + "learning_rate": 2.265732291356626e-08, + "loss": 1028.9963, + "step": 9970 + }, + { + "ce_loss_12": 3.146505868434906, + "ce_loss_17": 2.993780553340912, + "ce_loss_23": 2.926430571079254, + "ce_loss_3": 3.938515841960907, + "ce_loss_6": 3.5314017653465273, + "epoch": 0.998, + "grad_norm": 1160.0, + "kl_loss_12": 533.1759674072266, + "kl_loss_17": 157.0031936645508, + "kl_loss_3": 2155.145294189453, + "kl_loss_6": 1354.1124328613282, + "learning_rate": 1.0069963546743833e-08, + "loss": 1073.4172, + "step": 9980 + }, + { + "ce_loss_12": 3.137771463394165, + "ce_loss_17": 2.977656054496765, + "ce_loss_23": 2.9086800813674927, + "ce_loss_3": 3.9426164269447326, + "ce_loss_6": 3.52817143201828, + "epoch": 0.999, + "grad_norm": 1312.0, + "kl_loss_12": 540.6506790161133, + "kl_loss_17": 157.4025550842285, + "kl_loss_3": 2219.0190673828124, + "kl_loss_6": 1389.755682373047, + "learning_rate": 2.517497224463483e-09, + "loss": 1068.3458, + "step": 9990 + }, + { + "ce_loss_12": 3.0929014682769775, + "ce_loss_17": 2.928170955181122, + "ce_loss_23": 2.8597790002822876, + "ce_loss_3": 3.9463054656982424, + "ce_loss_6": 3.5110191822052004, + "epoch": 1.0, + "grad_norm": 1272.0, + "kl_loss_12": 554.8514373779296, + "kl_loss_17": 161.27447433471679, + "kl_loss_3": 2324.8254516601564, + "kl_loss_6": 1451.786395263672, + "learning_rate": 0.0, + "loss": 1104.2758, + "step": 10000 + } + ], + "logging_steps": 10, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.502582338838856e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}