{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "ce_loss_12": 11.16768741607666, "ce_loss_17": 10.69606876373291, "ce_loss_23": 2.8781241178512573, "ce_loss_3": 13.679932594299316, "ce_loss_6": 12.782220363616943, "epoch": 0.0001, "grad_norm": 132096.0, "kl_loss_12": 17880.517578125, "kl_loss_17": 17056.2138671875, "kl_loss_3": 22473.77734375, "kl_loss_6": 20747.6396484375, "learning_rate": 1e-05, "loss": 20223.791, "step": 1 }, { "ce_loss_12": 8.242788076400757, "ce_loss_17": 7.6790783405303955, "ce_loss_23": 2.9531757831573486, "ce_loss_3": 9.965806987550524, "ce_loss_6": 9.590416616863674, "epoch": 0.001, "grad_norm": 31232.0, "kl_loss_12": 11030.585747612848, "kl_loss_17": 10147.679416232639, "kl_loss_3": 14175.968532986111, "kl_loss_6": 13466.597493489584, "learning_rate": 0.0001, "loss": 12302.9245, "step": 10 }, { "ce_loss_12": 5.3866219282150265, "ce_loss_17": 4.39049506187439, "ce_loss_23": 2.9577841758728027, "ce_loss_3": 7.321023607254029, "ce_loss_6": 6.767641139030457, "epoch": 0.002, "grad_norm": 12224.0, "kl_loss_12": 4647.222497558594, "kl_loss_17": 2857.6106811523437, "kl_loss_3": 8220.148486328126, "kl_loss_6": 7194.997045898437, "learning_rate": 0.0002, "loss": 5755.3641, "step": 20 }, { "ce_loss_12": 4.471867954730987, "ce_loss_17": 3.596786892414093, "ce_loss_23": 2.7707528471946716, "ce_loss_3": 6.453728103637696, "ce_loss_6": 5.812050318717956, "epoch": 0.003, "grad_norm": 5696.0, "kl_loss_12": 3281.927099609375, "kl_loss_17": 1651.611065673828, "kl_loss_3": 7046.269970703125, "kl_loss_6": 5852.9330078125, "learning_rate": 0.0003, "loss": 4405.1645, "step": 30 }, { "ce_loss_12": 4.35389233827591, "ce_loss_17": 3.553706979751587, "ce_loss_23": 2.934287405014038, "ce_loss_3": 6.034887075424194, "ce_loss_6": 5.435985016822815, "epoch": 0.004, "grad_norm": 9088.0, "kl_loss_12": 2733.2104614257814, "kl_loss_17": 1217.1323120117188, "kl_loss_3": 5916.643041992187, "kl_loss_6": 4787.089672851563, "learning_rate": 0.0004, "loss": 3687.0945, "step": 40 }, { "ce_loss_12": 4.149027359485626, "ce_loss_17": 3.4113935589790345, "ce_loss_23": 2.8995264291763307, "ce_loss_3": 5.78397433757782, "ce_loss_6": 5.1235936164855955, "epoch": 0.005, "grad_norm": 4704.0, "kl_loss_12": 2461.4163208007812, "kl_loss_17": 989.9328491210938, "kl_loss_3": 5603.44287109375, "kl_loss_6": 4341.613940429687, "learning_rate": 0.0005, "loss": 3334.0184, "step": 50 }, { "ce_loss_12": 4.0183366417884825, "ce_loss_17": 3.324011433124542, "ce_loss_23": 2.9169072985649107, "ce_loss_3": 5.5873085260391235, "ce_loss_6": 5.030270624160766, "epoch": 0.006, "grad_norm": 14016.0, "kl_loss_12": 2195.4134033203127, "kl_loss_17": 842.8528381347656, "kl_loss_3": 5173.573413085937, "kl_loss_6": 4158.858227539063, "learning_rate": 0.0006, "loss": 3110.6227, "step": 60 }, { "ce_loss_12": 3.900798332691193, "ce_loss_17": 3.257086682319641, "ce_loss_23": 2.8356701850891115, "ce_loss_3": 5.478265619277954, "ce_loss_6": 4.897664165496826, "epoch": 0.007, "grad_norm": 6752.0, "kl_loss_12": 2123.1200561523438, "kl_loss_17": 865.8842193603516, "kl_loss_3": 5140.0406494140625, "kl_loss_6": 4031.4986206054687, "learning_rate": 0.0007, "loss": 3039.6166, "step": 70 }, { "ce_loss_12": 3.9320133566856383, "ce_loss_17": 3.300851571559906, "ce_loss_23": 2.8350846648216246, "ce_loss_3": 5.4221717596054075, "ce_loss_6": 4.767316746711731, "epoch": 0.008, "grad_norm": 3232.0, "kl_loss_12": 2182.824816894531, "kl_loss_17": 1022.9220184326172, "kl_loss_3": 5036.355078125, "kl_loss_6": 3783.0829833984376, "learning_rate": 0.0008, "loss": 3021.7758, "step": 80 }, { "ce_loss_12": 3.963316762447357, "ce_loss_17": 3.374627947807312, "ce_loss_23": 2.8013595581054687, "ce_loss_3": 5.3971950769424435, "ce_loss_6": 4.755392622947693, "epoch": 0.009, "grad_norm": 5504.0, "kl_loss_12": 2318.512774658203, "kl_loss_17": 1111.7156463623046, "kl_loss_3": 5047.43779296875, "kl_loss_6": 3864.2319091796876, "learning_rate": 0.0009000000000000001, "loss": 3077.7311, "step": 90 }, { "ce_loss_12": 4.037562215328217, "ce_loss_17": 3.3241353154182436, "ce_loss_23": 2.906688666343689, "ce_loss_3": 5.441467499732971, "ce_loss_6": 4.868034100532531, "epoch": 0.01, "grad_norm": 17408.0, "kl_loss_12": 2229.615863037109, "kl_loss_17": 832.681689453125, "kl_loss_3": 4962.237451171875, "kl_loss_6": 3902.222119140625, "learning_rate": 0.001, "loss": 2979.5012, "step": 100 }, { "ce_loss_12": 3.8746483087539674, "ce_loss_17": 3.2309106945991517, "ce_loss_23": 2.868969190120697, "ce_loss_3": 5.371089196205139, "ce_loss_6": 4.770741057395935, "epoch": 0.011, "grad_norm": 5696.0, "kl_loss_12": 2015.6501770019531, "kl_loss_17": 733.734262084961, "kl_loss_3": 4902.950366210937, "kl_loss_6": 3788.4685180664064, "learning_rate": 0.0009999974825027757, "loss": 2855.4787, "step": 110 }, { "ce_loss_12": 3.8332433819770815, "ce_loss_17": 3.2423539996147155, "ce_loss_23": 2.930523693561554, "ce_loss_3": 5.298088216781617, "ce_loss_6": 4.743260788917541, "epoch": 0.012, "grad_norm": 7712.0, "kl_loss_12": 1837.9232055664063, "kl_loss_17": 639.3664642333985, "kl_loss_3": 4677.4095458984375, "kl_loss_6": 3642.7612060546876, "learning_rate": 0.0009999899300364532, "loss": 2674.099, "step": 120 }, { "ce_loss_12": 3.7736268401145936, "ce_loss_17": 3.2274181962013246, "ce_loss_23": 2.889433944225311, "ce_loss_3": 5.3178743600845335, "ce_loss_6": 4.753137707710266, "epoch": 0.013, "grad_norm": 4800.0, "kl_loss_12": 1765.98681640625, "kl_loss_17": 678.0200164794921, "kl_loss_3": 4744.243969726563, "kl_loss_6": 3663.6985595703127, "learning_rate": 0.0009999773426770863, "loss": 2724.3738, "step": 130 }, { "ce_loss_12": 3.757179248332977, "ce_loss_17": 3.2548238396644593, "ce_loss_23": 2.9310826420783997, "ce_loss_3": 5.288840341567993, "ce_loss_6": 4.680767583847046, "epoch": 0.014, "grad_norm": 3344.0, "kl_loss_12": 1663.0203979492187, "kl_loss_17": 667.9883514404297, "kl_loss_3": 4617.917041015625, "kl_loss_6": 3446.7722778320312, "learning_rate": 0.0009999597205514296, "loss": 2622.384, "step": 140 }, { "ce_loss_12": 3.7178321361541746, "ce_loss_17": 3.2042754173278807, "ce_loss_23": 2.88481262922287, "ce_loss_3": 5.233242201805115, "ce_loss_6": 4.54782543182373, "epoch": 0.015, "grad_norm": 2336.0, "kl_loss_12": 1673.2152893066407, "kl_loss_17": 669.5892211914063, "kl_loss_3": 4588.619311523437, "kl_loss_6": 3283.3631958007813, "learning_rate": 0.0009999370638369377, "loss": 2563.6086, "step": 150 }, { "ce_loss_12": 3.8975033283233644, "ce_loss_17": 3.259253513813019, "ce_loss_23": 2.925470495223999, "ce_loss_3": 5.237982249259948, "ce_loss_6": 4.5195070028305055, "epoch": 0.016, "grad_norm": 2752.0, "kl_loss_12": 1999.6217224121094, "kl_loss_17": 702.8198181152344, "kl_loss_3": 4536.748388671875, "kl_loss_6": 3186.8176391601564, "learning_rate": 0.000999909372761763, "loss": 2599.7559, "step": 160 }, { "ce_loss_12": 3.9207254886627196, "ce_loss_17": 3.227236843109131, "ce_loss_23": 2.866022825241089, "ce_loss_3": 5.2069617986679075, "ce_loss_6": 4.487017941474915, "epoch": 0.017, "grad_norm": 2240.0, "kl_loss_12": 2135.180523681641, "kl_loss_17": 742.9576934814453, "kl_loss_3": 4628.918359375, "kl_loss_6": 3242.9795776367187, "learning_rate": 0.0009998766476047546, "loss": 2709.5734, "step": 170 }, { "ce_loss_12": 3.880035960674286, "ce_loss_17": 3.2814144134521483, "ce_loss_23": 2.898086893558502, "ce_loss_3": 5.193981313705445, "ce_loss_6": 4.545022559165955, "epoch": 0.018, "grad_norm": 3520.0, "kl_loss_12": 2003.6199279785155, "kl_loss_17": 761.2051483154297, "kl_loss_3": 4523.928857421875, "kl_loss_6": 3293.190075683594, "learning_rate": 0.0009998388886954545, "loss": 2655.2025, "step": 180 }, { "ce_loss_12": 3.7998528599739076, "ce_loss_17": 3.2430348992347717, "ce_loss_23": 2.873298764228821, "ce_loss_3": 5.10157356262207, "ce_loss_6": 4.506469011306763, "epoch": 0.019, "grad_norm": 2080.0, "kl_loss_12": 1879.7686462402344, "kl_loss_17": 774.2787292480468, "kl_loss_3": 4365.284875488282, "kl_loss_6": 3259.860400390625, "learning_rate": 0.0009997960964140947, "loss": 2557.5475, "step": 190 }, { "ce_loss_12": 3.7017858624458313, "ce_loss_17": 3.2199748039245604, "ce_loss_23": 2.8715609908103943, "ce_loss_3": 5.05151731967926, "ce_loss_6": 4.46097469329834, "epoch": 0.02, "grad_norm": 2008.0, "kl_loss_12": 1728.4301513671876, "kl_loss_17": 724.9210571289062, "kl_loss_3": 4324.548291015625, "kl_loss_6": 3188.89443359375, "learning_rate": 0.0009997482711915926, "loss": 2478.465, "step": 200 }, { "ce_loss_12": 3.627460551261902, "ce_loss_17": 3.153350031375885, "ce_loss_23": 2.8465389728546144, "ce_loss_3": 4.983549165725708, "ce_loss_6": 4.3947282314300535, "epoch": 0.021, "grad_norm": 1440.0, "kl_loss_12": 1616.2701904296875, "kl_loss_17": 631.39169921875, "kl_loss_3": 4262.137780761719, "kl_loss_6": 3138.6790161132812, "learning_rate": 0.0009996954135095479, "loss": 2409.6078, "step": 210 }, { "ce_loss_12": 3.645814502239227, "ce_loss_17": 3.1951419949531554, "ce_loss_23": 2.9083182334899904, "ce_loss_3": 5.008230352401734, "ce_loss_6": 4.436058068275452, "epoch": 0.022, "grad_norm": 1752.0, "kl_loss_12": 1510.1014770507813, "kl_loss_17": 580.6792602539062, "kl_loss_3": 4156.8021240234375, "kl_loss_6": 3070.6255737304687, "learning_rate": 0.0009996375239002368, "loss": 2331.6045, "step": 220 }, { "ce_loss_12": 3.7145918250083922, "ce_loss_17": 3.2435343861579895, "ce_loss_23": 2.9827067852020264, "ce_loss_3": 5.028963327407837, "ce_loss_6": 4.5062199354171755, "epoch": 0.023, "grad_norm": 2272.0, "kl_loss_12": 1495.8587890625, "kl_loss_17": 542.103482055664, "kl_loss_3": 4070.92353515625, "kl_loss_6": 3063.4745239257813, "learning_rate": 0.0009995746029466072, "loss": 2299.4912, "step": 230 }, { "ce_loss_12": 3.5051693081855775, "ce_loss_17": 3.046588110923767, "ce_loss_23": 2.790693646669388, "ce_loss_3": 4.909505295753479, "ce_loss_6": 4.321071481704712, "epoch": 0.024, "grad_norm": 2256.0, "kl_loss_12": 1484.3840026855469, "kl_loss_17": 527.0030975341797, "kl_loss_3": 4215.655187988281, "kl_loss_6": 3102.826525878906, "learning_rate": 0.0009995066512822719, "loss": 2265.1627, "step": 240 }, { "ce_loss_12": 3.604202592372894, "ce_loss_17": 3.1324368119239807, "ce_loss_23": 2.885883128643036, "ce_loss_3": 5.031421732902527, "ce_loss_6": 4.435573077201843, "epoch": 0.025, "grad_norm": 1168.0, "kl_loss_12": 1476.4596740722657, "kl_loss_17": 506.74576568603516, "kl_loss_3": 4252.793518066406, "kl_loss_6": 3113.574108886719, "learning_rate": 0.000999433669591504, "loss": 2256.191, "step": 250 }, { "ce_loss_12": 3.524767017364502, "ce_loss_17": 3.0347627997398376, "ce_loss_23": 2.791684591770172, "ce_loss_3": 4.904810905456543, "ce_loss_6": 4.284141218662262, "epoch": 0.026, "grad_norm": 1584.0, "kl_loss_12": 1510.495831298828, "kl_loss_17": 516.5738159179688, "kl_loss_3": 4220.025524902344, "kl_loss_6": 3024.763671875, "learning_rate": 0.000999355658609228, "loss": 2270.6297, "step": 260 }, { "ce_loss_12": 3.5572774767875672, "ce_loss_17": 3.0783823490142823, "ce_loss_23": 2.8128905057907105, "ce_loss_3": 5.009525322914124, "ce_loss_6": 4.3709392786026005, "epoch": 0.027, "grad_norm": 2288.0, "kl_loss_12": 1488.0630615234375, "kl_loss_17": 542.858137512207, "kl_loss_3": 4309.648498535156, "kl_loss_6": 3095.8380126953125, "learning_rate": 0.0009992726191210138, "loss": 2323.3471, "step": 270 }, { "ce_loss_12": 3.5629722476005554, "ce_loss_17": 3.111860430240631, "ce_loss_23": 2.8538637518882752, "ce_loss_3": 4.906073951721192, "ce_loss_6": 4.32850182056427, "epoch": 0.028, "grad_norm": 1696.0, "kl_loss_12": 1444.6719482421875, "kl_loss_17": 527.0346221923828, "kl_loss_3": 4061.671533203125, "kl_loss_6": 2958.3097534179688, "learning_rate": 0.0009991845519630679, "loss": 2233.8912, "step": 280 }, { "ce_loss_12": 3.4362335085868834, "ce_loss_17": 2.9920791387557983, "ce_loss_23": 2.7459678411483766, "ce_loss_3": 4.801335573196411, "ce_loss_6": 4.208979916572571, "epoch": 0.029, "grad_norm": 2128.0, "kl_loss_12": 1418.6752685546876, "kl_loss_17": 512.7696136474609, "kl_loss_3": 4087.991943359375, "kl_loss_6": 2944.8553955078123, "learning_rate": 0.0009990914580222257, "loss": 2234.8063, "step": 290 }, { "ce_loss_12": 3.550733494758606, "ce_loss_17": 3.1482574582099914, "ce_loss_23": 2.8788382768630982, "ce_loss_3": 4.804766297340393, "ce_loss_6": 4.249274849891663, "epoch": 0.03, "grad_norm": 1392.0, "kl_loss_12": 1404.602276611328, "kl_loss_17": 557.4966827392578, "kl_loss_3": 3861.957177734375, "kl_loss_6": 2783.5663940429686, "learning_rate": 0.0009989933382359422, "loss": 2194.6727, "step": 300 }, { "ce_loss_12": 3.540028524398804, "ce_loss_17": 3.1266565561294555, "ce_loss_23": 2.891022598743439, "ce_loss_3": 4.818720889091492, "ce_loss_6": 4.2541723370552065, "epoch": 0.031, "grad_norm": 1056.0, "kl_loss_12": 1339.680322265625, "kl_loss_17": 517.6963348388672, "kl_loss_3": 3839.874609375, "kl_loss_6": 2763.8606811523437, "learning_rate": 0.0009988901935922825, "loss": 2138.709, "step": 310 }, { "ce_loss_12": 3.4157469153404234, "ce_loss_17": 2.9800336003303527, "ce_loss_23": 2.7450334310531614, "ce_loss_3": 4.779390811920166, "ce_loss_6": 4.189510977268219, "epoch": 0.032, "grad_norm": 1032.0, "kl_loss_12": 1387.2072631835938, "kl_loss_17": 491.8215576171875, "kl_loss_3": 4056.8338256835937, "kl_loss_6": 2929.2824951171874, "learning_rate": 0.0009987820251299122, "loss": 2160.0242, "step": 320 }, { "ce_loss_12": 3.5009677052497863, "ce_loss_17": 3.096353495121002, "ce_loss_23": 2.861190211772919, "ce_loss_3": 4.8097329378128055, "ce_loss_6": 4.232736647129059, "epoch": 0.033, "grad_norm": 1376.0, "kl_loss_12": 1327.8585388183594, "kl_loss_17": 477.89680938720704, "kl_loss_3": 3902.2681884765625, "kl_loss_6": 2785.81064453125, "learning_rate": 0.0009986688339380862, "loss": 2112.4908, "step": 330 }, { "ce_loss_12": 3.4142756819725038, "ce_loss_17": 3.0398844361305235, "ce_loss_23": 2.8264341592788695, "ce_loss_3": 4.700353384017944, "ce_loss_6": 4.1305066585540775, "epoch": 0.034, "grad_norm": 1512.0, "kl_loss_12": 1242.5486267089843, "kl_loss_17": 461.6464080810547, "kl_loss_3": 3757.1355224609374, "kl_loss_6": 2665.6082275390627, "learning_rate": 0.0009985506211566387, "loss": 2053.3332, "step": 340 }, { "ce_loss_12": 3.43664208650589, "ce_loss_17": 3.072474813461304, "ce_loss_23": 2.850420904159546, "ce_loss_3": 4.6930335521697994, "ce_loss_6": 4.137925624847412, "epoch": 0.035, "grad_norm": 1192.0, "kl_loss_12": 1218.1204467773437, "kl_loss_17": 466.7091690063477, "kl_loss_3": 3688.2295532226562, "kl_loss_6": 2630.9764404296875, "learning_rate": 0.0009984273879759713, "loss": 2012.377, "step": 350 }, { "ce_loss_12": 3.4873570442199706, "ce_loss_17": 3.1122740864753724, "ce_loss_23": 2.8793745040893555, "ce_loss_3": 4.758946943283081, "ce_loss_6": 4.204195618629456, "epoch": 0.036, "grad_norm": 1368.0, "kl_loss_12": 1268.138409423828, "kl_loss_17": 487.7879470825195, "kl_loss_3": 3747.4706909179686, "kl_loss_6": 2682.5483276367186, "learning_rate": 0.0009982991356370402, "loss": 2068.6064, "step": 360 }, { "ce_loss_12": 3.446204948425293, "ce_loss_17": 3.082586574554443, "ce_loss_23": 2.8578404664993284, "ce_loss_3": 4.716637182235718, "ce_loss_6": 4.157662272453308, "epoch": 0.037, "grad_norm": 1936.0, "kl_loss_12": 1237.1415466308595, "kl_loss_17": 465.0269577026367, "kl_loss_3": 3733.6424560546875, "kl_loss_6": 2662.3063354492188, "learning_rate": 0.0009981658654313456, "loss": 2041.6695, "step": 370 }, { "ce_loss_12": 3.5023183941841127, "ce_loss_17": 3.140884268283844, "ce_loss_23": 2.924031972885132, "ce_loss_3": 4.732712912559509, "ce_loss_6": 4.188665843009948, "epoch": 0.038, "grad_norm": 1096.0, "kl_loss_12": 1206.7100524902344, "kl_loss_17": 459.29900512695315, "kl_loss_3": 3623.5970947265623, "kl_loss_6": 2586.8260009765627, "learning_rate": 0.000998027578700917, "loss": 2001.7613, "step": 380 }, { "ce_loss_12": 3.4590607047080995, "ce_loss_17": 3.0973698616027834, "ce_loss_23": 2.8757705688476562, "ce_loss_3": 4.702216649055481, "ce_loss_6": 4.169513952732086, "epoch": 0.039, "grad_norm": 1984.0, "kl_loss_12": 1201.043963623047, "kl_loss_17": 452.04960479736326, "kl_loss_3": 3640.4640258789063, "kl_loss_6": 2617.0949829101564, "learning_rate": 0.0009978842768382998, "loss": 2006.4363, "step": 390 }, { "ce_loss_12": 3.4306026816368105, "ce_loss_17": 3.1000054597854616, "ce_loss_23": 2.88355758190155, "ce_loss_3": 4.647925066947937, "ce_loss_6": 4.152370321750641, "epoch": 0.04, "grad_norm": 2400.0, "kl_loss_12": 1145.0473388671876, "kl_loss_17": 445.755876159668, "kl_loss_3": 3536.4222412109375, "kl_loss_6": 2587.5209228515623, "learning_rate": 0.0009977359612865424, "loss": 1947.2914, "step": 400 }, { "ce_loss_12": 3.448613131046295, "ce_loss_17": 3.099381995201111, "ce_loss_23": 2.891097903251648, "ce_loss_3": 4.676835989952087, "ce_loss_6": 4.156297981739044, "epoch": 0.041, "grad_norm": 1320.0, "kl_loss_12": 1175.8864990234374, "kl_loss_17": 440.0459350585937, "kl_loss_3": 3581.45166015625, "kl_loss_6": 2609.4977783203126, "learning_rate": 0.0009975826335391806, "loss": 1938.3453, "step": 410 }, { "ce_loss_12": 3.4343478679656982, "ce_loss_17": 3.125094103813171, "ce_loss_23": 2.910094475746155, "ce_loss_3": 4.653401112556457, "ce_loss_6": 4.147323703765869, "epoch": 0.042, "grad_norm": 1872.0, "kl_loss_12": 1100.34775390625, "kl_loss_17": 455.45897521972654, "kl_loss_3": 3498.450341796875, "kl_loss_6": 2518.9093017578125, "learning_rate": 0.0009974242951402235, "loss": 1903.6037, "step": 420 }, { "ce_loss_12": 3.462802863121033, "ce_loss_17": 3.127783679962158, "ce_loss_23": 2.914916479587555, "ce_loss_3": 4.686368560791015, "ce_loss_6": 4.148653864860535, "epoch": 0.043, "grad_norm": 1168.0, "kl_loss_12": 1135.933984375, "kl_loss_17": 439.07324981689453, "kl_loss_3": 3567.4921997070314, "kl_loss_6": 2534.5516723632813, "learning_rate": 0.0009972609476841367, "loss": 1900.857, "step": 430 }, { "ce_loss_12": 3.3758052229881286, "ce_loss_17": 3.0453759908676146, "ce_loss_23": 2.8323325395584105, "ce_loss_3": 4.628264284133911, "ce_loss_6": 4.12095662355423, "epoch": 0.044, "grad_norm": 1280.0, "kl_loss_12": 1123.0216278076173, "kl_loss_17": 451.2273483276367, "kl_loss_3": 3576.0677978515623, "kl_loss_6": 2601.298620605469, "learning_rate": 0.0009970925928158272, "loss": 1941.1031, "step": 440 }, { "ce_loss_12": 3.3380956411361695, "ce_loss_17": 2.9893947720527647, "ce_loss_23": 2.7863958835601808, "ce_loss_3": 4.597060704231263, "ce_loss_6": 4.063511395454407, "epoch": 0.045, "grad_norm": 1696.0, "kl_loss_12": 1161.7735687255858, "kl_loss_17": 422.92466888427737, "kl_loss_3": 3636.7440551757813, "kl_loss_6": 2627.9586547851563, "learning_rate": 0.000996919232230627, "loss": 1953.0043, "step": 450 }, { "ce_loss_12": 3.3880254507064818, "ce_loss_17": 3.0471160888671873, "ce_loss_23": 2.86956307888031, "ce_loss_3": 4.579530930519104, "ce_loss_6": 4.073746514320374, "epoch": 0.046, "grad_norm": 1528.0, "kl_loss_12": 1115.3556823730469, "kl_loss_17": 379.5940216064453, "kl_loss_3": 3471.3382080078127, "kl_loss_6": 2500.7124267578124, "learning_rate": 0.0009967408676742752, "loss": 1833.6008, "step": 460 }, { "ce_loss_12": 3.53208132982254, "ce_loss_17": 3.1919715881347654, "ce_loss_23": 3.0071330189704897, "ce_loss_3": 4.685499620437622, "ce_loss_6": 4.167628622055053, "epoch": 0.047, "grad_norm": 1048.0, "kl_loss_12": 1116.2639068603517, "kl_loss_17": 399.31142272949216, "kl_loss_3": 3404.2772705078123, "kl_loss_6": 2421.417102050781, "learning_rate": 0.0009965575009429006, "loss": 1885.0047, "step": 470 }, { "ce_loss_12": 3.331265389919281, "ce_loss_17": 3.0240687012672423, "ce_loss_23": 2.79015998840332, "ce_loss_3": 4.576504421234131, "ce_loss_6": 4.021812427043915, "epoch": 0.048, "grad_norm": 1152.0, "kl_loss_12": 1125.945867919922, "kl_loss_17": 471.67090759277346, "kl_loss_3": 3577.5029541015624, "kl_loss_6": 2510.0554321289064, "learning_rate": 0.0009963691338830043, "loss": 1901.8957, "step": 480 }, { "ce_loss_12": 3.4071529150009154, "ce_loss_17": 3.0926716566085815, "ce_loss_23": 2.8846765518188477, "ce_loss_3": 4.632588601112365, "ce_loss_6": 4.073854601383209, "epoch": 0.049, "grad_norm": 1088.0, "kl_loss_12": 1115.415625, "kl_loss_17": 448.1295852661133, "kl_loss_3": 3538.219140625, "kl_loss_6": 2462.3754272460938, "learning_rate": 0.0009961757683914405, "loss": 1878.659, "step": 490 }, { "ce_loss_12": 3.4084617495536804, "ce_loss_17": 3.082959520816803, "ce_loss_23": 2.8706990122795104, "ce_loss_3": 4.597338938713074, "ce_loss_6": 4.037699508666992, "epoch": 0.05, "grad_norm": 1336.0, "kl_loss_12": 1137.8160369873046, "kl_loss_17": 443.1447296142578, "kl_loss_3": 3480.0999145507812, "kl_loss_6": 2428.29423828125, "learning_rate": 0.0009959774064153978, "loss": 1888.4242, "step": 500 }, { "ce_loss_12": 3.381693124771118, "ce_loss_17": 3.0774909257888794, "ce_loss_23": 2.8901365995407104, "ce_loss_3": 4.549492239952087, "ce_loss_6": 4.027258086204529, "epoch": 0.051, "grad_norm": 1264.0, "kl_loss_12": 1058.0331939697267, "kl_loss_17": 389.85987243652346, "kl_loss_3": 3354.3579956054687, "kl_loss_6": 2342.414410400391, "learning_rate": 0.0009957740499523787, "loss": 1824.4158, "step": 510 }, { "ce_loss_12": 3.4225743889808653, "ce_loss_17": 3.0770078897476196, "ce_loss_23": 2.898508107662201, "ce_loss_3": 4.558957409858704, "ce_loss_6": 4.046568238735199, "epoch": 0.052, "grad_norm": 1056.0, "kl_loss_12": 1102.2143676757812, "kl_loss_17": 373.25570526123045, "kl_loss_3": 3354.677380371094, "kl_loss_6": 2388.7016357421876, "learning_rate": 0.0009955657010501807, "loss": 1812.9781, "step": 520 }, { "ce_loss_12": 3.418683922290802, "ce_loss_17": 3.033629858493805, "ce_loss_23": 2.855076479911804, "ce_loss_3": 4.570754933357239, "ce_loss_6": 4.044934415817261, "epoch": 0.053, "grad_norm": 1248.0, "kl_loss_12": 1205.6635467529297, "kl_loss_17": 381.54945373535156, "kl_loss_3": 3456.4181884765626, "kl_loss_6": 2436.295849609375, "learning_rate": 0.000995352361806875, "loss": 1855.0133, "step": 530 }, { "ce_loss_12": 3.487868809700012, "ce_loss_17": 3.09113187789917, "ce_loss_23": 2.896335208415985, "ce_loss_3": 4.577032613754272, "ce_loss_6": 4.054221868515015, "epoch": 0.054, "grad_norm": 1152.0, "kl_loss_12": 1227.9361145019532, "kl_loss_17": 399.3049880981445, "kl_loss_3": 3395.7439453125, "kl_loss_6": 2399.8954833984376, "learning_rate": 0.0009951340343707852, "loss": 1867.9363, "step": 540 }, { "ce_loss_12": 3.473950946331024, "ce_loss_17": 3.1219199657440186, "ce_loss_23": 2.942656922340393, "ce_loss_3": 4.631032228469849, "ce_loss_6": 4.126621842384338, "epoch": 0.055, "grad_norm": 984.0, "kl_loss_12": 1125.1899505615233, "kl_loss_17": 393.1251739501953, "kl_loss_3": 3395.15849609375, "kl_loss_6": 2425.3002075195313, "learning_rate": 0.0009949107209404665, "loss": 1852.8969, "step": 550 }, { "ce_loss_12": 3.3833325028419496, "ce_loss_17": 3.039779543876648, "ce_loss_23": 2.8641862154006956, "ce_loss_3": 4.5303771734237674, "ce_loss_6": 4.015872514247894, "epoch": 0.056, "grad_norm": 1064.0, "kl_loss_12": 1090.2752197265625, "kl_loss_17": 361.8307144165039, "kl_loss_3": 3359.927282714844, "kl_loss_6": 2393.180126953125, "learning_rate": 0.0009946824237646824, "loss": 1807.0324, "step": 560 }, { "ce_loss_12": 3.3463765382766724, "ce_loss_17": 2.9906514286994934, "ce_loss_23": 2.82177197933197, "ce_loss_3": 4.515055251121521, "ce_loss_6": 3.9852001786231996, "epoch": 0.057, "grad_norm": 1328.0, "kl_loss_12": 1129.2081176757813, "kl_loss_17": 368.39979248046876, "kl_loss_3": 3436.320520019531, "kl_loss_6": 2419.8519775390623, "learning_rate": 0.0009944491451423828, "loss": 1861.3609, "step": 570 }, { "ce_loss_12": 3.3455269932746887, "ce_loss_17": 3.0017722129821776, "ce_loss_23": 2.8181710720062254, "ce_loss_3": 4.555279207229614, "ce_loss_6": 4.013754498958588, "epoch": 0.058, "grad_norm": 1048.0, "kl_loss_12": 1123.0898864746093, "kl_loss_17": 386.96007080078124, "kl_loss_3": 3515.637939453125, "kl_loss_6": 2467.850341796875, "learning_rate": 0.0009942108874226813, "loss": 1835.1637, "step": 580 }, { "ce_loss_12": 3.4131882905960085, "ce_loss_17": 3.0872496962547302, "ce_loss_23": 2.9123022317886353, "ce_loss_3": 4.562892317771912, "ce_loss_6": 4.037652707099914, "epoch": 0.059, "grad_norm": 1312.0, "kl_loss_12": 1063.407748413086, "kl_loss_17": 377.90936889648435, "kl_loss_3": 3328.068469238281, "kl_loss_6": 2309.729833984375, "learning_rate": 0.00099396765300483, "loss": 1766.2221, "step": 590 }, { "ce_loss_12": 3.4159348487854, "ce_loss_17": 3.0849705934524536, "ce_loss_23": 2.896781253814697, "ce_loss_3": 4.561087894439697, "ce_loss_6": 4.038153064250946, "epoch": 0.06, "grad_norm": 1776.0, "kl_loss_12": 1077.741293334961, "kl_loss_17": 390.4696075439453, "kl_loss_3": 3359.8049560546874, "kl_loss_6": 2370.4330078125, "learning_rate": 0.0009937194443381972, "loss": 1799.4371, "step": 600 }, { "ce_loss_12": 3.4187211632728576, "ce_loss_17": 3.101148545742035, "ce_loss_23": 2.930239200592041, "ce_loss_3": 4.549526739120483, "ce_loss_6": 4.049935567378998, "epoch": 0.061, "grad_norm": 1112.0, "kl_loss_12": 1040.6774597167969, "kl_loss_17": 370.95189514160154, "kl_loss_3": 3283.81728515625, "kl_loss_6": 2311.2066345214844, "learning_rate": 0.0009934662639222412, "loss": 1785.4037, "step": 610 }, { "ce_loss_12": 3.3739498376846315, "ce_loss_17": 3.073314607143402, "ce_loss_23": 2.8866294622421265, "ce_loss_3": 4.554197382926941, "ce_loss_6": 4.047503459453583, "epoch": 0.062, "grad_norm": 1792.0, "kl_loss_12": 1042.084783935547, "kl_loss_17": 388.0509094238281, "kl_loss_3": 3388.6704345703124, "kl_loss_6": 2418.96044921875, "learning_rate": 0.000993208114306486, "loss": 1793.3209, "step": 620 }, { "ce_loss_12": 3.2972922563552856, "ce_loss_17": 3.011891508102417, "ce_loss_23": 2.8142054677009583, "ce_loss_3": 4.5025928020477295, "ce_loss_6": 3.984578573703766, "epoch": 0.063, "grad_norm": 1504.0, "kl_loss_12": 1031.5344207763671, "kl_loss_17": 420.5843566894531, "kl_loss_3": 3421.248095703125, "kl_loss_6": 2428.8518798828127, "learning_rate": 0.0009929449980904952, "loss": 1779.1174, "step": 630 }, { "ce_loss_12": 3.3357118010520934, "ce_loss_17": 3.0490943431854247, "ce_loss_23": 2.877616310119629, "ce_loss_3": 4.507691192626953, "ce_loss_6": 4.000881290435791, "epoch": 0.064, "grad_norm": 1424.0, "kl_loss_12": 998.4864044189453, "kl_loss_17": 375.07586975097655, "kl_loss_3": 3321.5576049804686, "kl_loss_6": 2344.2793823242187, "learning_rate": 0.0009926769179238466, "loss": 1743.0203, "step": 640 }, { "ce_loss_12": 3.3966179728507995, "ce_loss_17": 3.089047574996948, "ce_loss_23": 2.906193125247955, "ce_loss_3": 4.535722994804383, "ce_loss_6": 4.043430185317993, "epoch": 0.065, "grad_norm": 1088.0, "kl_loss_12": 1048.7820037841798, "kl_loss_17": 387.50828552246094, "kl_loss_3": 3332.1998657226563, "kl_loss_6": 2368.378546142578, "learning_rate": 0.000992403876506104, "loss": 1766.1207, "step": 650 }, { "ce_loss_12": 3.3129051446914675, "ce_loss_17": 3.0166344165802004, "ce_loss_23": 2.8484310030937197, "ce_loss_3": 4.497570371627807, "ce_loss_6": 3.9834340095520018, "epoch": 0.066, "grad_norm": 1392.0, "kl_loss_12": 998.1759429931641, "kl_loss_17": 359.26244659423827, "kl_loss_3": 3335.43896484375, "kl_loss_6": 2332.2134399414062, "learning_rate": 0.0009921258765867918, "loss": 1757.334, "step": 660 }, { "ce_loss_12": 3.2913843750953675, "ce_loss_17": 2.983316922187805, "ce_loss_23": 2.825131380558014, "ce_loss_3": 4.487063193321228, "ce_loss_6": 3.963192939758301, "epoch": 0.067, "grad_norm": 1656.0, "kl_loss_12": 993.4193237304687, "kl_loss_17": 340.4703704833984, "kl_loss_3": 3401.8429321289063, "kl_loss_6": 2374.357080078125, "learning_rate": 0.0009918429209653662, "loss": 1749.6797, "step": 670 }, { "ce_loss_12": 3.3256665229797364, "ce_loss_17": 3.0258800268173216, "ce_loss_23": 2.871187961101532, "ce_loss_3": 4.512535119056702, "ce_loss_6": 3.982617509365082, "epoch": 0.068, "grad_norm": 1040.0, "kl_loss_12": 985.1989440917969, "kl_loss_17": 332.79962768554685, "kl_loss_3": 3343.0188232421874, "kl_loss_6": 2328.2467041015625, "learning_rate": 0.0009915550124911866, "loss": 1706.5441, "step": 680 }, { "ce_loss_12": 3.3320120930671693, "ce_loss_17": 3.0299292683601378, "ce_loss_23": 2.8721617698669433, "ce_loss_3": 4.484584331512451, "ce_loss_6": 3.9666941165924072, "epoch": 0.069, "grad_norm": 1008.0, "kl_loss_12": 984.1631896972656, "kl_loss_17": 330.89317321777344, "kl_loss_3": 3254.253063964844, "kl_loss_6": 2269.092498779297, "learning_rate": 0.0009912621540634887, "loss": 1711.3982, "step": 690 }, { "ce_loss_12": 3.3575613617897035, "ce_loss_17": 3.060102331638336, "ce_loss_23": 2.914083182811737, "ce_loss_3": 4.487043619155884, "ce_loss_6": 3.976016843318939, "epoch": 0.07, "grad_norm": 892.0, "kl_loss_12": 962.9395385742188, "kl_loss_17": 315.06245880126954, "kl_loss_3": 3207.0468017578123, "kl_loss_6": 2221.482141113281, "learning_rate": 0.0009909643486313534, "loss": 1695.9398, "step": 700 }, { "ce_loss_12": 3.2921235084533693, "ce_loss_17": 2.9624681949615477, "ce_loss_23": 2.8101747274398803, "ce_loss_3": 4.465417528152466, "ce_loss_6": 3.9272570967674256, "epoch": 0.071, "grad_norm": 1088.0, "kl_loss_12": 1008.410922241211, "kl_loss_17": 328.5423217773438, "kl_loss_3": 3353.696728515625, "kl_loss_6": 2306.0830810546877, "learning_rate": 0.000990661599193678, "loss": 1778.1223, "step": 710 }, { "ce_loss_12": 3.376284325122833, "ce_loss_17": 3.074607563018799, "ce_loss_23": 2.9188767075538635, "ce_loss_3": 4.520675468444824, "ce_loss_6": 4.008211147785187, "epoch": 0.072, "grad_norm": 1160.0, "kl_loss_12": 992.3700897216797, "kl_loss_17": 324.57889251708986, "kl_loss_3": 3247.5807495117188, "kl_loss_6": 2259.2288513183594, "learning_rate": 0.0009903539087991462, "loss": 1706.6916, "step": 720 }, { "ce_loss_12": 3.3545514583587646, "ce_loss_17": 3.051961660385132, "ce_loss_23": 2.9055708765983583, "ce_loss_3": 4.496706533432007, "ce_loss_6": 3.9772397875785828, "epoch": 0.073, "grad_norm": 980.0, "kl_loss_12": 994.7562225341796, "kl_loss_17": 325.16717834472655, "kl_loss_3": 3252.458361816406, "kl_loss_6": 2251.217822265625, "learning_rate": 0.0009900412805461966, "loss": 1713.7863, "step": 730 }, { "ce_loss_12": 3.437267518043518, "ce_loss_17": 3.1153113722801207, "ce_loss_23": 2.967765748500824, "ce_loss_3": 4.51895227432251, "ce_loss_6": 4.015814542770386, "epoch": 0.074, "grad_norm": 988.0, "kl_loss_12": 1007.8217681884765, "kl_loss_17": 318.0323486328125, "kl_loss_3": 3190.121484375, "kl_loss_6": 2194.7572387695313, "learning_rate": 0.0009897237175829927, "loss": 1704.4227, "step": 740 }, { "ce_loss_12": 3.349493718147278, "ce_loss_17": 3.016627752780914, "ce_loss_23": 2.858274281024933, "ce_loss_3": 4.469674086570739, "ce_loss_6": 3.947020959854126, "epoch": 0.075, "grad_norm": 992.0, "kl_loss_12": 1053.4417694091796, "kl_loss_17": 327.71359100341795, "kl_loss_3": 3290.5677490234375, "kl_loss_6": 2277.1982666015624, "learning_rate": 0.0009894012231073895, "loss": 1720.0527, "step": 750 }, { "ce_loss_12": 3.381664311885834, "ce_loss_17": 3.0567665100097656, "ce_loss_23": 2.9069377303123476, "ce_loss_3": 4.487093877792359, "ce_loss_6": 3.981345546245575, "epoch": 0.076, "grad_norm": 1312.0, "kl_loss_12": 1018.7256530761719, "kl_loss_17": 327.3124313354492, "kl_loss_3": 3220.332568359375, "kl_loss_6": 2239.033062744141, "learning_rate": 0.0009890738003669028, "loss": 1719.7855, "step": 760 }, { "ce_loss_12": 3.3535516023635865, "ce_loss_17": 3.0329953789711, "ce_loss_23": 2.875113677978516, "ce_loss_3": 4.51433572769165, "ce_loss_6": 3.9803815722465514, "epoch": 0.077, "grad_norm": 1216.0, "kl_loss_12": 1021.7845306396484, "kl_loss_17": 333.3127502441406, "kl_loss_3": 3351.674694824219, "kl_loss_6": 2306.272119140625, "learning_rate": 0.0009887414526586764, "loss": 1712.3004, "step": 770 }, { "ce_loss_12": 3.3716891407966614, "ce_loss_17": 3.0681586623191834, "ce_loss_23": 2.9219251036643983, "ce_loss_3": 4.514308929443359, "ce_loss_6": 3.989739179611206, "epoch": 0.078, "grad_norm": 1056.0, "kl_loss_12": 963.1607696533204, "kl_loss_17": 313.2260986328125, "kl_loss_3": 3232.1091186523436, "kl_loss_6": 2218.2968627929686, "learning_rate": 0.0009884041833294476, "loss": 1653.7039, "step": 780 }, { "ce_loss_12": 3.372546339035034, "ce_loss_17": 3.0722476720809935, "ce_loss_23": 2.930769956111908, "ce_loss_3": 4.489286208152771, "ce_loss_6": 3.9699451208114622, "epoch": 0.079, "grad_norm": 1000.0, "kl_loss_12": 954.4112121582032, "kl_loss_17": 307.53686676025393, "kl_loss_3": 3183.226110839844, "kl_loss_6": 2160.666619873047, "learning_rate": 0.000988061995775515, "loss": 1698.0086, "step": 790 }, { "ce_loss_12": 3.3053468823432923, "ce_loss_17": 3.0160653710365297, "ce_loss_23": 2.8685967564582824, "ce_loss_3": 4.424778723716736, "ce_loss_6": 3.9103200674057006, "epoch": 0.08, "grad_norm": 1048.0, "kl_loss_12": 953.6102935791016, "kl_loss_17": 322.9908157348633, "kl_loss_3": 3191.081994628906, "kl_loss_6": 2195.573114013672, "learning_rate": 0.0009877148934427035, "loss": 1659.377, "step": 800 }, { "ce_loss_12": 3.336946439743042, "ce_loss_17": 3.047144114971161, "ce_loss_23": 2.903570628166199, "ce_loss_3": 4.465107464790345, "ce_loss_6": 3.957136833667755, "epoch": 0.081, "grad_norm": 984.0, "kl_loss_12": 940.542431640625, "kl_loss_17": 314.41667022705076, "kl_loss_3": 3199.7871215820314, "kl_loss_6": 2209.312353515625, "learning_rate": 0.0009873628798263297, "loss": 1646.1479, "step": 810 }, { "ce_loss_12": 3.2978206872940063, "ce_loss_17": 3.0110609769821166, "ce_loss_23": 2.866278886795044, "ce_loss_3": 4.406501245498657, "ce_loss_6": 3.889325964450836, "epoch": 0.082, "grad_norm": 1000.0, "kl_loss_12": 928.1305328369141, "kl_loss_17": 310.1401123046875, "kl_loss_3": 3130.672497558594, "kl_loss_6": 2145.510949707031, "learning_rate": 0.0009870059584711668, "loss": 1672.1705, "step": 820 }, { "ce_loss_12": 3.3061461210250855, "ce_loss_17": 3.0198216795921327, "ce_loss_23": 2.8694707274436952, "ce_loss_3": 4.416663718223572, "ce_loss_6": 3.9168904542922975, "epoch": 0.083, "grad_norm": 908.0, "kl_loss_12": 935.2707489013671, "kl_loss_17": 329.5289474487305, "kl_loss_3": 3150.615368652344, "kl_loss_6": 2179.1330505371093, "learning_rate": 0.000986644132971409, "loss": 1649.4949, "step": 830 }, { "ce_loss_12": 3.3155024528503416, "ce_loss_17": 3.0179206132888794, "ce_loss_23": 2.858084261417389, "ce_loss_3": 4.44396892786026, "ce_loss_6": 3.9450360417366026, "epoch": 0.084, "grad_norm": 1040.0, "kl_loss_12": 965.4269836425781, "kl_loss_17": 333.64192352294924, "kl_loss_3": 3197.687365722656, "kl_loss_6": 2229.4203979492186, "learning_rate": 0.0009862774069706345, "loss": 1665.3748, "step": 840 }, { "ce_loss_12": 3.418204295635223, "ce_loss_17": 3.1322174668312073, "ce_loss_23": 2.986786162853241, "ce_loss_3": 4.4888609647750854, "ce_loss_6": 3.9892574667930605, "epoch": 0.085, "grad_norm": 992.0, "kl_loss_12": 946.3281097412109, "kl_loss_17": 313.324609375, "kl_loss_3": 3086.1066284179688, "kl_loss_6": 2126.8193298339843, "learning_rate": 0.000985905784161771, "loss": 1636.6967, "step": 850 }, { "ce_loss_12": 3.345979356765747, "ce_loss_17": 3.052992057800293, "ce_loss_23": 2.9130621552467346, "ce_loss_3": 4.428451132774353, "ce_loss_6": 3.9349741697311402, "epoch": 0.086, "grad_norm": 956.0, "kl_loss_12": 954.7295227050781, "kl_loss_17": 311.8019790649414, "kl_loss_3": 3119.7036987304687, "kl_loss_6": 2163.293731689453, "learning_rate": 0.000985529268287055, "loss": 1630.4893, "step": 860 }, { "ce_loss_12": 3.2938442468643188, "ce_loss_17": 2.9937807083129884, "ce_loss_23": 2.849967968463898, "ce_loss_3": 4.411960792541504, "ce_loss_6": 3.896759867668152, "epoch": 0.087, "grad_norm": 1112.0, "kl_loss_12": 960.398648071289, "kl_loss_17": 313.8326171875, "kl_loss_3": 3182.4353515625, "kl_loss_6": 2195.82099609375, "learning_rate": 0.0009851478631379982, "loss": 1667.1393, "step": 870 }, { "ce_loss_12": 3.352380836009979, "ce_loss_17": 3.0529614686965942, "ce_loss_23": 2.9062195897102354, "ce_loss_3": 4.443659949302673, "ce_loss_6": 3.934219491481781, "epoch": 0.088, "grad_norm": 1024.0, "kl_loss_12": 953.0646362304688, "kl_loss_17": 311.8681335449219, "kl_loss_3": 3139.9689819335936, "kl_loss_6": 2151.7400024414064, "learning_rate": 0.0009847615725553456, "loss": 1641.743, "step": 880 }, { "ce_loss_12": 3.369493007659912, "ce_loss_17": 3.087172031402588, "ce_loss_23": 2.955445909500122, "ce_loss_3": 4.406086874008179, "ce_loss_6": 3.927104687690735, "epoch": 0.089, "grad_norm": 1112.0, "kl_loss_12": 909.7544189453125, "kl_loss_17": 285.7761688232422, "kl_loss_3": 2972.541784667969, "kl_loss_6": 2051.965185546875, "learning_rate": 0.0009843704004290394, "loss": 1613.0483, "step": 890 }, { "ce_loss_12": 3.3007118344306945, "ce_loss_17": 3.0091582536697388, "ce_loss_23": 2.865347218513489, "ce_loss_3": 4.389851522445679, "ce_loss_6": 3.887819600105286, "epoch": 0.09, "grad_norm": 1184.0, "kl_loss_12": 942.951953125, "kl_loss_17": 302.5071563720703, "kl_loss_3": 3139.5479125976562, "kl_loss_6": 2154.337860107422, "learning_rate": 0.0009839743506981783, "loss": 1628.214, "step": 900 }, { "ce_loss_12": 3.244469094276428, "ce_loss_17": 2.9393270134925844, "ce_loss_23": 2.793620991706848, "ce_loss_3": 4.385366058349609, "ce_loss_6": 3.870071280002594, "epoch": 0.091, "grad_norm": 972.0, "kl_loss_12": 964.6631439208984, "kl_loss_17": 310.6958312988281, "kl_loss_3": 3252.1250854492187, "kl_loss_6": 2248.5163696289064, "learning_rate": 0.0009835734273509786, "loss": 1661.4332, "step": 910 }, { "ce_loss_12": 3.3276159048080443, "ce_loss_17": 3.0265863656997682, "ce_loss_23": 2.879580080509186, "ce_loss_3": 4.428731679916382, "ce_loss_6": 3.912135696411133, "epoch": 0.092, "grad_norm": 972.0, "kl_loss_12": 934.3377227783203, "kl_loss_17": 306.7847213745117, "kl_loss_3": 3130.859033203125, "kl_loss_6": 2128.042761230469, "learning_rate": 0.0009831676344247342, "loss": 1625.7849, "step": 920 }, { "ce_loss_12": 3.325820744037628, "ce_loss_17": 3.038645386695862, "ce_loss_23": 2.905313861370087, "ce_loss_3": 4.380109333992005, "ce_loss_6": 3.8928969264030457, "epoch": 0.093, "grad_norm": 880.0, "kl_loss_12": 920.8738159179687, "kl_loss_17": 293.66180419921875, "kl_loss_3": 3038.493615722656, "kl_loss_6": 2091.2300720214844, "learning_rate": 0.0009827569760057755, "loss": 1614.7129, "step": 930 }, { "ce_loss_12": 3.283223259449005, "ce_loss_17": 2.974070966243744, "ce_loss_23": 2.8231024861335756, "ce_loss_3": 4.420139646530151, "ce_loss_6": 3.896502125263214, "epoch": 0.094, "grad_norm": 1048.0, "kl_loss_12": 967.0422973632812, "kl_loss_17": 315.1331161499023, "kl_loss_3": 3246.8406616210937, "kl_loss_6": 2233.370959472656, "learning_rate": 0.000982341456229428, "loss": 1639.7586, "step": 940 }, { "ce_loss_12": 3.3469450950622557, "ce_loss_17": 3.0623409390449523, "ce_loss_23": 2.9173637866973876, "ce_loss_3": 4.45803382396698, "ce_loss_6": 3.952650713920593, "epoch": 0.095, "grad_norm": 1112.0, "kl_loss_12": 949.4091888427735, "kl_loss_17": 315.292790222168, "kl_loss_3": 3171.674108886719, "kl_loss_6": 2186.0895812988283, "learning_rate": 0.000981921079279971, "loss": 1616.1184, "step": 950 }, { "ce_loss_12": 3.320824933052063, "ce_loss_17": 3.0578987121582033, "ce_loss_23": 2.9246861934661865, "ce_loss_3": 4.367957544326782, "ce_loss_6": 3.8789355397224425, "epoch": 0.096, "grad_norm": 1264.0, "kl_loss_12": 884.3493804931641, "kl_loss_17": 306.3426567077637, "kl_loss_3": 3007.1193725585936, "kl_loss_6": 2051.980364990234, "learning_rate": 0.0009814958493905962, "loss": 1580.1495, "step": 960 }, { "ce_loss_12": 3.303164303302765, "ce_loss_17": 3.0266597151756285, "ce_loss_23": 2.884927237033844, "ce_loss_3": 4.41832115650177, "ce_loss_6": 3.909832787513733, "epoch": 0.097, "grad_norm": 1024.0, "kl_loss_12": 904.8543029785156, "kl_loss_17": 307.96812438964844, "kl_loss_3": 3112.451696777344, "kl_loss_6": 2136.4014526367187, "learning_rate": 0.0009810657708433637, "loss": 1641.5656, "step": 970 }, { "ce_loss_12": 3.357134532928467, "ce_loss_17": 3.109633040428162, "ce_loss_23": 2.957169032096863, "ce_loss_3": 4.392407011985779, "ce_loss_6": 3.9122292041778564, "epoch": 0.098, "grad_norm": 1112.0, "kl_loss_12": 875.98427734375, "kl_loss_17": 322.0378616333008, "kl_loss_3": 2949.5711303710937, "kl_loss_6": 2031.9008850097657, "learning_rate": 0.0009806308479691594, "loss": 1564.0409, "step": 980 }, { "ce_loss_12": 3.391128623485565, "ce_loss_17": 3.1226599335670473, "ce_loss_23": 2.95620356798172, "ce_loss_3": 4.4521146535873415, "ce_loss_6": 3.9637447357177735, "epoch": 0.099, "grad_norm": 1048.0, "kl_loss_12": 929.8868896484375, "kl_loss_17": 364.4446853637695, "kl_loss_3": 3061.3019775390626, "kl_loss_6": 2113.771942138672, "learning_rate": 0.0009801910851476522, "loss": 1610.7443, "step": 990 }, { "ce_loss_12": 3.3127102971076967, "ce_loss_17": 3.0483265042304994, "ce_loss_23": 2.8895352125167846, "ce_loss_3": 4.4247009515762326, "ce_loss_6": 3.9209498167037964, "epoch": 0.1, "grad_norm": 956.0, "kl_loss_12": 926.8501525878906, "kl_loss_17": 344.10108642578126, "kl_loss_3": 3181.64150390625, "kl_loss_6": 2189.818463134766, "learning_rate": 0.0009797464868072487, "loss": 1626.2685, "step": 1000 }, { "ce_loss_12": 3.2965635657310486, "ce_loss_17": 3.0399394392967225, "ce_loss_23": 2.8819270968437194, "ce_loss_3": 4.386479806900025, "ce_loss_6": 3.8870999932289125, "epoch": 0.101, "grad_norm": 1248.0, "kl_loss_12": 923.0501098632812, "kl_loss_17": 338.95404663085935, "kl_loss_3": 3102.4775268554686, "kl_loss_6": 2139.943634033203, "learning_rate": 0.0009792970574250492, "loss": 1619.6861, "step": 1010 }, { "ce_loss_12": 3.3044546008110047, "ce_loss_17": 3.0502540946006773, "ce_loss_23": 2.8980908036231994, "ce_loss_3": 4.38866947889328, "ce_loss_6": 3.888150417804718, "epoch": 0.102, "grad_norm": 1208.0, "kl_loss_12": 893.213345336914, "kl_loss_17": 324.3102798461914, "kl_loss_3": 3070.095068359375, "kl_loss_6": 2099.203436279297, "learning_rate": 0.0009788428015268028, "loss": 1573.6766, "step": 1020 }, { "ce_loss_12": 3.2988733649253845, "ce_loss_17": 3.0434825658798217, "ce_loss_23": 2.902178335189819, "ce_loss_3": 4.363534331321716, "ce_loss_6": 3.8749521732330323, "epoch": 0.103, "grad_norm": 1184.0, "kl_loss_12": 876.8101196289062, "kl_loss_17": 305.72301330566404, "kl_loss_3": 3020.4602905273437, "kl_loss_6": 2069.8065979003904, "learning_rate": 0.0009783837236868609, "loss": 1566.3812, "step": 1030 }, { "ce_loss_12": 3.2709617257118224, "ce_loss_17": 3.008937418460846, "ce_loss_23": 2.8611239790916443, "ce_loss_3": 4.3386149406433105, "ce_loss_6": 3.858043742179871, "epoch": 0.104, "grad_norm": 1224.0, "kl_loss_12": 890.7757629394531, "kl_loss_17": 319.31165924072263, "kl_loss_3": 3030.126110839844, "kl_loss_6": 2079.913439941406, "learning_rate": 0.0009779198285281327, "loss": 1570.5824, "step": 1040 }, { "ce_loss_12": 3.2658578515052796, "ce_loss_17": 3.006276023387909, "ce_loss_23": 2.8662564635276793, "ce_loss_3": 4.363893914222717, "ce_loss_6": 3.867250108718872, "epoch": 0.105, "grad_norm": 1168.0, "kl_loss_12": 873.5543426513672, "kl_loss_17": 301.8095642089844, "kl_loss_3": 3073.273645019531, "kl_loss_6": 2108.3940185546876, "learning_rate": 0.0009774511207220368, "loss": 1584.198, "step": 1050 }, { "ce_loss_12": 3.316801738739014, "ce_loss_17": 3.045045328140259, "ce_loss_23": 2.907749652862549, "ce_loss_3": 4.407438325881958, "ce_loss_6": 3.9095324158668516, "epoch": 0.106, "grad_norm": 864.0, "kl_loss_12": 892.4073059082032, "kl_loss_17": 299.16715240478516, "kl_loss_3": 3074.142614746094, "kl_loss_6": 2108.910284423828, "learning_rate": 0.0009769776049884564, "loss": 1581.9543, "step": 1060 }, { "ce_loss_12": 3.24352912902832, "ce_loss_17": 2.9564419984817505, "ce_loss_23": 2.8230312943458555, "ce_loss_3": 4.339093565940857, "ce_loss_6": 3.8406063437461855, "epoch": 0.107, "grad_norm": 1688.0, "kl_loss_12": 919.3368377685547, "kl_loss_17": 291.7588394165039, "kl_loss_3": 3120.2006225585938, "kl_loss_6": 2151.6788024902344, "learning_rate": 0.0009764992860956889, "loss": 1627.9674, "step": 1070 }, { "ce_loss_12": 3.3712757229804993, "ce_loss_17": 3.0845303654670717, "ce_loss_23": 2.9652393460273743, "ce_loss_3": 4.350837230682373, "ce_loss_6": 3.8884527921676635, "epoch": 0.108, "grad_norm": 1152.0, "kl_loss_12": 932.8914611816406, "kl_loss_17": 269.56608963012695, "kl_loss_3": 2872.27275390625, "kl_loss_6": 1973.182550048828, "learning_rate": 0.0009760161688604008, "loss": 1542.0248, "step": 1080 }, { "ce_loss_12": 3.4055436849594116, "ce_loss_17": 3.088821458816528, "ce_loss_23": 2.960082447528839, "ce_loss_3": 4.399594736099243, "ce_loss_6": 3.9329787254333497, "epoch": 0.109, "grad_norm": 1200.0, "kl_loss_12": 965.5884552001953, "kl_loss_17": 278.2935195922852, "kl_loss_3": 2961.8168212890623, "kl_loss_6": 2049.4507080078124, "learning_rate": 0.0009755282581475768, "loss": 1577.8215, "step": 1090 }, { "ce_loss_12": 3.453993630409241, "ce_loss_17": 3.1275503635406494, "ce_loss_23": 2.9921347975730894, "ce_loss_3": 4.4342069864273075, "ce_loss_6": 3.9476578712463377, "epoch": 0.11, "grad_norm": 960.0, "kl_loss_12": 991.3485321044922, "kl_loss_17": 288.25440673828126, "kl_loss_3": 2963.675, "kl_loss_6": 2015.0789428710937, "learning_rate": 0.0009750355588704727, "loss": 1552.8531, "step": 1100 }, { "ce_loss_12": 3.297532868385315, "ce_loss_17": 2.987328219413757, "ce_loss_23": 2.858099091053009, "ce_loss_3": 4.321656811237335, "ce_loss_6": 3.83161541223526, "epoch": 0.111, "grad_norm": 1040.0, "kl_loss_12": 948.488314819336, "kl_loss_17": 280.5028388977051, "kl_loss_3": 3007.969384765625, "kl_loss_6": 2057.8111572265625, "learning_rate": 0.0009745380759905647, "loss": 1595.1603, "step": 1110 }, { "ce_loss_12": 3.2402580618858337, "ce_loss_17": 2.9434482932090758, "ce_loss_23": 2.816891813278198, "ce_loss_3": 4.288587641716004, "ce_loss_6": 3.8150289416313172, "epoch": 0.112, "grad_norm": 1144.0, "kl_loss_12": 923.8688537597657, "kl_loss_17": 279.68654861450193, "kl_loss_3": 3030.0865844726563, "kl_loss_6": 2093.9303771972654, "learning_rate": 0.0009740358145174998, "loss": 1661.5004, "step": 1120 }, { "ce_loss_12": 3.3753363370895384, "ce_loss_17": 3.0815247654914857, "ce_loss_23": 2.9522975087165833, "ce_loss_3": 4.34820659160614, "ce_loss_6": 3.8800788521766663, "epoch": 0.113, "grad_norm": 1280.0, "kl_loss_12": 927.2453460693359, "kl_loss_17": 288.8904830932617, "kl_loss_3": 2915.881066894531, "kl_loss_6": 2001.3101745605468, "learning_rate": 0.0009735287795090455, "loss": 1554.5693, "step": 1130 }, { "ce_loss_12": 3.277189326286316, "ce_loss_17": 2.984852302074432, "ce_loss_23": 2.8566457867622375, "ce_loss_3": 4.314880430698395, "ce_loss_6": 3.826177978515625, "epoch": 0.114, "grad_norm": 1088.0, "kl_loss_12": 923.1986511230468, "kl_loss_17": 283.3469985961914, "kl_loss_3": 2998.0419677734376, "kl_loss_6": 2062.0610595703124, "learning_rate": 0.0009730169760710386, "loss": 1559.3103, "step": 1140 }, { "ce_loss_12": 3.3268452405929567, "ce_loss_17": 3.044426202774048, "ce_loss_23": 2.918807125091553, "ce_loss_3": 4.364259016513825, "ce_loss_6": 3.880498945713043, "epoch": 0.115, "grad_norm": 1552.0, "kl_loss_12": 890.0449462890625, "kl_loss_17": 276.7792999267578, "kl_loss_3": 2952.4790893554687, "kl_loss_6": 2009.8786682128907, "learning_rate": 0.0009725004093573342, "loss": 1554.0646, "step": 1150 }, { "ce_loss_12": 3.2966678977012633, "ce_loss_17": 3.009952485561371, "ce_loss_23": 2.878467881679535, "ce_loss_3": 4.3358853459358215, "ce_loss_6": 3.8439332962036135, "epoch": 0.116, "grad_norm": 872.0, "kl_loss_12": 886.3811218261719, "kl_loss_17": 278.020223236084, "kl_loss_3": 2973.4177978515627, "kl_loss_6": 2014.2760498046875, "learning_rate": 0.0009719790845697534, "loss": 1527.7352, "step": 1160 }, { "ce_loss_12": 3.2209246873855593, "ce_loss_17": 2.964114212989807, "ce_loss_23": 2.8400999188423155, "ce_loss_3": 4.245721232891083, "ce_loss_6": 3.7645427107810976, "epoch": 0.117, "grad_norm": 1304.0, "kl_loss_12": 834.8282470703125, "kl_loss_17": 265.4806785583496, "kl_loss_3": 2893.5482788085938, "kl_loss_6": 1949.34755859375, "learning_rate": 0.0009714530069580309, "loss": 1496.4959, "step": 1170 }, { "ce_loss_12": 3.3160659790039064, "ce_loss_17": 3.0417101860046385, "ce_loss_23": 2.9130223631858825, "ce_loss_3": 4.369610404968261, "ce_loss_6": 3.8820913434028625, "epoch": 0.118, "grad_norm": 1328.0, "kl_loss_12": 882.3399963378906, "kl_loss_17": 283.6298973083496, "kl_loss_3": 2986.9559936523438, "kl_loss_6": 2038.928778076172, "learning_rate": 0.0009709221818197624, "loss": 1533.3811, "step": 1180 }, { "ce_loss_12": 3.3617777824401855, "ce_loss_17": 3.090414488315582, "ce_loss_23": 2.96088707447052, "ce_loss_3": 4.399712181091308, "ce_loss_6": 3.9232612371444704, "epoch": 0.119, "grad_norm": 924.0, "kl_loss_12": 879.8941070556641, "kl_loss_17": 276.8961006164551, "kl_loss_3": 2980.789501953125, "kl_loss_6": 2052.8765197753905, "learning_rate": 0.0009703866145003512, "loss": 1544.3242, "step": 1190 }, { "ce_loss_12": 3.321975600719452, "ce_loss_17": 3.054083788394928, "ce_loss_23": 2.930968141555786, "ce_loss_3": 4.354817938804627, "ce_loss_6": 3.874580407142639, "epoch": 0.12, "grad_norm": 1104.0, "kl_loss_12": 864.2863861083985, "kl_loss_17": 270.091845703125, "kl_loss_3": 2934.7477294921873, "kl_loss_6": 2007.5105346679688, "learning_rate": 0.0009698463103929542, "loss": 1543.8357, "step": 1200 }, { "ce_loss_12": 3.2936753749847414, "ce_loss_17": 3.019405686855316, "ce_loss_23": 2.894251120090485, "ce_loss_3": 4.344498944282532, "ce_loss_6": 3.8676336646080016, "epoch": 0.121, "grad_norm": 1248.0, "kl_loss_12": 886.2678466796875, "kl_loss_17": 277.1609245300293, "kl_loss_3": 2976.411096191406, "kl_loss_6": 2053.3679931640627, "learning_rate": 0.0009693012749384279, "loss": 1554.7787, "step": 1210 }, { "ce_loss_12": 3.2914728999137877, "ce_loss_17": 3.022671139240265, "ce_loss_23": 2.898689293861389, "ce_loss_3": 4.314063239097595, "ce_loss_6": 3.8446247458457945, "epoch": 0.122, "grad_norm": 1088.0, "kl_loss_12": 864.2732177734375, "kl_loss_17": 271.4930358886719, "kl_loss_3": 2932.395886230469, "kl_loss_6": 2020.443914794922, "learning_rate": 0.0009687515136252732, "loss": 1516.3835, "step": 1220 }, { "ce_loss_12": 3.2560503005981447, "ce_loss_17": 2.9858575463294983, "ce_loss_23": 2.862755537033081, "ce_loss_3": 4.357694816589356, "ce_loss_6": 3.8565963745117187, "epoch": 0.123, "grad_norm": 980.0, "kl_loss_12": 858.8930755615235, "kl_loss_17": 265.92589340209963, "kl_loss_3": 3066.5052001953127, "kl_loss_6": 2076.885809326172, "learning_rate": 0.0009681970319895803, "loss": 1597.2988, "step": 1230 }, { "ce_loss_12": 3.3237783312797546, "ce_loss_17": 3.0648343682289125, "ce_loss_23": 2.946179783344269, "ce_loss_3": 4.364671421051026, "ce_loss_6": 3.8876635074615478, "epoch": 0.124, "grad_norm": 1032.0, "kl_loss_12": 847.8623840332032, "kl_loss_17": 261.37926483154297, "kl_loss_3": 2940.520544433594, "kl_loss_6": 1995.1516235351562, "learning_rate": 0.0009676378356149733, "loss": 1508.2539, "step": 1240 }, { "ce_loss_12": 3.2792726516723634, "ce_loss_17": 3.026242733001709, "ce_loss_23": 2.912305271625519, "ce_loss_3": 4.320637655258179, "ce_loss_6": 3.8262179017066957, "epoch": 0.125, "grad_norm": 1096.0, "kl_loss_12": 820.703564453125, "kl_loss_17": 256.4162796020508, "kl_loss_3": 2895.8089965820313, "kl_loss_6": 1947.9052673339843, "learning_rate": 0.0009670739301325534, "loss": 1494.4266, "step": 1250 }, { "ce_loss_12": 3.2621443033218385, "ce_loss_17": 2.9985108733177186, "ce_loss_23": 2.873834729194641, "ce_loss_3": 4.284839725494384, "ce_loss_6": 3.802198255062103, "epoch": 0.126, "grad_norm": 916.0, "kl_loss_12": 842.8930450439453, "kl_loss_17": 275.43683166503905, "kl_loss_3": 2915.22626953125, "kl_loss_6": 1975.2683044433593, "learning_rate": 0.0009665053212208426, "loss": 1521.6795, "step": 1260 }, { "ce_loss_12": 3.287890446186066, "ce_loss_17": 3.0372009754180906, "ce_loss_23": 2.9077538013458253, "ce_loss_3": 4.342221176624298, "ce_loss_6": 3.8497675180435182, "epoch": 0.127, "grad_norm": 1040.0, "kl_loss_12": 853.9959564208984, "kl_loss_17": 286.59808654785155, "kl_loss_3": 2974.9799682617186, "kl_loss_6": 2006.1366577148438, "learning_rate": 0.0009659320146057262, "loss": 1520.3227, "step": 1270 }, { "ce_loss_12": 3.3012346744537355, "ce_loss_17": 3.0512335300445557, "ce_loss_23": 2.9305968999862673, "ce_loss_3": 4.324515700340271, "ce_loss_6": 3.852405047416687, "epoch": 0.128, "grad_norm": 1064.0, "kl_loss_12": 823.8801452636719, "kl_loss_17": 269.6768173217773, "kl_loss_3": 2893.6376831054686, "kl_loss_6": 1971.940008544922, "learning_rate": 0.0009653540160603955, "loss": 1493.1244, "step": 1280 }, { "ce_loss_12": 3.2927725553512572, "ce_loss_17": 3.045177149772644, "ce_loss_23": 2.931023132801056, "ce_loss_3": 4.314601492881775, "ce_loss_6": 3.8426471829414366, "epoch": 0.129, "grad_norm": 1216.0, "kl_loss_12": 820.3086090087891, "kl_loss_17": 259.5847450256348, "kl_loss_3": 2895.2479614257813, "kl_loss_6": 1956.9156921386718, "learning_rate": 0.0009647713314052896, "loss": 1476.1674, "step": 1290 }, { "ce_loss_12": 3.2684458255767823, "ce_loss_17": 2.9975777506828307, "ce_loss_23": 2.8703335881233216, "ce_loss_3": 4.3339741945266725, "ce_loss_6": 3.859717035293579, "epoch": 0.13, "grad_norm": 972.0, "kl_loss_12": 857.9473876953125, "kl_loss_17": 273.35472869873047, "kl_loss_3": 3001.6898559570313, "kl_loss_6": 2063.1881591796873, "learning_rate": 0.0009641839665080363, "loss": 1537.6836, "step": 1300 }, { "ce_loss_12": 3.2277937054634096, "ce_loss_17": 2.9757333517074587, "ce_loss_23": 2.851700460910797, "ce_loss_3": 4.281731259822846, "ce_loss_6": 3.7879306077957153, "epoch": 0.131, "grad_norm": 2816.0, "kl_loss_12": 826.8987396240234, "kl_loss_17": 270.5403312683105, "kl_loss_3": 2943.7653930664064, "kl_loss_6": 1985.992041015625, "learning_rate": 0.0009635919272833937, "loss": 1488.6453, "step": 1310 }, { "ce_loss_12": 3.259249973297119, "ce_loss_17": 2.9989022612571716, "ce_loss_23": 2.8736031293869018, "ce_loss_3": 4.318255996704101, "ce_loss_6": 3.8237802028656005, "epoch": 0.132, "grad_norm": 1416.0, "kl_loss_12": 834.8762725830078, "kl_loss_17": 283.9470611572266, "kl_loss_3": 2966.973986816406, "kl_loss_6": 1988.0608520507812, "learning_rate": 0.0009629952196931902, "loss": 1490.9254, "step": 1320 }, { "ce_loss_12": 3.2333609223365785, "ce_loss_17": 2.990918219089508, "ce_loss_23": 2.8692086040973663, "ce_loss_3": 4.287823891639709, "ce_loss_6": 3.7886445045471193, "epoch": 0.133, "grad_norm": 1020.0, "kl_loss_12": 802.2819915771485, "kl_loss_17": 269.66481018066406, "kl_loss_3": 2919.6731567382812, "kl_loss_6": 1954.8788635253907, "learning_rate": 0.0009623938497462645, "loss": 1478.0739, "step": 1330 }, { "ce_loss_12": 3.229719305038452, "ce_loss_17": 2.9842681527137755, "ce_loss_23": 2.860576260089874, "ce_loss_3": 4.277899718284607, "ce_loss_6": 3.7947510719299316, "epoch": 0.134, "grad_norm": 932.0, "kl_loss_12": 829.4476867675781, "kl_loss_17": 271.8407012939453, "kl_loss_3": 2928.6218872070312, "kl_loss_6": 1985.8415222167969, "learning_rate": 0.0009617878234984055, "loss": 1507.8875, "step": 1340 }, { "ce_loss_12": 3.2999788522720337, "ce_loss_17": 3.0766652107238768, "ce_loss_23": 2.9456470131874086, "ce_loss_3": 4.303646683692932, "ce_loss_6": 3.8252553224563597, "epoch": 0.135, "grad_norm": 1088.0, "kl_loss_12": 798.0401733398437, "kl_loss_17": 287.97615509033204, "kl_loss_3": 2827.784289550781, "kl_loss_6": 1898.5206604003906, "learning_rate": 0.0009611771470522907, "loss": 1471.7371, "step": 1350 }, { "ce_loss_12": 3.2443023204803465, "ce_loss_17": 3.008663058280945, "ce_loss_23": 2.872717189788818, "ce_loss_3": 4.279219973087311, "ce_loss_6": 3.7957202434539794, "epoch": 0.136, "grad_norm": 1208.0, "kl_loss_12": 799.3139007568359, "kl_loss_17": 294.02868194580077, "kl_loss_3": 2879.5612060546873, "kl_loss_6": 1925.3865356445312, "learning_rate": 0.0009605618265574251, "loss": 1462.2784, "step": 1360 }, { "ce_loss_12": 3.2207038402557373, "ce_loss_17": 2.9885846972465515, "ce_loss_23": 2.853401231765747, "ce_loss_3": 4.292985570430756, "ce_loss_6": 3.8024420857429506, "epoch": 0.137, "grad_norm": 1144.0, "kl_loss_12": 834.6501495361329, "kl_loss_17": 299.46264266967773, "kl_loss_3": 2988.1515380859373, "kl_loss_6": 2041.3694946289063, "learning_rate": 0.0009599418682100792, "loss": 1512.3742, "step": 1370 }, { "ce_loss_12": 3.247525489330292, "ce_loss_17": 3.0069747805595397, "ce_loss_23": 2.884253454208374, "ce_loss_3": 4.294894897937775, "ce_loss_6": 3.809372806549072, "epoch": 0.138, "grad_norm": 964.0, "kl_loss_12": 802.948861694336, "kl_loss_17": 271.59482803344724, "kl_loss_3": 2909.496472167969, "kl_loss_6": 1976.5658874511719, "learning_rate": 0.0009593172782532268, "loss": 1491.6639, "step": 1380 }, { "ce_loss_12": 3.280215847492218, "ce_loss_17": 3.0361931324005127, "ce_loss_23": 2.914945065975189, "ce_loss_3": 4.304294717311859, "ce_loss_6": 3.819801914691925, "epoch": 0.139, "grad_norm": 1020.0, "kl_loss_12": 807.8031372070312, "kl_loss_17": 265.86458892822264, "kl_loss_3": 2857.7116943359374, "kl_loss_6": 1914.3999145507812, "learning_rate": 0.0009586880629764817, "loss": 1465.0936, "step": 1390 }, { "ce_loss_12": 3.2215174078941344, "ce_loss_17": 2.9842705965042113, "ce_loss_23": 2.8551847457885744, "ce_loss_3": 4.27022614479065, "ce_loss_6": 3.788445198535919, "epoch": 0.14, "grad_norm": 1040.0, "kl_loss_12": 811.4446868896484, "kl_loss_17": 288.3548393249512, "kl_loss_3": 2900.03798828125, "kl_loss_6": 1963.1980285644531, "learning_rate": 0.0009580542287160348, "loss": 1467.0395, "step": 1400 }, { "ce_loss_12": 3.188516676425934, "ce_loss_17": 2.9619083285331724, "ce_loss_23": 2.8218137621879578, "ce_loss_3": 4.234646821022034, "ce_loss_6": 3.739990162849426, "epoch": 0.141, "grad_norm": 1048.0, "kl_loss_12": 809.8930877685547, "kl_loss_17": 288.1702690124512, "kl_loss_3": 2922.703991699219, "kl_loss_6": 1956.603094482422, "learning_rate": 0.0009574157818545901, "loss": 1469.4032, "step": 1410 }, { "ce_loss_12": 3.2416644215583803, "ce_loss_17": 3.006680631637573, "ce_loss_23": 2.8900532841682436, "ce_loss_3": 4.26418137550354, "ce_loss_6": 3.788861167430878, "epoch": 0.142, "grad_norm": 876.0, "kl_loss_12": 785.4881103515625, "kl_loss_17": 259.1768203735352, "kl_loss_3": 2836.4365356445314, "kl_loss_6": 1906.6022399902345, "learning_rate": 0.0009567727288213005, "loss": 1482.4597, "step": 1420 }, { "ce_loss_12": 3.235337662696838, "ce_loss_17": 2.98630108833313, "ce_loss_23": 2.8627288341522217, "ce_loss_3": 4.281697344779968, "ce_loss_6": 3.796532988548279, "epoch": 0.143, "grad_norm": 1280.0, "kl_loss_12": 819.8938232421875, "kl_loss_17": 269.30180130004885, "kl_loss_3": 2927.6844848632813, "kl_loss_6": 1979.854022216797, "learning_rate": 0.0009561250760917027, "loss": 1478.084, "step": 1430 }, { "ce_loss_12": 3.2488888025283815, "ce_loss_17": 2.9993746876716614, "ce_loss_23": 2.876301276683807, "ce_loss_3": 4.269674825668335, "ce_loss_6": 3.792963969707489, "epoch": 0.144, "grad_norm": 1512.0, "kl_loss_12": 822.9290191650391, "kl_loss_17": 268.31039962768557, "kl_loss_3": 2896.881481933594, "kl_loss_6": 1968.8903930664062, "learning_rate": 0.0009554728301876525, "loss": 1456.0495, "step": 1440 }, { "ce_loss_12": 3.2897768020629883, "ce_loss_17": 3.033561480045319, "ce_loss_23": 2.916265618801117, "ce_loss_3": 4.292733931541443, "ce_loss_6": 3.8300752878189086, "epoch": 0.145, "grad_norm": 1392.0, "kl_loss_12": 810.3653289794922, "kl_loss_17": 257.99100799560546, "kl_loss_3": 2828.956005859375, "kl_loss_6": 1927.9954711914063, "learning_rate": 0.0009548159976772592, "loss": 1499.9711, "step": 1450 }, { "ce_loss_12": 3.24677711725235, "ce_loss_17": 2.9964696407318114, "ce_loss_23": 2.8798062562942506, "ce_loss_3": 4.300918602943421, "ce_loss_6": 3.796139717102051, "epoch": 0.146, "grad_norm": 1144.0, "kl_loss_12": 811.711947631836, "kl_loss_17": 257.1396308898926, "kl_loss_3": 2927.9613647460938, "kl_loss_6": 1955.6009887695313, "learning_rate": 0.0009541545851748186, "loss": 1472.3645, "step": 1460 }, { "ce_loss_12": 3.1249361634254456, "ce_loss_17": 2.8709220051765443, "ce_loss_23": 2.7551259279251097, "ce_loss_3": 4.210307502746582, "ce_loss_6": 3.6961392521858216, "epoch": 0.147, "grad_norm": 1216.0, "kl_loss_12": 803.9768951416015, "kl_loss_17": 253.5036376953125, "kl_loss_3": 2965.500927734375, "kl_loss_6": 1973.1413146972657, "learning_rate": 0.0009534885993407473, "loss": 1481.9013, "step": 1470 }, { "ce_loss_12": 3.2659184217453, "ce_loss_17": 3.0223120212554933, "ce_loss_23": 2.907353913784027, "ce_loss_3": 4.305331182479859, "ce_loss_6": 3.828053390979767, "epoch": 0.148, "grad_norm": 1080.0, "kl_loss_12": 788.480908203125, "kl_loss_17": 250.7262405395508, "kl_loss_3": 2884.44912109375, "kl_loss_6": 1953.9335510253907, "learning_rate": 0.0009528180468815154, "loss": 1472.3178, "step": 1480 }, { "ce_loss_12": 3.311021792888641, "ce_loss_17": 3.071496820449829, "ce_loss_23": 2.9612332940101624, "ce_loss_3": 4.324043917655945, "ce_loss_6": 3.845843195915222, "epoch": 0.149, "grad_norm": 908.0, "kl_loss_12": 800.0986083984375, "kl_loss_17": 250.172859954834, "kl_loss_3": 2832.765869140625, "kl_loss_6": 1897.6001281738281, "learning_rate": 0.0009521429345495787, "loss": 1447.2758, "step": 1490 }, { "ce_loss_12": 3.2847871661186216, "ce_loss_17": 3.050262463092804, "ce_loss_23": 2.939221966266632, "ce_loss_3": 4.285373425483703, "ce_loss_6": 3.809739577770233, "epoch": 0.15, "grad_norm": 972.0, "kl_loss_12": 775.6115142822266, "kl_loss_17": 245.11468811035155, "kl_loss_3": 2820.178259277344, "kl_loss_6": 1885.4727478027344, "learning_rate": 0.0009514632691433108, "loss": 1448.602, "step": 1500 }, { "ce_loss_12": 3.2675668716430666, "ce_loss_17": 3.024088716506958, "ce_loss_23": 2.903068017959595, "ce_loss_3": 4.28171523809433, "ce_loss_6": 3.8086917757987977, "epoch": 0.151, "grad_norm": 1400.0, "kl_loss_12": 806.0697418212891, "kl_loss_17": 269.3117622375488, "kl_loss_3": 2851.461560058594, "kl_loss_6": 1931.8026123046875, "learning_rate": 0.0009507790575069346, "loss": 1463.5699, "step": 1510 }, { "ce_loss_12": 3.24948410987854, "ce_loss_17": 2.996292233467102, "ce_loss_23": 2.8639941811561584, "ce_loss_3": 4.288215208053589, "ce_loss_6": 3.800259804725647, "epoch": 0.152, "grad_norm": 1008.0, "kl_loss_12": 829.4756469726562, "kl_loss_17": 274.5081420898438, "kl_loss_3": 2920.2522094726564, "kl_loss_6": 1967.4820678710937, "learning_rate": 0.0009500903065304539, "loss": 1501.5518, "step": 1520 }, { "ce_loss_12": 3.2591757655143736, "ce_loss_17": 3.027329218387604, "ce_loss_23": 2.911393141746521, "ce_loss_3": 4.253254747390747, "ce_loss_6": 3.7834643006324766, "epoch": 0.153, "grad_norm": 1004.0, "kl_loss_12": 778.630160522461, "kl_loss_17": 250.61957092285155, "kl_loss_3": 2784.512976074219, "kl_loss_6": 1862.2048217773438, "learning_rate": 0.0009493970231495835, "loss": 1446.7391, "step": 1530 }, { "ce_loss_12": 3.2096596479415895, "ce_loss_17": 2.9711764454841614, "ce_loss_23": 2.8672603607177733, "ce_loss_3": 4.199491119384765, "ce_loss_6": 3.7244733333587647, "epoch": 0.154, "grad_norm": 1280.0, "kl_loss_12": 773.3964080810547, "kl_loss_17": 244.00584335327147, "kl_loss_3": 2788.61259765625, "kl_loss_6": 1862.1662048339845, "learning_rate": 0.0009486992143456792, "loss": 1424.141, "step": 1540 }, { "ce_loss_12": 3.2563757061958314, "ce_loss_17": 2.998210537433624, "ce_loss_23": 2.874551975727081, "ce_loss_3": 4.34161684513092, "ce_loss_6": 3.837330734729767, "epoch": 0.155, "grad_norm": 1048.0, "kl_loss_12": 833.3443481445313, "kl_loss_17": 266.61425628662107, "kl_loss_3": 3008.584338378906, "kl_loss_6": 2019.5796325683593, "learning_rate": 0.0009479968871456679, "loss": 1489.1023, "step": 1550 }, { "ce_loss_12": 3.21219242811203, "ce_loss_17": 2.9625760078430177, "ce_loss_23": 2.8483705639839174, "ce_loss_3": 4.265359997749329, "ce_loss_6": 3.76342613697052, "epoch": 0.156, "grad_norm": 1104.0, "kl_loss_12": 800.6179107666015, "kl_loss_17": 251.26333312988282, "kl_loss_3": 2913.7487548828126, "kl_loss_6": 1930.050030517578, "learning_rate": 0.0009472900486219768, "loss": 1445.6348, "step": 1560 }, { "ce_loss_12": 3.2221407175064085, "ce_loss_17": 2.961968779563904, "ce_loss_23": 2.8494272470474242, "ce_loss_3": 4.204352223873139, "ce_loss_6": 3.7289497375488283, "epoch": 0.157, "grad_norm": 1040.0, "kl_loss_12": 805.2766754150391, "kl_loss_17": 246.36323776245118, "kl_loss_3": 2819.4550659179686, "kl_loss_6": 1890.1749145507813, "learning_rate": 0.000946578705892462, "loss": 1451.4247, "step": 1570 }, { "ce_loss_12": 3.259094977378845, "ce_loss_17": 2.9870534896850587, "ce_loss_23": 2.8754953742027283, "ce_loss_3": 4.228675103187561, "ce_loss_6": 3.748308801651001, "epoch": 0.158, "grad_norm": 1168.0, "kl_loss_12": 843.6717803955078, "kl_loss_17": 246.24478073120116, "kl_loss_3": 2784.921044921875, "kl_loss_6": 1842.1207641601563, "learning_rate": 0.0009458628661203367, "loss": 1458.2909, "step": 1580 }, { "ce_loss_12": 3.2754209637641907, "ce_loss_17": 2.9969581842422484, "ce_loss_23": 2.8823806643486023, "ce_loss_3": 4.299981880187988, "ce_loss_6": 3.8002068281173704, "epoch": 0.159, "grad_norm": 1216.0, "kl_loss_12": 866.5703857421875, "kl_loss_17": 252.2691665649414, "kl_loss_3": 2910.86015625, "kl_loss_6": 1942.7434814453125, "learning_rate": 0.0009451425365140996, "loss": 1447.4223, "step": 1590 }, { "ce_loss_12": 3.317850708961487, "ce_loss_17": 3.06536523103714, "ce_loss_23": 2.947838509082794, "ce_loss_3": 4.273639845848083, "ce_loss_6": 3.804075849056244, "epoch": 0.16, "grad_norm": 1320.0, "kl_loss_12": 831.5212005615234, "kl_loss_17": 251.46739044189454, "kl_loss_3": 2753.1792846679687, "kl_loss_6": 1837.5140747070313, "learning_rate": 0.0009444177243274617, "loss": 1415.2524, "step": 1600 }, { "ce_loss_12": 3.1936524152755736, "ce_loss_17": 2.9349228024482725, "ce_loss_23": 2.8170175671577455, "ce_loss_3": 4.21053067445755, "ce_loss_6": 3.7250030398368836, "epoch": 0.161, "grad_norm": 1192.0, "kl_loss_12": 843.8219207763672, "kl_loss_17": 257.3523231506348, "kl_loss_3": 2868.45556640625, "kl_loss_6": 1927.8065856933595, "learning_rate": 0.0009436884368592739, "loss": 1460.9502, "step": 1610 }, { "ce_loss_12": 3.239683485031128, "ce_loss_17": 2.9786096811294556, "ce_loss_23": 2.8677761435508726, "ce_loss_3": 4.2137162446975704, "ce_loss_6": 3.740345096588135, "epoch": 0.162, "grad_norm": 1296.0, "kl_loss_12": 825.1115173339844, "kl_loss_17": 247.33793029785156, "kl_loss_3": 2789.4590698242187, "kl_loss_6": 1862.7459655761718, "learning_rate": 0.0009429546814534529, "loss": 1460.2281, "step": 1620 }, { "ce_loss_12": 3.231956887245178, "ce_loss_17": 2.9807236790657043, "ce_loss_23": 2.876505267620087, "ce_loss_3": 4.215959429740906, "ce_loss_6": 3.7355253100395203, "epoch": 0.163, "grad_norm": 960.0, "kl_loss_12": 808.4689880371094, "kl_loss_17": 241.53212051391603, "kl_loss_3": 2776.0137573242187, "kl_loss_6": 1856.5346984863281, "learning_rate": 0.0009422164654989072, "loss": 1402.5779, "step": 1630 }, { "ce_loss_12": 3.3346951007843018, "ce_loss_17": 3.0862001299858095, "ce_loss_23": 2.9771710872650146, "ce_loss_3": 4.302021241188049, "ce_loss_6": 3.8365179657936097, "epoch": 0.164, "grad_norm": 1256.0, "kl_loss_12": 802.1354614257813, "kl_loss_17": 246.5807846069336, "kl_loss_3": 2768.77490234375, "kl_loss_6": 1852.6294067382812, "learning_rate": 0.0009414737964294635, "loss": 1425.5844, "step": 1640 }, { "ce_loss_12": 3.2539820194244387, "ce_loss_17": 3.0245274782180784, "ce_loss_23": 2.919032084941864, "ce_loss_3": 4.217273092269897, "ce_loss_6": 3.747937524318695, "epoch": 0.165, "grad_norm": 1144.0, "kl_loss_12": 758.6764282226562, "kl_loss_17": 240.4837532043457, "kl_loss_3": 2683.5130615234375, "kl_loss_6": 1773.5954284667969, "learning_rate": 0.000940726681723791, "loss": 1408.1107, "step": 1650 }, { "ce_loss_12": 3.1329286456108094, "ce_loss_17": 2.895743155479431, "ce_loss_23": 2.780760633945465, "ce_loss_3": 4.180186772346497, "ce_loss_6": 3.68881573677063, "epoch": 0.166, "grad_norm": 1248.0, "kl_loss_12": 794.1787048339844, "kl_loss_17": 253.62862396240234, "kl_loss_3": 2912.988879394531, "kl_loss_6": 1942.947412109375, "learning_rate": 0.0009399751289053266, "loss": 1414.577, "step": 1660 }, { "ce_loss_12": 3.300536847114563, "ce_loss_17": 3.065656638145447, "ce_loss_23": 2.955276608467102, "ce_loss_3": 4.278782832622528, "ce_loss_6": 3.8195556879043577, "epoch": 0.167, "grad_norm": 960.0, "kl_loss_12": 761.587435913086, "kl_loss_17": 248.2511199951172, "kl_loss_3": 2737.0460693359373, "kl_loss_6": 1834.2865234375, "learning_rate": 0.0009392191455421988, "loss": 1425.7268, "step": 1670 }, { "ce_loss_12": 3.2943241119384767, "ce_loss_17": 3.058737647533417, "ce_loss_23": 2.948542630672455, "ce_loss_3": 4.2761582136154175, "ce_loss_6": 3.802069568634033, "epoch": 0.168, "grad_norm": 1136.0, "kl_loss_12": 786.0340240478515, "kl_loss_17": 257.52221908569334, "kl_loss_3": 2779.2551391601564, "kl_loss_6": 1863.6268981933595, "learning_rate": 0.0009384587392471515, "loss": 1395.0223, "step": 1680 }, { "ce_loss_12": 3.2665735125541686, "ce_loss_17": 3.041038954257965, "ce_loss_23": 2.9342415809631346, "ce_loss_3": 4.232107269763946, "ce_loss_6": 3.786698818206787, "epoch": 0.169, "grad_norm": 1004.0, "kl_loss_12": 754.4325256347656, "kl_loss_17": 239.2375801086426, "kl_loss_3": 2717.1393188476563, "kl_loss_6": 1834.2838317871094, "learning_rate": 0.0009376939176774678, "loss": 1389.892, "step": 1690 }, { "ce_loss_12": 3.2542818665504454, "ce_loss_17": 3.018766474723816, "ce_loss_23": 2.9072797894477844, "ce_loss_3": 4.2547527074813845, "ce_loss_6": 3.778389811515808, "epoch": 0.17, "grad_norm": 1296.0, "kl_loss_12": 763.4822723388672, "kl_loss_17": 250.73448333740234, "kl_loss_3": 2776.2912475585936, "kl_loss_6": 1853.5381958007813, "learning_rate": 0.0009369246885348925, "loss": 1434.3868, "step": 1700 }, { "ce_loss_12": 3.2471718430519103, "ce_loss_17": 3.020142710208893, "ce_loss_23": 2.899143636226654, "ce_loss_3": 4.27671126127243, "ce_loss_6": 3.797596609592438, "epoch": 0.171, "grad_norm": 968.0, "kl_loss_12": 775.6185882568359, "kl_loss_17": 265.4308486938477, "kl_loss_3": 2856.3064575195312, "kl_loss_6": 1918.2055114746095, "learning_rate": 0.0009361510595655545, "loss": 1441.4605, "step": 1710 }, { "ce_loss_12": 3.215370297431946, "ce_loss_17": 2.9849206686019896, "ce_loss_23": 2.858234190940857, "ce_loss_3": 4.21358277797699, "ce_loss_6": 3.755553126335144, "epoch": 0.172, "grad_norm": 1072.0, "kl_loss_12": 784.4585998535156, "kl_loss_17": 264.6829551696777, "kl_loss_3": 2817.5525634765627, "kl_loss_6": 1899.7091918945312, "learning_rate": 0.0009353730385598887, "loss": 1433.4188, "step": 1720 }, { "ce_loss_12": 3.1493828177452086, "ce_loss_17": 2.9145042300224304, "ce_loss_23": 2.8017009973526, "ce_loss_3": 4.190014028549195, "ce_loss_6": 3.7082863092422484, "epoch": 0.173, "grad_norm": 1616.0, "kl_loss_12": 781.0476715087891, "kl_loss_17": 251.35236129760742, "kl_loss_3": 2861.7456298828124, "kl_loss_6": 1924.5235900878906, "learning_rate": 0.0009345906333525581, "loss": 1445.5838, "step": 1730 }, { "ce_loss_12": 3.1840237855911253, "ce_loss_17": 2.9483115673065186, "ce_loss_23": 2.831267786026001, "ce_loss_3": 4.200755143165589, "ce_loss_6": 3.7272995591163633, "epoch": 0.174, "grad_norm": 1200.0, "kl_loss_12": 791.5306030273438, "kl_loss_17": 264.3092872619629, "kl_loss_3": 2830.4186279296873, "kl_loss_6": 1902.0811584472656, "learning_rate": 0.0009338038518223745, "loss": 1424.1994, "step": 1740 }, { "ce_loss_12": 3.251028025150299, "ce_loss_17": 3.0087919354438784, "ce_loss_23": 2.88809187412262, "ce_loss_3": 4.258924639225006, "ce_loss_6": 3.7800361275672913, "epoch": 0.175, "grad_norm": 1176.0, "kl_loss_12": 800.4794372558594, "kl_loss_17": 260.26867828369143, "kl_loss_3": 2862.462158203125, "kl_loss_6": 1918.6351745605468, "learning_rate": 0.0009330127018922195, "loss": 1475.6443, "step": 1750 }, { "ce_loss_12": 3.2001158714294435, "ce_loss_17": 2.9682640552520754, "ce_loss_23": 2.8567294597625734, "ce_loss_3": 4.22244223356247, "ce_loss_6": 3.7290648221969604, "epoch": 0.176, "grad_norm": 1464.0, "kl_loss_12": 762.525503540039, "kl_loss_17": 243.7757797241211, "kl_loss_3": 2817.8551391601563, "kl_loss_6": 1868.523504638672, "learning_rate": 0.0009322171915289634, "loss": 1421.9354, "step": 1760 }, { "ce_loss_12": 3.223832643032074, "ce_loss_17": 3.0016063809394837, "ce_loss_23": 2.8990545868873596, "ce_loss_3": 4.203401672840118, "ce_loss_6": 3.7404787063598635, "epoch": 0.177, "grad_norm": 1176.0, "kl_loss_12": 759.478857421875, "kl_loss_17": 234.98663024902345, "kl_loss_3": 2762.4353637695312, "kl_loss_6": 1849.6406311035157, "learning_rate": 0.0009314173287433873, "loss": 1389.1017, "step": 1770 }, { "ce_loss_12": 3.214055931568146, "ce_loss_17": 2.9797696232795716, "ce_loss_23": 2.872143268585205, "ce_loss_3": 4.21132276058197, "ce_loss_6": 3.7429075956344606, "epoch": 0.178, "grad_norm": 1096.0, "kl_loss_12": 780.1603851318359, "kl_loss_17": 244.01894989013672, "kl_loss_3": 2798.7838623046873, "kl_loss_6": 1879.8642150878907, "learning_rate": 0.0009306131215901003, "loss": 1396.7419, "step": 1780 }, { "ce_loss_12": 3.238218307495117, "ce_loss_17": 3.0090879082679747, "ce_loss_23": 2.9029579758644104, "ce_loss_3": 4.228364479541779, "ce_loss_6": 3.757898378372192, "epoch": 0.179, "grad_norm": 1040.0, "kl_loss_12": 763.7487243652344, "kl_loss_17": 239.04694061279298, "kl_loss_3": 2757.7945068359377, "kl_loss_6": 1843.6190063476563, "learning_rate": 0.0009298045781674596, "loss": 1378.4027, "step": 1790 }, { "ce_loss_12": 3.2191762566566466, "ce_loss_17": 2.98970650434494, "ce_loss_23": 2.8842029094696047, "ce_loss_3": 4.1989802598953245, "ce_loss_6": 3.735355496406555, "epoch": 0.18, "grad_norm": 1496.0, "kl_loss_12": 752.9535675048828, "kl_loss_17": 236.7996368408203, "kl_loss_3": 2740.193994140625, "kl_loss_6": 1820.6103637695312, "learning_rate": 0.0009289917066174886, "loss": 1404.1791, "step": 1800 }, { "ce_loss_12": 3.2065064549446105, "ce_loss_17": 2.9845200538635255, "ce_loss_23": 2.885902488231659, "ce_loss_3": 4.161723887920379, "ce_loss_6": 3.7009355306625364, "epoch": 0.181, "grad_norm": 1032.0, "kl_loss_12": 729.0150360107422, "kl_loss_17": 226.00843200683593, "kl_loss_3": 2676.867541503906, "kl_loss_6": 1771.958477783203, "learning_rate": 0.0009281745151257945, "loss": 1359.5951, "step": 1810 }, { "ce_loss_12": 3.2467663407325746, "ce_loss_17": 3.009868013858795, "ce_loss_23": 2.9040247201919556, "ce_loss_3": 4.229266285896301, "ce_loss_6": 3.7639632701873778, "epoch": 0.182, "grad_norm": 956.0, "kl_loss_12": 746.8623779296875, "kl_loss_17": 231.6996307373047, "kl_loss_3": 2731.1640380859376, "kl_loss_6": 1821.4275512695312, "learning_rate": 0.0009273530119214868, "loss": 1400.1968, "step": 1820 }, { "ce_loss_12": 3.312739670276642, "ce_loss_17": 3.093990111351013, "ce_loss_23": 2.990409862995148, "ce_loss_3": 4.29701189994812, "ce_loss_6": 3.8379727005958557, "epoch": 0.183, "grad_norm": 976.0, "kl_loss_12": 741.4776031494141, "kl_loss_17": 232.1430564880371, "kl_loss_3": 2739.31171875, "kl_loss_6": 1829.6507995605468, "learning_rate": 0.0009265272052770935, "loss": 1371.0215, "step": 1830 }, { "ce_loss_12": 3.17973473072052, "ce_loss_17": 2.9410093665122985, "ce_loss_23": 2.8326215505599976, "ce_loss_3": 4.19485604763031, "ce_loss_6": 3.7067224383354187, "epoch": 0.184, "grad_norm": 1200.0, "kl_loss_12": 755.7035552978516, "kl_loss_17": 238.02146682739257, "kl_loss_3": 2803.255017089844, "kl_loss_6": 1848.4059265136718, "learning_rate": 0.0009256971035084784, "loss": 1404.2541, "step": 1840 }, { "ce_loss_12": 3.127467322349548, "ce_loss_17": 2.8794915318489074, "ce_loss_23": 2.7674395084381103, "ce_loss_3": 4.163695001602173, "ce_loss_6": 3.674273729324341, "epoch": 0.185, "grad_norm": 936.0, "kl_loss_12": 794.7793029785156, "kl_loss_17": 245.12851486206054, "kl_loss_3": 2877.0427001953126, "kl_loss_6": 1919.3818481445312, "learning_rate": 0.0009248627149747573, "loss": 1431.63, "step": 1850 }, { "ce_loss_12": 3.2907007336616516, "ce_loss_17": 3.065432584285736, "ce_loss_23": 2.9592000246047974, "ce_loss_3": 4.26578665971756, "ce_loss_6": 3.801167941093445, "epoch": 0.186, "grad_norm": 1272.0, "kl_loss_12": 756.5147766113281, "kl_loss_17": 237.35702362060547, "kl_loss_3": 2748.3367919921875, "kl_loss_6": 1831.7043518066407, "learning_rate": 0.0009240240480782129, "loss": 1394.3164, "step": 1860 }, { "ce_loss_12": 3.198230564594269, "ce_loss_17": 2.9691875100135805, "ce_loss_23": 2.8620351433753966, "ce_loss_3": 4.207461893558502, "ce_loss_6": 3.73105411529541, "epoch": 0.187, "grad_norm": 1232.0, "kl_loss_12": 758.0779357910156, "kl_loss_17": 236.43061218261718, "kl_loss_3": 2785.8535034179686, "kl_loss_6": 1860.8220092773438, "learning_rate": 0.0009231811112642122, "loss": 1393.7689, "step": 1870 }, { "ce_loss_12": 3.2416003346443176, "ce_loss_17": 3.011486494541168, "ce_loss_23": 2.908844864368439, "ce_loss_3": 4.20769385099411, "ce_loss_6": 3.746701693534851, "epoch": 0.188, "grad_norm": 1080.0, "kl_loss_12": 754.1037841796875, "kl_loss_17": 232.73772125244142, "kl_loss_3": 2723.378137207031, "kl_loss_6": 1814.4259948730469, "learning_rate": 0.0009223339130211192, "loss": 1383.0408, "step": 1880 }, { "ce_loss_12": 3.110693836212158, "ce_loss_17": 2.8853095531463624, "ce_loss_23": 2.7808427035808565, "ce_loss_3": 4.13761682510376, "ce_loss_6": 3.642694425582886, "epoch": 0.189, "grad_norm": 1040.0, "kl_loss_12": 739.8239501953125, "kl_loss_17": 229.3358184814453, "kl_loss_3": 2823.7280517578124, "kl_loss_6": 1849.0847961425782, "learning_rate": 0.0009214824618802108, "loss": 1402.2676, "step": 1890 }, { "ce_loss_12": 3.2733206272125246, "ce_loss_17": 3.0429879903793333, "ce_loss_23": 2.9388628959655763, "ce_loss_3": 4.262214660644531, "ce_loss_6": 3.79117773771286, "epoch": 0.19, "grad_norm": 1008.0, "kl_loss_12": 749.5772369384765, "kl_loss_17": 232.20852584838866, "kl_loss_3": 2747.72998046875, "kl_loss_6": 1821.8578186035156, "learning_rate": 0.0009206267664155906, "loss": 1417.5346, "step": 1900 }, { "ce_loss_12": 3.2125448346138, "ce_loss_17": 2.979010045528412, "ce_loss_23": 2.8730541586875917, "ce_loss_3": 4.208014273643494, "ce_loss_6": 3.7290257573127747, "epoch": 0.191, "grad_norm": 1096.0, "kl_loss_12": 748.1908905029297, "kl_loss_17": 234.87025756835936, "kl_loss_3": 2755.835559082031, "kl_loss_6": 1819.9845886230469, "learning_rate": 0.0009197668352441024, "loss": 1395.7588, "step": 1910 }, { "ce_loss_12": 3.239735448360443, "ce_loss_17": 3.018569827079773, "ce_loss_23": 2.917696511745453, "ce_loss_3": 4.225832998752594, "ce_loss_6": 3.7482815623283385, "epoch": 0.192, "grad_norm": 1080.0, "kl_loss_12": 737.2051788330078, "kl_loss_17": 232.76412811279297, "kl_loss_3": 2725.888525390625, "kl_loss_6": 1800.2931823730469, "learning_rate": 0.0009189026770252437, "loss": 1383.0574, "step": 1920 }, { "ce_loss_12": 3.272562229633331, "ce_loss_17": 3.046920895576477, "ce_loss_23": 2.9422855377197266, "ce_loss_3": 4.243232655525207, "ce_loss_6": 3.782981109619141, "epoch": 0.193, "grad_norm": 1008.0, "kl_loss_12": 739.2452575683594, "kl_loss_17": 230.3068389892578, "kl_loss_3": 2704.004113769531, "kl_loss_6": 1798.2643249511718, "learning_rate": 0.000918034300461078, "loss": 1411.7353, "step": 1930 }, { "ce_loss_12": 3.291454005241394, "ce_loss_17": 3.0682451486587525, "ce_loss_23": 2.9636853694915772, "ce_loss_3": 4.24529767036438, "ce_loss_6": 3.7860057950019836, "epoch": 0.194, "grad_norm": 1216.0, "kl_loss_12": 739.2582458496094, "kl_loss_17": 230.60228424072267, "kl_loss_3": 2671.1538330078124, "kl_loss_6": 1778.3694885253906, "learning_rate": 0.0009171617142961477, "loss": 1364.5955, "step": 1940 }, { "ce_loss_12": 3.2587348222732544, "ce_loss_17": 3.027367722988129, "ce_loss_23": 2.9262425184249876, "ce_loss_3": 4.226789343357086, "ce_loss_6": 3.7603554725646973, "epoch": 0.195, "grad_norm": 1168.0, "kl_loss_12": 734.6230255126953, "kl_loss_17": 226.58795089721679, "kl_loss_3": 2717.46865234375, "kl_loss_6": 1794.8947204589845, "learning_rate": 0.0009162849273173857, "loss": 1369.4313, "step": 1950 }, { "ce_loss_12": 3.2035290122032167, "ce_loss_17": 2.976229190826416, "ce_loss_23": 2.8770426392555235, "ce_loss_3": 4.178189706802368, "ce_loss_6": 3.7164478659629823, "epoch": 0.196, "grad_norm": 1160.0, "kl_loss_12": 729.7126403808594, "kl_loss_17": 222.76492233276366, "kl_loss_3": 2702.063098144531, "kl_loss_6": 1786.2204711914062, "learning_rate": 0.0009154039483540273, "loss": 1377.5442, "step": 1960 }, { "ce_loss_12": 3.1775651812553405, "ce_loss_17": 2.961009216308594, "ce_loss_23": 2.8616538405418397, "ce_loss_3": 4.169165551662445, "ce_loss_6": 3.6863211512565615, "epoch": 0.197, "grad_norm": 1104.0, "kl_loss_12": 720.0626831054688, "kl_loss_17": 224.1533004760742, "kl_loss_3": 2727.455603027344, "kl_loss_6": 1790.6717346191406, "learning_rate": 0.0009145187862775209, "loss": 1364.2689, "step": 1970 }, { "ce_loss_12": 3.2117436170578, "ce_loss_17": 2.987203574180603, "ce_loss_23": 2.8880042552948, "ce_loss_3": 4.178555047512054, "ce_loss_6": 3.7207028031349183, "epoch": 0.198, "grad_norm": 916.0, "kl_loss_12": 729.9580291748047, "kl_loss_17": 224.39583053588868, "kl_loss_3": 2696.8760009765624, "kl_loss_6": 1799.79140625, "learning_rate": 0.0009136294500014386, "loss": 1364.4973, "step": 1980 }, { "ce_loss_12": 3.1704697370529176, "ce_loss_17": 2.9398475289344788, "ce_loss_23": 2.8437638759613035, "ce_loss_3": 4.21691871881485, "ce_loss_6": 3.7204419493675234, "epoch": 0.199, "grad_norm": 1160.0, "kl_loss_12": 744.5164001464843, "kl_loss_17": 223.4737236022949, "kl_loss_3": 2823.7519653320314, "kl_loss_6": 1863.6061950683593, "learning_rate": 0.000912735948481387, "loss": 1410.1096, "step": 1990 }, { "ce_loss_12": 3.194620633125305, "ce_loss_17": 2.970863437652588, "ce_loss_23": 2.873484969139099, "ce_loss_3": 4.176866769790649, "ce_loss_6": 3.7112247705459596, "epoch": 0.2, "grad_norm": 1312.0, "kl_loss_12": 743.6204559326172, "kl_loss_17": 226.67948532104492, "kl_loss_3": 2743.9926147460938, "kl_loss_6": 1821.1581726074219, "learning_rate": 0.0009118382907149164, "loss": 1357.7129, "step": 2000 }, { "ce_loss_12": 3.2195873975753786, "ce_loss_17": 2.9947704076766968, "ce_loss_23": 2.8929362654685975, "ce_loss_3": 4.189015960693359, "ce_loss_6": 3.7328444957733153, "epoch": 0.201, "grad_norm": 1200.0, "kl_loss_12": 735.7771606445312, "kl_loss_17": 227.69709014892578, "kl_loss_3": 2695.451416015625, "kl_loss_6": 1806.4518920898438, "learning_rate": 0.0009109364857414306, "loss": 1357.8611, "step": 2010 }, { "ce_loss_12": 3.1835444688797, "ce_loss_17": 2.960699367523193, "ce_loss_23": 2.8582266688346865, "ce_loss_3": 4.159953761100769, "ce_loss_6": 3.6881731033325194, "epoch": 0.202, "grad_norm": 1392.0, "kl_loss_12": 728.1203887939453, "kl_loss_17": 225.80080337524413, "kl_loss_3": 2706.2451171875, "kl_loss_6": 1788.8057861328125, "learning_rate": 0.0009100305426420956, "loss": 1390.1856, "step": 2020 }, { "ce_loss_12": 3.158332920074463, "ce_loss_17": 2.9300349235534666, "ce_loss_23": 2.8350396871566774, "ce_loss_3": 4.194484627246856, "ce_loss_6": 3.69413343667984, "epoch": 0.203, "grad_norm": 1280.0, "kl_loss_12": 742.0165588378907, "kl_loss_17": 223.17797317504883, "kl_loss_3": 2842.8969116210938, "kl_loss_6": 1853.1241821289063, "learning_rate": 0.0009091204705397484, "loss": 1386.8559, "step": 2030 }, { "ce_loss_12": 3.1536441922187803, "ce_loss_17": 2.9197498559951782, "ce_loss_23": 2.8176995158195495, "ce_loss_3": 4.17947895526886, "ce_loss_6": 3.6965800404548643, "epoch": 0.204, "grad_norm": 1152.0, "kl_loss_12": 742.1661315917969, "kl_loss_17": 230.5888328552246, "kl_loss_3": 2826.4287841796877, "kl_loss_6": 1875.2062072753906, "learning_rate": 0.0009082062785988049, "loss": 1397.4822, "step": 2040 }, { "ce_loss_12": 3.269137918949127, "ce_loss_17": 3.0672066807746887, "ce_loss_23": 2.9520501255989076, "ce_loss_3": 4.209022450447082, "ce_loss_6": 3.7591748595237733, "epoch": 0.205, "grad_norm": 1208.0, "kl_loss_12": 731.192514038086, "kl_loss_17": 245.70539016723632, "kl_loss_3": 2669.8846923828123, "kl_loss_6": 1772.4919555664062, "learning_rate": 0.0009072879760251679, "loss": 1370.2641, "step": 2050 }, { "ce_loss_12": 3.2294013857841493, "ce_loss_17": 3.009316086769104, "ce_loss_23": 2.9020573258399964, "ce_loss_3": 4.236497604846955, "ce_loss_6": 3.7589595079422, "epoch": 0.206, "grad_norm": 1192.0, "kl_loss_12": 745.1193756103515, "kl_loss_17": 250.63618850708008, "kl_loss_3": 2800.283349609375, "kl_loss_6": 1860.894793701172, "learning_rate": 0.0009063655720661341, "loss": 1386.6717, "step": 2060 }, { "ce_loss_12": 3.260830020904541, "ce_loss_17": 3.0461089611053467, "ce_loss_23": 2.9399641871452333, "ce_loss_3": 4.217822635173798, "ce_loss_6": 3.758314859867096, "epoch": 0.207, "grad_norm": 1016.0, "kl_loss_12": 732.0251220703125, "kl_loss_17": 235.47347717285157, "kl_loss_3": 2669.2941284179688, "kl_loss_6": 1763.8640869140625, "learning_rate": 0.000905439076010301, "loss": 1357.1877, "step": 2070 }, { "ce_loss_12": 3.230174386501312, "ce_loss_17": 3.0027816772460936, "ce_loss_23": 2.89189817905426, "ce_loss_3": 4.2117068529129025, "ce_loss_6": 3.7440367579460143, "epoch": 0.208, "grad_norm": 1432.0, "kl_loss_12": 743.4530548095703, "kl_loss_17": 236.2601432800293, "kl_loss_3": 2725.55673828125, "kl_loss_6": 1810.9228759765624, "learning_rate": 0.0009045084971874737, "loss": 1349.2176, "step": 2080 }, { "ce_loss_12": 3.2096628665924074, "ce_loss_17": 2.9940597772598267, "ce_loss_23": 2.8850452184677122, "ce_loss_3": 4.1816226720809935, "ce_loss_6": 3.7097704291343687, "epoch": 0.209, "grad_norm": 1240.0, "kl_loss_12": 738.1762908935547, "kl_loss_17": 246.9880401611328, "kl_loss_3": 2709.64326171875, "kl_loss_6": 1782.71328125, "learning_rate": 0.0009035738449685707, "loss": 1388.232, "step": 2090 }, { "ce_loss_12": 3.1514390230178835, "ce_loss_17": 2.92235689163208, "ce_loss_23": 2.8135188102722166, "ce_loss_3": 4.17981231212616, "ce_loss_6": 3.6853424429893495, "epoch": 0.21, "grad_norm": 1176.0, "kl_loss_12": 742.7584381103516, "kl_loss_17": 233.57797088623047, "kl_loss_3": 2805.8019775390626, "kl_loss_6": 1845.4294494628907, "learning_rate": 0.0009026351287655293, "loss": 1370.0281, "step": 2100 }, { "ce_loss_12": 3.3175438404083253, "ce_loss_17": 3.1058226585388184, "ce_loss_23": 3.0096519112586977, "ce_loss_3": 4.2248126745224, "ce_loss_6": 3.779649579524994, "epoch": 0.211, "grad_norm": 1384.0, "kl_loss_12": 705.2593170166016, "kl_loss_17": 222.40945892333986, "kl_loss_3": 2558.6331787109375, "kl_loss_6": 1683.1639892578125, "learning_rate": 0.0009016923580312113, "loss": 1303.9625, "step": 2110 }, { "ce_loss_12": 3.1874075293540955, "ce_loss_17": 2.975838828086853, "ce_loss_23": 2.872942864894867, "ce_loss_3": 4.15403380393982, "ce_loss_6": 3.683121955394745, "epoch": 0.212, "grad_norm": 1432.0, "kl_loss_12": 723.3132904052734, "kl_loss_17": 226.7868797302246, "kl_loss_3": 2680.508654785156, "kl_loss_6": 1756.263201904297, "learning_rate": 0.0009007455422593077, "loss": 1372.0053, "step": 2120 }, { "ce_loss_12": 3.2143858790397646, "ce_loss_17": 2.989579474925995, "ce_loss_23": 2.8894683003425596, "ce_loss_3": 4.213230812549591, "ce_loss_6": 3.734225702285767, "epoch": 0.213, "grad_norm": 1104.0, "kl_loss_12": 741.5362884521485, "kl_loss_17": 224.52107315063478, "kl_loss_3": 2771.612683105469, "kl_loss_6": 1827.0911804199218, "learning_rate": 0.0008997946909842425, "loss": 1384.9793, "step": 2130 }, { "ce_loss_12": 3.2382789134979246, "ce_loss_17": 2.9995150327682496, "ce_loss_23": 2.8895116806030274, "ce_loss_3": 4.2739152073860165, "ce_loss_6": 3.7909855246543884, "epoch": 0.214, "grad_norm": 1012.0, "kl_loss_12": 768.0854797363281, "kl_loss_17": 242.95709915161132, "kl_loss_3": 2860.61904296875, "kl_loss_6": 1907.5541564941407, "learning_rate": 0.0008988398137810777, "loss": 1387.8596, "step": 2140 }, { "ce_loss_12": 3.251051902770996, "ce_loss_17": 3.0360903263092043, "ce_loss_23": 2.931905817985535, "ce_loss_3": 4.211556780338287, "ce_loss_6": 3.7544206738471986, "epoch": 0.215, "grad_norm": 1136.0, "kl_loss_12": 717.1144714355469, "kl_loss_17": 236.59547500610353, "kl_loss_3": 2675.7889892578123, "kl_loss_6": 1773.575018310547, "learning_rate": 0.0008978809202654162, "loss": 1340.9514, "step": 2150 }, { "ce_loss_12": 3.2336065649986265, "ce_loss_17": 3.020431411266327, "ce_loss_23": 2.912239933013916, "ce_loss_3": 4.21031643152237, "ce_loss_6": 3.7399205327033997, "epoch": 0.216, "grad_norm": 1280.0, "kl_loss_12": 732.2896759033204, "kl_loss_17": 241.09996490478517, "kl_loss_3": 2681.3944580078123, "kl_loss_6": 1765.8480407714844, "learning_rate": 0.0008969180200933046, "loss": 1367.7377, "step": 2160 }, { "ce_loss_12": 3.2038675785064696, "ce_loss_17": 2.979322910308838, "ce_loss_23": 2.8632314205169678, "ce_loss_3": 4.210316956043243, "ce_loss_6": 3.7373035192489623, "epoch": 0.217, "grad_norm": 1688.0, "kl_loss_12": 746.2812072753907, "kl_loss_17": 253.4534484863281, "kl_loss_3": 2745.815869140625, "kl_loss_6": 1829.8422790527343, "learning_rate": 0.0008959511229611376, "loss": 1391.7037, "step": 2170 }, { "ce_loss_12": 3.261945140361786, "ce_loss_17": 3.054550528526306, "ce_loss_23": 2.943201684951782, "ce_loss_3": 4.24164912700653, "ce_loss_6": 3.771306538581848, "epoch": 0.218, "grad_norm": 1128.0, "kl_loss_12": 723.3908752441406, "kl_loss_17": 245.53762588500976, "kl_loss_3": 2722.761669921875, "kl_loss_6": 1795.5326293945313, "learning_rate": 0.0008949802386055581, "loss": 1363.358, "step": 2180 }, { "ce_loss_12": 3.130527687072754, "ce_loss_17": 2.924950349330902, "ce_loss_23": 2.813413393497467, "ce_loss_3": 4.109816062450409, "ce_loss_6": 3.645668351650238, "epoch": 0.219, "grad_norm": 1424.0, "kl_loss_12": 714.7257232666016, "kl_loss_17": 246.9955825805664, "kl_loss_3": 2663.333203125, "kl_loss_6": 1757.4798095703125, "learning_rate": 0.0008940053768033609, "loss": 1379.0065, "step": 2190 }, { "ce_loss_12": 3.2098819971084596, "ce_loss_17": 3.0158223152160644, "ce_loss_23": 2.8984384655952455, "ce_loss_3": 4.161436128616333, "ce_loss_6": 3.708323049545288, "epoch": 0.22, "grad_norm": 1104.0, "kl_loss_12": 710.2417388916016, "kl_loss_17": 263.0302261352539, "kl_loss_3": 2651.0561279296876, "kl_loss_6": 1757.548602294922, "learning_rate": 0.0008930265473713938, "loss": 1348.1185, "step": 2200 }, { "ce_loss_12": 3.1847044706344603, "ce_loss_17": 2.9818301796913147, "ce_loss_23": 2.8639705538749696, "ce_loss_3": 4.157197976112366, "ce_loss_6": 3.6896390080451966, "epoch": 0.221, "grad_norm": 1080.0, "kl_loss_12": 724.5081848144531, "kl_loss_17": 249.5000907897949, "kl_loss_3": 2682.2561767578127, "kl_loss_6": 1762.918389892578, "learning_rate": 0.0008920437601664579, "loss": 1327.0632, "step": 2210 }, { "ce_loss_12": 3.182631862163544, "ce_loss_17": 2.9684879183769226, "ce_loss_23": 2.858442449569702, "ce_loss_3": 4.159274959564209, "ce_loss_6": 3.700261175632477, "epoch": 0.222, "grad_norm": 1168.0, "kl_loss_12": 737.7363159179688, "kl_loss_17": 243.66717834472655, "kl_loss_3": 2716.596545410156, "kl_loss_6": 1817.600164794922, "learning_rate": 0.0008910570250852097, "loss": 1348.7851, "step": 2220 }, { "ce_loss_12": 3.2627673745155334, "ce_loss_17": 3.048254370689392, "ce_loss_23": 2.9470550298690794, "ce_loss_3": 4.182011067867279, "ce_loss_6": 3.7384843826293945, "epoch": 0.223, "grad_norm": 1424.0, "kl_loss_12": 702.40751953125, "kl_loss_17": 227.97779693603516, "kl_loss_3": 2588.418603515625, "kl_loss_6": 1716.8679077148438, "learning_rate": 0.0008900663520640604, "loss": 1316.1682, "step": 2230 }, { "ce_loss_12": 3.22307368516922, "ce_loss_17": 3.0084318041801454, "ce_loss_23": 2.9019481539726257, "ce_loss_3": 4.189678382873535, "ce_loss_6": 3.7194597840309145, "epoch": 0.224, "grad_norm": 1144.0, "kl_loss_12": 719.4921600341797, "kl_loss_17": 233.06052169799804, "kl_loss_3": 2691.5576293945314, "kl_loss_6": 1774.1268249511718, "learning_rate": 0.0008890717510790764, "loss": 1353.0216, "step": 2240 }, { "ce_loss_12": 3.181252729892731, "ce_loss_17": 2.967786800861359, "ce_loss_23": 2.8688748478889465, "ce_loss_3": 4.168042957782745, "ce_loss_6": 3.693181240558624, "epoch": 0.225, "grad_norm": 1080.0, "kl_loss_12": 713.7260528564453, "kl_loss_17": 225.8282371520996, "kl_loss_3": 2697.2067749023436, "kl_loss_6": 1771.8312927246093, "learning_rate": 0.0008880732321458784, "loss": 1359.2141, "step": 2250 }, { "ce_loss_12": 3.2142521858215334, "ce_loss_17": 2.9944754362106325, "ce_loss_23": 2.8961597084999084, "ce_loss_3": 4.1702636003494264, "ce_loss_6": 3.70775785446167, "epoch": 0.226, "grad_norm": 1240.0, "kl_loss_12": 718.1211059570312, "kl_loss_17": 221.46337661743163, "kl_loss_3": 2660.3265625, "kl_loss_6": 1751.7075927734375, "learning_rate": 0.0008870708053195413, "loss": 1356.8067, "step": 2260 }, { "ce_loss_12": 3.2218335270881653, "ce_loss_17": 3.0125537276268006, "ce_loss_23": 2.9187202572822573, "ce_loss_3": 4.162897741794586, "ce_loss_6": 3.7049925565719604, "epoch": 0.227, "grad_norm": 1024.0, "kl_loss_12": 693.9300354003906, "kl_loss_17": 211.78112869262696, "kl_loss_3": 2611.0219970703124, "kl_loss_6": 1716.663330078125, "learning_rate": 0.0008860644806944918, "loss": 1325.6539, "step": 2270 }, { "ce_loss_12": 3.1866829633712768, "ce_loss_17": 2.962891864776611, "ce_loss_23": 2.8645379543304443, "ce_loss_3": 4.165493011474609, "ce_loss_6": 3.6995256423950194, "epoch": 0.228, "grad_norm": 996.0, "kl_loss_12": 725.6792572021484, "kl_loss_17": 223.19341049194335, "kl_loss_3": 2702.9573364257812, "kl_loss_6": 1784.9772827148438, "learning_rate": 0.0008850542684044079, "loss": 1325.885, "step": 2280 }, { "ce_loss_12": 3.162304496765137, "ce_loss_17": 2.9285544872283937, "ce_loss_23": 2.821481502056122, "ce_loss_3": 4.180591094493866, "ce_loss_6": 3.6861800074577333, "epoch": 0.229, "grad_norm": 1176.0, "kl_loss_12": 749.9650268554688, "kl_loss_17": 233.6118423461914, "kl_loss_3": 2823.8099609375, "kl_loss_6": 1851.4998291015625, "learning_rate": 0.0008840401786221159, "loss": 1370.4396, "step": 2290 }, { "ce_loss_12": 3.2695318937301634, "ce_loss_17": 3.056403195858002, "ce_loss_23": 2.965723288059235, "ce_loss_3": 4.215631699562072, "ce_loss_6": 3.76085741519928, "epoch": 0.23, "grad_norm": 1136.0, "kl_loss_12": 693.4002868652344, "kl_loss_17": 209.44346084594727, "kl_loss_3": 2616.2891845703125, "kl_loss_6": 1727.0918090820312, "learning_rate": 0.000883022221559489, "loss": 1308.9746, "step": 2300 }, { "ce_loss_12": 3.239469087123871, "ce_loss_17": 3.0272177696228026, "ce_loss_23": 2.9308255076408387, "ce_loss_3": 4.211579275131226, "ce_loss_6": 3.751608657836914, "epoch": 0.231, "grad_norm": 1360.0, "kl_loss_12": 709.3472320556641, "kl_loss_17": 216.6603546142578, "kl_loss_3": 2677.8682495117187, "kl_loss_6": 1775.0725708007812, "learning_rate": 0.0008820004074673434, "loss": 1373.6508, "step": 2310 }, { "ce_loss_12": 3.1577208757400514, "ce_loss_17": 2.9385272026062013, "ce_loss_23": 2.8453004121780396, "ce_loss_3": 4.112058687210083, "ce_loss_6": 3.656074655056, "epoch": 0.232, "grad_norm": 1360.0, "kl_loss_12": 712.0259033203125, "kl_loss_17": 213.32648620605468, "kl_loss_3": 2673.76650390625, "kl_loss_6": 1774.3185424804688, "learning_rate": 0.0008809747466353355, "loss": 1325.0434, "step": 2320 }, { "ce_loss_12": 3.157701861858368, "ce_loss_17": 2.940626549720764, "ce_loss_23": 2.842818570137024, "ce_loss_3": 4.130900573730469, "ce_loss_6": 3.6584607481956484, "epoch": 0.233, "grad_norm": 1088.0, "kl_loss_12": 702.6765716552734, "kl_loss_17": 215.8955406188965, "kl_loss_3": 2674.025427246094, "kl_loss_6": 1746.1587280273438, "learning_rate": 0.0008799452493918585, "loss": 1344.9795, "step": 2330 }, { "ce_loss_12": 3.224400007724762, "ce_loss_17": 3.0065416574478148, "ce_loss_23": 2.912652146816254, "ce_loss_3": 4.178082239627838, "ce_loss_6": 3.7230546116828918, "epoch": 0.234, "grad_norm": 1520.0, "kl_loss_12": 704.7937683105469, "kl_loss_17": 212.39177703857422, "kl_loss_3": 2653.9332641601563, "kl_loss_6": 1759.9414428710938, "learning_rate": 0.0008789119261039385, "loss": 1374.2182, "step": 2340 }, { "ce_loss_12": 3.1472853779792787, "ce_loss_17": 2.9305580973625185, "ce_loss_23": 2.8364876210689545, "ce_loss_3": 4.11597170829773, "ce_loss_6": 3.6585975170135496, "epoch": 0.235, "grad_norm": 1248.0, "kl_loss_12": 706.184780883789, "kl_loss_17": 212.0734100341797, "kl_loss_3": 2675.7802612304686, "kl_loss_6": 1771.2001037597656, "learning_rate": 0.0008778747871771292, "loss": 1316.8098, "step": 2350 }, { "ce_loss_12": 3.179029405117035, "ce_loss_17": 2.975753378868103, "ce_loss_23": 2.8873407006263734, "ce_loss_3": 4.119187295436859, "ce_loss_6": 3.669054090976715, "epoch": 0.236, "grad_norm": 1304.0, "kl_loss_12": 676.4259826660157, "kl_loss_17": 203.13166885375978, "kl_loss_3": 2592.462512207031, "kl_loss_6": 1700.6154052734375, "learning_rate": 0.0008768338430554083, "loss": 1293.2753, "step": 2360 }, { "ce_loss_12": 3.2074209094047545, "ce_loss_17": 2.986566960811615, "ce_loss_23": 2.8928439378738404, "ce_loss_3": 4.155289900302887, "ce_loss_6": 3.6990363121032717, "epoch": 0.237, "grad_norm": 1104.0, "kl_loss_12": 705.8325775146484, "kl_loss_17": 215.99607696533204, "kl_loss_3": 2624.538525390625, "kl_loss_6": 1726.0707092285156, "learning_rate": 0.0008757891042210713, "loss": 1332.907, "step": 2370 }, { "ce_loss_12": 3.213341999053955, "ce_loss_17": 3.00328369140625, "ce_loss_23": 2.908613753318787, "ce_loss_3": 4.1675421595573425, "ce_loss_6": 3.709250104427338, "epoch": 0.238, "grad_norm": 1112.0, "kl_loss_12": 694.2518676757812, "kl_loss_17": 214.54538650512694, "kl_loss_3": 2620.0777099609377, "kl_loss_6": 1721.7463134765626, "learning_rate": 0.0008747405811946271, "loss": 1322.9535, "step": 2380 }, { "ce_loss_12": 3.1364938855171203, "ce_loss_17": 2.916593599319458, "ce_loss_23": 2.818560254573822, "ce_loss_3": 4.146471822261811, "ce_loss_6": 3.6627113580703736, "epoch": 0.239, "grad_norm": 1232.0, "kl_loss_12": 718.5253265380859, "kl_loss_17": 218.13416290283203, "kl_loss_3": 2761.2788330078124, "kl_loss_6": 1813.5057373046875, "learning_rate": 0.0008736882845346905, "loss": 1332.236, "step": 2390 }, { "ce_loss_12": 3.21389524936676, "ce_loss_17": 2.9948238492012025, "ce_loss_23": 2.893634247779846, "ce_loss_3": 4.187910413742065, "ce_loss_6": 3.7110480427742005, "epoch": 0.24, "grad_norm": 1264.0, "kl_loss_12": 719.9792236328125, "kl_loss_17": 223.24748992919922, "kl_loss_3": 2672.5799560546875, "kl_loss_6": 1746.6874389648438, "learning_rate": 0.0008726322248378774, "loss": 1323.5588, "step": 2400 }, { "ce_loss_12": 3.211190974712372, "ce_loss_17": 2.9982844233512878, "ce_loss_23": 2.9063130021095276, "ce_loss_3": 4.206434118747711, "ce_loss_6": 3.72991144657135, "epoch": 0.241, "grad_norm": 1184.0, "kl_loss_12": 704.4481994628907, "kl_loss_17": 211.553133392334, "kl_loss_3": 2721.0894775390625, "kl_loss_6": 1794.480224609375, "learning_rate": 0.0008715724127386971, "loss": 1368.4092, "step": 2410 }, { "ce_loss_12": 3.2690393209457396, "ce_loss_17": 3.0638194561004637, "ce_loss_23": 2.967925226688385, "ce_loss_3": 4.219721961021423, "ce_loss_6": 3.7553182005882264, "epoch": 0.242, "grad_norm": 1328.0, "kl_loss_12": 686.9312622070313, "kl_loss_17": 218.79831314086914, "kl_loss_3": 2634.0843994140623, "kl_loss_6": 1712.5268676757812, "learning_rate": 0.0008705088589094458, "loss": 1325.8676, "step": 2420 }, { "ce_loss_12": 3.2798361659049986, "ce_loss_17": 3.078590726852417, "ce_loss_23": 2.981303107738495, "ce_loss_3": 4.241901850700378, "ce_loss_6": 3.780438446998596, "epoch": 0.243, "grad_norm": 1472.0, "kl_loss_12": 698.8630310058594, "kl_loss_17": 229.17339553833008, "kl_loss_3": 2658.3953369140627, "kl_loss_6": 1746.2057495117188, "learning_rate": 0.0008694415740600988, "loss": 1338.1035, "step": 2430 }, { "ce_loss_12": 3.158348333835602, "ce_loss_17": 2.9655771493911742, "ce_loss_23": 2.848409104347229, "ce_loss_3": 4.141533851623535, "ce_loss_6": 3.6722955226898195, "epoch": 0.244, "grad_norm": 1136.0, "kl_loss_12": 708.9916778564453, "kl_loss_17": 276.5789794921875, "kl_loss_3": 2714.6683959960938, "kl_loss_6": 1797.1813537597657, "learning_rate": 0.0008683705689382025, "loss": 1349.2308, "step": 2440 }, { "ce_loss_12": 3.2215414166450502, "ce_loss_17": 3.026882493495941, "ce_loss_23": 2.926299273967743, "ce_loss_3": 4.159836375713349, "ce_loss_6": 3.708118665218353, "epoch": 0.245, "grad_norm": 988.0, "kl_loss_12": 686.4640747070313, "kl_loss_17": 228.41824951171876, "kl_loss_3": 2609.161340332031, "kl_loss_6": 1714.3937561035157, "learning_rate": 0.0008672958543287666, "loss": 1340.6461, "step": 2450 }, { "ce_loss_12": 3.2369601130485535, "ce_loss_17": 3.0324127793312075, "ce_loss_23": 2.9349398016929626, "ce_loss_3": 4.168296587467194, "ce_loss_6": 3.71487580537796, "epoch": 0.246, "grad_norm": 1056.0, "kl_loss_12": 702.6545867919922, "kl_loss_17": 223.37064590454102, "kl_loss_3": 2597.1249267578123, "kl_loss_6": 1710.9740417480468, "learning_rate": 0.0008662174410541554, "loss": 1311.1107, "step": 2460 }, { "ce_loss_12": 3.197036552429199, "ce_loss_17": 2.9945160865783693, "ce_loss_23": 2.9003651618957518, "ce_loss_3": 4.1306620121002195, "ce_loss_6": 3.6728007078170775, "epoch": 0.247, "grad_norm": 1216.0, "kl_loss_12": 688.5184326171875, "kl_loss_17": 223.68284378051757, "kl_loss_3": 2592.430944824219, "kl_loss_6": 1692.0546813964843, "learning_rate": 0.0008651353399739787, "loss": 1337.2021, "step": 2470 }, { "ce_loss_12": 3.2234519362449645, "ce_loss_17": 3.017361891269684, "ce_loss_23": 2.9193612813949583, "ce_loss_3": 4.171088445186615, "ce_loss_6": 3.707779347896576, "epoch": 0.248, "grad_norm": 1080.0, "kl_loss_12": 692.3505249023438, "kl_loss_17": 220.68072967529298, "kl_loss_3": 2605.4829345703124, "kl_loss_6": 1707.4573181152343, "learning_rate": 0.0008640495619849821, "loss": 1312.7043, "step": 2480 }, { "ce_loss_12": 3.188669204711914, "ce_loss_17": 2.9883776545524596, "ce_loss_23": 2.8864614367485046, "ce_loss_3": 4.126471364498139, "ce_loss_6": 3.6681827306747437, "epoch": 0.249, "grad_norm": 1376.0, "kl_loss_12": 692.8002624511719, "kl_loss_17": 222.35086212158203, "kl_loss_3": 2602.0055541992188, "kl_loss_6": 1697.7270385742188, "learning_rate": 0.0008629601180209381, "loss": 1307.9302, "step": 2490 }, { "ce_loss_12": 3.1828148007392882, "ce_loss_17": 2.9769126296043398, "ce_loss_23": 2.876669943332672, "ce_loss_3": 4.1202881097793576, "ce_loss_6": 3.660766232013702, "epoch": 0.25, "grad_norm": 948.0, "kl_loss_12": 684.8112030029297, "kl_loss_17": 234.48795318603516, "kl_loss_3": 2583.1467529296874, "kl_loss_6": 1682.2519226074219, "learning_rate": 0.000861867019052535, "loss": 1320.1142, "step": 2500 }, { "ce_loss_12": 3.1194626927375793, "ce_loss_17": 2.9112721920013427, "ce_loss_23": 2.8053731083869935, "ce_loss_3": 4.102438044548035, "ce_loss_6": 3.6318350195884705, "epoch": 0.251, "grad_norm": 1072.0, "kl_loss_12": 698.8854797363281, "kl_loss_17": 235.78541717529296, "kl_loss_3": 2680.2626220703123, "kl_loss_6": 1758.730224609375, "learning_rate": 0.0008607702760872678, "loss": 1343.0601, "step": 2510 }, { "ce_loss_12": 3.204072630405426, "ce_loss_17": 3.011195695400238, "ce_loss_23": 2.90932697057724, "ce_loss_3": 4.138043749332428, "ce_loss_6": 3.686288094520569, "epoch": 0.252, "grad_norm": 1168.0, "kl_loss_12": 674.006314086914, "kl_loss_17": 224.07162475585938, "kl_loss_3": 2564.412548828125, "kl_loss_6": 1682.5275573730469, "learning_rate": 0.0008596699001693256, "loss": 1324.4848, "step": 2520 }, { "ce_loss_12": 3.2106514811515807, "ce_loss_17": 3.017582905292511, "ce_loss_23": 2.922886919975281, "ce_loss_3": 4.140046346187591, "ce_loss_6": 3.6838871836662292, "epoch": 0.253, "grad_norm": 1192.0, "kl_loss_12": 676.8567138671875, "kl_loss_17": 218.53152160644532, "kl_loss_3": 2574.9288696289063, "kl_loss_6": 1676.3718872070312, "learning_rate": 0.0008585659023794818, "loss": 1328.5582, "step": 2530 }, { "ce_loss_12": 3.1961737751960753, "ce_loss_17": 2.985444128513336, "ce_loss_23": 2.8865877389907837, "ce_loss_3": 4.184960126876831, "ce_loss_6": 3.715757930278778, "epoch": 0.254, "grad_norm": 1072.0, "kl_loss_12": 711.092642211914, "kl_loss_17": 226.49666290283204, "kl_loss_3": 2703.228369140625, "kl_loss_6": 1769.7383117675781, "learning_rate": 0.0008574582938349817, "loss": 1332.0293, "step": 2540 }, { "ce_loss_12": 3.184503936767578, "ce_loss_17": 2.9647669196128845, "ce_loss_23": 2.8564560532569887, "ce_loss_3": 4.1622985124588014, "ce_loss_6": 3.685743486881256, "epoch": 0.255, "grad_norm": 1088.0, "kl_loss_12": 727.4106658935547, "kl_loss_17": 232.05386352539062, "kl_loss_3": 2709.0426513671873, "kl_loss_6": 1767.201641845703, "learning_rate": 0.0008563470856894315, "loss": 1316.1617, "step": 2550 }, { "ce_loss_12": 3.1625547409057617, "ce_loss_17": 2.9571083307266237, "ce_loss_23": 2.8621199488639832, "ce_loss_3": 4.132767677307129, "ce_loss_6": 3.6766427636146544, "epoch": 0.256, "grad_norm": 1296.0, "kl_loss_12": 692.0477294921875, "kl_loss_17": 216.78019943237305, "kl_loss_3": 2642.7876586914062, "kl_loss_6": 1745.0903747558593, "learning_rate": 0.0008552322891326845, "loss": 1316.6967, "step": 2560 }, { "ce_loss_12": 3.1427706480026245, "ce_loss_17": 2.935354781150818, "ce_loss_23": 2.839388430118561, "ce_loss_3": 4.1130335330963135, "ce_loss_6": 3.6411257863044737, "epoch": 0.257, "grad_norm": 1608.0, "kl_loss_12": 691.3601867675782, "kl_loss_17": 217.39927825927734, "kl_loss_3": 2656.824072265625, "kl_loss_6": 1726.0087158203125, "learning_rate": 0.0008541139153907296, "loss": 1300.7164, "step": 2570 }, { "ce_loss_12": 3.097336781024933, "ce_loss_17": 2.8923059940338134, "ce_loss_23": 2.8015827894210816, "ce_loss_3": 4.065582001209259, "ce_loss_6": 3.594275784492493, "epoch": 0.258, "grad_norm": 1072.0, "kl_loss_12": 673.7452972412109, "kl_loss_17": 205.65441131591797, "kl_loss_3": 2627.886279296875, "kl_loss_6": 1704.77724609375, "learning_rate": 0.0008529919757255782, "loss": 1320.6559, "step": 2580 }, { "ce_loss_12": 3.121126043796539, "ce_loss_17": 2.9228207588195803, "ce_loss_23": 2.838605988025665, "ce_loss_3": 4.037435448169708, "ce_loss_6": 3.584921109676361, "epoch": 0.259, "grad_norm": 1152.0, "kl_loss_12": 657.7913787841796, "kl_loss_17": 202.52617797851562, "kl_loss_3": 2537.204150390625, "kl_loss_6": 1645.8016479492187, "learning_rate": 0.0008518664814351503, "loss": 1274.3268, "step": 2590 }, { "ce_loss_12": 3.104781413078308, "ce_loss_17": 2.8901186347007752, "ce_loss_23": 2.795970094203949, "ce_loss_3": 4.082364869117737, "ce_loss_6": 3.605860185623169, "epoch": 0.26, "grad_norm": 1344.0, "kl_loss_12": 696.5720184326171, "kl_loss_17": 214.5135696411133, "kl_loss_3": 2675.3045654296875, "kl_loss_6": 1741.6918823242188, "learning_rate": 0.0008507374438531607, "loss": 1363.2209, "step": 2600 }, { "ce_loss_12": 3.081671857833862, "ce_loss_17": 2.876622939109802, "ce_loss_23": 2.784547483921051, "ce_loss_3": 4.034499597549439, "ce_loss_6": 3.576632249355316, "epoch": 0.261, "grad_norm": 1096.0, "kl_loss_12": 682.1584899902343, "kl_loss_17": 206.09313354492187, "kl_loss_3": 2613.8728271484374, "kl_loss_6": 1720.1293151855468, "learning_rate": 0.0008496048743490053, "loss": 1305.6201, "step": 2610 }, { "ce_loss_12": 3.2290136218070984, "ce_loss_17": 3.0230956435203553, "ce_loss_23": 2.9306798219680785, "ce_loss_3": 4.150061511993409, "ce_loss_6": 3.707044267654419, "epoch": 0.262, "grad_norm": 1192.0, "kl_loss_12": 678.9570159912109, "kl_loss_17": 205.46227264404297, "kl_loss_3": 2554.87841796875, "kl_loss_6": 1674.7249084472655, "learning_rate": 0.0008484687843276469, "loss": 1290.4998, "step": 2620 }, { "ce_loss_12": 3.1576597094535828, "ce_loss_17": 2.95311176776886, "ce_loss_23": 2.8603856325149537, "ce_loss_3": 4.113218057155609, "ce_loss_6": 3.6455924272537232, "epoch": 0.263, "grad_norm": 1528.0, "kl_loss_12": 689.7358795166016, "kl_loss_17": 210.2408332824707, "kl_loss_3": 2610.7355712890626, "kl_loss_6": 1700.8219421386718, "learning_rate": 0.0008473291852294987, "loss": 1316.9947, "step": 2630 }, { "ce_loss_12": 3.172579658031464, "ce_loss_17": 2.9600013256073, "ce_loss_23": 2.86677725315094, "ce_loss_3": 4.115126585960388, "ce_loss_6": 3.6532453536987304, "epoch": 0.264, "grad_norm": 1168.0, "kl_loss_12": 696.9774536132812, "kl_loss_17": 212.4393394470215, "kl_loss_3": 2622.4283813476563, "kl_loss_6": 1705.7791381835937, "learning_rate": 0.0008461860885303114, "loss": 1300.3334, "step": 2640 }, { "ce_loss_12": 3.1920876264572144, "ce_loss_17": 2.984456753730774, "ce_loss_23": 2.896187424659729, "ce_loss_3": 4.123092436790467, "ce_loss_6": 3.662570369243622, "epoch": 0.265, "grad_norm": 1064.0, "kl_loss_12": 677.250210571289, "kl_loss_17": 203.22478103637695, "kl_loss_3": 2562.327197265625, "kl_loss_6": 1662.8192504882813, "learning_rate": 0.000845039505741056, "loss": 1299.5422, "step": 2650 }, { "ce_loss_12": 3.1844265699386596, "ce_loss_17": 2.9672574758529664, "ce_loss_23": 2.8763338327407837, "ce_loss_3": 4.1377707600593565, "ce_loss_6": 3.6718860268592834, "epoch": 0.266, "grad_norm": 1128.0, "kl_loss_12": 707.8172332763672, "kl_loss_17": 211.42220458984374, "kl_loss_3": 2656.2222534179687, "kl_loss_6": 1731.8209411621094, "learning_rate": 0.0008438894484078086, "loss": 1353.5854, "step": 2660 }, { "ce_loss_12": 3.186161124706268, "ce_loss_17": 2.982400453090668, "ce_loss_23": 2.8897322177886964, "ce_loss_3": 4.114835977554321, "ce_loss_6": 3.6600540041923524, "epoch": 0.267, "grad_norm": 1064.0, "kl_loss_12": 677.1057373046875, "kl_loss_17": 211.50477447509766, "kl_loss_3": 2567.3158447265623, "kl_loss_6": 1681.799871826172, "learning_rate": 0.0008427359281116334, "loss": 1296.6374, "step": 2670 }, { "ce_loss_12": 3.093821132183075, "ce_loss_17": 2.882770228385925, "ce_loss_23": 2.791520416736603, "ce_loss_3": 4.071115756034851, "ce_loss_6": 3.5991916179656984, "epoch": 0.268, "grad_norm": 1248.0, "kl_loss_12": 687.5313018798828, "kl_loss_17": 210.13628540039062, "kl_loss_3": 2665.6412109375, "kl_loss_6": 1737.7759338378905, "learning_rate": 0.0008415789564684673, "loss": 1315.315, "step": 2680 }, { "ce_loss_12": 3.319716191291809, "ce_loss_17": 3.1131805419921874, "ce_loss_23": 3.017329823970795, "ce_loss_3": 4.241286253929138, "ce_loss_6": 3.787472295761108, "epoch": 0.269, "grad_norm": 1496.0, "kl_loss_12": 685.154019165039, "kl_loss_17": 214.83649978637695, "kl_loss_3": 2533.8969482421876, "kl_loss_6": 1651.515673828125, "learning_rate": 0.0008404185451290017, "loss": 1275.1857, "step": 2690 }, { "ce_loss_12": 3.1939535617828367, "ce_loss_17": 2.9861059308052065, "ce_loss_23": 2.895950734615326, "ce_loss_3": 4.137200510501861, "ce_loss_6": 3.677232837677002, "epoch": 0.27, "grad_norm": 968.0, "kl_loss_12": 676.4378295898438, "kl_loss_17": 211.2507179260254, "kl_loss_3": 2583.730187988281, "kl_loss_6": 1676.1700622558594, "learning_rate": 0.0008392547057785661, "loss": 1288.5631, "step": 2700 }, { "ce_loss_12": 3.1444958209991456, "ce_loss_17": 2.932486617565155, "ce_loss_23": 2.839138376712799, "ce_loss_3": 4.131505846977234, "ce_loss_6": 3.650907111167908, "epoch": 0.271, "grad_norm": 980.0, "kl_loss_12": 697.2017883300781, "kl_loss_17": 217.85244369506836, "kl_loss_3": 2717.758996582031, "kl_loss_6": 1766.1455200195312, "learning_rate": 0.0008380874501370098, "loss": 1298.7047, "step": 2710 }, { "ce_loss_12": 3.1357044100761415, "ce_loss_17": 2.924636518955231, "ce_loss_23": 2.8253360390663147, "ce_loss_3": 4.122107923030853, "ce_loss_6": 3.6408496141433715, "epoch": 0.272, "grad_norm": 1192.0, "kl_loss_12": 708.8865112304687, "kl_loss_17": 233.48337783813477, "kl_loss_3": 2702.6140747070312, "kl_loss_6": 1754.3397705078125, "learning_rate": 0.0008369167899585841, "loss": 1331.4025, "step": 2720 }, { "ce_loss_12": 3.2249266386032103, "ce_loss_17": 3.0370797514915466, "ce_loss_23": 2.939964401721954, "ce_loss_3": 4.134345138072968, "ce_loss_6": 3.6882724165916443, "epoch": 0.273, "grad_norm": 1352.0, "kl_loss_12": 670.7063415527343, "kl_loss_17": 240.46832427978515, "kl_loss_3": 2530.49345703125, "kl_loss_6": 1649.3007019042968, "learning_rate": 0.0008357427370318238, "loss": 1316.18, "step": 2730 }, { "ce_loss_12": 3.189584970474243, "ce_loss_17": 3.000252032279968, "ce_loss_23": 2.890009045600891, "ce_loss_3": 4.150511598587036, "ce_loss_6": 3.6770068645477294, "epoch": 0.274, "grad_norm": 1400.0, "kl_loss_12": 682.5615783691406, "kl_loss_17": 236.95541915893554, "kl_loss_3": 2626.866320800781, "kl_loss_6": 1702.4308288574218, "learning_rate": 0.0008345653031794292, "loss": 1318.7662, "step": 2740 }, { "ce_loss_12": 3.1951624870300295, "ce_loss_17": 2.9964037895202638, "ce_loss_23": 2.8955544590950013, "ce_loss_3": 4.138919460773468, "ce_loss_6": 3.673405385017395, "epoch": 0.275, "grad_norm": 880.0, "kl_loss_12": 689.3633850097656, "kl_loss_17": 231.51595458984374, "kl_loss_3": 2603.9886840820313, "kl_loss_6": 1685.0747619628905, "learning_rate": 0.0008333845002581458, "loss": 1300.1859, "step": 2750 }, { "ce_loss_12": 3.1334605693817137, "ce_loss_17": 2.9291109323501585, "ce_loss_23": 2.830948543548584, "ce_loss_3": 4.099729323387146, "ce_loss_6": 3.637806749343872, "epoch": 0.276, "grad_norm": 952.0, "kl_loss_12": 701.9874572753906, "kl_loss_17": 226.16938247680665, "kl_loss_3": 2669.161376953125, "kl_loss_6": 1756.4841552734374, "learning_rate": 0.0008322003401586462, "loss": 1333.6066, "step": 2760 }, { "ce_loss_12": 3.150827920436859, "ce_loss_17": 2.9546613574028013, "ce_loss_23": 2.8605478763580323, "ce_loss_3": 4.07707986831665, "ce_loss_6": 3.6201881170272827, "epoch": 0.277, "grad_norm": 1176.0, "kl_loss_12": 667.2383483886719, "kl_loss_17": 213.83636016845702, "kl_loss_3": 2540.439172363281, "kl_loss_6": 1650.3801879882812, "learning_rate": 0.0008310128348054094, "loss": 1259.9731, "step": 2770 }, { "ce_loss_12": 3.1183719992637635, "ce_loss_17": 2.924716579914093, "ce_loss_23": 2.8316155433654786, "ce_loss_3": 4.072813379764557, "ce_loss_6": 3.605352246761322, "epoch": 0.278, "grad_norm": 1296.0, "kl_loss_12": 671.1783020019532, "kl_loss_17": 211.9533592224121, "kl_loss_3": 2594.124963378906, "kl_loss_6": 1683.5873413085938, "learning_rate": 0.0008298219961566008, "loss": 1293.9365, "step": 2780 }, { "ce_loss_12": 3.0993417143821715, "ce_loss_17": 2.890523338317871, "ce_loss_23": 2.7993259906768797, "ce_loss_3": 4.072126877307892, "ce_loss_6": 3.5997986793518066, "epoch": 0.279, "grad_norm": 1056.0, "kl_loss_12": 693.4780517578125, "kl_loss_17": 213.2262306213379, "kl_loss_3": 2678.9249267578125, "kl_loss_6": 1752.5269470214844, "learning_rate": 0.0008286278362039527, "loss": 1297.5109, "step": 2790 }, { "ce_loss_12": 3.121641290187836, "ce_loss_17": 2.9188699603080748, "ce_loss_23": 2.823632848262787, "ce_loss_3": 4.120881426334381, "ce_loss_6": 3.6421324014663696, "epoch": 0.28, "grad_norm": 1216.0, "kl_loss_12": 687.7910034179688, "kl_loss_17": 218.2253387451172, "kl_loss_3": 2705.189245605469, "kl_loss_6": 1758.8736450195313, "learning_rate": 0.0008274303669726426, "loss": 1302.2418, "step": 2800 }, { "ce_loss_12": 3.0395513653755186, "ce_loss_17": 2.8333880066871644, "ce_loss_23": 2.735865366458893, "ce_loss_3": 4.0482837677001955, "ce_loss_6": 3.556193935871124, "epoch": 0.281, "grad_norm": 1312.0, "kl_loss_12": 685.361215209961, "kl_loss_17": 227.50414581298827, "kl_loss_3": 2726.5383911132812, "kl_loss_6": 1754.9284606933593, "learning_rate": 0.0008262296005211721, "loss": 1304.683, "step": 2810 }, { "ce_loss_12": 3.154022693634033, "ce_loss_17": 2.9483704686164858, "ce_loss_23": 2.8555146217346192, "ce_loss_3": 4.119756305217743, "ce_loss_6": 3.65409369468689, "epoch": 0.282, "grad_norm": 996.0, "kl_loss_12": 688.2166076660156, "kl_loss_17": 218.64757843017577, "kl_loss_3": 2633.830712890625, "kl_loss_6": 1726.433770751953, "learning_rate": 0.0008250255489412463, "loss": 1300.1041, "step": 2820 }, { "ce_loss_12": 3.2485215187072756, "ce_loss_17": 3.0447388768196104, "ce_loss_23": 2.9463759064674377, "ce_loss_3": 4.187284290790558, "ce_loss_6": 3.7238049268722535, "epoch": 0.283, "grad_norm": 1432.0, "kl_loss_12": 683.5297790527344, "kl_loss_17": 215.8306427001953, "kl_loss_3": 2594.514514160156, "kl_loss_6": 1686.3562927246094, "learning_rate": 0.0008238182243576511, "loss": 1298.2584, "step": 2830 }, { "ce_loss_12": 3.1957290291786196, "ce_loss_17": 3.0053368330001833, "ce_loss_23": 2.9148476481437684, "ce_loss_3": 4.07955631017685, "ce_loss_6": 3.641888213157654, "epoch": 0.284, "grad_norm": 1088.0, "kl_loss_12": 657.5304382324218, "kl_loss_17": 208.30530853271483, "kl_loss_3": 2456.5474853515625, "kl_loss_6": 1602.383233642578, "learning_rate": 0.0008226076389281315, "loss": 1255.6082, "step": 2840 }, { "ce_loss_12": 3.2510815858840942, "ce_loss_17": 3.048978865146637, "ce_loss_23": 2.9613125801086424, "ce_loss_3": 4.161168789863586, "ce_loss_6": 3.7106049180030825, "epoch": 0.285, "grad_norm": 1616.0, "kl_loss_12": 674.101708984375, "kl_loss_17": 207.04217147827148, "kl_loss_3": 2547.9843139648438, "kl_loss_6": 1659.9259216308594, "learning_rate": 0.0008213938048432696, "loss": 1262.7362, "step": 2850 }, { "ce_loss_12": 3.1783262133598327, "ce_loss_17": 2.9763714909553527, "ce_loss_23": 2.8821098804473877, "ce_loss_3": 4.104459810256958, "ce_loss_6": 3.6436208963394163, "epoch": 0.286, "grad_norm": 1048.0, "kl_loss_12": 683.6984161376953, "kl_loss_17": 214.6061233520508, "kl_loss_3": 2575.304443359375, "kl_loss_6": 1658.6302917480468, "learning_rate": 0.0008201767343263612, "loss": 1292.0326, "step": 2860 }, { "ce_loss_12": 3.1275757551193237, "ce_loss_17": 2.9192912459373472, "ce_loss_23": 2.8295354723930357, "ce_loss_3": 4.08244069814682, "ce_loss_6": 3.621002423763275, "epoch": 0.287, "grad_norm": 1032.0, "kl_loss_12": 680.7317443847656, "kl_loss_17": 206.82701263427734, "kl_loss_3": 2626.6409423828127, "kl_loss_6": 1718.0297607421876, "learning_rate": 0.0008189564396332927, "loss": 1265.4093, "step": 2870 }, { "ce_loss_12": 3.1200747966766356, "ce_loss_17": 2.913948881626129, "ce_loss_23": 2.8240206956863405, "ce_loss_3": 4.0817553997039795, "ce_loss_6": 3.6026079297065734, "epoch": 0.288, "grad_norm": 1216.0, "kl_loss_12": 671.2544067382812, "kl_loss_17": 208.49227066040038, "kl_loss_3": 2613.6818237304688, "kl_loss_6": 1682.8380859375, "learning_rate": 0.0008177329330524181, "loss": 1291.9834, "step": 2880 }, { "ce_loss_12": 3.1660085558891295, "ce_loss_17": 2.9622753262519836, "ce_loss_23": 2.8681652665138246, "ce_loss_3": 4.082670176029206, "ce_loss_6": 3.6279420375823976, "epoch": 0.289, "grad_norm": 1248.0, "kl_loss_12": 666.8172821044922, "kl_loss_17": 214.4644989013672, "kl_loss_3": 2517.624169921875, "kl_loss_6": 1639.22294921875, "learning_rate": 0.0008165062269044352, "loss": 1273.3618, "step": 2890 }, { "ce_loss_12": 3.129373037815094, "ce_loss_17": 2.9321088552474976, "ce_loss_23": 2.8323047518730164, "ce_loss_3": 4.082630407810211, "ce_loss_6": 3.6082259893417357, "epoch": 0.29, "grad_norm": 1304.0, "kl_loss_12": 677.7201995849609, "kl_loss_17": 224.2819839477539, "kl_loss_3": 2624.9942993164063, "kl_loss_6": 1694.14033203125, "learning_rate": 0.0008152763335422613, "loss": 1307.293, "step": 2900 }, { "ce_loss_12": 3.110826826095581, "ce_loss_17": 2.911401462554932, "ce_loss_23": 2.8098244071006775, "ce_loss_3": 4.058380711078644, "ce_loss_6": 3.5944467544555665, "epoch": 0.291, "grad_norm": 1896.0, "kl_loss_12": 683.6442687988281, "kl_loss_17": 221.27504348754883, "kl_loss_3": 2608.7736328125, "kl_loss_6": 1696.7043334960938, "learning_rate": 0.0008140432653509088, "loss": 1283.9125, "step": 2910 }, { "ce_loss_12": 3.158522570133209, "ce_loss_17": 2.955310845375061, "ce_loss_23": 2.8611711621284486, "ce_loss_3": 4.089209735393524, "ce_loss_6": 3.635050928592682, "epoch": 0.292, "grad_norm": 1152.0, "kl_loss_12": 681.7248046875, "kl_loss_17": 215.7587547302246, "kl_loss_3": 2588.3452392578124, "kl_loss_6": 1680.8540649414062, "learning_rate": 0.0008128070347473608, "loss": 1283.4541, "step": 2920 }, { "ce_loss_12": 3.1713228702545164, "ce_loss_17": 2.9683526039123533, "ce_loss_23": 2.87442432641983, "ce_loss_3": 4.147928857803345, "ce_loss_6": 3.6574865341186524, "epoch": 0.293, "grad_norm": 1012.0, "kl_loss_12": 688.9380676269532, "kl_loss_17": 213.04415054321288, "kl_loss_3": 2661.012451171875, "kl_loss_6": 1698.3109985351562, "learning_rate": 0.0008115676541804455, "loss": 1291.3975, "step": 2930 }, { "ce_loss_12": 3.167314040660858, "ce_loss_17": 2.9671000599861146, "ce_loss_23": 2.8770614743232725, "ce_loss_3": 4.097289597988128, "ce_loss_6": 3.6370487570762635, "epoch": 0.294, "grad_norm": 1416.0, "kl_loss_12": 663.3868530273437, "kl_loss_17": 205.8155601501465, "kl_loss_3": 2568.6847900390626, "kl_loss_6": 1653.6200256347656, "learning_rate": 0.0008103251361307119, "loss": 1288.7492, "step": 2940 }, { "ce_loss_12": 3.2008013010025023, "ce_loss_17": 2.993203580379486, "ce_loss_23": 2.902731215953827, "ce_loss_3": 4.136919891834259, "ce_loss_6": 3.6736745595932008, "epoch": 0.295, "grad_norm": 1128.0, "kl_loss_12": 678.2878875732422, "kl_loss_17": 210.1094871520996, "kl_loss_3": 2584.3587036132812, "kl_loss_6": 1679.0871643066407, "learning_rate": 0.0008090794931103026, "loss": 1271.84, "step": 2950 }, { "ce_loss_12": 3.1641576647758485, "ce_loss_17": 2.96648188829422, "ce_loss_23": 2.879633092880249, "ce_loss_3": 4.091558110713959, "ce_loss_6": 3.6358474850654603, "epoch": 0.296, "grad_norm": 1176.0, "kl_loss_12": 656.2914367675781, "kl_loss_17": 201.0710319519043, "kl_loss_3": 2538.4782958984374, "kl_loss_6": 1640.0189331054687, "learning_rate": 0.0008078307376628291, "loss": 1267.6426, "step": 2960 }, { "ce_loss_12": 3.2289014220237733, "ce_loss_17": 3.027685618400574, "ce_loss_23": 2.9415606260299683, "ce_loss_3": 4.119142377376557, "ce_loss_6": 3.6732831835746764, "epoch": 0.297, "grad_norm": 1296.0, "kl_loss_12": 649.4954864501954, "kl_loss_17": 198.16017379760743, "kl_loss_3": 2464.59814453125, "kl_loss_6": 1586.0292541503907, "learning_rate": 0.000806578882363245, "loss": 1230.785, "step": 2970 }, { "ce_loss_12": 3.1504740715026855, "ce_loss_17": 2.9535369873046875, "ce_loss_23": 2.86621869802475, "ce_loss_3": 4.059466123580933, "ce_loss_6": 3.6136531949043276, "epoch": 0.298, "grad_norm": 1192.0, "kl_loss_12": 659.7547485351563, "kl_loss_17": 197.65312957763672, "kl_loss_3": 2533.2875, "kl_loss_6": 1648.2171325683594, "learning_rate": 0.0008053239398177191, "loss": 1283.3854, "step": 2980 }, { "ce_loss_12": 3.1471254944801332, "ce_loss_17": 2.9429659843444824, "ce_loss_23": 2.853777623176575, "ce_loss_3": 4.086737155914307, "ce_loss_6": 3.628751742839813, "epoch": 0.299, "grad_norm": 1232.0, "kl_loss_12": 665.6853668212891, "kl_loss_17": 200.3376953125, "kl_loss_3": 2570.677282714844, "kl_loss_6": 1669.5108703613282, "learning_rate": 0.0008040659226635089, "loss": 1302.3219, "step": 2990 }, { "ce_loss_12": 3.2610808968544007, "ce_loss_17": 3.055904138088226, "ce_loss_23": 2.962185859680176, "ce_loss_3": 4.172962117195129, "ce_loss_6": 3.7252434134483337, "epoch": 0.3, "grad_norm": 1392.0, "kl_loss_12": 691.085482788086, "kl_loss_17": 211.69791641235352, "kl_loss_3": 2555.7884155273437, "kl_loss_6": 1668.6239196777344, "learning_rate": 0.0008028048435688333, "loss": 1265.6971, "step": 3000 }, { "ce_loss_12": 3.1473644733428956, "ce_loss_17": 2.9388877868652346, "ce_loss_23": 2.850943756103516, "ce_loss_3": 4.103834009170532, "ce_loss_6": 3.6472261905670167, "epoch": 0.301, "grad_norm": 1456.0, "kl_loss_12": 682.3835693359375, "kl_loss_17": 203.41217880249025, "kl_loss_3": 2632.33310546875, "kl_loss_6": 1716.087384033203, "learning_rate": 0.0008015407152327448, "loss": 1286.5555, "step": 3010 }, { "ce_loss_12": 3.1876540660858153, "ce_loss_17": 2.9817963600158692, "ce_loss_23": 2.8919683694839478, "ce_loss_3": 4.1209875583648685, "ce_loss_6": 3.658332860469818, "epoch": 0.302, "grad_norm": 1080.0, "kl_loss_12": 677.3872039794921, "kl_loss_17": 207.2067398071289, "kl_loss_3": 2602.8611083984374, "kl_loss_6": 1685.0528625488282, "learning_rate": 0.0008002735503850016, "loss": 1286.4209, "step": 3020 }, { "ce_loss_12": 3.086091411113739, "ce_loss_17": 2.8816797137260437, "ce_loss_23": 2.7888283014297484, "ce_loss_3": 4.055278909206391, "ce_loss_6": 3.5791066169738768, "epoch": 0.303, "grad_norm": 1264.0, "kl_loss_12": 677.491079711914, "kl_loss_17": 208.38033905029297, "kl_loss_3": 2636.7327880859375, "kl_loss_6": 1703.7079956054688, "learning_rate": 0.0007990033617859396, "loss": 1304.0175, "step": 3030 }, { "ce_loss_12": 3.139596962928772, "ce_loss_17": 2.9386150002479554, "ce_loss_23": 2.847939932346344, "ce_loss_3": 4.06524167060852, "ce_loss_6": 3.6095362424850466, "epoch": 0.304, "grad_norm": 1264.0, "kl_loss_12": 668.4441009521485, "kl_loss_17": 204.69986114501953, "kl_loss_3": 2550.4292724609377, "kl_loss_6": 1657.856756591797, "learning_rate": 0.000797730162226344, "loss": 1239.6941, "step": 3040 }, { "ce_loss_12": 3.1538297176361083, "ce_loss_17": 2.9472372770309447, "ce_loss_23": 2.85866756439209, "ce_loss_3": 4.089906406402588, "ce_loss_6": 3.625072705745697, "epoch": 0.305, "grad_norm": 1064.0, "kl_loss_12": 680.2299285888672, "kl_loss_17": 203.93164520263673, "kl_loss_3": 2562.0833740234375, "kl_loss_6": 1657.4151428222656, "learning_rate": 0.0007964539645273203, "loss": 1268.8176, "step": 3050 }, { "ce_loss_12": 3.168477141857147, "ce_loss_17": 2.9669825077056884, "ce_loss_23": 2.8848891615867616, "ce_loss_3": 4.0650266289711, "ce_loss_6": 3.6266412258148195, "epoch": 0.306, "grad_norm": 1088.0, "kl_loss_12": 652.9420104980469, "kl_loss_17": 194.45923538208007, "kl_loss_3": 2483.465441894531, "kl_loss_6": 1617.5318542480468, "learning_rate": 0.000795174781540165, "loss": 1259.774, "step": 3060 }, { "ce_loss_12": 3.2268463492393495, "ce_loss_17": 3.035295844078064, "ce_loss_23": 2.9519911646842956, "ce_loss_3": 4.11122350692749, "ce_loss_6": 3.676068902015686, "epoch": 0.307, "grad_norm": 1160.0, "kl_loss_12": 649.5947296142579, "kl_loss_17": 199.54154739379882, "kl_loss_3": 2452.772106933594, "kl_loss_6": 1596.8073791503907, "learning_rate": 0.0007938926261462366, "loss": 1258.3678, "step": 3070 }, { "ce_loss_12": 3.186522734165192, "ce_loss_17": 2.9892637372016906, "ce_loss_23": 2.902893769741058, "ce_loss_3": 4.0782225847244264, "ce_loss_6": 3.632612419128418, "epoch": 0.308, "grad_norm": 1288.0, "kl_loss_12": 659.3931182861328, "kl_loss_17": 200.50450439453124, "kl_loss_3": 2494.252551269531, "kl_loss_6": 1618.629833984375, "learning_rate": 0.0007926075112568258, "loss": 1274.0955, "step": 3080 }, { "ce_loss_12": 3.1873653411865233, "ce_loss_17": 2.9842730164527893, "ce_loss_23": 2.8974654197692873, "ce_loss_3": 4.102190101146698, "ce_loss_6": 3.6495657086372377, "epoch": 0.309, "grad_norm": 1208.0, "kl_loss_12": 667.6466735839844, "kl_loss_17": 200.92897567749023, "kl_loss_3": 2542.1956298828127, "kl_loss_6": 1646.6694946289062, "learning_rate": 0.0007913194498130252, "loss": 1248.2428, "step": 3090 }, { "ce_loss_12": 3.1232769012451174, "ce_loss_17": 2.9138604760169984, "ce_loss_23": 2.8240367650985716, "ce_loss_3": 4.057645988464356, "ce_loss_6": 3.5948039293289185, "epoch": 0.31, "grad_norm": 1136.0, "kl_loss_12": 669.67705078125, "kl_loss_17": 202.97178421020507, "kl_loss_3": 2555.02431640625, "kl_loss_6": 1652.5606262207032, "learning_rate": 0.0007900284547855992, "loss": 1278.8604, "step": 3100 }, { "ce_loss_12": 3.126430797576904, "ce_loss_17": 2.9288961410522463, "ce_loss_23": 2.846028184890747, "ce_loss_3": 4.029088962078094, "ce_loss_6": 3.57817804813385, "epoch": 0.311, "grad_norm": 1248.0, "kl_loss_12": 659.7984893798828, "kl_loss_17": 196.29704513549805, "kl_loss_3": 2514.2835327148437, "kl_loss_6": 1621.8560668945313, "learning_rate": 0.0007887345391748532, "loss": 1274.4721, "step": 3110 }, { "ce_loss_12": 3.22427237033844, "ce_loss_17": 3.030610752105713, "ce_loss_23": 2.9471394181251527, "ce_loss_3": 4.115463936328888, "ce_loss_6": 3.6681665658950804, "epoch": 0.312, "grad_norm": 1020.0, "kl_loss_12": 642.4313903808594, "kl_loss_17": 196.25705642700194, "kl_loss_3": 2458.0077270507813, "kl_loss_6": 1587.213604736328, "learning_rate": 0.0007874377160105036, "loss": 1218.9914, "step": 3120 }, { "ce_loss_12": 3.1369936227798463, "ce_loss_17": 2.9515060067176817, "ce_loss_23": 2.859386706352234, "ce_loss_3": 4.092330026626587, "ce_loss_6": 3.6090521454811095, "epoch": 0.313, "grad_norm": 2008.0, "kl_loss_12": 643.0880157470704, "kl_loss_17": 202.33798446655274, "kl_loss_3": 2590.1301147460936, "kl_loss_6": 1639.5181640625, "learning_rate": 0.0007861379983515449, "loss": 1298.4299, "step": 3130 }, { "ce_loss_12": 3.2098602414131165, "ce_loss_17": 3.01786322593689, "ce_loss_23": 2.928981566429138, "ce_loss_3": 4.134507477283478, "ce_loss_6": 3.689944326877594, "epoch": 0.314, "grad_norm": 1368.0, "kl_loss_12": 652.8991668701171, "kl_loss_17": 207.90590591430663, "kl_loss_3": 2531.0458984375, "kl_loss_6": 1650.2009216308593, "learning_rate": 0.0007848353992861195, "loss": 1252.1455, "step": 3140 }, { "ce_loss_12": 3.2989187717437742, "ce_loss_17": 3.0994922637939455, "ce_loss_23": 2.9975876927375795, "ce_loss_3": 4.220439052581787, "ce_loss_6": 3.779904568195343, "epoch": 0.315, "grad_norm": 1168.0, "kl_loss_12": 676.4814544677735, "kl_loss_17": 228.22757949829102, "kl_loss_3": 2544.5927001953123, "kl_loss_6": 1678.0079345703125, "learning_rate": 0.0007835299319313853, "loss": 1287.3684, "step": 3150 }, { "ce_loss_12": 3.1737908601760862, "ce_loss_17": 2.9999840021133424, "ce_loss_23": 2.9011018872261047, "ce_loss_3": 4.079487144947052, "ce_loss_6": 3.620735836029053, "epoch": 0.316, "grad_norm": 1256.0, "kl_loss_12": 645.4729095458985, "kl_loss_17": 246.87976608276367, "kl_loss_3": 2485.2554931640625, "kl_loss_6": 1596.4259765625, "learning_rate": 0.0007822216094333848, "loss": 1288.1577, "step": 3160 }, { "ce_loss_12": 3.197959840297699, "ce_loss_17": 3.026703345775604, "ce_loss_23": 2.907000410556793, "ce_loss_3": 4.129369294643402, "ce_loss_6": 3.6692471146583556, "epoch": 0.317, "grad_norm": 1160.0, "kl_loss_12": 661.97021484375, "kl_loss_17": 263.0534294128418, "kl_loss_3": 2556.1950561523436, "kl_loss_6": 1647.0233154296875, "learning_rate": 0.0007809104449669101, "loss": 1267.7786, "step": 3170 }, { "ce_loss_12": 3.1298831701278687, "ce_loss_17": 2.958413553237915, "ce_loss_23": 2.856259453296661, "ce_loss_3": 4.039877963066101, "ce_loss_6": 3.5906583666801453, "epoch": 0.318, "grad_norm": 1072.0, "kl_loss_12": 645.5213134765625, "kl_loss_17": 245.95380325317382, "kl_loss_3": 2496.2158325195314, "kl_loss_6": 1610.8902282714844, "learning_rate": 0.0007795964517353734, "loss": 1254.5711, "step": 3180 }, { "ce_loss_12": 3.139362609386444, "ce_loss_17": 2.961104655265808, "ce_loss_23": 2.857077932357788, "ce_loss_3": 4.078527820110321, "ce_loss_6": 3.6135438680648804, "epoch": 0.319, "grad_norm": 1096.0, "kl_loss_12": 657.72080078125, "kl_loss_17": 248.24415435791016, "kl_loss_3": 2568.1942504882813, "kl_loss_6": 1646.7337951660156, "learning_rate": 0.000778279642970672, "loss": 1245.1319, "step": 3190 }, { "ce_loss_12": 3.1417649507522585, "ce_loss_17": 2.971712279319763, "ce_loss_23": 2.8669353246688845, "ce_loss_3": 4.045985901355744, "ce_loss_6": 3.600524604320526, "epoch": 0.32, "grad_norm": 1288.0, "kl_loss_12": 652.0772766113281, "kl_loss_17": 236.01042251586915, "kl_loss_3": 2505.472998046875, "kl_loss_6": 1627.3914123535155, "learning_rate": 0.0007769600319330552, "loss": 1239.5256, "step": 3200 }, { "ce_loss_12": 3.170133948326111, "ce_loss_17": 2.9851341366767885, "ce_loss_23": 2.8875450253486634, "ce_loss_3": 4.1293561339378355, "ce_loss_6": 3.6525676369667055, "epoch": 0.321, "grad_norm": 1376.0, "kl_loss_12": 659.3075500488281, "kl_loss_17": 220.2852684020996, "kl_loss_3": 2596.4176513671873, "kl_loss_6": 1663.1143737792968, "learning_rate": 0.0007756376319109917, "loss": 1267.159, "step": 3210 }, { "ce_loss_12": 3.208662784099579, "ce_loss_17": 3.0174351811408995, "ce_loss_23": 2.9262489438056947, "ce_loss_3": 4.104580092430115, "ce_loss_6": 3.66491756439209, "epoch": 0.322, "grad_norm": 1296.0, "kl_loss_12": 647.5737518310547, "kl_loss_17": 210.26569519042968, "kl_loss_3": 2495.5554321289064, "kl_loss_6": 1618.7034240722655, "learning_rate": 0.0007743124562210351, "loss": 1224.5053, "step": 3220 }, { "ce_loss_12": 3.222736394405365, "ce_loss_17": 3.0304854989051817, "ce_loss_23": 2.9412006974220275, "ce_loss_3": 4.119387090206146, "ce_loss_6": 3.675408494472504, "epoch": 0.323, "grad_norm": 1184.0, "kl_loss_12": 646.6549346923828, "kl_loss_17": 204.48597564697266, "kl_loss_3": 2503.8929931640623, "kl_loss_6": 1622.128240966797, "learning_rate": 0.0007729845182076895, "loss": 1249.3895, "step": 3230 }, { "ce_loss_12": 3.1534232616424562, "ce_loss_17": 2.9649145007133484, "ce_loss_23": 2.8808998346328734, "ce_loss_3": 4.050489735603333, "ce_loss_6": 3.608076286315918, "epoch": 0.324, "grad_norm": 1056.0, "kl_loss_12": 642.7339508056641, "kl_loss_17": 196.3539276123047, "kl_loss_3": 2472.869177246094, "kl_loss_6": 1592.1756774902344, "learning_rate": 0.0007716538312432765, "loss": 1258.6277, "step": 3240 }, { "ce_loss_12": 3.1309715151786803, "ce_loss_17": 2.9249958992004395, "ce_loss_23": 2.8356101751327514, "ce_loss_3": 4.062940955162048, "ce_loss_6": 3.607418692111969, "epoch": 0.325, "grad_norm": 1004.0, "kl_loss_12": 667.4138031005859, "kl_loss_17": 204.44576187133788, "kl_loss_3": 2559.404052734375, "kl_loss_6": 1661.5573486328126, "learning_rate": 0.0007703204087277988, "loss": 1266.9637, "step": 3250 }, { "ce_loss_12": 3.204416263103485, "ce_loss_17": 3.015028953552246, "ce_loss_23": 2.931392502784729, "ce_loss_3": 4.094218730926514, "ce_loss_6": 3.64478086233139, "epoch": 0.326, "grad_norm": 1480.0, "kl_loss_12": 631.9965118408203, "kl_loss_17": 193.47777786254883, "kl_loss_3": 2457.03642578125, "kl_loss_6": 1572.6686767578126, "learning_rate": 0.0007689842640888063, "loss": 1226.8134, "step": 3260 }, { "ce_loss_12": 3.2039347529411315, "ce_loss_17": 3.0119959115982056, "ce_loss_23": 2.9236732602119444, "ce_loss_3": 4.096511793136597, "ce_loss_6": 3.659037482738495, "epoch": 0.327, "grad_norm": 1144.0, "kl_loss_12": 646.79599609375, "kl_loss_17": 197.29918746948243, "kl_loss_3": 2451.0695373535154, "kl_loss_6": 1593.8397583007813, "learning_rate": 0.0007676454107812607, "loss": 1236.0236, "step": 3270 }, { "ce_loss_12": 3.152002787590027, "ce_loss_17": 2.957594645023346, "ce_loss_23": 2.8716594338417054, "ce_loss_3": 4.086058926582337, "ce_loss_6": 3.619371509552002, "epoch": 0.328, "grad_norm": 1248.0, "kl_loss_12": 657.9291381835938, "kl_loss_17": 198.33377532958986, "kl_loss_3": 2551.7009765625, "kl_loss_6": 1636.1120056152345, "learning_rate": 0.0007663038622873999, "loss": 1238.8723, "step": 3280 }, { "ce_loss_12": 3.1929536700248717, "ce_loss_17": 3.001619851589203, "ce_loss_23": 2.917175257205963, "ce_loss_3": 4.111210346221924, "ce_loss_6": 3.6579272747039795, "epoch": 0.329, "grad_norm": 1544.0, "kl_loss_12": 651.0749725341797, "kl_loss_17": 198.15295867919923, "kl_loss_3": 2525.5502685546876, "kl_loss_6": 1625.9446044921874, "learning_rate": 0.0007649596321166025, "loss": 1231.7627, "step": 3290 }, { "ce_loss_12": 3.095591151714325, "ce_loss_17": 2.9051140904426576, "ce_loss_23": 2.8213531136512757, "ce_loss_3": 3.982525849342346, "ce_loss_6": 3.5510496616363527, "epoch": 0.33, "grad_norm": 1080.0, "kl_loss_12": 636.9413482666016, "kl_loss_17": 189.79553909301757, "kl_loss_3": 2442.1396423339843, "kl_loss_6": 1593.9491882324219, "learning_rate": 0.0007636127338052513, "loss": 1232.5529, "step": 3300 }, { "ce_loss_12": 3.1994780898094177, "ce_loss_17": 3.0023029327392576, "ce_loss_23": 2.912818741798401, "ce_loss_3": 4.138954031467438, "ce_loss_6": 3.662935256958008, "epoch": 0.331, "grad_norm": 976.0, "kl_loss_12": 654.0823486328125, "kl_loss_17": 198.2132797241211, "kl_loss_3": 2574.5783813476564, "kl_loss_6": 1645.6572509765624, "learning_rate": 0.0007622631809165971, "loss": 1242.5524, "step": 3310 }, { "ce_loss_12": 3.1742364287376406, "ce_loss_17": 2.9843477964401246, "ce_loss_23": 2.904312765598297, "ce_loss_3": 4.043950057029724, "ce_loss_6": 3.614525556564331, "epoch": 0.332, "grad_norm": 1640.0, "kl_loss_12": 616.8112365722657, "kl_loss_17": 183.72462158203126, "kl_loss_3": 2375.9060180664064, "kl_loss_6": 1535.5855590820313, "learning_rate": 0.000760910987040623, "loss": 1205.1776, "step": 3320 }, { "ce_loss_12": 3.189255142211914, "ce_loss_17": 2.981306564807892, "ce_loss_23": 2.893477368354797, "ce_loss_3": 4.122015202045441, "ce_loss_6": 3.660171020030975, "epoch": 0.333, "grad_norm": 1144.0, "kl_loss_12": 669.9992126464844, "kl_loss_17": 198.67136993408204, "kl_loss_3": 2580.4757690429688, "kl_loss_6": 1662.0666076660157, "learning_rate": 0.000759556165793906, "loss": 1240.4461, "step": 3330 }, { "ce_loss_12": 3.1871766686439513, "ce_loss_17": 2.9891620874404907, "ce_loss_23": 2.902255916595459, "ce_loss_3": 4.102960324287414, "ce_loss_6": 3.6463622212409974, "epoch": 0.334, "grad_norm": 968.0, "kl_loss_12": 653.475473022461, "kl_loss_17": 199.1413902282715, "kl_loss_3": 2513.588134765625, "kl_loss_6": 1624.1831848144532, "learning_rate": 0.000758198730819481, "loss": 1255.6303, "step": 3340 }, { "ce_loss_12": 3.1485753536224363, "ce_loss_17": 2.957745444774628, "ce_loss_23": 2.8779073357582092, "ce_loss_3": 4.060862839221954, "ce_loss_6": 3.6163305282592773, "epoch": 0.335, "grad_norm": 1096.0, "kl_loss_12": 635.0015411376953, "kl_loss_17": 190.78094253540038, "kl_loss_3": 2502.9186279296873, "kl_loss_6": 1622.7606018066406, "learning_rate": 0.0007568386957867032, "loss": 1236.9645, "step": 3350 }, { "ce_loss_12": 3.197234070301056, "ce_loss_17": 3.002056097984314, "ce_loss_23": 2.9157747507095335, "ce_loss_3": 4.096824419498444, "ce_loss_6": 3.654950773715973, "epoch": 0.336, "grad_norm": 1352.0, "kl_loss_12": 649.6791900634765, "kl_loss_17": 196.0836959838867, "kl_loss_3": 2488.7575561523436, "kl_loss_6": 1614.0893615722657, "learning_rate": 0.0007554760743911103, "loss": 1249.6344, "step": 3360 }, { "ce_loss_12": 3.122460901737213, "ce_loss_17": 2.934140515327454, "ce_loss_23": 2.8511914134025576, "ce_loss_3": 4.024395322799682, "ce_loss_6": 3.5776939511299135, "epoch": 0.337, "grad_norm": 1200.0, "kl_loss_12": 629.5604217529296, "kl_loss_17": 187.96243438720703, "kl_loss_3": 2487.4703979492188, "kl_loss_6": 1598.3975830078125, "learning_rate": 0.0007541108803542846, "loss": 1259.7262, "step": 3370 }, { "ce_loss_12": 3.15894296169281, "ce_loss_17": 2.9651761412620545, "ce_loss_23": 2.8833929538726806, "ce_loss_3": 4.063071310520172, "ce_loss_6": 3.607359540462494, "epoch": 0.338, "grad_norm": 1376.0, "kl_loss_12": 639.8570343017578, "kl_loss_17": 191.9022071838379, "kl_loss_3": 2516.658801269531, "kl_loss_6": 1606.4229248046875, "learning_rate": 0.0007527431274237149, "loss": 1293.7488, "step": 3380 }, { "ce_loss_12": 3.1345289826393126, "ce_loss_17": 2.944065737724304, "ce_loss_23": 2.8607085824012755, "ce_loss_3": 4.037069797515869, "ce_loss_6": 3.57998104095459, "epoch": 0.339, "grad_norm": 1208.0, "kl_loss_12": 636.0152709960937, "kl_loss_17": 192.22691955566407, "kl_loss_3": 2498.66904296875, "kl_loss_6": 1586.3996887207031, "learning_rate": 0.0007513728293726579, "loss": 1233.265, "step": 3390 }, { "ce_loss_12": 3.2337993502616884, "ce_loss_17": 3.0387322425842287, "ce_loss_23": 2.9555094718933104, "ce_loss_3": 4.117582046985627, "ce_loss_6": 3.6774802207946777, "epoch": 0.34, "grad_norm": 1208.0, "kl_loss_12": 648.2277984619141, "kl_loss_17": 193.34667205810547, "kl_loss_3": 2459.2372436523438, "kl_loss_6": 1592.1968139648438, "learning_rate": 0.00075, "loss": 1219.958, "step": 3400 }, { "ce_loss_12": 3.234040653705597, "ce_loss_17": 3.0330355405807494, "ce_loss_23": 2.946638286113739, "ce_loss_3": 4.1481698751449585, "ce_loss_6": 3.6888544082641603, "epoch": 0.341, "grad_norm": 1616.0, "kl_loss_12": 657.6758117675781, "kl_loss_17": 193.70989608764648, "kl_loss_3": 2527.2737548828127, "kl_loss_6": 1612.798162841797, "learning_rate": 0.0007486246531301177, "loss": 1237.0359, "step": 3410 }, { "ce_loss_12": 3.0608113527297975, "ce_loss_17": 2.8603794097900392, "ce_loss_23": 2.779085946083069, "ce_loss_3": 3.965061593055725, "ce_loss_6": 3.515889024734497, "epoch": 0.342, "grad_norm": 1512.0, "kl_loss_12": 640.9623992919921, "kl_loss_17": 189.16546020507812, "kl_loss_3": 2486.7409790039064, "kl_loss_6": 1605.0156677246093, "learning_rate": 0.0007472468026127384, "loss": 1218.0469, "step": 3420 }, { "ce_loss_12": 3.204521358013153, "ce_loss_17": 3.001431107521057, "ce_loss_23": 2.9101683497428894, "ce_loss_3": 4.138897204399109, "ce_loss_6": 3.673828053474426, "epoch": 0.343, "grad_norm": 1328.0, "kl_loss_12": 675.5233673095703, "kl_loss_17": 204.8723388671875, "kl_loss_3": 2599.304162597656, "kl_loss_6": 1671.3851257324218, "learning_rate": 0.000745866462322802, "loss": 1268.8045, "step": 3430 }, { "ce_loss_12": 3.1597685694694517, "ce_loss_17": 2.96885906457901, "ce_loss_23": 2.891816234588623, "ce_loss_3": 4.049743747711181, "ce_loss_6": 3.6046136617660522, "epoch": 0.344, "grad_norm": 1144.0, "kl_loss_12": 628.1821197509765, "kl_loss_17": 187.54821166992187, "kl_loss_3": 2445.0335205078127, "kl_loss_6": 1559.4938110351563, "learning_rate": 0.0007444836461603195, "loss": 1220.1586, "step": 3440 }, { "ce_loss_12": 3.2360959887504577, "ce_loss_17": 3.0426719188690186, "ce_loss_23": 2.953330385684967, "ce_loss_3": 4.143719017505646, "ce_loss_6": 3.699867343902588, "epoch": 0.345, "grad_norm": 1064.0, "kl_loss_12": 669.5976684570312, "kl_loss_17": 204.9705047607422, "kl_loss_3": 2542.0022705078127, "kl_loss_6": 1652.5964782714843, "learning_rate": 0.0007430983680502344, "loss": 1268.3637, "step": 3450 }, { "ce_loss_12": 3.0876049160957337, "ce_loss_17": 2.8904442191123962, "ce_loss_23": 2.8051801323890686, "ce_loss_3": 4.022367370128632, "ce_loss_6": 3.5601198196411135, "epoch": 0.346, "grad_norm": 1248.0, "kl_loss_12": 655.0568389892578, "kl_loss_17": 195.21175689697264, "kl_loss_3": 2541.0522338867186, "kl_loss_6": 1636.6640930175781, "learning_rate": 0.0007417106419422819, "loss": 1252.4146, "step": 3460 }, { "ce_loss_12": 3.1681289196014406, "ce_loss_17": 2.9741994857788088, "ce_loss_23": 2.8874059557914733, "ce_loss_3": 4.07452574968338, "ce_loss_6": 3.618625295162201, "epoch": 0.347, "grad_norm": 1248.0, "kl_loss_12": 644.6038970947266, "kl_loss_17": 193.07538070678712, "kl_loss_3": 2476.3251525878904, "kl_loss_6": 1590.3403564453124, "learning_rate": 0.0007403204818108486, "loss": 1244.1215, "step": 3470 }, { "ce_loss_12": 3.162106120586395, "ce_loss_17": 2.966542291641235, "ce_loss_23": 2.8839560627937315, "ce_loss_3": 4.071386611461639, "ce_loss_6": 3.6170035123825075, "epoch": 0.348, "grad_norm": 1272.0, "kl_loss_12": 644.7951568603515, "kl_loss_17": 191.4629104614258, "kl_loss_3": 2531.1408813476564, "kl_loss_6": 1623.6707824707032, "learning_rate": 0.0007389279016548316, "loss": 1211.3204, "step": 3480 }, { "ce_loss_12": 3.1672859668731688, "ce_loss_17": 2.96072518825531, "ce_loss_23": 2.874778151512146, "ce_loss_3": 4.11575837135315, "ce_loss_6": 3.6296263933181763, "epoch": 0.349, "grad_norm": 1168.0, "kl_loss_12": 662.0878631591797, "kl_loss_17": 196.92775039672853, "kl_loss_3": 2594.9796630859373, "kl_loss_6": 1644.7096069335937, "learning_rate": 0.0007375329154974975, "loss": 1260.271, "step": 3490 }, { "ce_loss_12": 3.1099432587623594, "ce_loss_17": 2.9236859321594237, "ce_loss_23": 2.8439751505851745, "ce_loss_3": 4.011048936843872, "ce_loss_6": 3.5630403637886046, "epoch": 0.35, "grad_norm": 1096.0, "kl_loss_12": 626.3424011230469, "kl_loss_17": 188.74627304077148, "kl_loss_3": 2442.5823974609375, "kl_loss_6": 1574.2830627441406, "learning_rate": 0.0007361355373863414, "loss": 1240.516, "step": 3500 }, { "ce_loss_12": 3.1638558864593507, "ce_loss_17": 2.9696279048919676, "ce_loss_23": 2.888825249671936, "ce_loss_3": 4.058097195625305, "ce_loss_6": 3.603681969642639, "epoch": 0.351, "grad_norm": 1016.0, "kl_loss_12": 629.8948608398438, "kl_loss_17": 187.6762222290039, "kl_loss_3": 2460.040576171875, "kl_loss_6": 1563.9760498046876, "learning_rate": 0.0007347357813929454, "loss": 1243.6459, "step": 3510 }, { "ce_loss_12": 3.1169628858566285, "ce_loss_17": 2.930294370651245, "ce_loss_23": 2.846389579772949, "ce_loss_3": 4.012052595615387, "ce_loss_6": 3.5693087339401246, "epoch": 0.352, "grad_norm": 1200.0, "kl_loss_12": 626.136929321289, "kl_loss_17": 188.69427566528321, "kl_loss_3": 2443.0934204101563, "kl_loss_6": 1567.1674072265625, "learning_rate": 0.0007333336616128369, "loss": 1238.4143, "step": 3520 }, { "ce_loss_12": 3.0994298815727235, "ce_loss_17": 2.9009223699569704, "ce_loss_23": 2.813994586467743, "ce_loss_3": 4.030776941776276, "ce_loss_6": 3.563981807231903, "epoch": 0.353, "grad_norm": 1512.0, "kl_loss_12": 649.8764343261719, "kl_loss_17": 193.6501266479492, "kl_loss_3": 2539.990856933594, "kl_loss_6": 1624.6120483398438, "learning_rate": 0.0007319291921653463, "loss": 1246.1929, "step": 3530 }, { "ce_loss_12": 3.1819730043411254, "ce_loss_17": 2.9840220332145693, "ce_loss_23": 2.895319640636444, "ce_loss_3": 4.11333841085434, "ce_loss_6": 3.6454499125480653, "epoch": 0.354, "grad_norm": 1560.0, "kl_loss_12": 652.9826873779297, "kl_loss_17": 198.4254135131836, "kl_loss_3": 2530.1242431640626, "kl_loss_6": 1630.5962463378905, "learning_rate": 0.0007305223871934656, "loss": 1233.6375, "step": 3540 }, { "ce_loss_12": 3.1405220866203307, "ce_loss_17": 2.947129189968109, "ce_loss_23": 2.864759373664856, "ce_loss_3": 4.039001405239105, "ce_loss_6": 3.592834734916687, "epoch": 0.355, "grad_norm": 1392.0, "kl_loss_12": 637.2668518066406, "kl_loss_17": 192.42751998901366, "kl_loss_3": 2472.7722412109374, "kl_loss_6": 1586.1993774414063, "learning_rate": 0.0007291132608637052, "loss": 1231.5611, "step": 3550 }, { "ce_loss_12": 3.1172609090805055, "ce_loss_17": 2.9296460151672363, "ce_loss_23": 2.846535086631775, "ce_loss_3": 4.094112932682037, "ce_loss_6": 3.610998845100403, "epoch": 0.356, "grad_norm": 1360.0, "kl_loss_12": 629.188510131836, "kl_loss_17": 188.0505683898926, "kl_loss_3": 2621.151025390625, "kl_loss_6": 1676.23330078125, "learning_rate": 0.0007277018273659516, "loss": 1269.5545, "step": 3560 }, { "ce_loss_12": 3.242421197891235, "ce_loss_17": 3.036579656600952, "ce_loss_23": 2.9507407784461974, "ce_loss_3": 4.137357699871063, "ce_loss_6": 3.6962520837783814, "epoch": 0.357, "grad_norm": 1584.0, "kl_loss_12": 668.0106536865235, "kl_loss_17": 199.97094345092773, "kl_loss_3": 2518.397570800781, "kl_loss_6": 1646.419659423828, "learning_rate": 0.0007262881009133242, "loss": 1244.0977, "step": 3570 }, { "ce_loss_12": 3.1462175965309145, "ce_loss_17": 2.954234480857849, "ce_loss_23": 2.874486243724823, "ce_loss_3": 4.042923450469971, "ce_loss_6": 3.594035029411316, "epoch": 0.358, "grad_norm": 1104.0, "kl_loss_12": 633.8466430664063, "kl_loss_17": 187.2772117614746, "kl_loss_3": 2466.479931640625, "kl_loss_6": 1578.679052734375, "learning_rate": 0.0007248720957420329, "loss": 1208.5965, "step": 3580 }, { "ce_loss_12": 3.138073432445526, "ce_loss_17": 2.9530703902244566, "ce_loss_23": 2.877877390384674, "ce_loss_3": 4.028606414794922, "ce_loss_6": 3.579486346244812, "epoch": 0.359, "grad_norm": 1104.0, "kl_loss_12": 623.7652374267578, "kl_loss_17": 185.6518424987793, "kl_loss_3": 2446.584088134766, "kl_loss_6": 1559.5703002929688, "learning_rate": 0.0007234538261112341, "loss": 1238.4975, "step": 3590 }, { "ce_loss_12": 3.1939882755279543, "ce_loss_17": 2.997502303123474, "ce_loss_23": 2.9144866824150086, "ce_loss_3": 4.114092516899109, "ce_loss_6": 3.6445836186408997, "epoch": 0.36, "grad_norm": 1048.0, "kl_loss_12": 642.1127014160156, "kl_loss_17": 191.9202751159668, "kl_loss_3": 2509.18154296875, "kl_loss_6": 1591.3108459472655, "learning_rate": 0.0007220333063028871, "loss": 1222.5482, "step": 3600 }, { "ce_loss_12": 3.2365827560424805, "ce_loss_17": 3.0361854791641236, "ce_loss_23": 2.949097955226898, "ce_loss_3": 4.2037324666976925, "ce_loss_6": 3.728068196773529, "epoch": 0.361, "grad_norm": 1664.0, "kl_loss_12": 661.2010620117187, "kl_loss_17": 200.61611557006836, "kl_loss_3": 2653.8142211914064, "kl_loss_6": 1708.1386962890624, "learning_rate": 0.0007206105506216106, "loss": 1280.1693, "step": 3610 }, { "ce_loss_12": 3.10485817193985, "ce_loss_17": 2.913054096698761, "ce_loss_23": 2.8345122218132017, "ce_loss_3": 3.989728772640228, "ce_loss_6": 3.5509069919586183, "epoch": 0.362, "grad_norm": 1376.0, "kl_loss_12": 628.1443359375, "kl_loss_17": 184.31702575683593, "kl_loss_3": 2426.3939697265623, "kl_loss_6": 1563.0535583496094, "learning_rate": 0.0007191855733945387, "loss": 1195.5931, "step": 3620 }, { "ce_loss_12": 3.190879261493683, "ce_loss_17": 2.999620962142944, "ce_loss_23": 2.9165725708007812, "ce_loss_3": 4.087678861618042, "ce_loss_6": 3.6386709809303284, "epoch": 0.363, "grad_norm": 1128.0, "kl_loss_12": 630.6439605712891, "kl_loss_17": 186.4598747253418, "kl_loss_3": 2463.1990234375, "kl_loss_6": 1577.126171875, "learning_rate": 0.0007177583889711762, "loss": 1209.9547, "step": 3630 }, { "ce_loss_12": 3.104693794250488, "ce_loss_17": 2.9128946185112, "ce_loss_23": 2.8351215600967405, "ce_loss_3": 4.020575773715973, "ce_loss_6": 3.567472243309021, "epoch": 0.364, "grad_norm": 1192.0, "kl_loss_12": 634.3725769042969, "kl_loss_17": 186.35895080566405, "kl_loss_3": 2499.30517578125, "kl_loss_6": 1601.517596435547, "learning_rate": 0.0007163290117232541, "loss": 1229.4205, "step": 3640 }, { "ce_loss_12": 3.206928777694702, "ce_loss_17": 3.022316098213196, "ce_loss_23": 2.9427536964416503, "ce_loss_3": 4.0679107189178465, "ce_loss_6": 3.6384734749794005, "epoch": 0.365, "grad_norm": 1440.0, "kl_loss_12": 619.4795654296875, "kl_loss_17": 183.97281036376953, "kl_loss_3": 2402.00595703125, "kl_loss_6": 1549.7066040039062, "learning_rate": 0.0007148974560445859, "loss": 1206.8211, "step": 3650 }, { "ce_loss_12": 3.143563616275787, "ce_loss_17": 2.953302776813507, "ce_loss_23": 2.87525874376297, "ce_loss_3": 4.018783235549927, "ce_loss_6": 3.582585895061493, "epoch": 0.366, "grad_norm": 1024.0, "kl_loss_12": 623.0079528808594, "kl_loss_17": 184.78619308471679, "kl_loss_3": 2404.4192016601564, "kl_loss_6": 1555.8073486328126, "learning_rate": 0.0007134637363509209, "loss": 1192.1729, "step": 3660 }, { "ce_loss_12": 3.2462796330451966, "ce_loss_17": 3.0629473090171815, "ce_loss_23": 2.9862843632698057, "ce_loss_3": 4.116179370880127, "ce_loss_6": 3.684998261928558, "epoch": 0.367, "grad_norm": 1200.0, "kl_loss_12": 620.0697937011719, "kl_loss_17": 182.9595085144043, "kl_loss_3": 2382.9265014648436, "kl_loss_6": 1543.4059326171875, "learning_rate": 0.0007120278670798009, "loss": 1206.8045, "step": 3670 }, { "ce_loss_12": 3.0817275762557985, "ce_loss_17": 2.881822347640991, "ce_loss_23": 2.7966799259185793, "ce_loss_3": 4.039555454254151, "ce_loss_6": 3.572439932823181, "epoch": 0.368, "grad_norm": 1528.0, "kl_loss_12": 652.9435607910157, "kl_loss_17": 192.2785629272461, "kl_loss_3": 2608.0561767578124, "kl_loss_6": 1677.3426330566406, "learning_rate": 0.0007105898626904133, "loss": 1276.8469, "step": 3680 }, { "ce_loss_12": 3.1575241327285766, "ce_loss_17": 2.966373598575592, "ce_loss_23": 2.885143148899078, "ce_loss_3": 4.070808088779449, "ce_loss_6": 3.609587752819061, "epoch": 0.369, "grad_norm": 1128.0, "kl_loss_12": 630.6559204101562, "kl_loss_17": 188.17326431274415, "kl_loss_3": 2485.571112060547, "kl_loss_6": 1571.32275390625, "learning_rate": 0.0007091497376634463, "loss": 1208.1986, "step": 3690 }, { "ce_loss_12": 3.1057782888412477, "ce_loss_17": 2.916164147853851, "ce_loss_23": 2.8346520900726317, "ce_loss_3": 3.9995585441589356, "ce_loss_6": 3.5516976594924925, "epoch": 0.37, "grad_norm": 1096.0, "kl_loss_12": 628.3031372070312, "kl_loss_17": 189.42245864868164, "kl_loss_3": 2441.6884887695314, "kl_loss_6": 1562.001318359375, "learning_rate": 0.0007077075065009433, "loss": 1237.0109, "step": 3700 }, { "ce_loss_12": 3.2027547001838683, "ce_loss_17": 3.0074793815612795, "ce_loss_23": 2.923413860797882, "ce_loss_3": 4.125626313686371, "ce_loss_6": 3.6585047006607057, "epoch": 0.371, "grad_norm": 1136.0, "kl_loss_12": 645.6251739501953, "kl_loss_17": 199.54634628295898, "kl_loss_3": 2519.0525024414064, "kl_loss_6": 1606.2898742675782, "learning_rate": 0.0007062631837261557, "loss": 1234.8146, "step": 3710 }, { "ce_loss_12": 3.0837065100669863, "ce_loss_17": 2.8989955306053163, "ce_loss_23": 2.8208890914916993, "ce_loss_3": 3.9908340334892274, "ce_loss_6": 3.527920973300934, "epoch": 0.372, "grad_norm": 1240.0, "kl_loss_12": 620.295361328125, "kl_loss_17": 184.6085075378418, "kl_loss_3": 2473.2692260742188, "kl_loss_6": 1559.3187622070313, "learning_rate": 0.0007048167838833977, "loss": 1242.3812, "step": 3720 }, { "ce_loss_12": 3.162278175354004, "ce_loss_17": 2.974462831020355, "ce_loss_23": 2.8935346484184263, "ce_loss_3": 4.03741956949234, "ce_loss_6": 3.6007279634475706, "epoch": 0.373, "grad_norm": 1480.0, "kl_loss_12": 626.9373077392578, "kl_loss_17": 190.09740447998047, "kl_loss_3": 2436.891955566406, "kl_loss_6": 1547.806591796875, "learning_rate": 0.0007033683215379002, "loss": 1211.3609, "step": 3730 }, { "ce_loss_12": 3.1470221281051636, "ce_loss_17": 2.959082067012787, "ce_loss_23": 2.880895709991455, "ce_loss_3": 4.051479244232178, "ce_loss_6": 3.590286874771118, "epoch": 0.374, "grad_norm": 1280.0, "kl_loss_12": 619.1704406738281, "kl_loss_17": 183.4695182800293, "kl_loss_3": 2461.903515625, "kl_loss_6": 1554.585955810547, "learning_rate": 0.0007019178112756625, "loss": 1224.6299, "step": 3740 }, { "ce_loss_12": 3.127616310119629, "ce_loss_17": 2.939793038368225, "ce_loss_23": 2.8609092473983764, "ce_loss_3": 4.025426268577576, "ce_loss_6": 3.571797215938568, "epoch": 0.375, "grad_norm": 1384.0, "kl_loss_12": 619.8853515625, "kl_loss_17": 186.0279067993164, "kl_loss_3": 2440.3298583984374, "kl_loss_6": 1559.0566345214843, "learning_rate": 0.0007004652677033068, "loss": 1215.057, "step": 3750 }, { "ce_loss_12": 3.1839040398597716, "ce_loss_17": 3.008525323867798, "ce_loss_23": 2.935053050518036, "ce_loss_3": 4.0585259079933165, "ce_loss_6": 3.6220818042755125, "epoch": 0.376, "grad_norm": 1504.0, "kl_loss_12": 603.8833801269532, "kl_loss_17": 181.97922439575194, "kl_loss_3": 2395.0477905273438, "kl_loss_6": 1523.921240234375, "learning_rate": 0.0006990107054479312, "loss": 1196.0367, "step": 3760 }, { "ce_loss_12": 3.1728707432746885, "ce_loss_17": 2.9855801582336428, "ce_loss_23": 2.905232536792755, "ce_loss_3": 4.063840591907502, "ce_loss_6": 3.6246375679969787, "epoch": 0.377, "grad_norm": 1400.0, "kl_loss_12": 625.2057556152344, "kl_loss_17": 190.76472930908204, "kl_loss_3": 2445.6776489257813, "kl_loss_6": 1578.4298400878906, "learning_rate": 0.000697554139156961, "loss": 1217.5118, "step": 3770 }, { "ce_loss_12": 3.1740211248397827, "ce_loss_17": 2.9835832476615907, "ce_loss_23": 2.9033362746238707, "ce_loss_3": 4.079412627220154, "ce_loss_6": 3.6197427034378054, "epoch": 0.378, "grad_norm": 964.0, "kl_loss_12": 639.1348205566406, "kl_loss_17": 192.42096710205078, "kl_loss_3": 2505.44453125, "kl_loss_6": 1596.6610412597656, "learning_rate": 0.0006960955834980027, "loss": 1203.2656, "step": 3780 }, { "ce_loss_12": 3.138484704494476, "ce_loss_17": 2.953498876094818, "ce_loss_23": 2.8716805815696715, "ce_loss_3": 4.0312147498130795, "ce_loss_6": 3.585508608818054, "epoch": 0.379, "grad_norm": 1464.0, "kl_loss_12": 618.7588714599609, "kl_loss_17": 188.11121139526367, "kl_loss_3": 2427.6190795898438, "kl_loss_6": 1553.103253173828, "learning_rate": 0.0006946350531586958, "loss": 1207.2389, "step": 3790 }, { "ce_loss_12": 3.167103588581085, "ce_loss_17": 2.980492651462555, "ce_loss_23": 2.899238634109497, "ce_loss_3": 4.059364914894104, "ce_loss_6": 3.6126387119293213, "epoch": 0.38, "grad_norm": 1272.0, "kl_loss_12": 621.873583984375, "kl_loss_17": 183.54898376464843, "kl_loss_3": 2451.3309692382813, "kl_loss_6": 1567.5563354492188, "learning_rate": 0.0006931725628465643, "loss": 1230.7523, "step": 3800 }, { "ce_loss_12": 3.169570064544678, "ce_loss_17": 2.978095519542694, "ce_loss_23": 2.8957305431365965, "ce_loss_3": 4.0779964327812195, "ce_loss_6": 3.613682174682617, "epoch": 0.381, "grad_norm": 1912.0, "kl_loss_12": 631.6369598388671, "kl_loss_17": 187.99442596435546, "kl_loss_3": 2470.4658142089843, "kl_loss_6": 1564.417218017578, "learning_rate": 0.0006917081272888696, "loss": 1217.5242, "step": 3810 }, { "ce_loss_12": 3.0985295176506042, "ce_loss_17": 2.9078894376754763, "ce_loss_23": 2.826605522632599, "ce_loss_3": 4.035587120056152, "ce_loss_6": 3.56057288646698, "epoch": 0.382, "grad_norm": 1272.0, "kl_loss_12": 634.5787231445313, "kl_loss_17": 190.97016448974608, "kl_loss_3": 2558.696630859375, "kl_loss_6": 1607.7907287597657, "learning_rate": 0.0006902417612324615, "loss": 1221.291, "step": 3820 }, { "ce_loss_12": 3.224716365337372, "ce_loss_17": 3.0207711338996885, "ce_loss_23": 2.934830629825592, "ce_loss_3": 4.1563934803009035, "ce_loss_6": 3.690185070037842, "epoch": 0.383, "grad_norm": 1192.0, "kl_loss_12": 660.1558837890625, "kl_loss_17": 197.20094680786133, "kl_loss_3": 2556.5868530273438, "kl_loss_6": 1635.0429321289062, "learning_rate": 0.00068877347944363, "loss": 1245.9041, "step": 3830 }, { "ce_loss_12": 3.2066277265548706, "ce_loss_17": 3.01594934463501, "ce_loss_23": 2.9391089200973513, "ce_loss_3": 4.07437949180603, "ce_loss_6": 3.6364644885063173, "epoch": 0.384, "grad_norm": 1400.0, "kl_loss_12": 622.8762481689453, "kl_loss_17": 187.0550849914551, "kl_loss_3": 2403.3736328125, "kl_loss_6": 1543.0215576171875, "learning_rate": 0.0006873032967079561, "loss": 1211.8893, "step": 3840 }, { "ce_loss_12": 3.1839053630828857, "ce_loss_17": 3.008801805973053, "ce_loss_23": 2.9267475485801695, "ce_loss_3": 4.052586698532105, "ce_loss_6": 3.611920189857483, "epoch": 0.385, "grad_norm": 972.0, "kl_loss_12": 618.1469970703125, "kl_loss_17": 184.93197021484374, "kl_loss_3": 2401.352734375, "kl_loss_6": 1532.117254638672, "learning_rate": 0.0006858312278301637, "loss": 1188.9611, "step": 3850 }, { "ce_loss_12": 3.2220707893371583, "ce_loss_17": 3.0395336985588073, "ce_loss_23": 2.9616737246513365, "ce_loss_3": 4.066495501995087, "ce_loss_6": 3.6387622594833373, "epoch": 0.386, "grad_norm": 1336.0, "kl_loss_12": 621.8028442382813, "kl_loss_17": 186.69632263183593, "kl_loss_3": 2369.3518310546874, "kl_loss_6": 1525.177227783203, "learning_rate": 0.0006843572876339704, "loss": 1187.79, "step": 3860 }, { "ce_loss_12": 3.1393592596054076, "ce_loss_17": 2.9591917514801027, "ce_loss_23": 2.883060610294342, "ce_loss_3": 3.990993869304657, "ce_loss_6": 3.564140295982361, "epoch": 0.387, "grad_norm": 1384.0, "kl_loss_12": 611.3696350097656, "kl_loss_17": 179.5668182373047, "kl_loss_3": 2362.6711669921874, "kl_loss_6": 1517.403125, "learning_rate": 0.0006828814909619373, "loss": 1224.166, "step": 3870 }, { "ce_loss_12": 3.266438162326813, "ce_loss_17": 3.078672707080841, "ce_loss_23": 2.997675633430481, "ce_loss_3": 4.1469242691993715, "ce_loss_6": 3.694972002506256, "epoch": 0.388, "grad_norm": 1032.0, "kl_loss_12": 625.2180084228515, "kl_loss_17": 189.66805419921874, "kl_loss_3": 2417.526257324219, "kl_loss_6": 1538.9678100585938, "learning_rate": 0.0006814038526753205, "loss": 1181.2365, "step": 3880 }, { "ce_loss_12": 3.1726443648338316, "ce_loss_17": 2.9824947476387025, "ce_loss_23": 2.9020296216011046, "ce_loss_3": 4.046648359298706, "ce_loss_6": 3.605487620830536, "epoch": 0.389, "grad_norm": 1256.0, "kl_loss_12": 618.3945541381836, "kl_loss_17": 184.8795082092285, "kl_loss_3": 2406.640905761719, "kl_loss_6": 1535.630078125, "learning_rate": 0.0006799243876539213, "loss": 1194.9497, "step": 3890 }, { "ce_loss_12": 3.097509813308716, "ce_loss_17": 2.9127988815307617, "ce_loss_23": 2.8334673643112183, "ce_loss_3": 4.022769427299499, "ce_loss_6": 3.5444315314292907, "epoch": 0.39, "grad_norm": 1216.0, "kl_loss_12": 619.7410095214843, "kl_loss_17": 183.83202362060547, "kl_loss_3": 2497.6127685546876, "kl_loss_6": 1557.9707397460938, "learning_rate": 0.0006784431107959359, "loss": 1221.2479, "step": 3900 }, { "ce_loss_12": 3.159643363952637, "ce_loss_17": 2.9656416535377503, "ce_loss_23": 2.8849711060523986, "ce_loss_3": 4.089113199710846, "ce_loss_6": 3.6197752594947814, "epoch": 0.391, "grad_norm": 2512.0, "kl_loss_12": 639.0259399414062, "kl_loss_17": 189.73209686279296, "kl_loss_3": 2532.8427001953123, "kl_loss_6": 1597.2655395507813, "learning_rate": 0.0006769600370178059, "loss": 1216.9616, "step": 3910 }, { "ce_loss_12": 3.119121015071869, "ce_loss_17": 2.9354542970657347, "ce_loss_23": 2.8540964841842653, "ce_loss_3": 4.018436765670776, "ce_loss_6": 3.5637188076972963, "epoch": 0.392, "grad_norm": 984.0, "kl_loss_12": 626.1691375732422, "kl_loss_17": 185.33537521362305, "kl_loss_3": 2448.906945800781, "kl_loss_6": 1563.4917846679687, "learning_rate": 0.0006754751812540679, "loss": 1190.9987, "step": 3920 }, { "ce_loss_12": 3.1707355618476867, "ce_loss_17": 2.9823012948036194, "ce_loss_23": 2.8973543882369994, "ce_loss_3": 4.073235404491425, "ce_loss_6": 3.612573671340942, "epoch": 0.393, "grad_norm": 1416.0, "kl_loss_12": 631.8544799804688, "kl_loss_17": 195.9771957397461, "kl_loss_3": 2500.5748901367188, "kl_loss_6": 1586.8189819335937, "learning_rate": 0.0006739885584572025, "loss": 1230.1657, "step": 3930 }, { "ce_loss_12": 3.185942196846008, "ce_loss_17": 3.0003496408462524, "ce_loss_23": 2.9172792553901674, "ce_loss_3": 4.115244591236115, "ce_loss_6": 3.6516054034233094, "epoch": 0.394, "grad_norm": 1272.0, "kl_loss_12": 634.0373046875, "kl_loss_17": 193.56379318237305, "kl_loss_3": 2552.929479980469, "kl_loss_6": 1623.9166687011718, "learning_rate": 0.0006725001835974853, "loss": 1219.0386, "step": 3940 }, { "ce_loss_12": 3.1856348872184754, "ce_loss_17": 2.9992774367332458, "ce_loss_23": 2.9162330746650698, "ce_loss_3": 4.097616982460022, "ce_loss_6": 3.641265344619751, "epoch": 0.395, "grad_norm": 1120.0, "kl_loss_12": 633.9897552490235, "kl_loss_17": 193.42904891967774, "kl_loss_3": 2500.8137084960936, "kl_loss_6": 1591.2876098632812, "learning_rate": 0.0006710100716628344, "loss": 1198.0403, "step": 3950 }, { "ce_loss_12": 3.1679529190063476, "ce_loss_17": 2.9761669516563414, "ce_loss_23": 2.8977315187454225, "ce_loss_3": 4.07511625289917, "ce_loss_6": 3.6221744894981383, "epoch": 0.396, "grad_norm": 1208.0, "kl_loss_12": 621.3185638427734, "kl_loss_17": 185.20935287475587, "kl_loss_3": 2475.409619140625, "kl_loss_6": 1576.9202758789063, "learning_rate": 0.0006695182376586602, "loss": 1223.8219, "step": 3960 }, { "ce_loss_12": 3.1788403749465943, "ce_loss_17": 3.0021064639091493, "ce_loss_23": 2.9263312816619873, "ce_loss_3": 4.03387680053711, "ce_loss_6": 3.5939263820648195, "epoch": 0.397, "grad_norm": 1384.0, "kl_loss_12": 590.8750671386719, "kl_loss_17": 177.05664672851563, "kl_loss_3": 2330.8658203125, "kl_loss_6": 1466.253466796875, "learning_rate": 0.000668024696607715, "loss": 1206.6646, "step": 3970 }, { "ce_loss_12": 3.154967558383942, "ce_loss_17": 2.9758860230445863, "ce_loss_23": 2.896761977672577, "ce_loss_3": 4.04023425579071, "ce_loss_6": 3.598306691646576, "epoch": 0.398, "grad_norm": 1136.0, "kl_loss_12": 610.6574310302734, "kl_loss_17": 182.25591812133788, "kl_loss_3": 2431.3807861328123, "kl_loss_6": 1550.7834106445312, "learning_rate": 0.0006665294635499404, "loss": 1196.3627, "step": 3980 }, { "ce_loss_12": 3.180202877521515, "ce_loss_17": 2.985775589942932, "ce_loss_23": 2.9002400159835817, "ce_loss_3": 4.108720588684082, "ce_loss_6": 3.642514145374298, "epoch": 0.399, "grad_norm": 1288.0, "kl_loss_12": 647.539013671875, "kl_loss_17": 194.1472381591797, "kl_loss_3": 2564.7157470703123, "kl_loss_6": 1636.1509643554687, "learning_rate": 0.0006650325535423167, "loss": 1229.0541, "step": 3990 }, { "ce_loss_12": 3.1735582828521727, "ce_loss_17": 2.9931819081306457, "ce_loss_23": 2.9180276870727537, "ce_loss_3": 4.016478252410889, "ce_loss_6": 3.5862005710601808, "epoch": 0.4, "grad_norm": 1216.0, "kl_loss_12": 596.537808227539, "kl_loss_17": 179.41532363891602, "kl_loss_3": 2333.13134765625, "kl_loss_6": 1479.8255737304687, "learning_rate": 0.0006635339816587109, "loss": 1185.9506, "step": 4000 }, { "ce_loss_12": 3.1252514243125917, "ce_loss_17": 2.9375029802322388, "ce_loss_23": 2.8593607783317565, "ce_loss_3": 4.044191861152649, "ce_loss_6": 3.57818717956543, "epoch": 0.401, "grad_norm": 932.0, "kl_loss_12": 624.591438293457, "kl_loss_17": 188.9966079711914, "kl_loss_3": 2504.6765014648436, "kl_loss_6": 1584.8181884765625, "learning_rate": 0.0006620337629897252, "loss": 1203.3638, "step": 4010 }, { "ce_loss_12": 3.1388319969177245, "ce_loss_17": 2.948263680934906, "ce_loss_23": 2.8623416900634764, "ce_loss_3": 4.0201987504959105, "ce_loss_6": 3.5734472393989565, "epoch": 0.402, "grad_norm": 1344.0, "kl_loss_12": 621.6943450927735, "kl_loss_17": 188.1446647644043, "kl_loss_3": 2436.329235839844, "kl_loss_6": 1549.45478515625, "learning_rate": 0.0006605319126425454, "loss": 1219.1758, "step": 4020 }, { "ce_loss_12": 3.0478885412216186, "ce_loss_17": 2.8635390639305114, "ce_loss_23": 2.7847749173641203, "ce_loss_3": 3.9780253887176515, "ce_loss_6": 3.509523332118988, "epoch": 0.403, "grad_norm": 1088.0, "kl_loss_12": 623.6564025878906, "kl_loss_17": 187.83245315551758, "kl_loss_3": 2524.8685791015623, "kl_loss_6": 1599.0273132324219, "learning_rate": 0.0006590284457407876, "loss": 1219.572, "step": 4030 }, { "ce_loss_12": 3.1345214009284974, "ce_loss_17": 2.947396790981293, "ce_loss_23": 2.8696147322654726, "ce_loss_3": 4.032974493503571, "ce_loss_6": 3.581662690639496, "epoch": 0.404, "grad_norm": 1296.0, "kl_loss_12": 622.2457672119141, "kl_loss_17": 187.03895874023436, "kl_loss_3": 2448.82939453125, "kl_loss_6": 1555.9044067382813, "learning_rate": 0.0006575233774243465, "loss": 1200.8846, "step": 4040 }, { "ce_loss_12": 3.136705255508423, "ce_loss_17": 2.9500337719917296, "ce_loss_23": 2.8683730244636534, "ce_loss_3": 4.042927157878876, "ce_loss_6": 3.5823343634605407, "epoch": 0.405, "grad_norm": 1152.0, "kl_loss_12": 633.6688507080078, "kl_loss_17": 191.35879440307616, "kl_loss_3": 2483.2155151367188, "kl_loss_6": 1563.1441589355468, "learning_rate": 0.0006560167228492435, "loss": 1213.0156, "step": 4050 }, { "ce_loss_12": 3.1671351194381714, "ce_loss_17": 2.9819561004638673, "ce_loss_23": 2.906921589374542, "ce_loss_3": 4.0319117426872255, "ce_loss_6": 3.59480482339859, "epoch": 0.406, "grad_norm": 1400.0, "kl_loss_12": 603.7755615234375, "kl_loss_17": 181.3179901123047, "kl_loss_3": 2384.0647888183594, "kl_loss_6": 1515.6968994140625, "learning_rate": 0.0006545084971874737, "loss": 1202.403, "step": 4060 }, { "ce_loss_12": 3.1415469646453857, "ce_loss_17": 2.942392110824585, "ce_loss_23": 2.859453630447388, "ce_loss_3": 4.05655928850174, "ce_loss_6": 3.5980806589126586, "epoch": 0.407, "grad_norm": 1488.0, "kl_loss_12": 641.6716430664062, "kl_loss_17": 192.19773025512694, "kl_loss_3": 2512.15908203125, "kl_loss_6": 1599.2367370605468, "learning_rate": 0.0006529987156268526, "loss": 1204.36, "step": 4070 }, { "ce_loss_12": 3.066053903102875, "ce_loss_17": 2.8747988820075987, "ce_loss_23": 2.7897343158721926, "ce_loss_3": 3.970530641078949, "ce_loss_6": 3.5074979782104494, "epoch": 0.408, "grad_norm": 1120.0, "kl_loss_12": 621.1017639160157, "kl_loss_17": 185.82689056396484, "kl_loss_3": 2472.4393676757813, "kl_loss_6": 1557.4956115722657, "learning_rate": 0.0006514873933708637, "loss": 1230.9574, "step": 4080 }, { "ce_loss_12": 3.164853739738464, "ce_loss_17": 2.9797329902648926, "ce_loss_23": 2.9014252185821534, "ce_loss_3": 4.050824499130249, "ce_loss_6": 3.603599953651428, "epoch": 0.409, "grad_norm": 1272.0, "kl_loss_12": 608.2298370361328, "kl_loss_17": 180.71694564819336, "kl_loss_3": 2420.9696533203123, "kl_loss_6": 1539.03232421875, "learning_rate": 0.0006499745456385053, "loss": 1189.1555, "step": 4090 }, { "ce_loss_12": 3.1311935782432556, "ce_loss_17": 2.9477484822273254, "ce_loss_23": 2.8674038887023925, "ce_loss_3": 4.031777393817902, "ce_loss_6": 3.5701063513755797, "epoch": 0.41, "grad_norm": 1376.0, "kl_loss_12": 621.2535614013672, "kl_loss_17": 185.45141220092773, "kl_loss_3": 2455.026086425781, "kl_loss_6": 1553.7825561523437, "learning_rate": 0.0006484601876641375, "loss": 1211.484, "step": 4100 }, { "ce_loss_12": 3.115593373775482, "ce_loss_17": 2.9372752904891968, "ce_loss_23": 2.8610817909240724, "ce_loss_3": 3.974239504337311, "ce_loss_6": 3.5369308471679686, "epoch": 0.411, "grad_norm": 1488.0, "kl_loss_12": 603.7558715820312, "kl_loss_17": 180.66451187133788, "kl_loss_3": 2351.790808105469, "kl_loss_6": 1493.5529357910157, "learning_rate": 0.000646944334697328, "loss": 1175.2756, "step": 4110 }, { "ce_loss_12": 3.219191825389862, "ce_loss_17": 3.0355695724487304, "ce_loss_23": 2.9580381751060485, "ce_loss_3": 4.060825407505035, "ce_loss_6": 3.6392863154411317, "epoch": 0.412, "grad_norm": 1296.0, "kl_loss_12": 605.027815246582, "kl_loss_17": 181.11124267578126, "kl_loss_3": 2321.31376953125, "kl_loss_6": 1490.2986083984374, "learning_rate": 0.0006454270020026995, "loss": 1157.5808, "step": 4120 }, { "ce_loss_12": 3.190293347835541, "ce_loss_17": 3.0151625514030456, "ce_loss_23": 2.938095223903656, "ce_loss_3": 4.0323722004890445, "ce_loss_6": 3.604380559921265, "epoch": 0.413, "grad_norm": 1112.0, "kl_loss_12": 593.6603637695313, "kl_loss_17": 176.89496612548828, "kl_loss_3": 2321.645892333984, "kl_loss_6": 1483.1813537597657, "learning_rate": 0.0006439082048597755, "loss": 1153.7059, "step": 4130 }, { "ce_loss_12": 3.1885109305381776, "ce_loss_17": 3.0001250505447388, "ce_loss_23": 2.9225224018096925, "ce_loss_3": 4.0738829374313354, "ce_loss_6": 3.6269838571548463, "epoch": 0.414, "grad_norm": 1112.0, "kl_loss_12": 621.9543090820313, "kl_loss_17": 183.03130035400392, "kl_loss_3": 2435.958605957031, "kl_loss_6": 1556.7580078125, "learning_rate": 0.0006423879585628261, "loss": 1204.4623, "step": 4140 }, { "ce_loss_12": 3.146387219429016, "ce_loss_17": 2.955020797252655, "ce_loss_23": 2.876465117931366, "ce_loss_3": 4.061364984512329, "ce_loss_6": 3.600324022769928, "epoch": 0.415, "grad_norm": 1048.0, "kl_loss_12": 634.1819671630859, "kl_loss_17": 187.38957138061522, "kl_loss_3": 2494.6953979492187, "kl_loss_6": 1589.542950439453, "learning_rate": 0.0006408662784207149, "loss": 1219.8756, "step": 4150 }, { "ce_loss_12": 3.1147804141044615, "ce_loss_17": 2.933422553539276, "ce_loss_23": 2.856143128871918, "ce_loss_3": 4.003896868228912, "ce_loss_6": 3.5593339085578917, "epoch": 0.416, "grad_norm": 1384.0, "kl_loss_12": 614.3667526245117, "kl_loss_17": 179.37629470825195, "kl_loss_3": 2439.1699462890624, "kl_loss_6": 1557.9551391601562, "learning_rate": 0.0006393431797567439, "loss": 1198.1912, "step": 4160 }, { "ce_loss_12": 3.1870901346206666, "ce_loss_17": 3.0070515036582948, "ce_loss_23": 2.9304370403289797, "ce_loss_3": 4.027340471744537, "ce_loss_6": 3.5912632942199707, "epoch": 0.417, "grad_norm": 1088.0, "kl_loss_12": 604.4052337646484, "kl_loss_17": 181.3666244506836, "kl_loss_3": 2345.2923583984375, "kl_loss_6": 1478.760107421875, "learning_rate": 0.0006378186779084996, "loss": 1144.3483, "step": 4170 }, { "ce_loss_12": 3.036437451839447, "ce_loss_17": 2.847336399555206, "ce_loss_23": 2.768596684932709, "ce_loss_3": 3.942387247085571, "ce_loss_6": 3.491739511489868, "epoch": 0.418, "grad_norm": 1648.0, "kl_loss_12": 625.4832733154296, "kl_loss_17": 184.05609970092775, "kl_loss_3": 2447.228466796875, "kl_loss_6": 1568.6175109863282, "learning_rate": 0.0006362927882276989, "loss": 1213.3297, "step": 4180 }, { "ce_loss_12": 3.1968475103378298, "ce_loss_17": 3.0216124773025514, "ce_loss_23": 2.943992519378662, "ce_loss_3": 4.047649502754211, "ce_loss_6": 3.618110203742981, "epoch": 0.419, "grad_norm": 1872.0, "kl_loss_12": 593.755502319336, "kl_loss_17": 177.35777282714844, "kl_loss_3": 2344.962713623047, "kl_loss_6": 1489.8335815429687, "learning_rate": 0.000634765526080034, "loss": 1148.7749, "step": 4190 }, { "ce_loss_12": 3.2133609533309935, "ce_loss_17": 3.02979691028595, "ce_loss_23": 2.94939581155777, "ce_loss_3": 4.076133000850677, "ce_loss_6": 3.640803503990173, "epoch": 0.42, "grad_norm": 1328.0, "kl_loss_12": 612.8868621826172, "kl_loss_17": 185.13345108032226, "kl_loss_3": 2380.922399902344, "kl_loss_6": 1522.4027038574218, "learning_rate": 0.0006332369068450174, "loss": 1170.4217, "step": 4200 }, { "ce_loss_12": 3.1545344591140747, "ce_loss_17": 2.970043087005615, "ce_loss_23": 2.8947736382484437, "ce_loss_3": 4.039136958122254, "ce_loss_6": 3.5878245711326597, "epoch": 0.421, "grad_norm": 1240.0, "kl_loss_12": 610.0253753662109, "kl_loss_17": 180.47613830566405, "kl_loss_3": 2412.1201416015624, "kl_loss_6": 1535.6862182617188, "learning_rate": 0.0006317069459158283, "loss": 1178.7123, "step": 4210 }, { "ce_loss_12": 3.2384790778160095, "ce_loss_17": 3.064324343204498, "ce_loss_23": 2.9877140641212465, "ce_loss_3": 4.089816331863403, "ce_loss_6": 3.6487473249435425, "epoch": 0.422, "grad_norm": 1112.0, "kl_loss_12": 602.6154113769531, "kl_loss_17": 181.77598495483397, "kl_loss_3": 2361.879040527344, "kl_loss_6": 1490.4651489257812, "learning_rate": 0.0006301756586991561, "loss": 1164.6935, "step": 4220 }, { "ce_loss_12": 3.0499517917633057, "ce_loss_17": 2.8621727347373964, "ce_loss_23": 2.7846267104148863, "ce_loss_3": 3.9611988425254823, "ce_loss_6": 3.5068413972854615, "epoch": 0.423, "grad_norm": 1128.0, "kl_loss_12": 619.654345703125, "kl_loss_17": 183.03083038330078, "kl_loss_3": 2495.934606933594, "kl_loss_6": 1583.8559020996095, "learning_rate": 0.0006286430606150459, "loss": 1204.7939, "step": 4230 }, { "ce_loss_12": 3.239712488651276, "ce_loss_17": 3.0582516312599184, "ce_loss_23": 2.980309987068176, "ce_loss_3": 4.110534727573395, "ce_loss_6": 3.6744491338729857, "epoch": 0.424, "grad_norm": 1064.0, "kl_loss_12": 613.473355102539, "kl_loss_17": 183.81760330200194, "kl_loss_3": 2406.3619384765625, "kl_loss_6": 1523.7447509765625, "learning_rate": 0.0006271091670967436, "loss": 1179.3695, "step": 4240 }, { "ce_loss_12": 3.171178638935089, "ce_loss_17": 2.977051842212677, "ce_loss_23": 2.8960009932518007, "ce_loss_3": 4.089389193058014, "ce_loss_6": 3.6281680345535277, "epoch": 0.425, "grad_norm": 1360.0, "kl_loss_12": 650.4041778564454, "kl_loss_17": 190.87026596069336, "kl_loss_3": 2532.585070800781, "kl_loss_6": 1618.9200622558594, "learning_rate": 0.0006255739935905395, "loss": 1212.168, "step": 4250 }, { "ce_loss_12": 3.192930221557617, "ce_loss_17": 3.007972240447998, "ce_loss_23": 2.931060564517975, "ce_loss_3": 4.054401051998139, "ce_loss_6": 3.6081828713417052, "epoch": 0.426, "grad_norm": 1216.0, "kl_loss_12": 611.473422241211, "kl_loss_17": 182.98992462158202, "kl_loss_3": 2384.975927734375, "kl_loss_6": 1505.831365966797, "learning_rate": 0.0006240375555556145, "loss": 1215.0896, "step": 4260 }, { "ce_loss_12": 3.1992557883262633, "ce_loss_17": 3.009497511386871, "ce_loss_23": 2.930485475063324, "ce_loss_3": 4.107662343978882, "ce_loss_6": 3.652487301826477, "epoch": 0.427, "grad_norm": 1192.0, "kl_loss_12": 621.4902252197265, "kl_loss_17": 183.88353729248047, "kl_loss_3": 2474.84443359375, "kl_loss_6": 1575.9041320800782, "learning_rate": 0.000622499868463882, "loss": 1204.1057, "step": 4270 }, { "ce_loss_12": 3.1563706755638123, "ce_loss_17": 2.9828593254089357, "ce_loss_23": 2.907644438743591, "ce_loss_3": 4.003303325176239, "ce_loss_6": 3.5699565291404722, "epoch": 0.428, "grad_norm": 1112.0, "kl_loss_12": 595.4530853271484, "kl_loss_17": 179.88403701782227, "kl_loss_3": 2351.163684082031, "kl_loss_6": 1484.134100341797, "learning_rate": 0.0006209609477998338, "loss": 1169.7238, "step": 4280 }, { "ce_loss_12": 3.22188595533371, "ce_loss_17": 3.0394359707832335, "ce_loss_23": 2.957653522491455, "ce_loss_3": 4.087126052379608, "ce_loss_6": 3.6486384749412535, "epoch": 0.429, "grad_norm": 1480.0, "kl_loss_12": 613.1925567626953, "kl_loss_17": 185.11133728027343, "kl_loss_3": 2390.9447509765623, "kl_loss_6": 1525.0828063964843, "learning_rate": 0.0006194208090603844, "loss": 1196.0419, "step": 4290 }, { "ce_loss_12": 3.1381272315979003, "ce_loss_17": 2.96423180103302, "ce_loss_23": 2.8869675517082216, "ce_loss_3": 4.011167562007904, "ce_loss_6": 3.5743077039718627, "epoch": 0.43, "grad_norm": 916.0, "kl_loss_12": 594.9730010986328, "kl_loss_17": 176.15359191894532, "kl_loss_3": 2377.043664550781, "kl_loss_6": 1502.032763671875, "learning_rate": 0.0006178794677547138, "loss": 1156.8104, "step": 4300 }, { "ce_loss_12": 3.173213839530945, "ce_loss_17": 2.9872260212898256, "ce_loss_23": 2.908361482620239, "ce_loss_3": 4.054434442520142, "ce_loss_6": 3.614891541004181, "epoch": 0.431, "grad_norm": 1296.0, "kl_loss_12": 621.83984375, "kl_loss_17": 182.7801429748535, "kl_loss_3": 2430.5281005859374, "kl_loss_6": 1561.0057495117187, "learning_rate": 0.0006163369394041111, "loss": 1190.3025, "step": 4310 }, { "ce_loss_12": 3.102406346797943, "ce_loss_17": 2.9151142835617065, "ce_loss_23": 2.837439149618149, "ce_loss_3": 4.009701907634735, "ce_loss_6": 3.553163170814514, "epoch": 0.432, "grad_norm": 1384.0, "kl_loss_12": 613.7666687011719, "kl_loss_17": 180.23058700561523, "kl_loss_3": 2456.4883544921877, "kl_loss_6": 1556.5386901855468, "learning_rate": 0.0006147932395418205, "loss": 1222.4783, "step": 4320 }, { "ce_loss_12": 3.141595554351807, "ce_loss_17": 2.9644735455513, "ce_loss_23": 2.88581862449646, "ce_loss_3": 3.9972390055656435, "ce_loss_6": 3.5638728499412538, "epoch": 0.433, "grad_norm": 1016.0, "kl_loss_12": 605.8929626464844, "kl_loss_17": 179.01886596679688, "kl_loss_3": 2384.8349670410157, "kl_loss_6": 1514.4691040039063, "learning_rate": 0.0006132483837128823, "loss": 1165.2039, "step": 4330 }, { "ce_loss_12": 3.1127786874771117, "ce_loss_17": 2.9356310844421385, "ce_loss_23": 2.8605749011039734, "ce_loss_3": 4.008680784702301, "ce_loss_6": 3.5570395469665526, "epoch": 0.434, "grad_norm": 1272.0, "kl_loss_12": 605.3210144042969, "kl_loss_17": 179.10021896362304, "kl_loss_3": 2443.418591308594, "kl_loss_6": 1535.6750915527343, "learning_rate": 0.0006117023874739772, "loss": 1188.4191, "step": 4340 }, { "ce_loss_12": 3.122081291675568, "ce_loss_17": 2.932915461063385, "ce_loss_23": 2.8596187949180605, "ce_loss_3": 4.009017038345337, "ce_loss_6": 3.5538516879081725, "epoch": 0.435, "grad_norm": 1344.0, "kl_loss_12": 616.5628082275391, "kl_loss_17": 181.03787994384766, "kl_loss_3": 2449.987927246094, "kl_loss_6": 1549.4891540527344, "learning_rate": 0.0006101552663932703, "loss": 1206.0716, "step": 4350 }, { "ce_loss_12": 3.150739920139313, "ce_loss_17": 2.968548262119293, "ce_loss_23": 2.888369154930115, "ce_loss_3": 4.025252389907837, "ce_loss_6": 3.5858799815177917, "epoch": 0.436, "grad_norm": 952.0, "kl_loss_12": 612.4619598388672, "kl_loss_17": 184.53734970092773, "kl_loss_3": 2408.1568603515625, "kl_loss_6": 1534.4716857910157, "learning_rate": 0.0006086070360502539, "loss": 1181.0723, "step": 4360 }, { "ce_loss_12": 3.15641371011734, "ce_loss_17": 2.9694243907928466, "ce_loss_23": 2.8904993653297426, "ce_loss_3": 4.025759434700012, "ce_loss_6": 3.581454849243164, "epoch": 0.437, "grad_norm": 1000.0, "kl_loss_12": 611.8678588867188, "kl_loss_17": 180.19417724609374, "kl_loss_3": 2418.9445251464845, "kl_loss_6": 1541.1427673339845, "learning_rate": 0.0006070577120355903, "loss": 1191.9658, "step": 4370 }, { "ce_loss_12": 3.151238977909088, "ce_loss_17": 2.9647199869155885, "ce_loss_23": 2.888589072227478, "ce_loss_3": 4.012544178962708, "ce_loss_6": 3.5770498633384706, "epoch": 0.438, "grad_norm": 1072.0, "kl_loss_12": 599.1971862792968, "kl_loss_17": 178.08247756958008, "kl_loss_3": 2343.4568359375, "kl_loss_6": 1496.306219482422, "learning_rate": 0.0006055073099509549, "loss": 1172.6889, "step": 4380 }, { "ce_loss_12": 3.205288279056549, "ce_loss_17": 3.02845641374588, "ce_loss_23": 2.952479887008667, "ce_loss_3": 4.062442588806152, "ce_loss_6": 3.6250621557235716, "epoch": 0.439, "grad_norm": 1304.0, "kl_loss_12": 604.9193298339844, "kl_loss_17": 180.97334976196288, "kl_loss_3": 2363.5929565429688, "kl_loss_6": 1502.811962890625, "learning_rate": 0.0006039558454088796, "loss": 1185.3265, "step": 4390 }, { "ce_loss_12": 3.177826189994812, "ce_loss_17": 2.992183196544647, "ce_loss_23": 2.9130085825920107, "ce_loss_3": 4.054619085788727, "ce_loss_6": 3.6174099802970887, "epoch": 0.44, "grad_norm": 940.0, "kl_loss_12": 616.7141555786133, "kl_loss_17": 183.2505676269531, "kl_loss_3": 2406.0845947265625, "kl_loss_6": 1539.0552612304687, "learning_rate": 0.0006024033340325954, "loss": 1165.9447, "step": 4400 }, { "ce_loss_12": 3.235459733009338, "ce_loss_17": 3.057520937919617, "ce_loss_23": 2.982460641860962, "ce_loss_3": 4.066870248317718, "ce_loss_6": 3.643921208381653, "epoch": 0.441, "grad_norm": 896.0, "kl_loss_12": 586.6485046386719, "kl_loss_17": 173.9886131286621, "kl_loss_3": 2291.1926147460936, "kl_loss_6": 1456.7806884765625, "learning_rate": 0.0006008497914558743, "loss": 1149.7752, "step": 4410 }, { "ce_loss_12": 3.1969673156738283, "ce_loss_17": 3.012665021419525, "ce_loss_23": 2.93061842918396, "ce_loss_3": 4.075615668296814, "ce_loss_6": 3.6308741211891173, "epoch": 0.442, "grad_norm": 1184.0, "kl_loss_12": 626.980126953125, "kl_loss_17": 189.89006118774415, "kl_loss_3": 2427.6785400390627, "kl_loss_6": 1548.2725952148437, "learning_rate": 0.0005992952333228728, "loss": 1194.7676, "step": 4420 }, { "ce_loss_12": 3.131419849395752, "ce_loss_17": 2.9515241384506226, "ce_loss_23": 2.877249336242676, "ce_loss_3": 4.010429859161377, "ce_loss_6": 3.5680622458457947, "epoch": 0.443, "grad_norm": 1032.0, "kl_loss_12": 603.8198883056641, "kl_loss_17": 177.6677215576172, "kl_loss_3": 2420.6113891601562, "kl_loss_6": 1541.025, "learning_rate": 0.0005977396752879741, "loss": 1178.0658, "step": 4430 }, { "ce_loss_12": 3.0689117550849914, "ce_loss_17": 2.8813390254974367, "ce_loss_23": 2.8048648595809937, "ce_loss_3": 3.9471965193748475, "ce_loss_6": 3.5006853580474853, "epoch": 0.444, "grad_norm": 1008.0, "kl_loss_12": 615.7683135986329, "kl_loss_17": 178.2631462097168, "kl_loss_3": 2437.4211181640626, "kl_loss_6": 1548.4304260253907, "learning_rate": 0.0005961831330156305, "loss": 1178.2352, "step": 4440 }, { "ce_loss_12": 3.2002723097801207, "ce_loss_17": 3.017963671684265, "ce_loss_23": 2.9416601061820984, "ce_loss_3": 4.094040215015411, "ce_loss_6": 3.641245484352112, "epoch": 0.445, "grad_norm": 1144.0, "kl_loss_12": 606.8301818847656, "kl_loss_17": 178.37159881591796, "kl_loss_3": 2438.9416381835936, "kl_loss_6": 1540.8159240722657, "learning_rate": 0.0005946256221802051, "loss": 1207.8047, "step": 4450 }, { "ce_loss_12": 3.1546120762825014, "ce_loss_17": 2.9822561979293822, "ce_loss_23": 2.913159799575806, "ce_loss_3": 3.9937642455101012, "ce_loss_6": 3.5635142087936402, "epoch": 0.446, "grad_norm": 1512.0, "kl_loss_12": 582.0013885498047, "kl_loss_17": 173.38505935668945, "kl_loss_3": 2304.2907653808593, "kl_loss_6": 1465.9250854492188, "learning_rate": 0.0005930671584658151, "loss": 1199.5111, "step": 4460 }, { "ce_loss_12": 3.1791846990585326, "ce_loss_17": 2.995867121219635, "ce_loss_23": 2.921609342098236, "ce_loss_3": 4.046207320690155, "ce_loss_6": 3.6068936467170714, "epoch": 0.447, "grad_norm": 1240.0, "kl_loss_12": 606.0908996582032, "kl_loss_17": 179.20726013183594, "kl_loss_3": 2392.2299682617186, "kl_loss_6": 1517.7416748046876, "learning_rate": 0.0005915077575661722, "loss": 1189.5426, "step": 4470 }, { "ce_loss_12": 3.19470419883728, "ce_loss_17": 3.0134175539016725, "ce_loss_23": 2.936840128898621, "ce_loss_3": 4.070160377025604, "ce_loss_6": 3.620807874202728, "epoch": 0.448, "grad_norm": 2128.0, "kl_loss_12": 618.5509094238281, "kl_loss_17": 184.0712677001953, "kl_loss_3": 2423.9630615234373, "kl_loss_6": 1533.7049865722656, "learning_rate": 0.000589947435184427, "loss": 1172.1064, "step": 4480 }, { "ce_loss_12": 3.245382022857666, "ce_loss_17": 3.069892108440399, "ce_loss_23": 2.993940007686615, "ce_loss_3": 4.063095688819885, "ce_loss_6": 3.645653986930847, "epoch": 0.449, "grad_norm": 1088.0, "kl_loss_12": 602.9532592773437, "kl_loss_17": 179.3310974121094, "kl_loss_3": 2319.9205810546873, "kl_loss_6": 1483.8597900390625, "learning_rate": 0.0005883862070330078, "loss": 1164.0664, "step": 4490 }, { "ce_loss_12": 3.1951568722724915, "ce_loss_17": 3.0084994196891786, "ce_loss_23": 2.933244800567627, "ce_loss_3": 4.06550110578537, "ce_loss_6": 3.6232324600219727, "epoch": 0.45, "grad_norm": 1120.0, "kl_loss_12": 613.0902038574219, "kl_loss_17": 180.93068466186523, "kl_loss_3": 2407.7741455078126, "kl_loss_6": 1531.2658630371093, "learning_rate": 0.0005868240888334653, "loss": 1175.3828, "step": 4500 }, { "ce_loss_12": 3.0894825577735903, "ce_loss_17": 2.90518798828125, "ce_loss_23": 2.8252994894981383, "ce_loss_3": 3.989874315261841, "ce_loss_6": 3.528348672389984, "epoch": 0.451, "grad_norm": 1004.0, "kl_loss_12": 614.2226287841797, "kl_loss_17": 181.90268173217774, "kl_loss_3": 2449.4162780761717, "kl_loss_6": 1549.0628845214844, "learning_rate": 0.0005852610963163119, "loss": 1194.0572, "step": 4510 }, { "ce_loss_12": 3.1016993284225465, "ce_loss_17": 2.9186312317848206, "ce_loss_23": 2.845429253578186, "ce_loss_3": 3.9625542402267455, "ce_loss_6": 3.5307849884033202, "epoch": 0.452, "grad_norm": 1080.0, "kl_loss_12": 602.321940612793, "kl_loss_17": 176.20557403564453, "kl_loss_3": 2377.343701171875, "kl_loss_6": 1510.2603820800782, "learning_rate": 0.0005836972452208654, "loss": 1159.2561, "step": 4520 }, { "ce_loss_12": 3.1053978562355042, "ce_loss_17": 2.926951897144318, "ce_loss_23": 2.8549540877342223, "ce_loss_3": 3.992169404029846, "ce_loss_6": 3.5429995179176332, "epoch": 0.453, "grad_norm": 1184.0, "kl_loss_12": 605.2806594848632, "kl_loss_17": 180.5658836364746, "kl_loss_3": 2423.1808471679688, "kl_loss_6": 1526.9974060058594, "learning_rate": 0.0005821325512950885, "loss": 1183.519, "step": 4530 }, { "ce_loss_12": 3.1314645528793337, "ce_loss_17": 2.948435127735138, "ce_loss_23": 2.8767545342445375, "ce_loss_3": 3.9899188756942747, "ce_loss_6": 3.5530348300933836, "epoch": 0.454, "grad_norm": 1128.0, "kl_loss_12": 586.1095260620117, "kl_loss_17": 175.72238693237304, "kl_loss_3": 2341.41015625, "kl_loss_6": 1480.1607421875, "learning_rate": 0.0005805670302954321, "loss": 1169.1936, "step": 4540 }, { "ce_loss_12": 3.1308477878570558, "ce_loss_17": 2.9560258388519287, "ce_loss_23": 2.8843746423721313, "ce_loss_3": 3.9922935366630554, "ce_loss_6": 3.5547144174575807, "epoch": 0.455, "grad_norm": 1368.0, "kl_loss_12": 589.7868408203125, "kl_loss_17": 175.0533874511719, "kl_loss_3": 2359.759100341797, "kl_loss_6": 1493.0820190429688, "learning_rate": 0.000579000697986675, "loss": 1154.3197, "step": 4550 }, { "ce_loss_12": 3.106800544261932, "ce_loss_17": 2.9211928248405457, "ce_loss_23": 2.8407024145126343, "ce_loss_3": 4.007343494892121, "ce_loss_6": 3.560846769809723, "epoch": 0.456, "grad_norm": 992.0, "kl_loss_12": 625.1224670410156, "kl_loss_17": 182.62897796630858, "kl_loss_3": 2458.1799377441407, "kl_loss_6": 1573.9145263671876, "learning_rate": 0.0005774335701417662, "loss": 1183.7318, "step": 4560 }, { "ce_loss_12": 3.092662584781647, "ce_loss_17": 2.9104947686195373, "ce_loss_23": 2.8373430490493776, "ce_loss_3": 4.005108213424682, "ce_loss_6": 3.5392897725105286, "epoch": 0.457, "grad_norm": 1216.0, "kl_loss_12": 602.2989013671875, "kl_loss_17": 175.17914123535155, "kl_loss_3": 2467.407958984375, "kl_loss_6": 1549.7640441894532, "learning_rate": 0.0005758656625416658, "loss": 1184.0045, "step": 4570 }, { "ce_loss_12": 3.147735857963562, "ce_loss_17": 2.9694060325622558, "ce_loss_23": 2.8919848680496214, "ce_loss_3": 4.014414978027344, "ce_loss_6": 3.577460062503815, "epoch": 0.458, "grad_norm": 1296.0, "kl_loss_12": 607.9190521240234, "kl_loss_17": 181.15816345214844, "kl_loss_3": 2386.3705200195313, "kl_loss_6": 1522.2303894042968, "learning_rate": 0.0005742969909751859, "loss": 1162.5008, "step": 4580 }, { "ce_loss_12": 3.1607869029045106, "ce_loss_17": 2.977490282058716, "ce_loss_23": 2.902131140232086, "ce_loss_3": 4.032933211326599, "ce_loss_6": 3.5879942059516905, "epoch": 0.459, "grad_norm": 980.0, "kl_loss_12": 605.3396240234375, "kl_loss_17": 177.57530059814454, "kl_loss_3": 2404.7664184570312, "kl_loss_6": 1516.094061279297, "learning_rate": 0.0005727275712388318, "loss": 1182.758, "step": 4590 }, { "ce_loss_12": 3.164100635051727, "ce_loss_17": 2.9944262981414793, "ce_loss_23": 2.9201043009757996, "ce_loss_3": 4.0042870998382565, "ce_loss_6": 3.5822572112083435, "epoch": 0.46, "grad_norm": 1136.0, "kl_loss_12": 582.4186553955078, "kl_loss_17": 172.64603500366212, "kl_loss_3": 2319.947625732422, "kl_loss_6": 1475.0054870605468, "learning_rate": 0.0005711574191366427, "loss": 1154.0926, "step": 4600 }, { "ce_loss_12": 3.123515796661377, "ce_loss_17": 2.946636915206909, "ce_loss_23": 2.8758048892021177, "ce_loss_3": 3.9856918692588805, "ce_loss_6": 3.548724818229675, "epoch": 0.461, "grad_norm": 1208.0, "kl_loss_12": 592.2702346801758, "kl_loss_17": 173.98607940673827, "kl_loss_3": 2361.8337829589846, "kl_loss_6": 1486.2902221679688, "learning_rate": 0.0005695865504800327, "loss": 1150.3572, "step": 4610 }, { "ce_loss_12": 3.086802434921265, "ce_loss_17": 2.893968403339386, "ce_loss_23": 2.812371277809143, "ce_loss_3": 4.0360452890396115, "ce_loss_6": 3.561065137386322, "epoch": 0.462, "grad_norm": 1168.0, "kl_loss_12": 637.6817230224609, "kl_loss_17": 187.44516372680664, "kl_loss_3": 2576.360314941406, "kl_loss_6": 1633.4139465332032, "learning_rate": 0.0005680149810876322, "loss": 1208.0934, "step": 4620 }, { "ce_loss_12": 3.1244890332221984, "ce_loss_17": 2.949481213092804, "ce_loss_23": 2.8752119183540343, "ce_loss_3": 4.005128943920136, "ce_loss_6": 3.5534419894218443, "epoch": 0.463, "grad_norm": 1136.0, "kl_loss_12": 597.1886444091797, "kl_loss_17": 177.4503646850586, "kl_loss_3": 2404.171875, "kl_loss_6": 1515.5715576171874, "learning_rate": 0.0005664427267851271, "loss": 1167.7977, "step": 4630 }, { "ce_loss_12": 3.046623146533966, "ce_loss_17": 2.8676019072532655, "ce_loss_23": 2.7937828421592714, "ce_loss_3": 3.9279929876327513, "ce_loss_6": 3.4800993323326113, "epoch": 0.464, "grad_norm": 1144.0, "kl_loss_12": 589.9766616821289, "kl_loss_17": 173.2282585144043, "kl_loss_3": 2390.469445800781, "kl_loss_6": 1500.6643432617188, "learning_rate": 0.0005648698034051009, "loss": 1160.5701, "step": 4640 }, { "ce_loss_12": 3.152076518535614, "ce_loss_17": 2.969795060157776, "ce_loss_23": 2.892859363555908, "ce_loss_3": 4.063055515289307, "ce_loss_6": 3.595481789112091, "epoch": 0.465, "grad_norm": 1112.0, "kl_loss_12": 599.1829010009766, "kl_loss_17": 176.86735610961915, "kl_loss_3": 2457.741827392578, "kl_loss_6": 1536.1931945800782, "learning_rate": 0.0005632962267868747, "loss": 1166.2303, "step": 4650 }, { "ce_loss_12": 3.094632935523987, "ce_loss_17": 2.9150583505630494, "ce_loss_23": 2.844318723678589, "ce_loss_3": 3.9594098567962646, "ce_loss_6": 3.519077444076538, "epoch": 0.466, "grad_norm": 1024.0, "kl_loss_12": 585.1817733764649, "kl_loss_17": 171.51151428222656, "kl_loss_3": 2366.838446044922, "kl_loss_6": 1495.7054382324218, "learning_rate": 0.0005617220127763474, "loss": 1164.3539, "step": 4660 }, { "ce_loss_12": 3.169011449813843, "ce_loss_17": 2.991918349266052, "ce_loss_23": 2.9186757445335387, "ce_loss_3": 4.024391627311706, "ce_loss_6": 3.585629200935364, "epoch": 0.467, "grad_norm": 1272.0, "kl_loss_12": 592.2187469482421, "kl_loss_17": 175.8935432434082, "kl_loss_3": 2357.176135253906, "kl_loss_6": 1491.6616333007812, "learning_rate": 0.0005601471772258368, "loss": 1167.6615, "step": 4670 }, { "ce_loss_12": 3.1551905155181883, "ce_loss_17": 2.9784803986549377, "ce_loss_23": 2.9041765928268433, "ce_loss_3": 4.006201231479645, "ce_loss_6": 3.5734246969223022, "epoch": 0.468, "grad_norm": 1544.0, "kl_loss_12": 590.1953628540039, "kl_loss_17": 175.58686141967775, "kl_loss_3": 2324.540203857422, "kl_loss_6": 1472.1936279296874, "learning_rate": 0.0005585717359939192, "loss": 1169.4467, "step": 4680 }, { "ce_loss_12": 3.0665130019187927, "ce_loss_17": 2.8923226475715635, "ce_loss_23": 2.8199669241905214, "ce_loss_3": 3.920028805732727, "ce_loss_6": 3.4936899185180663, "epoch": 0.469, "grad_norm": 976.0, "kl_loss_12": 588.025910949707, "kl_loss_17": 172.29647674560547, "kl_loss_3": 2333.3386657714846, "kl_loss_6": 1484.1640563964843, "learning_rate": 0.0005569957049452703, "loss": 1175.3557, "step": 4690 }, { "ce_loss_12": 3.1306982755661013, "ce_loss_17": 2.9464347124099732, "ce_loss_23": 2.8718415260314942, "ce_loss_3": 4.01355744600296, "ce_loss_6": 3.5599818110466, "epoch": 0.47, "grad_norm": 1496.0, "kl_loss_12": 603.9425079345704, "kl_loss_17": 177.87748336791992, "kl_loss_3": 2412.801232910156, "kl_loss_6": 1522.6386840820312, "learning_rate": 0.0005554190999505056, "loss": 1183.6113, "step": 4700 }, { "ce_loss_12": 3.250467562675476, "ce_loss_17": 3.063236713409424, "ce_loss_23": 2.987007200717926, "ce_loss_3": 4.112039339542389, "ce_loss_6": 3.6746443629264833, "epoch": 0.471, "grad_norm": 1240.0, "kl_loss_12": 617.5975311279296, "kl_loss_17": 181.46721878051758, "kl_loss_3": 2398.5910400390626, "kl_loss_6": 1523.2029846191406, "learning_rate": 0.0005538419368860196, "loss": 1141.2441, "step": 4710 }, { "ce_loss_12": 3.16205176115036, "ce_loss_17": 2.9881635546684264, "ce_loss_23": 2.912904453277588, "ce_loss_3": 4.027502238750458, "ce_loss_6": 3.5856967091560366, "epoch": 0.472, "grad_norm": 988.0, "kl_loss_12": 595.6430084228516, "kl_loss_17": 177.5866683959961, "kl_loss_3": 2361.086083984375, "kl_loss_6": 1491.6454711914062, "learning_rate": 0.0005522642316338268, "loss": 1183.4592, "step": 4720 }, { "ce_loss_12": 3.1781874775886534, "ce_loss_17": 3.002910315990448, "ce_loss_23": 2.9332908749580384, "ce_loss_3": 4.03815621137619, "ce_loss_6": 3.591979217529297, "epoch": 0.473, "grad_norm": 1608.0, "kl_loss_12": 600.0981414794921, "kl_loss_17": 175.05057067871093, "kl_loss_3": 2363.9915405273437, "kl_loss_6": 1487.9787231445312, "learning_rate": 0.0005506860000814017, "loss": 1187.1951, "step": 4730 }, { "ce_loss_12": 3.192247247695923, "ce_loss_17": 3.020113730430603, "ce_loss_23": 2.951002764701843, "ce_loss_3": 4.026438915729523, "ce_loss_6": 3.601203989982605, "epoch": 0.474, "grad_norm": 1208.0, "kl_loss_12": 585.6193969726562, "kl_loss_17": 169.32979736328124, "kl_loss_3": 2318.0686279296874, "kl_loss_6": 1473.0704528808594, "learning_rate": 0.0005491072581215186, "loss": 1158.4854, "step": 4740 }, { "ce_loss_12": 3.195116567611694, "ce_loss_17": 3.014424669742584, "ce_loss_23": 2.9354434967041017, "ce_loss_3": 4.054661548137664, "ce_loss_6": 3.6271575689315796, "epoch": 0.475, "grad_norm": 1360.0, "kl_loss_12": 608.3226257324219, "kl_loss_17": 181.31658325195312, "kl_loss_3": 2394.527795410156, "kl_loss_6": 1526.428887939453, "learning_rate": 0.0005475280216520913, "loss": 1152.4799, "step": 4750 }, { "ce_loss_12": 3.118545651435852, "ce_loss_17": 2.9431566119194033, "ce_loss_23": 2.870669364929199, "ce_loss_3": 3.9644595265388487, "ce_loss_6": 3.5394273519515993, "epoch": 0.476, "grad_norm": 1184.0, "kl_loss_12": 583.4711624145508, "kl_loss_17": 170.9568717956543, "kl_loss_3": 2322.106579589844, "kl_loss_6": 1471.2399169921875, "learning_rate": 0.0005459483065760138, "loss": 1175.8216, "step": 4760 }, { "ce_loss_12": 3.0740485072135924, "ce_loss_17": 2.8895705699920655, "ce_loss_23": 2.817665231227875, "ce_loss_3": 3.995547378063202, "ce_loss_6": 3.5293294548988343, "epoch": 0.477, "grad_norm": 1056.0, "kl_loss_12": 604.4830047607422, "kl_loss_17": 173.51560821533204, "kl_loss_3": 2491.076806640625, "kl_loss_6": 1575.8518920898437, "learning_rate": 0.0005443681288009991, "loss": 1182.1915, "step": 4770 }, { "ce_loss_12": 3.1086962938308718, "ce_loss_17": 2.9318193912506105, "ce_loss_23": 2.8591856479644777, "ce_loss_3": 3.9815015316009523, "ce_loss_6": 3.542095589637756, "epoch": 0.478, "grad_norm": 1072.0, "kl_loss_12": 587.2374114990234, "kl_loss_17": 174.34748458862305, "kl_loss_3": 2389.9313842773436, "kl_loss_6": 1511.5686096191407, "learning_rate": 0.0005427875042394199, "loss": 1168.9535, "step": 4780 }, { "ce_loss_12": 3.155102550983429, "ce_loss_17": 2.9787638783454895, "ce_loss_23": 2.9013282537460325, "ce_loss_3": 4.002556395530701, "ce_loss_6": 3.5739779591560366, "epoch": 0.479, "grad_norm": 1004.0, "kl_loss_12": 596.6852615356445, "kl_loss_17": 181.65953826904297, "kl_loss_3": 2330.943408203125, "kl_loss_6": 1494.3890380859375, "learning_rate": 0.0005412064488081482, "loss": 1173.9174, "step": 4790 }, { "ce_loss_12": 3.137504005432129, "ce_loss_17": 2.96456116437912, "ce_loss_23": 2.890325403213501, "ce_loss_3": 3.9840017795562743, "ce_loss_6": 3.5466713428497316, "epoch": 0.48, "grad_norm": 1024.0, "kl_loss_12": 574.2095184326172, "kl_loss_17": 170.73358688354492, "kl_loss_3": 2306.77890625, "kl_loss_6": 1446.0272033691406, "learning_rate": 0.0005396249784283942, "loss": 1137.2696, "step": 4800 }, { "ce_loss_12": 3.163385605812073, "ce_loss_17": 2.9815895676612856, "ce_loss_23": 2.905221951007843, "ce_loss_3": 4.061469888687133, "ce_loss_6": 3.610790729522705, "epoch": 0.481, "grad_norm": 1160.0, "kl_loss_12": 609.4963623046875, "kl_loss_17": 179.51625061035156, "kl_loss_3": 2454.91064453125, "kl_loss_6": 1549.9529907226563, "learning_rate": 0.0005380431090255476, "loss": 1187.0682, "step": 4810 }, { "ce_loss_12": 3.153098165988922, "ce_loss_17": 2.9833488702774047, "ce_loss_23": 2.914642608165741, "ce_loss_3": 3.994583249092102, "ce_loss_6": 3.5685108065605164, "epoch": 0.482, "grad_norm": 1144.0, "kl_loss_12": 570.3982986450195, "kl_loss_17": 167.1259292602539, "kl_loss_3": 2317.8790771484373, "kl_loss_6": 1466.2108764648438, "learning_rate": 0.0005364608565290155, "loss": 1139.4896, "step": 4820 }, { "ce_loss_12": 3.1717878460884092, "ce_loss_17": 2.993881106376648, "ce_loss_23": 2.92106374502182, "ce_loss_3": 4.0339394330978395, "ce_loss_6": 3.5958771109580994, "epoch": 0.483, "grad_norm": 1096.0, "kl_loss_12": 596.1869064331055, "kl_loss_17": 176.21066665649414, "kl_loss_3": 2377.3220947265627, "kl_loss_6": 1508.0138366699218, "learning_rate": 0.0005348782368720626, "loss": 1163.1882, "step": 4830 }, { "ce_loss_12": 3.1062735438346865, "ce_loss_17": 2.9300807118415833, "ce_loss_23": 2.859765887260437, "ce_loss_3": 3.9669827103614805, "ce_loss_6": 3.522811996936798, "epoch": 0.484, "grad_norm": 1352.0, "kl_loss_12": 577.6051483154297, "kl_loss_17": 169.41027145385743, "kl_loss_3": 2328.596221923828, "kl_loss_6": 1461.5553283691406, "learning_rate": 0.000533295265991652, "loss": 1155.1955, "step": 4840 }, { "ce_loss_12": 3.1678555846214294, "ce_loss_17": 2.9899966955184936, "ce_loss_23": 2.9158806920051576, "ce_loss_3": 4.013157033920288, "ce_loss_6": 3.5860086560249327, "epoch": 0.485, "grad_norm": 1056.0, "kl_loss_12": 585.3820983886719, "kl_loss_17": 172.69294281005858, "kl_loss_3": 2317.670733642578, "kl_loss_6": 1474.7983093261719, "learning_rate": 0.0005317119598282822, "loss": 1139.6301, "step": 4850 }, { "ce_loss_12": 3.1802164673805238, "ce_loss_17": 2.9998749136924743, "ce_loss_23": 2.92397803068161, "ce_loss_3": 4.0372141122817995, "ce_loss_6": 3.6036622881889344, "epoch": 0.486, "grad_norm": 1328.0, "kl_loss_12": 596.8292175292969, "kl_loss_17": 175.77464294433594, "kl_loss_3": 2348.7905883789062, "kl_loss_6": 1489.0216064453125, "learning_rate": 0.0005301283343258293, "loss": 1153.5375, "step": 4860 }, { "ce_loss_12": 3.222676360607147, "ce_loss_17": 3.051359176635742, "ce_loss_23": 2.9793429374694824, "ce_loss_3": 4.057096111774444, "ce_loss_6": 3.638009774684906, "epoch": 0.487, "grad_norm": 1392.0, "kl_loss_12": 589.1835266113281, "kl_loss_17": 175.26502990722656, "kl_loss_3": 2319.251104736328, "kl_loss_6": 1483.1206115722657, "learning_rate": 0.000528544405431384, "loss": 1137.7982, "step": 4870 }, { "ce_loss_12": 3.1201239466667174, "ce_loss_17": 2.937250566482544, "ce_loss_23": 2.859792160987854, "ce_loss_3": 3.985008120536804, "ce_loss_6": 3.552453351020813, "epoch": 0.488, "grad_norm": 1040.0, "kl_loss_12": 609.219938659668, "kl_loss_17": 179.44284210205078, "kl_loss_3": 2386.5765075683594, "kl_loss_6": 1525.4093200683594, "learning_rate": 0.000526960189095093, "loss": 1172.7885, "step": 4880 }, { "ce_loss_12": 3.1049619555473327, "ce_loss_17": 2.9306660652160645, "ce_loss_23": 2.856823241710663, "ce_loss_3": 3.955999433994293, "ce_loss_6": 3.5207136631011964, "epoch": 0.489, "grad_norm": 1224.0, "kl_loss_12": 582.8979614257812, "kl_loss_17": 172.3718475341797, "kl_loss_3": 2330.7440551757813, "kl_loss_6": 1474.2697814941407, "learning_rate": 0.0005253757012699972, "loss": 1146.8992, "step": 4890 }, { "ce_loss_12": 3.1677612900733947, "ce_loss_17": 2.996410632133484, "ce_loss_23": 2.924814796447754, "ce_loss_3": 4.016883933544159, "ce_loss_6": 3.586358439922333, "epoch": 0.49, "grad_norm": 1112.0, "kl_loss_12": 588.945590209961, "kl_loss_17": 173.74278259277344, "kl_loss_3": 2331.7134399414062, "kl_loss_6": 1481.8559936523438, "learning_rate": 0.0005237909579118712, "loss": 1164.1603, "step": 4900 }, { "ce_loss_12": 3.14421226978302, "ce_loss_17": 2.9585715532302856, "ce_loss_23": 2.8813907504081726, "ce_loss_3": 4.029925990104675, "ce_loss_6": 3.5790133357048033, "epoch": 0.491, "grad_norm": 1144.0, "kl_loss_12": 604.090365600586, "kl_loss_17": 180.36501693725586, "kl_loss_3": 2419.7515502929687, "kl_loss_6": 1519.3355224609375, "learning_rate": 0.0005222059749790631, "loss": 1171.4746, "step": 4910 }, { "ce_loss_12": 3.193829929828644, "ce_loss_17": 3.0222328066825868, "ce_loss_23": 2.949049413204193, "ce_loss_3": 4.016946339607239, "ce_loss_6": 3.588582158088684, "epoch": 0.492, "grad_norm": 1020.0, "kl_loss_12": 575.816943359375, "kl_loss_17": 169.9915428161621, "kl_loss_3": 2285.2912475585936, "kl_loss_6": 1437.871759033203, "learning_rate": 0.0005206207684323337, "loss": 1119.0843, "step": 4920 }, { "ce_loss_12": 3.176484978199005, "ce_loss_17": 3.0019878506660462, "ce_loss_23": 2.930724596977234, "ce_loss_3": 4.027780544757843, "ce_loss_6": 3.5989270567893983, "epoch": 0.493, "grad_norm": 1360.0, "kl_loss_12": 596.4255554199219, "kl_loss_17": 177.78146667480468, "kl_loss_3": 2352.6630004882813, "kl_loss_6": 1504.6399047851562, "learning_rate": 0.000519035354234695, "loss": 1173.89, "step": 4930 }, { "ce_loss_12": 3.1620043873786927, "ce_loss_17": 2.9795578479766847, "ce_loss_23": 2.8983842492103578, "ce_loss_3": 4.012789058685303, "ce_loss_6": 3.590153527259827, "epoch": 0.494, "grad_norm": 1528.0, "kl_loss_12": 599.5278976440429, "kl_loss_17": 179.3766830444336, "kl_loss_3": 2332.3432373046876, "kl_loss_6": 1490.5878845214843, "learning_rate": 0.0005174497483512506, "loss": 1136.2885, "step": 4940 }, { "ce_loss_12": 3.1927493691444395, "ce_loss_17": 3.023730146884918, "ce_loss_23": 2.9540415048599242, "ce_loss_3": 4.028357303142547, "ce_loss_6": 3.6044982194900514, "epoch": 0.495, "grad_norm": 1168.0, "kl_loss_12": 581.2431701660156, "kl_loss_17": 170.51922912597655, "kl_loss_3": 2321.1480102539062, "kl_loss_6": 1473.8497741699218, "learning_rate": 0.0005158639667490339, "loss": 1163.9057, "step": 4950 }, { "ce_loss_12": 3.117106306552887, "ce_loss_17": 2.938692343235016, "ce_loss_23": 2.864122462272644, "ce_loss_3": 3.9715336084365847, "ce_loss_6": 3.5310477137565615, "epoch": 0.496, "grad_norm": 900.0, "kl_loss_12": 589.8403610229492, "kl_loss_17": 173.67026596069337, "kl_loss_3": 2349.2982543945313, "kl_loss_6": 1489.48037109375, "learning_rate": 0.0005142780253968481, "loss": 1152.0074, "step": 4960 }, { "ce_loss_12": 3.0620609402656553, "ce_loss_17": 2.8893876552581785, "ce_loss_23": 2.821596598625183, "ce_loss_3": 3.9072123169898987, "ce_loss_6": 3.47282874584198, "epoch": 0.497, "grad_norm": 1112.0, "kl_loss_12": 571.6155212402343, "kl_loss_17": 168.01318740844727, "kl_loss_3": 2300.569201660156, "kl_loss_6": 1443.906103515625, "learning_rate": 0.0005126919402651053, "loss": 1116.5506, "step": 4970 }, { "ce_loss_12": 3.1318010568618773, "ce_loss_17": 2.952656900882721, "ce_loss_23": 2.8750259399414064, "ce_loss_3": 4.002028000354767, "ce_loss_6": 3.563919460773468, "epoch": 0.498, "grad_norm": 1048.0, "kl_loss_12": 600.1055236816406, "kl_loss_17": 179.3028305053711, "kl_loss_3": 2363.5748291015625, "kl_loss_6": 1505.253466796875, "learning_rate": 0.0005111057273256647, "loss": 1161.7385, "step": 4980 }, { "ce_loss_12": 3.2104085087776184, "ce_loss_17": 3.0453226447105406, "ce_loss_23": 2.9771073341369627, "ce_loss_3": 4.010699796676636, "ce_loss_6": 3.5954882144927978, "epoch": 0.499, "grad_norm": 1024.0, "kl_loss_12": 558.9208541870117, "kl_loss_17": 164.41656036376952, "kl_loss_3": 2212.5747131347657, "kl_loss_6": 1405.6594848632812, "learning_rate": 0.0005095194025516733, "loss": 1106.8965, "step": 4990 }, { "ce_loss_12": 3.147287356853485, "ce_loss_17": 2.978268301486969, "ce_loss_23": 2.912423849105835, "ce_loss_3": 3.9957066416740417, "ce_loss_6": 3.555610489845276, "epoch": 0.5, "grad_norm": 1216.0, "kl_loss_12": 573.2591278076172, "kl_loss_17": 166.6346778869629, "kl_loss_3": 2320.488146972656, "kl_loss_6": 1453.042413330078, "learning_rate": 0.000507932981917404, "loss": 1167.6072, "step": 5000 }, { "ce_loss_12": 3.1100293159484864, "ce_loss_17": 2.9278237581253053, "ce_loss_23": 2.8520235896110533, "ce_loss_3": 4.011444246768951, "ce_loss_6": 3.5609185338020324, "epoch": 0.501, "grad_norm": 1012.0, "kl_loss_12": 614.2513397216796, "kl_loss_17": 179.44054489135743, "kl_loss_3": 2452.9594482421876, "kl_loss_6": 1557.2860961914062, "learning_rate": 0.0005063464813980949, "loss": 1193.0545, "step": 5010 }, { "ce_loss_12": 3.085975396633148, "ce_loss_17": 2.9166126370429994, "ce_loss_23": 2.843967413902283, "ce_loss_3": 3.957671511173248, "ce_loss_6": 3.5102660059928894, "epoch": 0.502, "grad_norm": 1192.0, "kl_loss_12": 585.8398132324219, "kl_loss_17": 171.25574417114257, "kl_loss_3": 2375.1840698242186, "kl_loss_6": 1501.98916015625, "learning_rate": 0.0005047599169697884, "loss": 1148.2912, "step": 5020 }, { "ce_loss_12": 3.046398901939392, "ce_loss_17": 2.8624019742012026, "ce_loss_23": 2.786366331577301, "ce_loss_3": 3.915739905834198, "ce_loss_6": 3.469613456726074, "epoch": 0.503, "grad_norm": 1360.0, "kl_loss_12": 593.1336456298828, "kl_loss_17": 174.98776779174804, "kl_loss_3": 2362.1496337890626, "kl_loss_6": 1487.0949829101562, "learning_rate": 0.000503173304609171, "loss": 1130.8209, "step": 5030 }, { "ce_loss_12": 3.147076654434204, "ce_loss_17": 2.973100447654724, "ce_loss_23": 2.8968443393707277, "ce_loss_3": 4.002816176414489, "ce_loss_6": 3.5767434000968934, "epoch": 0.504, "grad_norm": 1616.0, "kl_loss_12": 581.4584167480468, "kl_loss_17": 172.5049819946289, "kl_loss_3": 2333.448352050781, "kl_loss_6": 1488.5466552734374, "learning_rate": 0.0005015866602934111, "loss": 1128.6855, "step": 5040 }, { "ce_loss_12": 3.1321513056755066, "ce_loss_17": 2.9529102206230164, "ce_loss_23": 2.874933648109436, "ce_loss_3": 4.0189503788948056, "ce_loss_6": 3.5767377614974976, "epoch": 0.505, "grad_norm": 1048.0, "kl_loss_12": 617.4561584472656, "kl_loss_17": 184.7288917541504, "kl_loss_3": 2410.4678466796877, "kl_loss_6": 1548.379412841797, "learning_rate": 0.0005, "loss": 1167.1738, "step": 5050 }, { "ce_loss_12": 3.1210977911949156, "ce_loss_17": 2.9430198550224302, "ce_loss_23": 2.868890976905823, "ce_loss_3": 3.9791624903678895, "ce_loss_6": 3.5414319157600405, "epoch": 0.506, "grad_norm": 1464.0, "kl_loss_12": 598.0599334716796, "kl_loss_17": 179.60112915039062, "kl_loss_3": 2336.926403808594, "kl_loss_6": 1482.9856384277343, "learning_rate": 0.0004984133397065889, "loss": 1135.6876, "step": 5060 }, { "ce_loss_12": 3.1238099575042724, "ce_loss_17": 2.947644829750061, "ce_loss_23": 2.8705275416374207, "ce_loss_3": 4.00325483083725, "ce_loss_6": 3.5608957290649412, "epoch": 0.507, "grad_norm": 1648.0, "kl_loss_12": 594.76513671875, "kl_loss_17": 175.90363845825195, "kl_loss_3": 2370.6013061523436, "kl_loss_6": 1506.6740661621093, "learning_rate": 0.0004968266953908291, "loss": 1140.9098, "step": 5070 }, { "ce_loss_12": 3.1559859991073607, "ce_loss_17": 2.9826855659484863, "ce_loss_23": 2.911471796035767, "ce_loss_3": 4.028853631019592, "ce_loss_6": 3.580644130706787, "epoch": 0.508, "grad_norm": 1224.0, "kl_loss_12": 580.6352752685547, "kl_loss_17": 168.34320297241212, "kl_loss_3": 2376.8561279296873, "kl_loss_6": 1492.9738037109375, "learning_rate": 0.0004952400830302117, "loss": 1150.0828, "step": 5080 }, { "ce_loss_12": 3.0969050884246827, "ce_loss_17": 2.9111073136329653, "ce_loss_23": 2.83800984621048, "ce_loss_3": 3.978143048286438, "ce_loss_6": 3.5241355776786802, "epoch": 0.509, "grad_norm": 1448.0, "kl_loss_12": 597.989111328125, "kl_loss_17": 174.9261489868164, "kl_loss_3": 2389.203204345703, "kl_loss_6": 1511.5739440917969, "learning_rate": 0.0004936535186019053, "loss": 1151.776, "step": 5090 }, { "ce_loss_12": 3.176733374595642, "ce_loss_17": 3.0086188673973084, "ce_loss_23": 2.9422256112098695, "ce_loss_3": 4.0128894448280334, "ce_loss_6": 3.5810292601585387, "epoch": 0.51, "grad_norm": 980.0, "kl_loss_12": 567.3318069458007, "kl_loss_17": 166.57700958251954, "kl_loss_3": 2283.7829772949217, "kl_loss_6": 1431.895245361328, "learning_rate": 0.000492067018082596, "loss": 1129.6188, "step": 5100 }, { "ce_loss_12": 3.1334537506103515, "ce_loss_17": 2.9527031540870667, "ce_loss_23": 2.875639271736145, "ce_loss_3": 4.036728489398956, "ce_loss_6": 3.573836934566498, "epoch": 0.511, "grad_norm": 1312.0, "kl_loss_12": 607.42119140625, "kl_loss_17": 177.14556655883788, "kl_loss_3": 2447.327471923828, "kl_loss_6": 1535.374688720703, "learning_rate": 0.0004904805974483267, "loss": 1192.1066, "step": 5110 }, { "ce_loss_12": 3.2413255333900453, "ce_loss_17": 3.0535327911376955, "ce_loss_23": 2.9726045727729797, "ce_loss_3": 4.114557325839996, "ce_loss_6": 3.683498430252075, "epoch": 0.512, "grad_norm": 1080.0, "kl_loss_12": 630.6290008544922, "kl_loss_17": 189.75421066284179, "kl_loss_3": 2422.121484375, "kl_loss_6": 1566.5547302246093, "learning_rate": 0.0004888942726743353, "loss": 1213.6861, "step": 5120 }, { "ce_loss_12": 3.110649573802948, "ce_loss_17": 2.9337003946304323, "ce_loss_23": 2.859462559223175, "ce_loss_3": 3.9760597348213196, "ce_loss_6": 3.540888249874115, "epoch": 0.513, "grad_norm": 1096.0, "kl_loss_12": 596.3653182983398, "kl_loss_17": 175.1172088623047, "kl_loss_3": 2394.8271545410157, "kl_loss_6": 1516.221905517578, "learning_rate": 0.0004873080597348947, "loss": 1171.3652, "step": 5130 }, { "ce_loss_12": 3.0032384753227235, "ce_loss_17": 2.8270904660224914, "ce_loss_23": 2.7542188584804537, "ce_loss_3": 3.9286187887191772, "ce_loss_6": 3.4638774037361144, "epoch": 0.514, "grad_norm": 940.0, "kl_loss_12": 594.5371871948242, "kl_loss_17": 171.3603889465332, "kl_loss_3": 2475.858020019531, "kl_loss_6": 1556.1022399902345, "learning_rate": 0.0004857219746031519, "loss": 1174.0678, "step": 5140 }, { "ce_loss_12": 3.165748357772827, "ce_loss_17": 2.99380042552948, "ce_loss_23": 2.9208311080932616, "ce_loss_3": 4.014376854896545, "ce_loss_6": 3.5815898060798643, "epoch": 0.515, "grad_norm": 1168.0, "kl_loss_12": 583.4323303222657, "kl_loss_17": 172.83356170654298, "kl_loss_3": 2324.2773193359376, "kl_loss_6": 1464.8506713867187, "learning_rate": 0.0004841360332509663, "loss": 1146.9793, "step": 5150 }, { "ce_loss_12": 3.117990791797638, "ce_loss_17": 2.949844980239868, "ce_loss_23": 2.8793536186218263, "ce_loss_3": 3.9670523524284365, "ce_loss_6": 3.5323256254196167, "epoch": 0.516, "grad_norm": 1120.0, "kl_loss_12": 573.2609649658203, "kl_loss_17": 166.92510604858398, "kl_loss_3": 2310.5695861816407, "kl_loss_6": 1455.3383422851562, "learning_rate": 0.0004825502516487497, "loss": 1099.3861, "step": 5160 }, { "ce_loss_12": 3.0859371185302735, "ce_loss_17": 2.912708246707916, "ce_loss_23": 2.844784665107727, "ce_loss_3": 3.958569324016571, "ce_loss_6": 3.5233945488929748, "epoch": 0.517, "grad_norm": 1328.0, "kl_loss_12": 588.0229721069336, "kl_loss_17": 169.20474548339843, "kl_loss_3": 2382.887353515625, "kl_loss_6": 1513.6566467285156, "learning_rate": 0.00048096464576530507, "loss": 1167.3519, "step": 5170 }, { "ce_loss_12": 3.184209370613098, "ce_loss_17": 3.0182522535324097, "ce_loss_23": 2.9456828832626343, "ce_loss_3": 3.9989057421684264, "ce_loss_6": 3.581149864196777, "epoch": 0.518, "grad_norm": 896.0, "kl_loss_12": 577.0042404174804, "kl_loss_17": 171.46255950927736, "kl_loss_3": 2268.005517578125, "kl_loss_6": 1434.8917175292968, "learning_rate": 0.00047937923156766646, "loss": 1119.1788, "step": 5180 }, { "ce_loss_12": 3.2192662239074705, "ce_loss_17": 3.0561550140380858, "ce_loss_23": 2.985294485092163, "ce_loss_3": 4.027104759216309, "ce_loss_6": 3.6101157307624816, "epoch": 0.519, "grad_norm": 1096.0, "kl_loss_12": 577.4416137695313, "kl_loss_17": 172.38313217163085, "kl_loss_3": 2269.985217285156, "kl_loss_6": 1437.1518981933593, "learning_rate": 0.00047779402502093696, "loss": 1125.842, "step": 5190 }, { "ce_loss_12": 3.1905157446861265, "ce_loss_17": 3.0203575253486634, "ce_loss_23": 2.9477104902267457, "ce_loss_3": 4.036243295669555, "ce_loss_6": 3.6043622374534605, "epoch": 0.52, "grad_norm": 1328.0, "kl_loss_12": 580.0230865478516, "kl_loss_17": 171.161181640625, "kl_loss_3": 2317.1341552734375, "kl_loss_6": 1455.8950622558593, "learning_rate": 0.0004762090420881289, "loss": 1141.4177, "step": 5200 }, { "ce_loss_12": 3.108722817897797, "ce_loss_17": 2.93821382522583, "ce_loss_23": 2.8691662311553956, "ce_loss_3": 3.9434558391571044, "ce_loss_6": 3.522480773925781, "epoch": 0.521, "grad_norm": 988.0, "kl_loss_12": 578.2749633789062, "kl_loss_17": 170.9490005493164, "kl_loss_3": 2295.4980346679686, "kl_loss_6": 1458.482110595703, "learning_rate": 0.00047462429873000296, "loss": 1115.8973, "step": 5210 }, { "ce_loss_12": 3.1941396236419677, "ce_loss_17": 3.0241977334022523, "ce_loss_23": 2.950813615322113, "ce_loss_3": 4.025123035907745, "ce_loss_6": 3.5928640723228455, "epoch": 0.522, "grad_norm": 1160.0, "kl_loss_12": 580.0771759033203, "kl_loss_17": 175.7025573730469, "kl_loss_3": 2306.2788818359377, "kl_loss_6": 1451.9521911621093, "learning_rate": 0.0004730398109049071, "loss": 1124.7129, "step": 5220 }, { "ce_loss_12": 3.1333670258522033, "ce_loss_17": 2.9486419200897216, "ce_loss_23": 2.873242676258087, "ce_loss_3": 4.016930389404297, "ce_loss_6": 3.578941988945007, "epoch": 0.523, "grad_norm": 1224.0, "kl_loss_12": 604.9253570556641, "kl_loss_17": 177.25698318481446, "kl_loss_3": 2420.9248046875, "kl_loss_6": 1552.7471984863282, "learning_rate": 0.000471455594568616, "loss": 1160.7181, "step": 5230 }, { "ce_loss_12": 3.1828583478927612, "ce_loss_17": 3.0149208068847657, "ce_loss_23": 2.9433709263801573, "ce_loss_3": 3.998889207839966, "ce_loss_6": 3.584208059310913, "epoch": 0.524, "grad_norm": 1552.0, "kl_loss_12": 578.4490600585938, "kl_loss_17": 172.18267364501952, "kl_loss_3": 2265.9357666015626, "kl_loss_6": 1439.397607421875, "learning_rate": 0.00046987166567417086, "loss": 1137.8832, "step": 5240 }, { "ce_loss_12": 3.116793179512024, "ce_loss_17": 2.94590482711792, "ce_loss_23": 2.8722461819648744, "ce_loss_3": 3.978662097454071, "ce_loss_6": 3.530458962917328, "epoch": 0.525, "grad_norm": 1424.0, "kl_loss_12": 577.2575607299805, "kl_loss_17": 168.89076232910156, "kl_loss_3": 2332.1568359375, "kl_loss_6": 1461.170635986328, "learning_rate": 0.00046828804017171776, "loss": 1109.5477, "step": 5250 }, { "ce_loss_12": 3.1586636900901794, "ce_loss_17": 2.9801255226135255, "ce_loss_23": 2.9030737400054933, "ce_loss_3": 4.046093094348907, "ce_loss_6": 3.5907180190086363, "epoch": 0.526, "grad_norm": 1024.0, "kl_loss_12": 591.604052734375, "kl_loss_17": 174.3595977783203, "kl_loss_3": 2382.9521240234376, "kl_loss_6": 1490.333349609375, "learning_rate": 0.00046670473400834805, "loss": 1162.6271, "step": 5260 }, { "ce_loss_12": 3.0928870677947997, "ce_loss_17": 2.925082004070282, "ce_loss_23": 2.854987645149231, "ce_loss_3": 3.9424633979797363, "ce_loss_6": 3.502853310108185, "epoch": 0.527, "grad_norm": 1216.0, "kl_loss_12": 568.4853057861328, "kl_loss_17": 167.60090255737305, "kl_loss_3": 2314.8370666503906, "kl_loss_6": 1446.1728820800781, "learning_rate": 0.00046512176312793734, "loss": 1163.3844, "step": 5270 }, { "ce_loss_12": 3.0925601005554197, "ce_loss_17": 2.9151213645935057, "ce_loss_23": 2.8395652532577516, "ce_loss_3": 3.941300642490387, "ce_loss_6": 3.505583441257477, "epoch": 0.528, "grad_norm": 1456.0, "kl_loss_12": 583.0032562255859, "kl_loss_17": 172.08705215454103, "kl_loss_3": 2332.751837158203, "kl_loss_6": 1471.8856689453125, "learning_rate": 0.00046353914347098467, "loss": 1150.2659, "step": 5280 }, { "ce_loss_12": 3.184078550338745, "ce_loss_17": 3.0123846530914307, "ce_loss_23": 2.9371328949928284, "ce_loss_3": 4.0311295747756954, "ce_loss_6": 3.599526607990265, "epoch": 0.529, "grad_norm": 1368.0, "kl_loss_12": 571.3340240478516, "kl_loss_17": 170.81220932006835, "kl_loss_3": 2306.427099609375, "kl_loss_6": 1454.4433288574219, "learning_rate": 0.0004619568909744524, "loss": 1149.8136, "step": 5290 }, { "ce_loss_12": 3.1852566242218017, "ce_loss_17": 3.01423898935318, "ce_loss_23": 2.941882300376892, "ce_loss_3": 4.019960188865662, "ce_loss_6": 3.5935036301612855, "epoch": 0.53, "grad_norm": 1344.0, "kl_loss_12": 579.0866683959961, "kl_loss_17": 172.10766830444337, "kl_loss_3": 2303.6955017089845, "kl_loss_6": 1460.816339111328, "learning_rate": 0.00046037502157160573, "loss": 1143.9945, "step": 5300 }, { "ce_loss_12": 3.0746464490890504, "ce_loss_17": 2.8974472761154173, "ce_loss_23": 2.8233888030052183, "ce_loss_3": 3.9274016857147216, "ce_loss_6": 3.4863208651542665, "epoch": 0.531, "grad_norm": 1080.0, "kl_loss_12": 584.6296173095703, "kl_loss_17": 173.56878509521485, "kl_loss_3": 2337.2600219726564, "kl_loss_6": 1467.9676696777344, "learning_rate": 0.00045879355119185207, "loss": 1147.8285, "step": 5310 }, { "ce_loss_12": 3.156270945072174, "ce_loss_17": 2.97952960729599, "ce_loss_23": 2.908063507080078, "ce_loss_3": 4.009477806091309, "ce_loss_6": 3.5802698493003846, "epoch": 0.532, "grad_norm": 1004.0, "kl_loss_12": 599.6850769042969, "kl_loss_17": 175.448974609375, "kl_loss_3": 2376.53125, "kl_loss_6": 1506.9073852539063, "learning_rate": 0.0004572124957605803, "loss": 1167.7115, "step": 5320 }, { "ce_loss_12": 3.164023780822754, "ce_loss_17": 2.985978841781616, "ce_loss_23": 2.910122108459473, "ce_loss_3": 4.010115242004394, "ce_loss_6": 3.579763102531433, "epoch": 0.533, "grad_norm": 1088.0, "kl_loss_12": 591.7021179199219, "kl_loss_17": 174.22025375366212, "kl_loss_3": 2341.58857421875, "kl_loss_6": 1477.200946044922, "learning_rate": 0.00045563187119900103, "loss": 1132.1896, "step": 5330 }, { "ce_loss_12": 3.0149919986724854, "ce_loss_17": 2.8391488671302794, "ce_loss_23": 2.7716471910476685, "ce_loss_3": 3.897192454338074, "ce_loss_6": 3.450315809249878, "epoch": 0.534, "grad_norm": 1736.0, "kl_loss_12": 587.5868286132812, "kl_loss_17": 170.40618972778321, "kl_loss_3": 2397.2179321289063, "kl_loss_6": 1504.325732421875, "learning_rate": 0.00045405169342398633, "loss": 1159.4494, "step": 5340 }, { "ce_loss_12": 3.109479343891144, "ce_loss_17": 2.931243908405304, "ce_loss_23": 2.8559011697769163, "ce_loss_3": 3.977936267852783, "ce_loss_6": 3.5353504419326782, "epoch": 0.535, "grad_norm": 956.0, "kl_loss_12": 595.8931518554688, "kl_loss_17": 176.46972579956054, "kl_loss_3": 2383.1471923828126, "kl_loss_6": 1504.8654907226562, "learning_rate": 0.0004524719783479088, "loss": 1136.5393, "step": 5350 }, { "ce_loss_12": 3.063349151611328, "ce_loss_17": 2.883927345275879, "ce_loss_23": 2.808227801322937, "ce_loss_3": 3.9513380885124207, "ce_loss_6": 3.503880572319031, "epoch": 0.536, "grad_norm": 908.0, "kl_loss_12": 596.9293487548828, "kl_loss_17": 176.37847824096679, "kl_loss_3": 2422.632647705078, "kl_loss_6": 1518.7975463867188, "learning_rate": 0.00045089274187848144, "loss": 1142.9058, "step": 5360 }, { "ce_loss_12": 3.1649566173553465, "ce_loss_17": 2.9962420225143434, "ce_loss_23": 2.925772321224213, "ce_loss_3": 4.012749242782593, "ce_loss_6": 3.575718033313751, "epoch": 0.537, "grad_norm": 1328.0, "kl_loss_12": 578.8294387817383, "kl_loss_17": 170.3518814086914, "kl_loss_3": 2328.4602294921874, "kl_loss_6": 1458.3095703125, "learning_rate": 0.00044931399991859835, "loss": 1125.4836, "step": 5370 }, { "ce_loss_12": 3.0323414325714113, "ce_loss_17": 2.859511160850525, "ce_loss_23": 2.7885570168495177, "ce_loss_3": 3.887762427330017, "ce_loss_6": 3.447388708591461, "epoch": 0.538, "grad_norm": 1120.0, "kl_loss_12": 576.0092010498047, "kl_loss_17": 168.75234375, "kl_loss_3": 2343.1920837402345, "kl_loss_6": 1467.7802734375, "learning_rate": 0.00044773576836617336, "loss": 1126.0914, "step": 5380 }, { "ce_loss_12": 3.132502889633179, "ce_loss_17": 2.9554927587509154, "ce_loss_23": 2.879656362533569, "ce_loss_3": 3.9968435287475588, "ce_loss_6": 3.5616152048110963, "epoch": 0.539, "grad_norm": 1240.0, "kl_loss_12": 592.6437683105469, "kl_loss_17": 173.54042358398436, "kl_loss_3": 2369.021044921875, "kl_loss_6": 1508.1837890625, "learning_rate": 0.00044615806311398056, "loss": 1172.1301, "step": 5390 }, { "ce_loss_12": 3.1875888228416445, "ce_loss_17": 3.021982192993164, "ce_loss_23": 2.953292524814606, "ce_loss_3": 3.975070667266846, "ce_loss_6": 3.5754401087760925, "epoch": 0.54, "grad_norm": 1056.0, "kl_loss_12": 564.4381713867188, "kl_loss_17": 165.4364440917969, "kl_loss_3": 2212.189373779297, "kl_loss_6": 1405.8132629394531, "learning_rate": 0.00044458090004949454, "loss": 1126.4178, "step": 5400 }, { "ce_loss_12": 3.0834463357925417, "ce_loss_17": 2.896428346633911, "ce_loss_23": 2.8172748923301696, "ce_loss_3": 3.983506464958191, "ce_loss_6": 3.5301677942276, "epoch": 0.541, "grad_norm": 1216.0, "kl_loss_12": 609.4205612182617, "kl_loss_17": 180.6081115722656, "kl_loss_3": 2462.3694763183594, "kl_loss_6": 1565.2915832519532, "learning_rate": 0.0004430042950547297, "loss": 1158.1969, "step": 5410 }, { "ce_loss_12": 3.1545743107795716, "ce_loss_17": 2.9766462206840516, "ce_loss_23": 2.9003338694572447, "ce_loss_3": 4.019918143749237, "ce_loss_6": 3.5826845288276674, "epoch": 0.542, "grad_norm": 900.0, "kl_loss_12": 599.8469680786133, "kl_loss_17": 179.4541229248047, "kl_loss_3": 2381.429577636719, "kl_loss_6": 1498.4056457519532, "learning_rate": 0.0004414282640060809, "loss": 1150.7848, "step": 5420 }, { "ce_loss_12": 3.228876233100891, "ce_loss_17": 3.0594538927078245, "ce_loss_23": 2.984146475791931, "ce_loss_3": 4.057979154586792, "ce_loss_6": 3.6407159090042116, "epoch": 0.543, "grad_norm": 1176.0, "kl_loss_12": 575.4662200927735, "kl_loss_17": 173.09854583740236, "kl_loss_3": 2263.767956542969, "kl_loss_6": 1437.279278564453, "learning_rate": 0.0004398528227741633, "loss": 1116.2201, "step": 5430 }, { "ce_loss_12": 3.1138192772865296, "ce_loss_17": 2.933781898021698, "ce_loss_23": 2.8625401735305784, "ce_loss_3": 3.978166365623474, "ce_loss_6": 3.544347071647644, "epoch": 0.544, "grad_norm": 1368.0, "kl_loss_12": 581.6029159545899, "kl_loss_17": 172.10035705566406, "kl_loss_3": 2334.6691040039063, "kl_loss_6": 1475.057598876953, "learning_rate": 0.00043827798722365264, "loss": 1150.7207, "step": 5440 }, { "ce_loss_12": 3.21498464345932, "ce_loss_17": 3.048158276081085, "ce_loss_23": 2.978046691417694, "ce_loss_3": 4.036542665958405, "ce_loss_6": 3.6121699094772337, "epoch": 0.545, "grad_norm": 1336.0, "kl_loss_12": 577.015998840332, "kl_loss_17": 171.95670700073242, "kl_loss_3": 2278.222900390625, "kl_loss_6": 1433.7412963867187, "learning_rate": 0.00043670377321312535, "loss": 1115.5743, "step": 5450 }, { "ce_loss_12": 3.216487944126129, "ce_loss_17": 3.051944351196289, "ce_loss_23": 2.9818212270736693, "ce_loss_3": 4.0296752691268924, "ce_loss_6": 3.622943937778473, "epoch": 0.546, "grad_norm": 1632.0, "kl_loss_12": 568.4437469482422, "kl_loss_17": 168.3789520263672, "kl_loss_3": 2252.7482482910154, "kl_loss_6": 1444.654229736328, "learning_rate": 0.0004351301965948991, "loss": 1136.658, "step": 5460 }, { "ce_loss_12": 3.1321449518203734, "ce_loss_17": 2.9665717482566833, "ce_loss_23": 2.897308039665222, "ce_loss_3": 3.9533140301704406, "ce_loss_6": 3.539426839351654, "epoch": 0.547, "grad_norm": 1536.0, "kl_loss_12": 559.8824737548828, "kl_loss_17": 164.7873306274414, "kl_loss_3": 2255.5875610351563, "kl_loss_6": 1426.985546875, "learning_rate": 0.000433557273214873, "loss": 1118.2308, "step": 5470 }, { "ce_loss_12": 3.124492418766022, "ce_loss_17": 2.9505738973617555, "ce_loss_23": 2.8773013591766357, "ce_loss_3": 3.9642167687416077, "ce_loss_6": 3.5437798500061035, "epoch": 0.548, "grad_norm": 1200.0, "kl_loss_12": 573.4230926513671, "kl_loss_17": 170.81655502319336, "kl_loss_3": 2286.347790527344, "kl_loss_6": 1452.3485534667968, "learning_rate": 0.000431985018912368, "loss": 1113.127, "step": 5480 }, { "ce_loss_12": 3.103459191322327, "ce_loss_17": 2.9267914652824403, "ce_loss_23": 2.8531687021255494, "ce_loss_3": 3.980012333393097, "ce_loss_6": 3.5421520709991454, "epoch": 0.549, "grad_norm": 1168.0, "kl_loss_12": 585.4976928710937, "kl_loss_17": 172.1935546875, "kl_loss_3": 2377.3790893554688, "kl_loss_6": 1508.5367065429687, "learning_rate": 0.0004304134495199674, "loss": 1125.7893, "step": 5490 }, { "ce_loss_12": 3.13008736371994, "ce_loss_17": 2.95293470621109, "ce_loss_23": 2.8801159620285035, "ce_loss_3": 3.988025999069214, "ce_loss_6": 3.5581201314926147, "epoch": 0.55, "grad_norm": 1048.0, "kl_loss_12": 593.742057800293, "kl_loss_17": 172.41408920288086, "kl_loss_3": 2364.9688110351562, "kl_loss_6": 1508.8888854980469, "learning_rate": 0.0004288425808633575, "loss": 1139.3568, "step": 5500 }, { "ce_loss_12": 3.113992178440094, "ce_loss_17": 2.941122317314148, "ce_loss_23": 2.8700260519981384, "ce_loss_3": 3.965270459651947, "ce_loss_6": 3.5282594561576843, "epoch": 0.551, "grad_norm": 1576.0, "kl_loss_12": 573.6035720825196, "kl_loss_17": 167.52117843627929, "kl_loss_3": 2331.3819274902344, "kl_loss_6": 1467.6904663085938, "learning_rate": 0.0004272724287611684, "loss": 1139.9652, "step": 5510 }, { "ce_loss_12": 3.0911285281181335, "ce_loss_17": 2.919494020938873, "ce_loss_23": 2.8469159841537475, "ce_loss_3": 3.9676486968994142, "ce_loss_6": 3.511377143859863, "epoch": 0.552, "grad_norm": 1096.0, "kl_loss_12": 585.2908172607422, "kl_loss_17": 173.3448402404785, "kl_loss_3": 2385.7844421386717, "kl_loss_6": 1485.8661010742187, "learning_rate": 0.00042570300902481425, "loss": 1148.9248, "step": 5520 }, { "ce_loss_12": 3.1088298916816712, "ce_loss_17": 2.9421172738075256, "ce_loss_23": 2.8727689743041993, "ce_loss_3": 3.947227430343628, "ce_loss_6": 3.5222736239433288, "epoch": 0.553, "grad_norm": 1120.0, "kl_loss_12": 569.1251846313477, "kl_loss_17": 166.2673828125, "kl_loss_3": 2305.686419677734, "kl_loss_6": 1455.715643310547, "learning_rate": 0.00042413433745833423, "loss": 1124.1992, "step": 5530 }, { "ce_loss_12": 3.110207366943359, "ce_loss_17": 2.935355913639069, "ce_loss_23": 2.8650208592414854, "ce_loss_3": 3.9728068828582765, "ce_loss_6": 3.531010937690735, "epoch": 0.554, "grad_norm": 1536.0, "kl_loss_12": 580.8512084960937, "kl_loss_17": 169.45902633666992, "kl_loss_3": 2346.9794067382813, "kl_loss_6": 1474.6579711914062, "learning_rate": 0.0004225664298582339, "loss": 1111.4911, "step": 5540 }, { "ce_loss_12": 3.18549702167511, "ce_loss_17": 3.0146400213241575, "ce_loss_23": 2.9432016134262087, "ce_loss_3": 4.013700640201568, "ce_loss_6": 3.5912591457366942, "epoch": 0.555, "grad_norm": 1184.0, "kl_loss_12": 570.0095581054687, "kl_loss_17": 168.6876480102539, "kl_loss_3": 2263.831896972656, "kl_loss_6": 1434.12294921875, "learning_rate": 0.000420999302013325, "loss": 1112.2068, "step": 5550 }, { "ce_loss_12": 3.1032863736152647, "ce_loss_17": 2.9221776127815247, "ce_loss_23": 2.8449564576148987, "ce_loss_3": 4.007036745548248, "ce_loss_6": 3.5387084245681764, "epoch": 0.556, "grad_norm": 1480.0, "kl_loss_12": 599.1151763916016, "kl_loss_17": 178.27646865844727, "kl_loss_3": 2437.7327209472655, "kl_loss_6": 1509.6392456054687, "learning_rate": 0.000419432969704568, "loss": 1141.0631, "step": 5560 }, { "ce_loss_12": 3.1315610647201537, "ce_loss_17": 2.958857810497284, "ce_loss_23": 2.887349987030029, "ce_loss_3": 3.962913715839386, "ce_loss_6": 3.5348150610923765, "epoch": 0.557, "grad_norm": 1176.0, "kl_loss_12": 578.5272201538086, "kl_loss_17": 170.51123962402343, "kl_loss_3": 2292.166613769531, "kl_loss_6": 1440.1355895996094, "learning_rate": 0.00041786744870491154, "loss": 1150.712, "step": 5570 }, { "ce_loss_12": 3.0789474129676817, "ce_loss_17": 2.8997687101364136, "ce_loss_23": 2.827404201030731, "ce_loss_3": 3.9318880319595335, "ce_loss_6": 3.5036757349967957, "epoch": 0.558, "grad_norm": 1152.0, "kl_loss_12": 585.6135848999023, "kl_loss_17": 173.6684211730957, "kl_loss_3": 2334.7713134765627, "kl_loss_6": 1485.8395751953126, "learning_rate": 0.0004163027547791347, "loss": 1133.438, "step": 5580 }, { "ce_loss_12": 3.066434121131897, "ce_loss_17": 2.8909239292144777, "ce_loss_23": 2.8182274460792542, "ce_loss_3": 3.960865688323975, "ce_loss_6": 3.501590812206268, "epoch": 0.559, "grad_norm": 1168.0, "kl_loss_12": 586.6632537841797, "kl_loss_17": 172.16040267944337, "kl_loss_3": 2419.0933044433596, "kl_loss_6": 1495.1991760253907, "learning_rate": 0.0004147389036836881, "loss": 1146.2016, "step": 5590 }, { "ce_loss_12": 3.101619017124176, "ce_loss_17": 2.9293001532554626, "ce_loss_23": 2.858116888999939, "ce_loss_3": 3.9630075216293337, "ce_loss_6": 3.5294421792030333, "epoch": 0.56, "grad_norm": 1224.0, "kl_loss_12": 583.156396484375, "kl_loss_17": 171.53984298706055, "kl_loss_3": 2341.152209472656, "kl_loss_6": 1483.7776916503906, "learning_rate": 0.00041317591116653486, "loss": 1157.8344, "step": 5600 }, { "ce_loss_12": 3.140073227882385, "ce_loss_17": 2.9632779479026796, "ce_loss_23": 2.892660915851593, "ce_loss_3": 4.002801692485809, "ce_loss_6": 3.559722900390625, "epoch": 0.561, "grad_norm": 984.0, "kl_loss_12": 592.0898803710937, "kl_loss_17": 175.4168487548828, "kl_loss_3": 2368.6901245117188, "kl_loss_6": 1483.5403137207031, "learning_rate": 0.0004116137929669921, "loss": 1132.2861, "step": 5610 }, { "ce_loss_12": 3.1353583097457887, "ce_loss_17": 2.96168429851532, "ce_loss_23": 2.891963481903076, "ce_loss_3": 3.980002760887146, "ce_loss_6": 3.5474584102630615, "epoch": 0.562, "grad_norm": 1496.0, "kl_loss_12": 573.8780532836914, "kl_loss_17": 167.09454879760742, "kl_loss_3": 2308.930065917969, "kl_loss_6": 1459.3727600097657, "learning_rate": 0.00041005256481557305, "loss": 1114.6254, "step": 5620 }, { "ce_loss_12": 3.216406464576721, "ce_loss_17": 3.0492212176322937, "ce_loss_23": 2.980920660495758, "ce_loss_3": 4.013675105571747, "ce_loss_6": 3.6090791821479797, "epoch": 0.563, "grad_norm": 1360.0, "kl_loss_12": 561.8955993652344, "kl_loss_17": 165.47934112548828, "kl_loss_3": 2215.916259765625, "kl_loss_6": 1405.9870849609374, "learning_rate": 0.00040849224243382767, "loss": 1105.3673, "step": 5630 }, { "ce_loss_12": 3.0871979117393495, "ce_loss_17": 2.9172948598861694, "ce_loss_23": 2.844453179836273, "ce_loss_3": 3.946743881702423, "ce_loss_6": 3.509100043773651, "epoch": 0.564, "grad_norm": 1088.0, "kl_loss_12": 579.8146392822266, "kl_loss_17": 169.4729118347168, "kl_loss_3": 2336.1141845703123, "kl_loss_6": 1479.3551330566406, "learning_rate": 0.000406932841534185, "loss": 1116.5229, "step": 5640 }, { "ce_loss_12": 3.0505393624305723, "ce_loss_17": 2.8815552830696105, "ce_loss_23": 2.8072192072868347, "ce_loss_3": 3.9187739729881286, "ce_loss_6": 3.47923538684845, "epoch": 0.565, "grad_norm": 1376.0, "kl_loss_12": 582.4164916992188, "kl_loss_17": 172.37065200805665, "kl_loss_3": 2370.2238159179688, "kl_loss_6": 1488.5986877441405, "learning_rate": 0.0004053743778197951, "loss": 1166.7918, "step": 5650 }, { "ce_loss_12": 3.159424090385437, "ce_loss_17": 2.983396351337433, "ce_loss_23": 2.910083532333374, "ce_loss_3": 4.00204507112503, "ce_loss_6": 3.5764374852180483, "epoch": 0.566, "grad_norm": 1256.0, "kl_loss_12": 588.7669723510742, "kl_loss_17": 175.26432876586915, "kl_loss_3": 2314.8089294433594, "kl_loss_6": 1473.1597045898438, "learning_rate": 0.0004038168669843697, "loss": 1149.6404, "step": 5660 }, { "ce_loss_12": 3.1078927993774412, "ce_loss_17": 2.9341798067092895, "ce_loss_23": 2.864762580394745, "ce_loss_3": 3.921034610271454, "ce_loss_6": 3.510339045524597, "epoch": 0.567, "grad_norm": 972.0, "kl_loss_12": 571.4771148681641, "kl_loss_17": 168.56038589477538, "kl_loss_3": 2248.109423828125, "kl_loss_6": 1433.6847351074218, "learning_rate": 0.000402260324712026, "loss": 1135.3118, "step": 5670 }, { "ce_loss_12": 3.1541042566299438, "ce_loss_17": 2.9780243635177612, "ce_loss_23": 2.907868194580078, "ce_loss_3": 4.017433679103851, "ce_loss_6": 3.5807367086410524, "epoch": 0.568, "grad_norm": 1456.0, "kl_loss_12": 579.9191589355469, "kl_loss_17": 167.7764877319336, "kl_loss_3": 2349.619104003906, "kl_loss_6": 1484.2405151367188, "learning_rate": 0.00040070476667712743, "loss": 1123.8162, "step": 5680 }, { "ce_loss_12": 3.176102542877197, "ce_loss_17": 3.0069189071655273, "ce_loss_23": 2.933028447628021, "ce_loss_3": 4.015802943706513, "ce_loss_6": 3.5848238110542296, "epoch": 0.569, "grad_norm": 1144.0, "kl_loss_12": 577.5879211425781, "kl_loss_17": 170.20566177368164, "kl_loss_3": 2305.2219482421874, "kl_loss_6": 1445.681903076172, "learning_rate": 0.0003991502085441259, "loss": 1134.1186, "step": 5690 }, { "ce_loss_12": 3.200211489200592, "ce_loss_17": 3.0393505692481995, "ce_loss_23": 2.970473277568817, "ce_loss_3": 4.003493142127991, "ce_loss_6": 3.593748462200165, "epoch": 0.57, "grad_norm": 1264.0, "kl_loss_12": 563.2113464355468, "kl_loss_17": 165.5726791381836, "kl_loss_3": 2215.8523132324217, "kl_loss_6": 1401.576104736328, "learning_rate": 0.0003975966659674047, "loss": 1120.5086, "step": 5700 }, { "ce_loss_12": 3.186952292919159, "ce_loss_17": 3.0141292452812194, "ce_loss_23": 2.942728817462921, "ce_loss_3": 4.032547473907471, "ce_loss_6": 3.596217918395996, "epoch": 0.571, "grad_norm": 1376.0, "kl_loss_12": 575.968505859375, "kl_loss_17": 170.199072265625, "kl_loss_3": 2308.314581298828, "kl_loss_6": 1443.2017517089844, "learning_rate": 0.0003960441545911204, "loss": 1116.0092, "step": 5710 }, { "ce_loss_12": 3.1654128432273865, "ce_loss_17": 2.997965371608734, "ce_loss_23": 2.9288565039634706, "ce_loss_3": 3.9943326115608215, "ce_loss_6": 3.567497718334198, "epoch": 0.572, "grad_norm": 1120.0, "kl_loss_12": 574.8713333129883, "kl_loss_17": 167.62567596435548, "kl_loss_3": 2297.2396240234375, "kl_loss_6": 1450.417529296875, "learning_rate": 0.0003944926900490452, "loss": 1119.0949, "step": 5720 }, { "ce_loss_12": 3.0970802426338198, "ce_loss_17": 2.920128679275513, "ce_loss_23": 2.846101438999176, "ce_loss_3": 3.969974410533905, "ce_loss_6": 3.526023507118225, "epoch": 0.573, "grad_norm": 1432.0, "kl_loss_12": 589.8715835571289, "kl_loss_17": 172.74836502075195, "kl_loss_3": 2365.9664916992188, "kl_loss_6": 1494.449969482422, "learning_rate": 0.0003929422879644099, "loss": 1130.7844, "step": 5730 }, { "ce_loss_12": 3.097192919254303, "ce_loss_17": 2.9327561974525453, "ce_loss_23": 2.86499559879303, "ce_loss_3": 3.9204100370407104, "ce_loss_6": 3.494847071170807, "epoch": 0.574, "grad_norm": 1360.0, "kl_loss_12": 560.7226760864257, "kl_loss_17": 165.36538467407226, "kl_loss_3": 2273.9782287597654, "kl_loss_6": 1431.7180114746093, "learning_rate": 0.0003913929639497462, "loss": 1092.815, "step": 5740 }, { "ce_loss_12": 3.059863972663879, "ce_loss_17": 2.8799789428710936, "ce_loss_23": 2.8116377234458922, "ce_loss_3": 3.932484757900238, "ce_loss_6": 3.486096715927124, "epoch": 0.575, "grad_norm": 1240.0, "kl_loss_12": 569.6117431640625, "kl_loss_17": 165.87915725708007, "kl_loss_3": 2351.2241394042967, "kl_loss_6": 1469.8656066894532, "learning_rate": 0.00038984473360672965, "loss": 1115.7053, "step": 5750 }, { "ce_loss_12": 3.0659531235694883, "ce_loss_17": 2.8936392068862915, "ce_loss_23": 2.8194744348526, "ce_loss_3": 3.9361996173858644, "ce_loss_6": 3.487998294830322, "epoch": 0.576, "grad_norm": 1176.0, "kl_loss_12": 571.9079559326171, "kl_loss_17": 166.82796478271484, "kl_loss_3": 2346.0788513183593, "kl_loss_6": 1469.1494201660157, "learning_rate": 0.0003882976125260229, "loss": 1114.1246, "step": 5760 }, { "ce_loss_12": 3.1290350794792174, "ce_loss_17": 2.955051040649414, "ce_loss_23": 2.8846306800842285, "ce_loss_3": 3.981457543373108, "ce_loss_6": 3.5426934719085694, "epoch": 0.577, "grad_norm": 1536.0, "kl_loss_12": 572.9007583618164, "kl_loss_17": 169.10189514160157, "kl_loss_3": 2315.0250244140625, "kl_loss_6": 1454.3143920898438, "learning_rate": 0.00038675161628711776, "loss": 1128.9508, "step": 5770 }, { "ce_loss_12": 3.1652075171470644, "ce_loss_17": 2.996035838127136, "ce_loss_23": 2.923553502559662, "ce_loss_3": 3.988526499271393, "ce_loss_6": 3.569002163410187, "epoch": 0.578, "grad_norm": 1184.0, "kl_loss_12": 573.2200241088867, "kl_loss_17": 170.21975860595703, "kl_loss_3": 2274.1938537597657, "kl_loss_6": 1439.8227233886719, "learning_rate": 0.0003852067604581794, "loss": 1143.6246, "step": 5780 }, { "ce_loss_12": 3.1154022932052614, "ce_loss_17": 2.9427364468574524, "ce_loss_23": 2.8725454330444338, "ce_loss_3": 3.9664053320884705, "ce_loss_6": 3.5301053404808043, "epoch": 0.579, "grad_norm": 1584.0, "kl_loss_12": 574.4286529541016, "kl_loss_17": 166.3837776184082, "kl_loss_3": 2331.8002807617186, "kl_loss_6": 1466.2192443847657, "learning_rate": 0.0003836630605958888, "loss": 1122.2252, "step": 5790 }, { "ce_loss_12": 3.1643086671829224, "ce_loss_17": 2.996836471557617, "ce_loss_23": 2.9282254457473753, "ce_loss_3": 4.006624364852906, "ce_loss_6": 3.5805129528045656, "epoch": 0.58, "grad_norm": 1520.0, "kl_loss_12": 571.5751373291016, "kl_loss_17": 168.04933166503906, "kl_loss_3": 2315.909503173828, "kl_loss_6": 1460.9131408691405, "learning_rate": 0.0003821205322452863, "loss": 1165.0018, "step": 5800 }, { "ce_loss_12": 3.1474431276321413, "ce_loss_17": 2.978452205657959, "ce_loss_23": 2.9102728366851807, "ce_loss_3": 3.977898383140564, "ce_loss_6": 3.5485727429389953, "epoch": 0.581, "grad_norm": 1424.0, "kl_loss_12": 565.4918029785156, "kl_loss_17": 165.82336349487304, "kl_loss_3": 2292.4250366210936, "kl_loss_6": 1442.7161743164063, "learning_rate": 0.0003805791909396155, "loss": 1121.6133, "step": 5810 }, { "ce_loss_12": 3.096940839290619, "ce_loss_17": 2.9302207708358763, "ce_loss_23": 2.8616117596626283, "ce_loss_3": 3.94871609210968, "ce_loss_6": 3.513180065155029, "epoch": 0.582, "grad_norm": 1328.0, "kl_loss_12": 567.1476913452149, "kl_loss_17": 165.21699295043945, "kl_loss_3": 2315.2934020996095, "kl_loss_6": 1448.5183471679688, "learning_rate": 0.0003790390522001662, "loss": 1129.2562, "step": 5820 }, { "ce_loss_12": 3.0444005727767944, "ce_loss_17": 2.87704142332077, "ce_loss_23": 2.8103655338287354, "ce_loss_3": 3.90531131029129, "ce_loss_6": 3.4619930505752565, "epoch": 0.583, "grad_norm": 1304.0, "kl_loss_12": 560.7367935180664, "kl_loss_17": 163.4321258544922, "kl_loss_3": 2334.534295654297, "kl_loss_6": 1455.2254272460937, "learning_rate": 0.0003775001315361183, "loss": 1113.3598, "step": 5830 }, { "ce_loss_12": 3.139377462863922, "ce_loss_17": 2.9656705498695373, "ce_loss_23": 2.8942306637763977, "ce_loss_3": 3.9994457244873045, "ce_loss_6": 3.561956214904785, "epoch": 0.584, "grad_norm": 1104.0, "kl_loss_12": 576.4854705810546, "kl_loss_17": 170.18653945922853, "kl_loss_3": 2338.360107421875, "kl_loss_6": 1465.3584045410157, "learning_rate": 0.0003759624444443858, "loss": 1134.8766, "step": 5840 }, { "ce_loss_12": 3.168642854690552, "ce_loss_17": 2.999200773239136, "ce_loss_23": 2.9328733086586, "ce_loss_3": 3.996432375907898, "ce_loss_6": 3.560738229751587, "epoch": 0.585, "grad_norm": 1496.0, "kl_loss_12": 563.6090438842773, "kl_loss_17": 165.6907615661621, "kl_loss_3": 2297.1994018554688, "kl_loss_6": 1435.7785522460938, "learning_rate": 0.00037442600640946044, "loss": 1108.7355, "step": 5850 }, { "ce_loss_12": 3.134856653213501, "ce_loss_17": 2.9687893748283387, "ce_loss_23": 2.9005924224853517, "ce_loss_3": 3.9626365423202516, "ce_loss_6": 3.5443278789520263, "epoch": 0.586, "grad_norm": 896.0, "kl_loss_12": 571.4762405395508, "kl_loss_17": 166.68805770874025, "kl_loss_3": 2276.775286865234, "kl_loss_6": 1439.7515869140625, "learning_rate": 0.00037289083290325663, "loss": 1099.8597, "step": 5860 }, { "ce_loss_12": 3.111377000808716, "ce_loss_17": 2.9451043605804443, "ce_loss_23": 2.874148762226105, "ce_loss_3": 3.943045997619629, "ce_loss_6": 3.510099673271179, "epoch": 0.587, "grad_norm": 1288.0, "kl_loss_12": 559.2628280639649, "kl_loss_17": 168.02631225585938, "kl_loss_3": 2270.736376953125, "kl_loss_6": 1418.6463623046875, "learning_rate": 0.0003713569393849543, "loss": 1106.6417, "step": 5870 }, { "ce_loss_12": 3.169297516345978, "ce_loss_17": 3.0004114866256715, "ce_loss_23": 2.9307323932647704, "ce_loss_3": 4.0029584765434265, "ce_loss_6": 3.577454316616058, "epoch": 0.588, "grad_norm": 1472.0, "kl_loss_12": 572.8251327514648, "kl_loss_17": 168.60687713623048, "kl_loss_3": 2296.0158630371093, "kl_loss_6": 1451.5937255859376, "learning_rate": 0.00036982434130084397, "loss": 1121.2643, "step": 5880 }, { "ce_loss_12": 3.0862842798233032, "ce_loss_17": 2.918619728088379, "ce_loss_23": 2.843446373939514, "ce_loss_3": 3.9210837841033936, "ce_loss_6": 3.491597616672516, "epoch": 0.589, "grad_norm": 1832.0, "kl_loss_12": 573.2405166625977, "kl_loss_17": 171.37699279785156, "kl_loss_3": 2288.494384765625, "kl_loss_6": 1440.4010803222657, "learning_rate": 0.00036829305408417166, "loss": 1131.5167, "step": 5890 }, { "ce_loss_12": 3.082815647125244, "ce_loss_17": 2.905299973487854, "ce_loss_23": 2.834301030635834, "ce_loss_3": 3.9388489723205566, "ce_loss_6": 3.499078333377838, "epoch": 0.59, "grad_norm": 1112.0, "kl_loss_12": 583.4097137451172, "kl_loss_17": 171.28600463867187, "kl_loss_3": 2353.1889038085938, "kl_loss_6": 1484.3849609375, "learning_rate": 0.0003667630931549826, "loss": 1130.6133, "step": 5900 }, { "ce_loss_12": 3.050534749031067, "ce_loss_17": 2.8739961624145507, "ce_loss_23": 2.8024583578109743, "ce_loss_3": 3.954657232761383, "ce_loss_6": 3.48987854719162, "epoch": 0.591, "grad_norm": 1080.0, "kl_loss_12": 585.1016815185546, "kl_loss_17": 170.39111938476563, "kl_loss_3": 2433.939892578125, "kl_loss_6": 1514.365985107422, "learning_rate": 0.00036523447391996613, "loss": 1151.823, "step": 5910 }, { "ce_loss_12": 3.124648427963257, "ce_loss_17": 2.9544724464416503, "ce_loss_23": 2.889039993286133, "ce_loss_3": 3.9625410079956054, "ce_loss_6": 3.5329968094825746, "epoch": 0.592, "grad_norm": 1344.0, "kl_loss_12": 563.2493103027343, "kl_loss_17": 164.04151000976563, "kl_loss_3": 2275.4251037597655, "kl_loss_6": 1427.4124694824218, "learning_rate": 0.00036370721177230114, "loss": 1107.2832, "step": 5920 }, { "ce_loss_12": 3.130690836906433, "ce_loss_17": 2.958809518814087, "ce_loss_23": 2.8872852325439453, "ce_loss_3": 3.9897098183631896, "ce_loss_6": 3.5514885783195496, "epoch": 0.593, "grad_norm": 1152.0, "kl_loss_12": 581.3672821044922, "kl_loss_17": 172.810555267334, "kl_loss_3": 2338.8616943359375, "kl_loss_6": 1471.761163330078, "learning_rate": 0.00036218132209150044, "loss": 1130.6513, "step": 5930 }, { "ce_loss_12": 3.0971512675285338, "ce_loss_17": 2.9094135046005247, "ce_loss_23": 2.831551361083984, "ce_loss_3": 3.9879598736763002, "ce_loss_6": 3.530065882205963, "epoch": 0.594, "grad_norm": 3088.0, "kl_loss_12": 603.7480621337891, "kl_loss_17": 177.9858055114746, "kl_loss_3": 2438.20546875, "kl_loss_6": 1523.2928405761718, "learning_rate": 0.0003606568202432562, "loss": 1157.2672, "step": 5940 }, { "ce_loss_12": 3.1510114312171935, "ce_loss_17": 2.984342861175537, "ce_loss_23": 2.9136085629463198, "ce_loss_3": 4.028652763366699, "ce_loss_6": 3.5766788005828856, "epoch": 0.595, "grad_norm": 1200.0, "kl_loss_12": 579.799168395996, "kl_loss_17": 171.63598861694337, "kl_loss_3": 2380.4233215332033, "kl_loss_6": 1491.7609802246093, "learning_rate": 0.0003591337215792851, "loss": 1120.3156, "step": 5950 }, { "ce_loss_12": 3.1785937905311585, "ce_loss_17": 3.013545370101929, "ce_loss_23": 2.946664047241211, "ce_loss_3": 3.988059067726135, "ce_loss_6": 3.581684875488281, "epoch": 0.596, "grad_norm": 1280.0, "kl_loss_12": 558.0707260131836, "kl_loss_17": 161.99999389648437, "kl_loss_3": 2246.3234680175783, "kl_loss_6": 1423.4499633789062, "learning_rate": 0.00035761204143717383, "loss": 1116.9531, "step": 5960 }, { "ce_loss_12": 3.1389800190925596, "ce_loss_17": 2.9702129483222963, "ce_loss_23": 2.9017290592193605, "ce_loss_3": 3.985602331161499, "ce_loss_6": 3.551557552814484, "epoch": 0.597, "grad_norm": 1112.0, "kl_loss_12": 570.8744079589844, "kl_loss_17": 167.93110122680665, "kl_loss_3": 2322.8365112304687, "kl_loss_6": 1458.6414733886718, "learning_rate": 0.0003560917951402245, "loss": 1149.7195, "step": 5970 }, { "ce_loss_12": 3.1232689023017883, "ce_loss_17": 2.956403398513794, "ce_loss_23": 2.8862552642822266, "ce_loss_3": 3.9653080701828003, "ce_loss_6": 3.54248868227005, "epoch": 0.598, "grad_norm": 1232.0, "kl_loss_12": 564.7475952148437, "kl_loss_17": 166.05315551757812, "kl_loss_3": 2298.1189575195312, "kl_loss_6": 1452.5067077636718, "learning_rate": 0.00035457299799730046, "loss": 1116.1566, "step": 5980 }, { "ce_loss_12": 3.1833902478218077, "ce_loss_17": 3.0113158345222475, "ce_loss_23": 2.9413999676704408, "ce_loss_3": 4.006715643405914, "ce_loss_6": 3.5888715624809264, "epoch": 0.599, "grad_norm": 1288.0, "kl_loss_12": 568.6937225341796, "kl_loss_17": 166.144620513916, "kl_loss_3": 2271.1790405273437, "kl_loss_6": 1434.0756225585938, "learning_rate": 0.0003530556653026721, "loss": 1119.4857, "step": 5990 }, { "ce_loss_12": 3.1078683972358703, "ce_loss_17": 2.9388864517211912, "ce_loss_23": 2.8658820390701294, "ce_loss_3": 3.962469220161438, "ce_loss_6": 3.5266475319862365, "epoch": 0.6, "grad_norm": 2448.0, "kl_loss_12": 558.4937591552734, "kl_loss_17": 165.0791160583496, "kl_loss_3": 2321.1028381347655, "kl_loss_6": 1452.5112854003905, "learning_rate": 0.00035153981233586274, "loss": 1130.5031, "step": 6000 }, { "ce_loss_12": 3.077103114128113, "ce_loss_17": 2.903602635860443, "ce_loss_23": 2.8354063630104065, "ce_loss_3": 3.925050365924835, "ce_loss_6": 3.49609614610672, "epoch": 0.601, "grad_norm": 1304.0, "kl_loss_12": 564.5199554443359, "kl_loss_17": 161.67181320190429, "kl_loss_3": 2316.9648498535157, "kl_loss_6": 1453.2272521972657, "learning_rate": 0.00035002545436149473, "loss": 1158.5654, "step": 6010 }, { "ce_loss_12": 3.094723129272461, "ce_loss_17": 2.924634051322937, "ce_loss_23": 2.85205854177475, "ce_loss_3": 3.953912055492401, "ce_loss_6": 3.5232208371162415, "epoch": 0.602, "grad_norm": 1576.0, "kl_loss_12": 582.2882827758789, "kl_loss_17": 172.62551193237306, "kl_loss_3": 2357.1133544921877, "kl_loss_6": 1496.3097351074218, "learning_rate": 0.0003485126066291364, "loss": 1120.1215, "step": 6020 }, { "ce_loss_12": 3.1260809898376465, "ce_loss_17": 2.955556845664978, "ce_loss_23": 2.88744113445282, "ce_loss_3": 3.980349564552307, "ce_loss_6": 3.550356423854828, "epoch": 0.603, "grad_norm": 1320.0, "kl_loss_12": 561.931997680664, "kl_loss_17": 165.3617431640625, "kl_loss_3": 2313.730261230469, "kl_loss_6": 1468.6224792480468, "learning_rate": 0.0003470012843731476, "loss": 1126.7109, "step": 6030 }, { "ce_loss_12": 3.074591028690338, "ce_loss_17": 2.906606638431549, "ce_loss_23": 2.8381201028823853, "ce_loss_3": 3.94170058965683, "ce_loss_6": 3.502815544605255, "epoch": 0.604, "grad_norm": 1344.0, "kl_loss_12": 567.1752471923828, "kl_loss_17": 164.98533630371094, "kl_loss_3": 2336.706671142578, "kl_loss_6": 1481.8884155273438, "learning_rate": 0.00034549150281252633, "loss": 1153.334, "step": 6040 }, { "ce_loss_12": 3.063284933567047, "ce_loss_17": 2.8925641417503356, "ce_loss_23": 2.8198460817337034, "ce_loss_3": 3.8918349027633665, "ce_loss_6": 3.4689099192619324, "epoch": 0.605, "grad_norm": 1304.0, "kl_loss_12": 564.6323394775391, "kl_loss_17": 167.75587310791016, "kl_loss_3": 2264.2901000976562, "kl_loss_6": 1431.2904907226562, "learning_rate": 0.0003439832771507565, "loss": 1107.605, "step": 6050 }, { "ce_loss_12": 3.065265107154846, "ce_loss_17": 2.895182228088379, "ce_loss_23": 2.8257480025291444, "ce_loss_3": 3.9281867504119874, "ce_loss_6": 3.493267834186554, "epoch": 0.606, "grad_norm": 1264.0, "kl_loss_12": 567.1343521118164, "kl_loss_17": 167.62005386352538, "kl_loss_3": 2348.1340209960936, "kl_loss_6": 1472.3601806640625, "learning_rate": 0.0003424766225756537, "loss": 1117.4418, "step": 6060 }, { "ce_loss_12": 3.122907614707947, "ce_loss_17": 2.953090226650238, "ce_loss_23": 2.8842870354652406, "ce_loss_3": 3.969139504432678, "ce_loss_6": 3.528507113456726, "epoch": 0.607, "grad_norm": 940.0, "kl_loss_12": 569.6723724365235, "kl_loss_17": 166.05274810791016, "kl_loss_3": 2314.9645568847654, "kl_loss_6": 1443.6838623046874, "learning_rate": 0.00034097155425921255, "loss": 1106.7039, "step": 6070 }, { "ce_loss_12": 3.033176898956299, "ce_loss_17": 2.8631957292556764, "ce_loss_23": 2.7933878183364866, "ce_loss_3": 3.893681752681732, "ce_loss_6": 3.4490676045417787, "epoch": 0.608, "grad_norm": 1256.0, "kl_loss_12": 569.930812072754, "kl_loss_17": 166.2875068664551, "kl_loss_3": 2354.344384765625, "kl_loss_6": 1470.4756164550781, "learning_rate": 0.0003394680873574546, "loss": 1124.7793, "step": 6080 }, { "ce_loss_12": 3.1261494040489195, "ce_loss_17": 2.9549895524978638, "ce_loss_23": 2.8835931181907655, "ce_loss_3": 3.990994596481323, "ce_loss_6": 3.551687812805176, "epoch": 0.609, "grad_norm": 1272.0, "kl_loss_12": 580.3812896728516, "kl_loss_17": 168.92733764648438, "kl_loss_3": 2362.6959594726563, "kl_loss_6": 1480.4715942382813, "learning_rate": 0.0003379662370102747, "loss": 1123.8231, "step": 6090 }, { "ce_loss_12": 3.1346469283103944, "ce_loss_17": 2.9690099239349363, "ce_loss_23": 2.9022717595100405, "ce_loss_3": 3.9612194657325746, "ce_loss_6": 3.529716396331787, "epoch": 0.61, "grad_norm": 1120.0, "kl_loss_12": 564.0862533569336, "kl_loss_17": 164.72021408081054, "kl_loss_3": 2301.772625732422, "kl_loss_6": 1433.7045471191407, "learning_rate": 0.0003364660183412892, "loss": 1122.1545, "step": 6100 }, { "ce_loss_12": 3.1167293906211855, "ce_loss_17": 2.948679792881012, "ce_loss_23": 2.878220272064209, "ce_loss_3": 3.956706476211548, "ce_loss_6": 3.525891661643982, "epoch": 0.611, "grad_norm": 1016.0, "kl_loss_12": 575.0978164672852, "kl_loss_17": 167.98723831176758, "kl_loss_3": 2310.820440673828, "kl_loss_6": 1452.1403869628907, "learning_rate": 0.0003349674464576834, "loss": 1135.3953, "step": 6110 }, { "ce_loss_12": 3.0683866262435915, "ce_loss_17": 2.900368940830231, "ce_loss_23": 2.8294920206069945, "ce_loss_3": 3.9227017879486086, "ce_loss_6": 3.4895668745040895, "epoch": 0.612, "grad_norm": 1400.0, "kl_loss_12": 570.9404495239257, "kl_loss_17": 169.12254333496094, "kl_loss_3": 2328.972705078125, "kl_loss_6": 1467.1710876464845, "learning_rate": 0.00033347053645005966, "loss": 1101.3781, "step": 6120 }, { "ce_loss_12": 3.153418040275574, "ce_loss_17": 2.9871235251426698, "ce_loss_23": 2.9184412360191345, "ce_loss_3": 3.9793171763420103, "ce_loss_6": 3.5582070112228394, "epoch": 0.613, "grad_norm": 1576.0, "kl_loss_12": 560.519694519043, "kl_loss_17": 163.97461700439453, "kl_loss_3": 2253.6986877441404, "kl_loss_6": 1425.6687072753907, "learning_rate": 0.00033197530339228485, "loss": 1116.5682, "step": 6130 }, { "ce_loss_12": 3.1269586086273193, "ce_loss_17": 2.953156077861786, "ce_loss_23": 2.880386304855347, "ce_loss_3": 3.965451443195343, "ce_loss_6": 3.5458551406860352, "epoch": 0.614, "grad_norm": 1384.0, "kl_loss_12": 576.1701049804688, "kl_loss_17": 171.7872230529785, "kl_loss_3": 2296.2677612304688, "kl_loss_6": 1455.6143737792968, "learning_rate": 0.00033048176234133967, "loss": 1117.7045, "step": 6140 }, { "ce_loss_12": 3.1166322231292725, "ce_loss_17": 2.947054147720337, "ce_loss_23": 2.878944230079651, "ce_loss_3": 3.952300786972046, "ce_loss_6": 3.5267172336578367, "epoch": 0.615, "grad_norm": 1200.0, "kl_loss_12": 572.8429504394531, "kl_loss_17": 168.27835922241212, "kl_loss_3": 2304.2374877929688, "kl_loss_6": 1455.4294311523438, "learning_rate": 0.0003289899283371657, "loss": 1129.0639, "step": 6150 }, { "ce_loss_12": 3.1242191553115846, "ce_loss_17": 2.954644775390625, "ce_loss_23": 2.8852687239646913, "ce_loss_3": 3.978927218914032, "ce_loss_6": 3.5400946140289307, "epoch": 0.616, "grad_norm": 1608.0, "kl_loss_12": 561.4634826660156, "kl_loss_17": 165.0264518737793, "kl_loss_3": 2316.6402893066406, "kl_loss_6": 1444.9783081054688, "learning_rate": 0.0003274998164025148, "loss": 1135.0775, "step": 6160 }, { "ce_loss_12": 3.158303916454315, "ce_loss_17": 2.9911983489990233, "ce_loss_23": 2.9205039978027343, "ce_loss_3": 4.000229585170746, "ce_loss_6": 3.5639469504356383, "epoch": 0.617, "grad_norm": 1512.0, "kl_loss_12": 570.6586273193359, "kl_loss_17": 167.9722900390625, "kl_loss_3": 2283.6752319335938, "kl_loss_6": 1439.0603820800782, "learning_rate": 0.0003260114415427975, "loss": 1144.1389, "step": 6170 }, { "ce_loss_12": 3.093304145336151, "ce_loss_17": 2.9208192229270935, "ce_loss_23": 2.8519298434257507, "ce_loss_3": 3.958202075958252, "ce_loss_6": 3.5155731081962585, "epoch": 0.618, "grad_norm": 1424.0, "kl_loss_12": 566.3221221923828, "kl_loss_17": 166.28012084960938, "kl_loss_3": 2347.4573181152346, "kl_loss_6": 1468.7472106933594, "learning_rate": 0.0003245248187459323, "loss": 1143.1082, "step": 6180 }, { "ce_loss_12": 3.0748647093772887, "ce_loss_17": 2.9128819346427917, "ce_loss_23": 2.846064102649689, "ce_loss_3": 3.8938180446624755, "ce_loss_6": 3.4712459087371825, "epoch": 0.619, "grad_norm": 1456.0, "kl_loss_12": 548.8818344116211, "kl_loss_17": 161.40147171020507, "kl_loss_3": 2253.5403076171874, "kl_loss_6": 1401.3277099609375, "learning_rate": 0.00032303996298219416, "loss": 1093.641, "step": 6190 }, { "ce_loss_12": 3.1497726678848266, "ce_loss_17": 2.986233186721802, "ce_loss_23": 2.9164626836776733, "ce_loss_3": 3.9703675627708437, "ce_loss_6": 3.5505619406700135, "epoch": 0.62, "grad_norm": 1104.0, "kl_loss_12": 553.2283340454102, "kl_loss_17": 162.91330795288087, "kl_loss_3": 2224.2183227539062, "kl_loss_6": 1399.8785522460937, "learning_rate": 0.00032155688920406414, "loss": 1091.8606, "step": 6200 }, { "ce_loss_12": 3.0700011014938355, "ce_loss_17": 2.896014726161957, "ce_loss_23": 2.8269984841346742, "ce_loss_3": 3.951104760169983, "ce_loss_6": 3.4962696075439452, "epoch": 0.621, "grad_norm": 1480.0, "kl_loss_12": 572.2689865112304, "kl_loss_17": 168.72110900878906, "kl_loss_3": 2365.320104980469, "kl_loss_6": 1464.9130126953125, "learning_rate": 0.0003200756123460788, "loss": 1147.8372, "step": 6210 }, { "ce_loss_12": 3.1097887635231016, "ce_loss_17": 2.9406883835792543, "ce_loss_23": 2.8669352293014527, "ce_loss_3": 3.9747615337371824, "ce_loss_6": 3.535456907749176, "epoch": 0.622, "grad_norm": 2176.0, "kl_loss_12": 581.556867980957, "kl_loss_17": 170.02774200439453, "kl_loss_3": 2369.5360412597656, "kl_loss_6": 1490.153985595703, "learning_rate": 0.00031859614732467957, "loss": 1143.2827, "step": 6220 }, { "ce_loss_12": 3.1528987884521484, "ce_loss_17": 2.9876800775527954, "ce_loss_23": 2.9186222672462465, "ce_loss_3": 3.9738120079040526, "ce_loss_6": 3.556571829319, "epoch": 0.623, "grad_norm": 1224.0, "kl_loss_12": 557.099560546875, "kl_loss_17": 163.19732131958008, "kl_loss_3": 2246.7878601074217, "kl_loss_6": 1417.0084716796875, "learning_rate": 0.00031711850903806275, "loss": 1099.6331, "step": 6230 }, { "ce_loss_12": 3.069310748577118, "ce_loss_17": 2.895755708217621, "ce_loss_23": 2.825921130180359, "ce_loss_3": 3.932187294960022, "ce_loss_6": 3.4879361152648927, "epoch": 0.624, "grad_norm": 1104.0, "kl_loss_12": 577.8297821044922, "kl_loss_17": 169.72356338500975, "kl_loss_3": 2350.6295837402345, "kl_loss_6": 1469.5405395507812, "learning_rate": 0.0003156427123660297, "loss": 1117.4363, "step": 6240 }, { "ce_loss_12": 3.1469658613204956, "ce_loss_17": 2.9793386697769164, "ce_loss_23": 2.9091200470924377, "ce_loss_3": 3.9587064504623415, "ce_loss_6": 3.5512452244758608, "epoch": 0.625, "grad_norm": 1216.0, "kl_loss_12": 564.7679275512695, "kl_loss_17": 164.33160171508788, "kl_loss_3": 2253.2488037109374, "kl_loss_6": 1427.362451171875, "learning_rate": 0.0003141687721698363, "loss": 1117.8021, "step": 6250 }, { "ce_loss_12": 3.1139580726623537, "ce_loss_17": 2.951532244682312, "ce_loss_23": 2.8865836381912233, "ce_loss_3": 3.9141123294830322, "ce_loss_6": 3.499807631969452, "epoch": 0.626, "grad_norm": 1280.0, "kl_loss_12": 534.3491195678711, "kl_loss_17": 157.5703453063965, "kl_loss_3": 2184.005993652344, "kl_loss_6": 1367.9380004882812, "learning_rate": 0.00031269670329204396, "loss": 1089.5651, "step": 6260 }, { "ce_loss_12": 3.157158946990967, "ce_loss_17": 2.9909069776535033, "ce_loss_23": 2.9246951818466185, "ce_loss_3": 3.962106227874756, "ce_loss_6": 3.552718937397003, "epoch": 0.627, "grad_norm": 1096.0, "kl_loss_12": 562.5212127685547, "kl_loss_17": 164.10081176757814, "kl_loss_3": 2239.545928955078, "kl_loss_6": 1418.3596130371093, "learning_rate": 0.00031122652055637015, "loss": 1111.2492, "step": 6270 }, { "ce_loss_12": 3.1191351294517515, "ce_loss_17": 2.951986086368561, "ce_loss_23": 2.8855087041854857, "ce_loss_3": 3.975840079784393, "ce_loss_6": 3.533181536197662, "epoch": 0.628, "grad_norm": 1112.0, "kl_loss_12": 570.5546478271484, "kl_loss_17": 165.54044723510742, "kl_loss_3": 2337.914758300781, "kl_loss_6": 1459.2587463378907, "learning_rate": 0.0003097582387675385, "loss": 1106.6899, "step": 6280 }, { "ce_loss_12": 3.159350299835205, "ce_loss_17": 2.991086208820343, "ce_loss_23": 2.9224894285202025, "ce_loss_3": 3.9898188948631286, "ce_loss_6": 3.5668737053871156, "epoch": 0.629, "grad_norm": 1464.0, "kl_loss_12": 566.9471389770508, "kl_loss_17": 165.4483184814453, "kl_loss_3": 2297.144598388672, "kl_loss_6": 1448.1842407226563, "learning_rate": 0.00030829187271113034, "loss": 1110.6332, "step": 6290 }, { "ce_loss_12": 3.1395588636398317, "ce_loss_17": 2.978374016284943, "ce_loss_23": 2.9104403972625734, "ce_loss_3": 3.964379060268402, "ce_loss_6": 3.5459526419639587, "epoch": 0.63, "grad_norm": 1376.0, "kl_loss_12": 549.1984100341797, "kl_loss_17": 162.1900848388672, "kl_loss_3": 2229.6377380371096, "kl_loss_6": 1406.4794860839843, "learning_rate": 0.00030682743715343565, "loss": 1112.6799, "step": 6300 }, { "ce_loss_12": 3.1101244688034058, "ce_loss_17": 2.9345804691314696, "ce_loss_23": 2.8628518342971803, "ce_loss_3": 3.949520134925842, "ce_loss_6": 3.5296986937522887, "epoch": 0.631, "grad_norm": 1576.0, "kl_loss_12": 574.5238006591796, "kl_loss_17": 169.87524490356446, "kl_loss_3": 2297.4243286132814, "kl_loss_6": 1459.8432678222657, "learning_rate": 0.0003053649468413043, "loss": 1135.6068, "step": 6310 }, { "ce_loss_12": 3.204981434345245, "ce_loss_17": 3.0351101756095886, "ce_loss_23": 2.9643646597862245, "ce_loss_3": 4.030413317680359, "ce_loss_6": 3.6091914057731627, "epoch": 0.632, "grad_norm": 1496.0, "kl_loss_12": 565.3837051391602, "kl_loss_17": 167.2088722229004, "kl_loss_3": 2273.190069580078, "kl_loss_6": 1428.8405456542969, "learning_rate": 0.00030390441650199725, "loss": 1106.1108, "step": 6320 }, { "ce_loss_12": 3.1109281897544863, "ce_loss_17": 2.9481189966201784, "ce_loss_23": 2.8790109753608704, "ce_loss_3": 3.9495753407478333, "ce_loss_6": 3.517538511753082, "epoch": 0.633, "grad_norm": 1296.0, "kl_loss_12": 561.5178146362305, "kl_loss_17": 164.26847229003906, "kl_loss_3": 2275.2756469726564, "kl_loss_6": 1427.8334716796876, "learning_rate": 0.00030244586084303903, "loss": 1102.1451, "step": 6330 }, { "ce_loss_12": 3.0955166697502134, "ce_loss_17": 2.9205790638923643, "ce_loss_23": 2.853133475780487, "ce_loss_3": 3.949768900871277, "ce_loss_6": 3.5212396383285522, "epoch": 0.634, "grad_norm": 1040.0, "kl_loss_12": 571.7488021850586, "kl_loss_17": 168.09423217773437, "kl_loss_3": 2339.385119628906, "kl_loss_6": 1478.2344299316405, "learning_rate": 0.00030098929455206903, "loss": 1111.3738, "step": 6340 }, { "ce_loss_12": 3.074948859214783, "ce_loss_17": 2.9132041335105896, "ce_loss_23": 2.8481234550476073, "ce_loss_3": 3.927300810813904, "ce_loss_6": 3.4901010274887083, "epoch": 0.635, "grad_norm": 1552.0, "kl_loss_12": 557.5138580322266, "kl_loss_17": 161.4115867614746, "kl_loss_3": 2296.2754333496096, "kl_loss_6": 1436.0618408203125, "learning_rate": 0.00029953473229669324, "loss": 1133.8223, "step": 6350 }, { "ce_loss_12": 3.110889804363251, "ce_loss_17": 2.9405208230018616, "ce_loss_23": 2.8736177682876587, "ce_loss_3": 3.95476838350296, "ce_loss_6": 3.5289437413215636, "epoch": 0.636, "grad_norm": 1384.0, "kl_loss_12": 566.539794921875, "kl_loss_17": 164.1241554260254, "kl_loss_3": 2304.1476318359373, "kl_loss_6": 1450.2427001953124, "learning_rate": 0.00029808218872433767, "loss": 1106.9477, "step": 6360 }, { "ce_loss_12": 3.169117248058319, "ce_loss_17": 3.0026118874549867, "ce_loss_23": 2.933871877193451, "ce_loss_3": 3.9984364748001098, "ce_loss_6": 3.571031415462494, "epoch": 0.637, "grad_norm": 1272.0, "kl_loss_12": 555.6065002441406, "kl_loss_17": 162.51567840576172, "kl_loss_3": 2263.9653381347657, "kl_loss_6": 1415.1207458496094, "learning_rate": 0.0002966316784621, "loss": 1092.0043, "step": 6370 }, { "ce_loss_12": 3.0926617622375487, "ce_loss_17": 2.92412850856781, "ce_loss_23": 2.850287711620331, "ce_loss_3": 3.94654198884964, "ce_loss_6": 3.5095828056335447, "epoch": 0.638, "grad_norm": 1256.0, "kl_loss_12": 572.4656372070312, "kl_loss_17": 167.10297546386718, "kl_loss_3": 2322.6778930664063, "kl_loss_6": 1458.0764892578125, "learning_rate": 0.0002951832161166024, "loss": 1105.7497, "step": 6380 }, { "ce_loss_12": 3.15998957157135, "ce_loss_17": 2.9884552478790285, "ce_loss_23": 2.9132028341293337, "ce_loss_3": 3.992968261241913, "ce_loss_6": 3.5755365610122682, "epoch": 0.639, "grad_norm": 1048.0, "kl_loss_12": 571.2973022460938, "kl_loss_17": 168.8902145385742, "kl_loss_3": 2286.949426269531, "kl_loss_6": 1454.9971801757813, "learning_rate": 0.0002937368162738445, "loss": 1099.0266, "step": 6390 }, { "ce_loss_12": 3.1059131979942323, "ce_loss_17": 2.944585359096527, "ce_loss_23": 2.8812364816665648, "ce_loss_3": 3.9365057229995726, "ce_loss_6": 3.5142826199531556, "epoch": 0.64, "grad_norm": 1248.0, "kl_loss_12": 548.1466659545898, "kl_loss_17": 158.61914291381837, "kl_loss_3": 2269.588977050781, "kl_loss_6": 1421.9814819335938, "learning_rate": 0.0002922924934990568, "loss": 1116.3316, "step": 6400 }, { "ce_loss_12": 3.052070152759552, "ce_loss_17": 2.8816617727279663, "ce_loss_23": 2.809593605995178, "ce_loss_3": 3.9293985247611998, "ce_loss_6": 3.4850574493408204, "epoch": 0.641, "grad_norm": 1016.0, "kl_loss_12": 567.8729965209961, "kl_loss_17": 165.7066307067871, "kl_loss_3": 2372.4416381835936, "kl_loss_6": 1488.8204223632813, "learning_rate": 0.0002908502623365536, "loss": 1130.1512, "step": 6410 }, { "ce_loss_12": 2.9853854417800902, "ce_loss_17": 2.819088900089264, "ce_loss_23": 2.748886638879776, "ce_loss_3": 3.868946361541748, "ce_loss_6": 3.4116463661193848, "epoch": 0.642, "grad_norm": 1312.0, "kl_loss_12": 563.417399597168, "kl_loss_17": 162.88093948364258, "kl_loss_3": 2373.9099060058593, "kl_loss_6": 1474.7904296875, "learning_rate": 0.0002894101373095867, "loss": 1123.2893, "step": 6420 }, { "ce_loss_12": 3.1881203293800353, "ce_loss_17": 3.0227572679519654, "ce_loss_23": 2.9544938921928408, "ce_loss_3": 4.006763243675232, "ce_loss_6": 3.582263541221619, "epoch": 0.643, "grad_norm": 1192.0, "kl_loss_12": 568.1661087036133, "kl_loss_17": 168.8744743347168, "kl_loss_3": 2267.1685180664062, "kl_loss_6": 1420.8440551757812, "learning_rate": 0.00028797213292019926, "loss": 1108.8061, "step": 6430 }, { "ce_loss_12": 3.1680617213249205, "ce_loss_17": 3.002046263217926, "ce_loss_23": 2.9295936226844788, "ce_loss_3": 3.9971763372421263, "ce_loss_6": 3.5705169558525087, "epoch": 0.644, "grad_norm": 1224.0, "kl_loss_12": 571.6078521728516, "kl_loss_17": 168.19557723999023, "kl_loss_3": 2274.578790283203, "kl_loss_6": 1438.4652709960938, "learning_rate": 0.0002865362636490791, "loss": 1133.7952, "step": 6440 }, { "ce_loss_12": 3.173088526725769, "ce_loss_17": 3.0073896169662477, "ce_loss_23": 2.942509913444519, "ce_loss_3": 4.002819502353669, "ce_loss_6": 3.5829379558563232, "epoch": 0.645, "grad_norm": 1448.0, "kl_loss_12": 557.0371398925781, "kl_loss_17": 162.7965171813965, "kl_loss_3": 2260.254278564453, "kl_loss_6": 1429.329766845703, "learning_rate": 0.0002851025439554142, "loss": 1102.529, "step": 6450 }, { "ce_loss_12": 3.159893012046814, "ce_loss_17": 2.99376357793808, "ce_loss_23": 2.92137326002121, "ce_loss_3": 3.976053535938263, "ce_loss_6": 3.569490969181061, "epoch": 0.646, "grad_norm": 1568.0, "kl_loss_12": 562.2715911865234, "kl_loss_17": 164.8102569580078, "kl_loss_3": 2224.415234375, "kl_loss_6": 1420.1577270507812, "learning_rate": 0.00028367098827674573, "loss": 1098.1518, "step": 6460 }, { "ce_loss_12": 3.094073462486267, "ce_loss_17": 2.926894783973694, "ce_loss_23": 2.8606561183929444, "ce_loss_3": 3.9346626162528993, "ce_loss_6": 3.4975233554840086, "epoch": 0.647, "grad_norm": 1056.0, "kl_loss_12": 557.2730117797852, "kl_loss_17": 161.74693222045897, "kl_loss_3": 2265.713836669922, "kl_loss_6": 1410.1818908691407, "learning_rate": 0.00028224161102882397, "loss": 1112.1836, "step": 6470 }, { "ce_loss_12": 3.0719471096992494, "ce_loss_17": 2.9085827350616453, "ce_loss_23": 2.8445157527923586, "ce_loss_3": 3.892908537387848, "ce_loss_6": 3.4768115043640138, "epoch": 0.648, "grad_norm": 1504.0, "kl_loss_12": 548.413461303711, "kl_loss_17": 158.70454483032228, "kl_loss_3": 2234.7188049316405, "kl_loss_6": 1414.9549194335937, "learning_rate": 0.00028081442660546124, "loss": 1106.9424, "step": 6480 }, { "ce_loss_12": 3.1347915887832642, "ce_loss_17": 2.9687870383262633, "ce_loss_23": 2.901146113872528, "ce_loss_3": 3.9523842573165893, "ce_loss_6": 3.5290323853492738, "epoch": 0.649, "grad_norm": 1296.0, "kl_loss_12": 560.3147171020507, "kl_loss_17": 166.38631439208984, "kl_loss_3": 2250.884729003906, "kl_loss_6": 1404.9350158691407, "learning_rate": 0.0002793894493783892, "loss": 1106.6742, "step": 6490 }, { "ce_loss_12": 3.1467607021331787, "ce_loss_17": 2.985257124900818, "ce_loss_23": 2.9195136189460755, "ce_loss_3": 3.9745638847351072, "ce_loss_6": 3.5555363655090333, "epoch": 0.65, "grad_norm": 1256.0, "kl_loss_12": 548.7041427612305, "kl_loss_17": 160.34083786010743, "kl_loss_3": 2242.4936950683596, "kl_loss_6": 1420.1112548828125, "learning_rate": 0.0002779666936971129, "loss": 1096.2486, "step": 6500 }, { "ce_loss_12": 3.1697442054748537, "ce_loss_17": 3.0020530343055727, "ce_loss_23": 2.9306628704071045, "ce_loss_3": 4.006459021568299, "ce_loss_6": 3.578002154827118, "epoch": 0.651, "grad_norm": 1032.0, "kl_loss_12": 568.6251037597656, "kl_loss_17": 165.5076919555664, "kl_loss_3": 2297.5794372558594, "kl_loss_6": 1440.566912841797, "learning_rate": 0.00027654617388876614, "loss": 1119.0514, "step": 6510 }, { "ce_loss_12": 3.1790334582328796, "ce_loss_17": 3.0164214253425596, "ce_loss_23": 2.947012257575989, "ce_loss_3": 4.01498510837555, "ce_loss_6": 3.580212116241455, "epoch": 0.652, "grad_norm": 1280.0, "kl_loss_12": 561.0163177490234, "kl_loss_17": 165.67923889160156, "kl_loss_3": 2284.324603271484, "kl_loss_6": 1423.8077758789063, "learning_rate": 0.0002751279042579672, "loss": 1113.2713, "step": 6520 }, { "ce_loss_12": 3.1225333333015444, "ce_loss_17": 2.961228346824646, "ce_loss_23": 2.8934088587760924, "ce_loss_3": 3.9473448634147643, "ce_loss_6": 3.522226560115814, "epoch": 0.653, "grad_norm": 1104.0, "kl_loss_12": 546.7998992919922, "kl_loss_17": 160.3195655822754, "kl_loss_3": 2248.5444274902343, "kl_loss_6": 1402.5869018554688, "learning_rate": 0.00027371189908667604, "loss": 1116.6242, "step": 6530 }, { "ce_loss_12": 3.185983431339264, "ce_loss_17": 3.0128308176994323, "ce_loss_23": 2.939183759689331, "ce_loss_3": 4.045200252532959, "ce_loss_6": 3.6046812176704406, "epoch": 0.654, "grad_norm": 1000.0, "kl_loss_12": 578.8798248291016, "kl_loss_17": 173.44672164916992, "kl_loss_3": 2344.2353332519533, "kl_loss_6": 1459.0255065917968, "learning_rate": 0.00027229817263404863, "loss": 1142.0896, "step": 6540 }, { "ce_loss_12": 3.1544518947601317, "ce_loss_17": 2.9977415561676026, "ce_loss_23": 2.9329543232917787, "ce_loss_3": 3.944475269317627, "ce_loss_6": 3.5443835496902465, "epoch": 0.655, "grad_norm": 988.0, "kl_loss_12": 547.4029541015625, "kl_loss_17": 161.00300140380858, "kl_loss_3": 2180.589074707031, "kl_loss_6": 1385.0390197753907, "learning_rate": 0.0002708867391362948, "loss": 1091.1217, "step": 6550 }, { "ce_loss_12": 3.1348859310150146, "ce_loss_17": 2.971785545349121, "ce_loss_23": 2.909306788444519, "ce_loss_3": 3.9420260787010193, "ce_loss_6": 3.5194541931152346, "epoch": 0.656, "grad_norm": 1064.0, "kl_loss_12": 536.2102874755859, "kl_loss_17": 159.3724105834961, "kl_loss_3": 2189.0327758789062, "kl_loss_6": 1359.9533996582031, "learning_rate": 0.0002694776128065345, "loss": 1090.7284, "step": 6560 }, { "ce_loss_12": 3.0816171884536745, "ce_loss_17": 2.9123438715934755, "ce_loss_23": 2.8430118560791016, "ce_loss_3": 3.9183963537216187, "ce_loss_6": 3.4916830539703367, "epoch": 0.657, "grad_norm": 1360.0, "kl_loss_12": 566.3829650878906, "kl_loss_17": 165.3619255065918, "kl_loss_3": 2294.381787109375, "kl_loss_6": 1452.9326782226562, "learning_rate": 0.00026807080783465374, "loss": 1100.9056, "step": 6570 }, { "ce_loss_12": 3.181067681312561, "ce_loss_17": 3.01416095495224, "ce_loss_23": 2.9467032313346864, "ce_loss_3": 4.022855424880982, "ce_loss_6": 3.5956613063812255, "epoch": 0.658, "grad_norm": 1488.0, "kl_loss_12": 568.3876220703125, "kl_loss_17": 166.68803634643555, "kl_loss_3": 2302.2370361328126, "kl_loss_6": 1450.1819763183594, "learning_rate": 0.00026666633838716316, "loss": 1123.9771, "step": 6580 }, { "ce_loss_12": 3.0953935623168944, "ce_loss_17": 2.923772132396698, "ce_loss_23": 2.851178967952728, "ce_loss_3": 3.9333749771118165, "ce_loss_6": 3.5038415670394896, "epoch": 0.659, "grad_norm": 1480.0, "kl_loss_12": 572.9993301391602, "kl_loss_17": 168.8495315551758, "kl_loss_3": 2303.81123046875, "kl_loss_6": 1450.9668212890624, "learning_rate": 0.00026526421860705474, "loss": 1129.8791, "step": 6590 }, { "ce_loss_12": 3.116642653942108, "ce_loss_17": 2.9426411151885987, "ce_loss_23": 2.874138903617859, "ce_loss_3": 3.9514434814453123, "ce_loss_6": 3.5283915877342222, "epoch": 0.66, "grad_norm": 1024.0, "kl_loss_12": 569.0870574951172, "kl_loss_17": 166.90686416625977, "kl_loss_3": 2292.765283203125, "kl_loss_6": 1443.3717956542969, "learning_rate": 0.0002638644626136587, "loss": 1103.042, "step": 6600 }, { "ce_loss_12": 3.1274876236915587, "ce_loss_17": 2.9615706145763396, "ce_loss_23": 2.8964427053928374, "ce_loss_3": 3.956467306613922, "ce_loss_6": 3.5342164874076842, "epoch": 0.661, "grad_norm": 1376.0, "kl_loss_12": 556.1025802612305, "kl_loss_17": 160.9067581176758, "kl_loss_3": 2265.7707702636717, "kl_loss_6": 1428.458428955078, "learning_rate": 0.00026246708450250255, "loss": 1106.211, "step": 6610 }, { "ce_loss_12": 3.106378674507141, "ce_loss_17": 2.94292356967926, "ce_loss_23": 2.875683069229126, "ce_loss_3": 3.926833248138428, "ce_loss_6": 3.5029583096504213, "epoch": 0.662, "grad_norm": 1464.0, "kl_loss_12": 551.8497680664062, "kl_loss_17": 162.0682846069336, "kl_loss_3": 2245.946044921875, "kl_loss_6": 1406.854931640625, "learning_rate": 0.00026107209834516854, "loss": 1096.7816, "step": 6620 }, { "ce_loss_12": 3.081444466114044, "ce_loss_17": 2.91006817817688, "ce_loss_23": 2.841499149799347, "ce_loss_3": 3.9396434903144835, "ce_loss_6": 3.503396451473236, "epoch": 0.663, "grad_norm": 1128.0, "kl_loss_12": 563.1735321044922, "kl_loss_17": 164.47392730712892, "kl_loss_3": 2342.509033203125, "kl_loss_6": 1470.8843383789062, "learning_rate": 0.0002596795181891514, "loss": 1132.7114, "step": 6630 }, { "ce_loss_12": 3.083864748477936, "ce_loss_17": 2.9127668380737304, "ce_loss_23": 2.8387949109077453, "ce_loss_3": 3.930868887901306, "ce_loss_6": 3.5026464819908143, "epoch": 0.664, "grad_norm": 1096.0, "kl_loss_12": 577.5287628173828, "kl_loss_17": 170.67861251831056, "kl_loss_3": 2308.6623352050783, "kl_loss_6": 1462.7905578613281, "learning_rate": 0.000258289358057718, "loss": 1158.7979, "step": 6640 }, { "ce_loss_12": 3.1487738728523254, "ce_loss_17": 2.9731197714805604, "ce_loss_23": 2.900687944889069, "ce_loss_3": 3.9934016704559325, "ce_loss_6": 3.5643016934394836, "epoch": 0.665, "grad_norm": 1208.0, "kl_loss_12": 578.7869308471679, "kl_loss_17": 172.91688766479493, "kl_loss_3": 2325.2394836425783, "kl_loss_6": 1463.2403442382813, "learning_rate": 0.0002569016319497657, "loss": 1134.2854, "step": 6650 }, { "ce_loss_12": 3.138523483276367, "ce_loss_17": 2.9648420572280885, "ce_loss_23": 2.891550886631012, "ce_loss_3": 3.9867674231529238, "ce_loss_6": 3.5513219594955445, "epoch": 0.666, "grad_norm": 964.0, "kl_loss_12": 581.3247100830079, "kl_loss_17": 171.40716629028321, "kl_loss_3": 2334.145227050781, "kl_loss_6": 1467.3094970703125, "learning_rate": 0.00025551635383968066, "loss": 1144.0461, "step": 6660 }, { "ce_loss_12": 3.0494396567344664, "ce_loss_17": 2.8817154288291933, "ce_loss_23": 2.8116140246391295, "ce_loss_3": 3.897812283039093, "ce_loss_6": 3.463429093360901, "epoch": 0.667, "grad_norm": 1096.0, "kl_loss_12": 571.2893417358398, "kl_loss_17": 166.89762115478516, "kl_loss_3": 2320.7787536621095, "kl_loss_6": 1455.82001953125, "learning_rate": 0.00025413353767719804, "loss": 1128.7535, "step": 6670 }, { "ce_loss_12": 3.104068899154663, "ce_loss_17": 2.9416080713272095, "ce_loss_23": 2.8767688870429993, "ce_loss_3": 3.9425044536590574, "ce_loss_6": 3.5188334822654723, "epoch": 0.668, "grad_norm": 1376.0, "kl_loss_12": 561.1372955322265, "kl_loss_17": 161.93158035278321, "kl_loss_3": 2303.177655029297, "kl_loss_6": 1447.1806274414062, "learning_rate": 0.0002527531973872617, "loss": 1119.5465, "step": 6680 }, { "ce_loss_12": 3.114408755302429, "ce_loss_17": 2.9568856835365294, "ce_loss_23": 2.8878267884254454, "ce_loss_3": 3.931940507888794, "ce_loss_6": 3.505814230442047, "epoch": 0.669, "grad_norm": 1128.0, "kl_loss_12": 554.5485305786133, "kl_loss_17": 163.08516311645508, "kl_loss_3": 2254.183935546875, "kl_loss_6": 1409.7445495605468, "learning_rate": 0.0002513753468698826, "loss": 1101.3923, "step": 6690 }, { "ce_loss_12": 3.087624263763428, "ce_loss_17": 2.9223830699920654, "ce_loss_23": 2.8527083516120912, "ce_loss_3": 3.9336583375930787, "ce_loss_6": 3.494721603393555, "epoch": 0.67, "grad_norm": 1048.0, "kl_loss_12": 567.5121780395508, "kl_loss_17": 166.63730545043944, "kl_loss_3": 2318.508673095703, "kl_loss_6": 1444.2513488769532, "learning_rate": 0.0002500000000000001, "loss": 1119.0619, "step": 6700 }, { "ce_loss_12": 3.1866146683692933, "ce_loss_17": 3.0320523500442507, "ce_loss_23": 2.9677372574806213, "ce_loss_3": 3.9759655594825745, "ce_loss_6": 3.56893048286438, "epoch": 0.671, "grad_norm": 1048.0, "kl_loss_12": 545.7672103881836, "kl_loss_17": 159.36872329711915, "kl_loss_3": 2177.7454345703127, "kl_loss_6": 1374.6342407226562, "learning_rate": 0.0002486271706273421, "loss": 1122.1479, "step": 6710 }, { "ce_loss_12": 3.126544237136841, "ce_loss_17": 2.9684812307357786, "ce_loss_23": 2.9063286781311035, "ce_loss_3": 3.9199762105941773, "ce_loss_6": 3.514430069923401, "epoch": 0.672, "grad_norm": 1208.0, "kl_loss_12": 543.9798583984375, "kl_loss_17": 159.2358268737793, "kl_loss_3": 2198.2925354003905, "kl_loss_6": 1377.329443359375, "learning_rate": 0.0002472568725762853, "loss": 1100.6169, "step": 6720 }, { "ce_loss_12": 3.1231537103652953, "ce_loss_17": 2.9678090453147887, "ce_loss_23": 2.9013753533363342, "ce_loss_3": 3.911601424217224, "ce_loss_6": 3.505997157096863, "epoch": 0.673, "grad_norm": 1512.0, "kl_loss_12": 534.9500045776367, "kl_loss_17": 156.77999572753907, "kl_loss_3": 2186.6644958496095, "kl_loss_6": 1369.1830505371095, "learning_rate": 0.00024588911964571554, "loss": 1079.9992, "step": 6730 }, { "ce_loss_12": 3.1442111015319822, "ce_loss_17": 2.968558657169342, "ce_loss_23": 2.894292151927948, "ce_loss_3": 4.002666354179382, "ce_loss_6": 3.57113493680954, "epoch": 0.674, "grad_norm": 1032.0, "kl_loss_12": 584.2673843383789, "kl_loss_17": 173.74913482666017, "kl_loss_3": 2329.846893310547, "kl_loss_6": 1475.91787109375, "learning_rate": 0.00024452392560888974, "loss": 1119.3503, "step": 6740 }, { "ce_loss_12": 3.035334324836731, "ce_loss_17": 2.8686137318611147, "ce_loss_23": 2.80149667263031, "ce_loss_3": 3.856445550918579, "ce_loss_6": 3.4397828340530396, "epoch": 0.675, "grad_norm": 1160.0, "kl_loss_12": 548.2299728393555, "kl_loss_17": 158.76625595092773, "kl_loss_3": 2263.6882507324217, "kl_loss_6": 1426.7576843261718, "learning_rate": 0.00024316130421329695, "loss": 1090.746, "step": 6750 }, { "ce_loss_12": 3.1035542130470275, "ce_loss_17": 2.943461501598358, "ce_loss_23": 2.8759094715118407, "ce_loss_3": 3.9234148144721983, "ce_loss_6": 3.5102449536323546, "epoch": 0.676, "grad_norm": 1004.0, "kl_loss_12": 552.3924407958984, "kl_loss_17": 160.46843795776368, "kl_loss_3": 2254.888250732422, "kl_loss_6": 1419.8763732910156, "learning_rate": 0.00024180126918051909, "loss": 1106.4746, "step": 6760 }, { "ce_loss_12": 3.1554140567779543, "ce_loss_17": 2.989892101287842, "ce_loss_23": 2.9217707753181457, "ce_loss_3": 3.968790566921234, "ce_loss_6": 3.5452707767486573, "epoch": 0.677, "grad_norm": 1304.0, "kl_loss_12": 558.4460464477539, "kl_loss_17": 162.82315979003906, "kl_loss_3": 2252.1734924316406, "kl_loss_6": 1406.660821533203, "learning_rate": 0.00024044383420609406, "loss": 1088.5975, "step": 6770 }, { "ce_loss_12": 3.1568056464195253, "ce_loss_17": 3.0021663069725038, "ce_loss_23": 2.9343551278114317, "ce_loss_3": 3.9495391130447386, "ce_loss_6": 3.5399969100952147, "epoch": 0.678, "grad_norm": 1400.0, "kl_loss_12": 546.1890930175781, "kl_loss_17": 159.20334014892578, "kl_loss_3": 2213.2076782226563, "kl_loss_6": 1394.3114929199219, "learning_rate": 0.00023908901295937712, "loss": 1107.4713, "step": 6780 }, { "ce_loss_12": 3.1455968618392944, "ce_loss_17": 2.9837255835533143, "ce_loss_23": 2.9146998643875124, "ce_loss_3": 3.958331596851349, "ce_loss_6": 3.5401602506637575, "epoch": 0.679, "grad_norm": 1128.0, "kl_loss_12": 547.9637756347656, "kl_loss_17": 161.52648010253907, "kl_loss_3": 2221.9041259765627, "kl_loss_6": 1388.3693420410157, "learning_rate": 0.00023773681908340283, "loss": 1113.2122, "step": 6790 }, { "ce_loss_12": 3.1385439157485964, "ce_loss_17": 2.9627222657203673, "ce_loss_23": 2.8902212142944337, "ce_loss_3": 3.979812204837799, "ce_loss_6": 3.550594687461853, "epoch": 0.68, "grad_norm": 1072.0, "kl_loss_12": 587.1845413208008, "kl_loss_17": 172.72392654418945, "kl_loss_3": 2343.5950256347655, "kl_loss_6": 1487.5733581542968, "learning_rate": 0.00023638726619474876, "loss": 1148.0467, "step": 6800 }, { "ce_loss_12": 3.13368022441864, "ce_loss_17": 2.9561671733856203, "ce_loss_23": 2.8830493450164796, "ce_loss_3": 3.999674940109253, "ce_loss_6": 3.562345004081726, "epoch": 0.681, "grad_norm": 1312.0, "kl_loss_12": 576.6458923339844, "kl_loss_17": 169.26600341796876, "kl_loss_3": 2344.7470458984376, "kl_loss_6": 1486.2510864257813, "learning_rate": 0.0002350403678833976, "loss": 1126.2372, "step": 6810 }, { "ce_loss_12": 3.0531776189804076, "ce_loss_17": 2.88701913356781, "ce_loss_23": 2.8160634517669676, "ce_loss_3": 3.8899761915206907, "ce_loss_6": 3.467984676361084, "epoch": 0.682, "grad_norm": 1456.0, "kl_loss_12": 562.7510818481445, "kl_loss_17": 162.64818725585937, "kl_loss_3": 2300.0100158691407, "kl_loss_6": 1451.0055908203126, "learning_rate": 0.00023369613771260007, "loss": 1109.835, "step": 6820 }, { "ce_loss_12": 3.1653555393218995, "ce_loss_17": 2.9973755836486817, "ce_loss_23": 2.9283758759498597, "ce_loss_3": 4.007970345020294, "ce_loss_6": 3.5784966111183167, "epoch": 0.683, "grad_norm": 1232.0, "kl_loss_12": 567.620637512207, "kl_loss_17": 165.86778106689454, "kl_loss_3": 2315.0612060546873, "kl_loss_6": 1455.6276611328126, "learning_rate": 0.00023235458921873925, "loss": 1128.1863, "step": 6830 }, { "ce_loss_12": 3.14102156162262, "ce_loss_17": 2.95844361782074, "ce_loss_23": 2.882523739337921, "ce_loss_3": 4.013988649845123, "ce_loss_6": 3.567373180389404, "epoch": 0.684, "grad_norm": 1432.0, "kl_loss_12": 595.2338577270508, "kl_loss_17": 173.366202545166, "kl_loss_3": 2408.025115966797, "kl_loss_6": 1510.5706726074218, "learning_rate": 0.0002310157359111938, "loss": 1161.5986, "step": 6840 }, { "ce_loss_12": 3.031296193599701, "ce_loss_17": 2.849304759502411, "ce_loss_23": 2.7756431221961977, "ce_loss_3": 3.953793489933014, "ce_loss_6": 3.479044473171234, "epoch": 0.685, "grad_norm": 1376.0, "kl_loss_12": 578.3578979492188, "kl_loss_17": 168.22599029541016, "kl_loss_3": 2469.172509765625, "kl_loss_6": 1525.2212280273438, "learning_rate": 0.0002296795912722014, "loss": 1157.8029, "step": 6850 }, { "ce_loss_12": 3.150490713119507, "ce_loss_17": 2.9875052690505983, "ce_loss_23": 2.9206341743469237, "ce_loss_3": 3.9536224365234376, "ce_loss_6": 3.5421762228012086, "epoch": 0.686, "grad_norm": 920.0, "kl_loss_12": 555.7742065429687, "kl_loss_17": 161.9793846130371, "kl_loss_3": 2232.923474121094, "kl_loss_6": 1409.4739135742188, "learning_rate": 0.0002283461687567236, "loss": 1086.4309, "step": 6860 }, { "ce_loss_12": 3.195644724369049, "ce_loss_17": 3.0399562239646913, "ce_loss_23": 2.9735812902450562, "ce_loss_3": 3.988665819168091, "ce_loss_6": 3.5849629998207093, "epoch": 0.687, "grad_norm": 1072.0, "kl_loss_12": 542.861050415039, "kl_loss_17": 160.58557662963867, "kl_loss_3": 2180.1105834960936, "kl_loss_6": 1376.572607421875, "learning_rate": 0.00022701548179231045, "loss": 1096.79, "step": 6870 }, { "ce_loss_12": 3.1640716314315798, "ce_loss_17": 2.9970357418060303, "ce_loss_23": 2.9259220004081725, "ce_loss_3": 4.014358699321747, "ce_loss_6": 3.573070788383484, "epoch": 0.688, "grad_norm": 1112.0, "kl_loss_12": 565.2588302612305, "kl_loss_17": 167.01204833984374, "kl_loss_3": 2316.832342529297, "kl_loss_6": 1446.0325866699218, "learning_rate": 0.00022568754377896516, "loss": 1103.7375, "step": 6880 }, { "ce_loss_12": 3.1513329982757567, "ce_loss_17": 2.989849019050598, "ce_loss_23": 2.9210432052612303, "ce_loss_3": 3.9666898012161256, "ce_loss_6": 3.544580614566803, "epoch": 0.689, "grad_norm": 1056.0, "kl_loss_12": 563.2337951660156, "kl_loss_17": 164.98118209838867, "kl_loss_3": 2254.496124267578, "kl_loss_6": 1415.3326232910156, "learning_rate": 0.00022436236808900844, "loss": 1098.4635, "step": 6890 }, { "ce_loss_12": 3.056147313117981, "ce_loss_17": 2.883034348487854, "ce_loss_23": 2.813547372817993, "ce_loss_3": 3.8956146478652953, "ce_loss_6": 3.4598315596580504, "epoch": 0.69, "grad_norm": 1256.0, "kl_loss_12": 563.8613327026367, "kl_loss_17": 165.419034576416, "kl_loss_3": 2311.5637939453127, "kl_loss_6": 1434.1539184570313, "learning_rate": 0.00022303996806694487, "loss": 1109.1252, "step": 6900 }, { "ce_loss_12": 3.122467875480652, "ce_loss_17": 2.9579473733901978, "ce_loss_23": 2.8914283990859984, "ce_loss_3": 3.960798966884613, "ce_loss_6": 3.5380321860313417, "epoch": 0.691, "grad_norm": 1288.0, "kl_loss_12": 554.0889297485352, "kl_loss_17": 160.19549865722655, "kl_loss_3": 2290.123254394531, "kl_loss_6": 1434.2134704589844, "learning_rate": 0.00022172035702932823, "loss": 1105.6811, "step": 6910 }, { "ce_loss_12": 3.172382354736328, "ce_loss_17": 3.0069132089614867, "ce_loss_23": 2.9380002498626707, "ce_loss_3": 3.9722339153289794, "ce_loss_6": 3.5636775851249696, "epoch": 0.692, "grad_norm": 940.0, "kl_loss_12": 558.7679122924804, "kl_loss_17": 164.84261932373047, "kl_loss_3": 2207.495928955078, "kl_loss_6": 1403.4933715820312, "learning_rate": 0.00022040354826462666, "loss": 1089.1483, "step": 6920 }, { "ce_loss_12": 3.097479057312012, "ce_loss_17": 2.933117616176605, "ce_loss_23": 2.8654277324676514, "ce_loss_3": 3.9250967383384703, "ce_loss_6": 3.5026277899742126, "epoch": 0.693, "grad_norm": 1320.0, "kl_loss_12": 548.0819427490235, "kl_loss_17": 160.38210525512696, "kl_loss_3": 2262.3633361816405, "kl_loss_6": 1417.277899169922, "learning_rate": 0.0002190895550330899, "loss": 1110.6444, "step": 6930 }, { "ce_loss_12": 3.041971814632416, "ce_loss_17": 2.8696054458618163, "ce_loss_23": 2.798582601547241, "ce_loss_3": 3.8966514229774476, "ce_loss_6": 3.468160080909729, "epoch": 0.694, "grad_norm": 1352.0, "kl_loss_12": 568.6457580566406, "kl_loss_17": 166.68883590698243, "kl_loss_3": 2317.4898193359377, "kl_loss_6": 1467.453192138672, "learning_rate": 0.00021777839056661552, "loss": 1104.5019, "step": 6940 }, { "ce_loss_12": 3.118424892425537, "ce_loss_17": 2.9517529606819153, "ce_loss_23": 2.887066733837128, "ce_loss_3": 3.933793568611145, "ce_loss_6": 3.5133557438850405, "epoch": 0.695, "grad_norm": 1352.0, "kl_loss_12": 555.2664596557618, "kl_loss_17": 162.95882873535157, "kl_loss_3": 2243.5386657714844, "kl_loss_6": 1410.0547485351562, "learning_rate": 0.0002164700680686147, "loss": 1084.6388, "step": 6950 }, { "ce_loss_12": 3.16032634973526, "ce_loss_17": 2.9953511714935304, "ce_loss_23": 2.9277878522872927, "ce_loss_3": 3.9653631448745728, "ce_loss_6": 3.5490016460418703, "epoch": 0.696, "grad_norm": 1368.0, "kl_loss_12": 553.6630996704101, "kl_loss_17": 164.8134864807129, "kl_loss_3": 2208.6744750976563, "kl_loss_6": 1389.9367309570312, "learning_rate": 0.0002151646007138806, "loss": 1087.7991, "step": 6960 }, { "ce_loss_12": 3.050205111503601, "ce_loss_17": 2.884288513660431, "ce_loss_23": 2.813274657726288, "ce_loss_3": 3.8992061018943787, "ce_loss_6": 3.463496470451355, "epoch": 0.697, "grad_norm": 1048.0, "kl_loss_12": 564.9178665161132, "kl_loss_17": 165.92743377685548, "kl_loss_3": 2330.1527099609375, "kl_loss_6": 1455.7044250488282, "learning_rate": 0.00021386200164845526, "loss": 1112.7447, "step": 6970 }, { "ce_loss_12": 3.203552484512329, "ce_loss_17": 3.045564079284668, "ce_loss_23": 2.978299582004547, "ce_loss_3": 3.9887001872062684, "ce_loss_6": 3.5860018014907835, "epoch": 0.698, "grad_norm": 1784.0, "kl_loss_12": 550.210075378418, "kl_loss_17": 161.59491348266602, "kl_loss_3": 2188.434454345703, "kl_loss_6": 1386.6224975585938, "learning_rate": 0.0002125622839894964, "loss": 1079.0363, "step": 6980 }, { "ce_loss_12": 3.156203365325928, "ce_loss_17": 2.9958416342735292, "ce_loss_23": 2.931114614009857, "ce_loss_3": 3.956476068496704, "ce_loss_6": 3.5511318325996397, "epoch": 0.699, "grad_norm": 1200.0, "kl_loss_12": 542.128092956543, "kl_loss_17": 159.17681274414062, "kl_loss_3": 2200.6436096191405, "kl_loss_6": 1383.7528442382813, "learning_rate": 0.00021126546082514663, "loss": 1082.092, "step": 6990 }, { "ce_loss_12": 3.179194617271423, "ce_loss_17": 3.0163188338279725, "ce_loss_23": 2.951323699951172, "ce_loss_3": 3.9722177147865296, "ce_loss_6": 3.564598274230957, "epoch": 0.7, "grad_norm": 988.0, "kl_loss_12": 550.1864288330078, "kl_loss_17": 161.19149017333984, "kl_loss_3": 2196.5428955078123, "kl_loss_6": 1392.7397705078124, "learning_rate": 0.00020997154521440098, "loss": 1078.2177, "step": 7000 }, { "ce_loss_12": 3.1233227133750914, "ce_loss_17": 2.9633098602294923, "ce_loss_23": 2.898985981941223, "ce_loss_3": 3.9442873358726502, "ce_loss_6": 3.5172589302062987, "epoch": 0.701, "grad_norm": 1232.0, "kl_loss_12": 550.1744873046875, "kl_loss_17": 160.37801361083984, "kl_loss_3": 2242.1542419433595, "kl_loss_6": 1405.3580383300782, "learning_rate": 0.0002086805501869749, "loss": 1081.2021, "step": 7010 }, { "ce_loss_12": 3.1101000189781187, "ce_loss_17": 2.9367455363273622, "ce_loss_23": 2.864802801609039, "ce_loss_3": 3.962554705142975, "ce_loss_6": 3.5312764286994933, "epoch": 0.702, "grad_norm": 1160.0, "kl_loss_12": 575.6279846191406, "kl_loss_17": 167.17742538452148, "kl_loss_3": 2333.5331298828123, "kl_loss_6": 1465.0747314453124, "learning_rate": 0.0002073924887431744, "loss": 1114.5426, "step": 7020 }, { "ce_loss_12": 3.1102948069572447, "ce_loss_17": 2.9448712706565856, "ce_loss_23": 2.8787378907203673, "ce_loss_3": 3.943867230415344, "ce_loss_6": 3.516659843921661, "epoch": 0.703, "grad_norm": 1080.0, "kl_loss_12": 561.2340530395508, "kl_loss_17": 162.04900512695312, "kl_loss_3": 2285.8710083007813, "kl_loss_6": 1439.7157775878907, "learning_rate": 0.00020610737385376348, "loss": 1134.2076, "step": 7030 }, { "ce_loss_12": 3.153998517990112, "ce_loss_17": 2.9913541316986083, "ce_loss_23": 2.9260885953903197, "ce_loss_3": 3.9440117359161375, "ce_loss_6": 3.5408020853996276, "epoch": 0.704, "grad_norm": 1256.0, "kl_loss_12": 549.944775390625, "kl_loss_17": 162.47129287719727, "kl_loss_3": 2198.950054931641, "kl_loss_6": 1381.6360717773437, "learning_rate": 0.00020482521845983521, "loss": 1104.1162, "step": 7040 }, { "ce_loss_12": 3.1619212150573732, "ce_loss_17": 2.996929383277893, "ce_loss_23": 2.9238937973976133, "ce_loss_3": 3.9839595556259155, "ce_loss_6": 3.5633341431617738, "epoch": 0.705, "grad_norm": 1456.0, "kl_loss_12": 566.3393341064453, "kl_loss_17": 169.2521484375, "kl_loss_3": 2277.2437255859377, "kl_loss_6": 1436.12548828125, "learning_rate": 0.00020354603547267987, "loss": 1121.7861, "step": 7050 }, { "ce_loss_12": 3.1527396559715273, "ce_loss_17": 2.9844316840171814, "ce_loss_23": 2.914238953590393, "ce_loss_3": 3.994079887866974, "ce_loss_6": 3.5688058733940125, "epoch": 0.706, "grad_norm": 1088.0, "kl_loss_12": 568.8097229003906, "kl_loss_17": 166.45427703857422, "kl_loss_3": 2288.9527587890625, "kl_loss_6": 1444.6506103515626, "learning_rate": 0.00020226983777365604, "loss": 1139.0396, "step": 7060 }, { "ce_loss_12": 3.054299366474152, "ce_loss_17": 2.8940477848052977, "ce_loss_23": 2.82825745344162, "ce_loss_3": 3.93111172914505, "ce_loss_6": 3.485726547241211, "epoch": 0.707, "grad_norm": 1200.0, "kl_loss_12": 547.2898818969727, "kl_loss_17": 160.03102264404296, "kl_loss_3": 2351.719171142578, "kl_loss_6": 1463.3100341796876, "learning_rate": 0.00020099663821406056, "loss": 1112.2619, "step": 7070 }, { "ce_loss_12": 3.146637439727783, "ce_loss_17": 2.9824100852012636, "ce_loss_23": 2.9174304246902465, "ce_loss_3": 3.953105664253235, "ce_loss_6": 3.5402570843696592, "epoch": 0.708, "grad_norm": 1536.0, "kl_loss_12": 544.9981689453125, "kl_loss_17": 160.23975524902343, "kl_loss_3": 2209.292413330078, "kl_loss_6": 1389.835498046875, "learning_rate": 0.00019972644961499853, "loss": 1102.0812, "step": 7080 }, { "ce_loss_12": 3.133434867858887, "ce_loss_17": 2.961903750896454, "ce_loss_23": 2.8904358386993407, "ce_loss_3": 3.981268012523651, "ce_loss_6": 3.5474346041679383, "epoch": 0.709, "grad_norm": 1336.0, "kl_loss_12": 570.3906784057617, "kl_loss_17": 167.14042587280272, "kl_loss_3": 2317.08115234375, "kl_loss_6": 1458.70458984375, "learning_rate": 0.00019845928476725522, "loss": 1116.1218, "step": 7090 }, { "ce_loss_12": 3.202507257461548, "ce_loss_17": 3.0336686730384828, "ce_loss_23": 2.961744248867035, "ce_loss_3": 4.013906419277191, "ce_loss_6": 3.603237068653107, "epoch": 0.71, "grad_norm": 1208.0, "kl_loss_12": 562.5401397705078, "kl_loss_17": 165.22715530395507, "kl_loss_3": 2251.490167236328, "kl_loss_6": 1427.9730285644532, "learning_rate": 0.00019719515643116677, "loss": 1133.9623, "step": 7100 }, { "ce_loss_12": 3.1314493298530577, "ce_loss_17": 2.965890979766846, "ce_loss_23": 2.8989282608032227, "ce_loss_3": 3.943610680103302, "ce_loss_6": 3.5178272485733033, "epoch": 0.711, "grad_norm": 1168.0, "kl_loss_12": 547.7857849121094, "kl_loss_17": 160.55599670410157, "kl_loss_3": 2237.0823181152346, "kl_loss_6": 1387.6133056640624, "learning_rate": 0.0001959340773364911, "loss": 1104.2832, "step": 7110 }, { "ce_loss_12": 3.149474573135376, "ce_loss_17": 2.982474982738495, "ce_loss_23": 2.9145326018333435, "ce_loss_3": 3.979136312007904, "ce_loss_6": 3.5494866251945494, "epoch": 0.712, "grad_norm": 1152.0, "kl_loss_12": 562.058088684082, "kl_loss_17": 163.32530517578124, "kl_loss_3": 2275.673974609375, "kl_loss_6": 1424.4098510742188, "learning_rate": 0.0001946760601822809, "loss": 1086.5138, "step": 7120 }, { "ce_loss_12": 3.2002389788627625, "ce_loss_17": 3.031752586364746, "ce_loss_23": 2.9661432981491087, "ce_loss_3": 4.001896345615387, "ce_loss_6": 3.58693687915802, "epoch": 0.713, "grad_norm": 1424.0, "kl_loss_12": 552.4800445556641, "kl_loss_17": 161.70651321411134, "kl_loss_3": 2226.4401245117188, "kl_loss_6": 1396.9112182617187, "learning_rate": 0.00019342111763675512, "loss": 1073.6355, "step": 7130 }, { "ce_loss_12": 3.1916829347610474, "ce_loss_17": 3.034743547439575, "ce_loss_23": 2.9650686502456667, "ce_loss_3": 3.9788838386535645, "ce_loss_6": 3.5754756927490234, "epoch": 0.714, "grad_norm": 1384.0, "kl_loss_12": 551.3403244018555, "kl_loss_17": 163.68495712280273, "kl_loss_3": 2187.1236572265625, "kl_loss_6": 1389.4500671386718, "learning_rate": 0.00019216926233717085, "loss": 1076.3952, "step": 7140 }, { "ce_loss_12": 3.0939472317695618, "ce_loss_17": 2.9311779618263243, "ce_loss_23": 2.86406432390213, "ce_loss_3": 3.990004599094391, "ce_loss_6": 3.5400795698165894, "epoch": 0.715, "grad_norm": 1168.0, "kl_loss_12": 546.9031967163086, "kl_loss_17": 160.15991592407227, "kl_loss_3": 2377.830218505859, "kl_loss_6": 1489.6150390625, "learning_rate": 0.00019092050688969737, "loss": 1126.9752, "step": 7150 }, { "ce_loss_12": 3.155309784412384, "ce_loss_17": 2.9939750909805296, "ce_loss_23": 2.9262243509292603, "ce_loss_3": 3.9573385953903197, "ce_loss_6": 3.5473846793174744, "epoch": 0.716, "grad_norm": 1104.0, "kl_loss_12": 548.476106262207, "kl_loss_17": 160.36773605346679, "kl_loss_3": 2237.303985595703, "kl_loss_6": 1409.4257446289062, "learning_rate": 0.00018967486386928817, "loss": 1087.5232, "step": 7160 }, { "ce_loss_12": 3.047571229934692, "ce_loss_17": 2.877414608001709, "ce_loss_23": 2.808344340324402, "ce_loss_3": 3.892746686935425, "ce_loss_6": 3.459039735794067, "epoch": 0.717, "grad_norm": 1264.0, "kl_loss_12": 563.6357772827148, "kl_loss_17": 162.5858512878418, "kl_loss_3": 2310.7905822753905, "kl_loss_6": 1447.4100341796875, "learning_rate": 0.00018843234581955443, "loss": 1148.2967, "step": 7170 }, { "ce_loss_12": 3.053649604320526, "ce_loss_17": 2.883082091808319, "ce_loss_23": 2.810990631580353, "ce_loss_3": 3.8953574657440186, "ce_loss_6": 3.4785933017730715, "epoch": 0.718, "grad_norm": 1200.0, "kl_loss_12": 565.9681213378906, "kl_loss_17": 164.66096267700195, "kl_loss_3": 2300.275506591797, "kl_loss_6": 1458.3315551757812, "learning_rate": 0.00018719296525263924, "loss": 1114.9986, "step": 7180 }, { "ce_loss_12": 3.140706789493561, "ce_loss_17": 2.981066620349884, "ce_loss_23": 2.9154026985168455, "ce_loss_3": 3.924810791015625, "ce_loss_6": 3.5228620290756227, "epoch": 0.719, "grad_norm": 1016.0, "kl_loss_12": 544.8866394042968, "kl_loss_17": 161.65244522094727, "kl_loss_3": 2172.4519226074217, "kl_loss_6": 1369.1830444335938, "learning_rate": 0.0001859567346490913, "loss": 1072.4305, "step": 7190 }, { "ce_loss_12": 3.131831610202789, "ce_loss_17": 2.961239516735077, "ce_loss_23": 2.890417754650116, "ce_loss_3": 3.959796333312988, "ce_loss_6": 3.537416911125183, "epoch": 0.72, "grad_norm": 1272.0, "kl_loss_12": 566.2015747070312, "kl_loss_17": 167.16772689819337, "kl_loss_3": 2285.071160888672, "kl_loss_6": 1436.5745483398437, "learning_rate": 0.0001847236664577389, "loss": 1096.1967, "step": 7200 }, { "ce_loss_12": 3.139620101451874, "ce_loss_17": 2.9793400049209593, "ce_loss_23": 2.913111627101898, "ce_loss_3": 3.926980221271515, "ce_loss_6": 3.5231330752372743, "epoch": 0.721, "grad_norm": 1096.0, "kl_loss_12": 545.3568664550781, "kl_loss_17": 160.95148239135742, "kl_loss_3": 2178.54990234375, "kl_loss_6": 1376.5175537109376, "learning_rate": 0.00018349377309556487, "loss": 1070.0953, "step": 7210 }, { "ce_loss_12": 3.1002493500709534, "ce_loss_17": 2.934877908229828, "ce_loss_23": 2.8660946249961854, "ce_loss_3": 3.966576647758484, "ce_loss_6": 3.5206631064414977, "epoch": 0.722, "grad_norm": 1360.0, "kl_loss_12": 573.0969818115234, "kl_loss_17": 166.3352149963379, "kl_loss_3": 2372.4399658203124, "kl_loss_6": 1479.197607421875, "learning_rate": 0.00018226706694758193, "loss": 1128.7332, "step": 7220 }, { "ce_loss_12": 3.164998912811279, "ce_loss_17": 3.004845643043518, "ce_loss_23": 2.939321994781494, "ce_loss_3": 3.984208607673645, "ce_loss_6": 3.5630035042762755, "epoch": 0.723, "grad_norm": 1152.0, "kl_loss_12": 557.9833618164063, "kl_loss_17": 161.2952865600586, "kl_loss_3": 2261.3576477050783, "kl_loss_6": 1421.3094360351563, "learning_rate": 0.0001810435603667075, "loss": 1127.0174, "step": 7230 }, { "ce_loss_12": 3.0262806892395018, "ce_loss_17": 2.862540531158447, "ce_loss_23": 2.7961209177970887, "ce_loss_3": 3.8637616872787475, "ce_loss_6": 3.4397748947143554, "epoch": 0.724, "grad_norm": 1024.0, "kl_loss_12": 546.5524963378906, "kl_loss_17": 159.55378036499025, "kl_loss_3": 2263.3500122070313, "kl_loss_6": 1415.6317260742187, "learning_rate": 0.0001798232656736389, "loss": 1119.4006, "step": 7240 }, { "ce_loss_12": 3.18226363658905, "ce_loss_17": 3.0180405259132383, "ce_loss_23": 2.9479485750198364, "ce_loss_3": 3.9728939294815064, "ce_loss_6": 3.562802815437317, "epoch": 0.725, "grad_norm": 900.0, "kl_loss_12": 546.6362609863281, "kl_loss_17": 161.94093399047853, "kl_loss_3": 2180.735302734375, "kl_loss_6": 1363.2637145996093, "learning_rate": 0.0001786061951567303, "loss": 1085.3965, "step": 7250 }, { "ce_loss_12": 3.105582582950592, "ce_loss_17": 2.9399035573005676, "ce_loss_23": 2.8691635012626646, "ce_loss_3": 3.9410958766937254, "ce_loss_6": 3.5125160813331604, "epoch": 0.726, "grad_norm": 1432.0, "kl_loss_12": 559.9968872070312, "kl_loss_17": 165.33421478271484, "kl_loss_3": 2276.2733947753904, "kl_loss_6": 1432.7234924316406, "learning_rate": 0.00017739236107186857, "loss": 1119.3277, "step": 7260 }, { "ce_loss_12": 3.1843878746032717, "ce_loss_17": 3.030998408794403, "ce_loss_23": 2.9644922494888304, "ce_loss_3": 3.9629374146461487, "ce_loss_6": 3.563722383975983, "epoch": 0.727, "grad_norm": 1048.0, "kl_loss_12": 538.6017959594726, "kl_loss_17": 158.84636154174805, "kl_loss_3": 2159.2968811035157, "kl_loss_6": 1362.6561462402344, "learning_rate": 0.00017618177564234904, "loss": 1077.8926, "step": 7270 }, { "ce_loss_12": 3.1545698285102843, "ce_loss_17": 3.000528430938721, "ce_loss_23": 2.9358838319778444, "ce_loss_3": 3.935468780994415, "ce_loss_6": 3.5329660058021544, "epoch": 0.728, "grad_norm": 1344.0, "kl_loss_12": 531.7809387207031, "kl_loss_17": 156.99573287963867, "kl_loss_3": 2140.3086486816405, "kl_loss_6": 1345.6030700683593, "learning_rate": 0.00017497445105875377, "loss": 1067.2183, "step": 7280 }, { "ce_loss_12": 3.0874632835388183, "ce_loss_17": 2.9163344144821166, "ce_loss_23": 2.8498679399490356, "ce_loss_3": 3.932736027240753, "ce_loss_6": 3.4955358624458315, "epoch": 0.729, "grad_norm": 952.0, "kl_loss_12": 567.4337753295898, "kl_loss_17": 164.65280532836914, "kl_loss_3": 2311.5119812011717, "kl_loss_6": 1440.8906311035157, "learning_rate": 0.000173770399478828, "loss": 1112.781, "step": 7290 }, { "ce_loss_12": 3.0088712096214296, "ce_loss_17": 2.8473345756530763, "ce_loss_23": 2.784889876842499, "ce_loss_3": 3.834717857837677, "ce_loss_6": 3.403339159488678, "epoch": 0.73, "grad_norm": 1224.0, "kl_loss_12": 539.0748489379882, "kl_loss_17": 158.0688606262207, "kl_loss_3": 2244.918664550781, "kl_loss_6": 1394.7807189941407, "learning_rate": 0.0001725696330273575, "loss": 1116.7592, "step": 7300 }, { "ce_loss_12": 3.1739781856536866, "ce_loss_17": 3.014979827404022, "ce_loss_23": 2.9461803793907166, "ce_loss_3": 3.9663755893707275, "ce_loss_6": 3.565347063541412, "epoch": 0.731, "grad_norm": 984.0, "kl_loss_12": 537.3137969970703, "kl_loss_17": 158.49719619750977, "kl_loss_3": 2170.088458251953, "kl_loss_6": 1369.4702209472657, "learning_rate": 0.00017137216379604724, "loss": 1068.5676, "step": 7310 }, { "ce_loss_12": 3.0623173117637634, "ce_loss_17": 2.8997745752334594, "ce_loss_23": 2.8324743151664733, "ce_loss_3": 3.900362193584442, "ce_loss_6": 3.470179033279419, "epoch": 0.732, "grad_norm": 1008.0, "kl_loss_12": 547.4601623535157, "kl_loss_17": 161.21364669799806, "kl_loss_3": 2262.5873413085938, "kl_loss_6": 1410.8793884277343, "learning_rate": 0.00017017800384339925, "loss": 1099.5278, "step": 7320 }, { "ce_loss_12": 3.025323486328125, "ce_loss_17": 2.856322979927063, "ce_loss_23": 2.7867146492004395, "ce_loss_3": 3.8915157079696656, "ce_loss_6": 3.4566821694374084, "epoch": 0.733, "grad_norm": 1056.0, "kl_loss_12": 561.3699798583984, "kl_loss_17": 162.175146484375, "kl_loss_3": 2332.4853088378904, "kl_loss_6": 1466.8850708007812, "learning_rate": 0.00016898716519459073, "loss": 1096.8963, "step": 7330 }, { "ce_loss_12": 3.151548945903778, "ce_loss_17": 2.9805155396461487, "ce_loss_23": 2.9074472308158876, "ce_loss_3": 4.005491101741791, "ce_loss_6": 3.57403963804245, "epoch": 0.734, "grad_norm": 908.0, "kl_loss_12": 578.352165222168, "kl_loss_17": 169.664949798584, "kl_loss_3": 2314.9711486816404, "kl_loss_6": 1457.3987060546874, "learning_rate": 0.00016779965984135375, "loss": 1110.758, "step": 7340 }, { "ce_loss_12": 3.063838768005371, "ce_loss_17": 2.901591444015503, "ce_loss_23": 2.834994339942932, "ce_loss_3": 3.9005173802375794, "ce_loss_6": 3.467866039276123, "epoch": 0.735, "grad_norm": 1096.0, "kl_loss_12": 540.851530456543, "kl_loss_17": 157.2512535095215, "kl_loss_3": 2253.485968017578, "kl_loss_6": 1398.4514953613282, "learning_rate": 0.00016661549974185424, "loss": 1091.6617, "step": 7350 }, { "ce_loss_12": 3.094240939617157, "ce_loss_17": 2.932550811767578, "ce_loss_23": 2.864250934123993, "ce_loss_3": 3.917224442958832, "ce_loss_6": 3.493868517875671, "epoch": 0.736, "grad_norm": 1232.0, "kl_loss_12": 553.4183349609375, "kl_loss_17": 162.8744571685791, "kl_loss_3": 2251.1023193359374, "kl_loss_6": 1406.9132019042968, "learning_rate": 0.00016543469682057105, "loss": 1086.0771, "step": 7360 }, { "ce_loss_12": 3.125151574611664, "ce_loss_17": 2.957120954990387, "ce_loss_23": 2.886409246921539, "ce_loss_3": 3.9414002776145933, "ce_loss_6": 3.522070753574371, "epoch": 0.737, "grad_norm": 936.0, "kl_loss_12": 561.4340881347656, "kl_loss_17": 164.44181289672852, "kl_loss_3": 2250.564172363281, "kl_loss_6": 1418.036572265625, "learning_rate": 0.00016425726296817632, "loss": 1091.9379, "step": 7370 }, { "ce_loss_12": 3.1298630475997924, "ce_loss_17": 2.9687084794044494, "ce_loss_23": 2.9038389205932615, "ce_loss_3": 3.9451918244361877, "ce_loss_6": 3.5293861985206605, "epoch": 0.738, "grad_norm": 1024.0, "kl_loss_12": 541.5226104736328, "kl_loss_17": 159.93819351196288, "kl_loss_3": 2214.3490661621095, "kl_loss_6": 1394.9463256835938, "learning_rate": 0.00016308321004141607, "loss": 1087.2498, "step": 7380 }, { "ce_loss_12": 3.087207305431366, "ce_loss_17": 2.9217296123504637, "ce_loss_23": 2.851014792919159, "ce_loss_3": 3.9276385068893434, "ce_loss_6": 3.4967284917831423, "epoch": 0.739, "grad_norm": 1312.0, "kl_loss_12": 567.2153533935547, "kl_loss_17": 166.8703598022461, "kl_loss_3": 2282.3760681152344, "kl_loss_6": 1435.2972351074218, "learning_rate": 0.00016191254986299043, "loss": 1091.6779, "step": 7390 }, { "ce_loss_12": 3.112338662147522, "ce_loss_17": 2.9599267840385437, "ce_loss_23": 2.8951269507408144, "ce_loss_3": 3.920237624645233, "ce_loss_6": 3.514072132110596, "epoch": 0.74, "grad_norm": 1296.0, "kl_loss_12": 535.1316421508789, "kl_loss_17": 157.7860466003418, "kl_loss_3": 2220.921942138672, "kl_loss_6": 1406.8481384277343, "learning_rate": 0.00016074529422143398, "loss": 1102.7136, "step": 7400 }, { "ce_loss_12": 3.0936760783195494, "ce_loss_17": 2.9284700989723205, "ce_loss_23": 2.861575019359589, "ce_loss_3": 3.94160852432251, "ce_loss_6": 3.501969301700592, "epoch": 0.741, "grad_norm": 1200.0, "kl_loss_12": 559.6839050292969, "kl_loss_17": 166.13983001708985, "kl_loss_3": 2296.2011291503904, "kl_loss_6": 1432.079364013672, "learning_rate": 0.0001595814548709983, "loss": 1118.8881, "step": 7410 }, { "ce_loss_12": 3.159260606765747, "ce_loss_17": 2.990760338306427, "ce_loss_23": 2.9189414262771605, "ce_loss_3": 3.9913843154907225, "ce_loss_6": 3.56757196187973, "epoch": 0.742, "grad_norm": 1160.0, "kl_loss_12": 572.7192138671875, "kl_loss_17": 167.77596206665038, "kl_loss_3": 2308.6673828125, "kl_loss_6": 1447.264874267578, "learning_rate": 0.00015842104353153285, "loss": 1114.7347, "step": 7420 }, { "ce_loss_12": 3.1647661447525026, "ce_loss_17": 2.9976929903030394, "ce_loss_23": 2.9297439217567445, "ce_loss_3": 3.9861793637275698, "ce_loss_6": 3.568111205101013, "epoch": 0.743, "grad_norm": 1256.0, "kl_loss_12": 556.8150070190429, "kl_loss_17": 163.89298629760742, "kl_loss_3": 2258.01337890625, "kl_loss_6": 1422.5369995117187, "learning_rate": 0.0001572640718883667, "loss": 1122.6908, "step": 7430 }, { "ce_loss_12": 3.0959529757499693, "ce_loss_17": 2.9399875164031983, "ce_loss_23": 2.877901887893677, "ce_loss_3": 3.9096753716468813, "ce_loss_6": 3.4912124156951903, "epoch": 0.744, "grad_norm": 1208.0, "kl_loss_12": 542.3641479492187, "kl_loss_17": 157.65117416381835, "kl_loss_3": 2208.268170166016, "kl_loss_6": 1384.6812377929687, "learning_rate": 0.0001561105515921915, "loss": 1108.7784, "step": 7440 }, { "ce_loss_12": 2.977490186691284, "ce_loss_17": 2.814324605464935, "ce_loss_23": 2.749322760105133, "ce_loss_3": 3.8388984203338623, "ce_loss_6": 3.398288404941559, "epoch": 0.745, "grad_norm": 1048.0, "kl_loss_12": 548.0307357788085, "kl_loss_17": 156.30992317199707, "kl_loss_3": 2329.5126525878904, "kl_loss_6": 1452.494940185547, "learning_rate": 0.0001549604942589441, "loss": 1101.1797, "step": 7450 }, { "ce_loss_12": 3.135593664646149, "ce_loss_17": 2.978959119319916, "ce_loss_23": 2.9141037464141846, "ce_loss_3": 3.9119282364845276, "ce_loss_6": 3.510556936264038, "epoch": 0.746, "grad_norm": 1600.0, "kl_loss_12": 532.6268692016602, "kl_loss_17": 156.17607040405272, "kl_loss_3": 2157.824334716797, "kl_loss_6": 1351.6411987304687, "learning_rate": 0.00015381391146968864, "loss": 1070.9182, "step": 7460 }, { "ce_loss_12": 3.107375943660736, "ce_loss_17": 2.9511581659317017, "ce_loss_23": 2.8842229008674622, "ce_loss_3": 3.941971480846405, "ce_loss_6": 3.51314240694046, "epoch": 0.747, "grad_norm": 1232.0, "kl_loss_12": 536.7534057617188, "kl_loss_17": 156.5211166381836, "kl_loss_3": 2236.07861328125, "kl_loss_6": 1399.7406127929687, "learning_rate": 0.00015267081477050133, "loss": 1095.7694, "step": 7470 }, { "ce_loss_12": 3.204038417339325, "ce_loss_17": 3.0420343041419984, "ce_loss_23": 2.9723053574562073, "ce_loss_3": 4.002698719501495, "ce_loss_6": 3.5953757524490357, "epoch": 0.748, "grad_norm": 1040.0, "kl_loss_12": 558.8490585327148, "kl_loss_17": 166.1141471862793, "kl_loss_3": 2206.0192810058593, "kl_loss_6": 1402.625665283203, "learning_rate": 0.00015153121567235335, "loss": 1076.1527, "step": 7480 }, { "ce_loss_12": 3.105157423019409, "ce_loss_17": 2.947098362445831, "ce_loss_23": 2.8795030236244203, "ce_loss_3": 3.933885872364044, "ce_loss_6": 3.5141167044639587, "epoch": 0.749, "grad_norm": 908.0, "kl_loss_12": 552.778515625, "kl_loss_17": 161.83009643554686, "kl_loss_3": 2275.603137207031, "kl_loss_6": 1430.296209716797, "learning_rate": 0.00015039512565099468, "loss": 1075.2557, "step": 7490 }, { "ce_loss_12": 3.1631266593933107, "ce_loss_17": 3.0015437960624696, "ce_loss_23": 2.9345196962356566, "ce_loss_3": 3.9690038084983827, "ce_loss_6": 3.5539440274238587, "epoch": 0.75, "grad_norm": 984.0, "kl_loss_12": 547.2122283935547, "kl_loss_17": 160.6443084716797, "kl_loss_3": 2220.7647399902344, "kl_loss_6": 1392.8933410644531, "learning_rate": 0.00014926255614683932, "loss": 1124.4953, "step": 7500 }, { "ce_loss_12": 3.101532554626465, "ce_loss_17": 2.9439298510551453, "ce_loss_23": 2.8763969421386717, "ce_loss_3": 3.9131872415542603, "ce_loss_6": 3.491703712940216, "epoch": 0.751, "grad_norm": 1168.0, "kl_loss_12": 549.1255447387696, "kl_loss_17": 160.57257156372071, "kl_loss_3": 2240.627813720703, "kl_loss_6": 1390.5830993652344, "learning_rate": 0.0001481335185648498, "loss": 1095.8574, "step": 7510 }, { "ce_loss_12": 3.126468801498413, "ce_loss_17": 2.9646638870239257, "ce_loss_23": 2.899153769016266, "ce_loss_3": 3.939616787433624, "ce_loss_6": 3.5179593563079834, "epoch": 0.752, "grad_norm": 1144.0, "kl_loss_12": 547.5724853515625, "kl_loss_17": 160.2132423400879, "kl_loss_3": 2241.28125, "kl_loss_6": 1402.0799621582032, "learning_rate": 0.0001470080242744218, "loss": 1083.3291, "step": 7520 }, { "ce_loss_12": 3.1182099103927614, "ce_loss_17": 2.959747242927551, "ce_loss_23": 2.8968014121055603, "ce_loss_3": 3.9456854939460753, "ce_loss_6": 3.5264230728149415, "epoch": 0.753, "grad_norm": 1440.0, "kl_loss_12": 538.8024368286133, "kl_loss_17": 156.9289436340332, "kl_loss_3": 2241.8080932617186, "kl_loss_6": 1411.8066284179688, "learning_rate": 0.0001458860846092705, "loss": 1099.8725, "step": 7530 }, { "ce_loss_12": 3.163596737384796, "ce_loss_17": 3.004104268550873, "ce_loss_23": 2.9375285863876344, "ce_loss_3": 3.957572305202484, "ce_loss_6": 3.5540796637535097, "epoch": 0.754, "grad_norm": 1216.0, "kl_loss_12": 541.4927703857422, "kl_loss_17": 160.5062156677246, "kl_loss_3": 2173.8907104492187, "kl_loss_6": 1377.824725341797, "learning_rate": 0.00014476771086731566, "loss": 1061.0291, "step": 7540 }, { "ce_loss_12": 3.2464739799499513, "ce_loss_17": 3.0824361085891723, "ce_loss_23": 3.01233891248703, "ce_loss_3": 4.050685405731201, "ce_loss_6": 3.637848448753357, "epoch": 0.755, "grad_norm": 1176.0, "kl_loss_12": 562.2777740478516, "kl_loss_17": 169.30307312011718, "kl_loss_3": 2217.041650390625, "kl_loss_6": 1403.5890319824218, "learning_rate": 0.00014365291431056872, "loss": 1113.4173, "step": 7550 }, { "ce_loss_12": 3.0967350363731385, "ce_loss_17": 2.926898777484894, "ce_loss_23": 2.855245494842529, "ce_loss_3": 3.9306687831878664, "ce_loss_6": 3.501248574256897, "epoch": 0.756, "grad_norm": 1360.0, "kl_loss_12": 567.5073440551757, "kl_loss_17": 167.36128311157228, "kl_loss_3": 2294.8422302246095, "kl_loss_6": 1441.266864013672, "learning_rate": 0.00014254170616501827, "loss": 1102.9068, "step": 7560 }, { "ce_loss_12": 3.0469727873802186, "ce_loss_17": 2.8708555340766906, "ce_loss_23": 2.7983035683631896, "ce_loss_3": 3.9179059267044067, "ce_loss_6": 3.485325014591217, "epoch": 0.757, "grad_norm": 1120.0, "kl_loss_12": 577.7532028198242, "kl_loss_17": 166.80727844238282, "kl_loss_3": 2361.802197265625, "kl_loss_6": 1497.723193359375, "learning_rate": 0.0001414340976205183, "loss": 1142.0348, "step": 7570 }, { "ce_loss_12": 3.058771586418152, "ce_loss_17": 2.882201647758484, "ce_loss_23": 2.815154159069061, "ce_loss_3": 3.9053560972213743, "ce_loss_6": 3.4667197585105898, "epoch": 0.758, "grad_norm": 1328.0, "kl_loss_12": 570.4278915405273, "kl_loss_17": 160.15200500488282, "kl_loss_3": 2310.2031005859376, "kl_loss_6": 1439.8280883789062, "learning_rate": 0.00014033009983067452, "loss": 1101.2764, "step": 7580 }, { "ce_loss_12": 3.1888783097267153, "ce_loss_17": 3.031953418254852, "ce_loss_23": 2.966986298561096, "ce_loss_3": 3.9651535511016847, "ce_loss_6": 3.5674421310424806, "epoch": 0.759, "grad_norm": 1024.0, "kl_loss_12": 533.8454498291015, "kl_loss_17": 157.25309829711915, "kl_loss_3": 2158.3534240722656, "kl_loss_6": 1357.4085571289063, "learning_rate": 0.00013922972391273224, "loss": 1076.676, "step": 7590 }, { "ce_loss_12": 3.18862464427948, "ce_loss_17": 3.0305684566497804, "ce_loss_23": 2.9652734279632567, "ce_loss_3": 4.02052389383316, "ce_loss_6": 3.5788715481758118, "epoch": 0.76, "grad_norm": 1464.0, "kl_loss_12": 543.7809707641602, "kl_loss_17": 161.5416374206543, "kl_loss_3": 2240.5658569335938, "kl_loss_6": 1375.7309265136719, "learning_rate": 0.0001381329809474649, "loss": 1091.1582, "step": 7600 }, { "ce_loss_12": 3.1171194434165956, "ce_loss_17": 2.9485602140426637, "ce_loss_23": 2.8763870716094972, "ce_loss_3": 3.9727436780929564, "ce_loss_6": 3.5333943486213686, "epoch": 0.761, "grad_norm": 1240.0, "kl_loss_12": 568.4049987792969, "kl_loss_17": 165.82229461669922, "kl_loss_3": 2333.8710205078123, "kl_loss_6": 1460.953515625, "learning_rate": 0.0001370398819790621, "loss": 1118.3304, "step": 7610 }, { "ce_loss_12": 3.230324959754944, "ce_loss_17": 3.0689058542251586, "ce_loss_23": 3.0017220854759215, "ce_loss_3": 4.028148972988129, "ce_loss_6": 3.6192330360412597, "epoch": 0.762, "grad_norm": 1232.0, "kl_loss_12": 544.9417282104492, "kl_loss_17": 161.8925666809082, "kl_loss_3": 2187.680139160156, "kl_loss_6": 1370.5897888183595, "learning_rate": 0.00013595043801501794, "loss": 1066.1822, "step": 7620 }, { "ce_loss_12": 3.055271232128143, "ce_loss_17": 2.884974014759064, "ce_loss_23": 2.8141839265823365, "ce_loss_3": 3.9491064667701723, "ce_loss_6": 3.4900349378585815, "epoch": 0.763, "grad_norm": 1496.0, "kl_loss_12": 568.0841201782226, "kl_loss_17": 165.38999710083007, "kl_loss_3": 2394.4039001464844, "kl_loss_6": 1488.0246459960938, "learning_rate": 0.00013486466002602133, "loss": 1123.8639, "step": 7630 }, { "ce_loss_12": 3.1421401262283326, "ce_loss_17": 2.981762206554413, "ce_loss_23": 2.9162477374076845, "ce_loss_3": 3.9335285305976866, "ce_loss_6": 3.533025884628296, "epoch": 0.764, "grad_norm": 1416.0, "kl_loss_12": 541.4627029418946, "kl_loss_17": 160.2584083557129, "kl_loss_3": 2189.476123046875, "kl_loss_6": 1387.05546875, "learning_rate": 0.00013378255894584462, "loss": 1107.4135, "step": 7640 }, { "ce_loss_12": 3.0940568923950194, "ce_loss_17": 2.925947606563568, "ce_loss_23": 2.856828761100769, "ce_loss_3": 3.9432212829589846, "ce_loss_6": 3.5037839889526365, "epoch": 0.765, "grad_norm": 996.0, "kl_loss_12": 563.4930908203125, "kl_loss_17": 165.41349716186522, "kl_loss_3": 2311.5087646484376, "kl_loss_6": 1438.8681091308595, "learning_rate": 0.0001327041456712334, "loss": 1113.0895, "step": 7650 }, { "ce_loss_12": 3.1337642550468443, "ce_loss_17": 2.9663718938827515, "ce_loss_23": 2.8984671950340273, "ce_loss_3": 3.954953646659851, "ce_loss_6": 3.534931719303131, "epoch": 0.766, "grad_norm": 1624.0, "kl_loss_12": 560.6311721801758, "kl_loss_17": 164.26156311035157, "kl_loss_3": 2265.8011779785156, "kl_loss_6": 1423.2526489257812, "learning_rate": 0.00013162943106179747, "loss": 1112.404, "step": 7660 }, { "ce_loss_12": 3.1063711881637572, "ce_loss_17": 2.944747340679169, "ce_loss_23": 2.877778971195221, "ce_loss_3": 3.9044309973716738, "ce_loss_6": 3.5003751158714294, "epoch": 0.767, "grad_norm": 1056.0, "kl_loss_12": 542.2193283081054, "kl_loss_17": 160.44931716918944, "kl_loss_3": 2210.9869079589844, "kl_loss_6": 1404.0770751953125, "learning_rate": 0.00013055842593990132, "loss": 1088.844, "step": 7670 }, { "ce_loss_12": 3.0573420405387877, "ce_loss_17": 2.893602359294891, "ce_loss_23": 2.829056429862976, "ce_loss_3": 3.869810235500336, "ce_loss_6": 3.454623758792877, "epoch": 0.768, "grad_norm": 980.0, "kl_loss_12": 544.5171676635742, "kl_loss_17": 158.20527267456055, "kl_loss_3": 2200.6612182617187, "kl_loss_6": 1381.3787719726563, "learning_rate": 0.00012949114109055414, "loss": 1103.4986, "step": 7680 }, { "ce_loss_12": 3.0988786816596985, "ce_loss_17": 2.9312047243118284, "ce_loss_23": 2.862640619277954, "ce_loss_3": 3.929021441936493, "ce_loss_6": 3.5049775719642637, "epoch": 0.769, "grad_norm": 1176.0, "kl_loss_12": 558.3166275024414, "kl_loss_17": 163.206209564209, "kl_loss_3": 2267.092840576172, "kl_loss_6": 1426.1697814941406, "learning_rate": 0.00012842758726130281, "loss": 1111.3761, "step": 7690 }, { "ce_loss_12": 3.149468147754669, "ce_loss_17": 2.97717444896698, "ce_loss_23": 2.9082438707351685, "ce_loss_3": 3.9973672151565554, "ce_loss_6": 3.558988904953003, "epoch": 0.77, "grad_norm": 1040.0, "kl_loss_12": 565.3210571289062, "kl_loss_17": 164.83067245483397, "kl_loss_3": 2299.1438171386717, "kl_loss_6": 1442.9823791503907, "learning_rate": 0.00012736777516212267, "loss": 1096.093, "step": 7700 }, { "ce_loss_12": 3.142407476902008, "ce_loss_17": 2.9731947422027587, "ce_loss_23": 2.9043068170547484, "ce_loss_3": 3.960499668121338, "ce_loss_6": 3.5475622177124024, "epoch": 0.771, "grad_norm": 1128.0, "kl_loss_12": 563.460971069336, "kl_loss_17": 164.93396606445313, "kl_loss_3": 2251.1046752929688, "kl_loss_6": 1418.974591064453, "learning_rate": 0.00012631171546530968, "loss": 1086.3447, "step": 7710 }, { "ce_loss_12": 3.153670871257782, "ce_loss_17": 2.9829601645469666, "ce_loss_23": 2.9120034694671633, "ce_loss_3": 3.9621705174446107, "ce_loss_6": 3.5580246329307554, "epoch": 0.772, "grad_norm": 1144.0, "kl_loss_12": 566.0092697143555, "kl_loss_17": 166.2492805480957, "kl_loss_3": 2251.34677734375, "kl_loss_6": 1438.2859375, "learning_rate": 0.00012525941880537307, "loss": 1112.6428, "step": 7720 }, { "ce_loss_12": 3.168710947036743, "ce_loss_17": 3.01217383146286, "ce_loss_23": 2.9430142283439635, "ce_loss_3": 3.9859882950782777, "ce_loss_6": 3.570967698097229, "epoch": 0.773, "grad_norm": 1104.0, "kl_loss_12": 544.9650009155273, "kl_loss_17": 159.64609603881837, "kl_loss_3": 2227.202178955078, "kl_loss_6": 1404.3751037597656, "learning_rate": 0.00012421089577892869, "loss": 1088.829, "step": 7730 }, { "ce_loss_12": 3.133381700515747, "ce_loss_17": 2.96497061252594, "ce_loss_23": 2.8961349368095397, "ce_loss_3": 3.9726121544837953, "ce_loss_6": 3.5419203877449035, "epoch": 0.774, "grad_norm": 1256.0, "kl_loss_12": 557.8470489501954, "kl_loss_17": 161.55676193237304, "kl_loss_3": 2287.979541015625, "kl_loss_6": 1432.6632080078125, "learning_rate": 0.0001231661569445919, "loss": 1107.4707, "step": 7740 }, { "ce_loss_12": 3.0016807436943056, "ce_loss_17": 2.8387063264846804, "ce_loss_23": 2.7722793340682985, "ce_loss_3": 3.849887728691101, "ce_loss_6": 3.411376619338989, "epoch": 0.775, "grad_norm": 1200.0, "kl_loss_12": 551.7725036621093, "kl_loss_17": 161.13144760131837, "kl_loss_3": 2293.1958068847657, "kl_loss_6": 1418.0409729003907, "learning_rate": 0.00012212521282287093, "loss": 1123.733, "step": 7750 }, { "ce_loss_12": 3.136315310001373, "ce_loss_17": 2.9704002857208254, "ce_loss_23": 2.903051769733429, "ce_loss_3": 3.9540756225585936, "ce_loss_6": 3.5338873744010924, "epoch": 0.776, "grad_norm": 1048.0, "kl_loss_12": 560.9357513427734, "kl_loss_17": 164.57203521728516, "kl_loss_3": 2237.190466308594, "kl_loss_6": 1406.5361328125, "learning_rate": 0.00012108807389606158, "loss": 1113.689, "step": 7760 }, { "ce_loss_12": 3.133539354801178, "ce_loss_17": 2.972551167011261, "ce_loss_23": 2.9099935054779054, "ce_loss_3": 3.956028401851654, "ce_loss_6": 3.5379050612449645, "epoch": 0.777, "grad_norm": 1144.0, "kl_loss_12": 542.0610504150391, "kl_loss_17": 159.10600662231445, "kl_loss_3": 2235.199530029297, "kl_loss_6": 1404.911279296875, "learning_rate": 0.00012005475060814159, "loss": 1088.8854, "step": 7770 }, { "ce_loss_12": 3.079297161102295, "ce_loss_17": 2.9101614475250246, "ce_loss_23": 2.8443787336349486, "ce_loss_3": 3.9278798580169676, "ce_loss_6": 3.495157504081726, "epoch": 0.778, "grad_norm": 1128.0, "kl_loss_12": 555.6036270141601, "kl_loss_17": 161.68514709472657, "kl_loss_3": 2309.4862731933595, "kl_loss_6": 1449.1174621582031, "learning_rate": 0.00011902525336466464, "loss": 1110.0096, "step": 7780 }, { "ce_loss_12": 3.071139085292816, "ce_loss_17": 2.8988611459732057, "ce_loss_23": 2.828660762310028, "ce_loss_3": 3.9334118127822877, "ce_loss_6": 3.4934719800949097, "epoch": 0.779, "grad_norm": 1264.0, "kl_loss_12": 570.3909606933594, "kl_loss_17": 166.66974716186525, "kl_loss_3": 2341.7922607421874, "kl_loss_6": 1471.1030578613281, "learning_rate": 0.00011799959253265668, "loss": 1112.8619, "step": 7790 }, { "ce_loss_12": 3.116266095638275, "ce_loss_17": 2.9551839351654055, "ce_loss_23": 2.883231484889984, "ce_loss_3": 3.95099333524704, "ce_loss_6": 3.518081533908844, "epoch": 0.78, "grad_norm": 1208.0, "kl_loss_12": 559.7863510131835, "kl_loss_17": 166.4375129699707, "kl_loss_3": 2293.211083984375, "kl_loss_6": 1431.106298828125, "learning_rate": 0.00011697777844051105, "loss": 1107.758, "step": 7800 }, { "ce_loss_12": 3.1162440299987795, "ce_loss_17": 2.946736991405487, "ce_loss_23": 2.8777117013931273, "ce_loss_3": 3.9824501991271974, "ce_loss_6": 3.544774925708771, "epoch": 0.781, "grad_norm": 1176.0, "kl_loss_12": 561.5905197143554, "kl_loss_17": 166.06884841918946, "kl_loss_3": 2337.6393127441406, "kl_loss_6": 1477.4393798828125, "learning_rate": 0.00011595982137788402, "loss": 1119.0977, "step": 7810 }, { "ce_loss_12": 3.0908493518829347, "ce_loss_17": 2.93182715177536, "ce_loss_23": 2.865774428844452, "ce_loss_3": 3.8857927322387695, "ce_loss_6": 3.469491732120514, "epoch": 0.782, "grad_norm": 1136.0, "kl_loss_12": 542.1579315185547, "kl_loss_17": 160.3938201904297, "kl_loss_3": 2192.916149902344, "kl_loss_6": 1376.347833251953, "learning_rate": 0.00011494573159559212, "loss": 1088.1531, "step": 7820 }, { "ce_loss_12": 3.0768490195274354, "ce_loss_17": 2.9117467045783996, "ce_loss_23": 2.8412336468696595, "ce_loss_3": 3.9094900727272033, "ce_loss_6": 3.4854151964187623, "epoch": 0.783, "grad_norm": 1064.0, "kl_loss_12": 554.2974975585937, "kl_loss_17": 164.9409309387207, "kl_loss_3": 2269.3820617675783, "kl_loss_6": 1426.0858764648438, "learning_rate": 0.00011393551930550828, "loss": 1121.7828, "step": 7830 }, { "ce_loss_12": 3.192759084701538, "ce_loss_17": 3.032848227024078, "ce_loss_23": 2.9663447737693787, "ce_loss_3": 3.99887330532074, "ce_loss_6": 3.580846738815308, "epoch": 0.784, "grad_norm": 1072.0, "kl_loss_12": 547.1056564331054, "kl_loss_17": 163.2308448791504, "kl_loss_3": 2215.794158935547, "kl_loss_6": 1386.7675842285157, "learning_rate": 0.00011292919468045875, "loss": 1087.1904, "step": 7840 }, { "ce_loss_12": 3.1551998615264893, "ce_loss_17": 2.99315550327301, "ce_loss_23": 2.923645091056824, "ce_loss_3": 3.9820490002632143, "ce_loss_6": 3.5517470717430113, "epoch": 0.785, "grad_norm": 1048.0, "kl_loss_12": 554.8253494262696, "kl_loss_17": 162.82821502685547, "kl_loss_3": 2261.1311279296874, "kl_loss_6": 1406.467626953125, "learning_rate": 0.00011192676785412154, "loss": 1087.5309, "step": 7850 }, { "ce_loss_12": 3.1031730651855467, "ce_loss_17": 2.9350796341896057, "ce_loss_23": 2.864694333076477, "ce_loss_3": 3.9630802512168883, "ce_loss_6": 3.5200929641723633, "epoch": 0.786, "grad_norm": 1216.0, "kl_loss_12": 558.0501159667969, "kl_loss_17": 165.17536315917968, "kl_loss_3": 2308.5718994140625, "kl_loss_6": 1442.5363403320312, "learning_rate": 0.00011092824892092374, "loss": 1111.1633, "step": 7860 }, { "ce_loss_12": 3.0436164021492003, "ce_loss_17": 2.8756727457046507, "ce_loss_23": 2.808738589286804, "ce_loss_3": 3.9049333810806273, "ce_loss_6": 3.4604093074798583, "epoch": 0.787, "grad_norm": 1432.0, "kl_loss_12": 555.9726867675781, "kl_loss_17": 161.20776443481446, "kl_loss_3": 2315.7523681640623, "kl_loss_6": 1448.0448669433595, "learning_rate": 0.0001099336479359398, "loss": 1100.9196, "step": 7870 }, { "ce_loss_12": 3.1471460461616516, "ce_loss_17": 2.991259491443634, "ce_loss_23": 2.9230222702026367, "ce_loss_3": 3.9491287231445313, "ce_loss_6": 3.538780689239502, "epoch": 0.788, "grad_norm": 1144.0, "kl_loss_12": 547.1228820800782, "kl_loss_17": 161.35846481323242, "kl_loss_3": 2208.5154541015627, "kl_loss_6": 1388.1799682617188, "learning_rate": 0.00010894297491479043, "loss": 1092.2739, "step": 7880 }, { "ce_loss_12": 3.134075367450714, "ce_loss_17": 2.971082592010498, "ce_loss_23": 2.9070354461669923, "ce_loss_3": 3.9635905861854552, "ce_loss_6": 3.534369695186615, "epoch": 0.789, "grad_norm": 1112.0, "kl_loss_12": 550.5070495605469, "kl_loss_17": 160.23197479248046, "kl_loss_3": 2256.6437072753906, "kl_loss_6": 1408.9224609375, "learning_rate": 0.00010795623983354214, "loss": 1090.2475, "step": 7890 }, { "ce_loss_12": 3.0555803418159484, "ce_loss_17": 2.881819653511047, "ce_loss_23": 2.8134613275527953, "ce_loss_3": 3.89397656917572, "ce_loss_6": 3.4688626885414124, "epoch": 0.79, "grad_norm": 1096.0, "kl_loss_12": 569.0260833740234, "kl_loss_17": 167.3630630493164, "kl_loss_3": 2300.7442138671877, "kl_loss_6": 1453.1748962402344, "learning_rate": 0.00010697345262860636, "loss": 1106.0195, "step": 7900 }, { "ce_loss_12": 3.168570268154144, "ce_loss_17": 3.0090152263641357, "ce_loss_23": 2.942872929573059, "ce_loss_3": 3.9651273488998413, "ce_loss_6": 3.547044610977173, "epoch": 0.791, "grad_norm": 1192.0, "kl_loss_12": 543.2456344604492, "kl_loss_17": 160.62493667602538, "kl_loss_3": 2202.420587158203, "kl_loss_6": 1377.7202026367188, "learning_rate": 0.00010599462319663906, "loss": 1072.1201, "step": 7910 }, { "ce_loss_12": 3.1340227365493774, "ce_loss_17": 2.9779136538505555, "ce_loss_23": 2.9129930257797243, "ce_loss_3": 3.926445186138153, "ce_loss_6": 3.5237072110176086, "epoch": 0.792, "grad_norm": 1288.0, "kl_loss_12": 538.3043960571289, "kl_loss_17": 159.2765205383301, "kl_loss_3": 2171.213702392578, "kl_loss_6": 1366.4069458007812, "learning_rate": 0.00010501976139444191, "loss": 1068.7393, "step": 7920 }, { "ce_loss_12": 3.159173882007599, "ce_loss_17": 3.002285659313202, "ce_loss_23": 2.9352866649627685, "ce_loss_3": 3.970040965080261, "ce_loss_6": 3.5568660140037536, "epoch": 0.793, "grad_norm": 1512.0, "kl_loss_12": 539.3857559204101, "kl_loss_17": 160.57502517700195, "kl_loss_3": 2214.8924255371094, "kl_loss_6": 1396.2409423828126, "learning_rate": 0.0001040488770388625, "loss": 1101.3361, "step": 7930 }, { "ce_loss_12": 3.121935760974884, "ce_loss_17": 2.9588202238082886, "ce_loss_23": 2.895924472808838, "ce_loss_3": 3.947384810447693, "ce_loss_6": 3.52029892206192, "epoch": 0.794, "grad_norm": 1440.0, "kl_loss_12": 555.7849990844727, "kl_loss_17": 161.34661026000975, "kl_loss_3": 2273.8947875976564, "kl_loss_6": 1424.0836669921875, "learning_rate": 0.00010308197990669538, "loss": 1096.0457, "step": 7940 }, { "ce_loss_12": 3.2306849360466003, "ce_loss_17": 3.0659042596817017, "ce_loss_23": 2.999270474910736, "ce_loss_3": 4.04144002199173, "ce_loss_6": 3.6200562357902526, "epoch": 0.795, "grad_norm": 1176.0, "kl_loss_12": 560.016635131836, "kl_loss_17": 166.26138000488282, "kl_loss_3": 2237.8337158203126, "kl_loss_6": 1405.325018310547, "learning_rate": 0.0001021190797345839, "loss": 1087.4437, "step": 7950 }, { "ce_loss_12": 2.9893540263175966, "ce_loss_17": 2.8147446155548095, "ce_loss_23": 2.741576051712036, "ce_loss_3": 3.8425682902336122, "ce_loss_6": 3.410431373119354, "epoch": 0.796, "grad_norm": 1552.0, "kl_loss_12": 574.1566009521484, "kl_loss_17": 167.23827934265137, "kl_loss_3": 2333.274517822266, "kl_loss_6": 1471.6829833984375, "learning_rate": 0.00010116018621892236, "loss": 1115.1414, "step": 7960 }, { "ce_loss_12": 3.1764989376068113, "ce_loss_17": 3.010814297199249, "ce_loss_23": 2.943979728221893, "ce_loss_3": 4.0100136518478395, "ce_loss_6": 3.5845438480377196, "epoch": 0.797, "grad_norm": 1144.0, "kl_loss_12": 580.7473648071289, "kl_loss_17": 172.146883392334, "kl_loss_3": 2311.033654785156, "kl_loss_6": 1458.1046936035157, "learning_rate": 0.00010020530901575753, "loss": 1093.3162, "step": 7970 }, { "ce_loss_12": 3.19004088640213, "ce_loss_17": 3.028277337551117, "ce_loss_23": 2.959457278251648, "ce_loss_3": 4.00085334777832, "ce_loss_6": 3.5891884207725524, "epoch": 0.798, "grad_norm": 1000.0, "kl_loss_12": 559.4841430664062, "kl_loss_17": 165.01912384033204, "kl_loss_3": 2237.9213806152343, "kl_loss_6": 1415.7213012695313, "learning_rate": 9.925445774069231e-05, "loss": 1079.9097, "step": 7980 }, { "ce_loss_12": 3.147440028190613, "ce_loss_17": 2.9840749144554137, "ce_loss_23": 2.9128615975379946, "ce_loss_3": 3.959659469127655, "ce_loss_6": 3.5440555930137636, "epoch": 0.799, "grad_norm": 1168.0, "kl_loss_12": 550.6807968139649, "kl_loss_17": 163.16437606811525, "kl_loss_3": 2218.516809082031, "kl_loss_6": 1395.5287109375, "learning_rate": 9.830764196878872e-05, "loss": 1068.5918, "step": 7990 }, { "ce_loss_12": 3.095009219646454, "ce_loss_17": 2.934465217590332, "ce_loss_23": 2.8681960105895996, "ce_loss_3": 3.9243249773979185, "ce_loss_6": 3.4954513907432556, "epoch": 0.8, "grad_norm": 1416.0, "kl_loss_12": 549.8090255737304, "kl_loss_17": 159.90571517944335, "kl_loss_3": 2278.8769104003904, "kl_loss_6": 1422.961962890625, "learning_rate": 9.736487123447069e-05, "loss": 1096.4008, "step": 8000 }, { "ce_loss_12": 3.0546049952507017, "ce_loss_17": 2.8909072041511537, "ce_loss_23": 2.8225434184074403, "ce_loss_3": 3.934860420227051, "ce_loss_6": 3.4992011904716493, "epoch": 0.801, "grad_norm": 1240.0, "kl_loss_12": 562.626187133789, "kl_loss_17": 166.51637954711913, "kl_loss_3": 2389.3442626953124, "kl_loss_6": 1513.7143859863281, "learning_rate": 9.642615503142926e-05, "loss": 1133.6734, "step": 8010 }, { "ce_loss_12": 3.1169554352760316, "ce_loss_17": 2.948462611436844, "ce_loss_23": 2.880279469490051, "ce_loss_3": 3.9491363406181335, "ce_loss_6": 3.524028706550598, "epoch": 0.802, "grad_norm": 1080.0, "kl_loss_12": 552.5627502441406, "kl_loss_17": 161.54310836791993, "kl_loss_3": 2288.007940673828, "kl_loss_6": 1419.6550842285155, "learning_rate": 9.549150281252633e-05, "loss": 1089.2569, "step": 8020 }, { "ce_loss_12": 3.1377015233039858, "ce_loss_17": 2.972144603729248, "ce_loss_23": 2.9036526679992676, "ce_loss_3": 3.9606377482414246, "ce_loss_6": 3.535222041606903, "epoch": 0.803, "grad_norm": 1208.0, "kl_loss_12": 559.8434661865234, "kl_loss_17": 164.10131759643554, "kl_loss_3": 2280.4359741210938, "kl_loss_6": 1422.1460754394532, "learning_rate": 9.4560923989699e-05, "loss": 1113.476, "step": 8030 }, { "ce_loss_12": 3.1276845932006836, "ce_loss_17": 2.966171216964722, "ce_loss_23": 2.8971128463745117, "ce_loss_3": 3.9457432627677917, "ce_loss_6": 3.526905131340027, "epoch": 0.804, "grad_norm": 1328.0, "kl_loss_12": 556.661784362793, "kl_loss_17": 164.48508224487304, "kl_loss_3": 2246.531115722656, "kl_loss_6": 1412.6000244140625, "learning_rate": 9.363442793386607e-05, "loss": 1116.8084, "step": 8040 }, { "ce_loss_12": 3.1118528485298156, "ce_loss_17": 2.9384084939956665, "ce_loss_23": 2.865858054161072, "ce_loss_3": 3.961544167995453, "ce_loss_6": 3.5322256207466127, "epoch": 0.805, "grad_norm": 1624.0, "kl_loss_12": 571.4155303955079, "kl_loss_17": 166.18219985961915, "kl_loss_3": 2314.427600097656, "kl_loss_6": 1460.6573547363282, "learning_rate": 9.271202397483213e-05, "loss": 1092.9216, "step": 8050 }, { "ce_loss_12": 3.124697983264923, "ce_loss_17": 2.9692813038825987, "ce_loss_23": 2.903523635864258, "ce_loss_3": 3.9298166632652283, "ce_loss_6": 3.5110038042068483, "epoch": 0.806, "grad_norm": 1200.0, "kl_loss_12": 539.9032760620117, "kl_loss_17": 159.1617202758789, "kl_loss_3": 2204.6119384765625, "kl_loss_6": 1373.1665283203124, "learning_rate": 9.179372140119524e-05, "loss": 1096.2607, "step": 8060 }, { "ce_loss_12": 3.0772274017333983, "ce_loss_17": 2.9202582120895384, "ce_loss_23": 2.8505948543548585, "ce_loss_3": 3.9003460884094237, "ce_loss_6": 3.4769996047019958, "epoch": 0.807, "grad_norm": 1096.0, "kl_loss_12": 547.1208557128906, "kl_loss_17": 160.2839126586914, "kl_loss_3": 2238.119464111328, "kl_loss_6": 1401.8871643066407, "learning_rate": 9.087952946025175e-05, "loss": 1107.101, "step": 8070 }, { "ce_loss_12": 3.172690415382385, "ce_loss_17": 3.0231125354766846, "ce_loss_23": 2.9546570420265197, "ce_loss_3": 3.9569469451904298, "ce_loss_6": 3.548564386367798, "epoch": 0.808, "grad_norm": 1064.0, "kl_loss_12": 530.4381561279297, "kl_loss_17": 158.45982284545897, "kl_loss_3": 2170.6416198730467, "kl_loss_6": 1350.5114440917969, "learning_rate": 8.996945735790446e-05, "loss": 1087.7603, "step": 8080 }, { "ce_loss_12": 3.078972852230072, "ce_loss_17": 2.9214311003685, "ce_loss_23": 2.856155252456665, "ce_loss_3": 3.8913928270339966, "ce_loss_6": 3.485938382148743, "epoch": 0.809, "grad_norm": 1464.0, "kl_loss_12": 548.5154266357422, "kl_loss_17": 161.0421516418457, "kl_loss_3": 2236.966937255859, "kl_loss_6": 1416.3121643066406, "learning_rate": 8.906351425856951e-05, "loss": 1105.2345, "step": 8090 }, { "ce_loss_12": 3.0690324544906615, "ce_loss_17": 2.904653477668762, "ce_loss_23": 2.8365507245063784, "ce_loss_3": 3.9070201873779298, "ce_loss_6": 3.4807681679725646, "epoch": 0.81, "grad_norm": 1384.0, "kl_loss_12": 557.0683990478516, "kl_loss_17": 162.01241226196288, "kl_loss_3": 2309.0856811523436, "kl_loss_6": 1445.5119323730469, "learning_rate": 8.816170928508365e-05, "loss": 1119.2719, "step": 8100 }, { "ce_loss_12": 3.042520451545715, "ce_loss_17": 2.873482954502106, "ce_loss_23": 2.805701220035553, "ce_loss_3": 3.9062719345092773, "ce_loss_6": 3.460271942615509, "epoch": 0.811, "grad_norm": 1064.0, "kl_loss_12": 562.3566223144531, "kl_loss_17": 162.64302597045898, "kl_loss_3": 2333.633984375, "kl_loss_6": 1452.9960693359376, "learning_rate": 8.7264051518613e-05, "loss": 1107.0025, "step": 8110 }, { "ce_loss_12": 3.11094753742218, "ce_loss_17": 2.9495893716812134, "ce_loss_23": 2.886719620227814, "ce_loss_3": 3.9172590613365172, "ce_loss_6": 3.5028425812721253, "epoch": 0.812, "grad_norm": 1112.0, "kl_loss_12": 540.7414581298829, "kl_loss_17": 157.58410263061523, "kl_loss_3": 2204.1671936035154, "kl_loss_6": 1378.162091064453, "learning_rate": 8.637054999856148e-05, "loss": 1087.2692, "step": 8120 }, { "ce_loss_12": 3.1143086314201356, "ce_loss_17": 2.9487848043441773, "ce_loss_23": 2.879635775089264, "ce_loss_3": 3.938987469673157, "ce_loss_6": 3.5184776306152346, "epoch": 0.813, "grad_norm": 884.0, "kl_loss_12": 557.1480926513672, "kl_loss_17": 163.79063415527344, "kl_loss_3": 2253.558184814453, "kl_loss_6": 1420.3731384277344, "learning_rate": 8.548121372247918e-05, "loss": 1111.1787, "step": 8130 }, { "ce_loss_12": 3.1705002427101134, "ce_loss_17": 3.0160479664802553, "ce_loss_23": 2.9523680925369264, "ce_loss_3": 3.969861078262329, "ce_loss_6": 3.5573942184448244, "epoch": 0.814, "grad_norm": 1216.0, "kl_loss_12": 543.2226989746093, "kl_loss_17": 160.13306884765626, "kl_loss_3": 2231.438720703125, "kl_loss_6": 1395.8012634277343, "learning_rate": 8.459605164597267e-05, "loss": 1084.0575, "step": 8140 }, { "ce_loss_12": 3.066420042514801, "ce_loss_17": 2.9058117270469666, "ce_loss_23": 2.842375338077545, "ce_loss_3": 3.897522139549255, "ce_loss_6": 3.476026487350464, "epoch": 0.815, "grad_norm": 1224.0, "kl_loss_12": 548.8289077758789, "kl_loss_17": 159.9200454711914, "kl_loss_3": 2262.31728515625, "kl_loss_6": 1419.397479248047, "learning_rate": 8.371507268261436e-05, "loss": 1109.2594, "step": 8150 }, { "ce_loss_12": 3.1309832453727724, "ce_loss_17": 2.9698503971099854, "ce_loss_23": 2.900208055973053, "ce_loss_3": 3.9546516060829164, "ce_loss_6": 3.5335537910461428, "epoch": 0.816, "grad_norm": 848.0, "kl_loss_12": 545.8640365600586, "kl_loss_17": 161.9612693786621, "kl_loss_3": 2239.576623535156, "kl_loss_6": 1402.9355529785157, "learning_rate": 8.283828570385238e-05, "loss": 1072.7236, "step": 8160 }, { "ce_loss_12": 3.1363924384117126, "ce_loss_17": 2.9693047285079954, "ce_loss_23": 2.9015498042106627, "ce_loss_3": 3.959295666217804, "ce_loss_6": 3.535193419456482, "epoch": 0.817, "grad_norm": 1120.0, "kl_loss_12": 547.0559875488282, "kl_loss_17": 160.7385581970215, "kl_loss_3": 2233.6774658203126, "kl_loss_6": 1393.712664794922, "learning_rate": 8.196569953892202e-05, "loss": 1089.5018, "step": 8170 }, { "ce_loss_12": 3.0679932117462156, "ce_loss_17": 2.8993886709213257, "ce_loss_23": 2.8328171491622927, "ce_loss_3": 3.894995379447937, "ce_loss_6": 3.4704811215400695, "epoch": 0.818, "grad_norm": 1232.0, "kl_loss_12": 558.6704147338867, "kl_loss_17": 162.2000717163086, "kl_loss_3": 2243.635137939453, "kl_loss_6": 1417.9678466796875, "learning_rate": 8.109732297475635e-05, "loss": 1089.0442, "step": 8180 }, { "ce_loss_12": 3.056806230545044, "ce_loss_17": 2.8779824495315554, "ce_loss_23": 2.807134783267975, "ce_loss_3": 3.936583161354065, "ce_loss_6": 3.4898857831954957, "epoch": 0.819, "grad_norm": 1896.0, "kl_loss_12": 578.1514053344727, "kl_loss_17": 166.74496994018554, "kl_loss_3": 2355.654931640625, "kl_loss_6": 1488.9981994628906, "learning_rate": 8.023316475589754e-05, "loss": 1128.7449, "step": 8190 }, { "ce_loss_12": 3.02653945684433, "ce_loss_17": 2.8479262471199034, "ce_loss_23": 2.7727146625518797, "ce_loss_3": 3.9359703540802, "ce_loss_6": 3.468432629108429, "epoch": 0.82, "grad_norm": 1560.0, "kl_loss_12": 595.1294815063477, "kl_loss_17": 174.096484375, "kl_loss_3": 2443.1336364746094, "kl_loss_6": 1527.0234619140624, "learning_rate": 7.937323358440934e-05, "loss": 1148.4727, "step": 8200 }, { "ce_loss_12": 3.1162544965744017, "ce_loss_17": 2.9630125164985657, "ce_loss_23": 2.8993138432502747, "ce_loss_3": 3.911135995388031, "ce_loss_6": 3.496608221530914, "epoch": 0.821, "grad_norm": 1152.0, "kl_loss_12": 544.4355880737305, "kl_loss_17": 159.98213882446288, "kl_loss_3": 2191.1896850585936, "kl_loss_6": 1374.475323486328, "learning_rate": 7.851753811978923e-05, "loss": 1085.2391, "step": 8210 }, { "ce_loss_12": 3.1299094915390016, "ce_loss_17": 2.9677687168121336, "ce_loss_23": 2.899359703063965, "ce_loss_3": 3.9690559029579164, "ce_loss_6": 3.5370683073997498, "epoch": 0.822, "grad_norm": 1192.0, "kl_loss_12": 551.8927993774414, "kl_loss_17": 164.45097732543945, "kl_loss_3": 2285.034063720703, "kl_loss_6": 1431.9157836914062, "learning_rate": 7.766608697888095e-05, "loss": 1093.2594, "step": 8220 }, { "ce_loss_12": 3.147523856163025, "ce_loss_17": 2.983336317539215, "ce_loss_23": 2.9168429255485533, "ce_loss_3": 3.981073188781738, "ce_loss_6": 3.5518638849258424, "epoch": 0.823, "grad_norm": 840.0, "kl_loss_12": 556.6430786132812, "kl_loss_17": 164.469490814209, "kl_loss_3": 2287.3015747070312, "kl_loss_6": 1432.2851135253907, "learning_rate": 7.681888873578785e-05, "loss": 1117.138, "step": 8230 }, { "ce_loss_12": 3.081748139858246, "ce_loss_17": 2.909856748580933, "ce_loss_23": 2.8364683270454405, "ce_loss_3": 3.924388611316681, "ce_loss_6": 3.488308036327362, "epoch": 0.824, "grad_norm": 1272.0, "kl_loss_12": 572.6440856933593, "kl_loss_17": 169.69739227294923, "kl_loss_3": 2308.4847412109375, "kl_loss_6": 1448.6083068847656, "learning_rate": 7.597595192178702e-05, "loss": 1103.9938, "step": 8240 }, { "ce_loss_12": 3.0852334976196287, "ce_loss_17": 2.913503646850586, "ce_loss_23": 2.841702175140381, "ce_loss_3": 3.9479007720947266, "ce_loss_6": 3.508209836483002, "epoch": 0.825, "grad_norm": 1488.0, "kl_loss_12": 563.913801574707, "kl_loss_17": 165.11235427856445, "kl_loss_3": 2359.2099426269533, "kl_loss_6": 1477.1159729003907, "learning_rate": 7.513728502524286e-05, "loss": 1129.6759, "step": 8250 }, { "ce_loss_12": 3.065532147884369, "ce_loss_17": 2.9113979578018188, "ce_loss_23": 2.8476172029972076, "ce_loss_3": 3.8924211502075194, "ce_loss_6": 3.4671952962875365, "epoch": 0.826, "grad_norm": 1104.0, "kl_loss_12": 533.2513366699219, "kl_loss_17": 155.2083396911621, "kl_loss_3": 2228.1499267578124, "kl_loss_6": 1379.5601379394532, "learning_rate": 7.430289649152156e-05, "loss": 1097.4211, "step": 8260 }, { "ce_loss_12": 2.9995575308799745, "ce_loss_17": 2.828830623626709, "ce_loss_23": 2.7601626992225645, "ce_loss_3": 3.8649712085723875, "ce_loss_6": 3.4307517409324646, "epoch": 0.827, "grad_norm": 1384.0, "kl_loss_12": 565.3962280273438, "kl_loss_17": 162.2247215270996, "kl_loss_3": 2362.8103149414064, "kl_loss_6": 1491.5337951660156, "learning_rate": 7.347279472290646e-05, "loss": 1109.859, "step": 8270 }, { "ce_loss_12": 3.122376787662506, "ce_loss_17": 2.9612316250801087, "ce_loss_23": 2.8959510564804076, "ce_loss_3": 3.9570825338363647, "ce_loss_6": 3.531812274456024, "epoch": 0.828, "grad_norm": 1064.0, "kl_loss_12": 554.3373107910156, "kl_loss_17": 162.3391372680664, "kl_loss_3": 2278.9635681152345, "kl_loss_6": 1429.607843017578, "learning_rate": 7.264698807851328e-05, "loss": 1109.9259, "step": 8280 }, { "ce_loss_12": 3.0933347225189207, "ce_loss_17": 2.9319000482559203, "ce_loss_23": 2.8684099674224854, "ce_loss_3": 3.899102711677551, "ce_loss_6": 3.483800983428955, "epoch": 0.829, "grad_norm": 1048.0, "kl_loss_12": 544.1003021240234, "kl_loss_17": 159.55711669921874, "kl_loss_3": 2213.3914123535155, "kl_loss_6": 1387.0603576660155, "learning_rate": 7.182548487420554e-05, "loss": 1087.8375, "step": 8290 }, { "ce_loss_12": 3.141780412197113, "ce_loss_17": 2.9783580780029295, "ce_loss_23": 2.909892702102661, "ce_loss_3": 3.9592021346092223, "ce_loss_6": 3.5420392751693726, "epoch": 0.83, "grad_norm": 1072.0, "kl_loss_12": 560.5811340332032, "kl_loss_17": 165.19401168823242, "kl_loss_3": 2255.7765991210936, "kl_loss_6": 1421.8934265136718, "learning_rate": 7.100829338251146e-05, "loss": 1093.7117, "step": 8300 }, { "ce_loss_12": 3.08468918800354, "ce_loss_17": 2.9131301283836364, "ce_loss_23": 2.840294587612152, "ce_loss_3": 3.9290374636650087, "ce_loss_6": 3.5016937017440797, "epoch": 0.831, "grad_norm": 1352.0, "kl_loss_12": 566.9378234863282, "kl_loss_17": 166.56872940063477, "kl_loss_3": 2305.8603698730467, "kl_loss_6": 1455.0379943847656, "learning_rate": 7.019542183254046e-05, "loss": 1097.4264, "step": 8310 }, { "ce_loss_12": 3.1170966267585754, "ce_loss_17": 2.953399920463562, "ce_loss_23": 2.879462385177612, "ce_loss_3": 3.928554356098175, "ce_loss_6": 3.514261078834534, "epoch": 0.832, "grad_norm": 1384.0, "kl_loss_12": 566.4154647827148, "kl_loss_17": 171.7959846496582, "kl_loss_3": 2248.6452697753907, "kl_loss_6": 1419.4371704101563, "learning_rate": 6.938687840989971e-05, "loss": 1096.3621, "step": 8320 }, { "ce_loss_12": 3.0669469594955445, "ce_loss_17": 2.8998965978622437, "ce_loss_23": 2.829956316947937, "ce_loss_3": 3.893748676776886, "ce_loss_6": 3.4692044615745545, "epoch": 0.833, "grad_norm": 1184.0, "kl_loss_12": 560.9413757324219, "kl_loss_17": 168.15377655029297, "kl_loss_3": 2256.4309631347655, "kl_loss_6": 1422.266094970703, "learning_rate": 6.858267125661271e-05, "loss": 1113.0926, "step": 8330 }, { "ce_loss_12": 3.1150636672973633, "ce_loss_17": 2.9481117963790893, "ce_loss_23": 2.8806458711624146, "ce_loss_3": 3.9465166568756103, "ce_loss_6": 3.518279528617859, "epoch": 0.834, "grad_norm": 1504.0, "kl_loss_12": 552.7582366943359, "kl_loss_17": 160.30242538452148, "kl_loss_3": 2264.86318359375, "kl_loss_6": 1412.4846130371093, "learning_rate": 6.778280847103668e-05, "loss": 1118.2813, "step": 8340 }, { "ce_loss_12": 3.1263239979743958, "ce_loss_17": 2.958054792881012, "ce_loss_23": 2.8883840918540953, "ce_loss_3": 3.933966672420502, "ce_loss_6": 3.5245227932929994, "epoch": 0.835, "grad_norm": 992.0, "kl_loss_12": 561.3988143920899, "kl_loss_17": 165.34767684936523, "kl_loss_3": 2257.2742126464846, "kl_loss_6": 1430.91298828125, "learning_rate": 6.698729810778065e-05, "loss": 1096.0061, "step": 8350 }, { "ce_loss_12": 3.0373926401138305, "ce_loss_17": 2.872546947002411, "ce_loss_23": 2.806998634338379, "ce_loss_3": 3.8821595907211304, "ce_loss_6": 3.442913126945496, "epoch": 0.836, "grad_norm": 1848.0, "kl_loss_12": 548.8032333374024, "kl_loss_17": 158.58746490478515, "kl_loss_3": 2281.4795837402344, "kl_loss_6": 1414.729718017578, "learning_rate": 6.619614817762538e-05, "loss": 1104.8056, "step": 8360 }, { "ce_loss_12": 3.0213045954704283, "ce_loss_17": 2.8510085225105284, "ce_loss_23": 2.781916630268097, "ce_loss_3": 3.9031874537467957, "ce_loss_6": 3.4542742133140565, "epoch": 0.837, "grad_norm": 1168.0, "kl_loss_12": 568.2625747680664, "kl_loss_17": 162.45321197509764, "kl_loss_3": 2375.975720214844, "kl_loss_6": 1487.9389587402343, "learning_rate": 6.540936664744196e-05, "loss": 1126.8619, "step": 8370 }, { "ce_loss_12": 3.144019269943237, "ce_loss_17": 2.978832817077637, "ce_loss_23": 2.911532390117645, "ce_loss_3": 3.98069965839386, "ce_loss_6": 3.55452219247818, "epoch": 0.838, "grad_norm": 1008.0, "kl_loss_12": 555.9855712890625, "kl_loss_17": 163.2048194885254, "kl_loss_3": 2276.542120361328, "kl_loss_6": 1436.4474548339845, "learning_rate": 6.462696144011149e-05, "loss": 1090.6496, "step": 8380 }, { "ce_loss_12": 3.1107449650764467, "ce_loss_17": 2.942846190929413, "ce_loss_23": 2.8735639452934265, "ce_loss_3": 3.915558421611786, "ce_loss_6": 3.513276982307434, "epoch": 0.839, "grad_norm": 1272.0, "kl_loss_12": 563.3623809814453, "kl_loss_17": 164.73463973999023, "kl_loss_3": 2243.006005859375, "kl_loss_6": 1425.6291137695312, "learning_rate": 6.384894043444567e-05, "loss": 1083.7996, "step": 8390 }, { "ce_loss_12": 3.122048246860504, "ce_loss_17": 2.9576640486717225, "ce_loss_23": 2.8880361676216126, "ce_loss_3": 3.957304573059082, "ce_loss_6": 3.540115237236023, "epoch": 0.84, "grad_norm": 1040.0, "kl_loss_12": 560.0843856811523, "kl_loss_17": 163.13331756591796, "kl_loss_3": 2267.2568115234376, "kl_loss_6": 1435.8717346191406, "learning_rate": 6.307531146510753e-05, "loss": 1098.984, "step": 8400 }, { "ce_loss_12": 3.102569282054901, "ce_loss_17": 2.9388603925704957, "ce_loss_23": 2.8691525936126707, "ce_loss_3": 3.9057474017143248, "ce_loss_6": 3.5014272809028624, "epoch": 0.841, "grad_norm": 988.0, "kl_loss_12": 547.4675140380859, "kl_loss_17": 163.2883728027344, "kl_loss_3": 2202.335076904297, "kl_loss_6": 1394.86650390625, "learning_rate": 6.230608232253226e-05, "loss": 1072.458, "step": 8410 }, { "ce_loss_12": 3.0702186107635496, "ce_loss_17": 2.896703338623047, "ce_loss_23": 2.8263700127601625, "ce_loss_3": 3.9404627799987795, "ce_loss_6": 3.501567506790161, "epoch": 0.842, "grad_norm": 1096.0, "kl_loss_12": 567.8404357910156, "kl_loss_17": 165.14807815551757, "kl_loss_3": 2338.825054931641, "kl_loss_6": 1470.2890380859376, "learning_rate": 6.154126075284855e-05, "loss": 1104.2164, "step": 8420 }, { "ce_loss_12": 3.1461366176605225, "ce_loss_17": 2.9891268968582154, "ce_loss_23": 2.92186838388443, "ce_loss_3": 3.945695734024048, "ce_loss_6": 3.5478767275810243, "epoch": 0.843, "grad_norm": 1832.0, "kl_loss_12": 541.3987503051758, "kl_loss_17": 159.74902267456054, "kl_loss_3": 2196.7286682128906, "kl_loss_6": 1398.2157592773438, "learning_rate": 6.078085445780129e-05, "loss": 1071.0024, "step": 8430 }, { "ce_loss_12": 3.1509676933288575, "ce_loss_17": 2.986407232284546, "ce_loss_23": 2.92077054977417, "ce_loss_3": 3.9918720960617065, "ce_loss_6": 3.5571364164352417, "epoch": 0.844, "grad_norm": 1080.0, "kl_loss_12": 554.0682266235351, "kl_loss_17": 162.6667366027832, "kl_loss_3": 2300.378106689453, "kl_loss_6": 1428.4713317871094, "learning_rate": 6.002487109467347e-05, "loss": 1087.3066, "step": 8440 }, { "ce_loss_12": 3.161572754383087, "ce_loss_17": 2.9950670480728148, "ce_loss_23": 2.926289701461792, "ce_loss_3": 3.97816481590271, "ce_loss_6": 3.559042716026306, "epoch": 0.845, "grad_norm": 1608.0, "kl_loss_12": 566.6126342773438, "kl_loss_17": 167.85883026123048, "kl_loss_3": 2266.07783203125, "kl_loss_6": 1422.8419982910157, "learning_rate": 5.927331827620902e-05, "loss": 1092.1502, "step": 8450 }, { "ce_loss_12": 3.143201971054077, "ce_loss_17": 2.979695534706116, "ce_loss_23": 2.9125229477882386, "ce_loss_3": 3.921801245212555, "ce_loss_6": 3.5282947778701783, "epoch": 0.846, "grad_norm": 976.0, "kl_loss_12": 546.6759765625, "kl_loss_17": 160.7653564453125, "kl_loss_3": 2170.8586669921874, "kl_loss_6": 1373.1689147949219, "learning_rate": 5.852620357053651e-05, "loss": 1081.7949, "step": 8460 }, { "ce_loss_12": 3.1741357922554014, "ce_loss_17": 3.0152151346206666, "ce_loss_23": 2.9533477783203126, "ce_loss_3": 3.977063298225403, "ce_loss_6": 3.564035105705261, "epoch": 0.847, "grad_norm": 864.0, "kl_loss_12": 546.3927154541016, "kl_loss_17": 158.32703094482423, "kl_loss_3": 2209.043560791016, "kl_loss_6": 1389.9683044433593, "learning_rate": 5.778353450109286e-05, "loss": 1086.1645, "step": 8470 }, { "ce_loss_12": 3.215981650352478, "ce_loss_17": 3.047750270366669, "ce_loss_23": 2.978085231781006, "ce_loss_3": 4.040736985206604, "ce_loss_6": 3.6170117974281313, "epoch": 0.848, "grad_norm": 1320.0, "kl_loss_12": 560.3518157958985, "kl_loss_17": 165.00484771728514, "kl_loss_3": 2262.4346435546877, "kl_loss_6": 1421.1817565917968, "learning_rate": 5.7045318546547206e-05, "loss": 1089.9925, "step": 8480 }, { "ce_loss_12": 3.11188143491745, "ce_loss_17": 2.9473851919174194, "ce_loss_23": 2.879114067554474, "ce_loss_3": 3.9515628933906557, "ce_loss_6": 3.5185172319412232, "epoch": 0.849, "grad_norm": 1352.0, "kl_loss_12": 550.3204238891601, "kl_loss_17": 161.58224029541014, "kl_loss_3": 2280.7270629882814, "kl_loss_6": 1424.6280517578125, "learning_rate": 5.631156314072605e-05, "loss": 1091.9666, "step": 8490 }, { "ce_loss_12": 3.1408272266387938, "ce_loss_17": 2.9797823429107666, "ce_loss_23": 2.912586140632629, "ce_loss_3": 3.930195379257202, "ce_loss_6": 3.519640862941742, "epoch": 0.85, "grad_norm": 1440.0, "kl_loss_12": 546.5631805419922, "kl_loss_17": 161.9244026184082, "kl_loss_3": 2195.800213623047, "kl_loss_6": 1377.4348999023437, "learning_rate": 5.5582275672538315e-05, "loss": 1073.6294, "step": 8500 }, { "ce_loss_12": 3.0622190117835997, "ce_loss_17": 2.8910266757011414, "ce_loss_23": 2.8219680070877073, "ce_loss_3": 3.942839562892914, "ce_loss_6": 3.5015220046043396, "epoch": 0.851, "grad_norm": 1160.0, "kl_loss_12": 571.0053894042969, "kl_loss_17": 166.4903663635254, "kl_loss_3": 2369.3764770507814, "kl_loss_6": 1494.2642456054687, "learning_rate": 5.4857463485900484e-05, "loss": 1121.9272, "step": 8510 }, { "ce_loss_12": 3.111485755443573, "ce_loss_17": 2.946943771839142, "ce_loss_23": 2.880238139629364, "ce_loss_3": 3.9151260018348695, "ce_loss_6": 3.5119995951652525, "epoch": 0.852, "grad_norm": 1352.0, "kl_loss_12": 552.024201965332, "kl_loss_17": 161.01301116943358, "kl_loss_3": 2224.1720764160154, "kl_loss_6": 1411.4604736328124, "learning_rate": 5.413713387966329e-05, "loss": 1087.1721, "step": 8520 }, { "ce_loss_12": 3.1266626834869387, "ce_loss_17": 2.9612136125564574, "ce_loss_23": 2.894944739341736, "ce_loss_3": 3.9596535682678224, "ce_loss_6": 3.5332515597343446, "epoch": 0.853, "grad_norm": 1272.0, "kl_loss_12": 554.592903137207, "kl_loss_17": 161.4539581298828, "kl_loss_3": 2281.1273559570313, "kl_loss_6": 1426.2618041992187, "learning_rate": 5.34212941075381e-05, "loss": 1097.3725, "step": 8530 }, { "ce_loss_12": 3.1214884757995605, "ce_loss_17": 2.969010281562805, "ce_loss_23": 2.906864821910858, "ce_loss_3": 3.92522656917572, "ce_loss_6": 3.504314923286438, "epoch": 0.854, "grad_norm": 1536.0, "kl_loss_12": 526.8193008422852, "kl_loss_17": 155.95495262145997, "kl_loss_3": 2192.919073486328, "kl_loss_6": 1359.222833251953, "learning_rate": 5.270995137802315e-05, "loss": 1074.2971, "step": 8540 }, { "ce_loss_12": 3.0763959646224976, "ce_loss_17": 2.916470468044281, "ce_loss_23": 2.852141809463501, "ce_loss_3": 3.8961158752441407, "ce_loss_6": 3.4745135068893434, "epoch": 0.855, "grad_norm": 1320.0, "kl_loss_12": 546.9484146118164, "kl_loss_17": 158.56359329223633, "kl_loss_3": 2248.6607177734377, "kl_loss_6": 1404.589617919922, "learning_rate": 5.2003112854332125e-05, "loss": 1098.8508, "step": 8550 }, { "ce_loss_12": 3.063506841659546, "ce_loss_17": 2.909021496772766, "ce_loss_23": 2.846612310409546, "ce_loss_3": 3.874693489074707, "ce_loss_6": 3.464722013473511, "epoch": 0.856, "grad_norm": 1168.0, "kl_loss_12": 540.1203262329102, "kl_loss_17": 155.96851348876953, "kl_loss_3": 2231.803356933594, "kl_loss_6": 1405.0531921386719, "learning_rate": 5.130078565432089e-05, "loss": 1069.7745, "step": 8560 }, { "ce_loss_12": 3.121432435512543, "ce_loss_17": 2.970027732849121, "ce_loss_23": 2.906119704246521, "ce_loss_3": 3.9241920351982116, "ce_loss_6": 3.513047194480896, "epoch": 0.857, "grad_norm": 1136.0, "kl_loss_12": 533.2334243774415, "kl_loss_17": 155.81883239746094, "kl_loss_3": 2196.0322692871096, "kl_loss_6": 1381.9104248046874, "learning_rate": 5.060297685041659e-05, "loss": 1062.4382, "step": 8570 }, { "ce_loss_12": 3.0725590705871584, "ce_loss_17": 2.905340886116028, "ce_loss_23": 2.8341203689575196, "ce_loss_3": 3.914118731021881, "ce_loss_6": 3.4800498366355894, "epoch": 0.858, "grad_norm": 1168.0, "kl_loss_12": 562.4660766601562, "kl_loss_17": 166.77214736938475, "kl_loss_3": 2297.951123046875, "kl_loss_6": 1436.9736083984376, "learning_rate": 4.99096934695461e-05, "loss": 1113.8425, "step": 8580 }, { "ce_loss_12": 3.129444992542267, "ce_loss_17": 2.963559329509735, "ce_loss_23": 2.8960996150970457, "ce_loss_3": 3.952687406539917, "ce_loss_6": 3.5332820296287535, "epoch": 0.859, "grad_norm": 1120.0, "kl_loss_12": 542.9299270629883, "kl_loss_17": 158.55432205200196, "kl_loss_3": 2231.0780334472656, "kl_loss_6": 1403.0755798339844, "learning_rate": 4.922094249306558e-05, "loss": 1077.5844, "step": 8590 }, { "ce_loss_12": 3.1639073491096497, "ce_loss_17": 2.996286356449127, "ce_loss_23": 2.928344798088074, "ce_loss_3": 3.9905553221702577, "ce_loss_6": 3.559040880203247, "epoch": 0.86, "grad_norm": 988.0, "kl_loss_12": 557.205712890625, "kl_loss_17": 164.4212745666504, "kl_loss_3": 2263.422235107422, "kl_loss_6": 1409.6866027832032, "learning_rate": 4.853673085668947e-05, "loss": 1076.0809, "step": 8600 }, { "ce_loss_12": 3.1688145637512206, "ce_loss_17": 3.006110095977783, "ce_loss_23": 2.9394213199615478, "ce_loss_3": 3.99489278793335, "ce_loss_6": 3.5717530608177186, "epoch": 0.861, "grad_norm": 1152.0, "kl_loss_12": 547.7848281860352, "kl_loss_17": 159.71785202026368, "kl_loss_3": 2251.6465454101562, "kl_loss_6": 1418.1878173828125, "learning_rate": 4.78570654504214e-05, "loss": 1093.2465, "step": 8610 }, { "ce_loss_12": 3.119960296154022, "ce_loss_17": 2.9601327061653135, "ce_loss_23": 2.8931104063987734, "ce_loss_3": 3.9442036390304565, "ce_loss_6": 3.524333989620209, "epoch": 0.862, "grad_norm": 1168.0, "kl_loss_12": 543.3126388549805, "kl_loss_17": 159.60537643432616, "kl_loss_3": 2246.9543395996093, "kl_loss_6": 1413.2603393554687, "learning_rate": 4.7181953118484556e-05, "loss": 1093.0124, "step": 8620 }, { "ce_loss_12": 3.1447961807250975, "ce_loss_17": 2.981061267852783, "ce_loss_23": 2.919272768497467, "ce_loss_3": 3.950004208087921, "ce_loss_6": 3.536801242828369, "epoch": 0.863, "grad_norm": 1520.0, "kl_loss_12": 546.9749267578125, "kl_loss_17": 158.86413192749023, "kl_loss_3": 2193.546746826172, "kl_loss_6": 1393.0187927246093, "learning_rate": 4.651140065925269e-05, "loss": 1102.2883, "step": 8630 }, { "ce_loss_12": 3.0848291754722594, "ce_loss_17": 2.919208121299744, "ce_loss_23": 2.852880430221558, "ce_loss_3": 3.9040238738059996, "ce_loss_6": 3.4828606605529786, "epoch": 0.864, "grad_norm": 1004.0, "kl_loss_12": 550.5586502075196, "kl_loss_17": 161.43046188354492, "kl_loss_3": 2262.796240234375, "kl_loss_6": 1411.1244506835938, "learning_rate": 4.58454148251814e-05, "loss": 1104.4396, "step": 8640 }, { "ce_loss_12": 3.0967994570732116, "ce_loss_17": 2.9280046820640564, "ce_loss_23": 2.8609217524528505, "ce_loss_3": 3.950021493434906, "ce_loss_6": 3.5200977087020875, "epoch": 0.865, "grad_norm": 1128.0, "kl_loss_12": 551.6484832763672, "kl_loss_17": 160.92717361450195, "kl_loss_3": 2294.815301513672, "kl_loss_6": 1441.8151733398438, "learning_rate": 4.518400232274078e-05, "loss": 1098.8965, "step": 8650 }, { "ce_loss_12": 3.1297909736633303, "ce_loss_17": 2.9608248710632323, "ce_loss_23": 2.8919439792633055, "ce_loss_3": 3.9411178827285767, "ce_loss_6": 3.523774802684784, "epoch": 0.866, "grad_norm": 1288.0, "kl_loss_12": 555.529444885254, "kl_loss_17": 163.9142303466797, "kl_loss_3": 2240.0380859375, "kl_loss_6": 1405.0944458007812, "learning_rate": 4.452716981234745e-05, "loss": 1066.0305, "step": 8660 }, { "ce_loss_12": 3.0957505226135256, "ce_loss_17": 2.9325799107551576, "ce_loss_23": 2.868885409832001, "ce_loss_3": 3.90879647731781, "ce_loss_6": 3.490975868701935, "epoch": 0.867, "grad_norm": 1496.0, "kl_loss_12": 539.2698043823242, "kl_loss_17": 156.92785415649413, "kl_loss_3": 2228.797253417969, "kl_loss_6": 1396.8384155273438, "learning_rate": 4.3874923908297335e-05, "loss": 1067.8451, "step": 8670 }, { "ce_loss_12": 3.145753872394562, "ce_loss_17": 2.983182764053345, "ce_loss_23": 2.9180320858955384, "ce_loss_3": 3.9800577998161315, "ce_loss_6": 3.5526066422462463, "epoch": 0.868, "grad_norm": 1152.0, "kl_loss_12": 555.1043502807618, "kl_loss_17": 162.09677505493164, "kl_loss_3": 2282.174005126953, "kl_loss_6": 1435.6018798828125, "learning_rate": 4.322727117869951e-05, "loss": 1089.9818, "step": 8680 }, { "ce_loss_12": 3.153953719139099, "ce_loss_17": 2.9918410778045654, "ce_loss_23": 2.9258020162582397, "ce_loss_3": 3.9849618911743163, "ce_loss_6": 3.5556876063346863, "epoch": 0.869, "grad_norm": 1352.0, "kl_loss_12": 553.2969848632813, "kl_loss_17": 161.93497543334962, "kl_loss_3": 2279.2611877441404, "kl_loss_6": 1421.8425659179688, "learning_rate": 4.2584218145409916e-05, "loss": 1091.4147, "step": 8690 }, { "ce_loss_12": 3.1842427015304566, "ce_loss_17": 3.0284349799156187, "ce_loss_23": 2.965789806842804, "ce_loss_3": 3.970459282398224, "ce_loss_6": 3.56220383644104, "epoch": 0.87, "grad_norm": 1632.0, "kl_loss_12": 535.7023223876953, "kl_loss_17": 156.79924621582032, "kl_loss_3": 2181.0572875976563, "kl_loss_6": 1367.7674987792968, "learning_rate": 4.194577128396521e-05, "loss": 1061.4873, "step": 8700 }, { "ce_loss_12": 3.082056391239166, "ce_loss_17": 2.9180865406990053, "ce_loss_23": 2.8544756293296816, "ce_loss_3": 3.906729853153229, "ce_loss_6": 3.489392650127411, "epoch": 0.871, "grad_norm": 1048.0, "kl_loss_12": 541.2103164672851, "kl_loss_17": 157.3700843811035, "kl_loss_3": 2242.8687072753905, "kl_loss_6": 1410.3545532226562, "learning_rate": 4.1311937023518264e-05, "loss": 1099.8197, "step": 8710 }, { "ce_loss_12": 3.0946294546127318, "ce_loss_17": 2.940945291519165, "ce_loss_23": 2.877529966831207, "ce_loss_3": 3.9572771549224854, "ce_loss_6": 3.518484950065613, "epoch": 0.872, "grad_norm": 1456.0, "kl_loss_12": 531.6827041625977, "kl_loss_17": 154.827400970459, "kl_loss_3": 2319.347772216797, "kl_loss_6": 1448.127685546875, "learning_rate": 4.0682721746773344e-05, "loss": 1091.5121, "step": 8720 }, { "ce_loss_12": 2.9802655577659607, "ce_loss_17": 2.812234103679657, "ce_loss_23": 2.7481626510620116, "ce_loss_3": 3.840736758708954, "ce_loss_6": 3.4024895906448362, "epoch": 0.873, "grad_norm": 1120.0, "kl_loss_12": 549.6573318481445, "kl_loss_17": 158.25880508422853, "kl_loss_3": 2307.917236328125, "kl_loss_6": 1443.5062072753906, "learning_rate": 4.0058131789920904e-05, "loss": 1085.316, "step": 8730 }, { "ce_loss_12": 3.1177361249923705, "ce_loss_17": 2.956912469863892, "ce_loss_23": 2.89218031167984, "ce_loss_3": 3.924745261669159, "ce_loss_6": 3.5143105387687683, "epoch": 0.874, "grad_norm": 1208.0, "kl_loss_12": 543.285775756836, "kl_loss_17": 156.44263153076173, "kl_loss_3": 2231.4059814453126, "kl_loss_6": 1405.4261901855468, "learning_rate": 3.9438173442575e-05, "loss": 1115.9526, "step": 8740 }, { "ce_loss_12": 3.147689700126648, "ce_loss_17": 2.980556881427765, "ce_loss_23": 2.911785435676575, "ce_loss_3": 3.9486439347267153, "ce_loss_6": 3.533434844017029, "epoch": 0.875, "grad_norm": 980.0, "kl_loss_12": 547.4045776367187, "kl_loss_17": 159.73441162109376, "kl_loss_3": 2201.325646972656, "kl_loss_6": 1382.7564514160156, "learning_rate": 3.882285294770937e-05, "loss": 1078.3527, "step": 8750 }, { "ce_loss_12": 3.101701581478119, "ce_loss_17": 2.9411001324653627, "ce_loss_23": 2.8762433409690855, "ce_loss_3": 3.8907504320144652, "ce_loss_6": 3.489359402656555, "epoch": 0.876, "grad_norm": 1056.0, "kl_loss_12": 537.6340866088867, "kl_loss_17": 158.29504013061523, "kl_loss_3": 2183.561358642578, "kl_loss_6": 1377.058038330078, "learning_rate": 3.821217650159453e-05, "loss": 1088.5118, "step": 8760 }, { "ce_loss_12": 3.000348138809204, "ce_loss_17": 2.830567252635956, "ce_loss_23": 2.764021897315979, "ce_loss_3": 3.8653979897499084, "ce_loss_6": 3.4257630705833435, "epoch": 0.877, "grad_norm": 1464.0, "kl_loss_12": 559.0526672363281, "kl_loss_17": 160.97410430908204, "kl_loss_3": 2322.0770874023438, "kl_loss_6": 1458.2523071289063, "learning_rate": 3.760615025373543e-05, "loss": 1102.473, "step": 8770 }, { "ce_loss_12": 3.1634358167648315, "ce_loss_17": 2.9957612633705137, "ce_loss_23": 2.9249557852745056, "ce_loss_3": 3.998697113990784, "ce_loss_6": 3.571672594547272, "epoch": 0.878, "grad_norm": 1288.0, "kl_loss_12": 563.2552429199219, "kl_loss_17": 167.07828216552736, "kl_loss_3": 2286.887042236328, "kl_loss_6": 1428.4285522460937, "learning_rate": 3.700478030680987e-05, "loss": 1114.3246, "step": 8780 }, { "ce_loss_12": 3.1507850289344788, "ce_loss_17": 2.9879728078842165, "ce_loss_23": 2.92491819858551, "ce_loss_3": 3.975934851169586, "ce_loss_6": 3.5524497389793397, "epoch": 0.879, "grad_norm": 980.0, "kl_loss_12": 542.5836776733398, "kl_loss_17": 157.81104049682617, "kl_loss_3": 2246.1762939453124, "kl_loss_6": 1403.3133178710937, "learning_rate": 3.6408072716606344e-05, "loss": 1084.585, "step": 8790 }, { "ce_loss_12": 3.0926483392715456, "ce_loss_17": 2.92871607542038, "ce_loss_23": 2.8600263357162476, "ce_loss_3": 3.9413236498832704, "ce_loss_6": 3.5104360699653627, "epoch": 0.88, "grad_norm": 1648.0, "kl_loss_12": 558.1578491210937, "kl_loss_17": 163.44912414550782, "kl_loss_3": 2309.015557861328, "kl_loss_6": 1451.2872802734375, "learning_rate": 3.5816033491963716e-05, "loss": 1128.1988, "step": 8800 }, { "ce_loss_12": 2.9630858421325685, "ce_loss_17": 2.7998567819595337, "ce_loss_23": 2.731334185600281, "ce_loss_3": 3.838448178768158, "ce_loss_6": 3.3801329851150514, "epoch": 0.881, "grad_norm": 1272.0, "kl_loss_12": 546.5424713134765, "kl_loss_17": 158.14128494262695, "kl_loss_3": 2349.586346435547, "kl_loss_6": 1442.6415893554688, "learning_rate": 3.522866859471047e-05, "loss": 1105.7555, "step": 8810 }, { "ce_loss_12": 3.1578266859054565, "ce_loss_17": 3.0061030626296996, "ce_loss_23": 2.943174755573273, "ce_loss_3": 3.9486071348190306, "ce_loss_6": 3.5392861485481264, "epoch": 0.882, "grad_norm": 1160.0, "kl_loss_12": 527.8236755371094, "kl_loss_17": 154.08211441040038, "kl_loss_3": 2152.692315673828, "kl_loss_6": 1344.2339660644532, "learning_rate": 3.46459839396045e-05, "loss": 1067.4445, "step": 8820 }, { "ce_loss_12": 3.089976954460144, "ce_loss_17": 2.9213294863700865, "ce_loss_23": 2.8545565843582152, "ce_loss_3": 3.920038306713104, "ce_loss_6": 3.50158896446228, "epoch": 0.883, "grad_norm": 1488.0, "kl_loss_12": 553.8795608520508, "kl_loss_17": 161.10737075805665, "kl_loss_3": 2249.959197998047, "kl_loss_6": 1425.133349609375, "learning_rate": 3.406798539427386e-05, "loss": 1115.9717, "step": 8830 }, { "ce_loss_12": 3.1482776522636415, "ce_loss_17": 2.9887410402297974, "ce_loss_23": 2.924669921398163, "ce_loss_3": 3.9743199706077577, "ce_loss_6": 3.548464608192444, "epoch": 0.884, "grad_norm": 1640.0, "kl_loss_12": 546.1885131835937, "kl_loss_17": 158.3856117248535, "kl_loss_3": 2269.4408935546876, "kl_loss_6": 1423.7886596679687, "learning_rate": 3.349467877915746e-05, "loss": 1094.6814, "step": 8840 }, { "ce_loss_12": 3.1199790716171263, "ce_loss_17": 2.9570226430892945, "ce_loss_23": 2.889908528327942, "ce_loss_3": 3.955884504318237, "ce_loss_6": 3.5255047082901, "epoch": 0.885, "grad_norm": 1320.0, "kl_loss_12": 554.7209442138671, "kl_loss_17": 160.69096908569335, "kl_loss_3": 2303.524206542969, "kl_loss_6": 1438.8336608886718, "learning_rate": 3.292606986744667e-05, "loss": 1124.6703, "step": 8850 }, { "ce_loss_12": 3.0656699657440187, "ce_loss_17": 2.9097426533699036, "ce_loss_23": 2.8486350178718567, "ce_loss_3": 3.90006947517395, "ce_loss_6": 3.4645033359527586, "epoch": 0.886, "grad_norm": 1296.0, "kl_loss_12": 539.5896713256836, "kl_loss_17": 156.75498657226564, "kl_loss_3": 2260.9781860351563, "kl_loss_6": 1407.9178161621094, "learning_rate": 3.23621643850267e-05, "loss": 1089.2037, "step": 8860 }, { "ce_loss_12": 3.1390577077865602, "ce_loss_17": 2.9780965566635134, "ce_loss_23": 2.9126534819602967, "ce_loss_3": 3.9636197805404665, "ce_loss_6": 3.541094720363617, "epoch": 0.887, "grad_norm": 1232.0, "kl_loss_12": 557.10390625, "kl_loss_17": 164.1832588195801, "kl_loss_3": 2257.755401611328, "kl_loss_6": 1420.0990112304687, "learning_rate": 3.180296801041971e-05, "loss": 1082.1096, "step": 8870 }, { "ce_loss_12": 3.1589428782463074, "ce_loss_17": 3.0010623216629027, "ce_loss_23": 2.9385420203208925, "ce_loss_3": 3.9908467173576354, "ce_loss_6": 3.5579570651054384, "epoch": 0.888, "grad_norm": 1512.0, "kl_loss_12": 542.31962890625, "kl_loss_17": 157.5387435913086, "kl_loss_3": 2263.5656188964845, "kl_loss_6": 1398.8133972167968, "learning_rate": 3.124848637472688e-05, "loss": 1069.7454, "step": 8880 }, { "ce_loss_12": 2.9980852365493775, "ce_loss_17": 2.834604871273041, "ce_loss_23": 2.770075595378876, "ce_loss_3": 3.837626278400421, "ce_loss_6": 3.4114407777786253, "epoch": 0.889, "grad_norm": 1184.0, "kl_loss_12": 536.0059249877929, "kl_loss_17": 155.7068664550781, "kl_loss_3": 2272.6115905761717, "kl_loss_6": 1420.6156921386719, "learning_rate": 3.069872506157212e-05, "loss": 1085.2605, "step": 8890 }, { "ce_loss_12": 3.0897985339164733, "ce_loss_17": 2.9317867755889893, "ce_loss_23": 2.868427813053131, "ce_loss_3": 3.910146701335907, "ce_loss_6": 3.487504768371582, "epoch": 0.89, "grad_norm": 1032.0, "kl_loss_12": 540.8937484741211, "kl_loss_17": 157.58897399902344, "kl_loss_3": 2250.160290527344, "kl_loss_6": 1397.5354064941407, "learning_rate": 3.0153689607045842e-05, "loss": 1081.3935, "step": 8900 }, { "ce_loss_12": 3.014556038379669, "ce_loss_17": 2.840059781074524, "ce_loss_23": 2.771026074886322, "ce_loss_3": 3.8994391798973083, "ce_loss_6": 3.4500266551971435, "epoch": 0.891, "grad_norm": 1256.0, "kl_loss_12": 568.2254974365235, "kl_loss_17": 162.95605545043946, "kl_loss_3": 2407.6415100097656, "kl_loss_6": 1503.1968627929687, "learning_rate": 2.9613385499648926e-05, "loss": 1109.4233, "step": 8910 }, { "ce_loss_12": 3.048786735534668, "ce_loss_17": 2.887788689136505, "ce_loss_23": 2.8222782373428346, "ce_loss_3": 3.860581302642822, "ce_loss_6": 3.4523285746574404, "epoch": 0.892, "grad_norm": 980.0, "kl_loss_12": 540.1698181152344, "kl_loss_17": 158.8286033630371, "kl_loss_3": 2215.156982421875, "kl_loss_6": 1401.9125, "learning_rate": 2.9077818180237692e-05, "loss": 1090.1676, "step": 8920 }, { "ce_loss_12": 3.092083919048309, "ce_loss_17": 2.9248949527740478, "ce_loss_23": 2.856962502002716, "ce_loss_3": 3.9272842407226562, "ce_loss_6": 3.5033080220222472, "epoch": 0.893, "grad_norm": 1160.0, "kl_loss_12": 546.7997436523438, "kl_loss_17": 160.99884414672852, "kl_loss_3": 2255.507373046875, "kl_loss_6": 1412.3314086914063, "learning_rate": 2.8546993041969172e-05, "loss": 1091.2465, "step": 8930 }, { "ce_loss_12": 3.120562505722046, "ce_loss_17": 2.96583776473999, "ce_loss_23": 2.8990086793899534, "ce_loss_3": 3.9110327482223513, "ce_loss_6": 3.5060871124267576, "epoch": 0.894, "grad_norm": 1600.0, "kl_loss_12": 537.0748641967773, "kl_loss_17": 156.7382682800293, "kl_loss_3": 2194.614501953125, "kl_loss_6": 1376.7156066894531, "learning_rate": 2.802091543024671e-05, "loss": 1082.3741, "step": 8940 }, { "ce_loss_12": 3.1228439927101137, "ce_loss_17": 2.962463939189911, "ce_loss_23": 2.8934847354888915, "ce_loss_3": 3.9641275286674498, "ce_loss_6": 3.5355375170707704, "epoch": 0.895, "grad_norm": 1336.0, "kl_loss_12": 544.1097839355468, "kl_loss_17": 160.5014747619629, "kl_loss_3": 2294.6555725097655, "kl_loss_6": 1433.5463562011719, "learning_rate": 2.7499590642665774e-05, "loss": 1113.1617, "step": 8950 }, { "ce_loss_12": 3.145519268512726, "ce_loss_17": 2.9768693923950194, "ce_loss_23": 2.9099491834640503, "ce_loss_3": 3.954761433601379, "ce_loss_6": 3.5245487093925476, "epoch": 0.896, "grad_norm": 1112.0, "kl_loss_12": 568.062287902832, "kl_loss_17": 163.7352035522461, "kl_loss_3": 2241.2856689453124, "kl_loss_6": 1388.786572265625, "learning_rate": 2.6983023928961405e-05, "loss": 1083.4912, "step": 8960 }, { "ce_loss_12": 3.109204924106598, "ce_loss_17": 2.944039022922516, "ce_loss_23": 2.8751957297325133, "ce_loss_3": 3.9350946068763735, "ce_loss_6": 3.508461356163025, "epoch": 0.897, "grad_norm": 1128.0, "kl_loss_12": 544.3224639892578, "kl_loss_17": 159.73422317504884, "kl_loss_3": 2243.127850341797, "kl_loss_6": 1400.6375061035155, "learning_rate": 2.6471220490954628e-05, "loss": 1097.565, "step": 8970 }, { "ce_loss_12": 3.0935628056526183, "ce_loss_17": 2.93547545671463, "ce_loss_23": 2.8789809584617614, "ce_loss_3": 3.9196462988853455, "ce_loss_6": 3.488067018985748, "epoch": 0.898, "grad_norm": 1032.0, "kl_loss_12": 533.343212890625, "kl_loss_17": 155.5143020629883, "kl_loss_3": 2239.4082153320314, "kl_loss_6": 1385.2438659667969, "learning_rate": 2.596418548250029e-05, "loss": 1087.2333, "step": 8980 }, { "ce_loss_12": 3.1320006370544435, "ce_loss_17": 2.971823489665985, "ce_loss_23": 2.9040478348731993, "ce_loss_3": 3.9445548057556152, "ce_loss_6": 3.528187572956085, "epoch": 0.899, "grad_norm": 1232.0, "kl_loss_12": 551.7921142578125, "kl_loss_17": 161.45555572509767, "kl_loss_3": 2263.838671875, "kl_loss_6": 1418.526055908203, "learning_rate": 2.5461924009435368e-05, "loss": 1080.5319, "step": 8990 }, { "ce_loss_12": 3.1223100066184997, "ce_loss_17": 2.9642346501350403, "ce_loss_23": 2.89671528339386, "ce_loss_3": 3.9317273020744326, "ce_loss_6": 3.522926139831543, "epoch": 0.9, "grad_norm": 1032.0, "kl_loss_12": 548.0636291503906, "kl_loss_17": 161.5977439880371, "kl_loss_3": 2225.2694702148438, "kl_loss_6": 1406.5701416015625, "learning_rate": 2.4964441129527336e-05, "loss": 1105.2758, "step": 9000 }, { "ce_loss_12": 3.115997779369354, "ce_loss_17": 2.9631489157676696, "ce_loss_23": 2.9011554479599, "ce_loss_3": 3.916248857975006, "ce_loss_6": 3.5043585896492004, "epoch": 0.901, "grad_norm": 1152.0, "kl_loss_12": 529.6404830932618, "kl_loss_17": 154.54727096557616, "kl_loss_3": 2182.716204833984, "kl_loss_6": 1371.240216064453, "learning_rate": 2.4471741852423235e-05, "loss": 1069.2625, "step": 9010 }, { "ce_loss_12": 3.178261375427246, "ce_loss_17": 3.016998851299286, "ce_loss_23": 2.948937976360321, "ce_loss_3": 3.9921558260917664, "ce_loss_6": 3.5734020829200746, "epoch": 0.902, "grad_norm": 1184.0, "kl_loss_12": 542.1947662353516, "kl_loss_17": 160.79561767578124, "kl_loss_3": 2211.367510986328, "kl_loss_6": 1386.8376098632812, "learning_rate": 2.3983831139599287e-05, "loss": 1080.5448, "step": 9020 }, { "ce_loss_12": 3.0915554642677305, "ce_loss_17": 2.937301242351532, "ce_loss_23": 2.8709113359451295, "ce_loss_3": 3.907834804058075, "ce_loss_6": 3.4860360860824584, "epoch": 0.903, "grad_norm": 1232.0, "kl_loss_12": 530.9263809204101, "kl_loss_17": 156.1497802734375, "kl_loss_3": 2213.0542053222657, "kl_loss_6": 1376.4850708007812, "learning_rate": 2.3500713904311022e-05, "loss": 1058.6828, "step": 9030 }, { "ce_loss_12": 3.124735951423645, "ce_loss_17": 2.969620370864868, "ce_loss_23": 2.908503293991089, "ce_loss_3": 3.9082423567771913, "ce_loss_6": 3.50426607131958, "epoch": 0.904, "grad_norm": 1296.0, "kl_loss_12": 523.6847671508789, "kl_loss_17": 153.13335037231445, "kl_loss_3": 2150.9460876464846, "kl_loss_6": 1351.1428833007812, "learning_rate": 2.3022395011543685e-05, "loss": 1050.9455, "step": 9040 }, { "ce_loss_12": 3.161675202846527, "ce_loss_17": 2.998300814628601, "ce_loss_23": 2.927753913402557, "ce_loss_3": 3.975581741333008, "ce_loss_6": 3.5567883014678956, "epoch": 0.905, "grad_norm": 1160.0, "kl_loss_12": 555.8097091674805, "kl_loss_17": 163.1023277282715, "kl_loss_3": 2245.241558837891, "kl_loss_6": 1410.1447509765626, "learning_rate": 2.2548879277963063e-05, "loss": 1108.258, "step": 9050 }, { "ce_loss_12": 3.0751007795333862, "ce_loss_17": 2.9190019726753236, "ce_loss_23": 2.854919970035553, "ce_loss_3": 3.877568233013153, "ce_loss_6": 3.467659282684326, "epoch": 0.906, "grad_norm": 1440.0, "kl_loss_12": 531.2436706542969, "kl_loss_17": 155.4201519012451, "kl_loss_3": 2194.947058105469, "kl_loss_6": 1375.2668762207031, "learning_rate": 2.208017147186736e-05, "loss": 1050.3398, "step": 9060 }, { "ce_loss_12": 3.072239875793457, "ce_loss_17": 2.91306734085083, "ce_loss_23": 2.8462382197380065, "ce_loss_3": 3.884073185920715, "ce_loss_6": 3.4699034214019777, "epoch": 0.907, "grad_norm": 1000.0, "kl_loss_12": 539.5051681518555, "kl_loss_17": 157.800789642334, "kl_loss_3": 2230.831646728516, "kl_loss_6": 1393.0045166015625, "learning_rate": 2.1616276313139227e-05, "loss": 1072.0311, "step": 9070 }, { "ce_loss_12": 3.1148462891578674, "ce_loss_17": 2.9539944529533386, "ce_loss_23": 2.8859964847564696, "ce_loss_3": 3.9291485071182253, "ce_loss_6": 3.5146759152412415, "epoch": 0.908, "grad_norm": 1208.0, "kl_loss_12": 538.7437683105469, "kl_loss_17": 158.58870620727538, "kl_loss_3": 2215.8249328613283, "kl_loss_6": 1395.8588623046876, "learning_rate": 2.1157198473197415e-05, "loss": 1091.5688, "step": 9080 }, { "ce_loss_12": 3.167890965938568, "ce_loss_17": 3.005591297149658, "ce_loss_23": 2.938670742511749, "ce_loss_3": 3.9942872762680053, "ce_loss_6": 3.569392716884613, "epoch": 0.909, "grad_norm": 1240.0, "kl_loss_12": 553.3726150512696, "kl_loss_17": 162.06743621826172, "kl_loss_3": 2253.4153259277346, "kl_loss_6": 1412.7976196289062, "learning_rate": 2.0702942574950812e-05, "loss": 1087.3652, "step": 9090 }, { "ce_loss_12": 3.1123293161392214, "ce_loss_17": 2.9462677955627443, "ce_loss_23": 2.8751128554344176, "ce_loss_3": 3.938741445541382, "ce_loss_6": 3.5134841084480284, "epoch": 0.91, "grad_norm": 1136.0, "kl_loss_12": 557.57490234375, "kl_loss_17": 164.1454849243164, "kl_loss_3": 2263.563861083984, "kl_loss_6": 1420.0111206054687, "learning_rate": 2.025351319275137e-05, "loss": 1090.4695, "step": 9100 }, { "ce_loss_12": 3.2197620272636414, "ce_loss_17": 3.0535457253456117, "ce_loss_23": 2.9846746921539307, "ce_loss_3": 4.033296668529511, "ce_loss_6": 3.6214271068572996, "epoch": 0.911, "grad_norm": 1296.0, "kl_loss_12": 560.6335205078125, "kl_loss_17": 162.30851058959962, "kl_loss_3": 2263.798486328125, "kl_loss_6": 1431.2504577636719, "learning_rate": 1.9808914852347816e-05, "loss": 1115.8676, "step": 9110 }, { "ce_loss_12": 3.0728222727775574, "ce_loss_17": 2.9096383094787597, "ce_loss_23": 2.8406273484230042, "ce_loss_3": 3.889565372467041, "ce_loss_6": 3.481040823459625, "epoch": 0.912, "grad_norm": 1624.0, "kl_loss_12": 545.7222076416016, "kl_loss_17": 160.1074073791504, "kl_loss_3": 2211.3495666503904, "kl_loss_6": 1403.9913269042968, "learning_rate": 1.9369152030840554e-05, "loss": 1077.0305, "step": 9120 }, { "ce_loss_12": 3.1469371199607847, "ce_loss_17": 2.99254857301712, "ce_loss_23": 2.927776575088501, "ce_loss_3": 3.9770922541618345, "ce_loss_6": 3.553990662097931, "epoch": 0.913, "grad_norm": 1224.0, "kl_loss_12": 545.7375885009766, "kl_loss_17": 159.17574234008788, "kl_loss_3": 2275.7082397460936, "kl_loss_6": 1424.6509582519532, "learning_rate": 1.893422915663645e-05, "loss": 1096.3979, "step": 9130 }, { "ce_loss_12": 3.037546753883362, "ce_loss_17": 2.871594476699829, "ce_loss_23": 2.806107497215271, "ce_loss_3": 3.8876906752586367, "ce_loss_6": 3.459704267978668, "epoch": 0.914, "grad_norm": 1008.0, "kl_loss_12": 554.3168228149414, "kl_loss_17": 161.52917022705077, "kl_loss_3": 2310.563195800781, "kl_loss_6": 1451.2398193359375, "learning_rate": 1.850415060940386e-05, "loss": 1108.2693, "step": 9140 }, { "ce_loss_12": 3.1431529998779295, "ce_loss_17": 2.986000108718872, "ce_loss_23": 2.9234211921691893, "ce_loss_3": 3.9328189492225647, "ce_loss_6": 3.529419946670532, "epoch": 0.915, "grad_norm": 1192.0, "kl_loss_12": 541.6305496215821, "kl_loss_17": 158.0989532470703, "kl_loss_3": 2186.0196838378906, "kl_loss_6": 1377.559063720703, "learning_rate": 1.8078920720028978e-05, "loss": 1079.0549, "step": 9150 }, { "ce_loss_12": 3.0662479639053344, "ce_loss_17": 2.9114412546157835, "ce_loss_23": 2.8517210602760317, "ce_loss_3": 3.8619790196418764, "ce_loss_6": 3.4534464478492737, "epoch": 0.916, "grad_norm": 868.0, "kl_loss_12": 527.064845275879, "kl_loss_17": 153.3120216369629, "kl_loss_3": 2172.184851074219, "kl_loss_6": 1368.7500366210938, "learning_rate": 1.765854377057219e-05, "loss": 1083.5215, "step": 9160 }, { "ce_loss_12": 3.0449477195739747, "ce_loss_17": 2.8911916851997375, "ce_loss_23": 2.8287646174430847, "ce_loss_3": 3.8581594944000246, "ce_loss_6": 3.4393247723579408, "epoch": 0.917, "grad_norm": 1208.0, "kl_loss_12": 525.6621978759765, "kl_loss_17": 152.03661041259767, "kl_loss_3": 2207.767950439453, "kl_loss_6": 1378.0291320800782, "learning_rate": 1.724302399422456e-05, "loss": 1077.2742, "step": 9170 }, { "ce_loss_12": 3.033001518249512, "ce_loss_17": 2.8675798892974855, "ce_loss_23": 2.798583674430847, "ce_loss_3": 3.8532918214797975, "ce_loss_6": 3.4274834752082826, "epoch": 0.918, "grad_norm": 852.0, "kl_loss_12": 556.0043411254883, "kl_loss_17": 162.40515594482423, "kl_loss_3": 2270.423095703125, "kl_loss_6": 1417.0343627929688, "learning_rate": 1.683236557526574e-05, "loss": 1094.157, "step": 9180 }, { "ce_loss_12": 3.1149054527282716, "ce_loss_17": 2.9648550748825073, "ce_loss_23": 2.9029189944267273, "ce_loss_3": 3.893848168849945, "ce_loss_6": 3.4919156551361086, "epoch": 0.919, "grad_norm": 1328.0, "kl_loss_12": 520.6415710449219, "kl_loss_17": 153.77631530761718, "kl_loss_3": 2137.8181762695312, "kl_loss_6": 1343.8169311523438, "learning_rate": 1.6426572649021475e-05, "loss": 1067.9865, "step": 9190 }, { "ce_loss_12": 3.148048484325409, "ce_loss_17": 2.994646632671356, "ce_loss_23": 2.931679570674896, "ce_loss_3": 3.920458984375, "ce_loss_6": 3.52545348405838, "epoch": 0.92, "grad_norm": 948.0, "kl_loss_12": 531.7145721435547, "kl_loss_17": 157.4194091796875, "kl_loss_3": 2146.69736328125, "kl_loss_6": 1351.5185180664062, "learning_rate": 1.6025649301821876e-05, "loss": 1065.0173, "step": 9200 }, { "ce_loss_12": 3.1481565117835997, "ce_loss_17": 2.9912326455116274, "ce_loss_23": 2.9256301879882813, "ce_loss_3": 3.932176637649536, "ce_loss_6": 3.5256184458732607, "epoch": 0.921, "grad_norm": 1232.0, "kl_loss_12": 541.1270858764649, "kl_loss_17": 158.6568473815918, "kl_loss_3": 2181.775390625, "kl_loss_6": 1374.807177734375, "learning_rate": 1.5629599570960716e-05, "loss": 1063.4439, "step": 9210 }, { "ce_loss_12": 3.0623461604118347, "ce_loss_17": 2.908112442493439, "ce_loss_23": 2.841799771785736, "ce_loss_3": 3.892938697338104, "ce_loss_6": 3.4622840762138365, "epoch": 0.922, "grad_norm": 1320.0, "kl_loss_12": 541.5634201049804, "kl_loss_17": 156.80740127563476, "kl_loss_3": 2252.3447509765624, "kl_loss_6": 1405.0825500488281, "learning_rate": 1.5238427444654367e-05, "loss": 1079.0959, "step": 9220 }, { "ce_loss_12": 3.1123980045318604, "ce_loss_17": 2.9523245811462404, "ce_loss_23": 2.8886606454849244, "ce_loss_3": 3.9145833969116213, "ce_loss_6": 3.494296705722809, "epoch": 0.923, "grad_norm": 1048.0, "kl_loss_12": 531.3698974609375, "kl_loss_17": 157.11926879882813, "kl_loss_3": 2190.626611328125, "kl_loss_6": 1359.580322265625, "learning_rate": 1.4852136862001764e-05, "loss": 1066.6766, "step": 9230 }, { "ce_loss_12": 3.0788945317268372, "ce_loss_17": 2.9183433413505555, "ce_loss_23": 2.8562278389930724, "ce_loss_3": 3.8753127574920656, "ce_loss_6": 3.468129289150238, "epoch": 0.924, "grad_norm": 1064.0, "kl_loss_12": 531.5305938720703, "kl_loss_17": 154.79893341064454, "kl_loss_3": 2182.990582275391, "kl_loss_6": 1373.8393920898438, "learning_rate": 1.4470731712944884e-05, "loss": 1078.1009, "step": 9240 }, { "ce_loss_12": 3.105417084693909, "ce_loss_17": 2.9438974380493166, "ce_loss_23": 2.8757981777191164, "ce_loss_3": 3.9196877241134644, "ce_loss_6": 3.5000792860984804, "epoch": 0.925, "grad_norm": 1248.0, "kl_loss_12": 544.6679031372071, "kl_loss_17": 159.6856834411621, "kl_loss_3": 2213.4506958007814, "kl_loss_6": 1380.3028869628906, "learning_rate": 1.4094215838229174e-05, "loss": 1094.7473, "step": 9250 }, { "ce_loss_12": 3.0856382489204406, "ce_loss_17": 2.922611892223358, "ce_loss_23": 2.857351744174957, "ce_loss_3": 3.9181522965431212, "ce_loss_6": 3.4862570762634277, "epoch": 0.926, "grad_norm": 1408.0, "kl_loss_12": 543.434619140625, "kl_loss_17": 158.3315170288086, "kl_loss_3": 2270.9244995117188, "kl_loss_6": 1409.6431457519532, "learning_rate": 1.372259302936546e-05, "loss": 1124.6401, "step": 9260 }, { "ce_loss_12": 3.1793969869613647, "ce_loss_17": 3.0157102584838866, "ce_loss_23": 2.946769106388092, "ce_loss_3": 3.983129787445068, "ce_loss_6": 3.567841613292694, "epoch": 0.927, "grad_norm": 1696.0, "kl_loss_12": 555.6799179077149, "kl_loss_17": 166.32063751220704, "kl_loss_3": 2228.221728515625, "kl_loss_6": 1397.9705871582032, "learning_rate": 1.3355867028591206e-05, "loss": 1074.1199, "step": 9270 }, { "ce_loss_12": 3.0804280757904055, "ce_loss_17": 2.9220717191696166, "ce_loss_23": 2.8591931581497194, "ce_loss_3": 3.8621233344078063, "ce_loss_6": 3.455961990356445, "epoch": 0.928, "grad_norm": 1120.0, "kl_loss_12": 530.960791015625, "kl_loss_17": 155.62761688232422, "kl_loss_3": 2170.498614501953, "kl_loss_6": 1355.5591064453124, "learning_rate": 1.2994041528833267e-05, "loss": 1062.3898, "step": 9280 }, { "ce_loss_12": 3.083864176273346, "ce_loss_17": 2.928153955936432, "ce_loss_23": 2.8618181586265563, "ce_loss_3": 3.8916095972061155, "ce_loss_6": 3.4765172123909, "epoch": 0.929, "grad_norm": 1264.0, "kl_loss_12": 539.578709411621, "kl_loss_17": 155.33257980346679, "kl_loss_3": 2217.529052734375, "kl_loss_6": 1386.2726989746093, "learning_rate": 1.2637120173670358e-05, "loss": 1065.5698, "step": 9290 }, { "ce_loss_12": 3.107132685184479, "ce_loss_17": 2.942716729640961, "ce_loss_23": 2.873940134048462, "ce_loss_3": 3.932835280895233, "ce_loss_6": 3.510121464729309, "epoch": 0.93, "grad_norm": 1240.0, "kl_loss_12": 544.464077758789, "kl_loss_17": 160.88489379882813, "kl_loss_3": 2237.293408203125, "kl_loss_6": 1402.9177734375, "learning_rate": 1.2285106557296478e-05, "loss": 1075.1647, "step": 9300 }, { "ce_loss_12": 3.000529372692108, "ce_loss_17": 2.840439808368683, "ce_loss_23": 2.775368392467499, "ce_loss_3": 3.877704751491547, "ce_loss_6": 3.429161012172699, "epoch": 0.931, "grad_norm": 1368.0, "kl_loss_12": 544.9817962646484, "kl_loss_17": 157.51940689086913, "kl_loss_3": 2336.6595947265623, "kl_loss_6": 1450.5424194335938, "learning_rate": 1.1938004224484989e-05, "loss": 1098.9723, "step": 9310 }, { "ce_loss_12": 3.2173678398132326, "ce_loss_17": 3.056504476070404, "ce_loss_23": 2.987658143043518, "ce_loss_3": 4.015758633613586, "ce_loss_6": 3.6054705142974854, "epoch": 0.932, "grad_norm": 1224.0, "kl_loss_12": 544.5718933105469, "kl_loss_17": 160.28335723876953, "kl_loss_3": 2210.1267944335937, "kl_loss_6": 1393.004949951172, "learning_rate": 1.1595816670552429e-05, "loss": 1094.6549, "step": 9320 }, { "ce_loss_12": 3.1313786029815676, "ce_loss_17": 2.982147419452667, "ce_loss_23": 2.9147197008132935, "ce_loss_3": 3.934300494194031, "ce_loss_6": 3.5208180427551268, "epoch": 0.933, "grad_norm": 1256.0, "kl_loss_12": 534.8270599365235, "kl_loss_17": 159.38514709472656, "kl_loss_3": 2192.3671936035157, "kl_loss_6": 1370.7300659179687, "learning_rate": 1.1258547341323699e-05, "loss": 1060.6582, "step": 9330 }, { "ce_loss_12": 3.1733838319778442, "ce_loss_17": 3.014378774166107, "ce_loss_23": 2.9463321208953857, "ce_loss_3": 3.9683137059211733, "ce_loss_6": 3.563319516181946, "epoch": 0.934, "grad_norm": 1784.0, "kl_loss_12": 543.0530883789063, "kl_loss_17": 159.08293609619142, "kl_loss_3": 2215.9241577148437, "kl_loss_6": 1391.2158264160157, "learning_rate": 1.0926199633097156e-05, "loss": 1074.9855, "step": 9340 }, { "ce_loss_12": 3.170001041889191, "ce_loss_17": 3.014968490600586, "ce_loss_23": 2.952860116958618, "ce_loss_3": 3.944246733188629, "ce_loss_6": 3.542513108253479, "epoch": 0.935, "grad_norm": 1280.0, "kl_loss_12": 527.8997283935547, "kl_loss_17": 154.46102066040038, "kl_loss_3": 2160.9652099609375, "kl_loss_6": 1352.6298950195312, "learning_rate": 1.0598776892610684e-05, "loss": 1085.8208, "step": 9350 }, { "ce_loss_12": 3.0026628494262697, "ce_loss_17": 2.8449546456336976, "ce_loss_23": 2.7814199447631838, "ce_loss_3": 3.8261303305625916, "ce_loss_6": 3.403845179080963, "epoch": 0.936, "grad_norm": 1304.0, "kl_loss_12": 533.1611785888672, "kl_loss_17": 153.6280891418457, "kl_loss_3": 2235.2831970214843, "kl_loss_6": 1403.28251953125, "learning_rate": 1.0276282417007399e-05, "loss": 1067.509, "step": 9360 }, { "ce_loss_12": 3.138059711456299, "ce_loss_17": 2.985086512565613, "ce_loss_23": 2.9205947160720824, "ce_loss_3": 3.9207167744636537, "ce_loss_6": 3.5188659548759462, "epoch": 0.937, "grad_norm": 1048.0, "kl_loss_12": 528.6805099487304, "kl_loss_17": 154.96491088867188, "kl_loss_3": 2149.21474609375, "kl_loss_6": 1351.6066040039063, "learning_rate": 9.958719453803277e-06, "loss": 1062.6318, "step": 9370 }, { "ce_loss_12": 3.1419655203819277, "ce_loss_17": 2.9851008653640747, "ce_loss_23": 2.917102587223053, "ce_loss_3": 3.9556142926216125, "ce_loss_6": 3.5410670757293703, "epoch": 0.938, "grad_norm": 1280.0, "kl_loss_12": 544.192350769043, "kl_loss_17": 158.0346694946289, "kl_loss_3": 2221.0482177734375, "kl_loss_6": 1402.3416381835937, "learning_rate": 9.646091200853802e-06, "loss": 1072.5707, "step": 9380 }, { "ce_loss_12": 3.0962947249412536, "ce_loss_17": 2.942214035987854, "ce_loss_23": 2.878101277351379, "ce_loss_3": 3.8967387557029722, "ce_loss_6": 3.4878384351730345, "epoch": 0.939, "grad_norm": 1400.0, "kl_loss_12": 530.0220367431641, "kl_loss_17": 154.4480094909668, "kl_loss_3": 2168.3790893554688, "kl_loss_6": 1363.4065490722655, "learning_rate": 9.338400806321978e-06, "loss": 1041.948, "step": 9390 }, { "ce_loss_12": 3.1360417127609255, "ce_loss_17": 2.9746978521347045, "ce_loss_23": 2.906171131134033, "ce_loss_3": 3.936599922180176, "ce_loss_6": 3.520904242992401, "epoch": 0.94, "grad_norm": 1128.0, "kl_loss_12": 548.192594909668, "kl_loss_17": 162.06064224243164, "kl_loss_3": 2202.0309509277345, "kl_loss_6": 1383.4674560546875, "learning_rate": 9.035651368646646e-06, "loss": 1065.3178, "step": 9400 }, { "ce_loss_12": 3.13859965801239, "ce_loss_17": 2.985644745826721, "ce_loss_23": 2.922950530052185, "ce_loss_3": 3.925608992576599, "ce_loss_6": 3.5271278023719788, "epoch": 0.941, "grad_norm": 1464.0, "kl_loss_12": 526.8603591918945, "kl_loss_17": 154.67271118164064, "kl_loss_3": 2176.2880004882813, "kl_loss_6": 1372.896905517578, "learning_rate": 8.737845936511335e-06, "loss": 1068.3764, "step": 9410 }, { "ce_loss_12": 3.095977246761322, "ce_loss_17": 2.931922101974487, "ce_loss_23": 2.8640459775924683, "ce_loss_3": 3.918685781955719, "ce_loss_6": 3.489489185810089, "epoch": 0.942, "grad_norm": 936.0, "kl_loss_12": 545.6118621826172, "kl_loss_17": 159.8506301879883, "kl_loss_3": 2244.4978088378907, "kl_loss_6": 1398.2353454589843, "learning_rate": 8.444987508813451e-06, "loss": 1073.6937, "step": 9420 }, { "ce_loss_12": 3.0587186932563784, "ce_loss_17": 2.894214355945587, "ce_loss_23": 2.8275156140327455, "ce_loss_3": 3.9129406094551085, "ce_loss_6": 3.47637939453125, "epoch": 0.943, "grad_norm": 1216.0, "kl_loss_12": 555.0885116577149, "kl_loss_17": 161.69069900512696, "kl_loss_3": 2330.9462463378904, "kl_loss_6": 1455.9245971679688, "learning_rate": 8.157079034633974e-06, "loss": 1101.4342, "step": 9430 }, { "ce_loss_12": 3.042380619049072, "ce_loss_17": 2.884304630756378, "ce_loss_23": 2.8195446968078612, "ce_loss_3": 3.8600812911987306, "ce_loss_6": 3.4441370368003845, "epoch": 0.944, "grad_norm": 984.0, "kl_loss_12": 535.3955963134765, "kl_loss_17": 156.51687469482422, "kl_loss_3": 2244.7854187011717, "kl_loss_6": 1411.4883728027344, "learning_rate": 7.874123413208145e-06, "loss": 1077.6412, "step": 9440 }, { "ce_loss_12": 3.0281569480896, "ce_loss_17": 2.865230941772461, "ce_loss_23": 2.798931360244751, "ce_loss_3": 3.8621835589408873, "ce_loss_6": 3.436237359046936, "epoch": 0.945, "grad_norm": 1048.0, "kl_loss_12": 536.7286972045898, "kl_loss_17": 156.12984771728514, "kl_loss_3": 2247.120428466797, "kl_loss_6": 1404.0705200195312, "learning_rate": 7.59612349389599e-06, "loss": 1086.8323, "step": 9450 }, { "ce_loss_12": 3.102251076698303, "ce_loss_17": 2.9473668694496156, "ce_loss_23": 2.8842572450637816, "ce_loss_3": 3.891569495201111, "ce_loss_6": 3.4863331437110903, "epoch": 0.946, "grad_norm": 1256.0, "kl_loss_12": 525.8963073730469, "kl_loss_17": 152.55707778930665, "kl_loss_3": 2146.53525390625, "kl_loss_6": 1347.6020385742188, "learning_rate": 7.323082076153509e-06, "loss": 1061.4568, "step": 9460 }, { "ce_loss_12": 3.1477771043777465, "ce_loss_17": 2.9896829605102537, "ce_loss_23": 2.924260699748993, "ce_loss_3": 3.934639847278595, "ce_loss_6": 3.526467728614807, "epoch": 0.947, "grad_norm": 1232.0, "kl_loss_12": 541.1049713134765, "kl_loss_17": 160.94679489135743, "kl_loss_3": 2172.9966735839844, "kl_loss_6": 1368.2758056640625, "learning_rate": 7.055001909504755e-06, "loss": 1083.6273, "step": 9470 }, { "ce_loss_12": 3.183323097229004, "ce_loss_17": 3.023005247116089, "ce_loss_23": 2.9559802651405334, "ce_loss_3": 3.9784157991409304, "ce_loss_6": 3.5707173943519592, "epoch": 0.948, "grad_norm": 1176.0, "kl_loss_12": 539.4944686889648, "kl_loss_17": 157.5242950439453, "kl_loss_3": 2207.8321228027344, "kl_loss_6": 1381.5572387695313, "learning_rate": 6.791885693514133e-06, "loss": 1079.0922, "step": 9480 }, { "ce_loss_12": 3.094246971607208, "ce_loss_17": 2.935408890247345, "ce_loss_23": 2.8691577553749084, "ce_loss_3": 3.920883822441101, "ce_loss_6": 3.4952668070793154, "epoch": 0.949, "grad_norm": 1040.0, "kl_loss_12": 540.617756652832, "kl_loss_17": 158.96320114135742, "kl_loss_3": 2272.402795410156, "kl_loss_6": 1413.7683044433593, "learning_rate": 6.533736077758867e-06, "loss": 1092.891, "step": 9490 }, { "ce_loss_12": 3.0623729705810545, "ce_loss_17": 2.8987768054008485, "ce_loss_23": 2.832401430606842, "ce_loss_3": 3.911205840110779, "ce_loss_6": 3.4744672179222107, "epoch": 0.95, "grad_norm": 1808.0, "kl_loss_12": 556.887094116211, "kl_loss_17": 162.5521026611328, "kl_loss_3": 2316.6902587890627, "kl_loss_6": 1439.454327392578, "learning_rate": 6.2805556618028556e-06, "loss": 1090.7896, "step": 9500 }, { "ce_loss_12": 3.1281984210014344, "ce_loss_17": 2.981964576244354, "ce_loss_23": 2.9159383177757263, "ce_loss_3": 3.9170543432235716, "ce_loss_6": 3.508361804485321, "epoch": 0.951, "grad_norm": 1208.0, "kl_loss_12": 515.7074813842773, "kl_loss_17": 154.32823028564454, "kl_loss_3": 2141.530340576172, "kl_loss_6": 1326.3069274902343, "learning_rate": 6.032346995169968e-06, "loss": 1027.9488, "step": 9510 }, { "ce_loss_12": 3.136077415943146, "ce_loss_17": 2.9806724548339845, "ce_loss_23": 2.917427134513855, "ce_loss_3": 3.9442824363708495, "ce_loss_6": 3.526597273349762, "epoch": 0.952, "grad_norm": 1168.0, "kl_loss_12": 537.6825424194336, "kl_loss_17": 157.90337600708008, "kl_loss_3": 2211.2411254882813, "kl_loss_6": 1379.8905090332032, "learning_rate": 5.789112577318789e-06, "loss": 1067.4238, "step": 9520 }, { "ce_loss_12": 3.12904806137085, "ce_loss_17": 2.9698351502418516, "ce_loss_23": 2.9052838325500487, "ce_loss_3": 3.9457252740859987, "ce_loss_6": 3.5223633885383605, "epoch": 0.953, "grad_norm": 1064.0, "kl_loss_12": 548.40458984375, "kl_loss_17": 159.1815216064453, "kl_loss_3": 2241.716436767578, "kl_loss_6": 1404.752862548828, "learning_rate": 5.550854857617194e-06, "loss": 1068.0402, "step": 9530 }, { "ce_loss_12": 3.114034593105316, "ce_loss_17": 2.94967303276062, "ce_loss_23": 2.8814757823944093, "ce_loss_3": 3.9540896892547606, "ce_loss_6": 3.5182945370674132, "epoch": 0.954, "grad_norm": 1264.0, "kl_loss_12": 553.2667083740234, "kl_loss_17": 162.21889266967773, "kl_loss_3": 2289.8888427734373, "kl_loss_6": 1432.19365234375, "learning_rate": 5.317576235317756e-06, "loss": 1101.0947, "step": 9540 }, { "ce_loss_12": 3.1283044457435607, "ce_loss_17": 2.9742880702018737, "ce_loss_23": 2.9097349286079406, "ce_loss_3": 3.9063483357429503, "ce_loss_6": 3.509521949291229, "epoch": 0.955, "grad_norm": 1000.0, "kl_loss_12": 523.9976669311524, "kl_loss_17": 155.9721908569336, "kl_loss_3": 2127.7600524902346, "kl_loss_6": 1335.1602172851562, "learning_rate": 5.089279059533658e-06, "loss": 1067.269, "step": 9550 }, { "ce_loss_12": 3.1875174760818483, "ce_loss_17": 3.025624454021454, "ce_loss_23": 2.9579347252845762, "ce_loss_3": 3.9809356689453126, "ce_loss_6": 3.5737068176269533, "epoch": 0.956, "grad_norm": 1840.0, "kl_loss_12": 554.1400634765625, "kl_loss_17": 163.7421745300293, "kl_loss_3": 2195.819970703125, "kl_loss_6": 1390.9726867675781, "learning_rate": 4.865965629214819e-06, "loss": 1069.3086, "step": 9560 }, { "ce_loss_12": 3.1353691101074217, "ce_loss_17": 2.9781248807907104, "ce_loss_23": 2.9155802726745605, "ce_loss_3": 3.9476658582687376, "ce_loss_6": 3.531456804275513, "epoch": 0.957, "grad_norm": 1232.0, "kl_loss_12": 546.0765274047851, "kl_loss_17": 158.91124877929687, "kl_loss_3": 2245.0339904785155, "kl_loss_6": 1413.6621520996093, "learning_rate": 4.6476381931251366e-06, "loss": 1067.9433, "step": 9570 }, { "ce_loss_12": 3.1197481036186216, "ce_loss_17": 2.9656429409980776, "ce_loss_23": 2.8970610022544863, "ce_loss_3": 3.913744592666626, "ce_loss_6": 3.511826777458191, "epoch": 0.958, "grad_norm": 1152.0, "kl_loss_12": 531.675032043457, "kl_loss_17": 156.55153198242186, "kl_loss_3": 2180.854107666016, "kl_loss_6": 1372.5286376953125, "learning_rate": 4.434298949819449e-06, "loss": 1069.0238, "step": 9580 }, { "ce_loss_12": 3.0952879071235655, "ce_loss_17": 2.932103991508484, "ce_loss_23": 2.862602686882019, "ce_loss_3": 3.940154552459717, "ce_loss_6": 3.506578290462494, "epoch": 0.959, "grad_norm": 1160.0, "kl_loss_12": 562.480191040039, "kl_loss_17": 166.8973876953125, "kl_loss_3": 2328.669140625, "kl_loss_6": 1465.3488891601562, "learning_rate": 4.2259500476214406e-06, "loss": 1099.8499, "step": 9590 }, { "ce_loss_12": 3.06618047952652, "ce_loss_17": 2.911667048931122, "ce_loss_23": 2.842763102054596, "ce_loss_3": 3.8899773359298706, "ce_loss_6": 3.4630375146865844, "epoch": 0.96, "grad_norm": 952.0, "kl_loss_12": 542.4524353027343, "kl_loss_17": 157.97199249267578, "kl_loss_3": 2253.8352966308594, "kl_loss_6": 1405.9619384765624, "learning_rate": 4.02259358460233e-06, "loss": 1073.7738, "step": 9600 }, { "ce_loss_12": 3.1313651323318483, "ce_loss_17": 2.97236407995224, "ce_loss_23": 2.902596282958984, "ce_loss_3": 3.9350406765937804, "ce_loss_6": 3.51432626247406, "epoch": 0.961, "grad_norm": 1224.0, "kl_loss_12": 539.8430587768555, "kl_loss_17": 160.37659225463867, "kl_loss_3": 2178.180029296875, "kl_loss_6": 1357.514794921875, "learning_rate": 3.8242316085594916e-06, "loss": 1062.6227, "step": 9610 }, { "ce_loss_12": 3.0283801317214967, "ce_loss_17": 2.8597097992897034, "ce_loss_23": 2.791723334789276, "ce_loss_3": 3.8920037150382996, "ce_loss_6": 3.4422079205513, "epoch": 0.962, "grad_norm": 912.0, "kl_loss_12": 557.7178024291992, "kl_loss_17": 162.21144485473633, "kl_loss_3": 2349.5974975585937, "kl_loss_6": 1454.2951721191407, "learning_rate": 3.630866116995757e-06, "loss": 1113.8287, "step": 9620 }, { "ce_loss_12": 3.154905390739441, "ce_loss_17": 3.0037710547447203, "ce_loss_23": 2.94018212556839, "ce_loss_3": 3.9456016659736632, "ce_loss_6": 3.539944219589233, "epoch": 0.963, "grad_norm": 1064.0, "kl_loss_12": 531.0302154541016, "kl_loss_17": 155.95521697998046, "kl_loss_3": 2167.8955261230467, "kl_loss_6": 1353.688787841797, "learning_rate": 3.4424990570994797e-06, "loss": 1080.9525, "step": 9630 }, { "ce_loss_12": 3.1488232731819155, "ce_loss_17": 2.9942864537239076, "ce_loss_23": 2.9292378664016723, "ce_loss_3": 3.945158064365387, "ce_loss_6": 3.535008955001831, "epoch": 0.964, "grad_norm": 1232.0, "kl_loss_12": 535.8747940063477, "kl_loss_17": 156.66651306152343, "kl_loss_3": 2199.7471130371096, "kl_loss_6": 1382.6786376953125, "learning_rate": 3.2591323257248896e-06, "loss": 1068.9333, "step": 9640 }, { "ce_loss_12": 3.01090430021286, "ce_loss_17": 2.854299783706665, "ce_loss_23": 2.7875581622123717, "ce_loss_3": 3.8374653458595276, "ce_loss_6": 3.4172415375709533, "epoch": 0.965, "grad_norm": 1104.0, "kl_loss_12": 534.776318359375, "kl_loss_17": 155.20736236572264, "kl_loss_3": 2241.1626525878905, "kl_loss_6": 1406.0585571289062, "learning_rate": 3.0807677693729385e-06, "loss": 1091.4187, "step": 9650 }, { "ce_loss_12": 3.1849361419677735, "ce_loss_17": 3.0277846574783327, "ce_loss_23": 2.9649621844291687, "ce_loss_3": 3.972131061553955, "ce_loss_6": 3.5725770950317384, "epoch": 0.966, "grad_norm": 1176.0, "kl_loss_12": 536.5375106811523, "kl_loss_17": 157.157373046875, "kl_loss_3": 2174.757568359375, "kl_loss_6": 1369.9640502929688, "learning_rate": 2.9074071841727055e-06, "loss": 1053.3885, "step": 9660 }, { "ce_loss_12": 3.1240636467933656, "ce_loss_17": 2.9632591128349306, "ce_loss_23": 2.898830235004425, "ce_loss_3": 3.920987141132355, "ce_loss_6": 3.511180579662323, "epoch": 0.967, "grad_norm": 1160.0, "kl_loss_12": 536.1274505615235, "kl_loss_17": 157.79016342163087, "kl_loss_3": 2201.714074707031, "kl_loss_6": 1379.7575561523438, "learning_rate": 2.739052315863355e-06, "loss": 1049.1912, "step": 9670 }, { "ce_loss_12": 3.092950427532196, "ce_loss_17": 2.93706796169281, "ce_loss_23": 2.8726242065429686, "ce_loss_3": 3.9104288458824157, "ce_loss_6": 3.4850559711456297, "epoch": 0.968, "grad_norm": 1056.0, "kl_loss_12": 534.6220748901367, "kl_loss_17": 156.35627136230468, "kl_loss_3": 2232.6286499023436, "kl_loss_6": 1385.5797973632812, "learning_rate": 2.5757048597765396e-06, "loss": 1064.2279, "step": 9680 }, { "ce_loss_12": 3.1135802984237673, "ce_loss_17": 2.955181133747101, "ce_loss_23": 2.8891377568244936, "ce_loss_3": 3.9307027220726014, "ce_loss_6": 3.5102320075035096, "epoch": 0.969, "grad_norm": 980.0, "kl_loss_12": 539.390592956543, "kl_loss_17": 158.1667694091797, "kl_loss_3": 2229.200085449219, "kl_loss_6": 1389.3835510253907, "learning_rate": 2.417366460819359e-06, "loss": 1076.9048, "step": 9690 }, { "ce_loss_12": 3.1266333937644957, "ce_loss_17": 2.9659619688987733, "ce_loss_23": 2.8987301349639893, "ce_loss_3": 3.9571635603904722, "ce_loss_6": 3.5298823595046995, "epoch": 0.97, "grad_norm": 1352.0, "kl_loss_12": 549.3267044067383, "kl_loss_17": 161.9480407714844, "kl_loss_3": 2268.3638305664062, "kl_loss_6": 1410.324609375, "learning_rate": 2.2640387134577057e-06, "loss": 1073.3981, "step": 9700 }, { "ce_loss_12": 3.0461677074432374, "ce_loss_17": 2.897366940975189, "ce_loss_23": 2.833691942691803, "ce_loss_3": 3.8217989802360535, "ce_loss_6": 3.418661153316498, "epoch": 0.971, "grad_norm": 1184.0, "kl_loss_12": 510.2892547607422, "kl_loss_17": 149.87894020080566, "kl_loss_3": 2106.054510498047, "kl_loss_6": 1317.9690856933594, "learning_rate": 2.115723161700278e-06, "loss": 1044.9428, "step": 9710 }, { "ce_loss_12": 3.0436718821525575, "ce_loss_17": 2.8807451009750364, "ce_loss_23": 2.81461740732193, "ce_loss_3": 3.8790770292282106, "ce_loss_6": 3.4477824091911318, "epoch": 0.972, "grad_norm": 1104.0, "kl_loss_12": 550.6471160888672, "kl_loss_17": 160.43698272705078, "kl_loss_3": 2283.2156616210937, "kl_loss_6": 1422.974462890625, "learning_rate": 1.9724212990830937e-06, "loss": 1093.6719, "step": 9720 }, { "ce_loss_12": 3.1684569835662844, "ce_loss_17": 3.006157600879669, "ce_loss_23": 2.942043721675873, "ce_loss_3": 3.9932775855064393, "ce_loss_6": 3.5714781284332275, "epoch": 0.973, "grad_norm": 1080.0, "kl_loss_12": 547.3026596069336, "kl_loss_17": 160.0761505126953, "kl_loss_3": 2256.4254943847654, "kl_loss_6": 1413.3182495117187, "learning_rate": 1.8341345686543331e-06, "loss": 1081.5486, "step": 9730 }, { "ce_loss_12": 3.1531498193740846, "ce_loss_17": 2.995375192165375, "ce_loss_23": 2.928957664966583, "ce_loss_3": 3.9263521432876587, "ce_loss_6": 3.5328819274902346, "epoch": 0.974, "grad_norm": 1136.0, "kl_loss_12": 528.3388610839844, "kl_loss_17": 155.42781448364258, "kl_loss_3": 2144.5085571289064, "kl_loss_6": 1356.7732421875, "learning_rate": 1.7008643629596864e-06, "loss": 1078.8777, "step": 9740 }, { "ce_loss_12": 3.131344759464264, "ce_loss_17": 2.9758414387702943, "ce_loss_23": 2.9093008756637575, "ce_loss_3": 3.940278100967407, "ce_loss_6": 3.523132836818695, "epoch": 0.975, "grad_norm": 1040.0, "kl_loss_12": 541.175668334961, "kl_loss_17": 158.34674148559571, "kl_loss_3": 2238.6670532226562, "kl_loss_6": 1384.8669006347657, "learning_rate": 1.5726120240288633e-06, "loss": 1088.9391, "step": 9750 }, { "ce_loss_12": 3.0473599314689634, "ce_loss_17": 2.8926878452301024, "ce_loss_23": 2.829075610637665, "ce_loss_3": 3.850501024723053, "ce_loss_6": 3.436177659034729, "epoch": 0.976, "grad_norm": 1488.0, "kl_loss_12": 535.9018264770508, "kl_loss_17": 155.93840789794922, "kl_loss_3": 2202.1723388671876, "kl_loss_6": 1390.6271606445312, "learning_rate": 1.4493788433612708e-06, "loss": 1067.7689, "step": 9760 }, { "ce_loss_12": 3.1586664438247682, "ce_loss_17": 2.99827094078064, "ce_loss_23": 2.9324021100997926, "ce_loss_3": 3.969194209575653, "ce_loss_6": 3.5587011575698853, "epoch": 0.977, "grad_norm": 932.0, "kl_loss_12": 545.4004196166992, "kl_loss_17": 158.38016357421876, "kl_loss_3": 2244.3901489257814, "kl_loss_6": 1409.5289794921875, "learning_rate": 1.3311660619138578e-06, "loss": 1086.8182, "step": 9770 }, { "ce_loss_12": 3.147229480743408, "ce_loss_17": 2.9906322836875914, "ce_loss_23": 2.92654265165329, "ce_loss_3": 3.9147791504859923, "ce_loss_6": 3.5209817767143248, "epoch": 0.978, "grad_norm": 1248.0, "kl_loss_12": 535.7907821655274, "kl_loss_17": 158.41741638183595, "kl_loss_3": 2124.4832458496094, "kl_loss_6": 1348.4041625976563, "learning_rate": 1.2179748700879012e-06, "loss": 1068.7557, "step": 9780 }, { "ce_loss_12": 3.087145209312439, "ce_loss_17": 2.9269677639007567, "ce_loss_23": 2.861637556552887, "ce_loss_3": 3.889015567302704, "ce_loss_6": 3.478603518009186, "epoch": 0.979, "grad_norm": 1320.0, "kl_loss_12": 530.5468521118164, "kl_loss_17": 155.6657455444336, "kl_loss_3": 2190.927166748047, "kl_loss_6": 1372.2948608398438, "learning_rate": 1.1098064077174619e-06, "loss": 1069.6716, "step": 9790 }, { "ce_loss_12": 3.119102585315704, "ce_loss_17": 2.9587631821632385, "ce_loss_23": 2.8912365794181825, "ce_loss_3": 3.9466206073760985, "ce_loss_6": 3.51912544965744, "epoch": 0.98, "grad_norm": 1224.0, "kl_loss_12": 538.2732711791992, "kl_loss_17": 155.36832962036132, "kl_loss_3": 2256.1044494628904, "kl_loss_6": 1405.48935546875, "learning_rate": 1.006661764057837e-06, "loss": 1078.2967, "step": 9800 }, { "ce_loss_12": 3.120966613292694, "ce_loss_17": 2.9633092999458315, "ce_loss_23": 2.8999213218688964, "ce_loss_3": 3.9308133125305176, "ce_loss_6": 3.51082307100296, "epoch": 0.981, "grad_norm": 1136.0, "kl_loss_12": 538.3776977539062, "kl_loss_17": 155.93407440185547, "kl_loss_3": 2217.1360717773437, "kl_loss_6": 1385.0979919433594, "learning_rate": 9.085419777743465e-07, "loss": 1065.499, "step": 9810 }, { "ce_loss_12": 3.0673842072486877, "ce_loss_17": 2.9180146932601927, "ce_loss_23": 2.8566697955131533, "ce_loss_3": 3.884550166130066, "ce_loss_6": 3.4691938281059267, "epoch": 0.982, "grad_norm": 988.0, "kl_loss_12": 524.5106842041016, "kl_loss_17": 151.49212799072265, "kl_loss_3": 2208.735736083984, "kl_loss_6": 1387.4951110839843, "learning_rate": 8.15448036932176e-07, "loss": 1052.5303, "step": 9820 }, { "ce_loss_12": 3.1133029580116274, "ce_loss_17": 2.9559295177459717, "ce_loss_23": 2.892114531993866, "ce_loss_3": 3.925271010398865, "ce_loss_6": 3.5094059109687805, "epoch": 0.983, "grad_norm": 1208.0, "kl_loss_12": 541.3424270629882, "kl_loss_17": 157.8437515258789, "kl_loss_3": 2225.1154357910154, "kl_loss_6": 1397.5705688476562, "learning_rate": 7.273808789862724e-07, "loss": 1085.3074, "step": 9830 }, { "ce_loss_12": 3.1807573556900026, "ce_loss_17": 3.0257806301116945, "ce_loss_23": 2.959278440475464, "ce_loss_3": 3.97727655172348, "ce_loss_6": 3.5681079745292665, "epoch": 0.984, "grad_norm": 1120.0, "kl_loss_12": 542.2464050292969, "kl_loss_17": 158.70405960083008, "kl_loss_3": 2211.959191894531, "kl_loss_6": 1392.662744140625, "learning_rate": 6.443413907720186e-07, "loss": 1067.6535, "step": 9840 }, { "ce_loss_12": 3.1271595120429994, "ce_loss_17": 2.9681738018989563, "ce_loss_23": 2.903019917011261, "ce_loss_3": 3.937088334560394, "ce_loss_6": 3.5122796654701234, "epoch": 0.985, "grad_norm": 1120.0, "kl_loss_12": 536.1871688842773, "kl_loss_17": 157.89398803710938, "kl_loss_3": 2195.043994140625, "kl_loss_6": 1368.721240234375, "learning_rate": 5.663304084960185e-07, "loss": 1064.3484, "step": 9850 }, { "ce_loss_12": 3.0556672334671022, "ce_loss_17": 2.896334195137024, "ce_loss_23": 2.8314119935035706, "ce_loss_3": 3.8760223388671875, "ce_loss_6": 3.450761950016022, "epoch": 0.986, "grad_norm": 1416.0, "kl_loss_12": 539.7659469604492, "kl_loss_17": 159.50339889526367, "kl_loss_3": 2237.8478149414063, "kl_loss_6": 1400.7106201171875, "learning_rate": 4.933487177280482e-07, "loss": 1060.8133, "step": 9860 }, { "ce_loss_12": 3.14933443069458, "ce_loss_17": 2.995462489128113, "ce_loss_23": 2.9319130182266235, "ce_loss_3": 3.9437816858291628, "ce_loss_6": 3.5285532593727114, "epoch": 0.987, "grad_norm": 1136.0, "kl_loss_12": 525.5044784545898, "kl_loss_17": 153.37696380615233, "kl_loss_3": 2184.7522216796874, "kl_loss_6": 1353.022198486328, "learning_rate": 4.2539705339295075e-07, "loss": 1053.7746, "step": 9870 }, { "ce_loss_12": 3.0208406925201414, "ce_loss_17": 2.8577373743057253, "ce_loss_23": 2.7932136178016664, "ce_loss_3": 3.8365853309631346, "ce_loss_6": 3.416055428981781, "epoch": 0.988, "grad_norm": 1640.0, "kl_loss_12": 548.2541458129883, "kl_loss_17": 155.56036453247071, "kl_loss_3": 2236.24755859375, "kl_loss_6": 1402.8287963867188, "learning_rate": 3.6247609976319816e-07, "loss": 1067.5872, "step": 9880 }, { "ce_loss_12": 3.103432631492615, "ce_loss_17": 2.939455437660217, "ce_loss_23": 2.873777425289154, "ce_loss_3": 3.9283102631568907, "ce_loss_6": 3.5072454452514648, "epoch": 0.989, "grad_norm": 1120.0, "kl_loss_12": 544.3662368774415, "kl_loss_17": 159.0839111328125, "kl_loss_3": 2247.000842285156, "kl_loss_6": 1411.2964294433593, "learning_rate": 3.0458649045211895e-07, "loss": 1099.9736, "step": 9890 }, { "ce_loss_12": 3.0800060272216796, "ce_loss_17": 2.91620032787323, "ce_loss_23": 2.8455692291259767, "ce_loss_3": 3.902763283252716, "ce_loss_6": 3.486640763282776, "epoch": 0.99, "grad_norm": 1200.0, "kl_loss_12": 552.108773803711, "kl_loss_17": 162.36163406372071, "kl_loss_3": 2241.4049865722654, "kl_loss_6": 1414.161883544922, "learning_rate": 2.517288084074587e-07, "loss": 1095.7029, "step": 9900 }, { "ce_loss_12": 3.1283355593681335, "ce_loss_17": 2.9556203722953795, "ce_loss_23": 2.884258818626404, "ce_loss_3": 3.9650839805603026, "ce_loss_6": 3.543080711364746, "epoch": 0.991, "grad_norm": 1112.0, "kl_loss_12": 562.2683700561523, "kl_loss_17": 162.61367416381836, "kl_loss_3": 2298.439349365234, "kl_loss_6": 1445.863427734375, "learning_rate": 2.0390358590538505e-07, "loss": 1095.3518, "step": 9910 }, { "ce_loss_12": 3.113189232349396, "ce_loss_17": 2.954020929336548, "ce_loss_23": 2.887207198143005, "ce_loss_3": 3.932934558391571, "ce_loss_6": 3.513229751586914, "epoch": 0.992, "grad_norm": 964.0, "kl_loss_12": 544.9443725585937, "kl_loss_17": 158.9228401184082, "kl_loss_3": 2239.096240234375, "kl_loss_6": 1402.6988220214844, "learning_rate": 1.61111304545436e-07, "loss": 1071.2314, "step": 9920 }, { "ce_loss_12": 3.090544879436493, "ce_loss_17": 2.932784688472748, "ce_loss_23": 2.8656368136405943, "ce_loss_3": 3.892158830165863, "ce_loss_6": 3.480930304527283, "epoch": 0.993, "grad_norm": 1336.0, "kl_loss_12": 541.1668838500976, "kl_loss_17": 157.00824203491212, "kl_loss_3": 2216.0669860839844, "kl_loss_6": 1397.4162292480469, "learning_rate": 1.2335239524541298e-07, "loss": 1061.3529, "step": 9930 }, { "ce_loss_12": 3.056631350517273, "ce_loss_17": 2.8970078825950623, "ce_loss_23": 2.8318159580230713, "ce_loss_3": 3.8693312406539917, "ce_loss_6": 3.4506518840789795, "epoch": 0.994, "grad_norm": 780.0, "kl_loss_12": 536.3515884399415, "kl_loss_17": 156.6513999938965, "kl_loss_3": 2208.554571533203, "kl_loss_6": 1377.04248046875, "learning_rate": 9.06272382371065e-08, "loss": 1073.0936, "step": 9940 }, { "ce_loss_12": 3.1232089519500734, "ce_loss_17": 2.9615687847137453, "ce_loss_23": 2.899142873287201, "ce_loss_3": 3.947288119792938, "ce_loss_6": 3.5209755420684816, "epoch": 0.995, "grad_norm": 1448.0, "kl_loss_12": 547.1004272460938, "kl_loss_17": 156.98374252319337, "kl_loss_3": 2258.907989501953, "kl_loss_6": 1410.8921264648438, "learning_rate": 6.293616306246586e-08, "loss": 1080.6102, "step": 9950 }, { "ce_loss_12": 3.1039719104766847, "ce_loss_17": 2.952614200115204, "ce_loss_23": 2.8885851383209227, "ce_loss_3": 3.891808068752289, "ce_loss_6": 3.4867576241493223, "epoch": 0.996, "grad_norm": 1312.0, "kl_loss_12": 524.452732849121, "kl_loss_17": 153.04303207397462, "kl_loss_3": 2154.74208984375, "kl_loss_6": 1352.1610107421875, "learning_rate": 4.027944857032395e-08, "loss": 1039.3016, "step": 9960 }, { "ce_loss_12": 3.100421166419983, "ce_loss_17": 2.954938817024231, "ce_loss_23": 2.895737874507904, "ce_loss_3": 3.8738303184509277, "ce_loss_6": 3.469689059257507, "epoch": 0.997, "grad_norm": 960.0, "kl_loss_12": 509.41295318603517, "kl_loss_17": 149.07791976928712, "kl_loss_3": 2100.171911621094, "kl_loss_6": 1308.5161010742188, "learning_rate": 2.265732291356626e-08, "loss": 1028.9963, "step": 9970 }, { "ce_loss_12": 3.146505868434906, "ce_loss_17": 2.993780553340912, "ce_loss_23": 2.926430571079254, "ce_loss_3": 3.938515841960907, "ce_loss_6": 3.5314017653465273, "epoch": 0.998, "grad_norm": 1160.0, "kl_loss_12": 533.1759674072266, "kl_loss_17": 157.0031936645508, "kl_loss_3": 2155.145294189453, "kl_loss_6": 1354.1124328613282, "learning_rate": 1.0069963546743833e-08, "loss": 1073.4172, "step": 9980 }, { "ce_loss_12": 3.137771463394165, "ce_loss_17": 2.977656054496765, "ce_loss_23": 2.9086800813674927, "ce_loss_3": 3.9426164269447326, "ce_loss_6": 3.52817143201828, "epoch": 0.999, "grad_norm": 1312.0, "kl_loss_12": 540.6506790161133, "kl_loss_17": 157.4025550842285, "kl_loss_3": 2219.0190673828124, "kl_loss_6": 1389.755682373047, "learning_rate": 2.517497224463483e-09, "loss": 1068.3458, "step": 9990 }, { "ce_loss_12": 3.0929014682769775, "ce_loss_17": 2.928170955181122, "ce_loss_23": 2.8597790002822876, "ce_loss_3": 3.9463054656982424, "ce_loss_6": 3.5110191822052004, "epoch": 1.0, "grad_norm": 1272.0, "kl_loss_12": 554.8514373779296, "kl_loss_17": 161.27447433471679, "kl_loss_3": 2324.8254516601564, "kl_loss_6": 1451.786395263672, "learning_rate": 0.0, "loss": 1104.2758, "step": 10000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.502582338838856e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }